<a href="https://colab.research.google.com/github/saitzaw/apache-spark-colab/blob/main/ResolutionPySpark2023v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q findspark
!pip install -q pyspark

In [2]:
import os
import sqlite3
from datetime import datetime, date

import findspark
import folium
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import date_format, dayofmonth, to_date, year, month, regexp_replace
from pyspark.sql.types import StructType,StructField,DateType, DoubleType

In [3]:
# Initialise findspark, then build (or reuse) the local Spark session
# that every later cell refers to as `spark`.
findspark.init()
spark = (
    SparkSession.builder
    .master("local")
    .appName("Resolution2023v2")
    .getOrCreate()
)
# Switch on Spark's REPL eager-evaluation setting for this session.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [4]:
# Colab-only step: mount Google Drive at /content/drive. The dataset path
# assigned to `SampleData` in the next cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Location of the collisions parquet file on the mounted Google Drive;
# requires the drive.mount() cell above to have run.
SampleData = '/content/drive/MyDrive/ColabDataset/MotorVehicleCollisionsCrashes2.parquet'

In [6]:
# Load the raw collisions data from the parquet file into a Spark DataFrame.
MVCCDF = spark.read.format("parquet").load(SampleData)

In [7]:
# Sanity check: number of rows read from the parquet file.
MVCCDF.count()

1000

In [8]:
# Peek at a single raw record, one field per line, without truncating values.
MVCCDF.show(n=1, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------
 crash_date                    | 2021-09-11T00:00:00.000      
 crash_time                    | 2:39                         
 borough                       | null                         
 zip_code                      | null                         
 latitude                      | null                         
 longitude                     | null                         
 location                      | null                         
 on_street_name                | WHITESTONE EXPRESSWAY        
 off_street_name               | 20 AVENUE                    
 cross_street_name             | null                         
 number_of_persons_injured     | 2                            
 number_of_persons_killed      | 0                            
 number_of_pedestrians_injured | 0                            
 number_of_pedestrians_killed  | 0                            
 number_of_cyclist_injured     | 0                     

In [9]:
# Inspect column names and types of the raw frame before any transformation.
MVCCDF.printSchema()

root
 |-- crash_date: string (nullable = true)
 |-- crash_time: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- zip_code: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- location: string (nullable = true)
 |-- on_street_name: string (nullable = true)
 |-- off_street_name: string (nullable = true)
 |-- cross_street_name: string (nullable = true)
 |-- number_of_persons_injured: long (nullable = true)
 |-- number_of_persons_killed: long (nullable = true)
 |-- number_of_pedestrians_injured: long (nullable = true)
 |-- number_of_pedestrians_killed: long (nullable = true)
 |-- number_of_cyclist_injured: long (nullable = true)
 |-- number_of_cyclist_killed: long (nullable = true)
 |-- number_of_motorist_injured: long (nullable = true)
 |-- number_of_motorist_killed: long (nullable = true)
 |-- contributing_factor_vehicle_1: string (nullable = true)
 |-- contributing_factor_vehicle_2: string (nullable = true)
 

In [10]:
# Clean-up pass over the raw frame:
#   * crash_date is re-rendered as "yyyy-MM-dd" (time-of-day part dropped);
#   * location has every "newline + comma" pair removed first, then any
#     remaining newlines.
# NOTE(review): date_format produces a string, so crash_date is still a
# string column here, not a DateType -- confirm that is what later cells want.
MVCCDfAnalysis = (
    MVCCDF
    .withColumn("crash_date", date_format(MVCCDF.crash_date, "yyyy-MM-dd"))
    .withColumn("location", regexp_replace("location", "\n,", ""))
    .withColumn("location", regexp_replace("location", "\n", ""))
)

In [11]:
# Show one cleaned record in vertical layout to verify the crash_date rewrite.
MVCCDfAnalysis.show(n=1, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------
 crash_date                    | 2021-09-11                   
 crash_time                    | 2:39                         
 borough                       | null                         
 zip_code                      | null                         
 latitude                      | null                         
 longitude                     | null                         
 location                      | null                         
 on_street_name                | WHITESTONE EXPRESSWAY        
 off_street_name               | 20 AVENUE                    
 cross_street_name             | null                         
 number_of_persons_injured     | 2                            
 number_of_persons_killed      | 0                            
 number_of_pedestrians_injured | 0                            
 number_of_pedestrians_killed  | 0                            
 number_of_cyclist_injured     | 0                     

In [12]:
# Fatal collisions only: keep rows with at least one person killed and a
# recorded primary contributing factor, derive calendar parts from
# crash_date, and project the columns used by the fatality analysis.
MVCCDKilled = (
    MVCCDfAnalysis
    .filter(MVCCDfAnalysis["number_of_persons_killed"] >= 1)
    .filter(MVCCDfAnalysis["contributing_factor_vehicle_1"].isNotNull())
    .withColumn("crash_year", year(MVCCDfAnalysis["crash_date"]))
    .withColumn("crash_month", month(MVCCDfAnalysis["crash_date"]))
    # dayofmonth() returns an integer, matching crash_year / crash_month.
    # The previous date_format(..., "d") produced a string, so ordering by
    # crash_day would have been lexicographic ("10" before "9").
    .withColumn("crash_day", dayofmonth(MVCCDfAnalysis["crash_date"]))
    .select(
        "collision_id", "crash_date",
        "crash_year", "crash_month", "crash_day",
        "crash_time", "borough",
        "location", "number_of_persons_killed",
        "number_of_pedestrians_killed",
        "number_of_cyclist_killed",
        "number_of_motorist_killed",
    )
)

In [13]:
# Preview one fatal-collision record; long values are truncated for display.
MVCCDKilled.show(n=1, truncate=True, vertical=True)

-RECORD 0--------------------------------------------
 collision_id                 | 4456659              
 crash_date                   | 2021-07-09           
 crash_year                   | 2021                 
 crash_month                  | 7                    
 crash_day                    | 9                    
 crash_time                   | 0:43                 
 borough                      | null                 
 location                     |   (40.720535, -73... 
 number_of_persons_killed     | 1                    
 number_of_pedestrians_killed | 1                    
 number_of_cyclist_killed     | 0                    
 number_of_motorist_killed    | 0                    
only showing top 1 row



In [14]:
# Total persons killed per (year, month), listed in chronological order.
(
    MVCCDKilled
    .groupBy("crash_year", "crash_month")
    .sum("number_of_persons_killed")
    .sort("crash_year", "crash_month")
    .show(truncate=True)
)

+----------+-----------+-----------------------------+
|crash_year|crash_month|sum(number_of_persons_killed)|
+----------+-----------+-----------------------------+
|      2021|          4|                            2|
|      2021|          7|                            1|
|      2021|         12|                            1|
+----------+-----------+-----------------------------+

