<a href="https://colab.research.google.com/github/saitzaw/apache-spark-colab/blob/main/Resolution2023_PySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install PySpark in Colab


In [5]:
!pip install -q findspark
!pip install -q pyspark

In [19]:
import os
import sqlite3
import findspark
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import Row
from datetime import datetime, date
from pyspark.sql.types import StructType,StructField,DateType, DoubleType
from pyspark.sql.functions import date_format, to_date

In [7]:
findspark.init()
spark = SparkSession.builder\
                    .master("local")\
                    .appName("Resolution2023")\
                    .getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Mount to Google Drive

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
SampleData = '/content/drive/MyDrive/ColabDataset/MotorVehicleCollisionsCrashes.parquet'

In [12]:
MVCCDF = spark.read.parquet(SampleData)

In [13]:
MVCCDF.show(1, truncate = True, vertical=True)

-RECORD 0---------------------------------------------
 CRASH_DATE                    | 04/14/2021           
 CRASH_TIME                    | 5:32                 
 BOROUGH                       | null                 
 ZIP_CODE                      | null                 
 LATITUDE                      | null                 
 LONGITUDE                     | null                 
 LOCATION                      | null                 
 ON_STREET_NAME                | BRONX WHITESTONE ... 
 CROSS_STREET_NAME             | null                 
 OFF_STREET_NAME               | null                 
 NUMBER_OF_PERSONS_INJURED     | 0.0                  
 NUMBER_OF_PERSONS_KILLED      | 0.0                  
 NUMBER_OF_PEDESTRIANS_INJURED | 0                    
 NUMBER_OF_PEDESTRIANS_KILLED  | 0                    
 NUMBER_OF_CYCLIST_INJURED     | 0                    
 NUMBER_OF_CYCLIST_KILLED      | 0                    
 NUMBER_OF_MOTORIST_INJURED    | 0                    
 NUMBER_OF

In [31]:
MVCCDF.printSchema()

root
 |-- CRASH_DATE: string (nullable = true)
 |-- CRASH_TIME: string (nullable = true)
 |-- BOROUGH: string (nullable = true)
 |-- ZIP_CODE: double (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- ON_STREET_NAME: string (nullable = true)
 |-- CROSS_STREET_NAME: string (nullable = true)
 |-- OFF_STREET_NAME: string (nullable = true)
 |-- NUMBER_OF_PERSONS_INJURED: double (nullable = true)
 |-- NUMBER_OF_PERSONS_KILLED: double (nullable = true)
 |-- NUMBER_OF_PEDESTRIANS_INJURED: long (nullable = true)
 |-- NUMBER_OF_PEDESTRIANS_KILLED: long (nullable = true)
 |-- NUMBER_OF_CYCLIST_INJURED: long (nullable = true)
 |-- NUMBER_OF_CYCLIST_KILLED: long (nullable = true)
 |-- NUMBER_OF_MOTORIST_INJURED: long (nullable = true)
 |-- NUMBER_OF_MOTORIST_KILLED: long (nullable = true)
 |-- CONTRIBUTING_FACTOR_VEHICLE_1: string (nullable = true)
 |-- CONTRIBUTING_FACTOR_VEHICLE_2: string (nullable = tru

# Change Column name to inprove readability 
- pandas that is very easy, you use very simple way 
- df.rename({'old_col_name1': 'new_col_name2', 'old_col_name2':'new_col_name3'}, axis=1, inplace=True)
- SDF.withColumnRenamed("old_col_name1", "new_col_name1").withColumnRenamed("old_col_name2", "new_col_name2").drop('columns')

In [29]:
MVCCDfAnalysis = MVCCDF.withColumn(
    "CRASH_DATE", date_format(to_date("CRASH_DATE", "MM/dd/yyyy"), "yyyy-MM-dd"))\
.withColumnRenamed("CRASH_DATE", "crash_date")\
.withColumnRenamed("CRASH_TIME", "crash_time")\
.withColumnRenamed("BOROUGH", "borough")\
.withColumnRenamed("ZIP_CODE", "zip")\
.withColumnRenamed("LATITUDE", "lat")\
.withColumnRenamed("LONGITUDE", "long")\
.withColumnRenamed("LOCATION", "loca")\
.withColumnRenamed("ON_STREET_NAME", "on_street_name")\
.withColumnRenamed("NUMBER_OF_PERSONS_INJURED", "num_person_injured")\
.withColumnRenamed("NUMBER_OF_PERSONS_KILLED", "num_person_killed")\
.withColumnRenamed("NUMBER_OF_PEDESTRIANS_INJURED", "num_pedestrians_injured")\
.withColumnRenamed("NUMBER_OF_CYCLIST_INJURED", "num_cyclist_injured")\
.withColumnRenamed("NUMBER_OF_CYCLIST_KILLED", "num_cyclist_killed")\
.withColumnRenamed("NUMBER_OF_MOTORIST_INJURED", "num_motorist_injured")\
.withColumnRenamed("NUMBER_OF_MOTORIST_KILLED", "num_motorist_killed")\
.withColumnRenamed("CONTRIBUTING_FACTOR_VEHICLE_1", "contributing_factor_venicle_1")\
.withColumnRenamed("CONTRIBUTING_FACTOR_VEHICLE_2", "contributing_factor_venicle_2")\
.withColumnRenamed("CONTRIBUTING_FACTOR_VEHICLE_3", "contributing_factor_venicle_3")\
.withColumnRenamed("CONTRIBUTING_FACTOR_VEHICLE_4", "contributing_factor_venicle_4")\
.withColumnRenamed("CONTRIBUTING_FACTOR_VEHICLE_5", "contributing_factor_venicle_5")\
.withColumnRenamed("COLLISION_ID", "collision_id")\
.withColumnRenamed("VEHICLE_TYPE_CODE_1", "venicle_type_code_1")\
.withColumnRenamed("VEHICLE_TYPE_CODE_2", "venicle_type_code_2")\
.withColumnRenamed("VEHICLE_TYPE_CODE_3", "venicle_type_code_3")\
.withColumnRenamed("VEHICLE_TYPE_CODE_4", "venicle_type_code_4")\
.withColumnRenamed("VEHICLE_TYPE_CODE_5", "venicle_type_code_5")

In [30]:
MVCCDfAnalysis

crash_date,crash_time,borough,zip,lat,long,loca,on_street_name,CROSS_STREET_NAME,OFF_STREET_NAME,num_person_injured,num_person_killed,num_pedestrians_injured,NUMBER_OF_PEDESTRIANS_KILLED,num_cyclist_injured,num_cyclist_killed,num_motorist_injured,num_motorist_killed,contributing_factor_venicle_1,contributing_factor_venicle_2,contributing_factor_venicle_3,contributing_factor_venicle_4,contributing_factor_venicle_5,collision_id,venicle_type_code_1,venicle_type_code_2,venicle_type_code_3,venicle_type_code_4,venicle_type_code_5
2021-04-14,5:32,,,,,,BRONX WHITESTONE ...,,,0.0,0.0,0,0,0,0,0,0,Following Too Clo...,Unspecified,,,,4407480,Sedan,Sedan,,,
2021-04-13,21:35,BROOKLYN,11217.0,40.68358,-73.97617,"(40.68358, -73.97...",,,620 ATLANTI...,1.0,0.0,1,0,0,0,0,0,Unspecified,,,,,4407147,Sedan,,,,
2021-04-15,16:15,,,,,,HUTCHINSON RIVER ...,,,0.0,0.0,0,0,0,0,0,0,Pavement Slippery,,,,,4407665,Station Wagon/Spo...,,,,
2021-04-13,16:00,BROOKLYN,11222.0,,,,VANDERVORT AVENUE,ANTHONY STREET,,0.0,0.0,0,0,0,0,0,0,Following Too Clo...,Unspecified,,,,4407811,Sedan,,,,
2021-04-12,8:25,,,0.0,0.0,"(0.0, 0.0)",EDSON AVENUE ...,,,0.0,0.0,0,0,0,0,0,0,Unspecified,Unspecified,,,,4406885,Station Wagon/Spo...,Sedan,,,
2021-04-13,17:11,,,,,,VERRAZANO BRIDGE ...,,,0.0,0.0,0,0,0,0,0,0,Following Too Clo...,Unspecified,,,,4407883,Sedan,Box Truck,,,
2021-04-13,17:30,QUEENS,11106.0,,,,33 st,31ave,,0.0,0.0,0,0,0,0,0,0,Driver Inattentio...,Unspecified,,,,4408019,Sedan,Sedan,,,
2021-04-16,23:30,,,,,,SHORE PARKWAY,,,0.0,0.0,0,0,0,0,0,0,Unspecified,,,,,4408060,Sedan,,,,
2021-04-11,17:00,,,,,,GOWANUS RAMP ...,,,1.0,0.0,0,0,0,0,1,0,Other Vehicular,Other Vehicular,,,,4406314,Sedan,Sedan,,,
2021-04-16,21:15,,,,,,BRONX RIVER PARKW...,,,0.0,0.0,0,0,0,0,0,0,Driver Inattentio...,Unspecified,,,,4408149,Station Wagon/Spo...,Sedan,,,
