### Simple data analysis using Apache Spark
- Read the Parquet file in Google Drive
- Analyze the data using Apache Spark

In [3]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar -xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install pyarrow

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 49 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 67.2 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=5e2b980e70ecef95ba3e4bef8a4c2c5f216fac419b9f7f49032d35bf99d32fd7
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [4]:
import os

import findspark
from pyarrow import parquet
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [5]:
# Tell the Spark launcher where the Java runtime and the unpacked Spark
# distribution live on this machine.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.1.1-bin-hadoop3.2",
)

In [6]:
# Let findspark locate the Spark installation before a session is built.
findspark.init()

# Single local-mode session named "colab", with the Spark UI on port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("colab")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

# Eager evaluation lets a DataFrame render when it is the last expression of a cell.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

### Data source 
https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95

### Remark
- The CSV export was converted to Parquet format to reduce the file size

#### method
```
import pandas as pd
df = pd.read_csv('path/file.csv', low_memory=False)
df.to_parquet('path/file.parquet')
```
then upload to Google Drive


In [7]:
# Mount Google Drive into the Colab filesystem so the dataset can be read
# from a path under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [8]:
# Path of the collisions Parquet file on the mounted Google Drive.
data_file = '/content/gdrive/MyDrive/ColabDataset/MotorVehicleCollisionsCrashes.parquet'

In [9]:
# Load the Parquet file into a Spark DataFrame.
df = spark.read.parquet(data_file)

In [10]:
# Peek at the first five records as a list of Row objects.
df.head(5)

[Row(CRASH_DATE='04/14/2021', CRASH_TIME='5:32', BOROUGH=None, ZIP_CODE=None, LATITUDE=None, LONGITUDE=None, LOCATION=None, ON_STREET_NAME='BRONX WHITESTONE BRIDGE', CROSS_STREET_NAME=None, OFF_STREET_NAME=None, NUMBER_OF_PERSONS_INJURED=0.0, NUMBER_OF_PERSONS_KILLED=0.0, NUMBER_OF_PEDESTRIANS_INJURED=0, NUMBER_OF_PEDESTRIANS_KILLED=0, NUMBER_OF_CYCLIST_INJURED=0, NUMBER_OF_CYCLIST_KILLED=0, NUMBER_OF_MOTORIST_INJURED=0, NUMBER_OF_MOTORIST_KILLED=0, CONTRIBUTING_FACTOR_VEHICLE_1='Following Too Closely', CONTRIBUTING_FACTOR_VEHICLE_2='Unspecified', CONTRIBUTING_FACTOR_VEHICLE_3=None, CONTRIBUTING_FACTOR_VEHICLE_4=None, CONTRIBUTING_FACTOR_VEHICLE_5=None, COLLISION_ID=4407480, VEHICLE_TYPE_CODE_1='Sedan', VEHICLE_TYPE_CODE_2='Sedan', VEHICLE_TYPE_CODE_3=None, VEHICLE_TYPE_CODE_4=None, VEHICLE_TYPE_CODE_5=None),
 Row(CRASH_DATE='04/13/2021', CRASH_TIME='21:35', BOROUGH='BROOKLYN', ZIP_CODE=11217.0, LATITUDE=40.68358, LONGITUDE=-73.97617, LOCATION='(40.68358, -73.97617)', ON_STREET_NAME=No

In [11]:
# Same preview, one field per line (vertical) with long values truncated.
df.show(5, truncate=True, vertical=True)

-RECORD 0---------------------------------------------
 CRASH_DATE                    | 04/14/2021           
 CRASH_TIME                    | 5:32                 
 BOROUGH                       | null                 
 ZIP_CODE                      | null                 
 LATITUDE                      | null                 
 LONGITUDE                     | null                 
 LOCATION                      | null                 
 ON_STREET_NAME                | BRONX WHITESTONE ... 
 CROSS_STREET_NAME             | null                 
 OFF_STREET_NAME               | null                 
 NUMBER_OF_PERSONS_INJURED     | 0.0                  
 NUMBER_OF_PERSONS_KILLED      | 0.0                  
 NUMBER_OF_PEDESTRIANS_INJURED | 0                    
 NUMBER_OF_PEDESTRIANS_KILLED  | 0                    
 NUMBER_OF_CYCLIST_INJURED     | 0                    
 NUMBER_OF_CYCLIST_KILLED      | 0                    
 NUMBER_OF_MOTORIST_INJURED    | 0                    
 NUMBER_OF

In [12]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- CRASH_DATE: string (nullable = true)
 |-- CRASH_TIME: string (nullable = true)
 |-- BOROUGH: string (nullable = true)
 |-- ZIP_CODE: double (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- ON_STREET_NAME: string (nullable = true)
 |-- CROSS_STREET_NAME: string (nullable = true)
 |-- OFF_STREET_NAME: string (nullable = true)
 |-- NUMBER_OF_PERSONS_INJURED: double (nullable = true)
 |-- NUMBER_OF_PERSONS_KILLED: double (nullable = true)
 |-- NUMBER_OF_PEDESTRIANS_INJURED: long (nullable = true)
 |-- NUMBER_OF_PEDESTRIANS_KILLED: long (nullable = true)
 |-- NUMBER_OF_CYCLIST_INJURED: long (nullable = true)
 |-- NUMBER_OF_CYCLIST_KILLED: long (nullable = true)
 |-- NUMBER_OF_MOTORIST_INJURED: long (nullable = true)
 |-- NUMBER_OF_MOTORIST_KILLED: long (nullable = true)
 |-- CONTRIBUTING_FACTOR_VEHICLE_1: string (nullable = true)
 |-- CONTRIBUTING_FACTOR_VEHICLE_2: string (nullable = tru

### Null check and count

In [13]:
# Number of null values per column, keyed by column name.
# All columns are counted in ONE aggregation pass over the data instead of
# launching a separate filter().count() Spark job for every column.
# count() ignores nulls, so count(when(is-null, 1)) counts only the null rows.
Dict_Null = df.select(
    [F.count(F.when(df[name].isNull(), 1)).alias(name) for name in df.columns]
).first().asDict()

In [14]:
# Display the null count for every column.
Dict_Null

{'BOROUGH': 377582,
 'COLLISION_ID': 0,
 'CONTRIBUTING_FACTOR_VEHICLE_1': 3645,
 'CONTRIBUTING_FACTOR_VEHICLE_2': 175068,
 'CONTRIBUTING_FACTOR_VEHICLE_3': 972302,
 'CONTRIBUTING_FACTOR_VEHICLE_4': 1031042,
 'CONTRIBUTING_FACTOR_VEHICLE_5': 1043709,
 'CRASH_DATE': 0,
 'CRASH_TIME': 0,
 'CROSS_STREET_NAME': 541455,
 'LATITUDE': 75849,
 'LOCATION': 75849,
 'LONGITUDE': 75849,
 'NUMBER_OF_CYCLIST_INJURED': 0,
 'NUMBER_OF_CYCLIST_KILLED': 0,
 'NUMBER_OF_MOTORIST_INJURED': 0,
 'NUMBER_OF_MOTORIST_KILLED': 0,
 'NUMBER_OF_PEDESTRIANS_INJURED': 0,
 'NUMBER_OF_PEDESTRIANS_KILLED': 0,
 'NUMBER_OF_PERSONS_INJURED': 17,
 'NUMBER_OF_PERSONS_KILLED': 30,
 'OFF_STREET_NAME': 794513,
 'ON_STREET_NAME': 255438,
 'VEHICLE_TYPE_CODE_1': 8721,
 'VEHICLE_TYPE_CODE_2': 243280,
 'VEHICLE_TYPE_CODE_3': 976623,
 'VEHICLE_TYPE_CODE_4': 1031949,
 'VEHICLE_TYPE_CODE_5': 1043919,
 'ZIP_CODE': 377763}

In [15]:
# Total number of rows in the dataset.
df.count()

1048575

In [16]:
# Percentage of null values per column, keyed by column name.
# The total row count is computed once and the per-column null counts are
# taken from Dict_Null, rather than re-running filter().count() and
# df.count() for every column.
total_rows = df.count()
Dict_Null_per = {
    name: (null_count / total_rows * 100) if total_rows else 0.0  # empty dataset -> 0 %
    for name, null_count in Dict_Null.items()
}

In [17]:
# Display the null percentage for every column.
Dict_Null_per

{'BOROUGH': 36.009059914646066,
 'COLLISION_ID': 0.0,
 'CONTRIBUTING_FACTOR_VEHICLE_1': 0.34761461984121306,
 'CONTRIBUTING_FACTOR_VEHICLE_2': 16.695801444817967,
 'CONTRIBUTING_FACTOR_VEHICLE_3': 92.72603294947905,
 'CONTRIBUTING_FACTOR_VEHICLE_4': 98.32792122642634,
 'CONTRIBUTING_FACTOR_VEHICLE_5': 99.53594163507617,
 'CRASH_DATE': 0.0,
 'CRASH_TIME': 0.0,
 'CROSS_STREET_NAME': 51.637221944066944,
 'LATITUDE': 7.233531220942709,
 'LOCATION': 7.233531220942709,
 'LONGITUDE': 7.233531220942709,
 'NUMBER_OF_CYCLIST_INJURED': 0.0,
 'NUMBER_OF_CYCLIST_KILLED': 0.0,
 'NUMBER_OF_MOTORIST_INJURED': 0.0,
 'NUMBER_OF_MOTORIST_KILLED': 0.0,
 'NUMBER_OF_PEDESTRIANS_INJURED': 0.0,
 'NUMBER_OF_PEDESTRIANS_KILLED': 0.0,
 'NUMBER_OF_PERSONS_INJURED': 0.0016212478840330925,
 'NUMBER_OF_PERSONS_KILLED': 0.0028610256777054574,
 'OFF_STREET_NAME': 75.7707364756932,
 'ON_STREET_NAME': 24.360489235390887,
 'VEHICLE_TYPE_CODE_1': 0.8317001645089765,
 'VEHICLE_TYPE_CODE_2': 23.201010895739458,
 'VEHICLE_TY

### Convert the date string to a formatted date string
- requires the `to_date` function from `pyspark.sql.functions`

In [18]:
from pyspark.sql.functions import date_format, when, col, to_date

### Note
- On Spark 3.0 and later, use this setting so that legacy (`SimpleDateFormat`-style) date patterns are accepted:

spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

In [19]:
# Switch the session's date/time parser policy to LEGACY before parsing CRASH_DATE.
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

In [20]:
# Parse CRASH_DATE and re-emit it as a day-month-year string in column `date`.
# The source values are month/day/year (e.g. '04/14/2021'), so the input
# pattern is 'MM/dd/yyyy'. In these patterns 'MM' is month and 'mm' is
# minutes; the previous lowercase 'mm' (and five-letter year) did not
# describe the data.
ndf = df.select(
    date_format(
        to_date(col('CRASH_DATE'), 'MM/dd/yyyy'), 'dd-MM-yyyy'
    ).alias('date')
)

In [21]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

In [22]:
def splitUDF(row):
    """Split a date string into ``[year, month, day]`` string parts.

    Two layouts are recognised:
      * ``MM/DD/YYYY`` (slash separated, month first)
      * ``YYYY-MM-DD`` (dash separated, year first)

    Args:
        row: The date string, or None.

    Returns:
        A list ``[yyyy, mm, dd]`` of strings, or None when the value is
        missing or does not consist of exactly three parts in one of the
        recognised layouts. Returning None instead of raising keeps a single
        malformed value from failing the whole Spark job.
    """
    if row is None:
        return None

    if "/" in row:
        parts = row.split("/")
        if len(parts) != 3:
            return None
        mm, dd, yyyy = parts
    elif "-" in row:
        parts = row.split("-")
        if len(parts) != 3:
            return None
        yyyy, mm, dd = parts
    else:
        # No known separator: previously this fell through to an
        # UnboundLocalError on the return statement.
        return None

    return [yyyy, mm, dd]

In [23]:
# Spark UDF around splitUDF; its result is an array of strings.
datSplitterUDF = udf(lambda value: splitUDF(value), ArrayType(StringType()))

# Preview: split CRASH_DATE into `dt`, then expose its three elements as
# integer year / month / day columns.
(
    df
    .select(datSplitterUDF(df.CRASH_DATE).alias("dt"))
    .withColumn('year', col('dt')[0].cast('int'))
    .withColumn('month', col('dt')[1].cast('int'))
    .withColumn('day', col('dt')[2].cast('int'))
    .show()
)

+--------------+----+-----+---+
|            dt|year|month|day|
+--------------+----+-----+---+
|[2021, 04, 14]|2021|    4| 14|
|[2021, 04, 13]|2021|    4| 13|
|[2021, 04, 15]|2021|    4| 15|
|[2021, 04, 13]|2021|    4| 13|
|[2021, 04, 12]|2021|    4| 12|
|[2021, 04, 13]|2021|    4| 13|
|[2021, 04, 13]|2021|    4| 13|
|[2021, 04, 16]|2021|    4| 16|
|[2021, 04, 11]|2021|    4| 11|
|[2021, 04, 16]|2021|    4| 16|
|[2021, 04, 11]|2021|    4| 11|
|[2021, 04, 15]|2021|    4| 15|
|[2021, 04, 10]|2021|    4| 10|
|[2019, 05, 21]|2019|    5| 21|
|[2020, 01, 21]|2020|    1| 21|
|[2021, 02, 26]|2021|    2| 26|
|[2021, 03, 09]|2021|    3|  9|
|[2021, 03, 31]|2021|    3| 31|
|[2021, 04, 06]|2021|    4|  6|
|[2021, 04, 09]|2021|    4|  9|
+--------------+----+-----+---+
only showing top 20 rows



In [24]:
# List the column names of the raw DataFrame.
df.columns

['CRASH_DATE',
 'CRASH_TIME',
 'BOROUGH',
 'ZIP_CODE',
 'LATITUDE',
 'LONGITUDE',
 'LOCATION',
 'ON_STREET_NAME',
 'CROSS_STREET_NAME',
 'OFF_STREET_NAME',
 'NUMBER_OF_PERSONS_INJURED',
 'NUMBER_OF_PERSONS_KILLED',
 'NUMBER_OF_PEDESTRIANS_INJURED',
 'NUMBER_OF_PEDESTRIANS_KILLED',
 'NUMBER_OF_CYCLIST_INJURED',
 'NUMBER_OF_CYCLIST_KILLED',
 'NUMBER_OF_MOTORIST_INJURED',
 'NUMBER_OF_MOTORIST_KILLED',
 'CONTRIBUTING_FACTOR_VEHICLE_1',
 'CONTRIBUTING_FACTOR_VEHICLE_2',
 'CONTRIBUTING_FACTOR_VEHICLE_3',
 'CONTRIBUTING_FACTOR_VEHICLE_4',
 'CONTRIBUTING_FACTOR_VEHICLE_5',
 'COLLISION_ID',
 'VEHICLE_TYPE_CODE_1',
 'VEHICLE_TYPE_CODE_2',
 'VEHICLE_TYPE_CODE_3',
 'VEHICLE_TYPE_CODE_4',
 'VEHICLE_TYPE_CODE_5']

In [25]:
# Keep every original column and append the split date (`dt`) plus integer
# year / month / day columns derived from it.
ndf = (
    df
    .withColumn("dt", datSplitterUDF(df.CRASH_DATE))
    .withColumn('year', col('dt')[0].cast('int'))
    .withColumn('month', col('dt')[1].cast('int'))
    .withColumn('day', col('dt')[2].cast('int'))
)

In [26]:
# Show the first five rows without truncating values, including the new date columns.
ndf.show(5, truncate=False)

+----------+----------+--------+--------+--------+---------+---------------------+--------------------------------+-----------------+----------------------------------------+-------------------------+------------------------+-----------------------------+----------------------------+-------------------------+------------------------+--------------------------+-------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+------------+-----------------------------------+-------------------+-------------------+-------------------+-------------------+--------------+----+-----+---+
|CRASH_DATE|CRASH_TIME|BOROUGH |ZIP_CODE|LATITUDE|LONGITUDE|LOCATION             |ON_STREET_NAME                  |CROSS_STREET_NAME|OFF_STREET_NAME                         |NUMBER_OF_PERSONS_INJURED|NUMBER_OF_PERSONS_KILLED|NUMBER_OF_PEDESTRIANS_INJURED|NUMBER_OF_PEDESTRIANS_KILLED|NUMBER_OF_CYCLIST_

In [27]:
# Columns to remove from the working frame: the raw date string and the
# intermediate `dt` array, address/location text, the collision id, and the
# five numbered contributing-factor and vehicle-type columns.
drop_list = [
    'CRASH_DATE', 'BOROUGH', 'ZIP_CODE', 'LOCATION',
    'ON_STREET_NAME', 'CROSS_STREET_NAME', 'OFF_STREET_NAME',
    *(f'CONTRIBUTING_FACTOR_VEHICLE_{slot}' for slot in range(1, 6)),
    'COLLISION_ID',
    *(f'VEHICLE_TYPE_CODE_{slot}' for slot in range(1, 6)),
    'dt',
]

In [28]:
# Remove every column named in drop_list from the enriched frame.
slice_df = ndf.drop(*drop_list)

In [45]:
# Preview the remaining columns, one field per line, without truncation.
slice_df.show(5, truncate=False, vertical=True)

-RECORD 0----------------------------------
 CRASH_TIME                    | 5:32      
 LATITUDE                      | null      
 LONGITUDE                     | null      
 NUMBER_OF_PERSONS_INJURED     | 0.0       
 NUMBER_OF_PERSONS_KILLED      | 0.0       
 NUMBER_OF_PEDESTRIANS_INJURED | 0         
 NUMBER_OF_PEDESTRIANS_KILLED  | 0         
 NUMBER_OF_CYCLIST_INJURED     | 0         
 NUMBER_OF_CYCLIST_KILLED      | 0         
 NUMBER_OF_MOTORIST_INJURED    | 0         
 NUMBER_OF_MOTORIST_KILLED     | 0         
 year                          | 2021      
 month                         | 4         
 day                           | 14        
-RECORD 1----------------------------------
 CRASH_TIME                    | 21:35     
 LATITUDE                      | 40.68358  
 LONGITUDE                     | -73.97617 
 NUMBER_OF_PERSONS_INJURED     | 1.0       
 NUMBER_OF_PERSONS_KILLED      | 0.0       
 NUMBER_OF_PEDESTRIANS_INJURED | 1         
 NUMBER_OF_PEDESTRIANS_KILLED  |

In [30]:
# Row count before filtering out rows with missing coordinates.
slice_df.count()

1048575

In [31]:
# Keep only rows where both coordinates are present.
# The conditions reference slice_df's own columns; building them from the
# parent `df` only worked because slice_df happens to derive from it.
# NOTE(review): rows with the placeholder coordinates 0.0 / 0.0 are not null
# and therefore still pass this filter.
rm_na_loc_null = slice_df.filter(
    slice_df['LATITUDE'].isNotNull()
    & slice_df['LONGITUDE'].isNotNull()
)

In [32]:
# Row count after removing rows with a null latitude or longitude.
rm_na_loc_null.count()

972726

In [33]:
# Preview the first five rows that have coordinates.
rm_na_loc_null.show(5, truncate=False)

+----------+---------+----------+-------------------------+------------------------+-----------------------------+----------------------------+-------------------------+------------------------+--------------------------+-------------------------+----+-----+---+
|CRASH_TIME|LATITUDE |LONGITUDE |NUMBER_OF_PERSONS_INJURED|NUMBER_OF_PERSONS_KILLED|NUMBER_OF_PEDESTRIANS_INJURED|NUMBER_OF_PEDESTRIANS_KILLED|NUMBER_OF_CYCLIST_INJURED|NUMBER_OF_CYCLIST_KILLED|NUMBER_OF_MOTORIST_INJURED|NUMBER_OF_MOTORIST_KILLED|year|month|day|
+----------+---------+----------+-------------------------+------------------------+-----------------------------+----------------------------+-------------------------+------------------------+--------------------------+-------------------------+----+-----+---+
|21:35     |40.68358 |-73.97617 |1.0                      |0.0                     |1                            |0                           |0                        |0                       |0                

In [34]:

# Shorten the verbose source column names. The mapping is applied one rename
# at a time, starting from the filtered frame, so re-running the cell is safe.
rename_df = rm_na_loc_null
for long_name, short_name in {
    "CRASH_TIME": "time",
    "LATITUDE": "lat",
    "LONGITUDE": "lon",
    "NUMBER_OF_PERSONS_INJURED": "npi",
    "NUMBER_OF_PERSONS_KILLED": "npk",
    "NUMBER_OF_PEDESTRIANS_INJURED": "npedi",
    "NUMBER_OF_PEDESTRIANS_KILLED": "npedk",
    "NUMBER_OF_CYCLIST_INJURED": "nci",
    "NUMBER_OF_CYCLIST_KILLED": "nck",
    "NUMBER_OF_MOTORIST_INJURED": "nmi",
    "NUMBER_OF_MOTORIST_KILLED": "nmk",
}.items():
    rename_df = rename_df.withColumnRenamed(long_name, short_name)


In [35]:
# Preview the frame with the shortened column names.
rename_df.show(5, truncate=False)

+-----+---------+----------+---+---+-----+-----+---+---+---+---+----+-----+---+
|time |lat      |lon       |npi|npk|npedi|npedk|nci|nck|nmi|nmk|year|month|day|
+-----+---------+----------+---+---+-----+-----+---+---+---+---+----+-----+---+
|21:35|40.68358 |-73.97617 |1.0|0.0|1    |0    |0  |0  |0  |0  |2021|4    |13 |
|8:25 |0.0      |0.0       |0.0|0.0|0    |0    |0  |0  |0  |0  |2021|4    |12 |
|22:50|40.69754 |-73.98312 |0.0|0.0|0    |0    |0  |0  |0  |0  |2019|5    |21 |
|14:50|40.843464|-73.836   |0.0|0.0|0    |0    |0  |0  |0  |0  |2021|2    |26 |
|11:00|40.692547|-73.990974|1.0|0.0|0    |0    |0  |0  |1  |0  |2021|3    |9  |
+-----+---------+----------+---+---+-----+-----+---+---+---+---+----+-----+---+
only showing top 5 rows



In [36]:
# List the column names after renaming.
rename_df.columns

['time',
 'lat',
 'lon',
 'npi',
 'npk',
 'npedi',
 'npedk',
 'nci',
 'nck',
 'nmi',
 'nmk',
 'year',
 'month',
 'day']

In [37]:
# Put the date parts first, then time and position, then the casualty counts.
column_order = [
    "year", "month", "day",
    "time", "lat", "lon",
    "npi", "npk",
    "npedi", "npedk",
    "nci", "nck",
    "nmi", "nmk",
]
reorder_df = rename_df.select(*column_order)

In [38]:
# Preview the reordered frame.
reorder_df.show(5, truncate=False)

+----+-----+---+-----+---------+----------+---+---+-----+-----+---+---+---+---+
|year|month|day|time |lat      |lon       |npi|npk|npedi|npedk|nci|nck|nmi|nmk|
+----+-----+---+-----+---------+----------+---+---+-----+-----+---+---+---+---+
|2021|4    |13 |21:35|40.68358 |-73.97617 |1.0|0.0|1    |0    |0  |0  |0  |0  |
|2021|4    |12 |8:25 |0.0      |0.0       |0.0|0.0|0    |0    |0  |0  |0  |0  |
|2019|5    |21 |22:50|40.69754 |-73.98312 |0.0|0.0|0    |0    |0  |0  |0  |0  |
|2021|2    |26 |14:50|40.843464|-73.836   |0.0|0.0|0    |0    |0  |0  |0  |0  |
|2021|3    |9  |11:00|40.692547|-73.990974|1.0|0.0|0    |0    |0  |0  |1  |0  |
+----+-----+---+-----+---------+----------+---+---+-----+-----+---+---+---+---+
only showing top 5 rows



In [39]:
# Confirm the final column types (year/month/day are integers, time is still a string).
reorder_df.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- npi: double (nullable = true)
 |-- npk: double (nullable = true)
 |-- npedi: long (nullable = true)
 |-- npedk: long (nullable = true)
 |-- nci: long (nullable = true)
 |-- nck: long (nullable = true)
 |-- nmi: long (nullable = true)
 |-- nmk: long (nullable = true)



### Keep only the 2020 data

In [40]:
# Restrict the data to crashes whose year is 2020.
df_20 = reorder_df.where(reorder_df['year'] == 2020)

In [41]:
# Number of 2020 crash records.
df_20.count()

103989

In [44]:
# Summary statistics for the time column and the casualty counts of the
# 2020 data, printed one statistic per record.
summary_columns = [
    "time",
    "npi", "npk",
    "npedi", "npedk",
    "nci", "nck",
    "nmi", "nmk",
]
df_20.select(*summary_columns).describe().show(vertical=True)

-RECORD 0-----------------------
 summary | count                
 time    | 103989               
 npi     | 103989               
 npk     | 103989               
 npedi   | 103989               
 npedk   | 103989               
 nci     | 103989               
 nck     | 103989               
 nmi     | 103989               
 nmk     | 103989               
-RECORD 1-----------------------
 summary | mean                 
 time    | null                 
 npi     | 0.3909355797247786   
 npk     | 0.002288703612882... 
 npedi   | 0.06161228591485638  
 npedk   | 9.039417630710941E-4 
 nci     | 0.05073613555279886  
 nck     | 2.307936416351729... 
 nmi     | 0.2785871582571233   
 nmk     | 0.001153968208175... 
-RECORD 2-----------------------
 summary | stddev               
 time    | null                 
 npi     | 0.7416904881029619   
 npk     | 0.051280417306981005 
 npedi   | 0.2520113741506671   
 npedk   | 0.030370484449129483 
 nci     | 0.2243563481228609   
 nck     |