In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc,count,row_number,col
from pyspark.sql.window import Window

In [2]:
spark = SparkSession.builder.master('local').appName("Spark DataFrame").getOrCreate()

In [3]:
df_person = spark.read.options(inferSchema='True', header='True', delimiter=',').csv("./data/Primary_Person_use.csv")
df_person.printSchema()

root
 |-- CRASH_ID: integer (nullable = true)
 |-- UNIT_NBR: integer (nullable = true)
 |-- PRSN_NBR: integer (nullable = true)
 |-- PRSN_TYPE_ID: string (nullable = true)
 |-- PRSN_OCCPNT_POS_ID: string (nullable = true)
 |-- PRSN_INJRY_SEV_ID: string (nullable = true)
 |-- PRSN_AGE: string (nullable = true)
 |-- PRSN_ETHNICITY_ID: string (nullable = true)
 |-- PRSN_GNDR_ID: string (nullable = true)
 |-- PRSN_EJCT_ID: string (nullable = true)
 |-- PRSN_REST_ID: string (nullable = true)
 |-- PRSN_AIRBAG_ID: string (nullable = true)
 |-- PRSN_HELMET_ID: string (nullable = true)
 |-- PRSN_SOL_FL: string (nullable = true)
 |-- PRSN_ALC_SPEC_TYPE_ID: string (nullable = true)
 |-- PRSN_ALC_RSLT_ID: string (nullable = true)
 |-- PRSN_BAC_TEST_RSLT: string (nullable = true)
 |-- PRSN_DRG_SPEC_TYPE_ID: string (nullable = true)
 |-- PRSN_DRG_RSLT_ID: string (nullable = true)
 |-- DRVR_DRG_CAT_1_ID: string (nullable = true)
 |-- PRSN_DEATH_TIME: timestamp (nullable = true)
 |-- INCAP_INJRY_CNT: 

# Analysis 1

Find the number of crashes (accidents) in which number of persons killed are male?

In [172]:
df_person.filter(col("PRSN_GNDR_ID")=="MALE").count()

96782

# Analysis 2

How many two wheelers are booked for crashes?

In [173]:
df_person.select("PRSN_HELMET_ID").filter(~col("PRSN_HELMET_ID").isin(["NOT APPLICABLE"])).count()

911

# Analysis 3

Which state has highest number of accidents in which females are involved?

In [4]:
df_unit = spark.read.options(inferSchema='True',header='True', delimiter=',').csv("./data/Units_use.csv")
df_unit.printSchema()

root
 |-- CRASH_ID: integer (nullable = true)
 |-- UNIT_NBR: integer (nullable = true)
 |-- UNIT_DESC_ID: string (nullable = true)
 |-- VEH_PARKED_FL: string (nullable = true)
 |-- VEH_HNR_FL: string (nullable = true)
 |-- VEH_LIC_STATE_ID: string (nullable = true)
 |-- VIN: string (nullable = true)
 |-- VEH_MOD_YEAR: string (nullable = true)
 |-- VEH_COLOR_ID: string (nullable = true)
 |-- VEH_MAKE_ID: string (nullable = true)
 |-- VEH_MOD_ID: string (nullable = true)
 |-- VEH_BODY_STYL_ID: string (nullable = true)
 |-- EMER_RESPNDR_FL: string (nullable = true)
 |-- OWNR_ZIP: string (nullable = true)
 |-- FIN_RESP_PROOF_ID: string (nullable = true)
 |-- FIN_RESP_TYPE_ID: string (nullable = true)
 |-- VEH_DMAG_AREA_1_ID: string (nullable = true)
 |-- VEH_DMAG_SCL_1_ID: string (nullable = true)
 |-- FORCE_DIR_1_ID: string (nullable = true)
 |-- VEH_DMAG_AREA_2_ID: string (nullable = true)
 |-- VEH_DMAG_SCL_2_ID: string (nullable = true)
 |-- FORCE_DIR_2_ID: string (nullable = true)
 |--

In [5]:
df2 = df_unit.join(df_person, "CRASH_ID","inner").select("CRASH_ID","PRSN_GNDR_ID","VEH_LIC_STATE_ID").filter(col("PRSN_GNDR_ID")=="FEMALE").groupby("VEH_LIC_STATE_ID").count()
df2 = df2.sort(col('count').desc())
print(df2.take(1)[0])

Row(VEH_LIC_STATE_ID='TX', count=127640)


# Analysis 4

Which are the Top 5th to 15th VEH_MAKE_IDs that contribute to a largest number of injuries including death?

In [6]:
df4 = df_unit.withColumn("Total_Injuries_death",col("TOT_INJRY_CNT")+col("DEATH_CNT")).select("VEH_MAKE_ID","TOT_INJRY_CNT","DEATH_CNT","Total_Injuries_death").orderBy('Total_Injuries_death', ascending=False)
for i, data in enumerate(df4.collect()):
    if i>=5 and i<=15:
        print(data[0])

BLUE BIRD
NORTH AMERICAN BUS
NORTH AMERICAN BUS
LINCOLN
CHEVROLET
VAN HOOL
VAN HOOL
MAZDA
MCI (LES AUTO BUS)
MCI (LES AUTO BUS)
NISSAN


# Analysis 5

For all the body styles involved in crashes, mention the top ethnic user group of each unique body style?

In [180]:
df2 = df_unit.join(df_person, "CRASH_ID","inner")\
    .select("CRASH_ID","VEH_BODY_STYL_ID","PRSN_ETHNICITY_ID")\
    .groupby("VEH_BODY_STYL_ID","PRSN_ETHNICITY_ID").agg(count("PRSN_ETHNICITY_ID").alias("num_crashes"))

win = Window.partitionBy("VEH_BODY_STYL_ID").orderBy(desc("num_crashes"))
df2 = df2.select("*", row_number().over(win).alias("RowNum")).filter(col("RowNum")==1).drop("RowNum")
df2.show(20, False)

+---------------------------------+-----------------+-----------+
|VEH_BODY_STYL_ID                 |PRSN_ETHNICITY_ID|num_crashes|
+---------------------------------+-----------------+-----------+
|AMBULANCE                        |WHITE            |97         |
|BUS                              |HISPANIC         |391        |
|FARM EQUIPMENT                   |WHITE            |63         |
|FIRE TRUCK                       |WHITE            |112        |
|MOTORCYCLE                       |WHITE            |848        |
|NA                               |WHITE            |5693       |
|NEV-NEIGHBORHOOD ELECTRIC VEHICLE|WHITE            |10         |
|NOT REPORTED                     |HISPANIC         |2          |
|OTHER  (EXPLAIN IN NARRATIVE)    |WHITE            |459        |
|PASSENGER CAR, 2-DOOR            |WHITE            |9877       |
|PASSENGER CAR, 4-DOOR            |WHITE            |58312      |
|PICKUP                           |WHITE            |38609      |
|POLICE CA

# Analysis 6

Among the crashed cars, what are the Top 5 Zip Codes with highest number crashes with alcohols as the contributing factor to a crash (Use Driver Zip Code)?

In [181]:
filter_string = ["ALCOHOL","DRINKING"]
df6 = df_unit.join(df_person, "CRASH_ID","inner"). \
    where(col("CONTRIB_FACTR_1_ID").rlike('|'.join(filter_string)) | col("CONTRIB_FACTR_2_ID").rlike('|'.join(filter_string))). \
    groupby("DRVR_ZIP").count().orderBy(col("count").desc()).filter(col("DRVR_ZIP") != "null").limit(5)
df6 = df6.withColumnRenamed("count","Highest_number_crashes_with_alcohols")
df6.show()

+--------+------------------------------------+
|DRVR_ZIP|Highest_number_crashes_with_alcohols|
+--------+------------------------------------+
|   78521|                                  78|
|   76010|                                  77|
|   75067|                                  70|
|   78753|                                  61|
|   78741|                                  57|
+--------+------------------------------------+



# Analysis 7

Count of Distinct Crash IDs where No Damaged Property was observed and Damage Level (VEH_DMAG_SCL~) is above 4 and car avails Insurance

In [7]:
df_damage = spark.read.options(inferSchema='True',header='True', delimiter=',').csv("./data/Damages_use.csv")
df_damage.printSchema()

root
 |-- CRASH_ID: integer (nullable = true)
 |-- DAMAGED_PROPERTY: string (nullable = true)



In [9]:
df7 = df_damage.join(df_unit, on=["CRASH_ID"], how='inner'). \
    where(((df_unit.VEH_DMAG_SCL_1_ID > "DAMAGED 4")) | ((df_unit.VEH_DMAG_SCL_2_ID > "DAMAGED 4"))). \
    where(df_damage.DAMAGED_PROPERTY == "NONE"). \
    where(df_unit.FIN_RESP_TYPE_ID == "PROOF OF LIABILITY INSURANCE")
display(df7.show())

+--------+----------------+--------+-------------+-------------+----------+----------------+-----------------+------------+------------+-----------+--------------+--------------------+---------------+--------+-----------------+--------------------+--------------------+-----------------+--------------+------------------+-----------------+--------------+------------------+--------------------+--------------------+--------------------+------------------+-------------------+---------------+---------------------+---------------+------------------+--------------+-------------+--------------+-------------+---------+
|CRASH_ID|DAMAGED_PROPERTY|UNIT_NBR| UNIT_DESC_ID|VEH_PARKED_FL|VEH_HNR_FL|VEH_LIC_STATE_ID|              VIN|VEH_MOD_YEAR|VEH_COLOR_ID|VEH_MAKE_ID|    VEH_MOD_ID|    VEH_BODY_STYL_ID|EMER_RESPNDR_FL|OWNR_ZIP|FIN_RESP_PROOF_ID|    FIN_RESP_TYPE_ID|  VEH_DMAG_AREA_1_ID|VEH_DMAG_SCL_1_ID|FORCE_DIR_1_ID|VEH_DMAG_AREA_2_ID|VEH_DMAG_SCL_2_ID|FORCE_DIR_2_ID|VEH_INVENTORIED_FL|     VEH_T

None

# Analysis 8

Determine the Top 5 Vehicle Makes where drivers are charged with speeding related offences, has licensed Drivers, used top 10 used vehicle colours and has car licensed with the Top 25 states with highest number of offences (to be deduced from the data)

In [10]:
df_charge = spark.read.options(inferSchema = 'True',header = 'True', delimiter = ',').csv("./data/Charges_use.csv")
df_charge.printSchema()

root
 |-- CRASH_ID: integer (nullable = true)
 |-- UNIT_NBR: integer (nullable = true)
 |-- PRSN_NBR: integer (nullable = true)
 |-- CHARGE: string (nullable = true)
 |-- CITATION_NBR: string (nullable = true)



In [11]:
top_25_state_list_df = df_unit.groupby("VEH_LIC_STATE_ID").count().orderBy(col("count").desc()). \
    filter(col("VEH_LIC_STATE_ID").cast('int').isNull()).limit(25)
top_25_state_list = top_25_state_list_df.select("VEH_LIC_STATE_ID").rdd.flatMap(lambda x: x).collect()

top_10_used_vehicle_colors_df = df_unit.groupby("VEH_COLOR_ID").count().orderBy(col("count").desc()). \
    where(col("VEH_COLOR_ID").cast('int').isNull()). \
    where(col("VEH_COLOR_ID") != "NA").limit(10)

top_10_used_vehicle_colors = top_10_used_vehicle_colors_df.select("VEH_COLOR_ID").rdd.flatMap(lambda x: x).collect()

In [184]:
df8 = df_charge.join(df_person, 'CRASH_ID','inner'). \
    join(df_unit, 'CRASH_ID', 'inner'). \
    filter(df_charge.CHARGE.contains("SPEED")). \
    filter(df_person.DRVR_LIC_TYPE_ID.isin(["DRIVER LICENSE", "COMMERCIAL DRIVER LIC."])). \
    filter(df_unit.VEH_COLOR_ID.isin(top_10_used_vehicle_colors)). \
    filter(df_unit.VEH_LIC_STATE_ID.isin(top_25_state_list)). \
    groupby("VEH_MAKE_ID").count(). \
    orderBy(col("count").desc()).withColumnRenamed("count","Highest_number_of_offences").limit(5)
df8.show()

+-----------+--------------------------+
|VEH_MAKE_ID|Highest_number_of_offences|
+-----------+--------------------------+
|       FORD|                     19205|
|  CHEVROLET|                     16860|
|     TOYOTA|                     11822|
|      DODGE|                      7935|
|     NISSAN|                      7332|
+-----------+--------------------------+

