In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col
# Local Spark session (two worker threads) used by every cell in this notebook.
spark = SparkSession.builder.master("local[2]").appName("CollisionRecords").getOrCreate()

# Path to the collision records file, resolved relative to the working directory.
CollisionRecords = "20160924_CollisionRecords.txt"

#Define Schema
# Explicit schema so no inference pass over the file is needed. Every column is
# nullable; identifiers, dates and codes are kept as strings, while counts and
# coordinates are typed as integers / doubles.
collision_schema = StructType([
    StructField("CASE_ID", StringType(), True),
    StructField("ACCIDENT_YEAR", IntegerType(), True),
    StructField("PROC_DATE", StringType(), True),
    StructField("JURIS", StringType(), True),
    StructField("COLLISION_DATE", StringType(), True),
    StructField("COLLISION_TIME", StringType(), True),
    StructField("OFFICER_ID", StringType(), True),
    StructField("REPORTING_DISTRICT", StringType(), True),
    StructField("DAY_OF_WEEK", IntegerType(), True),
    StructField("CHP_SHIFT", StringType(), True),
    StructField("POPULATION", IntegerType(), True),
    StructField("CNTY_CITY_LOC", StringType(), True),
    StructField("SPECIAL_COND", StringType(), True),
    StructField("BEAT_TYPE", StringType(), True),
    StructField("CHP_BEAT_TYPE", StringType(), True),
    StructField("CITY_DIVISION_LAPD", StringType(), True),
    StructField("CHP_BEAT_CLASS", StringType(), True),
    StructField("BEAT_NUMBER", StringType(), True),
    StructField("PRIMARY_RD", StringType(), True),
    StructField("SECONDARY_RD", StringType(), True),
    StructField("DISTANCE", IntegerType(), True),
    StructField("DIRECTION", StringType(), True),
    StructField("INTERSECTION", StringType(), True),
    StructField("LATITUDE", DoubleType(), True),
    StructField("LONGITUDE", DoubleType(), True),
])

#Load the data
# header=True lets the CSV reader skip the header line itself while still
# applying collision_schema. This replaces the previous manual approach
# (first() + CASE_ID != <header value> filter), which cost an extra Spark job,
# failed on an empty file (first() returns None) and silently dropped every
# row whose CASE_ID was NULL.
collision_df = spark.read.csv(path=CollisionRecords, schema=collision_schema, header=True).cache()

#Inspect the data
# Preview the first rows, then print summary statistics for every column.
collision_df.show(10)
collision_df.describe().show()



24/12/04 01:55:09 WARN CacheManager: Asked to cache already cached data.


+-------------------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+---------+----------+-------------+------------+---------+-------------+------------------+--------------+-----------+---------------+---------------+--------+---------+------------+--------+---------+
|            CASE_ID|ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|DAY_OF_WEEK|CHP_SHIFT|POPULATION|CNTY_CITY_LOC|SPECIAL_COND|BEAT_TYPE|CHP_BEAT_TYPE|CITY_DIVISION_LAPD|CHP_BEAT_CLASS|BEAT_NUMBER|     PRIMARY_RD|   SECONDARY_RD|DISTANCE|DIRECTION|INTERSECTION|LATITUDE|LONGITUDE|
+-------------------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+---------+----------+-------------+------------+---------+-------------+------------------+--------------+-----------+---------------+---------------+--------+---------+------------+--------+---------+
|0100010101011401155|    

24/12/04 01:56:15 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 26.9 MiB so far)
24/12/04 01:56:17 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far)
24/12/04 01:56:30 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 26.6 MiB so far)
24/12/04 01:56:33 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 13.6 MiB so far)
24/12/04 01:56:46 WARN MemoryStore: Not enough space to cache rdd_3_16 in memory! (computed 25.5 MiB so far)

+-------+--------------------+-----------------+--------------------+-----------------+--------------------+------------------+----------+------------------+------------------+------------------+------------------+------------------+--------------------+------------------+------------------+------------------+------------------+-----------+--------------------+--------------------+------------------+---------+------------+--------+---------+
|summary|             CASE_ID|    ACCIDENT_YEAR|           PROC_DATE|            JURIS|      COLLISION_DATE|    COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|       DAY_OF_WEEK|         CHP_SHIFT|        POPULATION|     CNTY_CITY_LOC|        SPECIAL_COND|         BEAT_TYPE|     CHP_BEAT_TYPE|CITY_DIVISION_LAPD|    CHP_BEAT_CLASS|BEAT_NUMBER|          PRIMARY_RD|        SECONDARY_RD|          DISTANCE|DIRECTION|INTERSECTION|LATITUDE|LONGITUDE|
+-------+--------------------+-----------------+--------------------+-----------------+--------------------+

                                                                                

In [27]:


# Keep only records whose DISTANCE is zero or positive.
# NOTE: this removes the offending rows rather than repairing them, and rows
# with a NULL DISTANCE are removed as well, because a comparison against NULL
# does not evaluate to true in a Spark filter.
non_negative_distance = col("DISTANCE") >= 0
collision_df = collision_df.where(non_negative_distance)

# Re-print the summary statistics for the filtered records.
collision_df.describe().show()



24/12/04 01:58:32 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 26.9 MiB so far)
24/12/04 01:58:34 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far)
24/12/04 01:58:46 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 26.6 MiB so far)
24/12/04 01:58:47 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 13.6 MiB so far)
24/12/04 01:58:59 WARN MemoryStore: Not enough space to cache rdd_3_16 in memory! (computed 25.5 MiB so far)

+-------+--------------------+------------------+--------------------+------------------+-------------------+------------------+----------+------------------+------------------+------------------+------------------+------------------+--------------------+------------------+------------------+------------------+------------------+-----------+--------------------+--------------------+------------------+---------+------------+--------+---------+
|summary|             CASE_ID|     ACCIDENT_YEAR|           PROC_DATE|             JURIS|     COLLISION_DATE|    COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|       DAY_OF_WEEK|         CHP_SHIFT|        POPULATION|     CNTY_CITY_LOC|        SPECIAL_COND|         BEAT_TYPE|     CHP_BEAT_TYPE|CITY_DIVISION_LAPD|    CHP_BEAT_CLASS|BEAT_NUMBER|          PRIMARY_RD|        SECONDARY_RD|          DISTANCE|DIRECTION|INTERSECTION|LATITUDE|LONGITUDE|
+-------+--------------------+------------------+--------------------+------------------+-----------------

                                                                                

In [35]:
# Collect the records in which either road-name column contains the literal
# three-character sequence "..." (plain substring match, not a pattern).
ellipsis_marker = "..."
primary_has_ellipsis = col("PRIMARY_RD").contains(ellipsis_marker)
secondary_has_ellipsis = col("SECONDARY_RD").contains(ellipsis_marker)
invalid_rd_cases = collision_df.where(primary_has_ellipsis | secondary_has_ellipsis)

# Show the matching rows without truncating long cell values.
invalid_rd_cases.show(truncate=False)



24/12/04 02:10:03 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 26.9 MiB so far)
24/12/04 02:10:03 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 27.1 MiB so far)
24/12/04 02:10:03 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 13.6 MiB so far)
24/12/04 02:10:04 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 26.5 MiB so far)
24/12/04 02:10:04 WARN MemoryStore: Not enough space to cache rdd_3_16 in memory! (computed 13.2 MiB so far)

+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+---------+----------+-------------+------------+---------+-------------+------------------+--------------+-----------+----------+------------+--------+---------+------------+--------+---------+
|CASE_ID|ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|DAY_OF_WEEK|CHP_SHIFT|POPULATION|CNTY_CITY_LOC|SPECIAL_COND|BEAT_TYPE|CHP_BEAT_TYPE|CITY_DIVISION_LAPD|CHP_BEAT_CLASS|BEAT_NUMBER|PRIMARY_RD|SECONDARY_RD|DISTANCE|DIRECTION|INTERSECTION|LATITUDE|LONGITUDE|
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+---------+----------+-------------+------------+---------+-------------+------------------+--------------+-----------+----------+------------+--------+---------+------------+--------+---------+
+-------+-------------+---------+-----+--------------+--------------+----------+-----

                                                                                