In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, length

# Define data and schema
CollisionRecords = [
    "20160924_CollisionRecords.txt",
    "20170112_CollisionRecords.txt",
    "20180925_CollisionRecords.txt",
    "20201024_CollisionRecords.txt"
]

collision_schema = StructType([
    StructField("CASE_ID", StringType(), True),
    StructField("ACCIDENT_YEAR", IntegerType(), True),
    StructField("PROC_DATE", StringType(), True),
    StructField("JURIS", StringType(), True),
    StructField("COLLISION_DATE", StringType(), True),
    StructField("COLLISION_TIME", StringType(), True),
    StructField("OFFICER_ID", StringType(), True),
    StructField("REPORTING_DISTRICT", StringType(), True),
    StructField("DAY_OF_WEEK", IntegerType(), True),
    StructField("CHP_SHIFT", StringType(), True),
    StructField("POPULATION", IntegerType(), True),
    StructField("CNTY_CITY_LOC", StringType(), True),
    StructField("SPECIAL_COND", StringType(), True),
    StructField("BEAT_TYPE", StringType(), True),
    StructField("CHP_BEAT_TYPE", StringType(), True),
    StructField("CITY_DIVISION_LAPD", StringType(), True),
    StructField("CHP_BEAT_CLASS", StringType(), True),
    StructField("BEAT_NUMBER", StringType(), True),
    StructField("PRIMARY_RD", StringType(), True),
    StructField("SECONDARY_RD", StringType(), True),
    StructField("DISTANCE", IntegerType(), True),
    StructField("DIRECTION", StringType(), True),
    StructField("INTERSECTION", StringType(), True),
    StructField("LATITUDE", DoubleType(), True),
    StructField("LONGITUDE", DoubleType(), True),
])

# Start Spark session
spark = SparkSession.builder.appName("CollisionRecords").getOrCreate()

# Load the data
collision_df = spark.read.csv(CollisionRecords, schema=collision_schema, header=False).cache()

# Drop rows where CASE_ID matches the header
header = collision_df.first().asDict()
collision_df = collision_df.filter(col("CASE_ID") != header["CASE_ID"])

#### Debug: Reduce dataset for testing ####
#collision_df = collision_df.limit(10)

# Step 1: Display non-numerical columns to the user
non_numerical_columns = [f.name for f in collision_schema.fields if isinstance(f.dataType, StringType)]
print("Non-Numerical Columns: ", non_numerical_columns)

# Step 2: Manually specify columns to drop (modify this list for your needs)
columns_to_drop = ["CHP_SHIFT", "CITY_DIVISION_LAPD", "SPECIAL_COND", "CITY_DIVISION_LAPD", "CHP_BEAT_CLASS", "BEAT_NUMBER" ]  # Example input

# Drop the specified columns
collision_df = collision_df.drop(*columns_to_drop)

# Step 3: Drop columns with all null values
columns_with_all_nulls = [col_name for col_name in collision_df.columns if collision_df.filter(col(col_name).isNotNull()).count() == 0]
collision_df = collision_df.drop(*columns_with_all_nulls)

# Handle nulls in critical columns
collision_df = collision_df.fillna({
    "ACCIDENT_YEAR": 0,
    "DAY_OF_WEEK": 0,
    "POPULATION": 0,
    "DISTANCE": 0
})

# Ensure DAY_OF_WEEK values are valid
collision_df = collision_df.filter((col("DAY_OF_WEEK") >= 1) & (col("DAY_OF_WEEK") <= 7))

#### Debug: Check for corrupted data ####
collision_df.filter(col("CASE_ID").rlike("[^a-zA-Z0-9]")).show(5)
collision_df.filter(length(col("CASE_ID")) > 50).show(5)


# Show cleaned data
print("Cleaned Data Info:")
collision_df.printSchema()
collision_df.show(10)


24/12/16 11:48:59 WARN Utils: Your hostname, Seans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.158 instead (on interface en0)
24/12/16 11:48:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/16 11:48:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/16 11:49:28 WARN MemoryStore: Not enough space to cache rdd_3_9 in memory! (computed 26.6 MiB so far)
24/12/16 11:49:29 WARN BlockManager: Persisting block rdd_3_9 to disk instead.
24/12/16 11:49:29 WARN MemoryStore: Not enough space to cache rdd_3_10 in memory! (computed 26.4 MiB so far)
24/12/16 11:49:29 WARN BlockManager: Persisting block rdd_3_10 to disk instead.
24/12/16 11:49:47 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far

Non-Numerical Columns:  ['CASE_ID', 'PROC_DATE', 'JURIS', 'COLLISION_DATE', 'COLLISION_TIME', 'OFFICER_ID', 'REPORTING_DISTRICT', 'CHP_SHIFT', 'CNTY_CITY_LOC', 'SPECIAL_COND', 'BEAT_TYPE', 'CHP_BEAT_TYPE', 'CITY_DIVISION_LAPD', 'CHP_BEAT_CLASS', 'BEAT_NUMBER', 'PRIMARY_RD', 'SECONDARY_RD', 'DIRECTION', 'INTERSECTION']


24/12/16 11:52:47 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_3_23 in memory.
24/12/16 11:52:47 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far)
24/12/16 11:52:47 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 13.6 MiB so far)
24/12/16 11:52:47 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 13.7 MiB so far)
24/12/16 11:52:47 WARN MemoryStore: Not enough space to cache rdd_3_23 in memory! (computed 384.0 B so far)
24/12/16 11:52:47 WARN MemoryStore: Not enough space to cache rdd_3_22 in memory! (computed 13.5 MiB so far)
24/12/16 11:52:47 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 13.6 MiB so far)
24/12/16 11:52:47 WARN MemoryStore: Not enough space to cache rdd_3_20 in memory! (computed 13.5 MiB so far)
24/12/16 11:52:47 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computin

+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
|CASE_ID|ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|DAY_OF_WEEK|POPULATION|CNTY_CITY_LOC|BEAT_TYPE|CHP_BEAT_TYPE|PRIMARY_RD|SECONDARY_RD|DISTANCE|DIRECTION|INTERSECTION|
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
| 449906|         2002| 20030206| 1942|      20021008|          1845|     21594|              1145|          2|         7|         1942|        0|            0| GILROY ST|RIVERSIDE DR|       0|     NULL|           Y|
| 097293|         2003| 20031223| 9435|      20031028|          1625|     15986|              NULL|          2|         9|         1

24/12/16 11:53:43 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 13.6 MiB so far)
24/12/16 11:53:43 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 13.7 MiB so far)
24/12/16 11:53:43 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far)
24/12/16 11:53:43 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 13.6 MiB so far)
24/12/16 11:53:43 WARN MemoryStore: Not enough space to cache rdd_3_16 in memory! (computed 25.5 MiB so far)
24/12/16 11:53:43 WARN MemoryStore: Not enough space to cache rdd_3_17 in memory! (computed 10.7 MiB so far)
24/12/16 11:53:43 WARN MemoryStore: Not enough space to cache rdd_3_18 in memory! (computed 11.9 MiB so far)
24/12/16 11:53:43 WARN MemoryStore: Not enough space to cache rdd_3_20 in memory! (computed 13.5 MiB so far)
24/12/16 11:53:43 WARN MemoryStore: Not enough space to cache rdd_3_21 in memory! (computed 13.6 MiB so far)
24/12/16 11:53:43 W

+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
|CASE_ID|ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|DAY_OF_WEEK|POPULATION|CNTY_CITY_LOC|BEAT_TYPE|CHP_BEAT_TYPE|PRIMARY_RD|SECONDARY_RD|DISTANCE|DIRECTION|INTERSECTION|
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+

Cleaned Data Info:
root
 |-- CASE_ID: string (nullable = true)
 |-- ACCIDENT_YEAR: integer (nullable = false)
 |-- PROC_DATE: strin

In [2]:


# Fix negative distance values 
collision_df = collision_df.filter(col("DISTANCE") >= 0)
collision_df.describe().show()



24/12/16 11:53:48 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/12/16 11:56:07 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 26.9 MiB so far)
24/12/16 11:57:06 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far)
24/12/16 11:57:16 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 13.6 MiB so far)
24/12/16 11:57:19 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 13.6 MiB so far)
24/12/16 11:57:21 WARN MemoryStore: Not enough space to cache rdd_3_16 in memory! (computed 13.2 MiB so far)
24/12/16 11:57:33 WARN MemoryStore: Not enough space to cache rdd_3_17 in memory! (computed 10.7 MiB so far)
24/12/16 11:57:43 WARN MemoryStore: Not enough space to cache rdd_3_18 in memory! (computed 11.9 MiB so far)
24/12/16 11:57:43 WARN MemoryStore: Not enough s

+-------+--------------------+------------------+--------------------+-----------------+--------------------+------------------+----------+------------------+------------------+-----------------+------------------+------------------+------------------+--------------------+--------------------+------------------+---------+------------+
|summary|             CASE_ID|     ACCIDENT_YEAR|           PROC_DATE|            JURIS|      COLLISION_DATE|    COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|       DAY_OF_WEEK|       POPULATION|     CNTY_CITY_LOC|         BEAT_TYPE|     CHP_BEAT_TYPE|          PRIMARY_RD|        SECONDARY_RD|          DISTANCE|DIRECTION|INTERSECTION|
+-------+--------------------+------------------+--------------------+-----------------+--------------------+------------------+----------+------------------+------------------+-----------------+------------------+------------------+------------------+--------------------+--------------------+------------------+---------+---

                                                                                

In [3]:
# Filter rows where PRIMARY_RD or SECONDARY_RD contains "..."
invalid_rd_cases = collision_df.filter(
    (col("PRIMARY_RD").contains("...")) | (col("SECONDARY_RD").contains("..."))
)

invalid_rd_cases.show(truncate=False)



24/12/16 12:18:47 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 13.7 MiB so far)
24/12/16 12:18:47 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 26.5 MiB so far)
24/12/16 12:18:47 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 13.6 MiB so far)
24/12/16 12:18:47 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far)
24/12/16 12:18:47 WARN MemoryStore: Not enough space to cache rdd_3_16 in memory! (computed 13.2 MiB so far)
24/12/16 12:18:48 WARN MemoryStore: Not enough space to cache rdd_3_19 in memory! (computed 13.5 MiB so far)
24/12/16 12:18:48 WARN MemoryStore: Not enough space to cache rdd_3_23 in memory! (computed 13.5 MiB so far)
24/12/16 12:18:48 WARN MemoryStore: Not enough space to cache rdd_3_22 in memory! (computed 13.5 MiB so far)
24/12/16 12:18:48 WARN MemoryStore: Not enough space to cache rdd_3_18 in memory! (computed 11.9 MiB so far)
24/12/16 12:18:48 W

+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
|CASE_ID|ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|DAY_OF_WEEK|POPULATION|CNTY_CITY_LOC|BEAT_TYPE|CHP_BEAT_TYPE|PRIMARY_RD|SECONDARY_RD|DISTANCE|DIRECTION|INTERSECTION|
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+



                                                                                

In [4]:


# remove all "not stated - values" and null
filterCondition = (col(collision_df.columns[0]) != "-")
filterCondition = filterCondition | (col(collision_df.columns[0]) != None)


for c in collision_df.columns[1:]:
    filterCondition = filterCondition | (col(c) != "-")
    filterCondition = filterCondition | (col(c) != None)

filtered_df = collision_df.filter(filterCondition)
filtered_df.head(5)

[Row(CASE_ID='0100010101011401155', ACCIDENT_YEAR=2001, PROC_DATE='20010416', JURIS='0100', COLLISION_DATE='20010101', COLLISION_TIME='0114', OFFICER_ID='1155', REPORTING_DISTRICT='0', DAY_OF_WEEK=1, POPULATION=4, CNTY_CITY_LOC='0198', BEAT_TYPE='0', CHP_BEAT_TYPE='0', PRIMARY_RD='DUBLIN BL', SECONDARY_RD='SCARLETT CT', DISTANCE=267, DIRECTION='W', INTERSECTION='N'),
 Row(CASE_ID='0100010103174503131', ACCIDENT_YEAR=2001, PROC_DATE='20010416', JURIS='0100', COLLISION_DATE='20010103', COLLISION_TIME='1745', OFFICER_ID='3131', REPORTING_DISTRICT='10', DAY_OF_WEEK=3, POPULATION=4, CNTY_CITY_LOC='0198', BEAT_TYPE='0', CHP_BEAT_TYPE='0', PRIMARY_RD='DOUGHERTY RD', SECONDARY_RD='AMADOR VLY BL', DISTANCE=80, DIRECTION='N', INTERSECTION='N'),
 Row(CASE_ID='0100010104134002415', ACCIDENT_YEAR=2001, PROC_DATE='20010608', JURIS='0100', COLLISION_DATE='20010104', COLLISION_TIME='1340', OFFICER_ID='2415', REPORTING_DISTRICT='0', DAY_OF_WEEK=4, POPULATION=4, CNTY_CITY_LOC='0198', BEAT_TYPE='0', CHP_

In [5]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
encoded_and_index = ["INTERSECTION","DIRECTION"]

indexers = [StringIndexer(inputCol=c, outputCol=c+"_index") for c in encoded_and_index]
encoders = [OneHotEncoder(inputCol=c+"_index", outputCol=c+"_vec") for c in encoded_and_index]

filtered_df2 = filtered_df

for col_name in encoded_and_index:
    
    indexer = StringIndexer(inputCol=col_name, outputCol=col_name + "_index")
    print(indexer)
    print(col_name)
    indexedDf = indexer.fit(filtered_df2).transform(filtered_df2)
   
    encoder = OneHotEncoder(inputCol=col_name + "_index", outputCol=col_name + "_vec")
    encoded_df = encoder.fit(indexedDf).transform(indexedDf)

encoded_df.show(truncate=False)

StringIndexer_4b269b4b6812
INTERSECTION


24/12/16 12:18:57 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 13.6 MiB so far)
24/12/16 12:18:57 WARN MemoryStore: Not enough space to cache rdd_3_21 in memory! (computed 13.6 MiB so far)
24/12/16 12:18:57 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far)
24/12/16 12:18:57 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 13.6 MiB so far)
24/12/16 12:18:57 WARN MemoryStore: Not enough space to cache rdd_3_22 in memory! (computed 13.5 MiB so far)
24/12/16 12:18:57 WARN MemoryStore: Not enough space to cache rdd_3_16 in memory! (computed 13.2 MiB so far)
24/12/16 12:18:57 WARN MemoryStore: Not enough space to cache rdd_3_19 in memory! (computed 13.5 MiB so far)
24/12/16 12:18:57 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 13.7 MiB so far)
24/12/16 12:18:57 WARN MemoryStore: Not enough space to cache rdd_3_20 in memory! (computed 13.5 MiB so far)
24/12/16 12:18:57 W

StringIndexer_d98534b42c86
DIRECTION


24/12/16 12:19:06 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far)
24/12/16 12:19:06 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 26.9 MiB so far)
24/12/16 12:19:06 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 13.6 MiB so far)
24/12/16 12:19:06 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_3_17 in memory.
24/12/16 12:19:06 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_3_15 in memory.
24/12/16 12:19:06 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 384.0 B so far)
24/12/16 12:19:06 WARN MemoryStore: Not enough space to cache rdd_3_17 in memory! (computed 384.0 B so far)
24/12/16 12:19:06 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_3_21 in memory.
24/12/16 12:19:06 WARN MemoryStore: Failed to reserve init

+-------------------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+---------------+---------------+--------+---------+------------+---------------+-------------+
|CASE_ID            |ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|DAY_OF_WEEK|POPULATION|CNTY_CITY_LOC|BEAT_TYPE|CHP_BEAT_TYPE|PRIMARY_RD     |SECONDARY_RD   |DISTANCE|DIRECTION|INTERSECTION|DIRECTION_index|DIRECTION_vec|
+-------------------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+---------------+---------------+--------+---------+------------+---------------+-------------+
|0100010101011401155|2001         |20010416 |0100 |20010101      |0114          |1155      |0                 |1          |4         |0198         |0        |0            |DUBLIN BL      |SCARLETT CT

In [6]:
# Save the cleaned Collision Records DataFrame as CSV
collision_df.write.csv("clean_collision_records.csv", header=True, mode="overwrite")
print("Collision Records saved successfully")

24/12/16 12:19:18 WARN MemoryStore: Not enough space to cache rdd_3_18 in memory! (computed 11.9 MiB so far)
24/12/16 12:19:18 WARN MemoryStore: Not enough space to cache rdd_3_16 in memory! (computed 13.2 MiB so far)
24/12/16 12:19:18 WARN MemoryStore: Not enough space to cache rdd_3_23 in memory! (computed 13.5 MiB so far)
24/12/16 12:19:18 WARN MemoryStore: Not enough space to cache rdd_3_19 in memory! (computed 13.5 MiB so far)
24/12/16 12:19:18 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 13.6 MiB so far)
24/12/16 12:19:18 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 13.6 MiB so far)
24/12/16 12:19:18 WARN MemoryStore: Not enough space to cache rdd_3_21 in memory! (computed 13.6 MiB so far)
24/12/16 12:19:18 WARN MemoryStore: Not enough space to cache rdd_3_22 in memory! (computed 13.5 MiB so far)
24/12/16 12:19:18 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far)
24/12/16 12:19:18 W

Collision Records saved successfully


                                                                                