In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, length

# Define data and schema
CollisionRecords = [
    "20160924_CollisionRecords.txt",
    "20170112_CollisionRecords.txt",
    "20180925_CollisionRecords.txt",
    "20201024_CollisionRecords.txt"
]

collision_schema = StructType([
    StructField("CASE_ID", StringType(), True),
    StructField("ACCIDENT_YEAR", IntegerType(), True),
    StructField("PROC_DATE", StringType(), True),
    StructField("JURIS", StringType(), True),
    StructField("COLLISION_DATE", StringType(), True),
    StructField("COLLISION_TIME", StringType(), True),
    StructField("OFFICER_ID", StringType(), True),
    StructField("REPORTING_DISTRICT", StringType(), True),
    StructField("DAY_OF_WEEK", IntegerType(), True),
    StructField("CHP_SHIFT", StringType(), True),
    StructField("POPULATION", IntegerType(), True),
    StructField("CNTY_CITY_LOC", StringType(), True),
    StructField("SPECIAL_COND", StringType(), True),
    StructField("BEAT_TYPE", StringType(), True),
    StructField("CHP_BEAT_TYPE", StringType(), True),
    StructField("CITY_DIVISION_LAPD", StringType(), True),
    StructField("CHP_BEAT_CLASS", StringType(), True),
    StructField("BEAT_NUMBER", StringType(), True),
    StructField("PRIMARY_RD", StringType(), True),
    StructField("SECONDARY_RD", StringType(), True),
    StructField("DISTANCE", IntegerType(), True),
    StructField("DIRECTION", StringType(), True),
    StructField("INTERSECTION", StringType(), True),
    StructField("LATITUDE", DoubleType(), True),
    StructField("LONGITUDE", DoubleType(), True),
])

# Start Spark session
spark = (SparkSession.builder
         .config("spark.executor.memory", "4g")  # Increase executor memory
         .config("spark.executor.cores", "2")  # Number of cores per executor
         .config("spark.driver.memory", "4g")  # Increase driver memory
         .appName("CollisionRecords").getOrCreate())

# Load the data
collision_df = spark.read.csv(CollisionRecords, schema=collision_schema, header=False).cache()

# Drop rows where CASE_ID matches the header
header = collision_df.first()
collision_df = collision_df.filter(col("CASE_ID") != header["CASE_ID"])

#### Debug: Reduce dataset for testing ####
#collision_df = collision_df.limit(10)

# Step 1: Display non-numerical columns to the user
non_numerical_columns = [f.name for f in collision_schema.fields if isinstance(f.dataType, StringType)]
print("Non-Numerical Columns: ", non_numerical_columns)

# Step 2: Manually specify columns to drop (modify this list for your needs)
columns_to_drop = ["CHP_SHIFT", "CITY_DIVISION_LAPD", "SPECIAL_COND", "CITY_DIVISION_LAPD", "CHP_BEAT_CLASS", "BEAT_NUMBER" ]  # Example input

# Drop the specified columns
collision_df = collision_df.drop(*columns_to_drop)

# Step 3: Drop columns with all null values
columns_with_all_nulls = [col_name for col_name in collision_df.columns if collision_df.filter(col(col_name).isNotNull()).count() == 0]
collision_df = collision_df.drop(*columns_with_all_nulls)

# Handle nulls in critical columns
collision_df = collision_df.fillna({
    "ACCIDENT_YEAR": 0,
    "DAY_OF_WEEK": 0,
    "POPULATION": 0,
    "DISTANCE": 0
})

# Ensure DAY_OF_WEEK values are valid
collision_df = collision_df.filter((col("DAY_OF_WEEK") >= 1) & (col("DAY_OF_WEEK") <= 7))

#### Debug: Check for corrupted data ####
collision_df.filter(col("CASE_ID").rlike("[^a-zA-Z0-9]")).show(5)
collision_df.filter(length(col("CASE_ID")) > 50).show(5)


# Show cleaned data
print("Cleaned Data Info:")
collision_df.printSchema()
collision_df.show(10)


24/12/16 16:21:07 WARN Utils: Your hostname, Seans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.158 instead (on interface en0)
24/12/16 16:21:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/16 16:21:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/16 16:23:14 WARN MemoryStore: Not enough space to cache rdd_3_62 in memory! (computed 25.2 MiB so far)
24/12/16 16:23:14 WARN BlockManager: Persisting block rdd_3_62 to disk instead.
24/12/16 16:23:16 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)
24/12/16 16:23:16 WARN BlockManager: Persisting block rdd_3_65 to disk instead.
24/12/16 16:23:19 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so f

Non-Numerical Columns:  ['CASE_ID', 'PROC_DATE', 'JURIS', 'COLLISION_DATE', 'COLLISION_TIME', 'OFFICER_ID', 'REPORTING_DISTRICT', 'CHP_SHIFT', 'CNTY_CITY_LOC', 'SPECIAL_COND', 'BEAT_TYPE', 'CHP_BEAT_TYPE', 'CITY_DIVISION_LAPD', 'CHP_BEAT_CLASS', 'BEAT_NUMBER', 'PRIMARY_RD', 'SECONDARY_RD', 'DIRECTION', 'INTERSECTION']


24/12/16 16:23:22 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)
24/12/16 16:23:24 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)
24/12/16 16:23:25 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)
24/12/16 16:23:26 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)
24/12/16 16:23:27 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)
24/12/16 16:23:28 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)
24/12/16 16:23:29 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)
24/12/16 16:23:30 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)
24/12/16 16:23:31 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)
24/12/16 16:23:32 W

+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
|CASE_ID|ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|DAY_OF_WEEK|POPULATION|CNTY_CITY_LOC|BEAT_TYPE|CHP_BEAT_TYPE|PRIMARY_RD|SECONDARY_RD|DISTANCE|DIRECTION|INTERSECTION|
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
| 449906|         2002| 20030206| 1942|      20021008|          1845|     21594|              1145|          2|         7|         1942|        0|            0| GILROY ST|RIVERSIDE DR|       0|     NULL|           Y|
| 097293|         2003| 20031223| 9435|      20031028|          1625|     15986|              NULL|          2|         9|         1

24/12/16 16:23:46 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)
                                                                                

+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
|CASE_ID|ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|DAY_OF_WEEK|POPULATION|CNTY_CITY_LOC|BEAT_TYPE|CHP_BEAT_TYPE|PRIMARY_RD|SECONDARY_RD|DISTANCE|DIRECTION|INTERSECTION|
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+

Cleaned Data Info:
root
 |-- CASE_ID: string (nullable = true)
 |-- ACCIDENT_YEAR: integer (nullable = false)
 |-- PROC_DATE: strin

In [2]:


# Fix negative distance values 
collision_df = collision_df.filter(col("DISTANCE") >= 0)
collision_df.describe().show()



24/12/16 16:23:46 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/12/16 16:25:03 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)

+-------+--------------------+------------------+--------------------+-----------------+--------------------+------------------+----------+------------------+------------------+-----------------+------------------+------------------+------------------+--------------------+--------------------+------------------+---------+------------+
|summary|             CASE_ID|     ACCIDENT_YEAR|           PROC_DATE|            JURIS|      COLLISION_DATE|    COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|       DAY_OF_WEEK|       POPULATION|     CNTY_CITY_LOC|         BEAT_TYPE|     CHP_BEAT_TYPE|          PRIMARY_RD|        SECONDARY_RD|          DISTANCE|DIRECTION|INTERSECTION|
+-------+--------------------+------------------+--------------------+-----------------+--------------------+------------------+----------+------------------+------------------+-----------------+------------------+------------------+------------------+--------------------+--------------------+------------------+---------+---

                                                                                

In [3]:
# Filter rows where PRIMARY_RD or SECONDARY_RD contains "..."
invalid_rd_cases = collision_df.filter(
    (col("PRIMARY_RD").contains("...")) | (col("SECONDARY_RD").contains("..."))
)

invalid_rd_cases.show(truncate=False)



24/12/16 16:25:18 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)

+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
|CASE_ID|ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|DAY_OF_WEEK|POPULATION|CNTY_CITY_LOC|BEAT_TYPE|CHP_BEAT_TYPE|PRIMARY_RD|SECONDARY_RD|DISTANCE|DIRECTION|INTERSECTION|
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+



                                                                                

In [4]:
#count number of rows in collision_df
print("Number of rows in collision_df: ", collision_df.count())

# remove all "not stated" values and null
filtered_df = collision_df.dropna()

filtered_df.take(5)

#count number of rows in filtered_df
print("Number of rows in filtered_df: ", filtered_df.count())

24/12/16 16:25:19 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)
                                                                                

Number of rows in collision_df:  28539312


24/12/16 16:25:23 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)

Number of rows in filtered_df:  7813108


                                                                                

In [5]:
# Save the cleaned Collision Records DataFrame as CSV
filtered_df.write.csv("clean_collision_records.csv", header=True, mode="overwrite")
print("Collision Records saved successfully")

#close spark session
spark.stop()

24/12/16 16:25:31 WARN MemoryStore: Not enough space to cache rdd_3_65 in memory! (computed 28.8 MiB so far)
                                                                                

Collision Records saved successfully
