In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, length
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Define data and schema
# Raw collision-record extracts to load; paths are resolved relative to the
# working directory of the Spark driver.
CollisionRecords = [
    "20160924_CollisionRecords.txt",
    "20170112_CollisionRecords.txt",
    "20180925_CollisionRecords.txt",
    "20201024_CollisionRecords.txt"
]

# Explicit schema (every field nullable) so Spark does not have to infer types.
collision_schema = StructType([
    StructField("CASE_ID", StringType(), True),
    StructField("ACCIDENT_YEAR", IntegerType(), True),
    StructField("PROC_DATE", StringType(), True),
    StructField("JURIS", StringType(), True),
    StructField("COLLISION_DATE", StringType(), True),
    StructField("COLLISION_TIME", StringType(), True),
    StructField("OFFICER_ID", StringType(), True),
    StructField("REPORTING_DISTRICT", StringType(), True),
    StructField("DAY_OF_WEEK", IntegerType(), True),
    StructField("CHP_SHIFT", StringType(), True),
    StructField("POPULATION", IntegerType(), True),
    StructField("CNTY_CITY_LOC", StringType(), True),
    StructField("SPECIAL_COND", StringType(), True),
    StructField("BEAT_TYPE", StringType(), True),
    StructField("CHP_BEAT_TYPE", StringType(), True),
    StructField("CITY_DIVISION_LAPD", StringType(), True),
    StructField("CHP_BEAT_CLASS", StringType(), True),
    StructField("BEAT_NUMBER", StringType(), True),
    StructField("PRIMARY_RD", StringType(), True),
    StructField("SECONDARY_RD", StringType(), True),
    StructField("DISTANCE", IntegerType(), True),
    StructField("DIRECTION", StringType(), True),
    StructField("INTERSECTION", StringType(), True),
    StructField("LATITUDE", DoubleType(), True),
    StructField("LONGITUDE", DoubleType(), True),
])

# Start Spark session
spark = SparkSession.builder.appName("CollisionRecords").getOrCreate()

# Load the data
# header=True makes Spark skip the first line of every input file, so header
# rows never enter the DataFrame. This avoids reading the header as data and
# then filtering on whatever row first() happens to return, which costs an
# extra job and silently depends on row ordering.
# NOTE(review): assumes each input file starts with a header line - confirm.
collision_df = spark.read.csv(CollisionRecords, schema=collision_schema, header=True).cache()

# Drop rows that have no CASE_ID. The previous header filter
# (CASE_ID != <header value>) discarded these implicitly because a comparison
# with null is never true; keep that behaviour, but make it explicit.
collision_df = collision_df.filter(col("CASE_ID").isNotNull())

#### Debug: Reduce dataset for testing ####
#collision_df = collision_df.limit(10)

# Step 1: Display non-numerical columns to the user
non_numerical_columns = [f.name for f in collision_schema.fields if isinstance(f.dataType, StringType)]
print("Non-Numerical Columns: ", non_numerical_columns)

# Step 2: Manually specify columns to drop (modify this list for your needs)
columns_to_drop = ["CHP_SHIFT", "CITY_DIVISION_LAPD", "SPECIAL_COND", "CHP_BEAT_CLASS", "BEAT_NUMBER"]  # Example input

# Drop the specified columns
collision_df = collision_df.drop(*columns_to_drop)

# Step 3: Drop columns with all null values
# count(column) counts only non-null values, so a single aggregation yields the
# non-null tally for every column in one pass instead of one Spark job per column.
non_null_counts = collision_df.select(
    [count(col(col_name)).alias(col_name) for col_name in collision_df.columns]
).first().asDict()
columns_with_all_nulls = [col_name for col_name, non_nulls in non_null_counts.items() if non_nulls == 0]
collision_df = collision_df.drop(*columns_with_all_nulls)

# Handle nulls in critical columns
# Missing integers become 0. For DAY_OF_WEEK this means such rows are removed
# by the range filter below, since 0 is outside 1-7.
collision_df = collision_df.fillna({
    "ACCIDENT_YEAR": 0,
    "DAY_OF_WEEK": 0,
    "POPULATION": 0,
    "DISTANCE": 0
})

# Ensure DAY_OF_WEEK values are valid
collision_df = collision_df.filter((col("DAY_OF_WEEK") >= 1) & (col("DAY_OF_WEEK") <= 7))

#### Debug: Check for corrupted data ####
# CASE_IDs containing anything other than ASCII letters/digits, or longer than
# 50 characters, are printed for manual inspection (nothing is removed here).
collision_df.filter(col("CASE_ID").rlike("[^a-zA-Z0-9]")).show(5)
collision_df.filter(length(col("CASE_ID")) > 50).show(5)


# Show cleaned data
print("Cleaned Data Info:")
collision_df.printSchema()
collision_df.show(10)


ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [2]:


# Discard rows whose DISTANCE is negative (the rows are removed, not corrected),
# then print summary statistics for what remains.
collision_df = collision_df.where("DISTANCE >= 0")
collision_df.describe().show()



24/12/16 14:28:52 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/12/16 14:29:20 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 26.9 MiB so far)
24/12/16 14:29:20 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 13.6 MiB so far)
24/12/16 14:29:20 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far)
24/12/16 14:29:20 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_3_18 in memory.
24/12/16 14:29:20 WARN MemoryStore: Not enough space to cache rdd_3_20 in memory! (computed 13.5 MiB so far)
24/12/16 14:29:20 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 13.6 MiB so far)
24/12/16 14:29:20 WARN MemoryStore: Not enough space to cache rdd_3_18 in memory! (computed 384.0 B so far)
24/12/16 14:29:20 WARN Me

+-------+--------------------+------------------+--------------------+-----------------+--------------------+------------------+----------+------------------+------------------+-----------------+------------------+------------------+------------------+--------------------+--------------------+------------------+---------+------------+
|summary|             CASE_ID|     ACCIDENT_YEAR|           PROC_DATE|            JURIS|      COLLISION_DATE|    COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|       DAY_OF_WEEK|       POPULATION|     CNTY_CITY_LOC|         BEAT_TYPE|     CHP_BEAT_TYPE|          PRIMARY_RD|        SECONDARY_RD|          DISTANCE|DIRECTION|INTERSECTION|
+-------+--------------------+------------------+--------------------+-----------------+--------------------+------------------+----------+------------------+------------------+-----------------+------------------+------------------+------------------+--------------------+--------------------+------------------+---------+---

                                                                                

In [3]:
# Collect rows in which either road-name column contains a literal "..."
# substring, so they can be inspected.
invalid_rd_cases = collision_df.where(
    col("PRIMARY_RD").contains("...") | col("SECONDARY_RD").contains("...")
)

# truncate=False prints full cell values instead of shortening long strings.
invalid_rd_cases.show(truncate=False)



24/12/16 14:30:50 WARN MemoryStore: Not enough space to cache rdd_3_16 in memory! (computed 13.2 MiB so far)
24/12/16 14:30:50 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 13.7 MiB so far)
24/12/16 14:30:50 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 13.6 MiB so far)
24/12/16 14:30:50 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 13.6 MiB so far)
24/12/16 14:30:50 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 27.1 MiB so far)
24/12/16 14:30:51 WARN MemoryStore: Not enough space to cache rdd_3_17 in memory! (computed 20.9 MiB so far)
24/12/16 14:30:51 WARN MemoryStore: Not enough space to cache rdd_3_18 in memory! (computed 11.9 MiB so far)
24/12/16 14:30:51 WARN MemoryStore: Not enough space to cache rdd_3_19 in memory! (computed 13.5 MiB so far)
24/12/16 14:30:51 WARN MemoryStore: Not enough space to cache rdd_3_20 in memory! (computed 13.5 MiB so far)
24/12/16 14:30:51 W

+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
|CASE_ID|ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|DAY_OF_WEEK|POPULATION|CNTY_CITY_LOC|BEAT_TYPE|CHP_BEAT_TYPE|PRIMARY_RD|SECONDARY_RD|DISTANCE|DIRECTION|INTERSECTION|
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+



                                                                                

In [8]:
# Row count before incomplete records are removed
print("Number of rows in collision_df: ", collision_df.count())

# Remove every row that has a null in any column.
# NOTE: dropna() only removes nulls; textual placeholders such as "not stated"
# are NOT removed by this step and would need an explicit filter.
filtered_df = collision_df.dropna()

# Preview a few surviving rows. show() prints them; a bare take(5) in the
# middle of a cell runs a Spark job but its result is discarded.
filtered_df.show(5)

# Row count after removing incomplete records
print("Number of rows in filtered_df: ", filtered_df.count())

24/12/16 14:33:34 WARN MemoryStore: Not enough space to cache rdd_3_18 in memory! (computed 11.9 MiB so far)
24/12/16 14:33:34 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 13.7 MiB so far)
24/12/16 14:33:34 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 13.6 MiB so far)
24/12/16 14:33:34 WARN MemoryStore: Not enough space to cache rdd_3_19 in memory! (computed 13.5 MiB so far)
24/12/16 14:33:34 WARN MemoryStore: Not enough space to cache rdd_3_22 in memory! (computed 13.5 MiB so far)
24/12/16 14:33:34 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far)
24/12/16 14:33:34 WARN MemoryStore: Not enough space to cache rdd_3_20 in memory! (computed 13.5 MiB so far)
24/12/16 14:33:34 WARN MemoryStore: Not enough space to cache rdd_3_21 in memory! (computed 13.6 MiB so far)
24/12/16 14:33:34 WARN MemoryStore: Not enough space to cache rdd_3_16 in memory! (computed 13.2 MiB so far)
24/12/16 14:33:34 W

Number of rows in collision_df:  28539312


24/12/16 14:33:38 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far)
24/12/16 14:33:38 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 26.9 MiB so far)
24/12/16 14:33:38 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 13.6 MiB so far)
24/12/16 14:33:38 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 13.6 MiB so far)
24/12/16 14:33:38 WARN MemoryStore: Not enough space to cache rdd_3_17 in memory! (computed 10.7 MiB so far)
24/12/16 14:33:38 WARN MemoryStore: Not enough space to cache rdd_3_18 in memory! (computed 11.9 MiB so far)
24/12/16 14:33:38 WARN MemoryStore: Not enough space to cache rdd_3_16 in memory! (computed 13.2 MiB so far)
24/12/16 14:33:38 WARN MemoryStore: Not enough space to cache rdd_3_19 in memory! (computed 13.5 MiB so far)
24/12/16 14:33:38 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_3_20 in mem

Number of rows in filtered_df:  7813108


                                                                                

In [6]:
# Write the cleaned collision records out as CSV with a header row, replacing
# any output left at the same path by an earlier run.
(
    filtered_df.write
    .mode("overwrite")
    .option("header", True)
    .csv("clean_collision_records.csv")
)
print("Collision Records saved successfully")

24/12/16 14:31:16 WARN MemoryStore: Not enough space to cache rdd_3_13 in memory! (computed 13.9 MiB so far)
24/12/16 14:31:16 WARN MemoryStore: Not enough space to cache rdd_3_12 in memory! (computed 26.9 MiB so far)
24/12/16 14:31:16 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_3_16 in memory.
24/12/16 14:31:16 WARN MemoryStore: Not enough space to cache rdd_3_17 in memory! (computed 10.7 MiB so far)
24/12/16 14:31:16 WARN MemoryStore: Not enough space to cache rdd_3_16 in memory! (computed 384.0 B so far)
24/12/16 14:31:16 WARN MemoryStore: Not enough space to cache rdd_3_19 in memory! (computed 13.5 MiB so far)
24/12/16 14:31:16 WARN MemoryStore: Not enough space to cache rdd_3_14 in memory! (computed 13.6 MiB so far)
24/12/16 14:31:16 WARN MemoryStore: Not enough space to cache rdd_3_15 in memory! (computed 13.6 MiB so far)
24/12/16 14:31:16 WARN MemoryStore: Not enough space to cache rdd_3_18 in memory! (computed 11.9 MiB so f

Collision Records saved successfully
