In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col


# Initialize Spark Session
spark = (SparkSession.builder
         .appName("Collision_Victim_Analysis")
         .config("spark.executor.memory", "4g")  # Increase executor memory
         .config("spark.executor.cores", "2")  # Number of cores per executor
         .config("spark.driver.memory", "4g")  # Increase driver memory
         .getOrCreate())
# Load cleaned collision data
collision_df = spark.read.csv("clean_collision_records.csv", header=True, inferSchema=True)

# Load cleaned victim data
victim_df = spark.read.csv("clean_victim_records.csv", header=True, inferSchema=True)

# Verify loaded data
collision_df.show(5)
victim_df.show(5)


24/12/16 15:28:00 WARN Utils: Your hostname, Seans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.158 instead (on interface en0)
24/12/16 15:28:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/16 15:28:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/16 15:28:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/16 15:28:02 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
                                                                                

+------------------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+---------------+---------------+--------+---------+------------+
|           CASE_ID|ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|DAY_OF_WEEK|POPULATION|CNTY_CITY_LOC|BEAT_TYPE|CHP_BEAT_TYPE|     PRIMARY_RD|   SECONDARY_RD|DISTANCE|DIRECTION|INTERSECTION|
+------------------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+---------------+---------------+--------+---------+------------+
|100010101011401155|         2001| 20010416| 0100|      20010101|           114|      1155|                 0|          1|         4|          198|        0|            0|      DUBLIN BL|    SCARLETT CT|     267|        W|           N|
|100010103174503131|         2001| 20010416| 0100|      

In [2]:

from pyspark.sql.functions import when

# Join the datasets on CASE_ID
combined_df = collision_df.join(victim_df, "CASE_ID", "inner")


# Show the joined data
combined_df.show(5)
print("Total records after join:", combined_df.count())

# change victim degree of injury to binary values
combined_df = combined_df.withColumn("INJURY_SEVERITY_BINARY", when((combined_df["VICTIM_DEGREE_OF_INJURY"] == 1) | (combined_df["VICTIM_DEGREE_OF_INJURY"] == 2), 1).otherwise(0))

# drop reporting district column - unused
cols_to_drop = ['REPORTING_DISTRICT']
combined_df = combined_df.drop(*cols_to_drop)


24/12/16 15:28:21 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+------------+-----------+----------+----------+-----------------------+-----------------------+--------------------+--------------------+--------------+
|CASE_ID|ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|REPORTING_DISTRICT|DAY_OF_WEEK|POPULATION|CNTY_CITY_LOC|BEAT_TYPE|CHP_BEAT_TYPE|PRIMARY_RD|SECONDARY_RD|DISTANCE|DIRECTION|INTERSECTION|PARTY_NUMBER|VICTIM_ROLE|VICTIM_SEX|VICTIM_AGE|VICTIM_DEGREE_OF_INJURY|VICTIM_SEATING_POSITION|VICTIM_SAFETY_EQUIP1|VICTIM_SAFETY_EQUIP2|VICTIM_EJECTED|
+-------+-------------+---------+-----+--------------+--------------+----------+------------------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+------------+-----------+----------+----------



Total records after join: 18710630


                                                                                

In [3]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# Categorical columns to encode
encoded_columns = ["VICTIM_DEGREE_OF_INJURY","VICTIM_SEATING_POSITION","VICTIM_SAFETY_EQUIP2","VICTIM_EJECTED", "DIRECTION", "INTERSECTION"]

# Index and encode categorical columns
indexers = [StringIndexer(inputCol=c, outputCol=c+"_index", handleInvalid='skip') for c in encoded_columns]
encoders = [OneHotEncoder(inputCol=c+"_index", outputCol=c+"_vec") for c in encoded_columns]

# Assemble features into a single column
#feature_columns = ["DAY_OF_WEEK", "COLLISION_TIME", "VICTIM_AGE", "VICTIM_SEX", "DIRECTION_vec"]

# Create the assembler
#assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Combine all transformations into a Pipeline
encoded_pipeline = Pipeline(stages=indexers + encoders)

# Transform the data
encoded_df = encoded_pipeline.fit(combined_df).transform(combined_df)

encoded_df.show(5, truncate=False)


# Select final features and target
#final_df = prepared_df.select("features", "VICTIM_DEGREE_OF_INJURY_index")

# Show the resulting DataFrame
#final_df.show(5, truncate=False)


[Stage 79:>                                                         (0 + 1) / 1]

+-------+-------------+---------+-----+--------------+--------------+----------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+------------+-----------+----------+----------+-----------------------+-----------------------+--------------------+--------------------+--------------+----------------------+-----------------------------+-----------------------------+--------------------------+--------------------+---------------+------------------+---------------------------+---------------------------+------------------------+------------------+-------------+----------------+
|CASE_ID|ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|DAY_OF_WEEK|POPULATION|CNTY_CITY_LOC|BEAT_TYPE|CHP_BEAT_TYPE|PRIMARY_RD|SECONDARY_RD|DISTANCE|DIRECTION|INTERSECTION|PARTY_NUMBER|VICTIM_ROLE|VICTIM_SEX|VICTIM_AGE|VICTIM_DEGREE_OF_INJURY|VICTIM_SEATING_POSITION|VICTIM_SAFETY_EQUIP1|VICTIM_SAFETY_EQUIP2|VICTIM_EJECTED|INJU

                                                                                

In [4]:
print("Columns in Combined DataFrame:")
print(combined_df.columns)

print("Columns in Encoded DataFrame:")
print(encoded_df.columns)

print("number of records in Encoded DataFrame:", encoded_df.count())
encoded_df = encoded_df.dropna()



print("number of records in Encoded DataFrame:", encoded_df.count())



Columns in Combined DataFrame:
['CASE_ID', 'ACCIDENT_YEAR', 'PROC_DATE', 'JURIS', 'COLLISION_DATE', 'COLLISION_TIME', 'OFFICER_ID', 'DAY_OF_WEEK', 'POPULATION', 'CNTY_CITY_LOC', 'BEAT_TYPE', 'CHP_BEAT_TYPE', 'PRIMARY_RD', 'SECONDARY_RD', 'DISTANCE', 'DIRECTION', 'INTERSECTION', 'PARTY_NUMBER', 'VICTIM_ROLE', 'VICTIM_SEX', 'VICTIM_AGE', 'VICTIM_DEGREE_OF_INJURY', 'VICTIM_SEATING_POSITION', 'VICTIM_SAFETY_EQUIP1', 'VICTIM_SAFETY_EQUIP2', 'VICTIM_EJECTED', 'INJURY_SEVERITY_BINARY']
Columns in Encoded DataFrame:
['CASE_ID', 'ACCIDENT_YEAR', 'PROC_DATE', 'JURIS', 'COLLISION_DATE', 'COLLISION_TIME', 'OFFICER_ID', 'DAY_OF_WEEK', 'POPULATION', 'CNTY_CITY_LOC', 'BEAT_TYPE', 'CHP_BEAT_TYPE', 'PRIMARY_RD', 'SECONDARY_RD', 'DISTANCE', 'DIRECTION', 'INTERSECTION', 'PARTY_NUMBER', 'VICTIM_ROLE', 'VICTIM_SEX', 'VICTIM_AGE', 'VICTIM_DEGREE_OF_INJURY', 'VICTIM_SEATING_POSITION', 'VICTIM_SAFETY_EQUIP1', 'VICTIM_SAFETY_EQUIP2', 'VICTIM_EJECTED', 'INJURY_SEVERITY_BINARY', 'VICTIM_DEGREE_OF_INJURY_index', 

                                                                                

number of records in Encoded DataFrame: 18710617




number of records in Encoded DataFrame: 18710613


                                                                                

In [5]:
from pyspark.ml.classification import LogisticRegression
# Create a Logistic Regression model to predict injury severity (1 is fatal/severe, 0 is non-fatal/severe), and assemble the features into a single column
logistic_assembler = VectorAssembler(inputCols=["VICTIM_SEATING_POSITION_vec", "VICTIM_SAFETY_EQUIP2_vec", "VICTIM_EJECTED_vec", "DIRECTION_vec", "INTERSECTION_vec"], outputCol="features")
log_output = logistic_assembler.transform(encoded_df)
log_output.show(5)
log_reg1_data = log_output.select("features", "INJURY_SEVERITY_BINARY")
log_reg1_data.show(5)


# Split the data into training and test sets
train_data, test_data = log_reg1_data.randomSplit([0.7, 0.3], seed=7122)

# fit the model
log_reg1 = LogisticRegression(labelCol="INJURY_SEVERITY_BINARY", featuresCol="features")
log_reg1_model = log_reg1.fit(train_data)

# print summaries
log_reg1_summary = log_reg1_model.summary
log_reg1_summary.predictions.show(5)
log_reg1_summary.predictions.describe().show()






                                                                                

+-------+-------------+---------+-----+--------------+--------------+----------+-----------+----------+-------------+---------+-------------+----------+------------+--------+---------+------------+------------+-----------+----------+----------+-----------------------+-----------------------+--------------------+--------------------+--------------+----------------------+-----------------------------+-----------------------------+--------------------------+--------------------+---------------+------------------+---------------------------+---------------------------+------------------------+------------------+-------------+----------------+--------------------+
|CASE_ID|ACCIDENT_YEAR|PROC_DATE|JURIS|COLLISION_DATE|COLLISION_TIME|OFFICER_ID|DAY_OF_WEEK|POPULATION|CNTY_CITY_LOC|BEAT_TYPE|CHP_BEAT_TYPE|PRIMARY_RD|SECONDARY_RD|DISTANCE|DIRECTION|INTERSECTION|PARTY_NUMBER|VICTIM_ROLE|VICTIM_SEX|VICTIM_AGE|VICTIM_DEGREE_OF_INJURY|VICTIM_SEATING_POSITION|VICTIM_SAFETY_EQUIP1|VICTIM_SAFETY_EQUIP

                                                                                

+--------------------+----------------------+
|            features|INJURY_SEVERITY_BINARY|
+--------------------+----------------------+
|(62,[1,19,43,54,5...|                     0|
|(62,[2,19,43,54,5...|                     0|
|(62,[0,19,43,54,5...|                     0|
|(62,[1,19,43,54,5...|                     0|
|(62,[1,19,43,54,5...|                     0|
+--------------------+----------------------+
only showing top 5 rows



24/12/16 15:32:24 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/12/16 15:32:24 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

+--------------------+----------------------+--------------------+--------------------+----------+
|            features|INJURY_SEVERITY_BINARY|       rawPrediction|         probability|prediction|
+--------------------+----------------------+--------------------+--------------------+----------+
|(62,[0,18,43,53,5...|                   0.0|[3.97072530812820...|[0.98148935703703...|       0.0|
|(62,[0,18,43,53,5...|                   0.0|[3.97072530812820...|[0.98148935703703...|       0.0|
|(62,[0,18,43,53,5...|                   0.0|[3.97072530812820...|[0.98148935703703...|       0.0|
|(62,[0,18,43,53,5...|                   0.0|[3.97072530812820...|[0.98148935703703...|       0.0|
|(62,[0,18,43,53,5...|                   0.0|[3.97072530812820...|[0.98148935703703...|       0.0|
+--------------------+----------------------+--------------------+--------------------+----------+
only showing top 5 rows





+-------+----------------------+--------------------+
|summary|INJURY_SEVERITY_BINARY|          prediction|
+-------+----------------------+--------------------+
|  count|              13100117|            13100117|
|   mean|  0.031275751201306067|3.461037790731182E-4|
| stddev|    0.1740620030359064| 0.01860064562476226|
|    min|                   0.0|                 0.0|
|    max|                   1.0|                 1.0|
+-------+----------------------+--------------------+



                                                                                

In [6]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the Logistic Regression model
log_reg1_evaluator = BinaryClassificationEvaluator(labelCol="INJURY_SEVERITY_BINARY", rawPredictionCol="prediction")
log_reg1_auc = log_reg1_evaluator.evaluate(log_reg1_summary.predictions)

print("Logistic Regression AUC: ", log_reg1_auc)


                                                                                

Logistic Regression AUC:  0.5033323048446299
