In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
import matplotlib.pyplot as plt

# Initialize Spark Session
# Resource settings handed to the session builder, one key/value pair per entry.
spark_settings = {
    "spark.executor.memory": "4g",
    "spark.executor.cores": "2",
    "spark.driver.memory": "4g",
}

session_builder = SparkSession.builder.appName("Collision_Victim_Analysis")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

# getOrCreate() hands back an existing session when there is one, otherwise it starts a new one.
spark = session_builder.getOrCreate()

# Load data
# The first line of each CSV is read as the header and column types are inferred by Spark.
csv_options = {"header": True, "inferSchema": True}
collision_df = spark.read.csv("clean_collision_records.csv", **csv_options)
victim_df = spark.read.csv("clean_victim_records.csv", **csv_options)

# Join datasets
# Inner join on CASE_ID: only cases present in both inputs are kept.
combined_df = collision_df.join(victim_df, on="CASE_ID", how="inner")

# Feature Engineering
# INJURY_SEVERITY_BINARY is 1 when VICTIM_DEGREE_OF_INJURY equals 1 or 2 and 0
# otherwise (a null degree also falls through to 0). REPORTING_DISTRICT is dropped.
combined_df = combined_df.withColumn(
    "INJURY_SEVERITY_BINARY",
    when((col("VICTIM_DEGREE_OF_INJURY") == 1) | (col("VICTIM_DEGREE_OF_INJURY") == 2), 1).otherwise(0)
).drop("REPORTING_DISTRICT")

# Fill missing values
encoded_columns = ["VICTIM_SEATING_POSITION", "VICTIM_SAFETY_EQUIP2", "VICTIM_EJECTED",
                   "DIRECTION", "VICTIM_SEX", "PRIMARY_RD", "SECONDARY_RD", "VICTIM_ROLE"]
numerical_columns = ["COLLISION_TIME"]

# The CSVs are read with inferSchema=True, so a categorical column holding numeric
# codes may be typed as a number. A string replacement cannot fill a non-string
# column, which would leave its nulls in place; the downstream StringIndexer
# (handleInvalid='skip') would then silently drop those rows. Casting to string
# first guarantees "Unknown" is applied to every categorical column.
# NOTE(review): which columns are actually inferred as numeric depends on the data.
for col_name in encoded_columns:
    combined_df = combined_df.withColumn(col_name, col(col_name).cast("string"))

# Dict form keeps one replacement value per column name.
combined_df = combined_df.fillna({col_name: "Unknown" for col_name in encoded_columns})
combined_df = combined_df.fillna({col_name: 0 for col_name in numerical_columns})

# Define Pipeline
# Derived column names for each categorical input: "<name>_index" for the
# StringIndexer output and "<name>_vec" for the OneHotEncoder output.
index_names = {name: f"{name}_index" for name in encoded_columns}
vector_names = {name: f"{name}_vec" for name in encoded_columns}

# One indexer/encoder pair per categorical column; handleInvalid='skip' makes the
# indexer filter out rows it considers invalid instead of raising.
indexers = [
    StringIndexer(inputCol=name, outputCol=index_names[name], handleInvalid='skip')
    for name in encoded_columns
]
encoders = [
    OneHotEncoder(inputCol=index_names[name], outputCol=vector_names[name])
    for name in encoded_columns
]

# Numerical columns are packed into a single vector, then standardised
# (mean-centred and scaled by standard deviation).
assembler_numerical = VectorAssembler(inputCols=numerical_columns, outputCol="numerical_features")
scaler = StandardScaler(
    inputCol="numerical_features",
    outputCol="scaled_numerical_features",
    withMean=True,
    withStd=True,
)

# Final feature vector: scaled numerical block first, then every one-hot vector.
final_inputs = ["scaled_numerical_features"] + [vector_names[name] for name in encoded_columns]
assembler_final = VectorAssembler(inputCols=final_inputs, outputCol="features")

stages = [*indexers, *encoders, assembler_numerical, scaler, assembler_final]
pipeline = Pipeline(stages=stages)

# Prepare Data
# The pipeline is fitted on, and applied to, the full joined frame; only the
# assembled features and the VICTIM_AGE label are kept for modelling.
prepared_df = pipeline.fit(combined_df).transform(combined_df)
final_df = prepared_df.select("features", "VICTIM_AGE")

# Train-Test Split
# Random split with weights 0.8 (train) / 0.2 (test) and a fixed seed.
split_weights = [0.8, 0.2]
train_data, test_data = final_df.randomSplit(split_weights, seed=42)

# Train Linear Regression Model
# VICTIM_AGE is the regression label; predictions are written to "prediction".
lr = LinearRegression(
    featuresCol="features",
    labelCol="VICTIM_AGE",
    predictionCol="prediction",
)
lr_model = lr.fit(train_data)

# Evaluate Model
predictions = lr_model.transform(test_data)

# Compute every metric the same way so none of them is left un-evaluated.
# Previously `mae` was bound to the RegressionEvaluator object itself (the
# .evaluate(predictions) call was missing), so "MAE:" printed an object repr
# instead of a number.
metric_values = {
    metric_name: RegressionEvaluator(
        labelCol="VICTIM_AGE",
        predictionCol="prediction",
        metricName=metric_name,
    ).evaluate(predictions)
    for metric_name in ("r2", "rmse", "mae")
}
r2 = metric_values["r2"]
rmse = metric_values["rmse"]
mae = metric_values["mae"]

print("R2:", r2)
print("RMSE:", rmse)
print("MAE:", mae)

# Visualize Predictions
# Both columns are collected in a single action so that every actual value stays
# paired with its own prediction. Collecting them in two separate jobs re-runs
# the whole plan twice, and Spark does not guarantee that two runs of an
# unordered plan return rows in the same order, which could mis-pair the points.
plot_rows = predictions.select("VICTIM_AGE", "prediction").collect()
actual = [row["VICTIM_AGE"] for row in plot_rows]
predicted = [row["prediction"] for row in plot_rows]

fig, ax = plt.subplots()
ax.scatter(actual, predicted, alpha=0.5)
ax.set_xlabel("Actual VICTIM_AGE")
ax.set_ylabel("Predicted VICTIM_AGE")
ax.set_title("Actual vs Predicted VICTIM_AGE")
plt.show()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/18 23:15:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/18 23:15:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/18 23:16:51 WARN DAGScheduler: Broadcasting large task binary with size 1947.3 KiB
24/12/18 23:17:00 WARN DAGScheduler: Broadcasting large task binary with size 1995.5 KiB
24/12/18 23:17:04 WARN DAGScheduler: Broadcasting large task binary with size 1987.9 KiB
24/12/18 23:17:05 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB
24/12/18 23:17:17 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB
24/12/18 23:17:21 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB
24/12/18 23:17:22 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB
24/12/18 23:17:34

Py4JError: An error occurred while calling o1159.fit