In [1]:
# 📦 Import Libraries
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# 📥 Load Cleaned Dataset
df = pd.read_csv("../data/cleaned_data/nyc_collisions_cleaned.csv")

# 🧠 Feature Engineering
# errors="coerce" turns unparseable times/dates into NaT, so the derived
# columns below hold NaN for those rows instead of raising.
crash_times = pd.to_datetime(df["crash_time"], format="%H:%M", errors="coerce")
df["crash_hour"] = crash_times.dt.hour

# NOTE(review): dayfirst=True presumes day-before-month date strings —
# confirm against the cleaned CSV.
crash_dates = pd.to_datetime(df["crash_date"], dayfirst=True, errors="coerce")
df["crash_date"] = crash_dates
df["day_of_week"] = crash_dates.dt.day_name()
df["year"] = crash_dates.dt.year

# 🎯 Define Target Column for Classification
# 1 when the injured count is above zero, otherwise 0. A missing count
# compares False against 0 and therefore also maps to 0.
injured_counts = df["number_of_persons_injured"]
df["is_injury"] = injured_counts.gt(0).astype("int64")

# ✅ Feature Selection
# Raw input columns used by the model and the name of the binary target.
features = ["borough", "contributing_factor_vehicle_1", "vehicle_type_code1", "crash_hour", "day_of_week", "year"]
target = "is_injury"

# Keep only rows where every feature is present. The explicit .copy() makes
# df_model an independent frame: it is modified by column assignment further
# down, and without the copy pandas emits SettingWithCopyWarning and it is
# ambiguous whether `df` is affected.
df_model = df.dropna(subset=features).copy()

# 🔁 Simplify Categorical Variables
# Keep the 10 most frequent values of each high-cardinality column and fold
# everything else into a single "Other" bucket.
factor_values = df_model["contributing_factor_vehicle_1"]
vehicle_values = df_model["vehicle_type_code1"]

top_factors = factor_values.value_counts().nlargest(10).index
top_vehicles = vehicle_values.value_counts().nlargest(10).index

# Series.where keeps entries where the mask is True and substitutes "Other" elsewhere.
df_model["contributing_factor_vehicle_1"] = factor_values.where(factor_values.isin(top_factors), "Other")
df_model["vehicle_type_code1"] = vehicle_values.where(vehicle_values.isin(top_vehicles), "Other")

# 🔤 Encode Categorical Variables
# One independent LabelEncoder per categorical column, so each keeps its own
# class-to-integer mapping.
le_borough, le_factor, le_vehicle, le_day = (LabelEncoder() for _ in range(4))

# (encoded column, encoder, raw column) — processed in this order.
encoding_plan = (
    ("borough_enc", le_borough, "borough"),
    ("factor_enc", le_factor, "contributing_factor_vehicle_1"),
    ("vehicle_enc", le_vehicle, "vehicle_type_code1"),
    ("day_enc", le_day, "day_of_week"),
)

for encoded_col, encoder, raw_col in encoding_plan:
    df_model[encoded_col] = encoder.fit_transform(df_model[raw_col])

# 🧪 Define X and y
# Model inputs: the integer-encoded categoricals plus the numeric hour and year.
model_columns = ["borough_enc", "factor_enc", "vehicle_enc", "crash_hour", "day_enc", "year"]
X = df_model[model_columns]
y = df_model[target]

# 🔀 Train-Test Split
# 80/20 hold-out with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified on y.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# 🤖 Train Balanced Classifier
# class_weight='balanced' reweights classes inversely to their frequency in
# the training labels; the seed keeps the forest reproducible.
rf_params = {
    "n_estimators": 150,
    "max_depth": 10,
    "class_weight": 'balanced',
    "random_state": 42,
}
clf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# 🔍 Evaluate Classifier
# Hard predictions feed accuracy and the per-class report; the probability
# column at index 1 feeds ROC AUC.
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
report = classification_report(y_test, y_pred)

print("📊 Classification Metrics for is_injury:")
print(f"Accuracy  : {accuracy:.4f}")
print(f"ROC AUC   : {roc_auc:.4f}")
print("Detailed Report:\n", report)

# 💾 Save the Classifier Model
# Create the directory that is actually written to. Creating "models" in the
# current working directory would leave "../models" missing, so joblib.dump
# would fail on a fresh checkout.
model_dir = "../models"
os.makedirs(model_dir, exist_ok=True)

# NOTE(review): only the classifier is persisted here; the fitted
# LabelEncoders and the top-10 category lists are not saved by this cell.
joblib.dump(clf, f"{model_dir}/classifier_is_injury.pkl")
print("✅ Classifier saved as 'models/classifier_is_injury.pkl'")


📊 Classification Metrics for is_injury:
Accuracy  : 0.6335
ROC AUC   : 0.6924
Detailed Report:
               precision    recall  f1-score   support

           0       0.76      0.63      0.69     43206
           1       0.49      0.64      0.56     24268

    accuracy                           0.63     67474
   macro avg       0.62      0.63      0.62     67474
weighted avg       0.66      0.63      0.64     67474

✅ Classifier saved as 'models/classifier_is_injury.pkl'
