In [1]:
# 📦 Import Libraries
import pandas as pd
import numpy as np
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, fbeta_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# 📥 Load Cleaned Dataset
df = pd.read_csv("../data/cleaned_data/nyc_collisions_cleaned.csv")

# 🧠 Feature Engineering
df["crash_hour"] = pd.to_datetime(df["crash_time"], format="%H:%M", errors="coerce").dt.hour
df["crash_date"] = pd.to_datetime(df["crash_date"], dayfirst=True, errors="coerce")
df["day_of_week"] = df["crash_date"].dt.day_name()
df["month"] = df["crash_date"].dt.month
df["year"] = df["crash_date"].dt.year
df["is_weekend"] = df["day_of_week"].isin(["Saturday", "Sunday"]).astype(int)
df["is_peak_hour"] = df["crash_hour"].between(7, 10) | df["crash_hour"].between(16, 19)

# 🎯 Target Variable
df["is_fatal"] = df["number_of_persons_killed"].apply(lambda x: 1 if x > 0 else 0)

# ✅ Feature Selection
features = [
    "borough", "contributing_factor_vehicle_1", "vehicle_type_code1",
    "crash_hour", "day_of_week", "month", "year", "is_weekend", "is_peak_hour"
]
target = "is_fatal"
df_model = df.dropna(subset=features)

# 🔁 Simplify Categorical Values
top_factors = df_model["contributing_factor_vehicle_1"].value_counts().nlargest(10).index
top_vehicles = df_model["vehicle_type_code1"].value_counts().nlargest(10).index
df_model["contributing_factor_vehicle_1"] = df_model["contributing_factor_vehicle_1"].apply(lambda x: x if x in top_factors else "Other")
df_model["vehicle_type_code1"] = df_model["vehicle_type_code1"].apply(lambda x: x if x in top_vehicles else "Other")

# 🔤 Encode Categorical Variables
encoders = {}
for col in ["borough", "contributing_factor_vehicle_1", "vehicle_type_code1", "day_of_week"]:
    le = LabelEncoder()
    df_model[col + "_enc"] = le.fit_transform(df_model[col])
    encoders[col] = le

# 🧪 Prepare Final Features
X = df_model[[
    "borough_enc", "contributing_factor_vehicle_1_enc", "vehicle_type_code1_enc",
    "crash_hour", "day_of_week_enc", "year", "month", "is_weekend", "is_peak_hour"
]]

y = df_model[target]

# 🔄 Apply SMOTE
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y)

# 🔀 Split Data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 🌲 Train Random Forest Classifier
clf = RandomForestClassifier(n_estimators=200, max_depth=12, random_state=42)
clf.fit(X_train, y_train)

# 🔍 Predictions
y_prob = clf.predict_proba(X_test)[:, 1]

# 🎯 Threshold Tuning
threshold = 0.25  # Adjusted threshold
y_pred = (y_prob >= threshold).astype(int)

# 📊 Evaluation
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
f2 = fbeta_score(y_test, y_pred, beta=2)
report = classification_report(y_test, y_pred)

print("📊 Classification Metrics for is_fatal (Random Forest with SMOTE & threshold tuning):")
print(f"Accuracy  : {accuracy:.4f}")
print(f"ROC AUC   : {roc_auc:.4f}")
print(f"F2 Score  : {f2:.4f}")
print("Detailed Report:\n", report)

# 💾 Save the Model
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "../models/classifier_is_fatal.pkl")
print("✅ Classifier saved as 'models/classifier_is_fatal.pkl'")


📊 Classification Metrics for is_fatal (Random Forest with SMOTE & threshold tuning):
Accuracy  : 0.8548
ROC AUC   : 0.9755
F2 Score  : 0.9447
Detailed Report:
               precision    recall  f1-score   support

           0       1.00      0.71      0.83     67129
           1       0.78      1.00      0.87     67545

    accuracy                           0.85    134674
   macro avg       0.89      0.85      0.85    134674
weighted avg       0.89      0.85      0.85    134674

✅ Classifier saved as 'models/classifier_is_fatal.pkl'
