In [1]:
# 📦 Import Libraries
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 📥 Load Cleaned Dataset
# The path is relative, so it resolves against the notebook's working directory.
cleaned_csv_path = "../data/cleaned_data/nyc_collisions_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)

# 🧠 Feature Engineering
# Parse each raw column once and derive the time-based features from the parsed
# values. errors="coerce" turns anything unparseable into NaT instead of raising,
# so the derived hour / weekday / year are missing for those rows.
# NOTE(review): dayfirst=True assumes day-before-month dates in the CSV — confirm.
parsed_time = pd.to_datetime(df["crash_time"], format="%H:%M", errors="coerce")
parsed_date = pd.to_datetime(df["crash_date"], dayfirst=True, errors="coerce")

df["crash_hour"] = parsed_time.dt.hour
df["crash_date"] = parsed_date
df["day_of_week"] = parsed_date.dt.day_name()
df["year"] = parsed_date.dt.year

# 🎯 Select Features and Target
features = [
    "borough",
    "contributing_factor_vehicle_1",
    "vehicle_type_code1",
    "crash_hour",
    "day_of_week",
    "year",
]
target = "number_of_persons_injured"

# Keep only rows where every model input AND the target are present: a missing
# target would otherwise flow through log1p into the regressor's fit call.
# .copy() makes df_model an independent frame, so the column assignments that
# follow modify it directly instead of triggering SettingWithCopyWarning.
df_model = df.dropna(subset=features + [target]).copy()

# 🔁 Simplify Categorical Variables
# Keep the 10 most frequent values of each high-cardinality column and collapse
# every other value into a single "Other" bucket.
factor_values = df_model["contributing_factor_vehicle_1"]
vehicle_values = df_model["vehicle_type_code1"]

top_factors = factor_values.value_counts().nlargest(10).index
top_vehicles = vehicle_values.value_counts().nlargest(10).index

# Vectorized membership test: where() keeps entries whose mask is True and
# substitutes "Other" elsewhere (same result as a per-row lambda, without the
# Python-level loop).
df_model["contributing_factor_vehicle_1"] = factor_values.where(
    factor_values.isin(top_factors), "Other"
)
df_model["vehicle_type_code1"] = vehicle_values.where(
    vehicle_values.isin(top_vehicles), "Other"
)

# 🔤 Encode Categorical Columns
# One encoder per column, each kept under its own name so the fitted mapping
# stays available after this step.
le_borough, le_factor, le_vehicle, le_day = (LabelEncoder() for _ in range(4))

# (new encoded column, source column, encoder fitted on that source column)
# NOTE(review): LabelEncoder codes carry no meaningful order (e.g. weekdays are
# not encoded chronologically) — confirm that is acceptable for the model.
encoding_plan = [
    ("borough_enc", "borough", le_borough),
    ("factor_enc", "contributing_factor_vehicle_1", le_factor),
    ("vehicle_enc", "vehicle_type_code1", le_vehicle),
    ("day_enc", "day_of_week", le_day),
]
for encoded_name, source_name, encoder in encoding_plan:
    df_model[encoded_name] = encoder.fit_transform(df_model[source_name])

# 🧪 Define X and y
# Model inputs: the four integer-encoded categoricals plus the numeric hour and year.
model_columns = [
    "borough_enc",
    "factor_enc",
    "vehicle_enc",
    "crash_hour",
    "day_enc",
    "year",
]
X = df_model[model_columns]
y = df_model[target]

# 🔄 Log Transform Target
# log1p(y) = log(1 + y), which is defined at y == 0.
y_log = np.log1p(y)

# 🔀 Split Data
# 80/20 train/test split on the log-scale target; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X,
    y_log,
    test_size=0.2,
    random_state=42,
)

# 🌲 Train Gradient Boosting Model
# Hyperparameters are collected in one place; the model is fitted on the
# log-scale target.
gbr_params = {
    "n_estimators": 150,
    "learning_rate": 0.1,
    "max_depth": 5,
    "random_state": 42,
}
gbr = GradientBoostingRegressor(**gbr_params)
gbr.fit(X_train, y_train_log)

# 📈 Predict & Inverse Transform
# expm1 undoes log1p, bringing predictions and held-out targets back to the
# original scale so the metrics are computed in those units.
# NOTE(review): expm1 maps a negative log-scale prediction to a negative
# value; nothing here clips predictions at zero.
y_pred_log = gbr.predict(X_test)
y_pred_actual, y_test_actual = np.expm1(y_pred_log), np.expm1(y_test_log)

# 📊 Evaluation Metrics
# All metrics compare original-scale test targets with original-scale predictions.
mae = mean_absolute_error(y_test_actual, y_pred_actual)
mse = mean_squared_error(y_test_actual, y_pred_actual)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_actual, y_pred_actual)

# Build the report first, then emit it in a single print call.
report_lines = [
    "📊 Gradient Boosting Regression Model Metrics:",
    f"MAE  : {mae:.4f}",
    f"RMSE : {rmse:.4f}",
    f"R²   : {r2:.4f}",
]
print("\n".join(report_lines))

# 💾 Save the Model
# Create the directory the model file is actually written to (../models), so
# the dump does not fail when that directory is missing.
model_path = "../models/gbr_injury_count.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# NOTE(review): only the regressor is persisted here; the fitted LabelEncoders
# are not written by this step — confirm whether inference code needs them saved.
joblib.dump(gbr, model_path)
print("✅ Model saved as 'models/gbr_injury_count.pkl'")


📊 Gradient Boosting Regression Model Metrics:
MAE  : 0.4978
RMSE : 0.7400
R²   : 0.0484
✅ Model saved as 'models/gbr_injury_count.pkl'
