In [6]:
import pandas as pd
# Quick look at the raw accident records. The parser options mirror the ones the
# modelling cell uses for this same file (latin1 decoding, single-pass dtype
# inference) so the preview and the pipeline see the same text and dtypes.
df = pd.read_csv(
    "Accident_Information.csv",
    encoding="latin1",
    low_memory=False,
)
df.head()

Unnamed: 0,Accident_Index,1st_Road_Class,1st_Road_Number,2nd_Road_Class,2nd_Road_Number,Accident_Severity,Carriageway_Hazards,Date,Day_of_Week,Did_Police_Officer_Attend_Scene_of_Accident,...,Police_Force,Road_Surface_Conditions,Road_Type,Special_Conditions_at_Site,Speed_limit,Time,Urban_or_Rural_Area,Weather_Conditions,Year,InScotland
0,200501BS00001,A,3218.0,,0.0,Serious,,2005-01-04,Tuesday,1.0,...,Metropolitan Police,Wet or damp,Single carriageway,,30.0,17:42,Urban,Raining no high winds,2005,No
1,200501BS00002,B,450.0,C,0.0,Slight,,2005-01-05,Wednesday,1.0,...,Metropolitan Police,Dry,Dual carriageway,,30.0,17:36,Urban,Fine no high winds,2005,No
2,200501BS00003,C,0.0,,0.0,Slight,,2005-01-06,Thursday,1.0,...,Metropolitan Police,Dry,Single carriageway,,30.0,00:15,Urban,Fine no high winds,2005,No
3,200501BS00004,A,3220.0,,0.0,Slight,,2005-01-07,Friday,1.0,...,Metropolitan Police,Dry,Single carriageway,,30.0,10:35,Urban,Fine no high winds,2005,No
4,200501BS00005,Unclassified,0.0,,0.0,Slight,,2005-01-10,Monday,1.0,...,Metropolitan Police,Wet or damp,Single carriageway,,30.0,21:13,Urban,Fine no high winds,2005,No


In [8]:
# Display the (n_rows, n_columns) tuple of `df`.
df.shape

(2047256, 34)

In [4]:


# ===============================
# 1. IMPORTS
# ===============================
import tempfile
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
# Installs a global "ignore" filter: from here on no warning of any category is
# shown. NOTE(review): this also hides pandas/scikit-learn warnings raised by the
# code below -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# ===============================
# 2. LOAD DATA (50K ONLY)
# ===============================
# Number of accident rows kept for modelling; the vehicle table is loaded whole.
N_ACCIDENT_SAMPLE = 50000

# Both files are decoded as latin1 and parsed in one pass (low_memory=False)
# rather than in chunks.
accidents = pd.read_csv(
    "Accident_Information.csv",
    encoding="latin1",
    low_memory=False
)
# Cap the request at the available row count: DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling is without replacement).
# Rebinding `accidents` lets the full frame be freed once the sample is taken.
accidents = accidents.sample(
    n=min(N_ACCIDENT_SAMPLE, len(accidents)),
    random_state=42
)

vehicles = pd.read_csv(
    "Vehicle_Information.csv",
    encoding="latin1",
    low_memory=False
)

# ===============================
# 3. TARGET CLEANING
# ===============================
# Text label -> integer class code used as the model target.
severity_map = {"Fatal": 0, "Serious": 1, "Slight": 2}

# Keep only rows whose severity is one of the mapped labels. The explicit
# .copy() makes the result an independent frame, so the column assignment below
# is not a write into a slice of the sampled data (SettingWithCopyWarning).
has_known_severity = accidents["Accident_Severity"].isin(list(severity_map))
accidents = accidents.loc[has_known_severity].copy()

# Every remaining label is a key of severity_map, so the mapping cannot produce
# NaN and no follow-up dropna is needed.
accidents["Accident_Severity"] = accidents["Accident_Severity"].map(severity_map)

# ===============================
# 4. VEHICLE AGGREGATION (SAFE)
# ===============================
# One output row per Accident_Index: the mean of the engine-capacity column and
# the count of non-null vehicle references recorded for that accident.
vehicle_summaries = {
    "Engine_CC_Mean": pd.NamedAgg(column="Engine_Capacity_.CC.", aggfunc="mean"),
    "Number_of_Vehicles": pd.NamedAgg(column="Vehicle_Reference", aggfunc="count"),
}

# as_index=False keeps Accident_Index as an ordinary column for the later merge.
vehicle_agg = (
    vehicles
    .groupby("Accident_Index", as_index=False)
    .agg(**vehicle_summaries)
)

# ===============================
# 5. MERGE
# ===============================
# If `accidents` already has its own Number_of_Vehicles column, merging the
# aggregate's column of the same name makes pandas suffix both copies
# (Number_of_Vehicles_x / _y), leaving no plain "Number_of_Vehicles" column --
# and the fallback below would then overwrite the feature with a constant 1.
# In that case keep the accident table's own value and drop the aggregate's copy.
if "Number_of_Vehicles" in accidents.columns:
    vehicle_features = vehicle_agg.drop(
        columns=["Number_of_Vehicles"], errors="ignore"
    )
else:
    vehicle_features = vehicle_agg

# Left join: every sampled accident is kept, including those with no vehicle
# rows (their aggregate columns are NaN).
df = accidents.merge(vehicle_features, on="Accident_Index", how="left")

# Last-resort guard: only reached if neither table supplied the column.
if "Number_of_Vehicles" not in df.columns:
    df["Number_of_Vehicles"] = 1

# ===============================
# 6. FEATURE ENGINEERING
# ===============================
# Time is "HH:MM" text in the preview rows (e.g. "17:42"); giving the parser that
# format explicitly avoids per-value format guessing. Missing or non-matching
# values become NaT, which yields a missing Hour.
parsed_time = pd.to_datetime(df["Time"], format="%H:%M", errors="coerce")
df["Hour"] = parsed_time.dt.hour

# Only the hour is used as a feature; drop the raw column without mutating in place.
df = df.drop(columns=["Time"])

# Force the vehicle count to numeric; anything unparseable becomes NaN.
df["Number_of_Vehicles"] = pd.to_numeric(
    df["Number_of_Vehicles"], errors="coerce"
)

# ===============================
# 7. SELECT FEATURES
# ===============================
# Model inputs, listed in the column order used for X.
features = [
    # vehicle-derived columns
    "Number_of_Vehicles", "Engine_CC_Mean",
    # road and environment
    "Speed_limit", "Weather_Conditions", "Road_Surface_Conditions",
    "Light_Conditions", "Urban_or_Rural_Area",
    # timing
    "Day_of_Week", "Hour",
]

# Narrow the frame to the inputs followed by the target column.
df = df.loc[:, [*features, "Accident_Severity"]]

# ===============================
# 8. SPLIT X / y
# ===============================
target_column = "Accident_Severity"

# y holds the severity codes cast to int; X is every remaining column.
y = df[target_column].astype(int)
X = df.drop(columns=[target_column])

# ===============================
# 9. TRAIN / TEST SPLIT
# ===============================
# Split BEFORE fitting any imputer or encoder so that medians, modes and
# category codes are learned from training rows only; fitting them on the full
# frame would leak test-set statistics into training and inflate the scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)
# Independent copies: the column assignments below must not write through to X,
# which is left as the raw (un-imputed, un-encoded) feature frame.
X_train = X_train.copy()
X_test = X_test.copy()

# ===============================
# 10. HANDLE TYPES
# ===============================
# "number" matches every numeric width (e.g. an int32 Hour), not only the
# 64-bit dtypes; every other column is treated as categorical text.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

num_imputer = SimpleImputer(strategy="median")
X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])


def _as_text_keep_missing(frame):
    """Cast non-missing cells to str while leaving missing cells untouched.

    A plain ``astype(str)`` turns NaN into the literal string "nan", which the
    most-frequent imputer then sees as a real category and never fills.
    """
    return frame.apply(lambda col: col.where(col.isna(), col.astype(str)))


cat_imputer = SimpleImputer(strategy="most_frequent")
X_train[cat_cols] = cat_imputer.fit_transform(_as_text_keep_missing(X_train[cat_cols]))
X_test[cat_cols] = cat_imputer.transform(_as_text_keep_missing(X_test[cat_cols]))

# Categories that appear only in the test rows are encoded as -1.
encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1
)
X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols])
X_test[cat_cols] = encoder.transform(X_test[cat_cols])

# ===============================
# 11. MODEL
# ===============================
# Hyper-parameters gathered in one place. class_weight is keyed by the encoded
# severity label and weights classes 0 and 1 more heavily than class 2.
hgb_settings = dict(
    max_depth=8,
    learning_rate=0.08,
    max_iter=300,
    class_weight={0: 6, 1: 2, 2: 1},
    random_state=42,
)

model = HistGradientBoostingClassifier(**hgb_settings)
model.fit(X_train, y_train)

# ===============================
# 12. EVALUATION
# ===============================
# Pin the label order explicitly so the display names always line up with the
# encoded classes and both reports keep all three classes even if one is absent
# from y_test and y_pred (without `labels`, classification_report raises when
# the number of observed classes differs from len(target_names)).
class_labels = [0, 1, 2]
class_names = ["Fatal", "Serious", "Slight"]

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n",
      classification_report(y_test, y_pred,
                            labels=class_labels,
                            target_names=class_names))
print("\nConfusion Matrix:\n",
      confusion_matrix(y_test, y_pred, labels=class_labels))

# ===============================
# 13. SAVE MODEL
# ===============================
# Everything needed to reproduce the preprocessing and prediction steps.
model_bundle = {
    "model": model,
    "encoder": encoder,
    "num_imputer": num_imputer,
    "cat_imputer": cat_imputer,
    "features": features,
    "num_cols": list(num_cols),
    "cat_cols": list(cat_cols)
}

model_path = Path("fatal_accident_model.joblib")
try:
    joblib.dump(model_bundle, str(model_path))
except PermissionError:
    # The working directory is not writable (or the target file is locked).
    # Fall back to the system temp directory instead of losing the fitted model.
    model_path = Path(tempfile.gettempdir()) / model_path.name
    joblib.dump(model_bundle, str(model_path))

# Report the final location, since it may differ from the working directory.
print("Model saved to:", model_path.resolve())

PermissionError: [Errno 13] Permission denied