# Road Accident Severity Prediction
## Modeling

In [None]:
# ============================================================
# 9. PREP DATA FOR MODELLING
#     - select feature columns that EDA showed important
#     - report any requested feature that is absent from `acc`
# ============================================================

feature_cols = [
    # accident numeric
    "Number_of_Vehicles",
    "Number_of_Casualties",
    "Speed_limit",
    "Hour",
    "Month",
    "Is_Weekend",
    "veh_count",
    "veh_age_mean",
    "engine_mean",
    "Latitude",
    "Longitude",
    # accident categorical
    "Weather_Conditions",
    "Light_Conditions",
    "Road_Type",
    "Road_Surface_Conditions",
    "Urban_or_Rural_Area",
    "Junction_Detail",
    "Junction_Control",
    "Carriageway_Hazards",
    "Special_Conditions_at_Site",
]

# Keep only columns that actually exist, but say which ones were skipped so a
# renamed or un-merged column does not disappear from the model unnoticed.
missing_features = [c for c in feature_cols if c not in acc.columns]
if missing_features:
    print("Feature columns not found in acc (skipped):", missing_features)
feature_cols = [c for c in feature_cols if c in acc.columns]

# Copy the selected columns so later edits to `data` never write back to `acc`.
data = acc[feature_cols + [target_col]].copy()
print("Total rows before dropping NA:", data.shape)
# Only the target is required at this point; rows with missing feature values
# are kept.
data = data.dropna(subset=[target_col])
print("Rows after dropping rows with missing target:", data.shape)

# Feature matrix and target vector share the same index.
X_full = data[feature_cols]
y_full = data[target_col]


In [None]:
# ============================================================
# 10. HANDLE MISSING VALUES & CATEGORICAL TYPES
# ============================================================

# X_full was obtained by column-selecting another DataFrame. Take an explicit
# copy so the column assignments below act on an independent frame; this
# avoids pandas' SettingWithCopyWarning / chained-assignment ambiguity.
X_full = X_full.copy()

# identify categorical & numeric features
cat_features = X_full.select_dtypes(include=["object", "category"]).columns.tolist()
num_features = [c for c in feature_cols if c not in cat_features]

# Numerics: values that cannot be parsed become NaN, then every NaN is
# replaced by the column median.
# NOTE(review): the median is computed over all rows, i.e. before the
# train/test split further down. Fitting the imputation on the training rows
# only would avoid leaking test-set statistics -- confirm whether that matters
# for this analysis.
for col in num_features:
    numeric_col = pd.to_numeric(X_full[col], errors="coerce")
    X_full[col] = numeric_col.fillna(numeric_col.median())

# Categoricals: convert to category dtype and fill missing values with an
# explicit "Unknown" level.
for col in cat_features:
    cat_col = X_full[col].astype("category")
    # fillna on a categorical only accepts an existing category, so register
    # "Unknown" first unless the column already has it.
    if "Unknown" not in cat_col.cat.categories:
        cat_col = cat_col.cat.add_categories(["Unknown"])
    X_full[col] = cat_col.fillna("Unknown")

X_full.dtypes

In [None]:
# ============================================================
# 11. STRATIFIED SAMPLE (300k) TO TRAIN ON LAPTOP
# ============================================================

# Upper bound on the number of rows carried forward into modelling.
N = 300_000

if len(X_full) <= N:
    # Already within budget: use every row.
    X_sample, y_sample = X_full, y_full
else:
    # Draw exactly N rows while keeping the class proportions of y_full.
    # The remainder returned by train_test_split is discarded.
    X_sample, _, y_sample, _ = train_test_split(
        X_full, y_full, train_size=N, stratify=y_full, random_state=42
    )

print("Sampled data shape:", X_sample.shape)
print("Sampled severity distribution:")
print(y_sample.value_counts())


In [None]:
# ============================================================
# 12. TRAIN/TEST SPLIT
# ============================================================

# 80/20 hold-out, stratified on the target so both parts keep the same class
# mix; the fixed seed makes the split reproducible.
holdout_options = {"test_size": 0.2, "stratify": y_sample, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(
    X_sample, y_sample, **holdout_options
)

(X_train.shape, X_test.shape)


In [None]:
# ============================================================
# 13. ENCODING FOR MODELS
#     - RF/XGB: OrdinalEncoder
#     - LGBM/CatBoost: use categories directly
# ============================================================

# 13.1 Ordinal encoding for RF and XGB
# Categories that were not seen while fitting are encoded as -1 instead of
# raising an error.
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Encode into copies so X_train / X_test keep their original categorical
# columns for the models that consume them directly.
X_train_enc, X_test_enc = X_train.copy(), X_test.copy()

if cat_features:
    # Learn the category -> code mapping from the training rows only, then
    # apply that same mapping to both parts.
    encoder.fit(X_train[cat_features])
    X_train_enc[cat_features] = encoder.transform(X_train[cat_features])
    X_test_enc[cat_features] = encoder.transform(X_test[cat_features])

X_train_enc.head()


In [None]:
# 13.2 Category dtype for LightGBM & CatBoost
# Work on copies of the train/test frames and make sure every categorical
# feature carries pandas' category dtype.
X_train_lgb, X_test_lgb = X_train.copy(), X_test.copy()
for frame in (X_train_lgb, X_test_lgb):
    for col in cat_features:
        frame[col] = frame[col].astype("category")


In [None]:
# ============================================================
# 14. CLASS WEIGHTS FOR IMBALANCE
# ============================================================

# Inverse-frequency weighting on the training target:
#     weight(c) = n_samples / (n_classes * count(c))
# so the rarer a class is, the larger its weight.
class_counts = y_train.value_counts().to_dict()
total = len(y_train)
num_classes = len(class_counts)

class_weights = {}
for cls, cnt in class_counts.items():
    class_weights[cls] = total / (num_classes * cnt)

class_weights


In [None]:
# ============================================================
# 15. TRAIN MODELS (RF, XGB, LGBM, CatBoost)
# ============================================================

# Collectors keyed by model name: macro-F1 score, hard predictions and
# per-class probabilities on the test set.
results, all_preds, all_proba = {}, {}, {}

# 15.1 Random Forest
# Unbounded depth, all cores, and the class weights computed on y_train to
# counter the class imbalance.
rf_settings = {
    "n_estimators": 300,
    "max_depth": None,
    "n_jobs": -1,
    "class_weight": class_weights,
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_settings)
rf.fit(X_train_enc, y_train)

y_proba_rf = rf.predict_proba(X_test_enc)
y_pred_rf = rf.predict(X_test_enc)

all_proba["RandomForest"] = y_proba_rf
all_preds["RandomForest"] = y_pred_rf
results["RandomForest"] = f1_score(y_test, y_pred_rf, average="macro")
print("Random Forest Macro F1:", results["RandomForest"])


# XGBoost

In [None]:
# 15.2 XGBoost
# Multiclass gradient-boosted trees on the ordinal-encoded features.
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=3,
    learning_rate=0.1,
    max_depth=8,
    n_estimators=300,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    eval_metric="mlogloss",
    random_state=42
)

# Train with the same inverse-frequency class weights that Random Forest
# (class_weight=) and LightGBM (Dataset weight=) use, so the macro-F1 scores
# of the models are compared on equal terms. XGBClassifier takes them as one
# weight per training row.
xgb_sample_weights = y_train.map(class_weights).values
xgb_model.fit(X_train_enc, y_train, sample_weight=xgb_sample_weights)

y_proba_xgb = xgb_model.predict_proba(X_test_enc)
# argmax yields the column index of the most probable class.
# NOTE(review): this equals the class label only if the target is encoded as
# 0..num_class-1 -- presumably done upstream; verify.
y_pred_xgb  = np.argmax(y_proba_xgb, axis=1)

all_preds["XGBoost"] = y_pred_xgb
all_proba["XGBoost"] = y_proba_xgb
results["XGBoost"] = f1_score(y_test, y_pred_xgb, average="macro")
print("XGBoost Macro F1:", results["XGBoost"])


# LightGBM

In [None]:
# 15.3 LightGBM
# One weight per training row, looked up from the per-class weights.
sample_weights = y_train.map(class_weights).values

# Early stopping chooses the number of boosting rounds from the "valid" set,
# so that set must not be the test set whose score is reported below --
# otherwise the reported macro-F1 is optimistically biased. Hold out 10% of
# the training fold (stratified) for early stopping instead.
X_fit_lgb, X_val_lgb, y_fit_lgb, y_val_lgb, w_fit_lgb, _ = train_test_split(
    X_train_lgb,
    y_train,
    sample_weights,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

lgb_train = lgb.Dataset(
    X_fit_lgb,
    label=y_fit_lgb,
    weight=w_fit_lgb,
    categorical_feature=cat_features or None
)
# The validation fold is left unweighted, as the evaluation set was before.
lgb_valid = lgb.Dataset(
    X_val_lgb,
    label=y_val_lgb,
    categorical_feature=cat_features or None
)

params_lgb = {
    "objective": "multiclass",
    "num_class": 3,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "metric": "multi_logloss",
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1,
    "seed": 42,
}

# Stop once the validation metric has not improved for 50 rounds; print the
# evaluation results every 100 rounds.
callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=100),
]

lgb_model = lgb.train(
    params_lgb,
    lgb_train,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=["train", "valid"],
    num_boost_round=1000,
    callbacks=callbacks
)

# The test set is touched only here, for the final evaluation.
y_proba_lgb = lgb_model.predict(X_test_lgb)
# argmax yields the column index of the most probable class.
# NOTE(review): this equals the class label only if the target is encoded as
# 0..num_class-1 -- presumably done upstream; verify.
y_pred_lgb  = np.argmax(y_proba_lgb, axis=1)

all_preds["LightGBM"] = y_pred_lgb
all_proba["LightGBM"] = y_proba_lgb
results["LightGBM"] = f1_score(y_test, y_pred_lgb, average="macro")
print("LightGBM Macro F1:", results["LightGBM"])


# CatBoost

In [None]:
# 15.4 CatBoost
# CatBoost is told which columns are categorical by position.
cat_indices = [X_train.columns.get_loc(c) for c in cat_features]

cb_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=8,
    loss_function="MultiClass",
    eval_metric="TotalF1",
    random_seed=42,
    verbose=False
)

# When an eval_set is supplied, CatBoost by default keeps the iteration that
# scores best on it. Using the test set there would tune the model on the
# same data it is scored on, so hold out 10% of the training fold
# (stratified) as the evaluation set instead.
X_fit_cb, X_val_cb, y_fit_cb, y_val_cb = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

# Per-row weights from the class weights that Random Forest and LightGBM also
# train with, so all models handle the class imbalance the same way.
cb_sample_weights = y_fit_cb.map(class_weights).values

cb_model.fit(X_fit_cb, y_fit_cb,
             cat_features=cat_indices,
             sample_weight=cb_sample_weights,
             eval_set=(X_val_cb, y_val_cb),
             verbose=False)

# The test set is touched only here, for the final evaluation.
y_proba_cb = cb_model.predict_proba(X_test)
# Flatten the predictions to a 1-D integer array.
y_pred_cb  = cb_model.predict(X_test).astype(int).ravel()

all_preds["CatBoost"] = y_pred_cb
all_proba["CatBoost"] = y_proba_cb
results["CatBoost"] = f1_score(y_test, y_pred_cb, average="macro")
print("CatBoost Macro F1:", results["CatBoost"])
