In [4]:
import pandas as pd

# Read the model-ready CSV (path is relative to the notebook's directory).
model_ready_csv = "../data/processed/df_young_model_ready_2024.csv"
df_model = pd.read_csv(model_ready_csv)

In [5]:
# Re-run this cell after any structural change to df_model so that
# y, A and X are all rebuilt from the current frame.
target_col = "high_risk"
protected_cols = ["sex_of_driver", "age_band_of_driver"]

# Target vector
y = df_model[target_col]

# Protected attributes: kept out of the features, retained for the later fairness checks
A = df_model[protected_cols]

# Feature matrix: everything except the target and the protected attributes
X = df_model.drop(columns=[target_col, *protected_cols])

# Quick sanity check of the three pieces
for label, part in (("X", X), ("y", y), ("A", A)):
    print(f"{label} shape:", part.shape)

X shape: (13376, 54)
y shape: (13376,)
A shape: (13376, 2)


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. X, y and A go through the same call
# so all three are partitioned together; the split is stratified on y and
# seeded (random_state=42).
split_parts = train_test_split(X, y, A, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test, A_train, A_test = split_parts

for split_name, features in (("Train", X_train), ("Test", X_test)):
    print(f"{split_name}:", features.shape)

Train: (10700, 54)
Test: (2676, 54)


## Random Forest Baseline

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import numpy as np

# Random Forest baseline (simple, not tuned): 300 trees, fixed seed,
# all cores, and no class re-weighting.
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
    class_weight=None
)

rf.fit(X_train, y_train)

# Hard labels feed accuracy / F1 / the report; the class-1 probability
# column feeds ROC-AUC.
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_rf))
# minlength=2 keeps a count for both classes even if the model never
# predicts class 1 (plain bincount would then return a single element).
print("\nPredicted class distribution:", np.bincount(y_pred_rf, minlength=2))
print("\nClassification report:\n", classification_report(y_test, y_pred_rf))

Accuracy: 0.7488789237668162
F1 Score: 0.2328767123287671
ROC-AUC: 0.7099246165396175

Predicted class distribution: [2509  167]

Classification report:
               precision    recall  f1-score   support

           0       0.76      0.97      0.85      1967
           1       0.61      0.14      0.23       709

    accuracy                           0.75      2676
   macro avg       0.68      0.56      0.54      2676
weighted avg       0.72      0.75      0.69      2676



### Random Forest with Balanced Class Weights

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import numpy as np

# Random Forest with class_weight="balanced": same settings as the baseline
# (300 trees, seed 42, all cores), differing only in the class weighting.
rf_bal = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced",
)

# Train
rf_bal.fit(X_train, y_train)

# Predict
y_pred_rf_bal = rf_bal.predict(X_test)
y_proba_rf_bal = rf_bal.predict_proba(X_test)[:, 1]  # probability of class 1 (high_risk)

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred_rf_bal))
print("F1 Score:", f1_score(y_test, y_pred_rf_bal))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_rf_bal))
# minlength=2 keeps a count for both classes even if the model never
# predicts class 1 (plain bincount would then return a single element).
print("\nPredicted class distribution:", np.bincount(y_pred_rf_bal, minlength=2))
print("\nClassification report:\n", classification_report(y_test, y_pred_rf_bal))

Accuracy: 0.7481315396113603
F1 Score: 0.18990384615384615
ROC-AUC: 0.7085966400473829

Predicted class distribution: [2553  123]

Classification report:
               precision    recall  f1-score   support

           0       0.75      0.98      0.85      1967
           1       0.64      0.11      0.19       709

    accuracy                           0.75      2676
   macro avg       0.70      0.54      0.52      2676
weighted avg       0.72      0.75      0.68      2676



## Threshold Evaluation for Random Forest

In [10]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Decision cut-offs to compare, from 0.5 downwards
thresholds = [0.5, 0.4, 0.3, 0.2]

print("Threshold Evaluation for Random Forest\n")

# NOTE(review): every cut-off is scored against y_test; choosing one from this
# table would tune on the test split -- consider a separate validation split.
# NOTE(review): the recorded 0.5 row differs slightly from the baseline cell's
# predict() metrics (accuracy 0.7485 vs 0.7489), presumably because `>=` counts
# a probability exactly equal to the cut-off as positive -- confirm.
for t in thresholds:
    # Label a row 1 when its baseline-RF class-1 probability is at least t
    y_pred_thresh = np.asarray(y_proba_rf >= t, dtype=int)

    # Same four metrics, evaluated in the same order, in one pass
    acc, f1, recall, precision = (
        metric(y_test, y_pred_thresh)
        for metric in (accuracy_score, f1_score, recall_score, precision_score)
    )

    print(
        f"Threshold: {t}\n"
        f"  Accuracy : {acc:.4f}\n"
        f"  Recall   : {recall:.4f}\n"
        f"  Precision: {precision:.4f}\n"
        f"  F1 Score : {f1:.4f}\n"
        + "-" * 40
    )

Threshold Evaluation for Random Forest

Threshold: 0.5
  Accuracy : 0.7485
  Recall   : 0.1467
  Precision: 0.6047
  F1 Score : 0.2361
----------------------------------------
Threshold: 0.4
  Accuracy : 0.7481
  Recall   : 0.3780
  Precision: 0.5349
  F1 Score : 0.4430
----------------------------------------
Threshold: 0.3
  Accuracy : 0.6596
  Recall   : 0.6220
  Precision: 0.4068
  F1 Score : 0.4919
----------------------------------------
Threshold: 0.2
  Accuracy : 0.4918
  Recall   : 0.8717
  Precision: 0.3275
  F1 Score : 0.4761
----------------------------------------
