In [None]:
import pandas as pd

# Read the model-ready table from disk, then show its (rows, columns) shape.
model_ready_csv = "../data/processed/df_young_model_ready_2024.csv"
df_model = pd.read_csv(model_ready_csv)
df_model.shape

2.3.3


(13376, 57)

In [2]:
# Only the last expression of a cell is rendered automatically, so the class
# balance must be printed explicitly; otherwise its result is silently dropped.
print(df_model['high_risk'].value_counts(normalize=True))

# Tally how many columns there are of each dtype (rendered as the cell output).
df_model.dtypes.value_counts()

int64    57
Name: count, dtype: int64

In [3]:
# Count missing values per column and show the ten columns with the most.
missing_per_column = df_model.isna().sum()
missing_per_column.sort_values(ascending=False).head(10)

collision_year_x            0
number_of_casualties        0
local_authority_district    0
first_road_class            0
first_road_number           0
road_type                   0
speed_limit                 0
junction_detail_historic    0
junction_detail             0
junction_control            0
dtype: int64

In [4]:
# Flag every column whose name mentions "severity" (case-insensitive) as a
# potential leakage column; an empty list means none were found.
leak_cols = list(filter(lambda name: "severity" in name.lower(), df_model.columns))
leak_cols

[]

In [5]:
# Check whether any of the known identifier columns are still in the table.
identifier_names = (
    "collision_index",
    "collision_ref_no_x",
    "collision_ref_no_y",
    "vehicle_reference",
)
id_like = [column for column in df_model.columns if column in identifier_names]
id_like

[]

In [7]:
# Absolute class counts for the target; dropna=False would also surface NaNs.
target_values = df_model["high_risk"]
target_values.value_counts(dropna=False)

high_risk
0    9831
1    3545
Name: count, dtype: int64

In [8]:
# Number of rows that exactly repeat an earlier row.
duplicate_mask = df_model.duplicated()
duplicate_mask.sum()

np.int64(1)

In [9]:
# Keep the first occurrence of each row and discard exact repeats
# (the original index labels are retained, not renumbered).
is_repeat = df_model.duplicated()
df_model = df_model.loc[~is_repeat]

In [10]:
# Re-check after de-duplication: the count of repeated rows should now be zero.
remaining_duplicates = df_model.duplicated().sum()
remaining_duplicates

np.int64(0)

In [None]:
# Always redefine after structural changes, so X, y and A all come from the
# same version of df_model.
target_column = "high_risk"
protected_columns = ["sex_of_driver", "age_band_of_driver"]

# Target vector
y = df_model[target_column]

# Protected attributes, held out of the features (for fairness later)
A = df_model[protected_columns]

# Feature matrix: every column except the target and the protected attributes
X = df_model.drop(columns=[target_column, *protected_columns])

for heading, part in (("X shape:", X), ("y shape:", y), ("A shape:", A)):
    print(heading, part.shape)

X shape: (13375, 54)
y shape: (13375,)
A shape: (13375, 2)


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the target. Features,
# target and protected attributes are split in one call so rows stay aligned.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
(
    X_train, X_test,
    y_train, y_test,
    A_train, A_test,
) = train_test_split(X, y, A, **split_options)

print("Train:", X_train.shape)
print("Test:", X_test.shape)

Train: (10700, 54)
Test: (2675, 54)


In [16]:
# Show the class proportions in each split to confirm stratification worked.
balance_reports = [
    ("Train class balance:", y_train),
    ("\nTest class balance:", y_test),
]
for heading, labels in balance_reports:
    print(heading)
    print(labels.value_counts(normalize=True))

Train class balance:
high_risk
0    0.735047
1    0.264953
Name: proportion, dtype: float64

Test class balance:
high_risk
0    0.734953
1    0.265047
Name: proportion, dtype: float64


##  Logistic Regression Baseline

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Baseline: logistic regression fitted directly on the unscaled features.
# fit() returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Hard class predictions and the predicted probability of class 1 (second column).
y_pred = log_reg.predict(X_test)
y_proba = log_reg.predict_proba(X_test)[:, 1]

# Accuracy and F1 use the hard labels; ROC-AUC uses the probabilities.
baseline_scores = (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("F1 Score:", f1_score(y_test, y_pred)),
    ("ROC-AUC:", roc_auc_score(y_test, y_proba)),
)
for label, score in baseline_scores:
    print(label, score)

Accuracy: 0.737196261682243
F1 Score: 0.1267080745341615
ROC-AUC: 0.6558073999888083


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Logistic Regression Baseline – Initial Results

### Model Performance (Unscaled Features)

- **Accuracy:** 0.737  
- **F1 Score:** 0.127  
- **ROC–AUC:** 0.656  

---

### Interpretation of Results

At first glance, the model appears to perform reasonably well, with an accuracy of approximately **73.7%**. However, this must be interpreted carefully.

The dataset is **class imbalanced**, with approximately:

- **73% Low Risk (0)**  
- **27% High Risk (1)**  

This means that a naive model that predicts *“Low Risk” for every case* would already achieve around **73% accuracy** without learning any meaningful patterns.

The observed accuracy (0.737) is therefore only marginally above this majority-class baseline (≈0.735 on the test set), so accuracy alone is **not an informative metric** in this setting.

---

### F1 Score Analysis

The **F1 score (0.127)** is very low.

The F1 score balances:

- **Precision** (how many predicted high-risk cases were correct)
- **Recall** (how many actual high-risk cases were successfully identified)

A low F1 score indicates that the model struggles to correctly identify **high-risk drivers**, which is the minority but more critical class in this application.

Since the objective of this project is to predict higher-risk outcomes, this result suggests that the baseline Logistic Regression model is not yet effectively discriminating between low- and high-risk cases.

---

### ROC–AUC Interpretation

The **ROC–AUC score of 0.656** suggests that the model has some ability to rank high-risk cases above low-risk cases, but its discriminatory power remains modest.

For reference:

- 0.5 indicates random guessing  
- ~0.65 indicates weak-to-moderate separation  

This implies the model is learning some signal from the data, but not strongly enough to produce reliable classification performance.


---

### Convergence Warning

The model produced the following warning:

`lbfgs failed to converge after 1000 iterations`

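This indicates that the optimisation algorithm did not fully converge before reaching the iteration limit. The most likely explanation is the lack of feature scaling:

1. The features were passed to the model in their raw, unscaled form.
2. As a result, the feature space contains variables with very different magnitudes, which makes the optimisation problem harder for the solver.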

Logistic Regression performs best when features are standardised. This warning suggests that feature scaling (e.g., using `StandardScaler`) should be applied before drawing firm conclusions about model performance.

---

### Baseline Conclusion

This baseline experiment highlights three key points:

1. Accuracy is misleading due to class imbalance.
2. The model performs poorly at identifying high-risk cases (low F1 score).
3. Feature scaling is required to ensure proper convergence and fair evaluation.

This provides a meaningful starting benchmark against which improved preprocessing and more expressive models (e.g., Random Forest or XGBoost) can be compared.

###  Logistic Regression (Scaled Version)

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Two-step estimator: standardise every feature, then fit logistic regression.
# Inside a Pipeline the scaler is fitted on the training data only.
# NOTE(review): every column in X is scaled as if it were numeric; confirm
# that none of them are integer-coded categories that need encoding instead.
scaling_step = ('scaler', StandardScaler())
classifier_step = ('model', LogisticRegression(max_iter=5000, random_state=42))
lr_scaled_pipeline = Pipeline([scaling_step, classifier_step])

lr_scaled_pipeline.fit(X_train, y_train)

# Hard class predictions and the predicted probability of class 1.
y_pred_scaled = lr_scaled_pipeline.predict(X_test)
y_proba_scaled = lr_scaled_pipeline.predict_proba(X_test)[:, 1]

scaled_scores = (
    ("Accuracy:", accuracy_score(y_test, y_pred_scaled)),
    ("F1 Score:", f1_score(y_test, y_pred_scaled)),
    ("ROC-AUC:", roc_auc_score(y_test, y_proba_scaled)),
)
for label, score in scaled_scores:
    print(label, score)

Accuracy: 0.7357009345794393
F1 Score: 0.10619469026548672
ROC-AUC: 0.6627067768424285


To improve optimisation stability, all numerical features were standardised using a StandardScaler prior to training Logistic Regression. Scaling ensures that features with larger numeric ranges do not dominate the optimisation process.

The scaled Logistic Regression model produced the following results:

- Accuracy: 0.736
- F1 Score: 0.106
- ROC-AUC: 0.663

Compared to the unscaled version, ROC-AUC improved slightly, indicating better probability ranking between high-risk and low-risk drivers. The convergence warning observed previously was resolved after scaling, suggesting improved optimisation behaviour.

However, the F1 score remained low, and in fact fell slightly from 0.127 to 0.106, indicating that the model still struggles to correctly identify high-risk drivers. This reflects the moderate class imbalance in the dataset (~27% high-risk vs ~73% low-risk).

These results suggest that while scaling improves optimisation stability, additional techniques may be required to better handle class imbalance.

In [21]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the scaled model's test predictions.
scaled_report = classification_report(y_test, y_pred_scaled)
print(scaled_report)

              precision    recall  f1-score   support

           0       0.74      0.98      0.84      1966
           1       0.51      0.06      0.11       709

    accuracy                           0.74      2675
   macro avg       0.63      0.52      0.48      2675
weighted avg       0.68      0.74      0.65      2675



### Logistic Regression Balanced

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Same scaling + logistic regression pipeline as before, but with
# class_weight='balanced', which weights classes inversely to their frequency
# so mistakes on the rarer high-risk class cost more during training.
balanced_classifier = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=42,
)
lr_balanced_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', balanced_classifier),
])

lr_balanced_pipeline.fit(X_train, y_train)

# Hard class predictions and the predicted probability of class 1.
y_pred_bal = lr_balanced_pipeline.predict(X_test)
y_proba_bal = lr_balanced_pipeline.predict_proba(X_test)[:, 1]

balanced_scores = {
    "Accuracy:": accuracy_score(y_test, y_pred_bal),
    "F1 Score:": f1_score(y_test, y_pred_bal),
    "ROC-AUC:": roc_auc_score(y_test, y_proba_bal),
}
for label, score in balanced_scores.items():
    print(label, score)

Accuracy: 0.5951401869158879
F1 Score: 0.45714285714285713
ROC-AUC: 0.6627397779171156


In [23]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the class-weighted model's test predictions.
balanced_report = classification_report(y_test, y_pred_bal)
print(balanced_report)

              precision    recall  f1-score   support

           0       0.82      0.58      0.68      1966
           1       0.35      0.64      0.46       709

    accuracy                           0.60      2675
   macro avg       0.59      0.61      0.57      2675
weighted avg       0.70      0.60      0.62      2675



In [24]:
import numpy as np

# Count how many test rows the balanced model assigned to each class label;
# position i of the result is the number of predictions equal to i.
predicted_counts = np.bincount(y_pred_bal)
print("Predicted class distribution:")
print(predicted_counts)

Predicted class distribution:
[1389 1286]
