# Ridge Regression — Baseline

In [1]:
import os, sys, json
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# --- Local imports
# `prep` is a project helper module, not an installed package: put the `common`
# directory (two levels above the current working directory) on sys.path so the
# `from prep import ...` below resolves. The relative path is resolved against
# the working directory the kernel was started in.
sys.path.append(os.path.abspath(os.path.join('..', '..', 'common')))
from prep import (
    load_data, infer_target, split_features, make_preprocessor,
    holdout_split, eval_regression,
    permutation_importance_df, save_json, save_csv_df, save_submission
)

## 1. Load and prepare data

In [2]:
# Load the three objects returned by load_data, then work out which column is
# the target and how the remaining columns divide into categorical / numeric.
train, test, sample = load_data()
target_col = infer_target(train, test)
feature_cols, cat_cols, num_cols = split_features(train, target_col)

# Train and test design matrices use the same feature columns;
# the target is cast to float for regression.
X, X_test = train[feature_cols], test[feature_cols]
y = train[target_col].astype(float)

print(f"Train shape: {X.shape}, Test shape: {X_test.shape}")
print(f"Target column: {target_col}")

Train shape: (517754, 13), Test shape: (172585, 13)
Target column: accident_risk


## 2. Define preprocessing and model pipeline

In [3]:
# Linear baseline: Ridge regression behind the shared preprocessing step.
# Per the notebook's own notes, the preprocessor is expected to apply
#   numeric     -> median imputation + scaling
#   categorical -> most-frequent imputation + OneHotEncoder
# (implemented in prep.make_preprocessor, which is not shown in this notebook).
prep = make_preprocessor(num_cols, cat_cols, scale_numeric=True)
model = Ridge(alpha=1.0, random_state=42)

# Chain both stages so the preprocessor is fitted on exactly the rows passed
# to pipe.fit(), never on data held out from that call.
pipe = Pipeline(steps=[('prep', prep), ('clf', model)])

## 3. Holdout validation

In [4]:
# Carve a validation part out of the training data (split logic lives in
# prep.holdout_split).
X_tr, X_va, y_tr, y_va = holdout_split(X, y)

# Fit on the training part, then score predictions for the held-out part.
pipe.fit(X_tr, y_tr)
valid_pred = pipe.predict(X_va)
metrics = eval_regression(y_va, valid_pred)
print("Holdout metrics:", metrics)

# Persist the metrics together with the model name.
holdout_report = {"model": "Ridge"}
holdout_report.update(metrics)
save_json(holdout_report, "../../outputs/holdout_reports/ridge_holdout.json")

Holdout metrics: {'rmse': 0.07353072769066067, 'mae': 0.058312117765281946, 'r2': 0.8041891251770915}


## 4. Permutation Importance

In [5]:
# Permutation importance of the holdout-fitted pipeline, measured on the
# validation split; the full table is written to CSV and the first rows shown.
imp_df = permutation_importance_df(
    pipe, X_va, y_va, num_cols, cat_cols,
    n_repeats=5,
)
save_csv_df(imp_df, "../../outputs/feature_importance/ridge_perm_importance.csv")

print("Top 10 most important features:")
top_features = imp_df.head(10)
display(top_features)

Top 10 most important features:


Unnamed: 0,feature,perm_importance_rmse
0,speed_limit,0.07190653
1,public_road,0.06842494
2,road_signs_present,0.06194028
3,holiday,0.02268653
4,lighting=daylight,0.002666322
5,road_type=highway,1.05553e-05
6,curvature,5.310029e-06
7,school_season,2.409153e-06
8,num_reported_accidents,2.086459e-06
9,road_type=urban,5.243514e-07


## 5. Final model training and submission

In [6]:
# Fit on all data, using a fresh unfitted copy of the pipeline.
# Refitting `pipe` in place would overwrite the holdout-fitted model, so that
# re-running the permutation-importance cell afterwards would silently score a
# model that has already been trained on X_va. `clone` copies the pipeline's
# parameters without its fitted state, leaving `pipe` untouched.
final_pipe = clone(pipe)
final_pipe.fit(X, y)

# Predict on test set
test_pred = final_pipe.predict(X_test)

# Create submission file
# NOTE(review): the original comment says save_submission clips values to
# [0, 1]; that happens (if at all) inside prep.save_submission — confirm there.
out_path = save_submission(sample, test_pred, out_name="ridge.csv")
print("Saved submission file to:", out_path)

Saved submission file to: ../../outputs/submissions/ridge.csv


### 🧾 Summary

- **Model:** Ridge Regression  
- **Holdout metrics:**  
  - RMSE = **0.0735**  
  - MAE = **0.0583**  
  - R² = **0.8042**

On the holdout set the typical prediction error is about **0.07** in absolute accident-risk units (RMSE ≈ 0.074, MAE ≈ 0.058; these are absolute errors, not relative percentages),  
which is a solid result for a simple linear baseline model.  

As a next step, we aim to improve on this baseline  
with more flexible algorithms such as **RandomForestRegressor** and **HistGradientBoostingRegressor**,  
which can capture nonlinear relationships between the features and accident risk.
