# LogisticRegression

Clean, junior-friendly baseline with:
- Robust preprocessing
- Stratified 5-Fold ROC AUC
- Holdout sanity check
- Permutation importance
- Auto-save metrics & submission


## 1. Setup

In [1]:
import os, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, classification_report, precision_recall_curve
from sklearn.inspection import permutation_importance

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

OUT_CV   = "../../outputs/cv_scores"
OUT_HOLD = "../../outputs/holdout_reports"
OUT_IMP  = "../../outputs/feature_importance"
OUT_SUB  = "../../outputs/submissions"

# Estimator

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter=400, class_weight="balanced", random_state=RANDOM_STATE)

## 2. Load Data

In [2]:
# Using relative paths (two levels up from this notebook)
TRAIN = "../../data/train.csv"
TEST  = "../../data/test.csv"
SAMPLE = "../../data/sample_submission.csv"

# Define target variable name
target_col = "accident_risk"

# Load datasets
train = pd.read_csv(TRAIN)
test  = pd.read_csv(TEST)
sample_sub = pd.read_csv(SAMPLE)

# Separate feature and target columns
feature_cols = [c for c in train.columns if c != target_col]

# Identify categorical (object dtype) and numerical features
cat_cols = [c for c in feature_cols if train[c].dtype == 'object']
num_cols = [c for c in feature_cols if c not in cat_cols]

# Split data for modeling
X = train[feature_cols]        # Features for training
y = train[target_col].astype(int)  # Target variable (converted to integer if needed)
X_test = test[feature_cols]    # Features for test predictions

# Basic shape check
X.shape, X_test.shape

((517754, 13), (172585, 13))

## 3. Preprocessing

In [3]:
numeric_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='most_frequent')
onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# scale numeric for Logistic Regression to help convergence;
# for tree-based models, scaler isn't necessary
use_scaler = ("LogisticRegression" in str(type(clf)))
num_pipeline_steps = [('imp', numeric_imputer)]
if use_scaler:
    num_pipeline_steps.append(('sc', StandardScaler()))

preprocess = ColumnTransformer([
    ('num', Pipeline(num_pipeline_steps), num_cols),
    ('cat', Pipeline([('imp', categorical_imputer), ('oh', onehot)]), cat_cols),
])
pipe = Pipeline([('prep', preprocess), ('clf', clf)])
pipe

## 4. Cross-Validation (ROC AUC)

In [4]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
scores = cross_val_score(pipe, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
mean_auc, std_auc = scores.mean(), scores.std()
print(f"CV ROC AUC: {mean_auc:.4f} ± {std_auc:.4f}")

CV ROC AUC: 0.7653 ± 0.0144


In [5]:
# Save CV scores
cv_payload = {
    "model": "LogisticRegression",
    "mean_auc": float(mean_auc),
    "std_auc": float(std_auc),
    "folds": [float(s) for s in scores.tolist()]
}

cv_path = os.path.join(OUT_CV, "logreg_cv.json")

with open(cv_path, "w") as f:
    json.dump(cv_payload, f, indent=2)
print("Saved CV to:", cv_path)

Saved CV to: ../../outputs/cv_scores\logreg_cv.json


## 5. Holdout Sanity Check

In [6]:
# Split data (stratify keeps class balance)
X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Train model on training part
pipe.fit(X_tr, y_tr)

# Predict probabilities for validation part
valid_proba = pipe.predict_proba(X_va)[:, 1]

# Calculate ROC AUC (main metric)
valid_auc = roc_auc_score(y_va, valid_proba)
print(f"Holdout ROC AUC: {valid_auc:.4f}")

# === Automatically find optimal threshold by F1 ===
prec, rec, thr = precision_recall_curve(y_va, valid_proba)
f1 = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-10)
best_thr = thr[np.argmax(f1)]

print(f"Best threshold: {best_thr:.4f}")

# Predict classes using best threshold
valid_pred = (valid_proba >= best_thr).astype(int)

# Print precision, recall, and F1-score for both classes
print(classification_report(y_va, valid_pred, digits=3, zero_division=0))

Holdout ROC AUC: 0.8635
Best threshold: 0.8628
              precision    recall  f1-score   support

           0      1.000     0.966     0.983    103539
           1      0.001     0.250     0.002        12

    accuracy                          0.966    103551
   macro avg      0.500     0.608     0.492    103551
weighted avg      1.000     0.966     0.983    103551



In [7]:
# Save holdout evaluation results

# Import classification_report under a short alias (_cr)
from sklearn.metrics import classification_report as _cr

# Create a dictionary with holdout metrics:
# - ROC AUC (as float)
# - Full classification report (converted to a dictionary)
hold_dict = {
    "holdout_auc": float(valid_auc),
    "report": _cr(y_va, (valid_proba >= 0.5).astype(int), output_dict=True)
}

# Define output path for saving results
hold_path = os.path.join(OUT_HOLD, "logreg_holdout.json")

# Save metrics to a JSON file for later comparison in the "compare_models" notebook
with open(hold_path, "w") as f:
    json.dump(hold_dict, f, indent=2)

# Confirm successful save
print("Saved holdout to:", hold_path)

Saved holdout to: ../../outputs/holdout_reports\logreg_holdout.json


## 6. Permutation Importance (Holdout)

In [8]:
# Permutation Importance (on holdout data) 
# This helps understand which features have the strongest impact
# on the model's ROC AUC score.

# Compute permutation importance
# - n_repeats=5: each feature is shuffled 5 times to estimate its contribution
# - scoring='roc_auc': evaluates importance by drop in ROC AUC
perm = permutation_importance(
    pipe, X_va, y_va,
    n_repeats=5,
    random_state=RANDOM_STATE,
    scoring='roc_auc'
)

# Build readable feature names after OneHotEncoding
# Extract the fitted OneHotEncoder from the preprocessing pipeline
oh = pipe.named_steps['prep'].named_transformers_['cat'].named_steps['oh']

# Expand encoded categorical features into individual "col=value" labels
cat_feature_names = []
for col, cats in zip(cat_cols, oh.categories_):
    cat_feature_names.extend([f"{col}={c}" for c in cats])

# Combine numeric and categorical feature names
feature_names = num_cols + cat_feature_names

# Create a sorted DataFrame of feature importances
n = min(len(feature_names), len(perm.importances_mean))
imp_df = pd.DataFrame({
    "feature": feature_names[:n],
    "perm_importance_auc": perm.importances_mean[:n]
}).sort_values("perm_importance_auc", ascending=False).reset_index(drop=True)

# Display top 20 most important features
imp_df.head(20)

Unnamed: 0,feature,perm_importance_auc
0,lighting=daylight,0.149872
1,speed_limit,0.094805
2,public_road,0.056526
3,road_signs_present,0.056045
4,road_type=urban,0.032847
5,id,0.007052
6,road_type=highway,0.003356
7,school_season,0.000818
8,num_reported_accidents,0.000419
9,curvature,0.000212


In [9]:
# Save importance
imp_path = os.path.join(OUT_IMP, "logreg_perm_importance.csv")
imp_df.to_csv(imp_path, index=False)
print("Saved permutation importance to:", imp_path)

Saved permutation importance to: ../../outputs/feature_importance\logreg_perm_importance.csv


## 7. Train on Full Data & Create Submission

In [10]:
# Train on full training data
# Fit the final pipeline (preprocessing + model) on all available training data
pipe.fit(X, y)

# Predict probabilities for the test set (for class "1" — accident occurs)
test_proba = pipe.predict_proba(X_test)[:, 1]

# Prepare submission file
# Start from sample_submission to keep correct format (ID + target column)
submission = sample_sub.copy()

# Identify the prediction column name (everything except the first column, usually 'target')
pred_cols = [c for c in submission.columns if c != submission.columns[0]]
pred_col = pred_cols[0] if pred_cols else "target"

# Insert predicted probabilities into the submission dataframe
submission[pred_col] = test_proba

In [11]:
# Save submission
sub_path = os.path.join(OUT_SUB, "logreg.csv")
submission.to_csv(sub_path, index=False)
print("Saved submission to:", sub_path)

Saved submission to: ../../outputs/submissions\logreg.csv


## Summary

The Logistic Regression model served as a simple linear baseline for this competition.  
Despite a decent **ROC AUC (~0.86)**, the model completely failed to capture the minority class due to the **extreme class imbalance** — only 12 positive samples in more than 100,000 observations.

Even with an optimized probability threshold, the model achieved near-zero recall and F1-score for the positive class.  
This behavior is expected because Logistic Regression is a **linear model** and cannot learn meaningful patterns from such rare events without strong feature signals or resampling.

**Key takeaways:**
- ROC AUC ≈ 0.86 shows some ranking ability, but not practical classification power.  
- Model predicts almost all cases as class “0” (no accident).  
- Class balancing or tree-based models (RandomForest, HistGradientBoosting) are better suited for this task.  
- Further improvement would require either **rebalancing the dataset** or **switching to regression setup** if the target is continuous.

👉 Conclusion: Logistic Regression is not suitable for this problem beyond a basic reference baseline.

