# Logistic Regression Model

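Train an L2-regularized logistic regression model, using 5-fold cross-validation on the training set to tune the inverse regularization strength `C`. The selected model is then evaluated on the held-out validation and test sets, and the model, validation predictions, and metrics are saved to disk.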

In [1]:
import os
import json
import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    brier_score_loss,
    classification_report,
    confusion_matrix
)

# Configuration
# Everything a reader might want to tune lives in this cell; later cells only
# reference these names.

# Directory layout, relative to the notebook's working directory.
DATA_DIR = "../data"
MODELS_DIR = "../models"
RESULTS_DIR = "../results"

# Input splits: one CSV per split, all under DATA_DIR.
TRAIN_FILE, VAL_FILE, TEST_FILE = (
    os.path.join(DATA_DIR, f"{split}.csv") for split in ("train", "val", "test")
)

# Output artifacts: the fitted model goes to MODELS_DIR, everything else to RESULTS_DIR.
MODEL_FILE = os.path.join(MODELS_DIR, "logistic_model.pkl")
PREDICTIONS_FILE, METRICS_FILE = (
    os.path.join(RESULTS_DIR, artifact)
    for artifact in ("logistic_predictions.csv", "logistic_metrics.json")
)

# Modelling constants.
TARGET_COL = "status"                      # label column present in every split
RANDOM_STATE = 42                          # passed to the estimator via the search grid
C_VALUES = [0.001, 0.01, 0.1, 1, 10, 100]  # candidate values for LogisticRegression's C

# Create the output directories up front so the save cell cannot fail on a missing folder.
for _out_dir in (MODELS_DIR, RESULTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

In [2]:
# Load data
def split_features_target(df, target_col=TARGET_COL):
    """Split a data frame into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        One data split, containing the feature columns plus ``target_col``.
    target_col : str, default TARGET_COL
        Name of the label column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        ``df`` without ``target_col``, and the ``target_col`` column itself.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    if target_col not in df.columns:
        raise KeyError(f"target column {target_col!r} not found in data frame")
    return df.drop(columns=[target_col]), df[target_col]


train_df = pd.read_csv(TRAIN_FILE)
val_df = pd.read_csv(VAL_FILE)
test_df = pd.read_csv(TEST_FILE)

X_train, y_train = split_features_target(train_df)
X_val, y_val = split_features_target(val_df)
X_test, y_test = split_features_target(test_df)

# Fail fast if a held-out split does not carry exactly the training features in
# the same order. Matching shapes alone would not catch reordered or renamed
# columns, and the model is fitted on X_train's column layout.
for _split_name, _X_split in (("validation", X_val), ("test", X_test)):
    if list(_X_split.columns) != list(X_train.columns):
        raise ValueError(
            f"{_split_name} feature columns do not match the training feature columns"
        )

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")

Training set: (179250, 67)
Validation set: (14867, 67)
Test set: (14867, 67)


In [3]:
# Train with 5-fold CV
# Only C is actually searched. The remaining entries are single-value lists, so
# they act as fixed settings but still show up in `best_params_`, which is
# persisted with the metrics later in the notebook.
# NOTE(review): the L2 penalty is sensitive to feature scale. This assumes the
# CSV features are already standardized - TODO confirm against the preprocessing step.
param_grid = {
    'C': C_VALUES,
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': [1000],
    'random_state': [RANDOM_STATE]
}

# Candidates are ranked by mean ROC AUC across 5 folds of the training data.
# With GridSearchCV's default refit behaviour, `best_estimator_` is the winning
# configuration refitted on all of X_train.
grid_search = GridSearchCV(
    LogisticRegression(),
    param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV AUC-ROC: {grid_search.best_score_:.4f}")

# A winner sitting on the smallest or largest candidate means the search may
# have been truncated: the true optimum could lie outside C_VALUES. Surface
# that instead of silently accepting it.
best_C = grid_search.best_params_['C']
if len(C_VALUES) > 1 and best_C in (min(C_VALUES), max(C_VALUES)):
    print(
        f"WARNING: best C={best_C} lies on the edge of the search grid {C_VALUES}; "
        "consider widening C_VALUES."
    )

Fitting 5 folds for each of 6 candidates, totalling 30 fits

Best parameters: {'C': 100, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs'}
Best CV AUC-ROC: 0.8517


In [4]:
# Evaluate on validation set
# Hard labels come from the model's own predict(). The score used for the
# ranking and calibration metrics is column 1 of predict_proba - presumably the
# probability of label 1, given the 0/1 labels in the report; confirm via
# best_model.classes_.
y_val_pred = best_model.predict(X_val)
y_val_proba = best_model.predict_proba(X_val)[:, 1]

# All three probability-based metrics share the (y_true, y_score) call shape.
val_auc_roc, val_auc_pr, val_brier = (
    metric(y_val, y_val_proba)
    for metric in (roc_auc_score, average_precision_score, brier_score_loss)
)

print(
    "Validation Set Performance:",
    f"AUC-ROC: {val_auc_roc:.4f}",
    f"AUC-PR: {val_auc_pr:.4f}",
    f"Brier Score: {val_brier:.4f}",
    sep="\n",
)

# Label-based diagnostics use the hard predictions from predict().
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

Validation Set Performance:
AUC-ROC: 0.8418
AUC-PR: 0.7709
Brier Score: 0.1338

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89     11203
           1       0.66      0.66      0.66      3664

    accuracy                           0.83     14867
   macro avg       0.77      0.77      0.77     14867
weighted avg       0.83      0.83      0.83     14867


Confusion Matrix:
[[9965 1238]
 [1259 2405]]


In [5]:
# Evaluate on test set
# Same scoring recipe as for the validation split: hard labels from predict(),
# column 1 of predict_proba as the score.
y_test_pred = best_model.predict(X_test)
y_test_proba = best_model.predict_proba(X_test)[:, 1]

# The three probability-based metrics share the (y_true, y_score) call shape.
test_auc_roc, test_auc_pr, test_brier = (
    metric(y_test, y_test_proba)
    for metric in (roc_auc_score, average_precision_score, brier_score_loss)
)

print(
    "Test Set Performance:",
    f"AUC-ROC: {test_auc_roc:.4f}",
    f"AUC-PR: {test_auc_pr:.4f}",
    f"Brier Score: {test_brier:.4f}",
    sep="\n",
)

Test Set Performance:
AUC-ROC: 0.8512
AUC-PR: 0.7800
Brier Score: 0.1312


In [6]:
# Save model
def _metric_record(auc_roc, auc_pr, brier, dataset):
    """Bundle one split's metrics into a JSON-serializable dict.

    Values are cast to built-in float so json.dump accepts them.
    """
    return {
        'auc_roc': float(auc_roc),
        'auc_pr': float(auc_pr),
        'brier_score': float(brier),
        'dataset': dataset
    }


# Persist the selected estimator from the grid search.
joblib.dump(best_model, MODEL_FILE)
print(f"Model saved to {MODEL_FILE}")

# Save predictions
# One row per validation example: true label, score, hard label, split tag.
# NOTE(review): only validation predictions are written; the test-set
# predictions computed in the test-evaluation cell are not persisted here.
# Confirm that is intentional.
predictions_df = pd.DataFrame({'true_label': y_val}).assign(
    predicted_probability=y_val_proba,
    predicted_label=y_val_pred,
    dataset='validation',
)
predictions_df.to_csv(PREDICTIONS_FILE, index=False)
print(f"Predictions saved to {PREDICTIONS_FILE}")

# Save metrics
# Search outcome plus validation and test metrics, written as a single JSON document.
all_metrics = {
    'best_params': grid_search.best_params_,
    'best_cv_score': float(grid_search.best_score_),
    'validation_metrics': _metric_record(val_auc_roc, val_auc_pr, val_brier, 'Validation'),
    'test_metrics': _metric_record(test_auc_roc, test_auc_pr, test_brier, 'Test')
}

with open(METRICS_FILE, 'w') as metrics_out:
    json.dump(all_metrics, metrics_out, indent=4)
print(f"Metrics saved to {METRICS_FILE}")

Model saved to ../models/logistic_model.pkl
Predictions saved to ../results/logistic_predictions.csv
Metrics saved to ../results/logistic_metrics.json
