# Credit Risk Assessment: Surrogate Logistic Regression

---

In [10]:
from aura.utils.pathing import models, reports, root
import joblib
import json
import shap 
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import (
    roc_auc_score, average_precision_score, accuracy_score,
    precision_recall_curve, RocCurveDisplay, confusion_matrix,
    classification_report
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from scipy import sparse
from pathlib import Path
from datetime import date
import warnings
warnings.filterwarnings("ignore")
stamp=date.today().isoformat()

### Data Loading

In [2]:
# Directory holding the processed matrices and targets.
data = Path("../data/processed")

# Column selector saved by an earlier step (presumably SHAP-ranked feature
# indices, judging by the file name -- TODO confirm). Only the first five
# entries are used to slice the sparse design matrices.
top_idx = joblib.load(models / "shap_topidx_v1.joblib")
top5_cols = top_idx[:5]

X_train = sparse.load_npz(data / "X_train.npz")[:, top5_cols]
X_test = sparse.load_npz(data / "X_test.npz")[:, top5_cols]

# .squeeze() collapses a single-column frame into a Series.
y_train = pd.read_csv(data / "y_train.csv").squeeze()
y_test = pd.read_csv(data / "y_test.csv").squeeze()

# Previously fitted preprocessor artefact and the raw loan-level table.
preprocessed = joblib.load("../models/preprocessor.joblib")
df = pd.read_csv(Path("../data/raw/accepted_2007_to_2018Q4.csv"))

### Surrogate Logistic Regression Model

In [11]:
# Train (or reload) the five-feature surrogate logistic-regression pipeline.
#
# After this cell, BOTH branches leave the same names defined for the cells
# below: `pipeline` (alias `sur`), `X_train`, `X_test`, `y_train`, `y_test`.
# The cache is trusted only when the fitted pipeline and the raw-feature split
# it belongs to are all on disk; otherwise everything is rebuilt from `df`.
sur_path = models/"surrogate_lr_v3.joblib"
processed = Path("../data/processed")
split_paths = {
    "X_train": processed / "X_train_top5.csv",
    "X_test": processed / "X_test_top5.csv",
    "y_train": processed / "y_train_top5.csv",
    "y_test": processed / "y_test_top5.csv",
}

numeric_features = ["last_fico_range_high", "last_fico_range_low", "emp_length_na"]
categorical_features = ["term", "debt_settlement_flag"]
# Column order of the persisted X_*_top5.csv files.
feature_columns = [
    "last_fico_range_high", "last_fico_range_low",
    "term", "debt_settlement_flag", "emp_length_na",
]

if sur_path.exists() and all(path.exists() for path in split_paths.values()):
    print("Using cached surrogate")
    # NOTE: joblib.load unpickles arbitrary objects -- only load artefacts
    # written by this project.
    pipeline = sur = joblib.load(sur_path)
    # Restore the split written by the training branch so the evaluation cell
    # scores the cached model on the data it was actually held out from.
    X_train = pd.read_csv(split_paths["X_train"])
    X_test = pd.read_csv(split_paths["X_test"])
    y_train = pd.read_csv(split_paths["y_train"]).squeeze("columns")
    y_test = pd.read_csv(split_paths["y_test"]).squeeze("columns")
else:
    status_mapping = {"Fully Paid": 0, "Charged Off": 1}

    # Build a narrow working frame instead of overwriting the raw `df`, so the
    # cell is idempotent and only the needed columns are copied.
    raw_columns = ["loan_status", "emp_length"] + [
        col for col in feature_columns if col != "emp_length_na"
    ]
    is_closed_loan = df["loan_status"].isin(list(status_mapping))
    loans = df.loc[is_closed_loan, raw_columns].copy()
    loans["default"] = loans["loan_status"].map(status_mapping)
    loans["emp_length_na"] = loans["emp_length"].isna().astype(int)
    # Rows with missing model inputs are dropped, so the imputers below are
    # no-ops during fitting; they stay in the pipeline for inference-time gaps.
    loans = loans.dropna(subset=numeric_features + categorical_features)

    numeric_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(drop='if_binary', sparse_output=False, handle_unknown="ignore")),
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ])

    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("estimator", LogisticRegressionCV(
            penalty="l1", solver="saga", class_weight="balanced",
            Cs=np.logspace(-2, 1, 8), cv=5, scoring="roc_auc",
            max_iter=300, tol=1e-3, n_jobs=1, verbose=1, random_state=42
        ))
    ])

    X = loans[feature_columns]
    y = loans["default"]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.20,
        random_state=42,
        stratify=y,
    )
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
    print(f"Train target breakdown: {y_train.value_counts().to_dict()}")
    print(f"Test target breakdown: {y_test.value_counts().to_dict()}")

    pipeline.fit(X_train, y_train)
    sur = pipeline

    # Persist the split and the model. The pipeline is written to `sur_path`
    # (the file the cache guard checks) and to the legacy file name, so
    # existing consumers of the latter keep working.
    X_train.to_csv(split_paths["X_train"], index=False)
    X_test.to_csv(split_paths["X_test"], index=False)
    y_train.to_csv(split_paths["y_train"], index=False)
    y_test.to_csv(split_paths["y_test"], index=False)
    joblib.dump(pipeline, sur_path)
    joblib.dump(pipeline, "../models/surrogate_top5_pipeline.joblib")
    joblib.dump(top_idx, "../models/shap_topidx_v3.joblib")


Train shape: (1076248, 5), Test shape: (269062, 5)
Train target breakdown: {0: 861401, 1: 214847}
Test target breakdown: {0: 215350, 1: 53712}
Epoch 1, change: 1
Epoch 2, change: 0.14583325
Epoch 3, change: 0.0606375
Epoch 4, change: 0.035887161
Epoch 5, change: 0.0058504923
Epoch 6, change: 0.0035948695
Epoch 7, change: 0.0013908224
convergence after 8 epochs took 2 seconds
Epoch 1, change: 1
Epoch 2, change: 0.029139056
Epoch 3, change: 0.011719198
Epoch 4, change: 0.0046880557
Epoch 5, change: 0.0034984897
Epoch 6, change: 0.0013433935
convergence after 7 epochs took 1 seconds
Epoch 1, change: 1
Epoch 2, change: 0.023562553
Epoch 3, change: 0.014889695
Epoch 4, change: 0.010823605
Epoch 5, change: 0.0073289546
Epoch 6, change: 0.0051291259
Epoch 7, change: 0.0039204863
Epoch 8, change: 0.0027219245
Epoch 9, change: 0.0020735284
Epoch 10, change: 0.0015311728
Epoch 11, change: 0.0011702437
convergence after 12 epochs took 2 seconds
Epoch 1, change: 1
Epoch 2, change: 0.013032109
Epoc

### Evaluation Metrics

In [12]:
# Score the held-out split with the fitted surrogate and write a markdown report.
# `y_pred` holds the predicted probability of the positive class (second column
# of predict_proba); `pred` is the hard label at a 0.50 decision threshold.
y_pred = pipeline.predict_proba(X_test)[:, 1]
pred = (y_pred > 0.50).astype(int)

auc = roc_auc_score(y_test, y_pred)
pr_auc = average_precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, pred)
report = classification_report(y_test, pred, digits=3, zero_division=0)

md_file = reports / "metrics_surrogate.md"
report_md = "".join([
    f"# Surrogate Logistic Regression – {stamp}\n\n",
    "| Metric | Value |\n|--------|-------|\n",
    f"| ROC-AUC | **{auc:.4f}** |\n",
    f"| PR-AUC  | **{pr_auc:.3f}** |\n",
    f"| Accuracy| **{accuracy:.3f}** |\n\n",
    "<details><summary>Classification report</summary>\n\n```\n",
    report,
    "\n```\n</details>\n",
])
# The title contains a non-ASCII en dash, so pin the encoding rather than
# relying on the platform's locale default.
with open(md_file, "w", encoding="utf-8") as f:
    f.write(report_md)

print("Surrogate LR artefacts & metrics saved")

Surrogate LR artefacts & metrics saved


In [14]:
# Show the unrounded headline metrics, one per line.
print(
    f"AUC: {auc}, ",
    f"PR_AUC: {pr_auc}, ",
    f"Accuracy: {accuracy}",
    sep="\n",
)

AUC: 0.9536127291361357, 
PR_AUC: 0.8408654449203148, 
Accuracy: 0.8985624131241127


In [None]:
# Plain-text classification report (precision / recall / F1 per class) built
# in the evaluation cell from the 0.50-threshold predictions.
print(report)

              precision    recall  f1-score   support

           0      0.971     0.900     0.934    215350
           1      0.691     0.891     0.778     53712

    accuracy                          0.899    269062
   macro avg      0.831     0.896     0.856    269062
weighted avg      0.915     0.899     0.903    269062

