# Credit Risk Assessment: Surrogate Logistic Regression

---

In [22]:
from aura.utils.pathing import models, reports, root
import joblib
import json
import shap 
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_recall_curve, RocCurveDisplay, confusion_matrix,
    classification_report, precision_score, recall_score, f1_score, PrecisionRecallDisplay
)
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.calibration import CalibratedClassifierCV
import category_encoders as ce
from scipy.stats import ks_2samp
import scipy.stats as ss
from pathlib import Path
from datetime import date
import warnings
warnings.filterwarnings("ignore")
stamp=date.today().isoformat()
models = Path('../models')
reports = Path('../reports')
figs = Path('../reports/figs')
processed = Path("../data/processed")
model_version = "v1"
random_state = 42

### Load Data

In [3]:
df = pd.read_parquet("../data/processed/lc_cleaned.parquet")
cutoff_train = pd.to_datetime("2016-12-31")
cutoff_val = pd.to_datetime("2017-12-31")

train_idx = df["issue_d"] <= cutoff_train
val_idx = (df["issue_d"] > cutoff_train) & (df["issue_d"] <= cutoff_val)
test_idx= df["issue_d"] > cutoff_val

X_train, y_train = df.loc[train_idx], df.loc[train_idx, "default"]
X_val, y_val = df.loc[val_idx], df.loc[val_idx, "default"]
X_test, y_test = df.loc[test_idx], df.loc[test_idx, "default"]

ui_cols = ["grade", "term", "acc_open_past_24mths", "dti", "fico_mid"]

X_train = X_train[ui_cols].copy()
X_val = X_val[ui_cols].copy()
X_test = X_test[ui_cols].copy()

print("Shapes (train/val/test):", X_train.shape, X_val.shape, X_test.shape)

Shapes (train/val/test): (1132562, 5) (181728, 5) (68061, 5)


### Metrics

In [4]:
def ks_score(y, p): return ks_2samp(p[y==0], p[y==1]).statistic

def profit_curve(y, p, gain_tp=.80, cost_fp=.10):
    prec, rec, thr = precision_recall_curve(y, p)
    tp = rec * y.sum(); fp = (tp / np.clip(prec,1e-9,1)) - tp
    prof = (gain_tp*tp - cost_fp*fp) / len(y)
    i = prof.argmax(); return float(thr[i]), float(prof[i])

def best_f1(y, p):
    prec, rec, thr = precision_recall_curve(y, p)
    f1 = 2*prec*rec/(prec+rec+1e-12); return float(thr[f1.argmax()])

def metrics(y, p, thr):
    y_hat = (p > thr).astype(int)
    return dict(AUC=roc_auc_score(y,p),
                PR_AUC=average_precision_score(y,p),
                KS=ks_score(y,p),
                Precision=precision_score(y,y_hat),
                Recall=recall_score(y,y_hat),
                ConfMatrix=confusion_matrix(y,y_hat).tolist(),
                ClassReport=classification_report(y,y_hat,
                                                  digits=3, zero_division=0))

### Preprocessing

In [6]:
def engineer(df: pd.DataFrame) -> pd.DataFrame:
    z = df.copy()
    z["grade_term"] = z["grade"].astype(str) + "_" + z["term"].astype(str)
    z["grade_term"]= z["grade_term"].astype("category")
    z["dti_inv"] = 1.0 / (z["dti"] + 1e-3)
    z["fico_mid_sq"] = z["fico_mid"] ** 2
    return z.drop(columns=["grade","term","dti","fico_mid"])

X_train_fe = engineer(X_train)
X_val_fe = engineer(X_val)
X_test_fe = engineer(X_test)

num_feats = ["acc_open_past_24mths", "dti_inv", "fico_mid_sq"]    
cat_feats = ["grade_term"]

num_pipe = Pipeline([
    ("imp",   SimpleImputer(strategy="median")),
    ("scale", StandardScaler())
])

ohe_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore",
                          sparse_output=True, min_frequency=0.005))
])

preprocessor = ColumnTransformer([
    ("num", num_pipe, num_feats),
    ("lowc", ohe_pipe, cat_feats)
], remainder="drop")
joblib.dump(preprocessor, models/f"surrogate_lr_preprocessor_{model_version}.joblib")

print(df.shape, X_train.shape, X_val.shape, X_test.shape)

(1382351, 187) (1132562, 5) (181728, 5) (68061, 5)


### Save Objects

In [12]:
processed = Path("../data/processed")
X_train_fe.to_pickle(processed/"X_train_sur.pkl")
X_val_fe.to_pickle(processed/"X_val_sur.pkl")
X_test_fe.to_pickle(processed/"X_test_sur.pkl")
pd.Series(y_train).to_csv(processed/"y_train_sur.csv", index=False)
pd.Series(y_val).to_csv(processed/"y_val_sur.csv", index=False)
pd.Series(y_test).to_csv(processed/"y_test_sur.csv", index=False)
print("saved to", processed.resolve())

saved to /Users/pranavrao/Documents/ai-ml-projects/github-repos/aura-xai-finrisk-llm/data/processed


### Surrogate Logistic Regression Model

In [10]:
sur_path = models/f"surrogate_lr_{model_version}.joblib"
meta_path= models / f"surrogate_lr_meta_{model_version}.json"

if sur_path.exists():
    print("Using cached surrogate"); sur = joblib.load(sur_path)
else:
    tscv = TimeSeriesSplit(n_splits=3)
    base_lr = LogisticRegression(
        penalty="elasticnet", solver="saga", max_iter=2000, tol=5e-4,
        class_weight="balanced", l1_ratio=0.5, n_jobs=1,
        random_state=random_state, verbose=0
    )
    pipe = Pipeline([("pre", preprocessor), ("clf", base_lr)],
                    memory=joblib.Memory("./cache"))
    param_dist = {
        "clf__C": ss.loguniform(5e-2, 5),
        "clf__l1_ratio": ss.uniform(0, 1)
    }

    search = RandomizedSearchCV(pipe, param_dist,
                                n_iter=20, scoring="roc_auc",
                                cv=tscv, n_jobs=3, random_state=random_state,
                                verbose=2).fit(X_train_fe, y_train)
    best_C = search.best_params_["clf__C"]
    best_l1 = search.best_params_["clf__l1_ratio"]
    print("Best AUC (CV):", search.best_score_)

    final_lr = LogisticRegression(
        penalty="elasticnet", solver="saga", C=best_C, l1_ratio=best_l1,
        max_iter=2000, tol=5e-4, class_weight="balanced",
        n_jobs=1, random_state=random_state, verbose=0
    )
    final_pipe = Pipeline([("pre", preprocessor), ("clf", final_lr)])
    final_pipe.fit(X_train_fe, y_train)

    sur = CalibratedClassifierCV(final_pipe, method="isotonic", cv="prefit")
    sur.fit(X_val_fe, y_val)

    joblib.dump(sur, sur_path)
    json.dump({"date": date.today().isoformat(),
               "UI_inputs": ui_cols,
               "eng_feats": num_feats + cat_feats,
               "best_C": float(best_C),
               "best_l1": float(best_l1)},
              open(meta_path,"w"), indent=2)
    print("Surrogate model + meta written.")

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END clf__C=0.2805758207667252, clf__l1_ratio=0.9507143064099162; total time=   3.7s
[CV] END clf__C=0.2805758207667252, clf__l1_ratio=0.9507143064099162; total time=   8.0s
[CV] END clf__C=1.4553179565665342, clf__l1_ratio=0.5986584841970366; total time=   4.4s
[CV] END clf__C=0.2805758207667252, clf__l1_ratio=0.9507143064099162; total time=  13.9s
[CV] END clf__C=0.10256691315437247, clf__l1_ratio=0.15599452033620265; total time=   3.4s
[CV] END clf__C=1.4553179565665342, clf__l1_ratio=0.5986584841970366; total time=   9.8s
[CV] END clf__C=1.4553179565665342, clf__l1_ratio=0.5986584841970366; total time=  13.9s
[CV] END clf__C=0.06533369619026641, clf__l1_ratio=0.8661761457749352; total time=   2.1s
[CV] END clf__C=0.10256691315437247, clf__l1_ratio=0.15599452033620265; total time=   8.2s
[CV] END clf__C=0.10256691315437247, clf__l1_ratio=0.15599452033620265; total time=  12.3s
[CV] END clf__C=0.06533369619026641, clf__

### Evaluation Metrics - Train/Val

In [13]:
p_train = sur.predict_proba(X_train_fe)[:, 1]
p_val = sur.predict_proba(X_val_fe)[:,1]

thr_f1_train = best_f1(y_train, p_train)
thr_f1_val = best_f1(y_val, p_val)
thr_profit_val, _ = profit_curve(y_val, p_val)

m_train = metrics(y_train, p_train, thr_f1_train)
m_val_f = metrics(y_val, p_val, thr_f1_val)
m_val_p = metrics(y_val,p_val,thr_profit_val)

print("=== TRAIN (F1-opt) ===")
print(m_train["ClassReport"])
print("\n=== VALID (F1-opt) ===")
print(m_val_f["ClassReport"])
print("\n=== VALID (profit-opt) ===")
print(m_val_p["ClassReport"])

def save_curves(split:str, y_true, proba):
    roc = RocCurveDisplay.from_predictions(y_true, proba, name=f"LR – {split.upper()}")
    roc.figure_.savefig(figs / f"roc_sur_{split}_{model_version}.png", dpi=300)
    plt.close(roc.figure_)

    pr = PrecisionRecallDisplay.from_predictions(y_true, proba, name=f"LR – {split.upper()}")
    pr.figure_.savefig(figs/f"pr_sur_{split}_{model_version}.png", dpi=300)
    plt.close(pr.figure_)

save_curves("train", y_train, p_train)
save_curves("val",y_val,p_val)
print("ROC & PR curves saved for train + val")

md_path = reports / f"metrics_sur_trainval_{model_version}.md"
with open(md_path, "w") as f:
    f.write(f"# Logistic Regression – {model_version}  \n")
    f.write(f"*Date generated:* {date.today().isoformat()}\n\n")

    # ---------- thresholds ----------
    f.write("## Thresholds chosen\n")
    f.write("| Split | F1-opt | Profit-opt |\n")
    f.write("|-------|-------:|-----------:|\n")
    f.write(f"| Train | {thr_f1_train:.3f} | – |\n")
    f.write(f"| Val | {thr_f1_val:.3f} | {thr_profit_val:.3f} |\n\n")

    # ---------- TRAIN ----------
    f.write("## Train (F1-optimised)\n")
    f.write(f"- **AUC:** `{m_train['AUC']:.4f}`  \n")
    f.write(f"- **PR-AUC:** `{m_train['PR_AUC']:.4f}`  \n")
    f.write(f"- **KS:** `{m_train['KS']:.4f}`  \n\n")
    f.write("```text\n" + m_train["ClassReport"] + "\n```\n\n")

    # ---------- VAL: F1 ----------
    f.write("## Validation (F1-optimised)\n")
    f.write(f"- **AUC:** `{m_val_f['AUC']:.4f}`  \n")
    f.write(f"- **PR-AUC:** `{m_val_f['PR_AUC']:.4f}`  \n")
    f.write(f"- **KS:** `{m_val_f['KS']:.4f}`  \n\n")
    f.write("```text\n" + m_val_f["ClassReport"] + "\n```\n\n")

    # ---------- VAL: Profit ----------
    f.write("## Validation (Profit-optimised)\n")
    f.write(f"- **AUC:** `{m_val_p['AUC']:.4f}`  \n")
    f.write(f"- **PR-AUC:** `{m_val_p['PR_AUC']:.4f}`  \n")
    f.write(f"- **KS:** `{m_val_p['KS']:.4f}`  \n\n")
    f.write("```text\n" + m_val_p['ClassReport'] + "\n```\n")

print("train & validation curves saved and markdown written to", md_path.relative_to(reports))

=== TRAIN (F1-opt) ===
              precision    recall  f1-score   support

           0      0.874     0.659     0.751    901143
           1      0.322     0.631     0.426    231419

    accuracy                          0.653   1132562
   macro avg      0.598     0.645     0.589   1132562
weighted avg      0.761     0.653     0.685   1132562


=== VALID (F1-opt) ===
              precision    recall  f1-score   support

           0      0.838     0.460     0.594    130152
           1      0.363     0.775     0.494     51576

    accuracy                          0.549    181728
   macro avg      0.600     0.618     0.544    181728
weighted avg      0.703     0.549     0.565    181728


=== VALID (profit-opt) ===
              precision    recall  f1-score   support

           0      0.915     0.147     0.253    130152
           1      0.310     0.965     0.469     51576

    accuracy                          0.379    181728
   macro avg      0.612     0.556     0.361    181728

### Evaluation Metrics - Test

In [None]:
p_test = sur.predict_proba(X_test_fe)[:, 1]
thr_f1_test = best_f1(y_test, p_test)
thr_profit_test, _ = profit_curve(y_test, p_test)

m_test_f = metrics(y_test, p_test, thr_f1_test)
m_test_p = metrics(y_test, p_test, thr_profit_test)

print("=== TEST (F1-opt) ===")
print(m_test_f["ClassReport"])
print("\n=== TEST (profit-opt) ===")
print(m_test_p["ClassReport"])

roc = RocCurveDisplay.from_predictions(y_test, p_test, name="LR – TEST")
roc.figure_.savefig(figs / f"roc_sur_test_{model_version}.png", dpi=300)
plt.close(roc.figure_)

pr  = PrecisionRecallDisplay.from_predictions(y_test, p_test, name="LR – TEST")
pr.figure_.savefig(figs / f"pr_sur_test_{model_version}.png", dpi=300)
plt.close(pr.figure_)

md_path = reports / f"metrics_sur_test_{model_version}.md"
with open(md_path, "a") as f:          
    f.write("\n---\n\n")
    f.write("## Test results\n")
    f.write("| Threshold type | Value |\n")
    f.write("|----------------|------:|\n")
    f.write(f"| F1-opt | {thr_f1_test:.3f} |\n")
    f.write(f"| Profit-opt | {thr_profit_test:.3f} |\n\n")

    # F1-optimised block
    f.write("### Test (F1-optimised)\n")
    f.write(f"- **AUC:** `{m_test_f['AUC']:.4f}`  \n")
    f.write(f"- **PR-AUC:** `{m_test_f['PR_AUC']:.4f}`  \n")
    f.write(f"- **KS:** `{m_test_f['KS']:.4f}`  \n\n")
    f.write("```text\n" + m_test_f["ClassReport"] + "\n```\n\n")

    # Profit-optimised block
    f.write("### Test (Profit-optimised)\n")
    f.write(f"- **AUC:** `{m_test_p['AUC']:.4f}`  \n")
    f.write(f"- **PR-AUC:** `{m_test_p['PR_AUC']:.4f}`  \n")
    f.write(f"- **KS:** `{m_test_p['KS']:.4f}`  \n\n")
    f.write("```text\n" + m_test_p['ClassReport'] + "\n```\n")

print("test curves saved and markdown updated to", md_path.relative_to(reports))