# Credit Risk Assessment: Surrogate Logistic Regression

---

In [1]:
from aura.utils.pathing import models, reports, root
import joblib
import json
import shap 
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import (
    roc_auc_score, average_precision_score, accuracy_score,
    precision_recall_curve, RocCurveDisplay, confusion_matrix,
    classification_report
)
from scipy import sparse
from pathlib import Path
from datetime import date
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so it also hides any convergence
# or deprecation warnings raised by later cells — confirm that is intended.
warnings.simplefilter("ignore")

# ISO-8601 date (YYYY-MM-DD) of this run; used to date-stamp the metrics report.
stamp = date.today().isoformat()

### Data Loading

In [2]:
# Column indices used to subset both design matrices.
# NOTE(review): judging by the file name these are presumably the top
# SHAP-ranked feature indices produced by an earlier notebook — confirm.
top_idx = joblib.load(models/"shap_topidx_v1.joblib")

# NOTE(review): this path is relative to the kernel's working directory, so the
# notebook must be launched from its own folder; `root` from aura.utils.pathing
# is imported but unused here and may be the intended anchor — confirm.
data = Path("../data/processed")

# Sparse feature matrices, restricted to the selected columns.
X_train = sparse.load_npz(data/"X_train.npz")[:, top_idx]
X_test  = sparse.load_npz(data/"X_test.npz")[:, top_idx]

# Squeeze along the column axis only, so a single-column label file always
# yields a Series (a bare .squeeze() would collapse a one-row file to a scalar).
y_train = pd.read_csv(data/"y_train.csv").squeeze("columns")
y_test  = pd.read_csv(data/"y_test.csv").squeeze("columns")

# Fail early, with a readable message, if features and labels are misaligned.
assert X_train.shape[0] == len(y_train), "X_train / y_train row counts differ"
assert X_test.shape[0] == len(y_test), "X_test / y_test row counts differ"

### Surrogate Logistic Regression Model

In [3]:
# --- Fit (or reload) the surrogate model -----------------------------------
# The fitted estimator is cached on disk so that Restart & Run All does not
# repeat the cross-validated fit. Delete the cache file to force a refit.
sur_path = models/"surrogate_lr_v1.joblib"
if sur_path.exists():
    print("Using cached surrogate")
    sur = joblib.load(sur_path)
else:
    # L1-penalised logistic regression; the regularisation strength C is chosen
    # by 5-fold cross-validation on ROC-AUC over 8 log-spaced candidates
    # between 1e-2 and 1e1.
    sur = LogisticRegressionCV(
        penalty="l1", solver="saga", class_weight="balanced",
        Cs=np.logspace(-2,1,8), cv=5, scoring="roc_auc",
        max_iter=300, tol=1e-3, n_jobs=-4, verbose=1, random_state=42
    ).fit(X_train, y_train)

    # Only the model is persisted: top_idx was loaded from its own file in the
    # data-loading cell and is unchanged, so writing it back would be redundant.
    joblib.dump(sur, sur_path)

# --- Evaluate on the held-out set ------------------------------------------
y_pred = sur.predict_proba(X_test)[:, 1]   # predicted probability of class index 1
pred = (y_pred > 0.50).astype(int)         # hard labels at a fixed 0.5 cut-off
auc = roc_auc_score(y_test, y_pred)
pr_auc = average_precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, pred)
report = classification_report(y_test, pred, digits=3, zero_division=0)

# --- Write the markdown metrics report -------------------------------------
md_file = reports / "metrics_surrogate.md"
# Make sure the target directory exists before writing.
Path(md_file).parent.mkdir(parents=True, exist_ok=True)
# Explicit UTF-8: the header line contains a non-ASCII en dash, which the
# platform-default encoding cannot always represent.
with open(md_file, "w", encoding="utf-8") as f:
    f.write(f"# Surrogate Logistic Regression – {stamp}\n\n")
    f.write("| Metric | Value |\n|--------|-------|\n")
    f.write(f"| ROC-AUC | **{auc:.4f}** |\n")
    f.write(f"| PR-AUC  | **{pr_auc:.3f}** |\n")
    f.write(f"| Accuracy| **{accuracy:.3f}** |\n\n")
    f.write("<details><summary>Classification report</summary>\n\n```\n")
    f.write(report)
    f.write("\n```\n</details>\n")

print("Surrogate LR artefacts & metrics saved")

Using cached surrogate
Surrogate LR artefacts & metrics saved
