# Credit Risk Assessment: SHAP Analysis

---

In [1]:
from aura.utils.pathing import models, reports, root
import joblib
import shap 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import date
from scipy import sparse
import warnings
# Notebook-wide configuration.
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

# ISO-8601 (YYYY-MM-DD) date of this run.
stamp = str(date.today())

# Directory holding the processed data, relative to the notebook's location.
data = Path("..") / "data" / "processed"

### SHAP Analysis

In [2]:
# Compute SHAP values for the LightGBM model on the training matrix, or reuse
# the artefacts written by a previous run. After this cell `top_idx`,
# `shap_vals`, `pre` and `feats` are defined on BOTH paths, so later cells do
# not depend on leftover kernel state.
shap_idx_path = models/"shap_topidx_v1.joblib"
shap_vals_path = models/"shap_vals_train.npz"

# Feature names are needed whether or not the cache is hit. The preprocessor is
# looked up under the processed-data directory first (the location this cell
# has always used) and then next to the models (the location the top-features
# cell reads from).
preproc_candidates = (data/"preprocessor.joblib", models/"preprocessor.joblib")
preproc_path = next((p for p in preproc_candidates if p.exists()), None)
if preproc_path is None:
    raise FileNotFoundError(
        "preprocessor.joblib not found in: "
        + ", ".join(str(p) for p in preproc_candidates)
    )
pre = joblib.load(preproc_path)
feats = pre.get_feature_names_out()

# The cache is only usable when both the ranking and the raw SHAP matrix exist;
# a ranking without the matrix would leave `shap_vals` undefined downstream.
if shap_idx_path.exists() and shap_vals_path.exists():
    print("Using cached SHAP artefacts")
    top_idx = joblib.load(shap_idx_path)
    shap_vals = np.load(shap_vals_path)["shap_vals"]
else:
    gbm = joblib.load(models/"lgbm_v1.joblib")
    Xtrain = sparse.load_npz(data/"X_train.npz").toarray()

    explainer = shap.TreeExplainer(gbm)
    shap_vals = explainer.shap_values(Xtrain, check_additivity=False)
    # Some SHAP versions return a list with one array per class; the first
    # entry is kept here.
    # NOTE(review): presumably index 0 is the negative class for a binary
    # model, which would flip the sign of the summary plot (the |SHAP| ranking
    # is unaffected) — confirm against the installed SHAP version.
    if isinstance(shap_vals, list):
        shap_vals = shap_vals[0]

    # Indices of the 50 features with the largest mean |SHAP|, descending.
    top_idx = np.argsort(np.abs(shap_vals).mean(0))[::-1][:50]
    joblib.dump(top_idx, shap_idx_path)
    np.savez_compressed(shap_vals_path, shap_vals=shap_vals.astype("float32"))

    # Make sure the figure directory exists before saving into it.
    fig_dir = reports/"figs"
    fig_dir.mkdir(parents=True, exist_ok=True)

    shap.summary_plot(shap_vals, Xtrain, feature_names=feats,
                      max_display=50, show=False)
    plt.savefig(fig_dir/"shap_summary_lightgbm.png", dpi=300, bbox_inches="tight")

    # Mean |SHAP| of the top features, labelled by feature name.
    pd.Series(np.abs(shap_vals).mean(0)[top_idx], index=feats[top_idx])\
      .to_csv(reports/"top_drivers.csv")

    print("SHAP artefacts saved")

Using cached SHAP artefacts


### All Features and their Importances

In [10]:
# Global importance of a feature = mean of |SHAP| over all rows of `shap_vals`.
all_importances = np.mean(np.abs(shap_vals), axis=0)

# Order the features from most to least important.
sorted_idx = np.argsort(-all_importances)
sorted_names = np.array(feats)[sorted_idx]
sorted_importances = all_importances[sorted_idx]

# One "name: importance" line per feature, in ranked order.
ranked_pairs = zip(sorted_names, sorted_importances)
for name, imp in ranked_pairs:
    print(f"{name}: {imp:.4f}")

num__last_fico_range_high: 1.0935
num__last_fico_range_low: 0.3947
cat__term_ 36 months: 0.0923
cat__debt_settlement_flag_N: 0.0785
num__emp_length_na: 0.0410
ord__grade: 0.0365
num__loan_to_income: 0.0358
cat__debt_settlement_flag_Y: 0.0277
num__int_rate: 0.0244
ord__sub_grade: 0.0240
cat__term_ 60 months: 0.0234
ord__emp_length: 0.0232
num__installment_to_income: 0.0219
num__mo_sin_old_rev_tl_op: 0.0142
num__fico_range_high: 0.0130
num__dti: 0.0124
num__total_rev_hi_lim: 0.0112
num__acc_open_past_24mths_na: 0.0097
num__installment: 0.0080
num__total_acc: 0.0072
num__funded_amnt: 0.0070
num__loan_amnt: 0.0068
num__fico_range_low: 0.0066
num__avg_cur_bal: 0.0062
num__bc_open_to_buy: 0.0060
num__mort_acc_na: 0.0049
num__funded_amnt_inv: 0.0047
num__num_rev_accts: 0.0047
num__pct_tl_nvr_dlq: 0.0039
num__pct_tl_nvr_dlq_na: 0.0035
num__mo_sin_old_rev_tl_op_na: 0.0032
num__acc_open_past_24mths: 0.0031
cat__verification_status_Not Verified: 0.0030
num__mort_acc: 0.0024
cat__home_ownership_MO

### Extract Top Features

In [12]:
# Ten hard-coded importance values, listed in descending order.
vals = np.array([
    1.0935, 0.3947, 0.0923, 0.0785, 0.0410,
    0.0365, 0.0358, 0.0277, 0.0244, 0.0240,
])

# The denominator is the sum of these ten values only, so the final cumulative
# share is 1.0 by construction.
total = np.sum(vals)
cumsum = np.cumsum(vals) / total
print(cumsum)

[0.59159273 0.80512876 0.85506384 0.897533   0.91971435 0.93946116
 0.95882926 0.97381519 0.9870158  1.        ]


In [13]:
# Select the N features with the largest mean |SHAP| and export them as a CSV
# for the UI/input form.
#
# NOTE(review): `models` and `reports` are rebound here to notebook-relative
# paths, shadowing the names imported from `aura.utils.pathing` in the first
# cell. The rebinding is kept so any later cell sees the same values as before;
# confirm both point at the same directories and drop one of the definitions.
models = Path("../models")
reports = Path("../reports")
shap_idx_path = models / "shap_topidx_v1.joblib"
shap_vals_path = models / "shap_vals_train.npz"
preproc_path = models / "preprocessor.joblib"

pre = joblib.load(preproc_path)
feats = pre.get_feature_names_out()

if shap_idx_path.exists() and shap_vals_path.exists():
    print("Using cached SHAP artefacts")
    top_idx = joblib.load(shap_idx_path)
    shap_vals = np.load(shap_vals_path)["shap_vals"]
else:
    # A cache miss used to be ignored silently. Values computed earlier in the
    # same kernel are still accepted, but if there are none, fail here with an
    # explanation instead of an unrelated NameError further down.
    try:
        top_idx, shap_vals
    except NameError as exc:
        raise FileNotFoundError(
            f"Cached SHAP artefacts not found ({shap_idx_path}, {shap_vals_path}) "
            "and no SHAP values are in memory; run the SHAP Analysis cell first."
        ) from exc
    print("Cached SHAP artefacts not found; using SHAP values already in memory")

# Number of top-ranked features exported for the UI.
N = 5
top_n_idx = top_idx[:N]
top_feature_names = feats[top_n_idx]
top_importances = np.abs(shap_vals).mean(0)[top_n_idx]

print("Top features for UI/input:", list(top_feature_names))
print("With importances:")
for name, imp in zip(top_feature_names, top_importances):
    print(f"{name}: {imp:.4f}")

pd.Series(top_importances, index=top_feature_names).to_csv(reports / "top_features_for_ui.csv")

Using cached SHAP artefacts
Top features for UI/input: ['num__last_fico_range_high', 'num__last_fico_range_low', 'cat__term_ 36 months', 'cat__debt_settlement_flag_N', 'num__emp_length_na']
With importances:
num__last_fico_range_high: 1.0935
num__last_fico_range_low: 0.3947
cat__term_ 36 months: 0.0923
cat__debt_settlement_flag_N: 0.0785
num__emp_length_na: 0.0410
