In [1]:
# CELL 1
# Uncomment to install in a fresh Colab environment
# !pip install -q xgboost shap scikit-learn imbalanced-learn matplotlib seaborn

print("Run this cell once to install packages if needed.")


Run this cell once to install packages if needed.


In [2]:
# CELL 2
import os, json, math, random
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import shap

PROJECT_DIR = "/content/credit-risk-shap-analysis"
PLOTS_DIR = os.path.join(PROJECT_DIR, "plots")
os.makedirs(PLOTS_DIR, exist_ok=True)
print("Project folder:", PROJECT_DIR)
print("Plots folder:", PLOTS_DIR)


Project folder: /content/credit-risk-shap-analysis
Plots folder: /content/credit-risk-shap-analysis/plots


In [3]:
# CELL 3
# Option A: use uploaded CSV in Colab (place file at /content/credit_risk.csv)
uploaded_path = "/content/credit_risk.csv"

if os.path.exists(uploaded_path):
    df = pd.read_csv(uploaded_path)
    print("Loaded dataset from", uploaded_path)
else:
    # Option B: generate a synthetic dataset (small, realistic-ish)
    np.random.seed(42)
    n = 15000
    df = pd.DataFrame({
        "age": np.random.randint(21, 75, n),
        "income": np.round(np.random.normal(50000, 18000, n)).astype(int),
        "loan_amount": np.round(np.random.normal(15000, 9000, n)).astype(int),
        "loan_term": np.random.choice([12,24,36,48,60], n),
        "num_of_dependents": np.random.poisson(1.2, n),
        "credit_history_length": np.random.randint(0, 30, n),
        "num_of_open_accounts": np.random.randint(0, 10, n),
        "delinquencies_2yrs": np.random.poisson(0.3, n),
        "employment_years": np.random.randint(0, 40, n),
        "home_owner": np.random.choice([0,1], n, p=[0.6,0.4])
    })
    # synthetic risk signal -> binary target
    risk_score = (
        0.03*(65 - df["age"]) +
        0.00002*(70000 - df["income"]) +
        0.00009*df["loan_amount"] +
        0.06*df["delinquencies_2yrs"] -
        0.02*df["home_owner"] -
        0.02*df["credit_history_length"]
    )
    logit = -2.0 + risk_score + np.random.normal(0,0.5, n)
    prob = 1/(1+np.exp(-logit))
    df["default"] = (np.random.rand(n) < prob).astype(int)
    print("Generated synthetic dataset, n =", len(df))

df.head()


Generated synthetic dataset, n = 15000


Unnamed: 0,age,income,loan_amount,loan_term,num_of_dependents,credit_history_length,num_of_open_accounts,delinquencies_2yrs,employment_years,home_owner,default
0,59,82482,14137,60,0,22,1,0,0,0,0
1,72,50334,-470,36,0,10,1,0,32,0,0
2,49,42671,25717,12,2,2,4,0,39,0,1
3,35,67794,11860,36,1,25,0,0,29,1,0
4,63,46626,8203,48,1,14,4,1,0,0,0


In [4]:
# CELL 4
# Minimal preprocessing: remove rows with missing target and simple cleaning
df = df.copy()
df = df.dropna(subset=["default"])
target = "default"

# Feature list (exclude target); convert categorical if needed
feature_cols = [c for c in df.columns if c != target]
# If any object/categorical cols exist, convert via one-hot (small cardinality)
cat_cols = df[feature_cols].select_dtypes(include=["object","category"]).columns.tolist()
if len(cat_cols) > 0:
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
    feature_cols = [c for c in df.columns if c != target]

X = df[feature_cols]
y = df[target]

# Train-test split (stratify on target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)
print("Train size:", X_train.shape, "Test size:", X_test.shape)


Train size: (10500, 10) Test size: (4500, 10)


In [5]:
# CELL 5
# Scale numeric values then apply SMOTE within a pipeline for training
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled = scaler.transform(X_test[numeric_cols])

# SMOTE expects arrays
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_scaled, y_train.values)
print("After SMOTE:", np.bincount(y_res))
# Save scaled DataFrames for SHAP later (we'll keep feature names)
X_res_df = pd.DataFrame(X_res, columns=numeric_cols)
X_test_df = pd.DataFrame(X_test_scaled, columns=numeric_cols)


After SMOTE: [5298 5298]


In [6]:
# CELL 6
xgb = XGBClassifier(use_label_encoder=False, eval_metric="auc", random_state=42, verbosity=0)

param_dist = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0]
}

rs = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=12,
                        scoring="roc_auc", cv=3, n_jobs=1, random_state=1, verbose=1)
rs.fit(X_res_df, y_res)
print("Best params:", rs.best_params_)
best_model = rs.best_estimator_


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best params: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 0.6}


In [7]:
# CELL 7
# Predict on scaled test set
y_proba = best_model.predict_proba(X_test_df)[:,1]
y_pred = (y_proba >= 0.5).astype(int)

auc = roc_auc_score(y_test, y_proba)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
cm = confusion_matrix(y_test, y_pred)

results = {
    "auc": float(auc),
    "precision": float(prec),
    "recall": float(rec),
    "f1": float(f1),
    "confusion_matrix": cm.tolist(),
    "best_params": rs.best_params_
}

# save results
os.makedirs(PROJECT_DIR, exist_ok=True)
with open(os.path.join(PROJECT_DIR, "results.json"), "w") as f:
    json.dump(results, f, indent=2)

print(f"AUC: {auc:.4f} Precision: {prec:.4f} Recall: {rec:.4f} F1: {f1:.4f}")
print("Confusion matrix:\n", cm)


AUC: 0.7235 Precision: 0.6522 Recall: 0.6555 F1: 0.6538
Confusion matrix:
 [[1492  779]
 [ 768 1461]]


In [8]:
# CELL 8
# XGBoost native importance (gain)
# Note: model was trained on scaled numeric features only (numeric_cols)
importances = best_model.get_booster().get_score(importance_type="gain")
# align to all numeric_cols (XGBoost uses f0..)
imp_df = pd.DataFrame({
    "feature": list(importances.keys()),
    "gain": list(importances.values())
})
# map f0->feature name if needed
# XGBoost feature names likely f0..fN if trained on numpy - map them:
mapping = {f"f{i}": name for i, name in enumerate(numeric_cols)}
imp_df["feature_readable"] = imp_df["feature"].map(mapping)
imp_df = imp_df.sort_values("gain", ascending=False)
plt.figure(figsize=(8,4))
sns.barplot(x="gain", y="feature_readable", data=imp_df)
plt.title("XGBoost feature importance (gain)")
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "gain_importance.png"))
plt.close()
print("Saved gain_importance.png")


Saved gain_importance.png


In [9]:
# CELL 9
# Build SHAP explainer and compute SHAP values on a sample for speed
explainer = shap.Explainer(best_model)
# Use a sample of training data (post-SMOTE) for global explanation
sample_idx = np.random.choice(X_res_df.shape[0], size=min(2000, X_res_df.shape[0]), replace=False)
X_shap_sample = X_res_df.iloc[sample_idx]
shap_values = explainer(X_shap_sample)

# Summary bar
plt.figure(figsize=(8,5))
shap.plots.bar(shap_values, show=False)
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "shap_summary_bar.png"))
plt.close()

# Beeswarm
plt.figure(figsize=(10,6))
shap.plots.beeswarm(shap_values, show=False)
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "shap_beeswarm.png"))
plt.close()

print("Saved shap_summary_bar.png and shap_beeswarm.png")


Saved shap_summary_bar.png and shap_beeswarm.png


In [10]:
# CELL 10
# Identify top feature name from mean abs shap
mean_abs = np.abs(shap_values.values).mean(axis=0)
top_idx = int(np.argmax(mean_abs))
top_feature = X_shap_sample.columns[top_idx]
print("Top SHAP feature:", top_feature)

plt.figure(figsize=(8,5))
shap.dependence_plot(top_feature, shap_values.values, X_shap_sample, show=False, feature_names=X_shap_sample.columns)
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, f"shap_dependence_{top_feature}.png"))
plt.close()
print("Saved dependence plot for:", top_feature)


Top SHAP feature: loan_amount
Saved dependence plot for: loan_amount


<Figure size 800x500 with 0 Axes>

In [11]:
# CELL 11
# Prepare X_test_df (already scaled) and proba_series
proba_series = pd.Series(y_proba, index=X_test_df.index)

# pick cases:
# high-risk approve -> high predicted prob but actual label = 0 (false positive) OR top prob
# low-risk reject -> low predicted prob but actual label = 1 (false negative) OR bottom prob
# borderline -> prob near 0.5
# attempt to find matching examples; fallback to top/bottom/near-0.5
high_candidates = X_test_df[(y_test==0) & (proba_series > 0.8)].index.tolist()
low_candidates = X_test_df[(y_test==1) & (proba_series < 0.2)].index.tolist()
border_candidates = X_test_df[(proba_series > 0.45) & (proba_series < 0.55)].index.tolist()

def choose(idx_list, fallback_sorted):
    if len(idx_list)>0:
        return idx_list[0]
    return fallback_sorted[0]

high_idx = choose(high_candidates, proba_series.sort_values(ascending=False).index.tolist())
low_idx = choose(low_candidates, proba_series.sort_values(ascending=True).index.tolist())
# borderline: choose nearest to 0.5
if len(border_candidates)>0:
    border_idx = border_candidates[0]
else:
    border_idx = (proba_series - 0.5).abs().sort_values().index[0]

cases = [high_idx, low_idx, border_idx]
print("Selected test indices for local explanations:", cases)

# Generate local SHAP (waterfall) explanations and save PNGs
for i, idx in enumerate(cases, start=1):
    sample = X_test_df.loc[idx:idx]
    expl = explainer(sample)  # Explanation object
    # waterfall plot
    plt.figure(figsize=(8,4))
    try:
        shap.plots.waterfall(expl[0], show=False)
    except Exception:
        # fallback: force plot saved as image via matplotlib
        vals = expl.values[0]
        order = np.argsort(np.abs(vals))[::-1][:10]
        plt.barh(np.array(X_test_df.columns)[order], vals[order])
    plt.title(f"Local SHAP explanation - case {i} (test idx {idx})")
    outp = os.path.join(PLOTS_DIR, f"local_shap_case_{i}.png")
    plt.tight_layout()
    plt.savefig(outp)
    plt.close()
    print("Saved", outp)


  high_candidates = X_test_df[(y_test==0) & (proba_series > 0.8)].index.tolist()
  low_candidates = X_test_df[(y_test==1) & (proba_series < 0.2)].index.tolist()


Selected test indices for local explanations: [4460, 3298, 5]
Saved /content/credit-risk-shap-analysis/plots/local_shap_case_1.png
Saved /content/credit-risk-shap-analysis/plots/local_shap_case_2.png
Saved /content/credit-risk-shap-analysis/plots/local_shap_case_3.png


In [12]:
# CELL 12
for root, dirs, files in os.walk(PROJECT_DIR):
    for fn in files:
        print(os.path.join(root, fn))
print("All done. Check plots in:", PLOTS_DIR)


/content/credit-risk-shap-analysis/results.json
/content/credit-risk-shap-analysis/plots/local_shap_case_3.png
/content/credit-risk-shap-analysis/plots/shap_summary_bar.png
/content/credit-risk-shap-analysis/plots/shap_dependence_loan_amount.png
/content/credit-risk-shap-analysis/plots/local_shap_case_1.png
/content/credit-risk-shap-analysis/plots/local_shap_case_2.png
/content/credit-risk-shap-analysis/plots/shap_beeswarm.png
/content/credit-risk-shap-analysis/plots/gain_importance.png
All done. Check plots in: /content/credit-risk-shap-analysis/plots
