In [1]:
# 📦 Imports (if needed)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib
import shap
import lightgbm as lgb

In [2]:
# 🧠 Load the persisted model together with the validation features and labels
model = joblib.load("../models/lgbm_model.joblib")  # trained LightGBM model
X_valid = pd.read_parquet("../data/processed/X_valid.parquet")  # validation features
# .squeeze() collapses a single-column frame into a Series
y_valid = pd.read_csv("../data/processed/y_valid.csv").squeeze()  # validation labels

# 📊 Score the validation set: column 1 of predict_proba is kept as the
# probability, then it is binarised at the chosen 0.3 threshold
y_pred_proba = model.predict_proba(X_valid)[:, 1]
y_pred_thresh = (y_pred_proba >= 0.3).astype(int)

# 🧮 New frame (X_valid itself is left untouched) carrying the predictions
# and the true labels next to the features
X_valid_final = X_valid.assign(
    loan_default_proba=y_pred_proba,
    predicted_label=y_pred_thresh,
    actual_label=y_valid.values,
)

In [16]:
# 📈 Initialize SHAP explainer
explainer = shap.TreeExplainer(model)

# ⚠️ Suppress SHAP's UserWarning about binary classifiers for this call only.
# A notebook-wide filterwarnings("ignore") would also hide unrelated
# UserWarnings raised by every later cell.
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    shap_values = explainer.shap_values(X_valid)

# Depending on the installed SHAP version, a binary classifier may yield a
# list with one array per class or a 3-D (n_rows, n_features, n_classes)
# array instead of a single 2-D array. Keep the last (positive) class so the
# DataFrame below always receives an (n_rows, n_features) matrix.
if isinstance(shap_values, list):
    shap_values = shap_values[-1]
shap_values = np.asarray(shap_values)
if shap_values.ndim == 3:
    shap_values = shap_values[:, :, -1]

# Make sure it's the right shape: (n_rows, n_features)
print("SHAP shape:", shap_values.shape)
if shap_values.shape != X_valid.shape:
    raise ValueError(
        f"Unexpected SHAP shape {shap_values.shape}; expected {X_valid.shape}"
    )

# ✅ Build DataFrame: one SHAP column per feature, plus the prediction
# columns and the true label so downstream files can be sliced by them
shap_df = pd.DataFrame(shap_values, columns=X_valid.columns)
shap_df["loan_default_proba"] = y_pred_proba
shap_df["predicted_label"] = y_pred_thresh
shap_df["actual_label"] = y_valid.values

# ✅ Save for Tableau
shap_df.to_csv("../data/final/shap_values_tableau.csv", index=False)
print("✅ SHAP values saved!")

SHAP shape: (61503, 92)
✅ SHAP values saved!


In [17]:
# 🔄 Reshape SHAP values from wide (one column per feature) to long
# (one row per observation/feature pair); the three prediction columns are
# repeated on every row as identifiers
shap_long = pd.melt(
    shap_df,
    id_vars=["loan_default_proba", "predicted_label", "actual_label"],
    var_name="feature",
    value_name="shap_value",
)

# 💾 Write the long table for Tableau
shap_long.to_csv("../data/final/shap_values_long.csv", index=False)
print("✅ SHAP long-format file saved!")

✅ SHAP long-format file saved!


In [10]:
# Purpose: Create CSV with global feature importance for Tableau

import pandas as pd

# Load the per-row SHAP values written by the SHAP export cell
shap_df = pd.read_csv("../data/final/shap_values_tableau.csv")

# Drop every non-SHAP column. The export also stores "actual_label" next to
# the SHAP values; leaving it in would rank the target itself as a feature.
# errors="ignore" keeps the cell usable on an export that lacks a column.
feature_cols = shap_df.drop(
    columns=["loan_default_proba", "predicted_label", "actual_label"],
    errors="ignore",
)

# Compute mean absolute SHAP per feature, most important first
global_importance = (
    feature_cols.abs()
    .mean()
    .rename_axis("feature_name")
    .reset_index(name="mean_abs_shap_value")
    .sort_values("mean_abs_shap_value", ascending=False)
)

# Save to CSV
global_importance.to_csv("../data/final/global_shap_importance.csv", index=False)

In [13]:
# Purpose: Prepare risk score data for Tableau visualization
# NOTE(review): a later cell writes a different table to this same
# risk_distribution.csv path, so whichever cell runs last wins.

# Pair each predicted default probability with its true label, then add a
# human-readable label name for Tableau ("Default" / "No Default")
risk_df = pd.DataFrame(
    {
        "loan_default_proba": y_pred_proba,
        "true_label": y_valid.values.flatten(),
    }
)
risk_df["label"] = risk_df["true_label"].map({1: "Default", 0: "No Default"})

# Save to CSV
risk_df.to_csv("../data/final/risk_distribution.csv", index=False)

In [5]:
# ✅ Columns exported to Tableau: the prediction, the true label and three
# raw features
available_cols = [
    "loan_default_proba",
    "actual_label",
    "DAYS_BIRTH",
    "AMT_INCOME_TOTAL",
    "DAYS_EMPLOYED",
]

# Readable label name; note this adds a "label" column to X_valid_final itself
X_valid_final["label"] = X_valid_final["actual_label"].map(
    {0: "No Default", 1: "Default"}
)

# Select the export columns plus the new label and write them out
risk_df = X_valid_final.loc[:, [*available_cols, "label"]]
risk_df.to_csv("../data/final/risk_distribution.csv", index=False)
print("✅ Simplified risk_distribution.csv saved.")

✅ Simplified risk_distribution.csv saved.


In [2]:
# 📦 Load SHAP values + risk scores
import pandas as pd
shap_df = pd.read_csv("../data/final/shap_values_tableau.csv")

# 🏷️ Create risk bands from predicted probabilities.
# include_lowest=True closes the first bin on the left, so a probability of
# exactly 0.0 falls into "Low Risk" instead of becoming NaN.
bins = [0, 0.2, 0.5, 1.0]
labels = ["Low Risk", "Medium Risk", "High Risk"]
shap_df["risk_band"] = pd.cut(
    shap_df["loan_default_proba"],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# 🔝 Top 15 features based on mean absolute SHAP value.
# Every non-SHAP column is removed first. The export stores "actual_label"
# alongside the SHAP values; leaving it in would let the target compete for
# a top-15 slot. errors="ignore" tolerates an export without that column.
mean_abs_shap = (
    shap_df.drop(
        columns=["loan_default_proba", "predicted_label", "actual_label", "risk_band"],
        errors="ignore",
    )
    .abs()
    .mean()
)
top_features = mean_abs_shap.sort_values(ascending=False).head(15).index.tolist()

# 📊 Melt and group for Tableau: mean SHAP value per (risk band, feature)
shap_melted = shap_df[["risk_band"] + top_features].melt(
    id_vars="risk_band", var_name="feature", value_name="shap_value"
)
agg_df = (
    shap_melted.groupby(["risk_band", "feature"], observed=True)["shap_value"]
    .mean()
    .reset_index()
)

# 💾 Save for Tableau
agg_df.to_csv("../data/final/agg_shap_by_risk_band.csv", index=False)
print("✅ SHAP aggregation saved for Tableau!")

✅ SHAP aggregation saved for Tableau!


In [10]:
# Purpose: Prepare SHAP vs Risk Score visualization data in long format for Tableau

import pandas as pd

# Read the wide table.
# NOTE(review): shap_vs_risk.csv is not written by any cell visible here —
# confirm which step produces it before relying on a fresh run.
df = pd.read_csv("../data/final/shap_vs_risk.csv")

# Columns to unpivot; each must exist in the CSV under exactly this name
feature_cols = [
    "EXT_SOURCE_1",
    "EXT_SOURCE_2",
    "EXT_SOURCE_3",
    "credit_annuity_ratio",
    "credit_goods_ratio",
    "CODE_GENDER_M",
    "DAYS_BIRTH",
    "ORGANIZATION_TYPE_TE",
]

# One output row per (observation, feature); the predicted probability is
# carried along as the identifier column
long_df = pd.melt(
    df,
    id_vars=["loan_default_proba"],
    value_vars=feature_cols,
    var_name="Feature",
    value_name="SHAP Value",
)

# Write the long table for Tableau
long_df.to_csv("../data/final/shap_vs_risk_long.csv", index=False)
print("✅ Saved: shap_vs_risk_long.csv")

✅ Saved: shap_vs_risk_long.csv


In [3]:
# 📦 Imports (in case not already)
import pandas as pd
from sklearn.metrics import confusion_matrix

# ✅ Calculate confusion matrix components.
# labels=[0, 1] pins the matrix to 2x2 even when one class is missing from
# both the actual and predicted labels; without it ravel() would return
# fewer than four values and the unpacking below would fail.
cm = confusion_matrix(
    X_valid_final["actual_label"],
    X_valid_final["predicted_label"],
    labels=[0, 1],
)
# Row = actual, column = predicted, so the flattened order is TN, FP, FN, TP
tn, fp, fn, tp = cm.ravel()

# 🧾 Create a summary DataFrame with one row per outcome
confusion_summary = pd.DataFrame({
    "Metric": ["True Negative", "False Positive", "False Negative", "True Positive"],
    "Count": [tn, fp, fn, tp]
})

# 💾 Save the confusion matrix summary
confusion_summary.to_csv("../data/final/confusion_matrix_summary.csv", index=False)
print("✅ Saved: confusion_matrix_summary.csv")

✅ Saved: confusion_matrix_summary.csv


In [4]:
# 🧮 Export one row per validation sample: predicted probability,
# thresholded prediction and true label
X_valid_final.loc[
    :, ["loan_default_proba", "predicted_label", "actual_label"]
].to_csv("../data/final/confusion_predictions.csv", index=False)
print("✅ Saved: confusion_predictions.csv")

✅ Saved: confusion_predictions.csv
