# In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, classification_report
import shap
import lime
import lime.lime_tabular

# Credit-risk workflow: load the data, fit a gradient-boosting classifier,
# report hold-out metrics, then explain the predictions with SHAP and LIME.

# Load the dataset
df = pd.read_csv('/content/Credit Risk Benchmark Dataset.csv')

# Display the first few rows to check format
print(df.head())

# Assumption: the target label is the last column of the CSV.
target_col = df.columns[-1]
X = df.drop(columns=target_col)
y = df[target_col]

# Split the data: 20% held out for evaluation, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
# The estimator must exist and be fitted before any of the evaluation or
# explanation steps below; without this the script fails with a NameError.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Model validation
# Column 1 of predict_proba is the second class in model.classes_
# (assumes a binary target whose positive / "risky" class sorts last — confirm).
y_pred_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = auc(recall, precision)
print('ROC AUC:', roc_auc, 'PR AUC:', pr_auc)
print(classification_report(y_test, model.predict(X_test)))

# SHAP Global & Local Explanation
# X_train is passed as the background data for the explainer.
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_test)
shap.summary_plot(shap_values, X_test)

# Top 10 features, ranked by mean absolute SHAP value over the test set
importances = np.abs(shap_values.values).mean(axis=0)
top_10_features = (
    pd.Series(importances, index=X_test.columns)
    .sort_values(ascending=False)
    .head(10)
)
print(top_10_features)

# Local SHAP for 5 high-risk and 5 low-risk
# Sort once by predicted probability; both ends of the ordering are reused.
risk_order = np.argsort(y_pred_proba)
high_risk_idx = risk_order[-5:]
low_risk_idx = risk_order[:5]
indices = np.concatenate([high_risk_idx, low_risk_idx])
for idx in indices:
    shap.plots.waterfall(shap_values[idx])


# LIME Explanations (for same 10 instances)
def _predict_proba_with_names(rows):
    """Score LIME's perturbed NumPy rows with the training column names restored.

    The model was fitted on a DataFrame, so handing it bare arrays makes
    scikit-learn warn about missing feature names on every call.
    """
    return model.predict_proba(pd.DataFrame(rows, columns=X_train.columns))


lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=np.array(X_train),
    feature_names=X_train.columns.tolist(),
    mode='classification',
    random_state=42,  # LIME samples perturbations; seed them for repeatable output
)
for idx in indices:
    exp = lime_explainer.explain_instance(
        X_test.iloc[idx].values,
        _predict_proba_with_names,
        num_features=10,
    )
    exp.show_in_notebook(show_table=True)