## 安装导入包
## 数据导入
## 模型训练
## 模型评估
## SHAP解释
## LIME解释

In [None]:
# Install shap
!pip install shap

In [None]:
# Install lime
!pip install lime

In [None]:
# 导入包
import pandas as pd
import numpy as np
import shap
import lime
import sklearn
import shap
shap.initjs() # load JS visualization code to notebook
import xgboost
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Load the HELOC data set and separate the features (X) from the target (y).
heloc = pd.read_csv('../input/home-equity-line-of-creditheloc/heloc_dataset_v1 (1).csv')
X = heloc.drop(columns='RiskPerformance')
# Encode the target explicitly: 'Bad' -> 1 (positive class), 'Good' -> 0.
# Series.map is used instead of replace(): replacing strings with ints relies on
# an implicit object -> int downcast that pandas has deprecated. Any unexpected
# label becomes NaN under map(), so the astype() below fails fast instead of
# letting string labels leak into training.
y = heloc['RiskPerformance'].map({'Bad': 1, 'Good': 0}).astype('int64')

In [None]:
# Split into training and test sets: 20% is held out, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# Keep plain NumPy copies of every split alongside the pandas objects.
X_train_array, X_test_array, y_train_array, y_test_array = (
    np.array(split) for split in (X_train, X_test, y_train, y_test)
)

In [None]:
# # Build the XGBoost model through a randomized hyperparameter search
# # (kept commented out; the next cell trains with fixed hyperparameters).
# xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)] # Number of trees to be used
# xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)] # Maximum number of levels in tree
# xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)] # Minimum number of instances needed in each node
# xgb_tree_method = ['auto', 'exact', 'approx', 'hist', 'gpu_hist'] # Tree construction algorithm used in XGBoost
# xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)] # Learning rate
# # NOTE(review): int(x) truncates every value of linspace(0, 0.5, 6) to 0, so the
# # gamma candidates below are all 0; use float(x) if this search is re-enabled.
# xgb_gamma = [int(x) for x in np.linspace(0, 0.5, 6)] # Minimum loss reduction required to make further partition
# # # Learning objective used
# # xgb_objective = ['binary:logistic', 'binary:hinge']
# # Create the grid
# xgb_grid = {'n_estimators': xgb_n_estimators,
#             'max_depth': xgb_max_depth,
#             'min_child_weight': xgb_min_child_weight,
#             'tree_method': xgb_tree_method,
#             'eta': xgb_eta,
#             'gamma': xgb_gamma}
# # Create the model to be tuned
# xgb_base = xgboost.XGBClassifier()
# # Create the random search
# xgb_random = RandomizedSearchCV(estimator = xgb_base, param_distributions = xgb_grid,
#                                 n_iter = 5, cv = 3, verbose = 2,
#                                 random_state = 42, n_jobs = -1)
# # Fit the random search model
# xgb_random.fit(X_train_array, y_train_array)
# # Get the optimal parameters
# xgb_random.best_params_

In [None]:
# Train the final XGBoost model with fixed hyperparameters.
# NOTE: the former `early_stop=10` keyword was removed. It is not an
# XGBClassifier parameter, so it was only forwarded as an unknown booster
# parameter and never enabled early stopping. To really use early stopping,
# pass `early_stopping_rounds` together with an `eval_set` built from a
# separate validation split (not the test set).
xgb_final = xgboost.XGBClassifier(
    tree_method='hist',
    n_estimators=800,
    min_child_weight=6,
    max_depth=2,
    gamma=0,
    learning_rate=0.4,  # scikit-learn API name for the booster's `eta`
    random_state=42,
)
xgb_final.fit(X_train_array, y_train_array)

In [None]:
# Model evaluation
def model_eval(model, title, test_features, test_labels):
    """Score a fitted binary classifier on one data split.

    Args:
        model: Fitted estimator exposing ``predict`` and, preferably,
            ``predict_proba``.
        title: Column name used for the returned frame (e.g. "train").
        test_features: Feature matrix to score.
        test_labels: True binary labels aligned with ``test_features``.

    Returns:
        A one-column ``pandas.DataFrame`` named ``title`` whose rows are
        Accuracy Score, ROC_AUC, F1_Score, Precision_Score and Recall_Score.
    """
    predictions = model.predict(test_features)

    # ROC AUC measures how well the model *ranks* samples, so it needs a
    # continuous score. Feeding it hard 0/1 predictions collapses the ROC curve
    # to a single operating point and misreports the metric. Use the
    # positive-class probability when the model provides one and fall back to
    # the hard predictions otherwise.
    if hasattr(model, "predict_proba"):
        positive_scores = model.predict_proba(test_features)[:, 1]
    else:
        positive_scores = predictions

    metrics = {
        'Accuracy Score': accuracy_score(test_labels, predictions),
        'ROC_AUC': roc_auc_score(test_labels, positive_scores),
        'F1_Score': f1_score(test_labels, predictions),
        'Precision_Score': precision_score(test_labels, predictions),
        'Recall_Score': recall_score(test_labels, predictions),
    }
    return pd.DataFrame({title: list(metrics.values())}, index=list(metrics))
# Evaluate the final model on the training and test splits and print both tables.
train_scores, test_scores = (
    model_eval(xgb_final, split_name, features, labels)
    for split_name, features, labels in (
        ("train", X_train_array, y_train_array),
        ("test", X_test_array, y_test_array),
    )
)
print(train_scores, test_scores, sep="\n")

In [None]:
# Build a SHAP explainer on top of the XGBoost model
explainer = shap.TreeExplainer(xgb_final)
# SHAP values and SHAP interaction values are computed for the full feature
# table X (training and test rows together), not only the held-out split.
shap_values = explainer.shap_values(X)
shap_interaction_values = explainer.shap_interaction_values(X)

In [None]:
# Force plot: local attribution of the prediction for the first sample (row 0)
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:])

In [None]:
# Force plot over several samples at once (the first 100 rows) for a global view
shap.force_plot(explainer.expected_value, shap_values[:100,:], X.iloc[:100,:])

In [None]:
# Summary plot (bar chart): global attribution per feature
shap.summary_plot(shap_values, X, plot_type="bar")

In [None]:
# Summary plot of the SHAP interaction values
shap.summary_plot(shap_interaction_values,X)

In [None]:
# Summary plot (scatter): global attribution per feature
shap.summary_plot(shap_values, X)

In [None]:
# Dependence scatter plot: attribution of a single feature ("ExternalRiskEstimate")
shap.dependence_plot("ExternalRiskEstimate", shap_values, X)

In [None]:
# Decision plot: attribution path for a single sample (row 0).
# The SHAP values and the feature values must describe the same row. This call
# previously paired the SHAP values of row 0 (`shap_values[:1]`) with the
# feature values of row 1 (`X.iloc[1]`), which mislabelled the plot.
shap.decision_plot(explainer.expected_value, shap_values[:1], X.iloc[:1])

In [None]:
# Decision plot for a single sample (row 0), including feature interactions.
# The interaction values and the feature values must describe the same row.
# This call previously paired the interaction values of row 0 with the feature
# values of row 1 (`X.iloc[1]`).
shap.decision_plot(
    explainer.expected_value,
    shap_interaction_values[:1],
    X.iloc[:1],
    # Show the top 20 entries, most important first.
    feature_display_range=slice(None, -20, -1),
)

In [None]:
# Names of the 23 model input features, in column order.
# NOTE(review): presumably identical to X_test.columns.tolist() — confirm
# before replacing this literal with the derived list.
feature_names = [
    'ExternalRiskEstimate',
    'MSinceOldestTradeOpen',
    'MSinceMostRecentTradeOpen',
    'AverageMInFile',
    'NumSatisfactoryTrades',
    'NumTrades60Ever2DerogPubRec',
    'NumTrades90Ever2DerogPubRec',
    'PercentTradesNeverDelq',
    'MSinceMostRecentDelq',
    'MaxDelq2PublicRecLast12M',
    'MaxDelqEver',
    'NumTotalTrades',
    'NumTradesOpeninLast12M',
    'PercentInstallTrades',
    'MSinceMostRecentInqexcl7days',
    'NumInqLast6M',
    'NumInqLast6Mexcl7days',
    'NetFractionRevolvingBurden',
    'NetFractionInstallBurden',
    'NumRevolvingTradesWBalance',
    'NumInstallTradesWBalance',
    'NumBank2NatlTradesWHighUtilization',
    'PercentTradesWBalance',
]
# Display names for the classes, listed as class 0 then class 1.
target_names = ['Good', 'Bad']

In [None]:
# Build a LIME tabular explainer for the XGBoost model.
# A bare `import lime` is not guaranteed to load the `lime_tabular` submodule,
# so import it explicitly before accessing `lime.lime_tabular`.
import lime.lime_tabular

# Pass the NumPy training matrix: it is what the model was fitted on and the
# array form LimeTabularExplainer documents for `training_data`.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train_array,
    feature_names=feature_names,
    class_names=target_names,
    discretize_continuous=False,
)

In [None]:
# Draw a random row position within the test set and display it.
# NOTE(review): the LIME explanation cell that follows uses the fixed row 3
# rather than `i` — confirm which one is intended.
i = np.random.randint(0, X_test.shape[0])
i

In [None]:
# Explain the prediction for a single test sample (row position 3) with LIME,
# reporting up to 23 features.
sample_row = X_test.iloc[3]
exp = lime_explainer.explain_instance(
    sample_row,
    xgb_final.predict_proba,
    num_features=23,
)

In [None]:
# Display the attribution result
exp.show_in_notebook(show_table=True, show_all=False)