In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score , average_precision_score 
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve ,auc , log_loss ,  classification_report 
from sklearn.preprocessing import StandardScaler , Binarizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
import time
import os, sys, gc, warnings, random, datetime
import math
import shap
import joblib
warnings.filterwarnings('ignore')

import xgboost as xgb
from sklearn.model_selection import StratifiedKFold , cross_val_score
from sklearn.metrics import roc_auc_score

In [None]:
# Load the modelling frame from a pickle (presumably written by the upstream preprocessing notebook — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle('../input/searching-for-bad-loan-data-preprocessing/df_pp.pkl')

In [None]:
# df = pd.read_pickle('../input/loan-include-chargeoff/df_pp.pkl')

In [None]:
# Class balance of the target: share of each class (pie, left) and raw counts (bars, right).
# NOTE(review): explode=[0, 0.1] has two entries, so this assumes Loan_status has exactly two classes.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df['Loan_status'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Loan_status')
ax[0].set_ylabel('')
# Pass the column by keyword: recent seaborn releases no longer accept it positionally
# (the first positional slot is `data`), while `x=` works on old and new versions alike.
sns.countplot(x='Loan_status', data=df, ax=ax[1])
ax[1].set_title('Loan_status')
plt.show()

In [None]:
# Split target from features, then hold out a stratified 20% test set (seed 2020).
y = df['Loan_status']
X = df.drop(columns='Loan_status')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2020,
    stratify=y,
)

### XGBoost

In [None]:
# Booster parameters handed to xgb.cv / xgb.train in the XGBoost cross-validation cell.
params_xGB = dict(
    nthread=16,
    gamma=0,
    max_depth=6,
    min_child_weight=1,
    max_delta_step=0,
    subsample=1.0,
    colsample_bytree=1.0,
    objective='binary:logistic',
    num_class=1,
    eval_metric='logloss',
    seed=2020,
    # tree_method='gpu_hist',
)

In [None]:
# Out-of-fold evaluation of XGBoost on the training split.
# For every outer stratified fold: pick the number of boosting rounds with an inner
# xgb.cv run, refit on the whole outer-train part, then score train and held-out parts.
trainingScores = []
cvScores = []
# Float dtype so the out-of-fold probabilities are stored as numbers, not Python objects.
predictionsBasedOnKFolds = pd.DataFrame(index=y_train.index,
                                        columns=['prediction'], dtype=float)
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
start = time.time()
# Only the labels drive the stratified split; the zeros array is a row-count placeholder.
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)),
                                          y_train.values):
    X_train_fold, X_cv_fold = X_train.iloc[train_index, :], X_train.iloc[cv_index, :]
    y_train_fold, y_cv_fold = y_train.iloc[train_index], y_train.iloc[cv_index]

    dtrain = xgb.DMatrix(data=X_train_fold, label=y_train_fold)
    dCV = xgb.DMatrix(data=X_cv_fold)

    # Inner 5-fold CV with early stopping; returns the per-round evaluation history.
    cv_history = xgb.cv(params_xGB, dtrain, num_boost_round=2000,
                        nfold=5, early_stopping_rounds=200, verbose_eval=100)

    # argmin is a 0-based row index; the number of rounds to train is index + 1.
    # (Using the bare index trained one round too few, and zero rounds when row 0 was best.)
    best_rounds = int(np.argmin(np.array(cv_history['test-logloss-mean']))) + 1
    # `bst` stays bound to the last fold's booster; the test-set cell further down reads it.
    bst = xgb.train(params_xGB, dtrain, num_boost_round=best_rounds)

    loglossTraining = log_loss(y_train_fold, bst.predict(dtrain))
    trainingScores.append(loglossTraining)

    predictionsBasedOnKFolds.loc[X_cv_fold.index, 'prediction'] = bst.predict(dCV)
    loglossCV = log_loss(y_cv_fold,
                         predictionsBasedOnKFolds.loc[X_cv_fold.index, 'prediction'])
    cvScores.append(loglossCV)

    print('Training Log Loss: ', loglossTraining)
    print('CV Log Loss: ', loglossCV)

xgb_runtime = time.time() - start
# Overall log loss across all out-of-fold predictions.
loglossXGBoostGradientBoosting = \
    log_loss(y_train, predictionsBasedOnKFolds.loc[:, 'prediction'])

print( 'XGBoost Gradient Boosting Log Loss : {0:.4f} ,  XGBoost Runtime : {1:.4f}'.format(loglossXGBoostGradientBoosting ,xgb_runtime ))

In [None]:
# Pair every training label with its out-of-fold XGBoost probability and keep a copy.
preds = pd.concat([y_train, predictionsBasedOnKFolds.loc[:, 'prediction']], axis=1)
preds.columns = ['trueLabel', 'prediction']
predictionsBasedOnKFoldsXGBoostGradientBoosting = preds.copy()

# Precision-recall curve, drawn on the current figure.
precision, recall, thresholds = \
    precision_recall_curve(preds['trueLabel'], preds['prediction'])
average_precision = \
    average_precision_score(preds['trueLabel'], preds['prediction'])

plt.step(recall, precision, color='k', alpha=0.7, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])

plt.title('Precision-Recall curve: Average Precision = {0:0.2f}'.format(
          average_precision))

# ROC curve, drawn on a new figure.
fpr, tpr, thresholds = roc_curve(preds['trueLabel'], preds['prediction'])
areaUnderROC = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Single-line template: a backslash continuation inside the literal would pull the
# continuation line's indentation into the rendered title.
roc_title = 'Receiver operating characteristic: Area under the curve = {0:0.2f}'
plt.title(roc_title.format(areaUnderROC))
plt.legend(loc="lower right")
plt.show()

### Random Forest

In [None]:
# Hyper-parameters for the random forest that the stratified K-fold cell fits as `RFC`.
n_estimators = 10
# 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is no longer accepted
# by recent scikit-learn releases, so spell it out.
max_features = 'sqrt'
max_depth = None
min_samples_split = 2
min_samples_leaf = 1
min_weight_fraction_leaf = 0.0
max_leaf_nodes = None
bootstrap = True
oob_score = False
n_jobs = -1
random_state = 2018
class_weight = 'balanced'

RFC = RandomForestClassifier(n_estimators=n_estimators,
        max_features=max_features, max_depth=max_depth,
        min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_leaf_nodes=max_leaf_nodes, bootstrap=bootstrap,
        oob_score=oob_score, n_jobs=n_jobs, random_state=random_state,
        class_weight=class_weight)

In [None]:
##RandomForest with stratified 5 Fold
# Fits `RFC` on each outer-train part, stores out-of-fold class probabilities, and
# averages the per-fold test-set probabilities into `y_preds_rf`.

trainingScores = []
cvScores = []
# One column per class; predict_proba output is written into columns 0 and 1.
predictionsBasedOnKFolds = pd.DataFrame(data=[],
                                        index=y_train.index, columns=[0, 1])

start = time.time()
# NOTE(review): `clf` is constructed but never fitted or used — the loop trains `RFC`.
# Confirm which estimator was intended before relying on these scores.
clf = RandomForestClassifier(n_estimators=30, min_samples_leaf=20, max_features=0.7, n_jobs=-1, random_state = 2020, oob_score=True)
# random_state only takes effect together with shuffle=True, and scikit-learn rejects
# the combination random_state + shuffle=False. Same settings as `k_fold` used for the
# XGBoost and LightGBM loops.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
y_preds_rf = np.zeros(X_test.shape[0])
n_iter = 0
for train_index, test_index in cv.split(X_train, y_train):
    trx, tsx = X_train.iloc[train_index], X_train.iloc[test_index]
    vly, vlt = y_train.iloc[train_index], y_train.iloc[test_index]
    RFC = RFC.fit(trx, vly)
    loglossTraining = log_loss(vly, RFC.predict_proba(trx))
    trainingScores.append(loglossTraining)

    predictionsBasedOnKFolds.loc[tsx.index, :] = RFC.predict_proba(tsx)
    loglossCV = log_loss(vlt, predictionsBasedOnKFolds.loc[tsx.index, 1])
    cvScores.append(loglossCV)
    print('Training Log Loss: ', loglossTraining)
    print('CV Log Loss: ', loglossCV)

    n_iter += 1
    # Score the test set once per fold and reuse the probabilities for every metric.
    test_proba = RFC.predict_proba(X_test)[:, 1]
    cv_roc_score = roc_auc_score(y_test, test_proba, average = 'macro')
    cv_precision, cv_recall, _ = precision_recall_curve(y_test, test_proba)
    cv_pr_auc = auc(cv_recall, cv_precision)
    print( '\n#{0}, CV_ROC_AUC : {1} , RF_CV_PR_AUC : {2} '.format(n_iter ,cv_roc_score, cv_pr_auc))
    # Running mean of the fold models' test-set probabilities.
    y_preds_rf += test_proba / cv.n_splits
rf_runtime = time.time() - start
rf_cv_roc_score = roc_auc_score(y_test, y_preds_rf, average = 'macro')
rf_cv_precision, rf_cv_recall, _ = precision_recall_curve(y_test, y_preds_rf)
rf_cv_pr_auc = auc(rf_cv_recall, rf_cv_precision)
# Overall log loss across all out-of-fold positive-class probabilities.
loglossRandomForestsClassifier = log_loss(y_train,
                                          predictionsBasedOnKFolds.loc[:, 1])
print( 'Random Forest Log Loss : {0:.4f} ,  Random Forest Runtime : {1:.4f}'.format(loglossRandomForestsClassifier ,rf_runtime ))
    

In [None]:
# Pair every training label with its out-of-fold random-forest probability (column 1).
preds = pd.concat([y_train, predictionsBasedOnKFolds.loc[:, 1]], axis=1)
preds.columns = ['trueLabel', 'prediction']
predictionsBasedOnKFoldsRandomForests = preds.copy()

# Precision-recall curve, drawn on the current figure.
precision, recall, thresholds = precision_recall_curve(preds['trueLabel'],
                                                       preds['prediction'])
average_precision = average_precision_score(preds['trueLabel'],
                                            preds['prediction'])

plt.step(recall, precision, color='k', alpha=0.7, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])

plt.title('Precision-Recall curve: Average Precision = {0:0.2f}'.format(
          average_precision))

# ROC curve, drawn on a new figure.
fpr, tpr, thresholds = roc_curve(preds['trueLabel'], preds['prediction'])
areaUnderROC = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Single-line template: a backslash continuation inside the literal would pull the
# continuation line's indentation into the rendered title.
roc_title = 'Receiver operating characteristic: Area under the curve = {0:0.2f}'
plt.title(roc_title.format(areaUnderROC))
plt.legend(loc="lower right")
plt.show()

### LightGBM

In [None]:
# Parameters handed to lgb.train in the LightGBM cross-validation cell.
params_lightGB = {
    'task': 'train',
    'application':'binary',
    'num_class':1,
    'boosting': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'metric_freq':50,
    'is_training_metric':False,
    'max_depth':4,
    'num_leaves': 31,
#     'learning_rate': 0.01,
    'feature_fraction': 1.0,
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'bagging_seed': 2020,
    'verbose': 50,
    'num_threads':16,
    # Key must be exactly 'random_state'; the previous trailing space made it an
    # unknown parameter name, so the seed was not applied.
    'random_state': 2020
}

In [None]:
# Out-of-fold evaluation of LightGBM on the training split, reusing the `k_fold` splitter.
trainingScores, cvScores = [], []
predictionsBasedOnKFolds = pd.DataFrame(
    data=[], index=y_train.index, columns=['prediction'])
start = time.time()
fold_splits = k_fold.split(np.zeros(len(X_train)), y_train.ravel())
for train_index, cv_index in fold_splits:
    X_train_fold = X_train.iloc[train_index, :]
    X_cv_fold = X_train.iloc[cv_index, :]
    y_train_fold = y_train.iloc[train_index]
    y_cv_fold = y_train.iloc[cv_index]

    # The held-out part doubles as the validation set that drives early stopping.
    lgb_train = lgb.Dataset(X_train_fold, y_train_fold)
    lgb_eval = lgb.Dataset(X_cv_fold, y_cv_fold, reference=lgb_train)
    gbm = lgb.train(
        params_lightGB,
        lgb_train,
        num_boost_round=10000,
        valid_sets=lgb_eval,
        early_stopping_rounds=200,
        verbose_eval=500,
    )

    # Score the fold's own training rows at the best iteration.
    loglossTraining = log_loss(
        y_train_fold,
        gbm.predict(X_train_fold, num_iteration=gbm.best_iteration),
    )
    trainingScores.append(loglossTraining)

    # Store and score the held-out rows at the best iteration.
    predictionsBasedOnKFolds.loc[X_cv_fold.index, 'prediction'] = gbm.predict(
        X_cv_fold, num_iteration=gbm.best_iteration)
    loglossCV = log_loss(
        y_cv_fold,
        predictionsBasedOnKFolds.loc[X_cv_fold.index, 'prediction'],
    )
    cvScores.append(loglossCV)

    print('Training Log Loss: ', loglossTraining)
    print('CV Log Loss: ', loglossCV)

lgbm_runtime = time.time() - start
# Overall log loss across all out-of-fold predictions.
loglossLightGBMGradientBoosting = log_loss(
    y_train, predictionsBasedOnKFolds.loc[:, 'prediction'])
print( 'LightGBM Log Loss : {0:.4f} ,  LightGBM Runtime : {1:.4f}'.format(loglossLightGBMGradientBoosting ,lgbm_runtime ))

In [None]:
# Pair every training label with its out-of-fold LightGBM probability and keep a copy.
oof_prediction = predictionsBasedOnKFolds.loc[:, 'prediction']
preds = pd.concat([y_train, oof_prediction], axis=1)
preds.columns = ['trueLabel', 'prediction']
predictionsBasedOnKFoldsLightGBMGradientBoosting = preds.copy()

# Precision-recall curve on the current figure.
precision, recall, thresholds = precision_recall_curve(
    preds['trueLabel'], preds['prediction'])
average_precision = average_precision_score(
    preds['trueLabel'], preds['prediction'])

plt.step(recall, precision, color='k', alpha=0.7, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
pr_title = 'Precision-Recall curve: Average Precision = {0:0.2f}'
plt.title(pr_title.format(average_precision))

# ROC curve on a new figure.
fpr, tpr, thresholds = roc_curve(preds['trueLabel'], preds['prediction'])
areaUnderROC = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
roc_title = 'Receiver operating characteristic: Area under the curve = {0:0.2f}'
plt.title(roc_title.format(areaUnderROC))
plt.legend(loc="lower right")
plt.show()

### Apply to Test Set

In [None]:
# Test-set log loss of `RFC` as left by the cross-validation loop (its last fit).
rf_test_proba = RFC.predict_proba(X_test)[:, 1]
predictionsTestSetRandomForests = pd.DataFrame(
    data=[], index=y_test.index, columns=['prediction'])
predictionsTestSetRandomForests.loc[:, 'prediction'] = rf_test_proba
logLossTestSetRandomForests = log_loss(
    y_test, predictionsTestSetRandomForests)

In [None]:
# Test-set log loss of `bst`, the booster left by the XGBoost cross-validation loop (its last fold).
dtest = xgb.DMatrix(data=X_test)
predictionsTestSetXGBoostGradientBoosting = pd.DataFrame(
    data=[], index=y_test.index, columns=['prediction'])
predictionsTestSetXGBoostGradientBoosting.loc[:, 'prediction'] = bst.predict(dtest)
logLossTestSetXGBoostGradientBoosting = log_loss(
    y_test, predictionsTestSetXGBoostGradientBoosting)

In [None]:
# Test-set log loss of `gbm`, the booster left by the LightGBM cross-validation loop (its last fold),
# predicting at its best iteration.
lgbm_test_proba = gbm.predict(X_test, num_iteration=gbm.best_iteration)
predictionsTestSetLightGBMGradientBoosting = pd.DataFrame(
    data=[], index=y_test.index, columns=['prediction'])
predictionsTestSetLightGBMGradientBoosting.loc[:, 'prediction'] = lgbm_test_proba
logLossTestSetLightGBMGradientBoosting = log_loss(
    y_test, predictionsTestSetLightGBMGradientBoosting)

In [None]:
# Report the three test-set log losses in the same order as before.
_test_log_losses = (
    ("Log Loss of Random Forests on Test Set: ", logLossTestSetRandomForests),
    ("Log Loss of XGBoost Gradient Boosting on Test Set: ", logLossTestSetXGBoostGradientBoosting),
    ("Log Loss of LightGBM Gradient Boosting on Test Set: ", logLossTestSetLightGBMGradientBoosting),
)
for _label, _value in _test_log_losses:
    print(_label, _value)

In [None]:
#RF
# Precision-recall and ROC curves of the random forest on the test set.
precision, recall, thresholds = precision_recall_curve(
    y_test, predictionsTestSetRandomForests)
average_precision = average_precision_score(
    y_test, predictionsTestSetRandomForests)

# Precision-recall curve on the current figure.
plt.step(recall, precision, color='k', alpha=0.7, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
pr_title = 'Precision-Recall curve: Average Precision = {0:0.2f}'
plt.title(pr_title.format(average_precision))

# ROC curve on a new figure.
fpr, tpr, thresholds = roc_curve(y_test, predictionsTestSetRandomForests)
areaUnderROC = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
roc_title = 'Receiver operating characteristic: Area under the curve = {0:0.2f}'
plt.title(roc_title.format(areaUnderROC))
plt.legend(loc="lower right")
plt.show()

In [None]:
#XGB
# Precision-recall and ROC curves of the XGBoost booster on the test set.
precision, recall, thresholds = precision_recall_curve(
    y_test, predictionsTestSetXGBoostGradientBoosting)
average_precision = average_precision_score(
    y_test, predictionsTestSetXGBoostGradientBoosting)

# Precision-recall curve on the current figure.
plt.step(recall, precision, color='k', alpha=0.7, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
pr_title = 'Precision-Recall curve: Average Precision = {0:0.2f}'
plt.title(pr_title.format(average_precision))

# ROC curve on a new figure.
fpr, tpr, thresholds = roc_curve(
    y_test, predictionsTestSetXGBoostGradientBoosting)
areaUnderROC = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
roc_title = 'Receiver operating characteristic: Area under the curve = {0:0.2f}'
plt.title(roc_title.format(areaUnderROC))
plt.legend(loc="lower right")
plt.show()

In [None]:
#LGB
# Precision-recall and ROC curves of the LightGBM booster on the test set.
precision, recall, thresholds = precision_recall_curve(
    y_test, predictionsTestSetLightGBMGradientBoosting)
average_precision = average_precision_score(
    y_test, predictionsTestSetLightGBMGradientBoosting)

# Precision-recall curve on the current figure.
plt.step(recall, precision, color='k', alpha=0.7, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
pr_title = 'Precision-Recall curve: Average Precision = {0:0.2f}'
plt.title(pr_title.format(average_precision))

# ROC curve on a new figure.
fpr, tpr, thresholds = roc_curve(
    y_test, predictionsTestSetLightGBMGradientBoosting)
areaUnderROC = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
roc_title = 'Receiver operating characteristic: Area under the curve = {0:0.2f}'
plt.title(roc_title.format(areaUnderROC))
plt.legend(loc="lower right")
plt.show()

In [None]:
def get_clf_eval(y_test, pred):
    """Print the confusion matrix and headline metrics for hard class predictions.

    Args:
        y_test: true labels.
        pred: predicted labels, compared against ``y_test``.

    Prints the confusion matrix, then accuracy, precision, recall and F1 on a
    single line, followed by a separator line. Returns nothing.
    """
    confusion = confusion_matrix(y_test, pred)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    accuracy, precision, recall, f1 = [scorer(y_test, pred) for scorer in scorers]

    summary = 'Auccuracy : {0:.4f}, Precision : {1:.4f} , Recall : {2:.4f} , F1_Score : {3:.4f}'
    print('Confusion Matrix')
    print(confusion)
    print(summary.format(accuracy, precision, recall, f1))
    print('------------------------------------------------------------------------------')

In [None]:
# Decision thresholds to evaluate. Kept as an ordered list (a set has no defined
# iteration order) so the per-threshold reports print in ascending order.
thresholds = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    """Print classification metrics for each decision threshold.

    Args:
        y_test: true labels.
        pred_proba_c1: scores to binarize; the callers in this notebook pass the
            class-1 probabilities reshaped to a single column.
        thresholds: iterable of cut-off values; values strictly above a cut-off
            map to 1, the rest to 0 (``Binarizer`` semantics).
    """
    for cutoff in thresholds:
        hard_labels = Binarizer(threshold=cutoff).fit_transform(pred_proba_c1)
        print('threshold:', cutoff)
        get_clf_eval(y_test, hard_labels)

## get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1,1), thresholds)

In [None]:
### Using CPU only
# Baseline LGBMClassifier on the full training split, timed, then evaluated on the test set.

start = time.time()

# NOTE(review): params_lgb is defined but never passed to the classifier below, so the
# model trains with library defaults apart from the explicit keyword arguments.
params_lgb={'boosting_type':'gbdt',
           'objective': 'binary',
           'random_state':2020,
           'metric':'binary_logloss',
            'metric_freq' : 50,
            'max_depth' :4,
            'num_leaves' : 31,
            'learning_rate' : 0.01,
            'feature_fraction' : 1.0,
            'bagging_fraction' : 1.0,
            'bagging_freq' : 0,
            'bagging_seed' : 2020,
            'num_threads' : 16
           }


lgbm_clf = LGBMClassifier(boosting_type = 'gbdt',
                          objective = 'binary',
                          metric = 'auc',
                          random_state = 2020)

# NOTE(review): `evals` is built but never handed to fit(), so no evaluation set is used.
evals = [(X_test, y_test)]
# No `verbose` argument: it only controlled evaluation logging, which needs an eval_set
# (none is passed), and recent LightGBM releases no longer accept it in fit().
lgbm_clf.fit(X_train, y_train)


lgbm_cpu_runtime = time.time() - start

# Positive-class probabilities, computed once and reused for every metric.
lgbm_test_proba = lgbm_clf.predict_proba(X_test)[:, 1]

get_eval_by_threshold(y_test, lgbm_test_proba.reshape(-1, 1), thresholds)
lgbm_roc_score = roc_auc_score(y_test, lgbm_test_proba, average = 'macro')
lgbm_precision, lgbm_recall, _ = precision_recall_curve(y_test, lgbm_test_proba)
lgbm_pr_auc = auc(lgbm_recall, lgbm_precision)



print( 'LightGBM_ROC_AUC : {0:.4f} , LightGBM_PR_AUC : {1:.4f} ,Runtime : {2:.4f}'.format(lgbm_roc_score ,lgbm_pr_auc, lgbm_cpu_runtime))

In [None]:
# Baseline XGBClassifier on the full training split, timed, then evaluated on the test set.
# NOTE(review): the variable names say "gpu", but no GPU setting is passed here.
start = time.time()

xgb_clf = XGBClassifier(random_state=2020)
xgb_clf.fit(X_train, y_train, verbose=50)

xgb_gpu_runtime = time.time() - start

pred = xgb_clf.predict(X_test)

# Positive-class probabilities, shared by the threshold report and both AUC metrics.
xgb_test_proba = xgb_clf.predict_proba(X_test)[:, 1]

get_eval_by_threshold(y_test, xgb_test_proba.reshape(-1, 1), thresholds)

xgb_gpu_roc_score = roc_auc_score(y_test, xgb_test_proba, average='macro')

xgb_precision, xgb_recall, _ = precision_recall_curve(y_test, xgb_test_proba)
xgb_gpu_pr_auc = auc(xgb_recall, xgb_precision)

xgb_report = 'XGboost_gpu_ROC_AUC : {0:.4f} , XGboost_gpu_PR_AUC : {1:.4f} , Runtime : {2:.4f}'
print(xgb_report.format(xgb_gpu_roc_score, xgb_gpu_pr_auc, xgb_gpu_runtime))

In [None]:
# Permutation-importance tooling, imported here rather than in the import cell at the top.
import eli5
from eli5.sklearn import PermutationImportance
# Re-install the blanket warning filter (the first cell already sets the same filter).
warnings.filterwarnings('ignore')

In [None]:
# Permutation importance of the fitted XGBClassifier, measured on the test split.
perm_xgb = PermutationImportance(xgb_clf, random_state=2020)
perm_xgb = perm_xgb.fit(X_test, y_test)
feature_names = X_test.columns.tolist()
eli5.show_weights(perm_xgb, feature_names=feature_names)

In [None]:
import shap

In [None]:
# Rebuild the same stratified 80/20 split as at the start of the notebook (same seed).
y = df['Loan_status']
X = df.drop(columns='Loan_status')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2020,
    stratify=y,
)

In [None]:
# Re-attach the target column to the feature frames so each split is a single frame.
y_trainin, y_testet = y_train.to_frame(), y_test.to_frame()
for_sample_train_df = pd.concat([X_train, y_trainin], axis=1)
for_sample_test_df = pd.concat([X_test, y_testet], axis=1)

In [None]:
# Stratified split of the training frame: 20% goes to sample_train_*, 80% to sample_test_*.
y_ = for_sample_train_df['Loan_status']
X_ = for_sample_train_df.drop(columns='Loan_status')

sample_train_x, sample_test_x, sample_train_y, sample_test_y = train_test_split(
    X_, y_, test_size=0.8, random_state=2020, stratify=y_)

# Drop the names of the full split and the helper frames; re-running this cell
# without re-running the cells that define them raises NameError.
del X_train, X_test, y_train, y_test
del y_trainin, y_testet

In [None]:
# Run a garbage-collection pass after the `del` in the previous cell.
gc.collect()

In [None]:
## Make sample for faster computation
# Same stratified 20% / 80% split of the training frame as in the earlier cell (same seed).

y_ = for_sample_train_df['Loan_status']
X_ = for_sample_train_df.drop(columns='Loan_status')

sample_train_x, sample_test_x, sample_train_y, sample_test_y = train_test_split(
    X_,
    y_,
    test_size=0.80,
    random_state=2020,
    stratify=y_,
)

In [None]:
# Re-attach the target column to both halves of the sampled split.
sample_train_yin, sample_test_yin = sample_train_y.to_frame(), sample_test_y.to_frame()
for__sample_train_df = pd.concat([sample_train_x, sample_train_yin], axis=1)
for__sample_test_df = pd.concat([sample_test_x, sample_test_yin], axis=1)

In [None]:
# Preview the first 50 rows of the sampled training frame (features plus target).
for__sample_train_df.head(50)

In [None]:
# Independent copy of the sampled training features, used as input to the SHAP explainer below.
X_sampled = sample_train_x.copy()

In [None]:
#LightGBM
# SHAP explanation of the fitted LGBMClassifier on the sampled training features.
import shap
# Load the JavaScript needed to render interactive force plots in the notebook.
shap.initjs()

# (same syntax works for LightGBM, CatBoost, and scikit-learn models)

explainer = shap.TreeExplainer(lgbm_clf)
# NOTE(review): the cells below index shap_values[1] and expected_value[1], i.e. they
# expect per-class outputs; whether that holds depends on the installed shap version — confirm.
shap_values = explainer.shap_values(X_sampled)

In [None]:
# Force plot for sampled row 0, using entry [1] of the expected values and SHAP values
# (presumably the positive class — TODO confirm).
shap.force_plot(explainer.expected_value[1], shap_values[1][0,:], X_sampled.iloc[0,:])

In [None]:
# Force plot for sampled row 1, using entry [1] of the expected values and SHAP values
# (presumably the positive class — TODO confirm).
shap.force_plot(explainer.expected_value[1], shap_values[1][1,:], X_sampled.iloc[1,:])

In [None]:
# Force plot for sampled row 15, using entry [1] of the expected values and SHAP values
# (presumably the positive class — TODO confirm).
shap.force_plot(explainer.expected_value[1], shap_values[1][15,:], X_sampled.iloc[15,:])

In [None]:
# Force plot for sampled row 3, using entry [1] of the expected values and SHAP values
# (presumably the positive class — TODO confirm).
shap.force_plot(explainer.expected_value[1], shap_values[1][3,:], X_sampled.iloc[3,:])

In [None]:
# Force plot for sampled row 3 again.
# NOTE(review): this call is identical to the one in the cell directly above — likely an accidental duplicate.
shap.force_plot(explainer.expected_value[1], shap_values[1][3,:], X_sampled.iloc[3,:])

In [None]:
# Force plot for sampled row 4, using entry [1] of the expected values and SHAP values
# (presumably the positive class — TODO confirm).
shap.force_plot(explainer.expected_value[1], shap_values[1][4,:], X_sampled.iloc[4,:])

In [None]:
# # summarize the effects of all the features
# shap.summary_plot(shap_values, X_sampled, plot_type="bar")

In [None]:
# shap.force_plot(base_value=explainer.expected_value[1], shap_values=shap_values[1], features=X_sampled.columns)