## Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler , Binarizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
import time
import os, sys, gc, warnings, random, datetime
from sklearn.model_selection import StratifiedKFold , KFold
import math
import shap
import joblib
import eli5
from eli5.sklearn import PermutationImportance
warnings.filterwarnings('ignore')


In [None]:
!pip install --upgrade git+https://github.com/stanfordmlgroup/ngboost.git
from ngboost import NGBRegressor, NGBClassifier
from ngboost.ngboost import NGBoost
from ngboost.learners import default_tree_learner
from ngboost.scores import CRPS, MLE , LogScore
from ngboost.distns import LogNormal, Normal
from ngboost.distns import k_categorical, Bernoulli



#### Data forked from the previous EDA kernel
https://www.kaggle.com/possiblemanjr/handling-imbalanced-data-eda-small-fe

#### Trained models loaded from the following kernel

https://www.kaggle.com/possiblemanjr/handling-imbalanced-data-supervised-learning

## Data import & helper functions

In [None]:
# Read the two pickled frames from the upstream EDA kernel's output directory.
# NOTE(review): unpickling executes arbitrary code — only load trusted files.
df, df_fe = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        "../input/handling-imbalanced-data-eda-small-fe/df_for_use.pkl",
        "../input/handling-imbalanced-data-eda-small-fe/df_fe.pkl",
    )
)

In [None]:
# Restore the four classifiers persisted by the supervised-learning kernel,
# in the order LightGBM, random forest, XGBoost, NGBoost.
# NOTE(review): joblib files are pickles — only load trusted artifacts.
lgbm_clf, rf_clf, xgb_clf, ngb_clf = (
    joblib.load(model_path)
    for model_path in (
        '../input/handling-imbalanced-data-supervised-learning/lgbm_clf.pkl',
        '../input/handling-imbalanced-data-supervised-learning/rf_clf.pkl',
        '../input/handling-imbalanced-data-supervised-learning/xgb_clf.pkl',
        '../input/handling-imbalanced-data-supervised-learning/ngb_clf.pkl',
    )
)


In [None]:
# Separate the target column from the predictors.
y = df['loan_condition_cat']
X = df.drop(columns='loan_condition_cat')

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2020,
)



In [None]:
def _holdout_auc(model):
    """Return the ROC AUC of `model` on (X_test, y_test), scoring column 1 of predict_proba."""
    return roc_auc_score(y_test, model.predict_proba(X_test)[:, 1], average='macro')


# Score every loaded model on the same hold-out split.
lgbm_cpu_roc_score = _holdout_auc(lgbm_clf)
rf_roc_score = _holdout_auc(rf_clf)
xgb_roc_score = _holdout_auc(xgb_clf)
NGb_roc_score = _holdout_auc(ngb_clf)

# One line per model, four decimals each.
print('\n'.join([
    'RandomForest_ROC_AUC : {0:.4f}'.format(rf_roc_score),
    'Lightgbm_ROC_AUC : {0:.4f}'.format(lgbm_cpu_roc_score),
    'Xgboost_ROC_AUC : {0:.4f}'.format(xgb_roc_score),
    'Ngboost_ROC_AUC : {0:.4f}'.format(NGb_roc_score),
]))


In [None]:
def get_clf_eval(y_test, pred):
    """Print the confusion matrix and headline classification metrics.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels to compare against `y_test`.

    Returns
    -------
    None
        Everything is written to stdout; nothing is returned.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    print('Confusion Matrix')
    print(confusion)
    # Metric label spelling fixed ("Accuracy").
    print('Accuracy : {0:.4f}, Precision : {1:.4f} , Recall : {2:.4f} , F1_Score : {3:.4f}'.format(accuracy , precision, recall, f1))
    print('------------------------------------------------------------------------------')
    

# Ordered list rather than a set: a set has no guaranteed iteration order, so the
# per-threshold report could print the cut-offs out of sequence.
thresholds = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    """Print classification metrics for each probability cut-off.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred_proba_c1 : array-like
        Predicted probabilities to binarize, either as a single column of
        shape (n_samples, 1) or as a 1-D array (reshaped to a column here).
    thresholds : iterable of float
        Cut-offs to evaluate. They are processed in ascending order whatever
        container is passed (list, tuple, set, ...).

    Returns
    -------
    None
        Results are printed via `get_clf_eval`.
    """
    proba_column = np.asarray(pred_proba_c1)
    if proba_column.ndim == 1:
        # Binarizer expects 2-D input; turn a flat vector into one column.
        proba_column = proba_column.reshape(-1, 1)

    # Sort so the report is ordered even when an unordered set is supplied.
    for custom_threshold in sorted(thresholds):
        custom_predict = Binarizer(threshold=custom_threshold).fit_transform(proba_column)
        print('threshold:', custom_threshold)
        get_clf_eval(y_test, custom_predict)

## get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1,1), thresholds)    

## Feature Importance

In [None]:
### Random Forest

# Pair each importance value with its column name and keep the 20 largest.
ftr_ims_values = rf_clf.feature_importances_
ftr_ims = pd.Series(data=ftr_ims_values, index=X_train.columns)
ftr_top20 = ftr_ims.sort_values(ascending=False).head(20)

# Horizontal bars: importance on x, feature name on y.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title('Feature importance')
sns.barplot(x=ftr_top20, y=ftr_top20.index, ax=ax)
plt.show()

In [None]:
### XGboost

from xgboost import plot_importance

# Draw the 20 highest-ranked features with xgboost's own plotting helper.
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(xgb_clf, max_num_features=20, height=0.4, ax=ax)

In [None]:
### LightGBM

# Call through the `lgb` alias imported at the top of the notebook rather than
# `from lightgbm import plot_importance`, which would rebind the bare name
# `plot_importance` that the XGBoost cell imports from xgboost.
fig, ax = plt.subplots(figsize=(10, 8))
lgb.plot_importance(lgbm_clf, ax=ax, max_num_features=20, height=0.4)

In [None]:
### NGboost

# Entry 0 of feature_importances_ is used as the importance of the "loc" trees.
feature_importance_loc = ngb_clf.feature_importances_[0]

# One row per feature, most important first.
df_loc = (
    pd.DataFrame({'feature': X_train.columns, 'importance': feature_importance_loc})
    .sort_values(by='importance', ascending=False)
)

# NOTE(review): the matching "scale" plot (feature_importances_[1]) is deliberately
# not drawn — presumably the fitted distribution has no second parameter; confirm.
fig, ax1 = plt.subplots(figsize=(10, 8))
fig.suptitle("Feature importance plot for distribution parameters", fontsize=17)
sns.barplot(data=df_loc, x='importance', y='feature', color="skyblue", ax=ax1)
ax1.set_title('loc param')


## Permutation Importance

### Apply to LightGBM

In [None]:
# Permutation importance of the current LightGBM model, measured on the hold-out split.
# NOTE(review): the following cells select features from these hold-out importances
# and then score on the same split — possible leakage; confirm this is acceptable.
perm_lgbm = PermutationImportance(lgbm_clf, random_state=2020)
perm_lgbm.fit(X_test, y_test)
eli5.show_weights(perm_lgbm, feature_names=list(X_test.columns))

In [None]:
# Tabulate the permutation weights, then keep the names of features whose
# weight is at least 0.005.
pi_weights = eli5.explain_weights_df(perm_lgbm, feature_names=list(X_train.columns))
pi_features = pi_weights.loc[pi_weights['weight'] >= 0.005, 'feature'].tolist()

In [None]:
# Keep only the selected columns and repeat the stratified 80/20 split.
# This rebinds X_train / X_test / y_train / y_test for every later cell.
X_pi = X.loc[:, pi_features]
X_train, X_test, y_train, y_test = train_test_split(
    X_pi,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2020,
)

In [None]:
start = time.time()

# Refit LightGBM on the permutation-selected feature set.
lgbm_clf = LGBMClassifier(n_estimators = 3000, random_state = 2020)
# NOTE(review): early stopping monitors the same hold-out split that is scored
# below, so the reported AUC is likely optimistic — consider a separate validation split.
evals = [(X_test, y_test)]
lgbm_clf.fit(X_train, y_train, early_stopping_rounds = 100, eval_metric = 'auc' , eval_set = evals, verbose = 50)

lgbm_cpu_runtime = time.time() - start

# Predict once and reuse the column-1 probabilities for both reports
# (the original called predict_proba twice on the same data).
lgbm_test_proba = lgbm_clf.predict_proba(X_test)[:, 1]

get_eval_by_threshold(y_test, lgbm_test_proba.reshape(-1, 1), thresholds)
lgbm_cpu_roc_score = roc_auc_score(y_test, lgbm_test_proba, average = 'macro')

print( 'LightGBM_cpu_ROC_AUC : {0:.4f} , Runtime : {1:.4f}'.format(lgbm_cpu_roc_score ,lgbm_cpu_runtime ))

### LightGBM with Stratified 5Fold

In [None]:
# Uses the `time` module imported at the top of the notebook. The previous
# `from time import time` rebound `time` to a function, which broke the
# `time.time()` calls in the earlier refit cell whenever it was re-run.
# The redundant in-cell LGBMClassifier import (already imported at the top) is dropped.

# Parameters for the native lightgbm training API.
params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'random_state': 2020,
    'metric': 'auc',
}

k_fold = 5
kf = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=2020)
training_start_time = time.time()
aucs = []                               # best validation AUC of each fold
y_preds = np.zeros(X_test.shape[0])     # running average of hold-out predictions

for fold, (trn_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    start_time = time.time()
    print('Training on fold {}'.format(fold + 1))
    trn_data = lgb.Dataset(X_train.iloc[trn_idx], label=y_train.iloc[trn_idx])
    val_data = lgb.Dataset(X_train.iloc[val_idx], label=y_train.iloc[val_idx])
    clf = lgb.train(params_lgb, trn_data, num_boost_round=10000, valid_sets=[trn_data, val_data],
                    verbose_eval=200, early_stopping_rounds=200)
    # 'valid_1' is the second entry of valid_sets, i.e. the fold's validation data.
    aucs.append(clf.best_score['valid_1']['auc'])
    print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time.time() - start_time))))
    # Divide by k_fold (not a hard-coded 5) so the average stays correct if k_fold changes.
    y_preds += clf.predict(X_test) / k_fold

print('-' * 30)
print('Training is completed!.')
print("\n## Mean CV_AUC_Score : ", np.mean(aucs))
print('Total training time is {}'.format(str(datetime.timedelta(seconds=time.time() - training_start_time))))
print('-' * 30)

# Hold-out AUC of the fold-averaged predictions.
auc = roc_auc_score(y_test, y_preds)
print(' ROC_AUC_Score : {0:.4f}'.format (auc))

In [None]:
#### AUC improved after selecting features by permutation importance
#### and improved further with stratified 5-fold cross-validation