In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
from jupyterthemes import jtplot
%matplotlib inline
jtplot.style()

In [2]:
df_cleancoll = pd.read_csv('cleanVIF_loan_test4.csv')

In [3]:
df_cleancoll.head()

Unnamed: 0,sub_grade_A1,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,...,mths_since_last_record,pub_rec,revol_bal,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,flag_bad_loan
0,0,0,0,0,0,0,1,0,0,0,...,0.0,0.0,36638.0,0.0,885.46,0.0,0.0,114834.0,59900.0,0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,7967.0,0.0,333.14,0.0,0.0,14123.0,15100.0,0
2,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,13168.0,0.0,368.45,0.0,0.0,267646.0,61100.0,0
3,0,0,0,0,0,0,0,0,0,1,...,33.0,2.0,5572.0,0.0,119.17,0.0,15386.0,13605.0,8100.0,0
4,0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,11431.0,0.0,476.3,0.0,1514.0,272492.0,15400.0,0


In [4]:
df_cleancollDesc=[]
for i in df_cleancoll.columns:
    df_cleancollDesc.append([i,len(df_cleancoll[i]),
                     df_cleancoll[i].dtypes,
                     df_cleancoll[i].isna().sum(),
                     round((((df_cleancoll[i].isna().sum())/(len(df_cleancoll)))*100),2),
                     df_cleancoll[i].nunique(),
                     df_cleancoll[i].drop_duplicates().sample(2, replace=True).values])
pd.DataFrame(df_cleancollDesc, columns=['dataFeatures', 'dataLength','dataType', 'null', 'nullPct', 'unique','uniqueSample'])

Unnamed: 0,dataFeatures,dataLength,dataType,null,nullPct,unique,uniqueSample
0,sub_grade_A1,816592,int64,0,0.0,2,"[0, 1]"
1,sub_grade_A2,816592,int64,0,0.0,2,"[0, 1]"
2,sub_grade_A3,816592,int64,0,0.0,2,"[0, 0]"
3,sub_grade_A4,816592,int64,0,0.0,2,"[0, 0]"
4,sub_grade_A5,816592,int64,0,0.0,2,"[1, 1]"
...,...,...,...,...,...,...,...
147,collections_12_mths_ex_med,816592,float64,0,0.0,12,"[16.0, 7.0]"
148,tot_coll_amt,816592,float64,0,0.0,10321,"[24562.0, 1892.0]"
149,tot_cur_bal,816592,float64,0,0.0,327230,"[58795.0, 121410.0]"
150,total_rev_hi_lim,816592,float64,0,0.0,21249,"[45902.0, 88958.0]"


## 5. MODELLING

#### Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X = df_cleancoll.drop('flag_bad_loan',axis=1)
y = df_cleancoll['flag_bad_loan']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [8]:
data = df_cleancoll.drop('flag_bad_loan', axis =1)
target = df_cleancoll['flag_bad_loan']

#### Random Forest Classifier

Normal X_train

In [9]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [10]:
rfc_pred_train = rfc.predict(X_train)
rfc_pred_test = rfc.predict(X_test)

In [11]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_train,rfc_pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    538461
           1       1.00      1.00      1.00     33153

    accuracy                           1.00    571614
   macro avg       1.00      1.00      1.00    571614
weighted avg       1.00      1.00      1.00    571614



In [12]:
print(classification_report(y_test,rfc_pred_test))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99    230233
           1       0.98      0.55      0.70     14745

    accuracy                           0.97    244978
   macro avg       0.98      0.77      0.84    244978
weighted avg       0.97      0.97      0.97    244978



oversampled X_train

In [13]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(sampling_strategy='minority')

oversampled_trainX, oversampled_trainY = sm.fit_sample(X_train, y_train)
oversampled_train = pd.concat([pd.DataFrame(oversampled_trainY), pd.DataFrame(oversampled_trainX)], axis =1)

In [14]:
rfc_os = RandomForestClassifier()
rfc_os.fit(oversampled_trainX, oversampled_trainY)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
rfc_pred_ostrain = rfc_os.predict(oversampled_trainX)
rfc_pred_ostest = rfc_os.predict(X_test)

In [16]:
print(classification_report(oversampled_trainY,rfc_pred_ostrain))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    538461
           1       1.00      1.00      1.00    538461

    accuracy                           1.00   1076922
   macro avg       1.00      1.00      1.00   1076922
weighted avg       1.00      1.00      1.00   1076922



In [17]:
print(classification_report(y_test,rfc_pred_ostest))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98    230233
           1       0.95      0.40      0.56     14745

    accuracy                           0.96    244978
   macro avg       0.96      0.70      0.77    244978
weighted avg       0.96      0.96      0.96    244978



## 6. MODEL DIAGNOSTIC/EVALUATION

### Randomized Search CV

Normal X_train

In [18]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=20, stop=200, num=10)]

# Number of features to consider at every splot
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10,110, num=11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [int(x) for x in np.linspace(start=100, stop=1000, num=100)]

# Minimum number of samples required at each leaf node
min_samples_leaf = max_depth = [int(x) for x in np.linspace(10,110, num=11)]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [19]:
rf_random = RandomizedSearchCV (estimator = rfc,
                                param_distributions = random_grid,
                                n_iter = 5,
                                cv = 3,
                                n_jobs = -1,
                                scoring = 'f1')

dict_cv = {}
best_score = []
for i in range(3):
    rf_random.fit(X_train, y_train)
    dict_cv['rf_random {}'.format(i)] = rf_random.best_params_
    best_score.append(rf_random.best_score_)

dict_cv

###### Random Forest Classifier Normal Data (Hyper Parameter Alternatives 1)

In [20]:
rfc_1 = RandomForestClassifier(n_estimators= 20,
 min_samples_split= 590,
 min_samples_leaf= 30,
 max_features= 'auto',
 max_depth= 70,
 bootstrap= False)

In [21]:
rfc_1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=70, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=30, min_samples_split=590,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
rfc1_pred_train = rfc_1.predict(X_train)
rfc1_pred_test = rfc_1.predict(X_test)

In [23]:
print(classification_report(y_train,rfc1_pred_train))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99    538461
           1       0.99      0.52      0.68     33153

    accuracy                           0.97    571614
   macro avg       0.98      0.76      0.83    571614
weighted avg       0.97      0.97      0.97    571614



In [24]:
print(classification_report(y_test,rfc1_pred_test))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    230233
           1       0.99      0.52      0.68     14745

    accuracy                           0.97    244978
   macro avg       0.98      0.76      0.83    244978
weighted avg       0.97      0.97      0.97    244978



###### Random Forest Classifier Normal Data (Hyper Parameter Alternatives 2)

In [25]:
rfc_2 = RandomForestClassifier(n_estimators= 80,
 min_samples_split= 527,
 min_samples_leaf= 70,
 max_features= 'auto',
 max_depth= 80,
 bootstrap= False)
rfc_2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=80, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=70, min_samples_split=527,
                       min_weight_fraction_leaf=0.0, n_estimators=80,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
rfc2_pred_train = rfc_2.predict(X_train)
rfc2_pred_test = rfc_2.predict(X_test)

In [27]:
print(classification_report(y_train,rfc2_pred_train))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    538461
           1       0.99      0.50      0.66     33153

    accuracy                           0.97    571614
   macro avg       0.98      0.75      0.82    571614
weighted avg       0.97      0.97      0.97    571614



In [28]:
print(classification_report(y_test,rfc2_pred_test))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    230233
           1       0.99      0.50      0.66     14745

    accuracy                           0.97    244978
   macro avg       0.98      0.75      0.82    244978
weighted avg       0.97      0.97      0.96    244978



###### Random Forest Classifier Normal Data (Hyper Parameter Alternatives 3)

In [29]:
rfc_3 = RandomForestClassifier(n_estimators= 20,
 min_samples_split= 118,
 min_samples_leaf= 10,
 max_features= 'sqrt',
 max_depth= 90,
 bootstrap= True)
rfc_3.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=90, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=118,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [30]:
rfc3_pred_train = rfc_3.predict(X_train)
rfc3_pred_test = rfc_3.predict(X_test)

In [31]:
print(classification_report(y_train,rfc3_pred_train))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99    538461
           1       0.98      0.54      0.70     33153

    accuracy                           0.97    571614
   macro avg       0.98      0.77      0.84    571614
weighted avg       0.97      0.97      0.97    571614



In [32]:
print(classification_report(y_test,rfc3_pred_test))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99    230233
           1       0.98      0.54      0.69     14745

    accuracy                           0.97    244978
   macro avg       0.98      0.77      0.84    244978
weighted avg       0.97      0.97      0.97    244978



# Feature importance

In [33]:
import warnings
warnings.filterwarnings('ignore')
from eli5 import show_weights
from eli5.sklearn import PermutationImportance

rfc.fit(X_train, y_train)
perm = PermutationImportance(rfc, random_state=101).fit(X_test, y_test)
show_weights(perm, feature_names = list(X_test.columns))

# Feature Importances - SHAP (SHapely Additive exPlanations)

In [20]:
df_DFI = df_cleancoll[['collection_recovery_fee', 'last_pymnt_amnt', 'last_pymnt_d_2015-10-01', 'last_pymnt_d_2015-11-01', 'last_pymnt_d_2015-09-01', 'last_pymnt_d_2015-12-01','next_pymnt_d_2015-10-01','next_pymnt_d_2015-11-01','flag_bad_loan']].copy()

In [21]:
df_DFI.head()

Unnamed: 0,collection_recovery_fee,last_pymnt_amnt,last_pymnt_d_2015-10-01,last_pymnt_d_2015-11-01,last_pymnt_d_2015-09-01,last_pymnt_d_2015-12-01,next_pymnt_d_2015-10-01,next_pymnt_d_2015-11-01,flag_bad_loan
0,0.0,885.46,0,0,0,0,0,0,0
1,0.0,333.14,0,0,0,0,0,0,0
2,0.0,368.45,0,0,0,0,0,0,0
3,0.0,119.17,0,0,1,0,0,0,0
4,0.0,476.3,0,0,0,0,0,0,0


In [22]:
X_DFI = df_DFI.drop('flag_bad_loan',axis=1)
y_DFI = df_DFI['flag_bad_loan']

In [23]:
X_train_DFI, X_test_DFI, y_train_DFI, y_test_DFI = train_test_split(X_DFI, y_DFI, test_size=0.30, random_state=101)

In [24]:
rfc_DFI = RandomForestClassifier()
rfc_DFI.fit(X_train_DFI, y_train_DFI)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
rfc_pred_train_DFI = rfc_DFI.predict(X_train_DFI)
rfc_pred_test_DFI = rfc_DFI.predict(X_test_DFI)

In [26]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_train_DFI,rfc_pred_train_DFI))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    538461
           1       0.98      0.63      0.77     33153

    accuracy                           0.98    571614
   macro avg       0.98      0.82      0.88    571614
weighted avg       0.98      0.98      0.98    571614



In [27]:
print(classification_report(y_test_DFI,rfc_pred_test_DFI))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98    230233
           1       0.81      0.54      0.65     14745

    accuracy                           0.96    244978
   macro avg       0.89      0.77      0.81    244978
weighted avg       0.96      0.96      0.96    244978



In [28]:
import pickle

filename='loan_rfc_model_DFI.sav'
pickle.dump(rfc_DFI, open(filename,'wb'))

In [29]:
import pickle

filename='loan_rfc_model.sav'
pickle.dump(rfc, open(filename,'wb'))