In [1]:
from datetime import datetime
import numpy as np
import pandas as pd
import xgboost
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import roc_curve, auc 
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
dfTrain = pd.read_csv('ML_Artivatic_dataset\\train_indessa.csv')
dfTest = pd.read_csv('ML_Artivatic_dataset\\test_indessa.csv')

In [3]:
dfTrain = dfTrain[['member_id', 'loan_amnt', 'funded_amnt', 'addr_state', 'funded_amnt_inv', 'sub_grade', 'term', 'emp_length', 'home_ownership', 'int_rate', 'purpose', 'annual_inc', 'verification_status', 'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'last_week_pay', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim', 'loan_status']]
dfTest = dfTest[['member_id', 'loan_amnt', 'funded_amnt', 'addr_state', 'funded_amnt_inv', 'sub_grade', 'term', 'emp_length', 'home_ownership', 'int_rate', 'purpose', 'annual_inc', 'verification_status', 'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'last_week_pay', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']]

In [4]:
dfTrain.shape, dfTest.shape

((532428, 36), (354951, 35))

# Data Cleaning 

In [5]:
dfTrain['term'].replace(to_replace=' months', value='', regex=True, inplace=True)
dfTest['term'].replace(to_replace=' months', value='', regex=True, inplace=True)
dfTrain['term'] = pd.to_numeric(dfTrain['term'], errors='coerce')
dfTest['term'] = pd.to_numeric(dfTest['term'], errors='coerce')

In [6]:
dfTrain['emp_length'].replace('n/a', '0', inplace=True)
dfTrain['emp_length'].replace(to_replace='\+ years', value='', regex=True, inplace=True)
dfTrain['emp_length'].replace(to_replace=' years', value='', regex=True, inplace=True)
dfTrain['emp_length'].replace(to_replace='< 1 year', value='0', regex=True, inplace=True)
dfTrain['emp_length'].replace(to_replace=' year', value='', regex=True, inplace=True)
dfTest['emp_length'].replace('n/a', '0', inplace=True)
dfTest['emp_length'].replace(to_replace='\+ years', value='', regex=True, inplace=True)
dfTest['emp_length'].replace(to_replace=' years', value='', regex=True, inplace=True)
dfTest['emp_length'].replace(to_replace='< 1 year', value='0', regex=True, inplace=True)
dfTest['emp_length'].replace(to_replace=' year', value='', regex=True, inplace=True)
dfTrain['emp_length'] = pd.to_numeric(dfTrain['emp_length'], errors='coerce')
dfTest['emp_length'] = pd.to_numeric(dfTest['emp_length'], errors='coerce')

In [7]:
dfTrain['last_week_pay'].replace(to_replace='th week', value='', regex=True, inplace=True)
dfTest['last_week_pay'].replace(to_replace='th week', value='', regex=True, inplace=True)
dfTrain['last_week_pay'].replace(to_replace='NA', value='', regex=True, inplace=True)
dfTest['last_week_pay'].replace(to_replace='NA', value='', regex=True, inplace=True)
dfTrain['last_week_pay'] = pd.to_numeric(dfTrain['last_week_pay'], errors='coerce')
dfTest['last_week_pay'] = pd.to_numeric(dfTest['last_week_pay'], errors='coerce')

In [8]:
dfTrain['sub_grade'].replace(to_replace='A', value='0', regex=True, inplace=True)
dfTrain['sub_grade'].replace(to_replace='B', value='1', regex=True, inplace=True)
dfTrain['sub_grade'].replace(to_replace='C', value='2', regex=True, inplace=True)
dfTrain['sub_grade'].replace(to_replace='D', value='3', regex=True, inplace=True)
dfTrain['sub_grade'].replace(to_replace='E', value='4', regex=True, inplace=True)
dfTrain['sub_grade'].replace(to_replace='F', value='5', regex=True, inplace=True)
dfTrain['sub_grade'].replace(to_replace='G', value='6', regex=True, inplace=True)
dfTest['sub_grade'].replace(to_replace='A', value='0', regex=True, inplace=True)
dfTest['sub_grade'].replace(to_replace='B', value='1', regex=True, inplace=True)
dfTest['sub_grade'].replace(to_replace='C', value='2', regex=True, inplace=True)
dfTest['sub_grade'].replace(to_replace='D', value='3', regex=True, inplace=True)
dfTest['sub_grade'].replace(to_replace='E', value='4', regex=True, inplace=True)
dfTest['sub_grade'].replace(to_replace='F', value='5', regex=True, inplace=True)
dfTest['sub_grade'].replace(to_replace='G', value='6', regex=True, inplace=True)
dfTrain['sub_grade'] = pd.to_numeric(dfTrain['sub_grade'], errors='coerce')
dfTest['sub_grade'] = pd.to_numeric(dfTest['sub_grade'], errors='coerce')

In [9]:
dfTrain.isnull().sum()

member_id                           0
loan_amnt                           0
funded_amnt                         0
addr_state                          0
funded_amnt_inv                     0
sub_grade                           0
term                                0
emp_length                      26891
home_ownership                      0
int_rate                            0
purpose                             0
annual_inc                          3
verification_status                 0
pymnt_plan                          0
dti                                 0
delinq_2yrs                        16
inq_last_6mths                     16
mths_since_last_delinq         272554
mths_since_last_record         450305
open_acc                           16
pub_rec                            16
revol_bal                           0
revol_util                        287
total_acc                          16
total_rec_int                       0
total_rec_late_fee                  0
recoveries  

In [10]:
dfTest.isnull().sum()

member_id                           0
loan_amnt                           0
funded_amnt                         0
addr_state                          0
funded_amnt_inv                     0
sub_grade                           0
term                                0
emp_length                      17934
home_ownership                      0
int_rate                            0
purpose                             0
annual_inc                          1
verification_status                 0
pymnt_plan                          0
dti                                 0
delinq_2yrs                        13
inq_last_6mths                     13
mths_since_last_delinq         181758
mths_since_last_record         300021
open_acc                           13
pub_rec                            13
revol_bal                           0
revol_util                        215
total_acc                          13
total_rec_int                       0
total_rec_late_fee                  0
recoveries  

In [11]:
cols = ['last_week_pay', 'annual_inc', 'emp_length', 'delinq_2yrs', 'inq_last_6mths', 'pub_rec', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'revol_util', 'total_acc', 'mths_since_last_major_derog', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']
for col in cols:
    dfTrain[col].fillna(dfTrain[col].median(), inplace=True)
    dfTest[col].fillna(dfTest[col].median(), inplace=True)

In [12]:
cols = ['acc_now_delinq', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'collections_12_mths_ex_med']
for col in cols:
    dfTrain[col].fillna(0, inplace=True)
    dfTest[col].fillna(0, inplace=True)

In [13]:
cat_attr = ['addr_state', 'home_ownership', 'purpose', 'verification_status', 'pymnt_plan']
for cat in cat_attr:
    df_col = [cat]
    dfTrain[cat] = dfTrain[cat].astype("category")
    dfTrain[cat] = pd.get_dummies(dfTrain, columns=df_col)
    dfTest[cat] = dfTest[cat].astype("category")
    dfTest[cat] = pd.get_dummies(dfTest, columns=df_col)

In [14]:
dfTrain.isnull().sum()

member_id                      0
loan_amnt                      0
funded_amnt                    0
addr_state                     0
funded_amnt_inv                0
sub_grade                      0
term                           0
emp_length                     0
home_ownership                 0
int_rate                       0
purpose                        0
annual_inc                     0
verification_status            0
pymnt_plan                     0
dti                            0
delinq_2yrs                    0
inq_last_6mths                 0
mths_since_last_delinq         0
mths_since_last_record         0
open_acc                       0
pub_rec                        0
revol_bal                      0
revol_util                     0
total_acc                      0
total_rec_int                  0
total_rec_late_fee             0
recoveries                     0
collection_recovery_fee        0
collections_12_mths_ex_med     0
mths_since_last_major_derog    0
last_week_

In [15]:
dfTest.isnull().sum()

member_id                      0
loan_amnt                      0
funded_amnt                    0
addr_state                     0
funded_amnt_inv                0
sub_grade                      0
term                           0
emp_length                     0
home_ownership                 0
int_rate                       0
purpose                        0
annual_inc                     0
verification_status            0
pymnt_plan                     0
dti                            0
delinq_2yrs                    0
inq_last_6mths                 0
mths_since_last_delinq         0
mths_since_last_record         0
open_acc                       0
pub_rec                        0
revol_bal                      0
revol_util                     0
total_acc                      0
total_rec_int                  0
total_rec_late_fee             0
recoveries                     0
collection_recovery_fee        0
collections_12_mths_ex_med     0
mths_since_last_major_derog    0
last_week_

In [16]:
train_target = pd.DataFrame(dfTrain['loan_status'])
finalTrain = dfTrain.drop(['loan_status'], axis=1)
finalTest = dfTest

In [17]:
finalTrain.shape, finalTest.shape

((532428, 35), (354951, 35))

In [18]:
finalTrain.head()

Unnamed: 0,member_id,loan_amnt,funded_amnt,addr_state,funded_amnt_inv,sub_grade,term,emp_length,home_ownership,int_rate,...,total_rec_late_fee,recoveries,collection_recovery_fee,collections_12_mths_ex_med,mths_since_last_major_derog,last_week_pay,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
0,58189336,14350,14350,58189336,14350.0,43,36,9.0,58189336,19.19,...,0.0,0.0,0.0,0.0,74.0,26.0,0.0,0.0,28699.0,30800.0
1,70011223,4800,4800,70011223,4800.0,14,36,0.0,70011223,10.99,...,0.0,0.0,0.0,0.0,44.0,9.0,0.0,0.0,9974.0,32900.0
2,70255675,10000,10000,70255675,10000.0,4,36,2.0,70255675,7.26,...,0.0,0.0,0.0,0.0,44.0,9.0,0.0,65.0,38295.0,34900.0
3,1893936,15000,15000,1893936,15000.0,35,36,10.0,1893936,19.72,...,0.0,0.0,0.0,0.0,44.0,135.0,0.0,0.0,55564.0,24700.0
4,7652106,16000,16000,7652106,16000.0,12,36,10.0,7652106,10.64,...,0.0,0.0,0.0,0.0,44.0,96.0,0.0,0.0,47159.0,47033.0


# Feature Selection, Splitting and Training

In [19]:
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
import warnings
warnings.filterwarnings('ignore')

In [20]:
xg = XGBClassifier(scale_pos_weight=2.5 ,silent=True)

X_train, X_test, y_train, y_test = train_test_split(np.array(finalTrain), np.array(train_target), test_size=0.30)
eval_set=[(X_test, y_test)]

In [21]:
selector = SelectFromModel(xg)

In [22]:
selector.fit(X_train, y_train)

SelectFromModel(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                        colsample_bylevel=1, colsample_bynode=1,
                                        colsample_bytree=1, gamma=0,
                                        learning_rate=0.1, max_delta_step=0,
                                        max_depth=3, min_child_weight=1,
                                        missing=None, n_estimators=100,
                                        n_jobs=1, nthread=None,
                                        objective='binary:logistic',
                                        random_state=0, reg_alpha=0,
                                        reg_lambda=1, scale_pos_weight=2.5,
                                        seed=None, silent=True, subsample=1,
                                        verbosity=1),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [23]:
selector.get_support()

array([ True, False, False, False, False, False,  True, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False,  True, False, False, False, False])

In [26]:
selected_features = finalTrain.columns[selector.get_support()]
selected_features

Index(['member_id', 'term', 'dti', 'recoveries', 'last_week_pay'], dtype='object')

In [27]:
ultimate_finalTrain = finalTrain[selected_features]

In [30]:
X_train, X_test, y_train, y_test = train_test_split(np.array(ultimate_finalTrain), np.array(train_target), test_size=0.30)
eval_set=[(X_test, y_test)]

In [32]:
ultimate_finalTrain.head()

Unnamed: 0,member_id,term,dti,recoveries,last_week_pay
0,58189336,36,33.88,0.0,26.0
1,70011223,36,3.64,0.0,9.0
2,70255675,36,18.42,0.0,9.0
3,1893936,36,14.97,0.0,135.0
4,7652106,36,20.16,0.0,96.0


In [31]:
#XGBOOST
st = datetime.now()

xg = XGBClassifier(scale_pos_weight=2.5 ,silent=True)
    
xg.fit(X_train, y_train)

print(datetime.now()-st)


# evaluate predictions
y_train_predict_xg = xg.predict(X_train)
print("==========================")
print('Train Accuracy %.3f' % metrics.accuracy_score(y_train, y_train_predict_xg))

# make predictions for test data
y_pred_xg = xg.predict(X_test)
predictions = [round(value) for value in y_pred_xg]

accuracy_per_roc_auc = roc_auc_score(y_test, predictions)
print("ROC-AUC: %.10f%%" % (accuracy_per_roc_auc * 100))


print('Test Accuracy %.3f' % metrics.accuracy_score(y_test, predictions))
print("==========================")
print(metrics.confusion_matrix(y_test, predictions))
print("==========================")
print(metrics.classification_report(y_test, predictions))
print("==========================")
print('Precision Score %.3f' % metrics.precision_score(y_test, predictions))
print('Recall Score %.3f' % metrics.recall_score(y_test, predictions))
print('F1 Score %.3f' % metrics.f1_score(y_test, predictions)) 

0:00:36.792929
Train Accuracy 0.928
ROC-AUC: 90.4300988026%
Test Accuracy 0.928
[[115859   6347]
 [  5233  32290]]
              precision    recall  f1-score   support

           0       0.96      0.95      0.95    122206
           1       0.84      0.86      0.85     37523

    accuracy                           0.93    159729
   macro avg       0.90      0.90      0.90    159729
weighted avg       0.93      0.93      0.93    159729

Precision Score 0.836
Recall Score 0.861
F1 Score 0.848


In [33]:
#LGBM
from lightgbm import LGBMClassifier

st = datetime.now()

lgbm = LGBMClassifier(objective ="binary",verbosity = -1, learning_rate=0.1, n_estimators=1000, scale_pos_weight=2.5)
    
lgbm.fit(X_train, y_train)

print(datetime.now()-st)

# evaluate predictions
y_train_predict_lgbm = lgbm.predict(X_train)

print("==========================")
print('Train Accuracy %.3f' % metrics.accuracy_score(y_train, y_train_predict_lgbm))

# make predictions for test data
y_pred_lgbm = lgbm.predict(X_test)
predictions = [round(value) for value in y_pred_lgbm]


accuracy_per_roc_auc = roc_auc_score(y_test, predictions)
print("ROC-AUC: %.10f%%" % (accuracy_per_roc_auc * 100))


print('Test Accuracy %.3f' % metrics.accuracy_score(y_test, predictions))
print("==========================")
print(metrics.confusion_matrix(y_test, predictions))
print("==========================")
print(metrics.classification_report(y_test, predictions))
print("==========================")
print('Precision Score %.3f' % metrics.precision_score(y_test, predictions))
print('Recall Score %.3f' % metrics.recall_score(y_test, predictions))
print('F1 Score %.3f' % metrics.f1_score(y_test, predictions)) 

0:00:34.460304
Train Accuracy 0.938
ROC-AUC: 93.3749075712%
Test Accuracy 0.934
[[114250   7956]
 [  2529  34994]]
              precision    recall  f1-score   support

           0       0.98      0.93      0.96    122206
           1       0.81      0.93      0.87     37523

    accuracy                           0.93    159729
   macro avg       0.90      0.93      0.91    159729
weighted avg       0.94      0.93      0.94    159729

Precision Score 0.815
Recall Score 0.933
F1 Score 0.870
