# Section 1: Extracting training and testing dataset

In [1]:
import pandas as pd 
train_df=pd.read_csv('C:\\workspace\\Credit-Risk-Modeling\\dataset\\loan_data_2007_2014\\train_df.csv',index_col=0)
test_df=pd.read_csv('C:\\workspace\\Credit-Risk-Modeling\\dataset\\loan_data_2007_2014\\test_df.csv',index_col=0)

In [2]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,0.42029,0.42029,0.428571,0.168605,0.330362,0.010416,0.538306,0.425356,0.034483,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1,0.217391,0.217391,0.228571,0.621609,0.135242,0.005615,0.365927,0.586647,0.0,0.030303,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2,0.337681,0.337681,0.345714,0.65407,0.214427,0.003348,0.371976,0.776944,0.034483,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
3,0.275362,0.275362,0.285714,0.029554,0.207047,0.004148,0.371976,0.229057,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
4,0.444203,0.444203,0.452143,0.324612,0.366386,0.007616,0.986895,0.448612,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [3]:
train_df.iloc[:,50].value_counts(normalize=True)

50
1.0    0.890695
0.0    0.109305
Name: proportion, dtype: float64

In [4]:
test_df.iloc[:,50].value_counts(normalize=True)

50
1.0    0.890689
0.0    0.109311
Name: proportion, dtype: float64

In [5]:
X_train=train_df.iloc[:,:50]
y_train=train_df.iloc[:,50]

X_test=test_df.iloc[:,:50]
y_test=test_df.iloc[:,50]

In [6]:
X_train.shape,X_test.shape

((373028, 50), (93257, 50))

In [7]:
y_train.shape,y_test.shape

((373028,), (93257,))

In [8]:
import numpy as np
train_np=np.concatenate((np.array(X_train),np.array(y_train).reshape((-1,1))),axis=1)

# Section 2: Hyperparametertuning

In [9]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score,accuracy_score,roc_auc_score,precision_score,recall_score,precision_recall_curve,roc_curve
from sklearn.utils import resample

from xgboost import XGBClassifier

In [10]:
def make_balance(fit_df):
    fit_df_0=fit_df[fit_df[:,-1]==0]
    fit_df_1=fit_df[fit_df[:,-1]==1]
    max_count=np.floor(len(fit_df_0)/len(fit_df_1))
    residuals=int(len(fit_df_0)-max_count*len(fit_df_1))
    balance_count=1
    fit_df2=np.array(pd.DataFrame(fit_df_1).copy())
    while balance_count<max_count:
        fit_df2=np.concatenate((fit_df2,fit_df_1))
        balance_count=balance_count+1
    fit_df2=np.concatenate((fit_df2,resample(fit_df_1,replace=False,n_samples=residuals)))
    fit_df2=np.concatenate((fit_df2,fit_df_0))
    fit_df2=resample(fit_df2,replace=False,n_samples=len(fit_df2))
    return fit_df2

In [11]:
def ensemble_xgb(train_np,params,test_size=0.3):
    n_estimators=params['n_estimators']
    object=params['objective']
    tree_method=params['tree_method']
    depth_list=params['max_depth']
    eta_list=params['eta']
    model_dic={}
    model_number=1
    
    fit_df,val_df=train_test_split(train_np,random_state=None,test_size=test_size,stratify=train_np[:,-1])
    
    fit_df2=make_balance(fit_df) 
    X_fit=fit_df2[:,:-1]
    y_fit=fit_df2[:,-1]

    #val_df2=make_balance(val_df) 
    X_val=val_df[:,:-1]
    y_val=val_df[:,-1]

    for i in depth_list:
        for j in eta_list:
            for k in n_estimators:
                model=XGBClassifier(n_estimators=np.round(k),objective=object,tree_method=tree_method,early_stopping_rounds=50,
                                    learning_rate=j,max_depth=i,n_jobs=-1,eval_metric=['aucpr']).fit(X_fit,y_fit,eval_set=[(X_val,y_val)])
                #f_1_accuracy_score=f1_score(y_val,pred)
                model_dic['model_number_{}'.format(model_number)]=\
                    {'n_estimators':model.best_iteration,'max_depth':i,'learning_rate':j,'accuracy_score':model.best_score}
                print(model_dic['model_number_{}'.format(model_number)])
                model_number=model_number+1


    model_df=pd.DataFrame(model_dic)
    
    return model_df

In [12]:
param1={'n_estimators':[20000],'max_depth':[3,5,8,12,15],'eta':[0.1,0.05,0.01],'objective':'binary:logistic','tree_method':'hist'}
len(param1['n_estimators'])*len(param1['max_depth'])*len(param1['eta'])

15

In [15]:
selected_model1=ensemble_xgb(train_np,param1)

[0]	validation_0-aucpr:0.96013
[1]	validation_0-aucpr:0.96334
[2]	validation_0-aucpr:0.96741
[3]	validation_0-aucpr:0.96741
[4]	validation_0-aucpr:0.98134
[5]	validation_0-aucpr:0.98468
[6]	validation_0-aucpr:0.98654
[7]	validation_0-aucpr:0.98653
[8]	validation_0-aucpr:0.98959
[9]	validation_0-aucpr:0.98959
[10]	validation_0-aucpr:0.99143
[11]	validation_0-aucpr:0.99161
[12]	validation_0-aucpr:0.99204
[13]	validation_0-aucpr:0.99307
[14]	validation_0-aucpr:0.99366
[15]	validation_0-aucpr:0.99363
[16]	validation_0-aucpr:0.99360
[17]	validation_0-aucpr:0.99374
[18]	validation_0-aucpr:0.99371
[19]	validation_0-aucpr:0.99379
[20]	validation_0-aucpr:0.99435
[21]	validation_0-aucpr:0.99446
[22]	validation_0-aucpr:0.99474
[23]	validation_0-aucpr:0.99492
[24]	validation_0-aucpr:0.99541
[25]	validation_0-aucpr:0.99568
[26]	validation_0-aucpr:0.99577
[27]	validation_0-aucpr:0.99581
[28]	validation_0-aucpr:0.99610
[29]	validation_0-aucpr:0.99625
[30]	validation_0-aucpr:0.99633
[31]	validation_0-