In [11]:
import warnings
warnings.filterwarnings("ignore")

In [12]:
import pandas as pd 
import numpy as np 

from xgboost import XGBClassifier,DMatrix,train
from xgboost.callback import EarlyStopping
from xgboost import cv

from sklearn.metrics import f1_score,confusion_matrix,roc_auc_score,recall_score,accuracy_score,precision_score
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.utils import resample

In [13]:
X_train=np.array(pd.read_csv('C:\workspace\Credit-Risk-Modeling\dataset\kaggle_pd_estimation\X_train.csv',index_col=0))
y_train=np.array(pd.read_csv('C:\workspace\Credit-Risk-Modeling\dataset\kaggle_pd_estimation\y_train.csv',index_col=0).astype(int)).reshape(-1)

X_test=np.array(pd.read_csv('C:\workspace\Credit-Risk-Modeling\dataset\kaggle_pd_estimation\X_test.csv',index_col=0))
y_test=np.array(pd.read_csv('C:\workspace\Credit-Risk-Modeling\dataset\kaggle_pd_estimation\y_test.csv',index_col=0).astype(int)).reshape(-1).astype(int)


In [14]:
train_np=np.concatenate((X_train,y_train.reshape(-1,1)),axis=1)

In [15]:
param={'n_estimators':list(np.linspace(10,2000,20)),'max_depth':[3,5,8,12,15],'eta':list(np.linspace(0.05,0.2,10)),'objective':'binary:logistic','tree_method':'hist'}

In [16]:
def homeMade_f1(y_true,y_pred):
    y_true=pd.Series(y_true).astype(int)
    y_pred=pd.Series(y_pred).astype(int)
    f1_acc_score=f1_score(y_true,y_pred)
    return f1_acc_score

In [17]:
def make_balance(fit_df):
    fit_df_0=fit_df[fit_df[:,-1]==0]
    fit_df_1=fit_df[fit_df[:,-1]==1]
    max_count=np.floor(len(fit_df_0)/len(fit_df_1))
    residuals=int(len(fit_df_0)-max_count*len(fit_df_1))
    balance_count=1
    fit_df2=np.array(pd.DataFrame(fit_df_1).copy())
    while balance_count<max_count:
        fit_df2=np.concatenate((fit_df2,fit_df_1))
        balance_count=balance_count+1
    fit_df2=np.concatenate((fit_df2,resample(fit_df_1,replace=False,n_samples=residuals)))
    fit_df2=np.concatenate((fit_df2,fit_df_0))
    fit_df2=resample(fit_df2,replace=False,n_samples=len(fit_df2))
    return fit_df2

In [18]:
def ensemble_xgb(train_np,params,test_size=0.3):
    n_estimators=params['n_estimators']
    object=params['objective']
    tree_method=params['tree_method']
    depth_list=params['max_depth']
    eta_list=params['eta']
    model_dic={}
    model_number=1
    
    fit_df,val_df=train_test_split(train_np,random_state=None,test_size=test_size,stratify=train_np[:,-1])
    
    fit_df2=make_balance(fit_df) 
    X_fit=fit_df2[:,:-1]
    y_fit=fit_df2[:,-1]

    #val_df2=make_balance(val_df) 
    X_val=val_df[:,:-1]
    y_val=val_df[:,-1]

    for i in depth_list:
        for j in eta_list:
            for k in n_estimators:
                model=XGBClassifier(n_estimators=np.round(k),objective=object,tree_method=tree_method,early_stopping_rounds=50,
                                    learning_rate=j,max_depth=i,n_jobs=-1,eval_metric=['aucpr']).fit(X_fit,y_fit,eval_set=[(X_val,y_val)])
                #f_1_accuracy_score=f1_score(y_val,pred)
                model_dic['model_number_{}'.format(model_number)]=\
                    {'n_estimators':model.best_iteration,'max_depth':i,'learning_rate':j,'accuracy_score':model.best_score}
                print(model_dic['model_number_{}'.format(model_number)])
                model_number=model_number+1


    model_df=pd.DataFrame(model_dic)
    
    return model_df


In [19]:
param1={'n_estimators':[2000],'max_depth':[3,4,5,6,7,8],'eta':[0.1,0.09,0.08,0.07,0.06,0.05],'objective':'binary:logistic','tree_method':'hist'}
len(param1['n_estimators'])*len(param1['max_depth'])*len(param1['eta'])

36

In [20]:
selected_model1=ensemble_xgb(train_np,param1)

[0]	validation_0-aucpr:0.75880
[1]	validation_0-aucpr:0.75880
[2]	validation_0-aucpr:0.75665


[3]	validation_0-aucpr:0.75772
[4]	validation_0-aucpr:0.75772
[5]	validation_0-aucpr:0.75616
[6]	validation_0-aucpr:0.76517
[7]	validation_0-aucpr:0.76310
[8]	validation_0-aucpr:0.77682
[9]	validation_0-aucpr:0.79074
[10]	validation_0-aucpr:0.79793
[11]	validation_0-aucpr:0.81211
[12]	validation_0-aucpr:0.81537
[13]	validation_0-aucpr:0.81069
[14]	validation_0-aucpr:0.81349
[15]	validation_0-aucpr:0.81566
[16]	validation_0-aucpr:0.81792
[17]	validation_0-aucpr:0.81932
[18]	validation_0-aucpr:0.82269
[19]	validation_0-aucpr:0.82404
[20]	validation_0-aucpr:0.82686
[21]	validation_0-aucpr:0.82671
[22]	validation_0-aucpr:0.82619
[23]	validation_0-aucpr:0.82769
[24]	validation_0-aucpr:0.82701
[25]	validation_0-aucpr:0.82920
[26]	validation_0-aucpr:0.82935
[27]	validation_0-aucpr:0.83315
[28]	validation_0-aucpr:0.83378
[29]	validation_0-aucpr:0.83629
[30]	validation_0-aucpr:0.83757
[31]	validation_0-aucpr:0.83792
[32]	validation_0-aucpr:0.83875
[33]	validation_0-aucpr:0.83921
[34]	validation

In [21]:
selected_model1=selected_model1.transpose().sort_values(['accuracy_score','learning_rate','max_depth'],ascending=[False,False,True])

In [22]:
selected_model1

Unnamed: 0,n_estimators,max_depth,learning_rate,accuracy_score
model_number_16,422.0,5.0,0.07,0.898534
model_number_7,674.0,4.0,0.1,0.898526
model_number_23,415.0,6.0,0.06,0.898408
model_number_21,377.0,6.0,0.08,0.89812
model_number_19,205.0,6.0,0.1,0.898116
model_number_15,466.0,5.0,0.08,0.897974
model_number_17,760.0,5.0,0.06,0.897855
model_number_28,441.0,7.0,0.07,0.897387
model_number_20,364.0,6.0,0.09,0.897084
model_number_13,274.0,5.0,0.1,0.897006


In [25]:
train_np2=make_balance(train_np) 
X_train=train_np2[:,:-1]
y_train=train_np2[:,-1]
model=XGBClassifier(n_estimators=422,max_depth=5,learning_rate=0.07,n_jobs=-1,objective='binary:logistic').fit(X_train,y_train)
pred=model.predict(X_test)

In [26]:
confusion_matrix(y_test,pred)

array([[4881,  214],
       [ 283, 1139]], dtype=int64)

In [27]:
confusion_matrix(y_test,pred)

array([[4881,  214],
       [ 283, 1139]], dtype=int64)

In [28]:
recall_score(y_test,pred)

0.80098452883263

In [29]:
precision_score(y_test,pred)

0.8418329637841833

In [30]:
f1_score(y_test,pred)

0.8209009009009008

In [31]:
accuracy_score(y_test,pred)

0.9237379162191193

In [32]:
accuracy_score(y_test,pred)

0.9237379162191193

In [33]:
roc_auc_score(y_test,pred)

0.8794912830620462