In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score


In [3]:
# DATA load
X = pd.read_csv('./content/loan_train_preprocessed.csv')

# backward
X = X[['term', 'initial_list_status', 'int_rate', 
'emp_length', 'annual_inc', 'dti', 'delinq_2yrs', 
'inq_last_6mths', 'revol_util', 'recoveries', 
'collection_recovery_fee', 'tot_cur_bal', 
'home_ownershipRENT', 'purposesmall_business', 
'purposewedding', 'earliest_cr_line2000']]

y = pd.read_csv('./content/loan_train_label.csv')
y = y.drop(['id'], axis=1)

In [4]:
# Dividing the data into train and test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.20 )
x_train,x_val,y_train,y_val = train_test_split(x_train,y_train,test_size = 0.20 )

x_train.shape,y_train.shape,x_val.shape,y_val.shape,x_test.shape,y_test.shape



((10240, 16), (10240, 1), (2560, 16), (2560, 1), (3200, 16), (3200, 1))

In [5]:
# 수정된 get_clf_eval() 함수 
def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    f1 = f1_score(y_test,pred)
    # ROC-AUC 추가 
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [6]:
# XGBoost training

evals = [(x_val, y_val)]

xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb_wrapper.fit(x_train , y_train,  early_stopping_rounds=100, eval_set=evals, eval_metric="logloss",  verbose=True)

y_preds = xgb_wrapper.predict(x_test)
y_pred_proba = xgb_wrapper.predict_proba(x_test)[:, 1]

[0]	validation_0-logloss:0.67110
[1]	validation_0-logloss:0.65323
[2]	validation_0-logloss:0.63793
[3]	validation_0-logloss:0.62494
[4]	validation_0-logloss:0.61419
[5]	validation_0-logloss:0.60512
[6]	validation_0-logloss:0.59704
[7]	validation_0-logloss:0.59038
[8]	validation_0-logloss:0.58429
[9]	validation_0-logloss:0.57924
[10]	validation_0-logloss:0.57523
[11]	validation_0-logloss:0.57103
[12]	validation_0-logloss:0.56741
[13]	validation_0-logloss:0.56457
[14]	validation_0-logloss:0.56182
[15]	validation_0-logloss:0.55962
[16]	validation_0-logloss:0.55739
[17]	validation_0-logloss:0.55539
[18]	validation_0-logloss:0.55365
[19]	validation_0-logloss:0.55212
[20]	validation_0-logloss:0.55072
[21]	validation_0-logloss:0.54932
[22]	validation_0-logloss:0.54795
[23]	validation_0-logloss:0.54699
[24]	validation_0-logloss:0.54571
[25]	validation_0-logloss:0.54480
[26]	validation_0-logloss:0.54415
[27]	validation_0-logloss:0.54350
[28]	validation_0-logloss:0.54273
[29]	validation_0-loglos

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[44]	validation_0-logloss:0.53765
[45]	validation_0-logloss:0.53749
[46]	validation_0-logloss:0.53734
[47]	validation_0-logloss:0.53716
[48]	validation_0-logloss:0.53725
[49]	validation_0-logloss:0.53716
[50]	validation_0-logloss:0.53731
[51]	validation_0-logloss:0.53720
[52]	validation_0-logloss:0.53714
[53]	validation_0-logloss:0.53699
[54]	validation_0-logloss:0.53675
[55]	validation_0-logloss:0.53682
[56]	validation_0-logloss:0.53669
[57]	validation_0-logloss:0.53656
[58]	validation_0-logloss:0.53665
[59]	validation_0-logloss:0.53650
[60]	validation_0-logloss:0.53666
[61]	validation_0-logloss:0.53669
[62]	validation_0-logloss:0.53680
[63]	validation_0-logloss:0.53672
[64]	validation_0-logloss:0.53673
[65]	validation_0-logloss:0.53675
[66]	validation_0-logloss:0.53666
[67]	validation_0-logloss:0.53682
[68]	validation_0-logloss:0.53703
[69]	validation_0-logloss:0.53691
[70]	validation_0-logloss:0.53687
[71]	validation_0-logloss:0.53698
[72]	validation_0-logloss:0.53686
[73]	validatio

In [7]:
get_clf_eval(y_test, y_preds, y_pred_proba)

오차 행렬
[[1145  465]
 [ 486 1104]]
정확도: 0.7028, 정밀도: 0.7036, 재현율: 0.6943,    F1: 0.6990, AUC:0.7915
