In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [33]:
data_path = '../data/'

train_df = pd.read_csv(data_path+'train.csv', index_col='id')
test_df = pd.read_csv(data_path+'test.csv', index_col='id')
submission_df = pd.read_csv(data_path+'sample_submission.csv', index_col='id')

In [34]:
train_df.head()

Unnamed: 0_level_0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,5448.79,...,17,1,1,0,16.0,9.0,38.0,22.0,5.0,False
1,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.0,21.11,936.71,...,11,0,1,0,11.0,11.0,18.0,14.0,3.0,False
2,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,1754.01,...,8,0,1,0,12.0,11.0,28.0,17.0,3.0,False
3,8.0,1.0,1.0,1.0,23.0,94.01,0.19,5.25,17.86,473.66,...,4,0,2,0,8.0,6.0,16.0,7.0,1.0,True
4,11.0,2.0,1.0,2.0,17.0,60.94,0.18,5.63,12.44,365.67,...,7,0,2,0,7.0,6.0,10.0,10.0,3.0,False


In [35]:
X = train_df.drop(columns='defects')
y = train_df['defects'].astype(int)

In [36]:
model = XGBClassifier(booster='gbtree', objective='binary:logistic')

In [37]:
from sklearn.model_selection import cross_val_score

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
print(f'roc auc : {np.round(scores, 4)}')
print(f'roc auc mean : {scores.mean()}')

roc auc : [0.7864 0.7825 0.7876 0.7837 0.7815]
roc auc mean : 0.7843294228456473


In [46]:
from sklearn.model_selection import StratifiedKFold

# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=61)
kfold = StratifiedKFold(n_splits=5) #, shuffle=True, random_state=61)
scores = cross_val_score(model, X, y, cv=kfold, scoring='roc_auc')
print(f'roc auc : {np.round(scores, 4)}')
print(f'roc auc mean : {scores.mean()}')

roc auc : [0.7864 0.7825 0.7876 0.7837 0.7815]
roc auc mean : 0.7843294228456473


In [39]:
# kfold 알아보기
print(f'전체 데이터의 클래스 분포 : {np.bincount(y)}')
for idx, (train_idx, valid_idx) in enumerate(kfold.split(X, y)):
    print(f'{idx+1}번째 훈련 폴드 : {np.bincount(y[train_idx])}')
    print(f'{idx+1}번째 검증 폴드 : {np.bincount(y[valid_idx])}', end=f"\n{'-'*30}\n")

전체 데이터의 클래스 분포 : [78699 23064]
1번째 훈련 폴드 : [62959 18451]
1번째 검증 폴드 : [15740  4613]
------------------------------
2번째 훈련 폴드 : [62959 18451]
2번째 검증 폴드 : [15740  4613]
------------------------------
3번째 훈련 폴드 : [62959 18451]
3번째 검증 폴드 : [15740  4613]
------------------------------
4번째 훈련 폴드 : [62959 18452]
4번째 검증 폴드 : [15740  4612]
------------------------------
5번째 훈련 폴드 : [62960 18451]
5번째 검증 폴드 : [15739  4613]
------------------------------


In [40]:
# kfold의 활용
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [41]:
def grid_search(params, random=False):
    xgb = XGBClassifier(
        booster='gbtree',
        objective='binary:logistic',
        random_state=61,
        verbosity=0,
        use_label_encder=False,
    )
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=61)
    if random:
        grid = RandomizedSearchCV(xgb, params, cv=kfold, n_iter=20, n_jobs=-1, random_state=61)
    else:
        grid = GridSearchCV(xgb, params, cv=kfold, n_jobs=-1)
    grid.fit(X, y)
    best_params = grid.best_params_
    print(f'최고의 매개변수 : {best_params}')
    best_score = grid.best_score_
    print(f'최상의 점수 : {best_score:.4f}')
    return best_params

In [42]:
# https://xgboost.readthedocs.io/en/stable/parameter.html
best_params = grid_search(params={'nestimators': [100, 200, 400, 800]})

최고의 매개변수 : {'nestimators': 100}
최상의 점수 : 0.8109


In [43]:
xgb = XGBClassifier(
        booster='gbtree',
        objective='binary:logistic',
        random_state=61,
        verbosity=0,
        use_label_encder=False,
        **best_params,
    )
xgb.fit(X,y)
preds = xgb.predict(X)
print(len(preds))
print(f'roc auc : {roc_auc_score(y, preds)}')

101763
roc auc : 0.6972940837796624


In [44]:
y_preds = xgb.predict_proba(test_df)[:, 1]
print(y_preds[:10])

[0.22478987 0.16803485 0.5567377  0.4821386  0.13984482 0.4636549
 0.1374425  0.6345678  0.31232557 0.06660065]


In [45]:
submission_df['defects'] = y_preds
submission_df.to_csv(data_path + 'submission.csv')