In [1]:
from imblearn.over_sampling import SMOTE
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, KFold, cross_val_score

### load csv data

In [2]:
def _read_matrix(csv_path):
    """Read a comma-separated numeric file into an array, skipping its first (header) row."""
    return np.loadtxt(csv_path, delimiter=',', skiprows=1)


# Features and targets for the training and test partitions.
X_train = _read_matrix('../data/csvs/csv_3_3/train_x_mv3_22_3.csv')
y_train = _read_matrix('../data/csvs/csv_3_3/train_y_mv3_22_3.csv')
X_test = _read_matrix('../data/csvs/csv_3_3/test_x_mv3_22_3.csv')
y_test = _read_matrix('../data/csvs/csv_3_3/test_y_mv3_22_3.csv')

### oversampling using SMOTE

In [3]:
# Resample the training set with SMOTE (seeded for repeatability).
# NOTE(review): this cell reads X_train / y_train, names that the split cell
# further down rebinds. Re-running it after the split would resample data that
# has already been resampled, so run the notebook top to bottom.
smote = SMOTE(random_state=11)
resampled = smote.fit_resample(X_train, y_train)
X_train_over, y_train_over = resampled

### split train, validation data

In [4]:
# Hold out 10% of the resampled data as a validation set for early stopping.
# A fixed random_state makes the split (and therefore the early-stopping point
# and every downstream score) reproducible across kernel restarts.
# NOTE(review): the split is taken from the SMOTE-resampled arrays, so the
# validation part can contain synthetic samples; validation loss is probably
# optimistic compared with the untouched test set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_over, y_train_over, test_size=0.10, random_state=11
)

### train with XGBClassifier

In [20]:
# The evaluation metric and early stopping are configured on the estimator:
# passing them to fit() is deprecated since XGBoost 1.6 and is rejected by 2.x.
model = XGBClassifier(n_estimators=500, learning_rate=0.18, max_depth=4, random_state=32,
                      eval_metric="logloss", early_stopping_rounds=100)

# Train while monitoring logloss on the validation split; training stops once
# the metric has not improved for 100 rounds. verbose=2 logs every 2nd round.
xgb_model = model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=2)

# Clear the early-stopping setting on the fitted estimator so that clones made
# from it later (e.g. by cross_val_score, which refits without an eval_set)
# do not demand a validation set. The trained booster itself is not modified.
xgb_model.set_params(early_stopping_rounds=None)
print(xgb_model)

[0]	validation_0-logloss:0.55342
[2]	validation_0-logloss:0.38069
[4]	validation_0-logloss:0.27888
[6]	validation_0-logloss:0.21218
[8]	validation_0-logloss:0.16661
[10]	validation_0-logloss:0.13643
[12]	validation_0-logloss:0.11541
[14]	validation_0-logloss:0.09974
[16]	validation_0-logloss:0.08734
[18]	validation_0-logloss:0.07849
[20]	validation_0-logloss:0.07204




[22]	validation_0-logloss:0.06705
[24]	validation_0-logloss:0.06288
[26]	validation_0-logloss:0.05968
[28]	validation_0-logloss:0.05708
[30]	validation_0-logloss:0.05471
[32]	validation_0-logloss:0.05213
[34]	validation_0-logloss:0.05076
[36]	validation_0-logloss:0.04931
[38]	validation_0-logloss:0.04832
[40]	validation_0-logloss:0.04757
[42]	validation_0-logloss:0.04632
[44]	validation_0-logloss:0.04580
[46]	validation_0-logloss:0.04470
[48]	validation_0-logloss:0.04463
[50]	validation_0-logloss:0.04425
[52]	validation_0-logloss:0.04406
[54]	validation_0-logloss:0.04354
[56]	validation_0-logloss:0.04328
[58]	validation_0-logloss:0.04290
[60]	validation_0-logloss:0.04248
[62]	validation_0-logloss:0.04240
[64]	validation_0-logloss:0.04253
[66]	validation_0-logloss:0.04232
[68]	validation_0-logloss:0.04234
[70]	validation_0-logloss:0.04232
[72]	validation_0-logloss:0.04229
[74]	validation_0-logloss:0.04226
[76]	validation_0-logloss:0.04227
[78]	validation_0-logloss:0.04220
[80]	validatio

### predict test

In [21]:
# Run the fitted model on the held-out test features; the ground-truth targets
# are aliased so the scoring step can compare the two side by side.
pred_y = xgb_model.predict(X_test)
expected_y = y_test

### scores

In [22]:
# Per-class precision / recall / F1 on the test set.
# zero_division=0 makes the fallback explicit for samples or classes with no
# predicted label: the score is still reported as 0.0, but without emitting an
# UndefinedMetricWarning into the cell output.
print(classification_report(expected_y, pred_y, zero_division=0))

# Overall accuracy, printed as a percentage.
accuracy = accuracy_score(expected_y, pred_y)
print("Accuracy: %.2f%%" % (accuracy * 100))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        44
           1       1.00      0.84      0.92        45
           2       0.90      0.90      0.90        49
           3       1.00      0.96      0.98        53
           4       1.00      0.81      0.89        47
           5       0.97      0.85      0.90        33
           6       0.90      0.70      0.79        37
           7       0.97      0.94      0.96        36
           8       0.97      0.95      0.96        63
           9       1.00      0.92      0.96        38
          10       1.00      0.94      0.97        34
          11       1.00      0.78      0.88        32
          12       1.00      0.96      0.98        45

   micro avg       0.98      0.89      0.93       556
   macro avg       0.98      0.88      0.93       556
weighted avg       0.98      0.89      0.93       556
 samples avg       0.89      0.89      0.89       556

Accuracy: 88.31%


  _warn_prf(average, modifier, msg_start, len(result))


### cross validation

In [23]:
# 5-fold cross-validation (shuffled, seeded) of the classifier configuration.
# NOTE(review): the folds are drawn from the SMOTE-resampled arrays, so synthetic
# samples can land in the validation folds; treat these scores as indicative only.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(xgb_model, X_train_over, y_train_over, cv=kfold)

# Report per-fold and mean accuracy as percentages.
fold_accuracy_pct = np.round(scores, 4) * 100
mean_accuracy_pct = np.round(np.mean(scores), 4) * 100
print("교차 검증별 정확도: ", fold_accuracy_pct)
print("평균 검증 정확도: ", mean_accuracy_pct)

교차 검증별 정확도:  [86.77 88.   87.08 82.46 84.62]
평균 검증 정확도:  85.78


### save model

In [24]:
# Persist the trained model to a JSON file in the current working directory.
model_path = 'xgb_mv3_3.json'
xgb_model.save_model(model_path)