In [1]:
from imblearn.over_sampling import SMOTE
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, KFold, cross_val_score

### load csv data

In [3]:
def _load_csv(path):
    """Read a comma-delimited numeric text file into an ndarray.

    The first line of the file is skipped (presumably a header row).
    """
    return np.loadtxt(path, delimiter=',', skiprows=1)


# Feature matrices and targets for the train and test partitions.
X_train = _load_csv('../data/csvs/csv_3/train_data_mv3_22.csv')
y_train = _load_csv('../data/csvs/csv_3/train_y_mv3_22.csv')
X_test = _load_csv('../data/csvs/csv_3/test_data_mv3_22.csv')
y_test = _load_csv('../data/csvs/csv_3/test_y_mv3_22.csv')

### Oversampling with SMOTE

In [4]:
# Oversample the training data with SMOTE; the seed is fixed so the resampled
# set is identical on every run.
# NOTE(review): X_train / y_train are rebound by the later train/validation
# split cell, so re-running this cell after that one would resample data that
# has already been oversampled. Run cells top to bottom.
smote = SMOTE(random_state=11)
resampled = smote.fit_resample(X_train, y_train)
X_train_over, y_train_over = resampled

### Split into training and validation sets

In [5]:
# Hold out 10% of the oversampled data as a validation set.
# random_state is fixed so the split -- and therefore the trained model and
# every score derived from it -- is reproducible across kernel restarts.
# NOTE(review): the split is taken *after* SMOTE, so the validation set
# contains synthetic samples and its scores are likely optimistic; consider
# splitting the raw data first and oversampling only the training part.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_over, y_train_over, test_size=0.10, random_state=11
)

### train with XGBClassifier

In [31]:
# Hyperparameters of the gradient-boosted classifier, gathered in one place.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.18,
    max_depth=4,
    random_state=32,
)
model = XGBClassifier(**xgb_params)

# Log-loss is evaluated on the validation split during training, with early
# stopping set to 100 rounds; verbose=2 logs the metric every second round.
# NOTE(review): passing eval_metric / early_stopping_rounds to fit() is
# deprecated in newer xgboost releases in favour of constructor arguments --
# confirm against the installed version before upgrading.
validation_sets = ([X_val, y_val],)
xgb_model = model.fit(
    X_train,
    y_train,
    eval_metric="logloss",
    early_stopping_rounds=100,
    eval_set=validation_sets,
    verbose=2,
)
print(xgb_model)

[0]	validation_0-logloss:0.55813
[2]	validation_0-logloss:0.39185
[4]	validation_0-logloss:0.29701
[6]	validation_0-logloss:0.23640
[8]	validation_0-logloss:0.19701
[10]	validation_0-logloss:0.16875
[12]	validation_0-logloss:0.14972
[14]	validation_0-logloss:0.13515
[16]	validation_0-logloss:0.12413
[18]	validation_0-logloss:0.11600
[20]	validation_0-logloss:0.10899




[22]	validation_0-logloss:0.10421
[24]	validation_0-logloss:0.09976
[26]	validation_0-logloss:0.09614
[28]	validation_0-logloss:0.09276
[30]	validation_0-logloss:0.09037
[32]	validation_0-logloss:0.08802
[34]	validation_0-logloss:0.08600
[36]	validation_0-logloss:0.08449
[38]	validation_0-logloss:0.08342
[40]	validation_0-logloss:0.08197
[42]	validation_0-logloss:0.08100
[44]	validation_0-logloss:0.08017
[46]	validation_0-logloss:0.07941
[48]	validation_0-logloss:0.07829
[50]	validation_0-logloss:0.07769
[52]	validation_0-logloss:0.07684
[54]	validation_0-logloss:0.07653
[56]	validation_0-logloss:0.07605
[58]	validation_0-logloss:0.07568
[60]	validation_0-logloss:0.07525
[62]	validation_0-logloss:0.07513
[64]	validation_0-logloss:0.07470
[66]	validation_0-logloss:0.07467
[68]	validation_0-logloss:0.07444
[70]	validation_0-logloss:0.07439
[72]	validation_0-logloss:0.07418
[74]	validation_0-logloss:0.07396
[76]	validation_0-logloss:0.07399
[78]	validation_0-logloss:0.07371
[80]	validatio

### predict test

In [32]:
# Run the fitted model on the test features and keep the ground-truth
# targets under a separate name for the scoring step.
pred_y = xgb_model.predict(X_test)
expected_y = y_test

### scores

In [33]:
# Per-class precision / recall / F1 and overall accuracy on the test set.
# zero_division=0 states explicitly the value scikit-learn already falls back
# to when a metric is undefined (presumably samples for which the model
# predicted no label), instead of emitting an UndefinedMetricWarning.
# NOTE(review): the 'samples avg' row in the report suggests the targets are
# multilabel / one-hot; in that case accuracy_score is exact-match accuracy.
print(classification_report(expected_y, pred_y, zero_division=0))
accuracy = accuracy_score(expected_y, pred_y)
print("Accuracy: %.2f%%" % (accuracy * 100))

              precision    recall  f1-score   support

           0       1.00      0.86      0.93        44
           1       0.97      0.76      0.85        45
           2       0.98      0.90      0.94        49
           3       0.96      0.94      0.95        53
           4       0.78      0.74      0.76        47
           5       1.00      0.79      0.88        33
           6       0.88      0.76      0.81        37
           7       1.00      0.86      0.93        36
           8       0.98      0.87      0.92        63
           9       1.00      0.89      0.94        38
          10       0.93      0.82      0.87        34
          11       1.00      0.75      0.86        32
          12       1.00      0.84      0.92        45

   micro avg       0.96      0.84      0.89       556
   macro avg       0.96      0.83      0.89       556
weighted avg       0.96      0.84      0.89       556
 samples avg       0.83      0.84      0.83       556

Accuracy: 82.91%


  _warn_prf(average, modifier, msg_start, len(result))


### cross validation

In [34]:
# 5-fold cross-validation with shuffling and a fixed seed. cross_val_score
# fits a fresh clone of the estimator on each fold.
# NOTE(review): the folds are drawn from the SMOTE-resampled data, so
# synthetic samples end up in the evaluation folds; scores may not reflect
# performance on real data.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(xgb_model, X_train_over, y_train_over, cv=kfold)

# Report per-fold and mean scores as percentages. Scale to percent first and
# round last, so the multiplication cannot reintroduce floating-point noise
# into the printed values.
# First label: "accuracy per fold"; second label: "mean validation accuracy".
print("교차 검증별 정확도: ", np.round(scores * 100, 2))
print("평균 검증 정확도: ", np.round(np.mean(scores) * 100, 2))

교차 검증별 정확도:  [84.92 78.15 77.85 79.08 74.77]
평균 검증 정확도:  78.95
