In [2]:
import os
import sys
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import style
style.use('ggplot')
import seaborn as sns
import tqdm
import random

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

!pip install xgboost
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance, plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.model_selection import RepeatedStratifiedKFold

import warnings
warnings.filterwarnings("ignore")

data_dir = '/mnt/elice/dataset'

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# 재현성 확보를 위해 시드값을 고정합니다.
seed = 42
np.random.seed(seed)
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

In [4]:
# train_df: serial number, timestamp, X1, X2~18, Y
train_df = pd.read_csv(os.path.join(data_dir, "train.csv"), index_col='Serial Number')
# test_x: serial number, timestamp, X1, X2~18
test_x = pd.read_csv(os.path.join(data_dir, "test_x.csv"), index_col='Serial Number')

''' timestamp 열 형식 바꾸기 '''
train_df['TIMESTAMP'] = pd.to_datetime(train_df['TIMESTAMP'])
train_df['TIMESTAMP'] = train_df['TIMESTAMP'].map(lambda t: t.strftime('%Y-%m-%d %H:%M'))
test_x['TIMESTAMP'] = pd.to_datetime(test_x['TIMESTAMP'])
test_x['TIMESTAMP'] = test_x['TIMESTAMP'].map(lambda t: t.strftime('%Y-%m-%d %H:%M'))

''' 컬럼 키 추출 '''
serial_key = train_df.index.name
date_time_key = list(train_df.columns)[0]
feature_keys = list(train_df.columns)[2:-1]
target_key = list(train_df.columns)[-1]

# train_x: serial number, timestamp, X1, X2~18
train_x = train_df.drop(columns='Y')
# train_y: serial_number, Y
train_y = pd.read_csv(os.path.join(data_dir, "train_y.csv"), index_col='Serial Number')

# 데이터 전처리

## 장비 이름을 나타내는 X1 변수를 제거

In [5]:
train_x.drop(columns='X1', inplace=True)
test_x.drop(columns='X1', inplace=True)

## `StandardScaler` 를 활용해 표준화 

In [6]:
scaler = StandardScaler()

train_x[feature_keys] = scaler.fit_transform(train_x[feature_keys])
test_x[feature_keys] = scaler.transform(test_x[feature_keys])

## Serial Number를 기준으로 분리

In [7]:
# 데이터를 index가 같은 컬럼 별로 분리해서 리스트에 저장
# group: (그룹 이름, 그룹 데이터프레임) => group[1]: 그룹 데이터프레임
train_x_by_serial = [group[1] for group in train_x.groupby(train_x.index)]
test_x_by_serial = [group[1] for group in test_x.groupby(test_x.index)]

# TIMESTAMP 값을 기준으로 정렬
train_x_by_serial = [group.sort_values('TIMESTAMP') for group in train_x_by_serial]
test_x_by_serial = [group.sort_values('TIMESTAMP') for group in test_x_by_serial]

In [7]:
train_x_by_serial[2]

Unnamed: 0_level_0,TIMESTAMP,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
Serial Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
48,2020-02-09 08:01,0.237359,0.176644,1.060111,1.067976,1.258588,-0.079582,0.845695,0.629975,0.059574,-1.734490,-0.465737,-0.012637,-0.035175,-0.464819,-0.010677,-0.011952,-0.013366
48,2020-02-10 08:16,0.237359,0.176644,1.060111,1.067976,1.258588,-0.079582,0.849884,0.633251,0.059664,-1.684896,-0.410967,-0.012621,-0.035175,-0.409868,-0.010677,-0.011952,-0.012643
48,2020-02-11 08:31,0.237359,0.176644,1.060111,1.067976,1.258588,-0.079582,0.854143,0.636589,0.059844,-1.635302,-0.315119,-0.012613,-0.035175,-0.313703,-0.010677,-0.011952,-0.012282
48,2020-02-12 08:46,0.237359,0.176644,1.060111,1.067976,1.258588,-0.079582,0.858803,0.640521,0.060115,-1.583642,-0.219271,-0.012589,-0.035175,-0.217539,-0.010677,-0.011952,-0.011199
48,2020-02-13 09:01,0.237359,0.176644,1.060111,1.067976,1.258588,-0.079582,0.863227,0.644140,0.060295,-1.534048,-0.127987,-0.012589,-0.035175,-0.125954,-0.010677,-0.011952,-0.011199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48,2020-04-14 00:01,0.237359,0.176644,1.407310,1.067976,1.258588,-0.079582,1.133848,0.867393,0.078570,1.480844,4.029984,-0.012373,-0.035175,4.045745,-0.010677,-0.011952,-0.001446
48,2020-04-15 00:16,0.237359,0.176644,1.407310,1.067976,1.258588,-0.079582,1.138296,0.870981,0.078840,1.530438,4.198859,-0.012345,-0.035175,4.215177,-0.010677,-0.011952,-0.000362
48,2020-04-16 00:31,0.237359,0.176644,1.407310,1.067976,1.258588,-0.079582,1.142649,0.874445,0.079110,1.580032,4.317527,-0.012345,-0.035175,4.334238,-0.010677,-0.011952,-0.000362
48,2020-04-17 00:46,0.237359,0.176644,1.407310,1.067976,1.258588,-0.079582,1.147262,0.878283,0.079335,1.631692,4.490966,-0.012337,-0.035175,4.508249,-0.010677,-0.011952,-0.000001


## train, valid 분할

In [8]:
def train_test_split(Xs, ys, test_ratio=0.2):
    ''' 각 (x, y) 쌍을 label 별로 딕셔너리에 저장 '''
    data_per_label = {}

    for x, y in zip(Xs, ys):
        label = y
        if label not in data_per_label:
            data_per_label[label] = []
        # key: label, value: 해당 label에 해당하는 (x, y) 쌍의 리스트
        data_per_label[label].append((x, y))

    train = []
    test = []

    for label in data_per_label:
        # label에 해당하는 데이터 가져온다
        data = data_per_label[label]
        # 테스트 데이터의 개수
        n_test = int(len(data) * test_ratio)
        test += data[:n_test]
        train += data[n_test:]

    X_train, y_train = zip(*train)
    X_test, y_test = zip(*test)

    return X_train, X_test, y_train, y_test

In [9]:
X_train, X_val, y_train, y_val = train_test_split(train_x_by_serial, train_y['Y'], test_ratio=0.2)

# X data에서 Timestamp를 제거합니다.
X_train = [x.drop(columns='TIMESTAMP') for x in X_train]
X_val = [x.drop(columns='TIMESTAMP') for x in X_val]
X_test = [x.drop(columns='TIMESTAMP') for x in test_x_by_serial]

print("Train Data의 개수 :", len(X_train))
print("Validation Data의 개수 :", len(X_val))
print("Test Data의 개수 :", len(X_test))

Train Data의 개수 : 6618
Validation Data의 개수 : 1654
Test Data의 개수 : 2069


## 머신러닝 모델에 적용하기 위해 학습, 검증, 테스트용 데이터를 각각 하나의 numpy array로 합칩니다.

In [10]:
def align_data(data, series_length):
    # X2~18열에 해당하는 시계열 데이터 값만 추출
    data_features = [x[feature_keys] for x in data]
    len_data = len(data_features)
    length_aligned_X = []
    for x in data_features:
        # 시계열 데이터 길이가 series_length 이상이면 > 뒷부분 잘라냄
        if len(x) >= series_length:
            length_aligned_X.append(x[:series_length])
        # 시계열 데이터 길이가 series_length보다 작으면 > 마지막 행을 반복하여 길이를 맞춤
        else:
            length_aligned_X.append(x.append([x.iloc[-1]] * (series_length - len(x))))
    return np.array(length_aligned_X).reshape(len_data, -1)

In [11]:
series_length = 67

X_train = align_data(X_train, series_length)
X_val = align_data(X_val, series_length)
X_test = align_data(X_test, series_length)

In [12]:
y_train = np.array(y_train)
y_val = np.array(y_val)

# 모델 학습

## Fitting XGBoost with GridSearchCV

In [12]:
def xgb_gridsearchcv(model, param_grid, X_train, y_train):

    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)

    # define grid search
    grid = GridSearchCV(estimator = model, 
                        param_grid = param_grid, 
                        n_jobs = -1, 
                        cv = cv, 
                        scoring = 'f1_macro')
    grid.fit(X_train, y_train)
    results = grid.cv_results_
    best = grid.best_estimator_
    params = grid.best_params_
    scores = grid.best_score_
    print('Best Parameters : ', params)
    print('Best F1 score : ', scores)
    
    # 각 파라미터 조합에 대한 결과를 데이터프레임으로 변환
    results_df = pd.DataFrame(results)
    display(results_df)
    
    return params, scores, results_df

### 1. learning rate와 estimator 수를 고정한다.
- 초기값은 다음과 같이 선정한다.

- max_depth = 5 : 보통 4-6 를 시작점으로 한다.

- min_child_weight = 1 : 향후에 튜닝할 것이다.

- gamma = 0 : 0.1-0.2로 시작해도 된다. 향후에 튜닝할 것이다.

- subsample, colsample_bytree = 0.8 : 보통 0.5-0.9로 시작한다.

- scale_pos_weight = 1 : Because of high class imbalance.

In [15]:
model = XGBClassifier(learning_rate = 0.1,
                      n_estimators = 1000,
                      max_depth = 5,
                      min_child_weight = 1,
                      gamma = 0,
                      subsample = 0.8,
                      colsample_bytree = 0.8,
                      objective = 'binary:logistic',
                      nthread = -1,
                      scale_pos_weight = 1, 
                      random_state = 42)
model.fit(X_train, y_train)

# 검증 데이터에 대한 예측
y_val_pred = model.predict(X_val)
# 검증 데이터에 대한 F1 점수 계산
f1_val = f1_score(y_val, y_val_pred, average='macro')
print('Validation F1 score = %.3f' % f1_val)

Validation F1 score = 0.899


### 2. min_child_weight, max_depth 튜닝한다.

In [12]:
param_grid = {
    'min_child_weight' : [1, 3, 5],
    'max_depth' : [3, 6, 9],
    }
# define model
model = XGBClassifier(learning_rate = 0.1,
                      n_estimators = 1000,
                      gamma = 0,
                      subsample = 0.8,
                      colsample_bytree = 0.8,
                      objective = 'binary:logistic',
                      nthread = -1,
                      scale_pos_weight = 1, 
                      random_state = 42)

In [13]:
xgb_gridsearchcv(model, param_grid, X_train, y_train)

Best Parameters :  {'max_depth': 3, 'min_child_weight': 3}
Best F1 score :  0.9723509892793217


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,...,split23_test_score,split24_test_score,split25_test_score,split26_test_score,split27_test_score,split28_test_score,split29_test_score,mean_test_score,std_test_score,rank_test_score
0,43.700461,0.274057,0.015994,0.000727,3,1,"{'max_depth': 3, 'min_child_weight': 1}",0.981402,0.953297,0.988023,...,0.959882,0.984704,0.95371,0.970307,0.970057,0.98172,0.975414,0.97192,0.010275,2
1,40.805666,0.273634,0.015102,0.000934,3,3,"{'max_depth': 3, 'min_child_weight': 3}",0.98457,0.956214,0.990979,...,0.96313,0.981565,0.96313,0.973387,0.973164,0.98172,0.975414,0.972351,0.009172,1
2,39.902483,0.350699,0.014637,0.000373,3,5,"{'max_depth': 3, 'min_child_weight': 5}",0.981402,0.953297,0.984836,...,0.956604,0.981725,0.959882,0.973387,0.964069,0.978581,0.975414,0.970558,0.009686,3
3,50.632394,0.654055,0.017499,0.000288,6,1,"{'max_depth': 6, 'min_child_weight': 1}",0.981402,0.953297,0.985091,...,0.950405,0.98457,0.950405,0.976246,0.967201,0.978581,0.975414,0.969213,0.009985,6
4,43.915485,0.544916,0.01577,0.000769,6,3,"{'max_depth': 6, 'min_child_weight': 3}",0.98457,0.950405,0.981881,...,0.950405,0.984704,0.95084,0.973387,0.970057,0.975414,0.975414,0.969679,0.010071,5
5,42.287022,0.413996,0.014919,0.000307,6,5,"{'max_depth': 6, 'min_child_weight': 5}",0.98457,0.956214,0.987921,...,0.956985,0.984704,0.947537,0.967739,0.970057,0.972219,0.975414,0.968528,0.010502,8
6,51.289692,0.583772,0.01749,0.000604,9,1,"{'max_depth': 9, 'min_child_weight': 1}",0.97542,0.953297,0.985091,...,0.950405,0.981565,0.956604,0.973164,0.967201,0.978581,0.975414,0.969755,0.009352,4
7,44.414374,0.524243,0.015551,0.000717,9,3,"{'max_depth': 9, 'min_child_weight': 3}",0.98457,0.950405,0.981881,...,0.950405,0.981565,0.956604,0.967739,0.970307,0.978581,0.972461,0.968879,0.01001,7
8,41.610061,2.098394,0.014216,0.001943,9,5,"{'max_depth': 9, 'min_child_weight': 5}",0.98457,0.947537,0.978771,...,0.95371,0.981565,0.95371,0.967739,0.970307,0.972219,0.975414,0.968093,0.009537,9


({'max_depth': 3, 'min_child_weight': 3},
 0.9723509892793217,
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0      43.700461      0.274057         0.015994        0.000727   
 1      40.805666      0.273634         0.015102        0.000934   
 2      39.902483      0.350699         0.014637        0.000373   
 3      50.632394      0.654055         0.017499        0.000288   
 4      43.915485      0.544916         0.015770        0.000769   
 5      42.287022      0.413996         0.014919        0.000307   
 6      51.289692      0.583772         0.017490        0.000604   
 7      44.414374      0.524243         0.015551        0.000717   
 8      41.610061      2.098394         0.014216        0.001943   
 
   param_max_depth param_min_child_weight  \
 0               3                      1   
 1               3                      3   
 2               3                      5   
 3               6                      1   
 4               6            

In [14]:
param_grid = {
    'min_child_weight' : [2, 3],
    'max_depth' : [1, 2, 3],
    }
# define model
model = XGBClassifier(learning_rate = 0.1,
                      n_estimators = 1000,
                      gamma = 0,
                      subsample = 0.8,
                      colsample_bytree = 0.8,
                      objective = 'binary:logistic',
                      nthread = -1,
                      scale_pos_weight = 1, 
                      random_state = 42)

In [15]:
xgb_gridsearchcv(model, param_grid, X_train, y_train)

Best Parameters :  {'max_depth': 3, 'min_child_weight': 3}
Best F1 score :  0.9723509892793217


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,...,split23_test_score,split24_test_score,split25_test_score,split26_test_score,split27_test_score,split28_test_score,split29_test_score,mean_test_score,std_test_score,rank_test_score
0,22.324372,0.256115,0.01004,0.000815,1,2,"{'max_depth': 1, 'min_child_weight': 2}",0.975202,0.959882,0.972705,...,0.95371,0.978586,0.972705,0.970057,0.970307,0.96344,0.972698,0.965846,0.008923,5
1,22.101127,0.312749,0.010056,0.000171,1,3,"{'max_depth': 1, 'min_child_weight': 3}",0.975202,0.957358,0.969802,...,0.95084,0.978586,0.972705,0.970057,0.961238,0.960221,0.978581,0.96495,0.009062,6
2,32.664158,0.597512,0.012524,0.000759,2,2,"{'max_depth': 2, 'min_child_weight': 2}",0.98457,0.959524,0.985091,...,0.953297,0.978398,0.959882,0.970307,0.970307,0.98172,0.975835,0.971053,0.010161,3
3,31.941626,0.548674,0.012785,0.000778,2,3,"{'max_depth': 2, 'min_child_weight': 3}",0.98457,0.956214,0.975842,...,0.959882,0.981565,0.959882,0.973387,0.973387,0.975627,0.972461,0.96981,0.009681,4
4,41.114919,0.741624,0.015333,0.000536,3,2,"{'max_depth': 3, 'min_child_weight': 2}",0.98457,0.956214,0.984965,...,0.96313,0.98771,0.95371,0.976246,0.970307,0.98172,0.975414,0.972213,0.009511,2
5,39.545732,0.815737,0.014618,0.001812,3,3,"{'max_depth': 3, 'min_child_weight': 3}",0.98457,0.956214,0.990979,...,0.96313,0.981565,0.96313,0.973387,0.973164,0.98172,0.975414,0.972351,0.009172,1


({'max_depth': 3, 'min_child_weight': 3},
 0.9723509892793217,
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0      22.324372      0.256115         0.010040        0.000815   
 1      22.101127      0.312749         0.010056        0.000171   
 2      32.664158      0.597512         0.012524        0.000759   
 3      31.941626      0.548674         0.012785        0.000778   
 4      41.114919      0.741624         0.015333        0.000536   
 5      39.545732      0.815737         0.014618        0.001812   
 
   param_max_depth param_min_child_weight  \
 0               1                      2   
 1               1                      3   
 2               2                      2   
 3               2                      3   
 4               3                      2   
 5               3                      3   
 
                                     params  split0_test_score  \
 0  {'max_depth': 1, 'min_child_weight': 2}           0.975202   
 1  {'max_

In [16]:
model = XGBClassifier(learning_rate = 0.1,
                      n_estimators = 1000,
                      max_depth = 3,
                      min_child_weight = 3,
                      gamma = 0,
                      subsample = 0.8,
                      colsample_bytree = 0.8,
                      objective = 'binary:logistic',
                      nthread = -1,
                      scale_pos_weight = 1, 
                      random_state = 42)
model.fit(X_train, y_train)

# 검증 데이터에 대한 예측
y_val_pred = model.predict(X_val)
# 검증 데이터에 대한 F1 점수 계산
f1_val = f1_score(y_val, y_val_pred, average='macro')
print('Validation F1 score = %.3f' % f1_val)

Validation F1 score = 0.897


## 3. gamma을 튜닝한다.

In [17]:
param_grid = {
    'gamma' : [i/10.0 for i in range(0,5)]
    }
# define model
model = XGBClassifier(learning_rate = 0.1,
                      n_estimators = 1000,
                      max_depth = 3,
                      min_child_weight = 3,
                      subsample = 0.8,
                      colsample_bytree = 0.8,
                      objective = 'binary:logistic',
                      nthread = -1,
                      scale_pos_weight = 1, 
                      random_state = 42)

In [18]:
xgb_gridsearchcv(model, param_grid, X_train, y_train)

Best Parameters :  {'gamma': 0.0}
Best F1 score :  0.9723509892793217


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split23_test_score,split24_test_score,split25_test_score,split26_test_score,split27_test_score,split28_test_score,split29_test_score,mean_test_score,std_test_score,rank_test_score
0,39.887298,0.620552,0.014947,0.000548,0.0,{'gamma': 0.0},0.98457,0.956214,0.990979,0.978398,...,0.96313,0.981565,0.96313,0.973387,0.973164,0.98172,0.975414,0.972351,0.009172,1
1,37.683471,0.404255,0.014651,0.000955,0.1,{'gamma': 0.1},0.978205,0.956214,0.981881,0.981402,...,0.959882,0.978398,0.956604,0.973387,0.973164,0.98172,0.975414,0.970869,0.008842,4
2,33.461954,0.724978,0.013434,0.000761,0.2,{'gamma': 0.2},0.981402,0.956214,0.990979,0.981402,...,0.956604,0.978398,0.956985,0.973387,0.964662,0.978581,0.969534,0.9709,0.010082,3
3,31.463848,0.711919,0.012399,0.000655,0.3,{'gamma': 0.3},0.978398,0.953297,0.984836,0.978398,...,0.959882,0.978398,0.956604,0.973387,0.973387,0.984832,0.972461,0.971298,0.009285,2
4,30.091123,0.726174,0.01161,0.001591,0.4,{'gamma': 0.4},0.981402,0.956214,0.984965,0.981565,...,0.953297,0.984704,0.956985,0.976246,0.967201,0.984832,0.972461,0.970271,0.01052,5


({'gamma': 0.0},
 0.9723509892793217,
    mean_fit_time  std_fit_time  mean_score_time  std_score_time param_gamma  \
 0      39.887298      0.620552         0.014947        0.000548         0.0   
 1      37.683471      0.404255         0.014651        0.000955         0.1   
 2      33.461954      0.724978         0.013434        0.000761         0.2   
 3      31.463848      0.711919         0.012399        0.000655         0.3   
 4      30.091123      0.726174         0.011610        0.001591         0.4   
 
            params  split0_test_score  split1_test_score  split2_test_score  \
 0  {'gamma': 0.0}           0.984570           0.956214           0.990979   
 1  {'gamma': 0.1}           0.978205           0.956214           0.981881   
 2  {'gamma': 0.2}           0.981402           0.956214           0.990979   
 3  {'gamma': 0.3}           0.978398           0.953297           0.984836   
 4  {'gamma': 0.4}           0.981402           0.956214           0.984965   
 
    

In [19]:
model = XGBClassifier(learning_rate = 0.1,
                      n_estimators = 1000,
                      max_depth = 3,
                      min_child_weight = 3,
                      gamma = 0,
                      subsample = 0.8,
                      colsample_bytree = 0.8,
                      objective = 'binary:logistic',
                      nthread = -1,
                      scale_pos_weight = 1, 
                      random_state = 42)
model.fit(X_train, y_train)

# 검증 데이터에 대한 예측
y_val_pred = model.predict(X_val)
# 검증 데이터에 대한 F1 점수 계산
f1_val = f1_score(y_val, y_val_pred, average='macro')
print('Validation F1 score = %.3f' % f1_val)

Validation F1 score = 0.897


## 4. subsample과 colsample_bytree를 튜닝한다.

In [12]:
param_grid = {
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)]
    }
# define model
model = XGBClassifier(learning_rate = 0.1,
                      n_estimators = 1000,
                      max_depth = 3,
                      min_child_weight = 3,
                      gamma = 0,
                      objective = 'binary:logistic',
                      nthread = -1,
                      scale_pos_weight = 1, 
                      random_state = 42)

In [13]:
xgb_gridsearchcv(model, param_grid, X_train, y_train)

Best Parameters :  {'colsample_bytree': 0.7, 'subsample': 0.9}
Best F1 score :  0.9723531896105502


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,...,split23_test_score,split24_test_score,split25_test_score,split26_test_score,split27_test_score,split28_test_score,split29_test_score,mean_test_score,std_test_score,rank_test_score
0,42.760395,0.38966,0.016448,0.00074,0.6,0.6,"{'colsample_bytree': 0.6, 'subsample': 0.6}",0.98457,0.956214,0.981881,...,0.956604,0.972226,0.96313,0.967473,0.976246,0.975414,0.975414,0.970315,0.010128,15
1,43.159715,0.371529,0.01622,0.001088,0.6,0.7,"{'colsample_bytree': 0.6, 'subsample': 0.7}",0.98457,0.956214,0.987921,...,0.956604,0.981565,0.96313,0.973387,0.970552,0.975627,0.975627,0.970404,0.009697,14
2,43.193903,0.361032,0.016432,0.000716,0.6,0.8,"{'colsample_bytree': 0.6, 'subsample': 0.8}",0.98457,0.956214,0.984836,...,0.956604,0.984704,0.95371,0.973387,0.967473,0.978581,0.975414,0.971003,0.01015,9
3,43.129206,0.381248,0.016369,0.000614,0.6,0.9,"{'colsample_bytree': 0.6, 'subsample': 0.9}",0.981402,0.956214,0.987921,...,0.959882,0.981565,0.960232,0.973387,0.967201,0.984832,0.972461,0.971714,0.01014,4
4,42.686948,0.796678,0.016069,0.000465,0.7,0.6,"{'colsample_bytree': 0.7, 'subsample': 0.6}",0.98771,0.956214,0.987921,...,0.953297,0.981565,0.96313,0.976246,0.976246,0.975414,0.975414,0.970703,0.010826,12
5,42.499746,0.573197,0.016136,0.001022,0.7,0.7,"{'colsample_bytree': 0.7, 'subsample': 0.7}",0.98457,0.956214,0.987921,...,0.950405,0.987817,0.96313,0.973387,0.964368,0.978765,0.975414,0.970796,0.010042,11
6,42.288891,0.379094,0.016049,0.000574,0.7,0.8,"{'colsample_bytree': 0.7, 'subsample': 0.8}",0.98457,0.956214,0.982034,...,0.959882,0.981565,0.96313,0.970307,0.970307,0.98172,0.972461,0.971536,0.009078,6
7,42.650249,0.595857,0.016329,0.000877,0.7,0.9,"{'colsample_bytree': 0.7, 'subsample': 0.9}",0.981402,0.956214,0.984965,...,0.96313,0.984704,0.96345,0.973387,0.970057,0.98172,0.975414,0.972353,0.00931,1
8,42.677706,0.448006,0.016217,0.000878,0.8,0.6,"{'colsample_bytree': 0.8, 'subsample': 0.6}",0.98457,0.953297,0.988023,...,0.949959,0.984704,0.96635,0.973387,0.970307,0.975414,0.972219,0.971262,0.010162,8
9,42.79662,0.401065,0.015973,0.000967,0.8,0.7,"{'colsample_bytree': 0.8, 'subsample': 0.7}",0.98457,0.956214,0.987921,...,0.950405,0.987817,0.959882,0.973387,0.964368,0.98172,0.975414,0.970814,0.010571,10


({'colsample_bytree': 0.7, 'subsample': 0.9},
 0.9723531896105502,
     mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0       42.760395      0.389660         0.016448        0.000740   
 1       43.159715      0.371529         0.016220        0.001088   
 2       43.193903      0.361032         0.016432        0.000716   
 3       43.129206      0.381248         0.016369        0.000614   
 4       42.686948      0.796678         0.016069        0.000465   
 5       42.499746      0.573197         0.016136        0.001022   
 6       42.288891      0.379094         0.016049        0.000574   
 7       42.650249      0.595857         0.016329        0.000877   
 8       42.677706      0.448006         0.016217        0.000878   
 9       42.796620      0.401065         0.015973        0.000967   
 10      42.934480      0.527637         0.016029        0.000376   
 11      43.238501      0.497670         0.016041        0.000675   
 12      43.094736      0.459173    

In [13]:
model = XGBClassifier(colsample_bytree = 0.7,
                      subsample = 0.9,
                      learning_rate = 0.1,
                      n_estimators = 1000,
                      max_depth = 3,
                      min_child_weight = 3,
                      gamma = 0,
                      objective = 'binary:logistic',
                      nthread = -1,
                      scale_pos_weight = 1, 
                      random_state = 42)
model.fit(X_train, y_train)

# 검증 데이터에 대한 예측
y_val_pred = model.predict(X_val)
# 검증 데이터에 대한 F1 점수 계산
f1_val = f1_score(y_val, y_val_pred, average='macro')
print('Validation F1 score = %.3f' % f1_val)

Validation F1 score = 0.895


## 5. regularization parameter 튜닝

In [18]:
param_grid = {
    'reg_alpha' : [1e-5, 1e-2, 0.1, 1, 100]
    }
# define model
model = XGBClassifier(colsample_bytree = 0.7,
                      subsample = 0.9,
                      learning_rate = 0.1,
                      n_estimators = 1000,
                      max_depth = 3,
                      min_child_weight = 3,
                      gamma = 0,
                      objective = 'binary:logistic',
                      nthread = -1,
                      scale_pos_weight = 1, 
                      random_state = 42)

In [19]:
xgb_gridsearchcv(model, param_grid, X_train, y_train)

Best Parameters :  {'reg_alpha': 1e-05}
Best F1 score :  0.9722448290172573


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reg_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split23_test_score,split24_test_score,split25_test_score,split26_test_score,split27_test_score,split28_test_score,split29_test_score,mean_test_score,std_test_score,rank_test_score
0,41.688521,0.510955,0.015374,0.000822,1e-05,{'reg_alpha': 1e-05},0.981402,0.956214,0.984965,0.978398,...,0.959882,0.984704,0.96345,0.973387,0.970057,0.98172,0.975414,0.972245,0.009369,1
1,41.205621,0.896346,0.015262,0.000325,0.01,{'reg_alpha': 0.01},0.981402,0.953297,0.981881,0.978398,...,0.95371,0.981565,0.954113,0.976246,0.967201,0.978581,0.972461,0.970717,0.009645,3
2,41.439571,0.824712,0.015679,0.000596,0.1,{'reg_alpha': 0.1},0.978205,0.956214,0.984836,0.978398,...,0.956985,0.981565,0.950405,0.973387,0.964368,0.98172,0.972461,0.971203,0.009534,2
3,41.309943,0.879555,0.015813,0.000816,1.0,{'reg_alpha': 1},0.981402,0.953297,0.991055,0.978398,...,0.96313,0.981565,0.953297,0.970552,0.964069,0.98172,0.972461,0.970512,0.010209,4
4,23.963263,1.016284,0.009552,0.001122,100.0,{'reg_alpha': 100},0.920739,0.935759,0.93855,0.946591,...,0.912429,0.949959,0.940308,0.949311,0.920808,0.949297,0.94369,0.93582,0.012162,5


({'reg_alpha': 1e-05},
 0.9722448290172573,
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0      41.688521      0.510955         0.015374        0.000822   
 1      41.205621      0.896346         0.015262        0.000325   
 2      41.439571      0.824712         0.015679        0.000596   
 3      41.309943      0.879555         0.015813        0.000816   
 4      23.963263      1.016284         0.009552        0.001122   
 
   param_reg_alpha                params  split0_test_score  split1_test_score  \
 0         0.00001  {'reg_alpha': 1e-05}           0.981402           0.956214   
 1            0.01   {'reg_alpha': 0.01}           0.981402           0.953297   
 2             0.1    {'reg_alpha': 0.1}           0.978205           0.956214   
 3               1      {'reg_alpha': 1}           0.981402           0.953297   
 4             100    {'reg_alpha': 100}           0.920739           0.935759   
 
    split2_test_score  split3_test_score  ...  split

In [20]:
model = XGBClassifier(reg_alpha = 1e-05,
                      colsample_bytree = 0.7,
                      subsample = 0.9,
                      learning_rate = 0.1,
                      n_estimators = 1000,
                      max_depth = 3,
                      min_child_weight = 3,
                      gamma = 0,
                      objective = 'binary:logistic',
                      nthread = -1,
                      scale_pos_weight = 1, 
                      random_state = 42)
model.fit(X_train, y_train)

# 검증 데이터에 대한 예측
y_val_pred = model.predict(X_val)
# 검증 데이터에 대한 F1 점수 계산
f1_val = f1_score(y_val, y_val_pred, average='macro')
print('Validation F1 score = %.3f' % f1_val)

Validation F1 score = 0.895


### 6. learning rate 감소

In [24]:
param_grid = {
    'learning_rate' : [0.5, 0.1, 0.05, 0.01]
    }
model = XGBClassifier(colsample_bytree = 0.7,
                      subsample = 0.9,
                      n_estimators = 1000,
                      max_depth = 3,
                      min_child_weight = 3,
                      gamma = 0,
                      objective = 'binary:logistic',
                      nthread = -1,
                      scale_pos_weight = 1, 
                      random_state = 42)

In [25]:
xgb_gridsearchcv(model, param_grid, X_train, y_train)

Best Parameters :  {'learning_rate': 0.1}
Best F1 score :  0.9723531896105502


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split23_test_score,split24_test_score,split25_test_score,split26_test_score,split27_test_score,split28_test_score,split29_test_score,mean_test_score,std_test_score,rank_test_score
0,34.574998,0.588218,0.011816,0.000798,0.5,{'learning_rate': 0.5},0.981402,0.96247,0.979128,0.978398,...,0.943704,0.972226,0.95371,0.973387,0.978951,0.975835,0.969534,0.968146,0.009174,3
1,39.884118,0.584994,0.015015,0.000737,0.1,{'learning_rate': 0.1},0.981402,0.956214,0.984965,0.978398,...,0.96313,0.984704,0.96345,0.973387,0.970057,0.98172,0.975414,0.972353,0.00931,1
2,40.993747,0.627977,0.016095,0.000391,0.05,{'learning_rate': 0.05},0.978205,0.956214,0.988023,0.978398,...,0.956985,0.978398,0.956604,0.976246,0.966923,0.984832,0.975627,0.971279,0.009868,2
3,42.81271,1.465236,0.014305,0.002063,0.01,{'learning_rate': 0.01},0.978398,0.947537,0.984965,0.978586,...,0.94707,0.969275,0.959882,0.967201,0.967473,0.972461,0.975414,0.965451,0.01086,4


({'learning_rate': 0.1},
 0.9723531896105502,
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0      34.574998      0.588218         0.011816        0.000798   
 1      39.884118      0.584994         0.015015        0.000737   
 2      40.993747      0.627977         0.016095        0.000391   
 3      42.812710      1.465236         0.014305        0.002063   
 
   param_learning_rate                   params  split0_test_score  \
 0                 0.5   {'learning_rate': 0.5}           0.981402   
 1                 0.1   {'learning_rate': 0.1}           0.981402   
 2                0.05  {'learning_rate': 0.05}           0.978205   
 3                0.01  {'learning_rate': 0.01}           0.978398   
 
    split1_test_score  split2_test_score  split3_test_score  ...  \
 0           0.962470           0.979128           0.978398  ...   
 1           0.956214           0.984965           0.978398  ...   
 2           0.956214           0.988023           0.978

In [26]:
model = XGBClassifier(reg_alpha = 1e-05,
                      colsample_bytree = 0.7,
                      subsample = 0.9,
                      learning_rate = 0.1,
                      n_estimators = 1000,
                      max_depth = 3,
                      min_child_weight = 3,
                      gamma = 0,
                      objective = 'binary:logistic',
                      nthread = -1,
                      scale_pos_weight = 1, 
                      random_state = 42)
model.fit(X_train, y_train)

# 검증 데이터에 대한 예측
y_val_pred = model.predict(X_val)
# 검증 데이터에 대한 F1 점수 계산
f1_val = f1_score(y_val, y_val_pred, average='macro')
print('Validation F1 score = %.3f' % f1_val)

Validation F1 score = 0.895


In [67]:
model = XGBClassifier(reg_alpha = 1e-05,
                      colsample_bytree = 0.7,
                      subsample = 0.9,
                      learning_rate = 0.1,
                      n_estimators = 1000,
                      max_depth = 3,
                      min_child_weight = 3,
                      gamma = 0,
                      objective = 'binary:logistic',
                      nthread = -1,
                      scale_pos_weight = 1, 
                      random_state = 42,
                      eval_metric = 'logloss',
                      early_stopping = 100)
model.fit(X_train, y_train)

# 검증 데이터에 대한 예측
y_val_pred = model.predict(X_val)
# 검증 데이터에 대한 F1 점수 계산
f1_val = f1_score(y_val, y_val_pred, average='macro')
print('Validation F1 score = %.3f' % f1_val)

Validation F1 score = 0.895


# 최종 모델

In [46]:
model = XGBClassifier(scale_pos_weight = 1,
                      colsample_bytree = 1, 
                      max_depth = 3, 
                      min_child_weight = 4,
                      subsample = 1,
                      eta = 0.22,
                      random_state = 42)
model.fit(X_train, y_train)

# 검증 데이터에 대한 예측
y_val_pred = model.predict(X_val)
# 검증 데이터에 대한 F1 점수 계산
f1_val = f1_score(y_val, y_val_pred, average='macro')
print('Validation F1 score = %.3f' % f1_val)

Validation F1 score = 0.852


# 모델 평가 및 결과 저장


In [68]:
submission = pd.read_csv(os.path.join(data_dir, "test_y.csv"), index_col='Serial Number')

# 테스트 데이터에 대해 예측하고, 결과를 저장합니다.
y_test_pred = model.predict(X_test)
submission["Y"] = y_test_pred
submission.to_csv("submission.csv", index_label='Serial Number')

### 결과 검증

`submission.csv` 파일을 다시 불러와 올바르게 값을 채웠는지 다시 한번 확인합니다.

In [69]:
submission = pd.read_csv("submission.csv", index_col='Serial Number')
submission

Unnamed: 0_level_0,Y
Serial Number,Unnamed: 1_level_1
100122I,1
100368G,0
101403L,0
101426G,0
101505B,0
...,...
997719U,0
998737L,0
999308S,0
999800H,0


In [70]:
# 예측 결과 중 1의 비율을 계산하고, 학습용 데이터의 비율과 비교합니다.
print(submission["Y"].mean())
print(train_y.mean())

0.17303044949250845
Y    0.146518
dtype: float64


### 제출

우측 상단의 제출 버튼을 눌러, `competition.ipynb` 파일과 `submission.csv` 파일을 제출합니다.