# 설치

In [2]:
# One-time setup: uncomment the line below to install the pinned hyperopt release.
# %pip install hyperopt==0.2.7

Collecting hyperopt==0.2.7
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)




Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Installing collected packages: py4j, hyperopt
Successfully installed hyperopt-0.2.7 py4j-0.10.9.7


# 베이지안 최적화 기반의 파라미터 튜닝

In [3]:
# Packages
import hyperopt
from hyperopt import hp

# Search space: both variables are drawn with hp.quniform using a step of 1
# (the sampled values are still floats), x from [-10, 10] and y from [-15, 15].
search_space = dict(
    x=hp.quniform('x', -10, 10, 1),
    y=hp.quniform('y', -15, 15, 1),
)

# Objective function
from hyperopt import STATUS_OK
def objective_func(search_space):
    """Return the loss ``x**2 - 20*y`` for one sampled point.

    Parameters
    ----------
    search_space : mapping
        One sampled point; must provide the keys ``'x'`` and ``'y'``.

    Returns
    -------
    The scalar loss value for that point (lower is better for a minimizer).
    """
    squared_x = search_space['x'] ** 2
    scaled_y = 20 * search_space['y']
    return squared_x - scaled_y
# Create the Trials object that stores the outcome of every evaluation
# (its .results and .vals are read back once the search has run).
from hyperopt import fmin, tpe, Trials
trial_val = Trials()

In [4]:
# Run the search with a budget of 5 evaluations
import numpy as np

best05 = fmin(
    fn=objective_func,
    space=search_space,
    algo=tpe.suggest,       # Tree-structured Parzen Estimator suggestions
    max_evals=5,
    trials=trial_val,       # every evaluation is recorded here
    rstate=np.random.default_rng(0),  # fixed seed for reproducibility
)
print('best by 5 :', best05)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 555.27trial/s, best loss: -224.0]
best by 5 : {'x': -4.0, 'y': 12.0}


In [5]:
# Run the search with a budget of 20 evaluations.
# NOTE(review): trial_val is the same Trials object used by the 5-evaluation run,
# so hyperopt presumably continues from those recorded trials rather than
# starting over -- confirm this is intended.
best20 = fmin(
    fn=objective_func,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trial_val,
    rstate=np.random.default_rng(0),  # fixed seed for reproducibility
)
print('best by 20 :', best20)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 882.00trial/s, best loss: -296.0]
best by 20 : {'x': 2.0, 'y': 15.0}


In [6]:
import pandas as pd

# Loss value recorded for each trial
losses = [trial_result['loss'] for trial_result in trial_val.results]

# One row per trial: the sampled x and y plus the resulting loss
result_df = pd.DataFrame(dict(
    x=trial_val.vals['x'],
    y=trial_val.vals['y'],
    losses=losses,
))
result_df.head(5)

Unnamed: 0,x,y,losses
0,-6.0,5.0,-64.0
1,-4.0,10.0,-184.0
2,4.0,-2.0,56.0
3,-4.0,12.0,-224.0
4,9.0,1.0,61.0


# XGBoost Hyper Parameter 튜닝

In [7]:
# 패키지
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier as XGBC
from xgboost import plot_importance

In [8]:
# Data: load the breast-cancer dataset and put features and label in one DataFrame
df_raw = load_breast_cancer()
X_raw, Y_raw = df_raw.data, df_raw.target
df = pd.DataFrame(X_raw, columns=df_raw.feature_names).assign(target=Y_raw)
df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


In [9]:
# Features (every column except the last) and label (the last column, 'target')
X = df.iloc[:,:-1]
Y = df.iloc[:,-1]
# Split the data. 20% is held out as the test set, then 10% of the remaining 80%
# becomes the validation set, so Train:Validation:Test is about 72:8:20 (not 7:1:2).
# Naming: x_train / y_train (lowercase) hold the full 80% train+validation pool;
# X_train / Y_train and X_val / Y_val are the 90% / 10% parts of that pool.
x_train, X_test, y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 156)
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size = 0.1, random_state = 156)

In [10]:
# XGBoost search space
# hp.quniform -> values on a step-1 grid (returned as floats); hp.uniform -> continuous range
xgb_search_space = dict(
    max_depth=hp.quniform('max_depth', 5, 20, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 2, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
)

In [11]:
# XGBoost objective function
def objective_func(search_space):
    """Score one sampled hyperparameter set with 3-fold cross-validated accuracy.

    Parameters
    ----------
    search_space : mapping
        One sampled point with the keys ``'max_depth'``, ``'min_child_weight'``,
        ``'learning_rate'`` and ``'colsample_bytree'``.

    Returns
    -------
    dict
        ``{'loss': <negated mean accuracy>, 'status': STATUS_OK}``. The accuracy
        is negated so that a lower loss corresponds to a higher accuracy.

    Notes
    -----
    Reads the module-level ``x_train`` / ``y_train`` as the cross-validation data.
    """
    from sklearn.model_selection import cross_val_score

    model_params = dict(
        n_estimators=100,  # kept small to save run time in this exercise
        # the integer-valued hyperparameters are sampled as floats, so cast them back
        max_depth=int(search_space['max_depth']),
        min_child_weight=int(search_space['min_child_weight']),
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree'],
        eval_metric='logloss',
    )
    fold_scores = cross_val_score(XGBC(**model_params), x_train, y_train,
                                  scoring='accuracy', cv=3)
    return {'loss': -np.mean(fold_scores), 'status': STATUS_OK}

In [12]:
# Run the XGBoost search with a budget of 50 evaluations, recorded in a fresh Trials object
trial_val = Trials()
best50 = fmin(
    fn=objective_func,
    space=xgb_search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trial_val,
    rstate=np.random.default_rng(9),  # fixed seed for reproducibility
)
print('best by 50 :', best50)

100%|███████████████████████████████████████████████████████████████████████████████████| 50/50 [00:23<00:00,  2.12trial/s, best loss: -0.9670616939700244]
best by 50 : {'colsample_bytree': 0.5424149213362504, 'learning_rate': 0.12601372924444681, 'max_depth': 17.0, 'min_child_weight': 2.0}


In [13]:
# Show the best hyperparameters (the integer-valued ones are cast back from float)
print('colsample_bytree : {:.4f}, learning_rate : {:.2f}, max_depth : {}, min_child_weight : {}'.format(
    best50['colsample_bytree'],
    best50['learning_rate'],
    int(best50['max_depth']),
    int(best50['min_child_weight']),
))

colsample_bytree : 0.5424, learning_rate : 0.13, max_depth : 17, min_child_weight : 2


In [14]:
# Train the final model with the hyperparameters found by the search
xgb_wrapper = XGBC(
                   n_estimators = 400,
                   learning_rate = round(best50['learning_rate'],5),
                   max_depth = int(best50['max_depth']),
                   min_child_weight = int(best50['min_child_weight']),
                   colsample_bytree = round(best50['colsample_bytree'],5)
                   )
# Evaluation sets logged every round: training part first, validation part last
evals = [(X_train, Y_train), (X_val, Y_val)]

# Early-stopping configuration. Passing these keywords to fit() is deprecated
# since xgboost 1.6 and rejected by recent 2.x releases, which expect them on the
# estimator instead; older releases only accept them in fit(). Pick whichever
# form the installed version supports so the cell runs on both.
early_stop_params = {'early_stopping_rounds': 50, 'eval_metric': 'logloss'}
xgb_major_minor = tuple(int(part) for part in xgb.__version__.split('.')[:2])
if xgb_major_minor >= (1, 6):
    xgb_wrapper.set_params(**early_stop_params)
    xgb_wrapper.fit(X_train, Y_train, eval_set = evals, verbose = True)
else:
    xgb_wrapper.fit(X_train, Y_train, eval_set = evals, verbose = True, **early_stop_params)

# Predictions on the held-out test set: class labels and positive-class probability
pred = xgb_wrapper.predict(X_test)
pred_proba = xgb_wrapper.predict_proba(X_test)[:,1]

[0]	validation_0-logloss:0.58942	validation_1-logloss:0.62048
[1]	validation_0-logloss:0.50801	validation_1-logloss:0.55913
[2]	validation_0-logloss:0.44160	validation_1-logloss:0.50928
[3]	validation_0-logloss:0.38734	validation_1-logloss:0.46815
[4]	validation_0-logloss:0.34224	validation_1-logloss:0.43913
[5]	validation_0-logloss:0.30425	validation_1-logloss:0.41570
[6]	validation_0-logloss:0.27178	validation_1-logloss:0.38953
[7]	validation_0-logloss:0.24503	validation_1-logloss:0.37317
[8]	validation_0-logloss:0.22050	validation_1-logloss:0.35628
[9]	validation_0-logloss:0.19873	validation_1-logloss:0.33798
[10]	validation_0-logloss:0.17945	validation_1-logloss:0.32463
[11]	validation_0-logloss:0.16354	validation_1-logloss:0.31384
[12]	validation_0-logloss:0.15032	validation_1-logloss:0.30607
[13]	validation_0-logloss:0.13813	validation_1-logloss:0.30143
[14]	validation_0-logloss:0.12798	validation_1-logloss:0.29513
[15]	validation_0-logloss:0.11926	validation_1-logloss:0.28891
[1



[54]	validation_0-logloss:0.02500	validation_1-logloss:0.23781
[55]	validation_0-logloss:0.02458	validation_1-logloss:0.23909
[56]	validation_0-logloss:0.02422	validation_1-logloss:0.23809
[57]	validation_0-logloss:0.02380	validation_1-logloss:0.23843
[58]	validation_0-logloss:0.02348	validation_1-logloss:0.23802
[59]	validation_0-logloss:0.02310	validation_1-logloss:0.23837
[60]	validation_0-logloss:0.02275	validation_1-logloss:0.23923
[61]	validation_0-logloss:0.02257	validation_1-logloss:0.23813
[62]	validation_0-logloss:0.02242	validation_1-logloss:0.23983
[63]	validation_0-logloss:0.02227	validation_1-logloss:0.23883
[64]	validation_0-logloss:0.02186	validation_1-logloss:0.23589
[65]	validation_0-logloss:0.02162	validation_1-logloss:0.23630
[66]	validation_0-logloss:0.02148	validation_1-logloss:0.23795
[67]	validation_0-logloss:0.02136	validation_1-logloss:0.23704
[68]	validation_0-logloss:0.02123	validation_1-logloss:0.23670
[69]	validation_0-logloss:0.02097	validation_1-logloss:

[183]	validation_0-logloss:0.01439	validation_1-logloss:0.22504
[184]	validation_0-logloss:0.01436	validation_1-logloss:0.22554
[185]	validation_0-logloss:0.01433	validation_1-logloss:0.22533
[186]	validation_0-logloss:0.01431	validation_1-logloss:0.22427
[187]	validation_0-logloss:0.01428	validation_1-logloss:0.22545
[188]	validation_0-logloss:0.01425	validation_1-logloss:0.22563
[189]	validation_0-logloss:0.01422	validation_1-logloss:0.22525
[190]	validation_0-logloss:0.01419	validation_1-logloss:0.22504
[191]	validation_0-logloss:0.01417	validation_1-logloss:0.22523
[192]	validation_0-logloss:0.01414	validation_1-logloss:0.22529
[193]	validation_0-logloss:0.01411	validation_1-logloss:0.22492
[194]	validation_0-logloss:0.01409	validation_1-logloss:0.22472
[195]	validation_0-logloss:0.01406	validation_1-logloss:0.22589
[196]	validation_0-logloss:0.01403	validation_1-logloss:0.22595
[197]	validation_0-logloss:0.01401	validation_1-logloss:0.22646
[198]	validation_0-logloss:0.01399	valid

In [15]:
# clf_eval() function
def clf_eval(y_test, pred = None, pred_proba = None) :
    """Print classification metrics for a set of predictions.

    Prints the confusion matrix, accuracy, precision, recall and F1 score, and,
    when ``pred_proba`` is given, the ROC-AUC as well.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted class labels. Required, despite the ``None`` default kept for
        signature compatibility.
    pred_proba : array-like, optional
        Scores passed to ``roc_auc_score``. When ``None`` the AUC line is
        skipped instead of failing after the other metrics were printed.

    Raises
    ------
    ValueError
        If ``pred`` is ``None``.
    """
    # Fail fast with a clear message instead of an opaque error from sklearn
    if pred is None :
        raise ValueError('clf_eval() requires `pred` (predicted class labels)')
    # Packages
    from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
    cm = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    # Print the results
    print('오차행렬\n', cm)
    print('\n정확도 : {:.4f}\n정밀도 : {:.4f}\n재현율 : {:.4f}'.format(accuracy, precision, recall))
    # F1 score
    f1 = f1_score(y_test, pred)
    print('F1 : {:.4f}'.format(f1))
    # ROC-AUC (only when scores were supplied)
    if pred_proba is not None :
        roc_auc = roc_auc_score(y_test, pred_proba)
        print('AUC : {:.4f}'.format(roc_auc))
    
# Check the results: evaluate the test-set predictions against the test labels
clf_eval(Y_test, pred, pred_proba)

오차행렬
 [[35  2]
 [ 2 75]]

정확도 : 0.9649
정밀도 : 0.9740
재현율 : 0.9740
F1 : 0.9740
AUC : 0.9944


In [16]:
import pandas as pd

# Loss value recorded for each trial of the XGBoost search
losses = [trial_result['loss'] for trial_result in trial_val.results]

# One row per trial: the sampled hyperparameters plus the resulting loss
result_df = pd.DataFrame(dict(
    max_depth=trial_val.vals['max_depth'],
    min_child_weight=trial_val.vals['min_child_weight'],
    colsample_bytree=trial_val.vals['colsample_bytree'],
    learning_rate=trial_val.vals['learning_rate'],
    losses=losses,
))
result_df.head(5)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,learning_rate,losses
0,19.0,2.0,0.585235,0.033688,-0.947296
1,5.0,2.0,0.727186,0.105956,-0.960483
2,6.0,2.0,0.959945,0.154804,-0.95829
3,6.0,2.0,0.950012,0.120686,-0.960468
4,16.0,2.0,0.674336,0.142392,-0.962661
