<a href="https://colab.research.google.com/github/steelpipe75/kagglebook-for-colab/blob/master/ch06/ch06-01-hopt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ---------------------------------
# データ等の準備
# ----------------------------------
import numpy as np
import pandas as pd

In [2]:
import importlib
import sys
import subprocess

# Google Colab 上で実行しているかどうかを判断するフラグ
ON_COLAB = "google.colab" in sys.modules
print(f"ON_COLAB: {ON_COLAB}")

if ON_COLAB:
    USE_GIT = True # Gitを使う
    # USE_GIT = False # Gitを使わない

    print(f"USE_GIT: {USE_GIT}")
    if USE_GIT:
        !git clone https://github.com/ghmagazine/kagglebook.git
    else:
        # Google Drive にマウントする
        drive = importlib.import_module("google.colab.drive")
        drive.mount("/content/drive/")

        import os
        colab_dir = "/content/drive/MyDrive/kagglebook/" # データ置き場

ON_COLAB: True
USE_GIT: True
fatal: destination path 'kagglebook' already exists and is not an empty directory.


train_xは学習データ、train_yは目的変数、test_xはテストデータ
pandasのDataFrame, Seriesで保持します。（numpyのarrayで保持することもあります）

In [3]:
if ON_COLAB:
    if USE_GIT:
        train = pd.read_csv('/content/kagglebook/input/sample-data/train_preprocessed.csv')
    else:
        train = pd.read_csv(os.path.join(colab_dir, 'input/sample-data/train_preprocessed.csv'))
else:
    train = pd.read_csv('../input/sample-data/train_preprocessed.csv')
train_x = train.drop(['target'], axis=1)
train_y = train['target']
if ON_COLAB:
    if USE_GIT:
        test_x = pd.read_csv('/content/kagglebook/input/sample-data/test_preprocessed.csv')
    else:
        test_x = pd.read_csv(os.path.join(colab_dir, 'input/sample-data/test_preprocessed.csv'))
else:
    test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv')

In [4]:
# 学習データを学習データとバリデーションデータに分ける
from sklearn.model_selection import KFold

In [5]:
kf = KFold(n_splits=4, shuffle=True, random_state=71)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

In [6]:
# xgboostによる学習・予測を行うクラス
import xgboost as xgb

In [7]:
class Model:

    def __init__(self, params=None):
        self.model = None
        if params is None:
            self.params = {}
        else:
            self.params = params

    def fit(self, tr_x, tr_y, va_x, va_y):
        params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71}
        params.update(self.params)
        num_round = 10
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        self.model = xgb.train(params, dtrain, num_round, evals=watchlist)

    def predict(self, x):
        data = xgb.DMatrix(x)
        pred = self.model.predict(data)
        return pred

-----------------------------------
探索するパラメータの空間の指定
-----------------------------------
hp.choiceでは、複数の選択肢から選ぶ
hp.uniformでは、下限・上限を指定した一様分布から抽出する。引数は下限・上限
hp.quniformでは、下限・上限を指定した一様分布のうち一定の間隔ごとの点から抽出する。引数は下限・上限・間隔
hp.loguniformでは、下限・上限を指定した対数が一様分布に従う分布から抽出する。引数は下限・上限の対数をとった値

In [8]:
from hyperopt import hp

In [9]:
space = {
    'activation': hp.choice('activation', ['prelu', 'relu']),
    'dropout': hp.uniform('dropout', 0, 0.2),
    'units': hp.quniform('units', 32, 256, 32),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.00001), np.log(0.01)),
}

In [10]:
# -----------------------------------
# hyperoptを使ったパラメータ探索
# -----------------------------------
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import log_loss

In [11]:
def score(params):
    # パラメータを与えたときに最小化する評価指標を指定する
    # 具体的には、モデルにパラメータを指定して学習・予測させた場合のスコアを返すようにする

    # max_depthの型を整数型に修正する
    params['max_depth'] = int(params['max_depth'])

    # Modelクラスを定義しているものとする
    # Modelクラスは、fitで学習し、predictで予測値の確率を出力する
    model = Model(params)
    model.fit(tr_x, tr_y, va_x, va_y)
    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    print(f'params: {params}, logloss: {score:.4f}')

    # 情報を記録しておく
    history.append((params, score))

    return {'loss': score, 'status': STATUS_OK}

In [12]:
# 探索するパラメータの空間を指定する
space = {
    'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1),
    'max_depth': hp.quniform('max_depth', 3, 9, 1),
    'gamma': hp.quniform('gamma', 0, 0.4, 0.1),
}

In [13]:
# hyperoptによるパラメータ探索の実行
max_evals = 10
trials = Trials()
history = []
fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

Parameters: { "silent" } are not used.




[0]	train-logloss:0.43616	eval-logloss:0.44908
[1]	train-logloss:0.40152	eval-logloss:0.41851
[2]	train-logloss:0.37815	eval-logloss:0.39463
[3]	train-logloss:0.35871	eval-logloss:0.37839
[4]	train-logloss:0.34113	eval-logloss:0.36450
[5]	train-logloss:0.32922	eval-logloss:0.35546
[6]	train-logloss:0.31727	eval-logloss:0.34401
[7]	train-logloss:0.30476	eval-logloss:0.33126
[8]	train-logloss:0.29570	eval-logloss:0.32379
[9]	train-logloss:0.28685	eval-logloss:0.31610
params: {'gamma': 0.30000000000000004, 'max_depth': 4, 'min_child_weight': 2.0}, logloss: 0.3161
[0]	train-logloss:0.43615	eval-logloss:0.44921
[1]	train-logloss:0.40165	eval-logloss:0.41864
[2]	train-logloss:0.37854	eval-logloss:0.39472
[3]	train-logloss:0.36032	eval-logloss:0.37623
[4]	train-logloss:0.34341	eval-logloss:0.36069
[5]	train-logloss:0.33057	eval-logloss:0.35026
[6]	train-logloss:0.31790	eval-logloss:0.33990
[7]	train-logloss:0.30732	eval-logloss:0.33210
 10%|█         | 1/10 [00:01<00:08,  1.12trial/s, best lo

Parameters: { "silent" } are not used.




[8]	train-logloss:0.29800	eval-logloss:0.32694
[9]	train-logloss:0.29106	eval-logloss:0.32114
params: {'gamma': 0.2, 'max_depth': 4, 'min_child_weight': 4.0}, logloss: 0.3211
[0]	train-logloss:0.40449	eval-logloss:0.42510
[1]	train-logloss:0.35106	eval-logloss:0.38295
[2]	train-logloss:0.31431	eval-logloss:0.35595
[3]	train-logloss:0.28192	eval-logloss:0.33448
[4]	train-logloss:0.25610	eval-logloss:0.32048
[5]	train-logloss:0.23467	eval-logloss:0.30797
[6]	train-logloss:0.21776	eval-logloss:0.29882
 20%|██        | 2/10 [00:01<00:04,  1.82trial/s, best loss: 0.31609811638674035]

Parameters: { "silent" } are not used.




[7]	train-logloss:0.19952	eval-logloss:0.28584
[8]	train-logloss:0.18474	eval-logloss:0.27859
[9]	train-logloss:0.17275	eval-logloss:0.27479
params: {'gamma': 0.1, 'max_depth': 8, 'min_child_weight': 2.0}, logloss: 0.2748
 30%|███       | 3/10 [00:01<00:04,  1.73trial/s, best loss: 0.2747885863063065] 

Parameters: { "silent" } are not used.




[0]	train-logloss:0.42683	eval-logloss:0.44172
[1]	train-logloss:0.38767	eval-logloss:0.40447
[2]	train-logloss:0.35966	eval-logloss:0.37957
[3]	train-logloss:0.33930	eval-logloss:0.36456
[4]	train-logloss:0.32109	eval-logloss:0.34844
[5]	train-logloss:0.30795	eval-logloss:0.33849
[6]	train-logloss:0.29357	eval-logloss:0.32787
[7]	train-logloss:0.28335	eval-logloss:0.31915
[8]	train-logloss:0.27233	eval-logloss:0.31164
[9]	train-logloss:0.26312	eval-logloss:0.30487
params: {'gamma': 0.2, 'max_depth': 5, 'min_child_weight': 4.0}, logloss: 0.3049
 40%|████      | 4/10 [00:03<00:06,  1.09s/trial, best loss: 0.2747885863063065]

Parameters: { "silent" } are not used.




[0]	train-logloss:0.40902	eval-logloss:0.42971
[1]	train-logloss:0.35834	eval-logloss:0.39027
[2]	train-logloss:0.32324	eval-logloss:0.36383
[3]	train-logloss:0.29153	eval-logloss:0.34601
[4]	train-logloss:0.26798	eval-logloss:0.32641
[5]	train-logloss:0.24755	eval-logloss:0.31388
[6]	train-logloss:0.23155	eval-logloss:0.30684
[7]	train-logloss:0.21771	eval-logloss:0.29866
[8]	train-logloss:0.20244	eval-logloss:0.29134
[9]	train-logloss:0.18816	eval-logloss:0.28562
params: {'gamma': 0.1, 'max_depth': 7, 'min_child_weight': 1.0}, logloss: 0.2856
[0]	train-logloss:0.41899	eval-logloss:0.43492
 50%|█████     | 5/10 [00:05<00:05,  1.17s/trial, best loss: 0.2747885863063065]

Parameters: { "silent" } are not used.




[1]	train-logloss:0.37547	eval-logloss:0.39706
[2]	train-logloss:0.34487	eval-logloss:0.37009
[3]	train-logloss:0.31964	eval-logloss:0.35019
[4]	train-logloss:0.30073	eval-logloss:0.33686
[5]	train-logloss:0.28647	eval-logloss:0.32836
[6]	train-logloss:0.26945	eval-logloss:0.31741
[7]	train-logloss:0.25797	eval-logloss:0.30956
[8]	train-logloss:0.24658	eval-logloss:0.30300
[9]	train-logloss:0.23559	eval-logloss:0.29580
params: {'gamma': 0.4, 'max_depth': 6, 'min_child_weight': 4.0}, logloss: 0.2958
[0]	train-logloss:0.40449	eval-logloss:0.42510
 60%|██████    | 6/10 [00:06<00:04,  1.11s/trial, best loss: 0.2747885863063065]

Parameters: { "silent" } are not used.




[1]	train-logloss:0.35109	eval-logloss:0.38300
[2]	train-logloss:0.31438	eval-logloss:0.35603
[3]	train-logloss:0.28202	eval-logloss:0.33487
[4]	train-logloss:0.25517	eval-logloss:0.31980
[5]	train-logloss:0.23427	eval-logloss:0.30751
[6]	train-logloss:0.21658	eval-logloss:0.29947
[7]	train-logloss:0.20056	eval-logloss:0.28993
[8]	train-logloss:0.18723	eval-logloss:0.28326
[9]	train-logloss:0.17612	eval-logloss:0.27735
params: {'gamma': 0.4, 'max_depth': 8, 'min_child_weight': 2.0}, logloss: 0.2774
 70%|███████   | 7/10 [00:07<00:03,  1.13s/trial, best loss: 0.2747885863063065]

Parameters: { "silent" } are not used.




[0]	train-logloss:0.41203	eval-logloss:0.43011
[1]	train-logloss:0.36413	eval-logloss:0.39215
[2]	train-logloss:0.32955	eval-logloss:0.36492
[3]	train-logloss:0.30291	eval-logloss:0.34306
[4]	train-logloss:0.27817	eval-logloss:0.32746
[5]	train-logloss:0.25976	eval-logloss:0.31245
[6]	train-logloss:0.24374	eval-logloss:0.30373
[7]	train-logloss:0.22893	eval-logloss:0.29598
[8]	train-logloss:0.21587	eval-logloss:0.28778
[9]	train-logloss:0.20511	eval-logloss:0.28047
params: {'gamma': 0.2, 'max_depth': 7, 'min_child_weight': 3.0}, logloss: 0.2805
 80%|████████  | 8/10 [00:08<00:02,  1.08s/trial, best loss: 0.2747885863063065]

Parameters: { "silent" } are not used.




[0]	train-logloss:0.41899	eval-logloss:0.43492
[1]	train-logloss:0.37547	eval-logloss:0.39706
[2]	train-logloss:0.34486	eval-logloss:0.37003
[3]	train-logloss:0.31961	eval-logloss:0.35014
[4]	train-logloss:0.30069	eval-logloss:0.33680
[5]	train-logloss:0.28643	eval-logloss:0.32830
[6]	train-logloss:0.26939	eval-logloss:0.31731
[7]	train-logloss:0.25789	eval-logloss:0.30927
[8]	train-logloss:0.24649	eval-logloss:0.30243
[9]	train-logloss:0.23548	eval-logloss:0.29511
params: {'gamma': 0.1, 'max_depth': 6, 'min_child_weight': 4.0}, logloss: 0.2951
 90%|█████████ | 9/10 [00:09<00:01,  1.27s/trial, best loss: 0.2747885863063065]

Parameters: { "silent" } are not used.




[0]	train-logloss:0.42629	eval-logloss:0.44193
[1]	train-logloss:0.38733	eval-logloss:0.40437
[2]	train-logloss:0.35859	eval-logloss:0.38211
[3]	train-logloss:0.33758	eval-logloss:0.36448
[4]	train-logloss:0.32182	eval-logloss:0.35370
[5]	train-logloss:0.30505	eval-logloss:0.34174
[6]	train-logloss:0.29448	eval-logloss:0.33184
[7]	train-logloss:0.28293	eval-logloss:0.32383
[8]	train-logloss:0.27048	eval-logloss:0.31236
[9]	train-logloss:0.26060	eval-logloss:0.30486
params: {'gamma': 0.1, 'max_depth': 5, 'min_child_weight': 3.0}, logloss: 0.3049
100%|██████████| 10/10 [00:11<00:00,  1.13s/trial, best loss: 0.2747885863063065]


{'gamma': np.float64(0.1),
 'max_depth': np.float64(8.0),
 'min_child_weight': np.float64(2.0)}

In [14]:
# 記録した情報からパラメータとスコアを出力する
# （trialsからも情報が取得できるが、パラメータの取得がやや行いづらいため）
history = sorted(history, key=lambda tpl: tpl[1])
best = history[0]
print(f'best params:{best[0]}, score:{best[1]:.4f}')

best params:{'gamma': 0.1, 'max_depth': 8, 'min_child_weight': 2.0}, score:0.2748
