In [1]:
import numpy as np
import pandas as pd

### 学習データ・評価データの準備

In [2]:
# train_x holds the training features and train_y the target variable
# (the original note also mentions test_x for test data; it is not created in this cell).
# They are kept as a pandas DataFrame / Series (NumPy arrays are sometimes used instead).

train = pd.read_csv('../../input/sample-data/train_preprocessed.csv')
train_y = train['target']
train_x = train.drop(columns=['target'])

# Hold out one fold of a shuffled 4-fold split as validation data.
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True, random_state=71)
# Only the first fold is used, so pull it straight from the split generator.
tr_idx, va_idx = next(kf.split(train_x))
tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

## 評価関数定義

In [3]:
import xgboost as xgb
from sklearn.metrics import log_loss

In [4]:
def score(opt_params):
    """Train an XGBoost model with the given overrides and return its validation log loss.

    Parameters
    ----------
    opt_params : dict
        Hyperparameter overrides; they are merged on top of the baseline
        parameters defined below and take precedence over them.

    Returns
    -------
    dict
        ``{'loss': <validation log loss>, 'status': 'ok'}``.

    Notes
    -----
    Reads the module-level ``tr_x``, ``tr_y``, ``va_x`` and ``va_y`` and appends
    ``(params, score)`` to the module-level ``history`` list, so all of these
    must exist before the function is called.
    """
    # Baseline configuration; anything present in opt_params replaces it.
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eta': 0.1,
        'gamma': 0.0,
        'alpha': 0.0,
        'lambda': 1.0,
        'min_child_weight': 1,
        'max_depth': 5,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 71,
    }
    params.update(opt_params)

    # Fit for a fixed 10 boosting rounds, monitoring both the train and validation sets.
    train_matrix = xgb.DMatrix(tr_x, label=tr_y)
    valid_matrix = xgb.DMatrix(va_x, label=va_y)
    booster = xgb.train(
        params,
        train_matrix,
        num_boost_round=10,
        evals=[(train_matrix, 'train'), (valid_matrix, 'eval')],
    )

    # Evaluate on the held-out fold.
    score = log_loss(va_y, booster.predict(valid_matrix))
    print(f'params: {params}, logloss: {score:.4f}')

    # Keep the full parameter set together with its score for later inspection.
    history.append((params, score))

    return {'loss': score, 'status': 'ok'}

## 探索実行

In [5]:
from hyperopt import fmin, tpe, hp, Trials
from hyperopt.pyll.base import scope

In [6]:
# Search space handed to hyperopt. The first argument of every hp.* call is the
# label hyperopt records in `trials` and in the dict returned by fmin, so each
# label is kept identical to the XGBoost parameter key it tunes.
param_space = {
    'min_child_weight': hp.loguniform('min_child_weight', np.log(0.1), np.log(10)),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 9, 1)),
    # 'subsample' is currently excluded from the search (score() falls back to its baseline value):
#     'subsample': hp.quniform('subsample', 0.6, 0.95, 0.05),
    # Label fixed: this entry used to be registered as 'subsample', which mislabelled
    # the sampled value and would collide with the line above if it were re-enabled.
    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 0.95, 0.05),
    'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)),
    'alpha' : hp.loguniform('alpha', np.log(1e-8), np.log(1.0)),
    'lambda' : hp.loguniform('lambda', np.log(1e-6), np.log(10.0)),
}

# Run the hyperparameter search with hyperopt (TPE).
# `history` must be (re)initialised here because score() appends to it on every trial.
# NOTE(review): fmin is not given an `rstate`, so the sampled parameters differ between runs.
max_evals = 10
trials = Trials()
history = []
fmin(score, param_space, algo=tpe.suggest, trials=trials, max_evals=max_evals)

# Pick the best parameters and score from the recorded history
# (the same information is available from `trials`, but the parameters are harder to extract there).
# Lower log loss is better, so the first element after an ascending sort is the best trial.
history = sorted(history, key=lambda tpl: tpl[1])
best = history[0]

[08:09:24] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=5
[0]	train-error:0.168	eval-error:0.1804
[08:09:24] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[1]	train-error:0.1368	eval-error:0.1544
[08:09:24] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[2]	train-error:0.1344	eval-error:0.1524
[08:09:24] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[3]	train-error:0.128533	eval-error:0.1488
[08:09:24] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[4]	train-error:0.1248	eval-error:0.1428
[08:09:24] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[5]	train-error:0.125333	eval-error:0.1424
[08:09:24] /w

[08:09:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 68 extra nodes, 0 pruned nodes, max_depth=6
[3]	train-error:0.125467	eval-error:0.148
[08:09:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 70 extra nodes, 0 pruned nodes, max_depth=6
[4]	train-error:0.124133	eval-error:0.1456
[08:09:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 78 extra nodes, 0 pruned nodes, max_depth=6
[5]	train-error:0.124667	eval-error:0.1444
[08:09:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 70 extra nodes, 0 pruned nodes, max_depth=6
[6]	train-error:0.1232	eval-error:0.1444
[08:09:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 76 extra nodes, 0 pruned nodes, max_depth=6
[7]	train-error:0.1228	eval-error:0.144
[08:09:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 68 extra nodes, 0 pruned nodes, max_depth=6
[8]	train-error:0.119867	eval-error:0.1416
[08:09:25]

[08:09:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[6]	train-error:0.144	eval-error:0.1592
[08:09:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[7]	train-error:0.1444	eval-error:0.1608
[08:09:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[8]	train-error:0.1408	eval-error:0.1588
[08:09:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=4
[9]	train-error:0.140267	eval-error:0.1576
params: {'booster': 'gbtree', 'objective': 'binary:logistic', 'eta': 0.1, 'gamma': 0.1146908270012884, 'alpha': 1.4696704250916958e-08, 'lambda': 1.8041283920976696, 'min_child_weight': 0.31113966098945056, 'max_depth': 4, 'subsample': 0.8, 'colsample_bytree': 0.9, 'random_state': 71}, logloss: 0.4384
[08:09:25] /workspace/src/tree/updater_p

In [7]:
# `best` is the (params, score) tuple with the lowest validation log loss:
# best[0] is the full parameter dict, best[1] the log loss.
print(f'best params:{best[0]}, score:{best[1]:.4f}')

best params:{'booster': 'gbtree', 'objective': 'binary:logistic', 'eta': 0.1, 'gamma': 0.0412844360022436, 'alpha': 0.18769662886783933, 'lambda': 4.951156091154497e-06, 'min_child_weight': 4.353762802618278, 'max_depth': 8, 'subsample': 0.8, 'colsample_bytree': 0.9, 'random_state': 71}, score:0.4012


=> `score` 関数が `tr_x`・`tr_y`・`va_x`・`va_y` や `history` をグローバル変数として参照している点には気持ち悪さを感じるが、hyperopt によるパラメータ探索の基本の型は上記のとおり。