In [1]:
import numpy as np
import pandas as pd

## データ準備

In [2]:
from sklearn.model_selection import KFold

# Load the preprocessed training data and separate the features from the target.
train = pd.read_csv('../../input/sample-data/train_preprocessed.csv')
train_y = train['target']
train_x = train.drop(columns=['target'])
test_x = pd.read_csv('../../input/sample-data/test_preprocessed.csv')

# Split the training data into a training part and a validation part:
# only the first fold of a shuffled 4-fold split is used as the hold-out set.
kf = KFold(n_splits=4, shuffle=True, random_state=0)
tr_idx, va_idx = next(iter(kf.split(train_x)))
tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

## 学習・評価

In [3]:
import xgboost as xgb
from sklearn.metrics import log_loss

In [4]:
# Convert the features and the target into xgboost's DMatrix data structure.
# The test set has no label, so only its features are wrapped.
dtrain = xgb.DMatrix(data=tr_x, label=tr_y)
dvalid = xgb.DMatrix(data=va_x, label=va_y)
dtest = xgb.DMatrix(data=test_x)

In [5]:
# Hyperparameter settings (binary classification with the default tree booster)
params = {
    'objective': 'binary:logistic',
    'silent': 1,
    'random_state': 0,
    'eval_metric': 'logloss',
    'booster': 'gbtree',
}
# Upper bound on the number of boosting rounds
num_round = 500

In [6]:
# Run training.
# The watchlist holds both the training data and the validation data so that
# the metric is reported for each of them on every round.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=20,
)

[0]	train-logloss:0.539452	eval-logloss:0.551324
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 20 rounds.
[1]	train-logloss:0.449389	eval-logloss:0.471079
[2]	train-logloss:0.391917	eval-logloss:0.423298
[3]	train-logloss:0.349746	eval-logloss:0.389744
[4]	train-logloss:0.316009	eval-logloss:0.36525
[5]	train-logloss:0.291775	eval-logloss:0.345957
[6]	train-logloss:0.272171	eval-logloss:0.33411
[7]	train-logloss:0.257363	eval-logloss:0.322367
[8]	train-logloss:0.242918	eval-logloss:0.312858
[9]	train-logloss:0.227988	eval-logloss:0.306092
[10]	train-logloss:0.219927	eval-logloss:0.30211
[11]	train-logloss:0.208176	eval-logloss:0.294525
[12]	train-logloss:0.199738	eval-logloss:0.293724
[13]	train-logloss:0.19197	eval-logloss:0.28883
[14]	train-logloss:0.183975	eval-logloss:0.285978
[15]	train-logloss:0.176698	eval-logloss:0.280726
[16]	train-logloss:0.17195	eval-logloss:0.278298
[17]	train-loglos

In [7]:
# Check the score on the validation data, predicting with the tree limit
# recorded on the model as best_ntree_limit.
va_pred = model.predict(
    dvalid,
    ntree_limit=model.best_ntree_limit,
)
score = log_loss(y_true=va_y, y_pred=va_pred)
print(f'logloss: {score:.4f}')

logloss: 0.2290


### booster
- ハイパーパラメータ booster で、ブースティングに用いるモデルの種類を変更できる
  - gbtree: デフォルト。決定木を用いる。
  - gblinear: 線形モデルを用いる。表現力は線形モデルと同等になるため、あまり使われない。
  - dart: ニューラルネットワーク (NN) における dropout のように、学習済みの木の一部をランダムに除外 (drop) しながら boosting を行う。

In [8]:
# Try booster = dart

# Convert the features and the target into xgboost's DMatrix data structure
dtrain = xgb.DMatrix(data=tr_x, label=tr_y)
dvalid = xgb.DMatrix(data=va_x, label=va_y)
dtest = xgb.DMatrix(data=test_x)

# Hyperparameter settings: only 'booster' differs from the gbtree run.
# NOTE(review): no 'rate_drop' is given; the xgboost docs list its default as
# 0.0, which would mean no trees are actually dropped -- confirm.
params = {
    'objective': 'binary:logistic',
    'silent': 1,
    'random_state': 0,
    'eval_metric': 'logloss',
    'booster': 'dart',
}
num_round = 500

# Run training
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=20,
)

[0]	train-logloss:0.539452	eval-logloss:0.551324
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 20 rounds.
[1]	train-logloss:0.449389	eval-logloss:0.471079
[2]	train-logloss:0.391917	eval-logloss:0.423298
[3]	train-logloss:0.349746	eval-logloss:0.389744
[4]	train-logloss:0.316009	eval-logloss:0.36525
[5]	train-logloss:0.291775	eval-logloss:0.345957
[6]	train-logloss:0.272171	eval-logloss:0.33411
[7]	train-logloss:0.257363	eval-logloss:0.322367
[8]	train-logloss:0.242918	eval-logloss:0.312858
[9]	train-logloss:0.227988	eval-logloss:0.306092
[10]	train-logloss:0.219927	eval-logloss:0.30211
[11]	train-logloss:0.208176	eval-logloss:0.294525
[12]	train-logloss:0.199738	eval-logloss:0.293724
[13]	train-logloss:0.19197	eval-logloss:0.28883
[14]	train-logloss:0.183975	eval-logloss:0.285978
[15]	train-logloss:0.176698	eval-logloss:0.280726
[16]	train-logloss:0.17195	eval-logloss:0.278298
[17]	train-loglos

In [9]:
# Check the score on the validation data, predicting with the tree limit
# recorded on the model as best_ntree_limit.
va_pred = model.predict(
    dvalid,
    ntree_limit=model.best_ntree_limit,
)
score = log_loss(y_true=va_y, y_pred=va_pred)
print(f'logloss: {score:.4f}')

logloss: 0.2290


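⇒ 今回の例では gbtree と dart で logloss が同じ値になった(学習ログも各ラウンドで一致している)。dart でドロップする木の割合を決める rate_drop を指定しておらず、デフォルト値の 0.0 では木が一切ドロップされないため、gbtree と同じモデルが学習されたと考えられる。dart の効果を確認するには rate_drop に 0 より大きい値を設定する必要がある。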