In [1]:
import numpy as np
import pandas as pd

## データ準備

In [2]:
# Load the training CSV and separate the 'target' column from the features.
train = pd.read_csv('../../input/sample-data/train_preprocessed.csv')
train_y = train['target']
train_x = train.drop(columns=['target'])
test_x = pd.read_csv('../../input/sample-data/test_preprocessed.csv')

# Split the training data into a training part and a validation part:
# only the first fold of a shuffled 4-fold split is used.
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True, random_state=0)
tr_idx, va_idx = next(iter(kf.split(train_x)))
tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

## 学習・評価

In [3]:
import lightgbm as lgb
from sklearn.metrics import log_loss

In [4]:
# Wrap the features and the target in LightGBM's Dataset structure.
# NOTE(review): dtest is not referenced by the later cells visible here;
# the scoring cell passes the raw feature frame to model.predict().
dtrain = lgb.Dataset(data=tr_x, label=tr_y)
dvalid = lgb.Dataset(data=va_x, label=va_y)
dtest = lgb.Dataset(data=test_x)

In [5]:
# Hyperparameter settings: binary objective, log loss as the evaluation
# metric, fixed seed, and LightGBM's own logging set to level 0.
params = {
    'objective': 'binary',
    'verbose': 0,
    'seed': 0,
    'metrics': 'binary_logloss',
}
# Upper bound on the number of boosting rounds.
num_round = 500

パラメータ名は xgboost と若干異なる。対応は以下の表のとおり。

|xgboost|lightgbm|
|-|-|
|objective|objective|
|silent|verbose|
|random_state|seed|
|eval_metric|metrics|

In [6]:
# Run training.
# The validation data is passed to the model as well, so that the score on
# both sets can be monitored as boosting proceeds.

categorical_features = ['product', 'medical_info_b2', 'medical_info_b3']

# Early stopping is configured through a callback: the `early_stopping_rounds`
# keyword of lgb.train() was deprecated in LightGBM 3.3 and removed in 4.0,
# whereas the callback form works on old and new releases alike.
callbacks = [lgb.early_stopping(stopping_rounds=20)]
# LightGBM >= 3.3 only prints per-iteration scores when a logging callback is
# supplied; older releases have no log_evaluation and log every round by default.
if hasattr(lgb, 'log_evaluation'):
    callbacks.append(lgb.log_evaluation(period=1))

model = lgb.train(params, dtrain, num_boost_round=num_round,
                  categorical_feature=categorical_features,
                  valid_names=['train', 'valid'], valid_sets=[dtrain, dvalid],
                  callbacks=callbacks)

New categorical_feature is ['medical_info_b2', 'medical_info_b3', 'product']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	train's binary_logloss: 0.453692	valid's binary_logloss: 0.464874
Training until validation scores don't improve for 20 rounds.
[2]	train's binary_logloss: 0.429171	valid's binary_logloss: 0.442905
[3]	train's binary_logloss: 0.40972	valid's binary_logloss: 0.426457
[4]	train's binary_logloss: 0.392776	valid's binary_logloss: 0.412597
[5]	train's binary_logloss: 0.378315	valid's binary_logloss: 0.401005
[6]	train's binary_logloss: 0.365479	valid's binary_logloss: 0.389646
[7]	train's binary_logloss: 0.353488	valid's binary_logloss: 0.38001
[8]	train's binary_logloss: 0.34252	valid's binary_logloss: 0.370739
[9]	train's binary_logloss: 0.333025	valid's binary_logloss: 0.363143
[10]	train's binary_logloss: 0.324363	valid's binary_logloss: 0.355685
[11]	train's binary_logloss: 0.315949	valid's binary_logloss: 0.348907
[12]	train's binary_logloss: 0.308797	valid's binary_logloss: 0.342465
[13]	train's binary_logloss: 0.301935	valid's binary_logloss: 0.336544
[14]	train's binary_logloss

[130]	train's binary_logloss: 0.071083	valid's binary_logloss: 0.212148
[131]	train's binary_logloss: 0.070418	valid's binary_logloss: 0.211763
[132]	train's binary_logloss: 0.0698243	valid's binary_logloss: 0.212046
[133]	train's binary_logloss: 0.0690373	valid's binary_logloss: 0.211188
[134]	train's binary_logloss: 0.0684494	valid's binary_logloss: 0.2113
[135]	train's binary_logloss: 0.0678675	valid's binary_logloss: 0.21132
[136]	train's binary_logloss: 0.06719	valid's binary_logloss: 0.210845
[137]	train's binary_logloss: 0.0665842	valid's binary_logloss: 0.210517
[138]	train's binary_logloss: 0.0659919	valid's binary_logloss: 0.210353
[139]	train's binary_logloss: 0.0653879	valid's binary_logloss: 0.210293
[140]	train's binary_logloss: 0.064817	valid's binary_logloss: 0.210241
[141]	train's binary_logloss: 0.0642196	valid's binary_logloss: 0.210207
[142]	train's binary_logloss: 0.0635851	valid's binary_logloss: 0.210012
[143]	train's binary_logloss: 0.0631005	valid's binary_logl

In [7]:
# Check the score on the validation data, predicting with the iteration
# recorded as best on the trained model.
best_iter = model.best_iteration
va_pred = model.predict(va_x, num_iteration=best_iter)
score = log_loss(y_true=va_y, y_pred=va_pred)
print(f'logloss: {score:.4f}')

logloss: 0.2099


⇒ xgboostと比べてかなり速い。更にlog_lossも小さい。