In [1]:
import numpy as np
import pandas as pd

### データ読み込み

In [2]:
# Data for the tree-based model: read both files first, then split off the target column.
train = pd.read_csv('../../input/sample-data/train_preprocessed.csv')
test_x = pd.read_csv('../../input/sample-data/test_preprocessed.csv')
train_y = train['target']
train_x = train.drop(columns=['target'])

# Data for the neural net (the files with the `_onehot` suffix), split the same way.
train_nn = pd.read_csv('../../input/sample-data/train_preprocessed_onehot.csv')
test_x_nn = pd.read_csv('../../input/sample-data/test_preprocessed_onehot.csv')
train_y_nn = train_nn['target']
train_x_nn = train_nn.drop(columns=['target'])

## スタッキング

In [3]:
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from models import Model1Xgb, Model1NN, Model2Linear

Using TensorFlow backend.


In [4]:
# Out-of-fold prediction helper used for stacking.
# The training data is split into `n_splits` folds (4 by default). For every fold, a model
# fit on the remaining folds predicts the held-out fold and also the test data. Each
# training row therefore receives exactly one prediction, while the test data is predicted
# once per fold, so those test predictions are averaged.

def predict_cv(model, train_x, train_y, test_x, n_splits=4, random_state=71):
    """Return out-of-fold predictions for the training data and averaged test predictions.

    Parameters
    ----------
    model : object
        Must provide ``fit(tr_x, tr_y, va_x, va_y)`` and ``predict(x)``. The same
        instance is re-fit on every fold.
    train_x, train_y
        Training features and target. Both are sliced positionally with ``.iloc``,
        so they must support it (e.g. pandas DataFrame / Series).
    test_x
        Test features, passed unchanged to ``model.predict`` on every fold.
    n_splits : int, default 4
        Number of folds handed to ``KFold``.
    random_state : int, default 71
        Seed handed to ``KFold`` (shuffling is always enabled).

    Returns
    -------
    pred_train : numpy.ndarray
        Held-out predictions, reordered to follow the row order of ``train_x``.
    preds_test : numpy.ndarray
        Element-wise mean of the per-fold test predictions.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_preds = []
    fold_preds_test = []
    fold_va_idxes = []

    # Fit and predict fold by fold, remembering which rows each prediction belongs to.
    for tr_idx, va_idx in kf.split(train_x):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        model.fit(tr_x, tr_y, va_x, va_y)

        fold_preds.append(model.predict(va_x))
        fold_preds_test.append(model.predict(test_x))
        fold_va_idxes.append(va_idx)

    # The held-out predictions are stored fold by fold; sorting by the original row
    # positions puts them back in the order of train_x.
    va_positions = np.concatenate(fold_va_idxes)
    stacked_preds = np.concatenate(fold_preds, axis=0)
    pred_train = stacked_preds[np.argsort(va_positions)]

    # One test prediction per fold -> average them.
    preds_test = np.mean(fold_preds_test, axis=0)

    return pred_train, preds_test

### 1層目

In [5]:
# First-layer model A, trained on the tree-model feature set.
model_1a = Model1Xgb()
pred_train_1a, pred_test_1a = predict_cv(
    model=model_1a, train_x=train_x, train_y=train_y, test_x=test_x
)

# First-layer model B, trained on the neural-net feature set.
# NOTE(review): the target passed here is train_y, not train_y_nn; this presumably relies on
# both CSVs having identical row order and targets -- confirm.
model_1b = Model1NN()
pred_train_1b, pred_test_1b = predict_cv(
    model=model_1b, train_x=train_x_nn, train_y=train_y, test_x=test_x_nn
)

[0]	train-logloss:0.540879	eval-logloss:0.550034
[1]	train-logloss:0.452692	eval-logloss:0.47182
[2]	train-logloss:0.394818	eval-logloss:0.42026
[3]	train-logloss:0.351976	eval-logloss:0.385203
[4]	train-logloss:0.320213	eval-logloss:0.361498
[5]	train-logloss:0.296733	eval-logloss:0.344634
[6]	train-logloss:0.276105	eval-logloss:0.329003
[7]	train-logloss:0.258858	eval-logloss:0.316697
[8]	train-logloss:0.243628	eval-logloss:0.30775
[9]	train-logloss:0.231527	eval-logloss:0.300925
[0]	train-logloss:0.538915	eval-logloss:0.548639
[1]	train-logloss:0.452188	eval-logloss:0.471485
[2]	train-logloss:0.395742	eval-logloss:0.419976
[3]	train-logloss:0.354763	eval-logloss:0.384132
[4]	train-logloss:0.322183	eval-logloss:0.356264
[5]	train-logloss:0.299451	eval-logloss:0.339098
[6]	train-logloss:0.277833	eval-logloss:0.325516
[7]	train-logloss:0.263263	eval-logloss:0.315733
[8]	train-logloss:0.247804	eval-logloss:0.30592
[9]	train-logloss:0.233693	eval-logloss:0.295955
[0]	train-logloss:0.5433

In [6]:
# Evaluate the first-layer models on their out-of-fold predictions.
# Predictions are clipped to [1e-7, 1 - 1e-7] explicitly rather than through log_loss's
# `eps` argument, which was deprecated in scikit-learn 1.3 and removed in 1.5.
print(f'logloss: {log_loss(train_y, np.clip(pred_train_1a, 1e-7, 1 - 1e-7)):.4f}')
print(f'logloss: {log_loss(train_y, np.clip(pred_train_1b, 1e-7, 1 - 1e-7)):.4f}')

logloss: 0.2967
logloss: 0.2819


In [7]:
# Build the second-layer inputs: one column per first-layer model's predictions.
train_x_2 = pd.DataFrame({
    'pred_1a': pred_train_1a,
    'pred_1b': pred_train_1b,
})
test_x_2 = pd.DataFrame({
    'pred_1a': pred_test_1a,
    'pred_1b': pred_test_1b,
})

### 2層目
- 1層目の予測値だけを使用して学習を行う

In [8]:
# Second-layer model, trained only on the first-layer predictions.
model_2 = Model2Linear()
pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, train_y, test_x_2)
# Clip explicitly instead of passing `eps` to log_loss: that argument was deprecated in
# scikit-learn 1.3 and removed in 1.5.
print(f'logloss: {log_loss(train_y, np.clip(pred_train_2, 1e-7, 1 - 1e-7)):.4f}')

logloss: 0.2508


⇒ 確かに、2層目（スタッキング）のlog_loss（0.2508）は、1層目の各モデル単体のlog_loss（0.2967、0.2819）よりも小さくなった。