# 超初心者向けにスタッキングのやり方を実装したNotebookです。
私自身が最近Kaggleを始めたばかりなので、コードをきれいに書くことができません。玄人の方からしたら、いかにも初心者な書き方をしていると思います。しかし、逆にその分、初心者の方には読みやすいコードになっていると思います。

# This Notebook shows how to implement stacking, written for absolute beginners.
I only recently started on Kaggle, so my code is not very polished, and to an experienced user it will look very beginner-like. On the other hand, I think that makes it easy for beginners to read.

### If you find it helpful, I would appreciate it if you could upvote it...  
### 参考になりましたら、Upvoteしていただけると幸いです、、、

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Read the competition's training and test tables.
train = pd.read_csv("../input/tabular-playground-series-jan-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-jan-2021/test.csv")

# Remove the training row labelled 170514.
# NOTE(review): presumably an outlier -- the reason is not recorded here.
train.drop(index=[170514], inplace=True)

# NumPy arrays for the models: target, then features without id/target columns.
y = np.array(train["target"])
X = np.array(train.drop(columns=["id", "target"]))
X_test = np.array(test.drop(columns="id"))

# Modeling
2層のスタッキングを行う。 We perform two-layer stacking.

In [None]:
import random
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse

def RMSE(pred, true):
    """Return the square root of the mean squared error of ``pred`` vs ``true``."""
    squared_error = mse(pred, true)
    return np.sqrt(squared_error)

# Seed reused by the model parameter dictionaries and by the final-layer KFold.
SEED = 2039

In [None]:
# このセルでスタッキングに使う関数を定義している。
# Define the function to be used for stacking in this cell.

# stack_xgb, stack_lgbは次の層に入力する学習データとテストデータを返す関数。
# stack_xgb and stack_lgb are functions that return the training and test data to be input to the next layer.
def stack_xgb(X, X_test, y, params, *, n_splits=10, seed=None):
    """Build next-layer features from one XGBoost configuration.

    Runs a shuffled K-fold split over ``X``. For every fold a booster is
    trained on the training part with the held-out part as the early-stopping
    evaluation set; the booster then predicts the held-out rows (out-of-fold
    predictions) and the whole test set.

    Args:
        X: Training features as a NumPy-style 2-D array (indexed ``X[rows, :]``).
        X_test: Test features, passed to ``xgb.DMatrix``.
        y: Training targets, indexable with row-number arrays like ``X``.
        params: Parameter dictionary forwarded to ``xgb.train``.
        n_splits: Number of folds (keyword-only, default 10).
        seed: ``random_state`` for the fold shuffle (keyword-only). When
            ``None`` (default) a value is drawn with ``random.randint(0, 100)``
            on each call, so every model gets its own split; pass an integer
            to make the split reproducible.

    Returns:
        ``(next_train, next_test)`` as two ``pd.Series``: the out-of-fold
        predictions restored to the row order of ``X``, and the test
        predictions averaged over all folds.
    """
    # Kept in a local name so the module-level SEED constant is not shadowed.
    fold_seed = random.randint(0, 100) if seed is None else seed
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=fold_seed)

    oof_parts = []   # per-fold predictions for the held-out rows
    oof_rows = []    # row numbers of those held-out rows, fold by fold
    test_parts = []  # per-fold predictions for the test set

    # The test matrix does not depend on the fold, so build it once.
    d_test = xgb.DMatrix(X_test)

    for tr_id, vl_id in kf.split(X, y):
        d_train = xgb.DMatrix(X[tr_id, :], label=y[tr_id])
        d_val = xgb.DMatrix(X[vl_id, :], label=y[vl_id])

        model = xgb.train(params=params,
                          dtrain=d_train,
                          num_boost_round=10000,
                          early_stopping_rounds=100,
                          verbose_eval=0,
                          evals=[(d_val, "val")])

        # NOTE(review): ntree_limit / best_ntree_limit belong to the older
        # XGBoost API; newer releases use iteration_range -- confirm against
        # the installed version before upgrading.
        best_limit = model.best_ntree_limit

        # Held-out predictions become the next layer's training feature.
        oof_parts.append(model.predict(d_val, ntree_limit=best_limit))
        oof_rows.append(vl_id)

        # Test predictions become the next layer's test feature.
        test_parts.append(model.predict(d_test, ntree_limit=best_limit))

    # The shuffled folds visit rows out of order; sorting by row number puts
    # the out-of-fold predictions back in the original row order.
    oof_rows = np.concatenate(oof_rows)
    next_train = np.concatenate(oof_parts, axis=0)[np.argsort(oof_rows)]

    # One test prediction per fold was collected; average them into one.
    next_test = np.mean(test_parts, axis=0)

    return pd.Series(next_train), pd.Series(next_test)


# 上記のstack_xgbと同様
# Same as stack_xgb above
def stack_lgb(X, X_test, y, params, *, n_splits=10, seed=None):
    """Build next-layer features from one LightGBM configuration.

    Runs a shuffled K-fold split over ``X``. For every fold a booster is
    trained on the training part with ``early_stopping_rounds=100`` and both
    the training and held-out datasets passed as ``valid_sets``; the booster
    then predicts the held-out rows (out-of-fold predictions) and the whole
    test set at its best iteration.

    Args:
        X: Training features as a NumPy-style 2-D array (indexed ``X[rows, :]``).
        X_test: Test features, passed directly to ``model.predict``.
        y: Training targets, indexable with row-number arrays like ``X``.
        params: Parameter dictionary forwarded to ``lgb.train``.
        n_splits: Number of folds (keyword-only, default 10).
        seed: ``random_state`` for the fold shuffle (keyword-only). When
            ``None`` (default) a value is drawn with ``random.randint(0, 100)``
            on each call, so every model gets its own split; pass an integer
            to make the split reproducible.

    Returns:
        ``(next_train, next_test)`` as two ``pd.Series``: the out-of-fold
        predictions restored to the row order of ``X``, and the test
        predictions averaged over all folds.
    """
    # Kept in a local name so the module-level SEED constant is not shadowed.
    fold_seed = random.randint(0, 100) if seed is None else seed
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=fold_seed)

    oof_parts = []   # per-fold predictions for the held-out rows
    oof_rows = []    # row numbers of those held-out rows, fold by fold
    test_parts = []  # per-fold predictions for the test set

    for tr_id, vl_id in kf.split(X, y):
        lgb_train = lgb.Dataset(X[tr_id, :], label=y[tr_id])
        lgb_val = lgb.Dataset(X[vl_id, :], label=y[vl_id], reference=lgb_train)

        # NOTE(review): early_stopping_rounds / verbose_eval are arguments of
        # the older lgb.train API; newer releases use callbacks -- confirm
        # against the installed version before upgrading.
        model = lgb.train(params=params,
                          train_set=lgb_train,
                          valid_sets=(lgb_train, lgb_val),
                          num_boost_round=10000,
                          early_stopping_rounds=100,
                          verbose_eval=0)

        best_iter = model.best_iteration

        # Held-out predictions become the next layer's training feature.
        oof_parts.append(model.predict(X[vl_id, :], num_iteration=best_iter))
        # Test predictions become the next layer's test feature.
        test_parts.append(model.predict(X_test, num_iteration=best_iter))
        oof_rows.append(vl_id)

    # The shuffled folds visit rows out of order; sorting by row number puts
    # the out-of-fold predictions back in the original row order.
    oof_rows = np.concatenate(oof_rows)
    next_train = np.concatenate(oof_parts, axis=0)[np.argsort(oof_rows)]

    # One test prediction per fold was collected; average them into one.
    next_test = np.mean(test_parts, axis=0)

    return pd.Series(next_train), pd.Series(next_test)

# Layer 1 / 1層目
The input for the first layer is the original training data X, y, X_test.  
1層目の入力はもともとの学習データのX, y, X_test。

In [None]:
# Entries that open and close every first-layer XGBoost dictionary. They are
# spliced around the per-model settings so each result keeps the same keys,
# values and key order as a hand-written literal.
_xgb_l1_head = {
    "booster": "gbtree",
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
}
_xgb_l1_tail = {
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "seed": SEED,
}

params_xgb11 = {
    **_xgb_l1_head,
    "max_depth": 15,
    "eta": 0.02,
    "gamma": 0.005346636874993822,
    "colsample_bytree": 0.5,
    "subsample": 0.7,
    "min_child_weight": 257,
    "alpha": 0.01563,
    "lambda": 0.003,
    **_xgb_l1_tail,
}

params_xgb12 = {
    **_xgb_l1_head,
    "max_depth": 20,
    "eta": 0.02,
    "colsample_bytree": 1.0,
    "subsample": 0.5,
    "min_child_weight": 100,
    **_xgb_l1_tail,
}

params_xgb13 = {
    **_xgb_l1_head,
    "max_depth": 10,
    "eta": 0.02,
    "colsample_bytree": 0.5,
    "subsample": 0.8,
    "min_child_weight": 20,
    **_xgb_l1_tail,
}

params_xgb14 = {
    **_xgb_l1_head,
    "max_depth": 6,
    "eta": 0.01,
    "colsample_bytree": 1.0,
    "subsample": 0.8,
    **_xgb_l1_tail,
}

# First-layer LightGBM configuration #1 (keyword form; same keys, values and
# key order as the equivalent dict literal).
params_lgb11 = dict(
    task="train",
    boosting_type="gbdt",
    objective="regression",
    metric="rmse",
    learning_rate=0.02,
    num_leaves=256,
    bagging_fraction=0.8206341150202605,
    feature_fraction=0.5,
    min_data_in_leaf=100,
    lambda_l1=1.074622455507616e-05,
    lambda_l2=2.0521330798729704e-06,
    min_data_per_group=5,
    max_depth=-1,
    subsample_for_bin=200000,
    cat_smooth=1.0,
    importance_type="split",
    min_sum_hessian_in_leaf=0.001,
    bagging_freq=6,
    min_gain_to_split=0.0,
    random_state=SEED,
)

# First-layer LightGBM configuration #2: same settings as params_lgb11 except
# for a depth limit of 15.
params_lgb12 = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.02,
    "num_leaves": 256,
    # "max_depth" must appear exactly once. A second `"max_depth": -1` entry
    # later in the literal would override this value (the last duplicate key
    # wins) and make this configuration identical to params_lgb11.
    "max_depth": 15,
    "bagging_fraction": 0.8206341150202605,
    "feature_fraction": 0.5,
    "min_data_in_leaf": 100,
    "lambda_l1": 1.074622455507616e-05,
    "lambda_l2": 2.0521330798729704e-06,
    "min_data_per_group": 5,
    "subsample_for_bin": 200000,
    "cat_smooth": 1.0,
    "importance_type": "split",
    "min_sum_hessian_in_leaf": 0.001,
    "bagging_freq": 6,
    "min_gain_to_split": 0.0,
    "random_state": SEED
}

# Leading and trailing entries shared by the two lighter first-layer LightGBM
# configurations; splicing them in keeps keys, values and key order unchanged.
_lgb_l1_light_head = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.02,
}
_lgb_l1_light_tail = {"random_state": SEED}

params_lgb13 = {
    **_lgb_l1_light_head,
    "num_leaves": 50,
    "bagging_fraction": 0.5,
    "feature_fraction": 0.8,
    **_lgb_l1_light_tail,
}

params_lgb14 = {
    **_lgb_l1_light_head,
    "num_leaves": 100,
    "max_depth": 7,
    "bagging_fraction": 0.8,
    "feature_fraction": 1.0,
    **_lgb_l1_light_tail,
}

In [None]:
# First layer: every model is fed the raw features. The calls run in the
# listed order -- the four XGBoost configurations, then the four LightGBM ones.
(
    (next_train_11, next_test_11),
    (next_train_12, next_test_12),
    (next_train_13, next_test_13),
    (next_train_14, next_test_14),
) = [
    stack_xgb(X, X_test, y, layer1_params)
    for layer1_params in (params_xgb11, params_xgb12, params_xgb13, params_xgb14)
]

(
    (next_train_15, next_test_15),
    (next_train_16, next_test_16),
    (next_train_17, next_test_17),
    (next_train_18, next_test_18),
) = [
    stack_lgb(X, X_test, y, layer1_params)
    for layer1_params in (params_lgb11, params_lgb12, params_lgb13, params_lgb14)
]

In [None]:
# One column per first-layer model, in model order.
_layer1_train_columns = [
    next_train_11, next_train_12, next_train_13, next_train_14,
    next_train_15, next_train_16, next_train_17, next_train_18,
]
_layer1_test_columns = [
    next_test_11, next_test_12, next_test_13, next_test_14,
    next_test_15, next_test_16, next_test_17, next_test_18,
]

# Join the columns side by side and convert to NumPy arrays for the next layer.
next_train_1 = np.array(pd.concat(_layer1_train_columns, axis=1))
next_test_1 = np.array(pd.concat(_layer1_test_columns, axis=1))

# Layer 2 / 2層目
The input of the second layer is the output of the first layer: next_train_1 and next_test_1.  
2層目の入力は1層目の出力であるnext_train_1, next_test_1たち。

In [None]:
# Entries that open and close every second-layer XGBoost dictionary. They are
# spliced around the per-model settings so each result keeps the same keys,
# values and key order as a hand-written literal.
_xgb_l2_head = {
    "booster": "gbtree",
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
}
_xgb_l2_tail = {
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "seed": SEED,
}

params_xgb21 = {
    **_xgb_l2_head,
    "max_depth": 15,
    "eta": 0.01,
    "gamma": 0.004,
    "colsample_bytree": 0.9,
    "subsample": 0.7,
    "min_child_weight": 200,
    "alpha": 0.005,
    "lambda": 0.001,
    **_xgb_l2_tail,
}

params_xgb22 = {
    **_xgb_l2_head,
    "max_depth": 8,
    "eta": 0.01,
    "gamma": 0.0006,
    "colsample_bytree": 0.9,
    "subsample": 0.5,
    "min_child_weight": 50,
    "alpha": 0.0004,
    "lambda": 0.00002,
    **_xgb_l2_tail,
}

params_xgb23 = {
    **_xgb_l2_head,
    "max_depth": 6,
    "eta": 0.01,
    "colsample_bytree": 1.0,
    "subsample": 0.95,
    "min_child_weight": 10,
    **_xgb_l2_tail,
}

# Leading and trailing entries shared by both second-layer LightGBM
# configurations; splicing them in keeps keys, values and key order unchanged.
_lgb_l2_head = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "rmse",
}
_lgb_l2_tail = {"random_state": SEED}

params_lgb21 = {
    **_lgb_l2_head,
    "learning_rate": 0.02,
    "num_leaves": 256,
    "max_depth": 15,
    "bagging_fraction": 0.8,
    "feature_fraction": 0.8,
    "min_data_in_leaf": 10,
    "lambda_l1": 0.00003,
    "lambda_l2": 0.000035,
    **_lgb_l2_tail,
}

params_lgb22 = {
    **_lgb_l2_head,
    "learning_rate": 0.01,
    "num_leaves": 50,
    "max_depth": 20,
    "bagging_fraction": 0.8,
    "feature_fraction": 1.0,
    **_lgb_l2_tail,
}

In [None]:
# Second layer: every model is fed the first layer's outputs. The calls run in
# the listed order -- three XGBoost configurations, then two LightGBM ones.
(
    (next_train_21, next_test_21),
    (next_train_22, next_test_22),
    (next_train_23, next_test_23),
) = [
    stack_xgb(next_train_1, next_test_1, y, layer2_params)
    for layer2_params in (params_xgb21, params_xgb22, params_xgb23)
]

(
    (next_train_24, next_test_24),
    (next_train_25, next_test_25),
) = [
    stack_lgb(next_train_1, next_test_1, y, layer2_params)
    for layer2_params in (params_lgb21, params_lgb22)
]

In [None]:
# One column per second-layer model, in model order.
_layer2_train_columns = [
    next_train_21, next_train_22, next_train_23, next_train_24, next_train_25,
]
_layer2_test_columns = [
    next_test_21, next_test_22, next_test_23, next_test_24, next_test_25,
]

# Join the columns side by side and convert to NumPy arrays for the last layer.
next_train_2 = np.array(pd.concat(_layer2_train_columns, axis=1))
next_test_2 = np.array(pd.concat(_layer2_test_columns, axis=1))

# Last Layer / 最後の層
The inputs of the last layer are the outputs of the second layer, next_train_2 and next_test_2. In the last layer, we use KFold to predict as usual.  
最後の層の入力は2層目の出力であるnext_train_2, next_test_2たち。最後の層ではKFoldを使って普通に予測する。

In [None]:
# Final-layer XGBoost parameters, assembled from a setup part and a tuned part
# so that keys, values and key order match a single flat literal.
_final_xgb_setup = {
    "booster": "gbtree",
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
}
_final_xgb_tuned = {
    "eta": 0.01,
    "max_depth": 4,
    "gamma": 0.09192773718666183,
    "colsample_bytree": 0.6961218462820887,
    "subsample": 0.9987425774067743,
    "min_child_weight": 0.1137086502328514,
    "alpha": 0.9245596765233609,
    "lambda": 6.230027264411933e-06,
}
params_xgb = {**_final_xgb_setup, **_final_xgb_tuned, "seed": SEED}

# Final-layer LightGBM parameters (keyword form; same keys, values and order).
params_lgb = dict(
    task="train",
    boosting_type="gbdt",
    objective="regression",
    metric="rmse",
    learning_rate=0.01,
    num_leaves=164,
    max_depth=4,
    bagging_fraction=0.8498649731960014,
    feature_fraction=0.5016568140931941,
    min_data_in_leaf=200,
    lambda_l1=0.00023282363705273031,
    lambda_l2=4.693349803533469e-08,
    random_state=SEED,
)

In [None]:
# Shuffled 10-fold splitter with a fixed seed, used by the final-layer loops.
kf = KFold(n_splits=10, shuffle=True, random_state=SEED)

In [None]:
# Final-layer XGBoost: train one booster per fold on the second layer's
# outputs and collect each booster's prediction for the test set.
d_test = xgb.DMatrix(next_test_2)
_xgb_fold_preds = []

for tr_id, vl_id in kf.split(next_train_2, y):

    d_train = xgb.DMatrix(next_train_2[tr_id, :], label=y[tr_id])
    d_val = xgb.DMatrix(next_train_2[vl_id, :], label=y[vl_id])

    # The held-out fold serves only as the early-stopping evaluation set.
    model = xgb.train(params=params_xgb,
                      dtrain=d_train,
                      num_boost_round=10000,
                      early_stopping_rounds=100,
                      verbose_eval=0,
                      evals=[(d_val, "val")])

    # NOTE(review): ntree_limit / best_ntree_limit belong to the older XGBoost
    # API; newer releases use iteration_range -- confirm the installed version.
    fold_pred = model.predict(d_test, ntree_limit=model.best_ntree_limit)
    _xgb_fold_preds.append(pd.Series(fold_pred))

# Build the frame once (one column per fold, labelled 0..n-1) instead of
# re-concatenating a growing DataFrame on every iteration, which copies the
# data each time and labels every column 0.
pred_xgb = pd.concat(_xgb_fold_preds, axis=1)

In [None]:
# Show the first rows of the per-fold XGBoost test predictions.
pred_xgb.head()

In [None]:
# Final-layer LightGBM: train one booster per fold on the second layer's
# outputs and collect each booster's prediction for the test set.
_lgb_fold_preds = []

for tr_id, vl_id in kf.split(next_train_2, y):

    lgb_train = lgb.Dataset(next_train_2[tr_id, :], label=y[tr_id])
    lgb_val = lgb.Dataset(next_train_2[vl_id, :], label=y[vl_id],
                          reference=lgb_train)

    # NOTE(review): early_stopping_rounds / verbose_eval are arguments of the
    # older lgb.train API; newer releases use callbacks -- confirm the
    # installed version.
    model = lgb.train(params=params_lgb,
                      train_set=lgb_train,
                      valid_sets=(lgb_train, lgb_val),
                      num_boost_round=10000,
                      early_stopping_rounds=100,
                      verbose_eval=0)

    fold_pred = model.predict(next_test_2, num_iteration=model.best_iteration)
    _lgb_fold_preds.append(pd.Series(fold_pred))

# Build the frame once (one column per fold, labelled 0..n-1) instead of
# re-concatenating a growing DataFrame on every iteration, which copies the
# data each time and labels every column 0.
pred_lgb = pd.concat(_lgb_fold_preds, axis=1)

In [None]:
# Show the first rows of the per-fold LightGBM test predictions.
pred_lgb.head()

# Submission

In [None]:
# Blend: row-wise mean over every fold column of both final-layer models.
pred = pd.concat([pred_xgb, pred_lgb], axis=1).mean(axis=1)

In [None]:
sample_sub = pd.read_csv("../input/tabular-playground-series-jan-2021/sample_submission.csv")

# New frame based on the sample file, with "target" set to the blended
# prediction (values are matched by index label); sample_sub stays untouched.
sub = sample_sub.assign(target=pred)
sub

In [None]:
# Write the submission file, leaving out the DataFrame index column.
sub.to_csv("submission_stackingGBDTs.csv", index=False)