In [1]:
# https://www.kaggle.com/mainya/stacking-xgboost-lightgbm-catboost
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold

import xgboost as xgb
import lightgbm as lgb

import gc
from sklearn.metrics import r2_score

In [2]:
# Read the dataset and discard the 'Unnamed: 0' column (presumably a saved row index — verify).
df = pd.read_csv("./Data8.csv", engine='python').drop(columns=['Unnamed: 0'])
input_dim = df.shape[1] - 1  # number of feature columns: everything except the single target column
df.head()

Unnamed: 0,TrainCharacteristicIC,TrainCharacteristicLM,TrainCharacteristicSPR,Trainnumber11618,Trainnumber11622,Trainnumber11623,Trainnumber11627,Trainnumber11716,Trainnumber11720,Trainnumber11721,...,Rain,Slack,DelayJump,SlackSum,DriverDelay,DriverToNow,RollDelay_co,RollToNow_co,RollDelay_cn,RollToNow_cn
0,1,0,0,0,0,0,0,0,0,0,...,0.6,-29.0,17,10.0,122,2640.0,0,0.0,0,0.0
1,1,0,0,0,0,0,0,0,0,0,...,0.6,14.0,32,43.0,122,3120.0,0,0.0,0,0.0
2,1,0,0,0,0,0,0,0,0,0,...,0.6,-28.0,-40,92.0,122,3240.0,0,0.0,0,0.0
3,1,0,0,0,0,0,0,0,0,0,...,0.6,42.0,42,78.0,122,3480.0,0,0.0,0,0.0
4,1,0,0,0,0,0,0,0,0,0,...,0.6,-12.0,15,49.0,122,3600.0,0,0.0,0,0.0


In [3]:
# Separate the 'DelayJump' target from the feature columns, then hold out 20% of the rows for testing.
y = df['DelayJump'].values
X = df.drop(columns=['DelayJump']).values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [4]:
# Show the (rows, columns) shape of the training and test feature matrices.
tuple(split.shape for split in (X_train, X_test))

((81744, 3290), (20437, 3290))

In [5]:
# Only the train/test splits are used from here on: drop the full frame and the
# unsplit arrays, then ask the garbage collector to reclaim them.
del df
del X, y
gc.collect()

129

# Build the base models (XGB, LGB)

In [6]:
def mae(y_test, y_pred):
    """Return the mean absolute error between targets and predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_test``.

    Returns
    -------
    float
        ``sum(|y_test - y_pred|) / len(y_test)``, reduced along the first axis
        (so 1-D inputs give a single number).

    Raises
    ------
    ValueError
        If ``y_test`` contains no samples.
    """
    # Coerce to ndarrays so plain Python lists are accepted and the reduction is
    # vectorised instead of iterating element by element through the builtin ``sum``.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.size == 0:
        raise ValueError("mae() requires at least one sample")
    return np.abs(actual - predicted).sum(axis=0) / len(actual)

In [None]:
class XgbWrapper(object):
    """Small convenience wrapper around the native ``xgb.train`` API.

    Parameters
    ----------
    params : dict or None
        Booster parameters forwarded to ``xgb.train``. A copy is stored, so the
        caller's dict is left untouched; ``None`` means "no extra parameters".
    nrounds : int
        Maximum number of boosting rounds.
    seed : int
        Value stored under the ``'seed'`` key of the stored parameters.
    """

    def __init__(self, params=None, nrounds=200, seed=122):
        # Copy instead of aliasing: the same module-level params dict is handed to
        # several wrappers, and the default ``params=None`` must not raise TypeError.
        self.params = dict(params) if params is not None else {}
        self.params['seed'] = seed
        self.nrounds = nrounds

    def train(self, X_train, y_train, X_test, y_test):
        """Fit the booster on ``(X_train, y_train)``.

        ``(X_test, y_test)`` is the last watchlist entry; training stops early
        after 30 rounds without improvement and progress is logged every 50 rounds.
        The fitted booster is stored on ``self.model``.
        """
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        watchlist = [(dtrain, 'train'), (dtest, 'test')]
        self.model = xgb.train(self.params, dtrain, self.nrounds, watchlist,
                               early_stopping_rounds=30, verbose_eval=50)

    def predict(self, x):
        """Return the fitted booster's predictions for the feature matrix ``x``."""
        # NOTE(review): whether Booster.predict uses only the best early-stopping
        # iteration depends on the xgboost version — confirm for the installed one.
        return self.model.predict(xgb.DMatrix(x))
    
# Booster settings handed to xgb.train through XgbWrapper.
# NOTE(review): recent xgboost releases name the 'reg:linear' objective
# 'reg:squarederror' — confirm against the installed version before changing it.
xgb_params = {
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'booster': "gbtree",
    'max_depth': 10,
    'learning_rate': 0.01,
    'eval_metric': "mae",
    'objective': "reg:linear",
}

In [None]:
# Fit XGBoost on the training split; the held-out split is passed as the evaluation set.
XGB_model = XgbWrapper(xgb_params, nrounds=50000)
XGB_model.train(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

[0]	train-mae:74.3983	test-mae:73.727
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 30 rounds.
[50]	train-mae:68.0448	test-mae:67.7512
[100]	train-mae:64.4656	test-mae:64.5896
[150]	train-mae:62.2085	test-mae:62.7571
[200]	train-mae:60.6584	test-mae:61.6097


In [None]:
class LgbWrapper(object):
    """Small convenience wrapper around the native ``lgb.train`` API.

    Parameters
    ----------
    params : dict or None
        Parameters forwarded to ``lgb.train``. A copy is stored, so the caller's
        dict is left untouched; ``None`` means "no extra parameters".
    nrounds : int
        Maximum number of boosting rounds.
    seed : int
        Value stored under the ``'seed'`` key of the stored parameters.
    """

    def __init__(self, params=None, nrounds=200, seed=122):
        # Copy instead of aliasing: the same module-level params dict is handed to
        # several wrappers, and the default ``params=None`` must not raise TypeError.
        self.params = dict(params) if params is not None else {}
        self.params['seed'] = seed
        self.nrounds = nrounds

    def train(self, X_train, y_train, X_test, y_test):
        """Fit the booster on ``(X_train, y_train)``.

        Both the training set and ``(X_test, y_test)`` are passed as validation
        sets, with 30 early-stopping rounds and logging every 50 rounds. The
        fitted booster is stored on ``self.model``.
        """
        # NOTE(review): newer LightGBM releases configure early stopping and
        # logging through callbacks rather than these keyword arguments — confirm
        # against the installed version.
        dtrain = lgb.Dataset(X_train, label=y_train)
        dtest = lgb.Dataset(X_test, label=y_test)
        self.model = lgb.train(self.params, dtrain, self.nrounds, [dtrain, dtest],
                               early_stopping_rounds=30, verbose_eval=50)

    def predict(self, x):
        """Return the fitted booster's predictions for the feature matrix ``x``."""
        return self.model.predict(x)
    
# Booster settings handed to lgb.train through LgbWrapper.
lgb_params = {
    'bagging_fraction': 0.8,
    # LightGBM only applies bagging_fraction when bagging_freq is non-zero (its
    # default of 0 disables bagging), so resample on every iteration.
    'bagging_freq': 1,
    'feature_fraction': 0.8,
    'boosting_type': "gbdt",
    'max_depth': 10,
    'learning_rate': 0.01,
    'metric': "mae",
    'objective': "regression",
}

In [None]:
# Fit LightGBM on the training split; the held-out split is passed as the evaluation set.
LGB_model = LgbWrapper(lgb_params, nrounds=50000)
LGB_model.train(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

In [None]:
# Hold-out MAE of the XGBoost model.
y_pred = XGB_model.predict(x=X_test)
mae(y_test=y_test, y_pred=y_pred)

In [None]:
# Hold-out MAE of the LightGBM model.
y_pred = LGB_model.predict(x=X_test)
mae(y_test=y_test, y_pred=y_pred)

# Stacking

In [None]:
# Sanity check: print the row positions (and their counts) that a plain 2-fold
# split assigns to the training and validation side of each fold.
n_fold = 2
kf = KFold(n_splits=n_fold)
for train_index, valid_index in kf.split(X_train):
    print(train_index, train_index.size, valid_index, valid_index.size)

In [None]:
# The first stage #
# For every fold, each base model is fit on the remaining folds, predicts the
# held-out fold (these predictions become the meta-training features) and predicts
# X_test (summed here and averaged after the loop to form the meta-test features).
n_fold = 5
kf = KFold(n_splits=n_fold)
df_count = 1  # 1-based fold counter; also the key under which each fold's frame is stored
metaTrain = {}
init_array = np.zeros(y_test.shape[0])
# Give every model its own accumulator so the two entries never share a buffer.
metaTest = {'XGB_1_META': init_array.copy(),
            'LGB_1_META': init_array.copy()}


for train_index, meta_index in kf.split(X_train):

    print('====== Round: ', df_count, ' =============')
    metaTrain_dict = {}

    X_train_train = X_train[train_index]
    y_train_train = y_train[train_index]
    X_train_meta = X_train[meta_index]
    y_train_meta = y_train[meta_index]
    metaTrain_dict['y_train_meta'] = list(y_train_meta)

    # XGB-1
    print('======= XGB-1 =======')
    XGB_model = XgbWrapper(params = xgb_params, nrounds=50000)
    XGB_model.train(X_train_train, y_train_train, X_train_meta, y_train_meta)
    metaTrain_dict['XGB_1_META'] = XGB_model.predict(X_train_meta)
    metaTest['XGB_1_META'] = metaTest['XGB_1_META'] + XGB_model.predict(X_test)
    print(metaTrain_dict['XGB_1_META'], metaTest['XGB_1_META'])

    # LGB-1
    print('======= LGB-1 =======')
    LGB_model = LgbWrapper(params = lgb_params, nrounds=50000)
    LGB_model.train(X_train_train, y_train_train, X_train_meta, y_train_meta)
    metaTrain_dict['LGB_1_META'] = LGB_model.predict(X_train_meta)
    metaTest['LGB_1_META'] = metaTest['LGB_1_META'] + LGB_model.predict(X_test)
    print(metaTrain_dict['LGB_1_META'], metaTest['LGB_1_META'])

    # record meta-result
    metaTrain[df_count] = pd.DataFrame.from_dict(metaTrain_dict)
    print(metaTrain[df_count].shape, metaTrain[df_count].head())
    df_count += 1

# Average the summed test predictions over the number of folds. df_count equals
# n_fold + 1 once the loop has finished, so using it as the divisor would shrink
# every averaged prediction by a factor of n_fold / (n_fold + 1).
metaTest['XGB_1_META'] = metaTest['XGB_1_META'] / n_fold
metaTest['LGB_1_META'] = metaTest['LGB_1_META'] / n_fold
metaTest['y_train_meta'] = y_test

In [None]:
# Inspect the test-set meta-feature dict built in the first stage
# (one prediction array per base model plus the true targets).
metaTest

In [None]:
# Stack the per-fold frames into one meta-training table and tabulate the
# test-set meta-features, then show both shapes.
meta_Train = pd.concat(list(metaTrain.values()), ignore_index=True)
meta_Test = pd.DataFrame(metaTest)
(meta_Train.shape, meta_Test.shape)

In [None]:
# Preview the first rows of the meta-training and meta-test tables.
tuple(frame.head() for frame in (meta_Train, meta_Test))

In [None]:
# The second stage #
# Split both meta tables into features and target, then fit the blender on the
# meta-training data with the meta-test data as its evaluation set.
metaTrain_y = meta_Train['y_train_meta'].values
metaTrain_X = meta_Train.drop(columns=['y_train_meta']).values
metaTest_y = meta_Test['y_train_meta'].values
metaTest_X = meta_Test.drop(columns=['y_train_meta']).values

blender = XgbWrapper(xgb_params, nrounds=50000)
blender.train(
    X_train=metaTrain_X,
    y_train=metaTrain_y,
    X_test=metaTest_X,
    y_test=metaTest_y,
)