# Blending LightGBM Models

In [1]:
import numpy as np
import pandas as pd
import pickle
import optuna
import time
import os
            
# Models
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from category_encoders import OrdinalEncoder
from lightgbm import LGBMRegressor
from sklearn.linear_model import RidgeCV, LinearRegression, LassoCV
from xgboost import XGBRegressor

import warnings

# Mute warnings
warnings.filterwarnings('ignore')

# Choose Model Parameters

We take the top 2 sets of hyperparameters from each search

In [2]:
temp = None
for dirname, _, filenames in os.walk('output'):
    for filename in filenames:
        if temp is None:
            old_study = pickle.load(open(os.path.join(dirname, filename), "rb" ))
            temp = old_study.trials_dataframe().sort_values('value').head(2)
        else:
            old_study = pickle.load(open(os.path.join(dirname, filename), "rb" ))
            temp = pd.concat([temp, old_study.trials_dataframe().sort_values('value').head(2)])

temp.sort_values('value', inplace = True)
temp.head(5)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_cat_l2,params_cat_smooth,params_colsample_bytree,params_learning_rate,params_max_bin,params_max_depth,params_min_child_samples,params_min_child_weight,params_num_leaves,params_reg_alpha,params_reg_lambda,params_subsample,state,system_attrs_fixed_params
281,281,0.717663,2021-08-27 11:26:32.426703,2021-08-27 11:27:58.119703,0 days 00:01:25.693000,76.3,55.0,0.13,0.039,1250,3,20,6.85,26,40.0,60.0,0.4,COMPLETE,
71,71,0.717671,2021-08-30 17:05:34.734361,2021-08-30 17:07:02.682713,0 days 00:01:27.948352,82.577407,70.053774,0.147395,0.102204,1391,2,17,7.06416,14,39.917085,59.816029,0.434015,COMPLETE,
31,31,0.717676,2021-08-30 16:12:50.587224,2021-08-30 16:14:21.101620,0 days 00:01:30.514396,83.7769,69.823151,0.157685,0.102001,1391,2,17,7.087311,17,40.08618,59.681072,0.433856,COMPLETE,
172,172,0.717681,2021-08-27 17:59:00.714964,2021-08-27 18:00:58.876600,0 days 00:01:58.161636,85.0,75.0,0.16,0.068,1380,2,5,6.24,20,40.0,55.0,0.39,COMPLETE,
277,277,0.717683,2021-08-27 22:50:15.073869,2021-08-27 22:52:30.910902,0 days 00:02:15.837033,85.0,70.0,0.175,0.057,1390,2,7,6.24,20,38.0,57.0,0.4,COMPLETE,


## Save Best Parameters

In [3]:
cols = [col for col in temp.columns if col.startswith('params_')]
temp = temp[cols]
temp.columns = [col[7:] for col in cols]
best_params = temp.to_dict(orient='records')

## Load Training Data and Create K-Folds

In [4]:
#### GLOBAL VARIABLES ####
FOLD_SEED = 3027
NUM_FOLDS = 5
EARLY_STOP = 200

# Load the training data
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# Define Folds
train["kfold"] = -1
kf = KFold(NUM_FOLDS, shuffle = True, random_state = FOLD_SEED) 
for fold, (train_idx, valid_idx) in enumerate(kf.split(train)):
    train.loc[valid_idx,"kfold"] = fold

# List of categorical/numerical columns
object_cols = [col for col in train.columns if 'cat' in col]
number_cols = [col for col in train.columns if 'cont' in col]
columns = number_cols + object_cols

## Stack Estimators

We use `StackingRegessor` from scikit-learn to blend our models on each fold, then average across folds.

In [5]:
out_of_fold = pd.DataFrame({i: np.zeros((train.shape[0],)) for i in range(len(best_params))})
out_of_fold['kfold'] = train.kfold
predictions = pd.DataFrame({i: np.zeros((test.shape[0],)) for i in range(len(best_params))})

for i, params in enumerate(best_params):
    X = train.copy()
    scores = np.zeros(NUM_FOLDS)

    for j in range(NUM_FOLDS):
        X_train = X[X.kfold != j][columns].copy()
        X_valid = X[X.kfold == j][columns].copy()
        y_train = X[X.kfold != j]['target'].copy()
        y_valid = X[X.kfold == j]['target'].copy()
        X_test = test.set_index('id')[columns]

        # Label Encode Data
        encoder = OrdinalEncoder(cols = object_cols)
        X_train = encoder.fit_transform(X_train)
        X_valid = encoder.transform(X_valid)
        X_test = encoder.transform(X_test)
        
        model = LGBMRegressor(**{**{'random_state': 0, 
                                    'n_jobs': -1,
                                    'n_estimators': 20000}, **params})
        model.fit(X_train, y_train,
                  verbose=False,
                  eval_set=[(X_valid, y_valid)],
                  categorical_feature = object_cols,
                  early_stopping_rounds = EARLY_STOP,
                  )

        predictions[i] += model.predict(X_test) / NUM_FOLDS 
        preds_valid = model.predict(X_valid)
        scores[j] = mean_squared_error(y_valid, preds_valid, squared=False)
        out_of_fold[i][X.kfold == j] = preds_valid
        print("Model", i ," Fold",j ,"(RSME):", scores[j])

    print("Model", i, "Average (RMSE):", scores.mean())
    print("Model", i, "Worst (RMSE):", scores.max())

Model 0  Fold 0 (RSME): 0.7156864410884715
Model 0  Fold 1 (RSME): 0.7167169293206732
Model 0  Fold 2 (RSME): 0.7174633344778988
Model 0  Fold 3 (RSME): 0.7176625887465671
Model 0  Fold 4 (RSME): 0.7167491939039573
Model 0 Average (RMSE): 0.7168556975075135
Model 0 Worst (RMSE): 0.7176625887465671
Model 1  Fold 0 (RSME): 0.7155124607020212
Model 1  Fold 1 (RSME): 0.7166094138916937
Model 1  Fold 2 (RSME): 0.7176710803054804
Model 1  Fold 3 (RSME): 0.7176466124220399
Model 1  Fold 4 (RSME): 0.7167418920771789
Model 1 Average (RMSE): 0.7168362918796829
Model 1 Worst (RMSE): 0.7176710803054804
Model 2  Fold 0 (RSME): 0.7155900761241745
Model 2  Fold 1 (RSME): 0.716705924427379
Model 2  Fold 2 (RSME): 0.7176759550246586
Model 2  Fold 3 (RSME): 0.7176441816462166
Model 2  Fold 4 (RSME): 0.7168428610074886
Model 2 Average (RMSE): 0.7168917996459835
Model 2 Worst (RMSE): 0.7176759550246586
Model 3  Fold 0 (RSME): 0.7156851291897919
Model 3  Fold 1 (RSME): 0.7167795226032495
Model 3  Fold 2 (R

# Training the Ensemble Model

We use a ridge regression to blend the models

In [8]:
def ensemble(ensemble_model, submit = False, fit_params = {}):
    preds = np.zeros((test.shape[0],))
    scores = np.zeros(NUM_FOLDS)
    
    for j in range(NUM_FOLDS):
        X_train = out_of_fold[X.kfold != j][0:len(best_params)-1].copy()
        X_valid = out_of_fold[X.kfold == j][0:len(best_params)-1].copy()
        y_train = train['target'][X.kfold != j].copy()
        y_valid = train['target'][X.kfold == j].copy()
        X_test = predictions.copy()

        model = clone(ensemble_model)
        model.fit(X_train, y_train)

        preds += model.predict(X_test) / NUM_FOLDS 
        preds_valid = model.predict(X_valid)
        scores[j] = mean_squared_error(y_valid, preds_valid, squared=False)
        print("Fold", j ,"(RSME):", scores[j])

    print("Avg (RMSE):", round(scores.mean(),6))
    print("Max (RMSE):", round(scores.max(),6))

    if submit:
        output = pd.DataFrame({'id': test.id,'target': preds})
        timestr = time.strftime("%Y%m%d-%H%M%S")
        output.to_csv('submissions/submission_ensemble_'+timestr+'.csv', index=False)

## Testing Models on the meta-dataset

In [9]:
ensemble(ensemble_model = LinearRegression())

ValueError: Found input variables with inconsistent numbers of samples: [9, 240000]

In [None]:
ensemble(ensemble_model = RidgeCV())

In [None]:
ensemble(ensemble_model = LassoCV())

In [None]:
ensemble(ensemble_model = LGBMRegressor())

In [None]:
ensemble(ensemble_model = XGBRegressor())