# Stacking LightGBM and XGBoost Models

In [1]:
#### GLOBAL VARIABLES ####
FOLD_SEED = 3027   # random_state of the shuffled KFold split, fixed so folds are reproducible
NUM_FOLDS = 5      # number of cross-validation folds used by every training loop
EARLY_STOP = 200   # passed to LGBMRegressor.fit as early_stopping_rounds

import numpy as np
import pandas as pd
import pickle
import optuna
import time
import os
            
# Models
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from category_encoders import OrdinalEncoder
from lightgbm import LGBMRegressor
from sklearn.linear_model import RidgeCV, LinearRegression, LassoCV
from xgboost import XGBRegressor

import warnings

# Mute warnings
warnings.filterwarnings('ignore')

## 1. Retrieve Model Parameters

We pick the top 2 trials (lowest objective value) from each of our saved Optuna LightGBM study files and rank them together.

In [2]:
# Gather the two best trials (smallest 'value') from every saved LightGBM
# study found under ../output, then rank all of them together.
# NOTE: pickle.load can execute arbitrary code -- only load study files that
# were produced by this project.
top_trials = []
for dirname, _, filenames in os.walk('../output'):
    for filename in filenames:
        if not filename.startswith('study_lgbm'):
            continue
        # The context manager closes the file handle as soon as the study is read.
        with open(os.path.join(dirname, filename), "rb") as study_file:
            old_study = pickle.load(study_file)
        top_trials.append(old_study.trials_dataframe().sort_values('value').head(2))

# Fail with a clear message instead of an AttributeError on None further down.
if not top_trials:
    raise FileNotFoundError("No 'study_lgbm*' files found under '../output'")

# Single concat instead of growing the frame inside the loop.
temp = pd.concat(top_trials).sort_values('value')
temp.head(5)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_cat_l2,params_cat_smooth,params_colsample_bytree,params_learning_rate,params_max_bin,params_max_depth,params_min_child_samples,params_min_child_weight,params_num_leaves,params_reg_alpha,params_reg_lambda,params_subsample,state,system_attrs_fixed_params
71,71,0.717671,2021-08-30 17:05:34.734361,2021-08-30 17:07:02.682713,0 days 00:01:27.948352,82.577407,70.053774,0.147395,0.102204,1391,2,17,7.06416,14,39.917085,59.816029,0.434015,COMPLETE,
31,31,0.717676,2021-08-30 16:12:50.587224,2021-08-30 16:14:21.101620,0 days 00:01:30.514396,83.7769,69.823151,0.157685,0.102001,1391,2,17,7.087311,17,40.08618,59.681072,0.433856,COMPLETE,
176,176,0.717704,2021-08-28 04:54:55.670475,2021-08-28 04:55:48.832475,0 days 00:00:53.162000,81.0,65.0,0.127,0.123,1491,2,19,6.91,16,39.3,60.6,0.46,COMPLETE,
160,160,0.717709,2021-08-28 04:39:06.082533,2021-08-28 04:40:03.181533,0 days 00:00:57.099000,81.0,66.0,0.128,0.118,1492,2,16,6.93,18,39.8,59.9,0.42,COMPLETE,


## 2. Save Best Parameters

Put the parameters into a form that we can use for training models.

In [3]:
# Turn every selected trial into a plain {parameter_name: value} dict that can
# be splatted into the model constructor.
# `temp` is deliberately NOT overwritten: re-running this cell must give the
# same result (after an in-place rename no column would start with 'params_'
# any more and every dict would silently come out empty).
param_prefix = 'params_'
cols = [col for col in temp.columns if col.startswith(param_prefix)]
best_params = [
    {key[len(param_prefix):]: value for key, value in record.items()}
    for record in temp[cols].to_dict(orient='records')
]

## 3. Load Training Data and Create K-Folds

Load the data and assign every training row to one of the K cross-validation folds; the fixed seed keeps the split reproducible.

In [4]:
# Read the train and test tables from disk
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# Tag each training row with the index of the fold in which it is held out
kf = KFold(n_splits = NUM_FOLDS, shuffle = True, random_state = FOLD_SEED)
train["kfold"] = -1
for fold, (_, valid_idx) in enumerate(kf.split(train)):
    train.loc[valid_idx, "kfold"] = fold

# Feature names, split by the 'cat' / 'cont' marker in the column name
object_cols = [name for name in train.columns if 'cat' in name]
number_cols = [name for name in train.columns if 'cont' in name]
columns = [*number_cols, *object_cols]

## 4. Train LightGBM Models and Generate Predictions

Train models using the parameters from the previous steps

In [5]:
# Result containers, one "LGBM<i>" column per parameter set in best_params:
#   out_of_fold  -- prediction for every training row, made by the model that
#                   did not see that row's fold (plus the fold id itself)
#   predictions  -- test-set predictions averaged over the NUM_FOLDS models
model_names = ["LGBM"+str(i) for i in range(len(best_params))]
out_of_fold = pd.DataFrame({name: np.zeros((train.shape[0],)) for name in model_names})
out_of_fold['kfold'] = train.kfold
predictions = pd.DataFrame({name: np.zeros((test.shape[0],)) for name in model_names})

# The loops never modify X, so one copy is enough for all models.
# X stays a module-level name because the original ensemble() cell reads X.kfold.
X = train.copy()

for i, params in enumerate(best_params):
    name = model_names[i]
    scores = np.zeros(NUM_FOLDS)

    for j in range(NUM_FOLDS):
        valid_mask = X.kfold == j
        X_train = X.loc[~valid_mask, columns].copy()
        X_valid = X.loc[valid_mask, columns].copy()
        y_train = X.loc[~valid_mask, 'target'].copy()
        y_valid = X.loc[valid_mask, 'target'].copy()
        X_test = test.set_index('id')[columns]

        # Label Encode Data (fitted on the training folds only)
        encoder = OrdinalEncoder(cols = object_cols)
        X_train = encoder.fit_transform(X_train)
        X_valid = encoder.transform(X_valid)
        X_test = encoder.transform(X_test)

        # Entries in `params` override the defaults listed first.
        model = LGBMRegressor(**{**{'random_state': 0,
                                    'n_jobs': -1,
                                    'n_estimators': 20000}, **params})
        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords and
        # `squared=False` below are version-dependent APIs -- confirm against the
        # LightGBM / scikit-learn versions pinned for this project.
        model.fit(X_train, y_train,
                  verbose=False,
                  eval_set=[(X_valid, y_valid)],
                  categorical_feature = object_cols,
                  early_stopping_rounds = EARLY_STOP,
                  )

        predictions[name] += model.predict(X_test) / NUM_FOLDS
        preds_valid = model.predict(X_valid)
        scores[j] = mean_squared_error(y_valid, preds_valid, squared=False)
        # Single .loc assignment: the chained form out_of_fold[col][mask] = ...
        # may write to a temporary copy and leave out_of_fold unchanged.
        out_of_fold.loc[valid_mask, name] = preds_valid
        print("Model", i ," Fold",j ,"(RSME):", scores[j])

    print("Model", i, "Average (RMSE):", scores.mean())
    print("Model", i, "Worst (RMSE):", scores.max())

Model 0  Fold 0 (RSME): 0.7155124607020212
Model 0  Fold 1 (RSME): 0.7166094138916937
Model 0  Fold 2 (RSME): 0.7176710803054804
Model 0  Fold 3 (RSME): 0.7176466124220399
Model 0  Fold 4 (RSME): 0.7167418920771789
Model 0 Average (RMSE): 0.7168362918796829
Model 0 Worst (RMSE): 0.7176710803054804
Model 1  Fold 0 (RSME): 0.7155900761241745
Model 1  Fold 1 (RSME): 0.716705924427379
Model 1  Fold 2 (RSME): 0.7176759550246586
Model 1  Fold 3 (RSME): 0.7176441816462166
Model 1  Fold 4 (RSME): 0.7168428610074886
Model 1 Average (RMSE): 0.7168917996459835
Model 1 Worst (RMSE): 0.7176759550246586
Model 2  Fold 0 (RSME): 0.7157391700610005
Model 2  Fold 1 (RSME): 0.7166198137210454
Model 2  Fold 2 (RSME): 0.717607610220349
Model 2  Fold 3 (RSME): 0.7177041194698671
Model 2  Fold 4 (RSME): 0.7166322975027858
Model 2 Average (RMSE): 0.7168606021950096
Model 2 Worst (RMSE): 0.7177041194698671
Model 3  Fold 0 (RSME): 0.7156032588665565
Model 3  Fold 1 (RSME): 0.7166588751967159
Model 3  Fold 2 (RS

## 5. Loading XGBoost Predictions

For the XGBRegressor models, we ran Optuna and generated the predictions on Kaggle using their GPUs, since this takes far too long to process locally on a CPU. Here we only load the saved out-of-fold and test-set predictions.

In [7]:
# List every file in the output directory that is not a LightGBM study,
# i.e. the candidates for the pickled XGBoost predictions.
# os.path.join builds the path portably; the literal '..\output' only works on
# Windows and relies on the invalid escape sequence '\o'.
for dirname, _, filenames in os.walk(os.path.join('..', 'output')):
    for filename in filenames:
        if not filename.startswith('study_lgbm'):
            print(os.path.join(dirname, filename))

..\output\outoffold_xgboost_20210831-210554.p
..\output\predictions_xgboost_20210831-210554.p


In [8]:
# Portable location of the pickled XGBoost results (the backslash literals only
# resolve on Windows and contain invalid escape sequences such as '\o' and '\p').
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
output_dir = os.path.join('..', 'output')

# Out-of-Fold Validation
with open(os.path.join(output_dir, 'outoffold_xgboost_20210831-210554.p'), "rb") as oof_file:
    temp = pickle.load(oof_file)
for col in temp.columns:
    # Only the model columns are copied; out_of_fold already has its own 'kfold'.
    if 'XGBOOST' in col:
        out_of_fold[col] = temp[col]

# Averaged Test Set Predictions
with open(os.path.join(output_dir, 'predictions_xgboost_20210831-210554.p'), "rb") as pred_file:
    temp = pickle.load(pred_file)
# NOTE(review): unlike the loop above, every column is copied here without the
# 'XGBOOST' filter -- confirm the predictions file holds model columns only.
for col in temp.columns:
    predictions[col] = temp[col]

## 6. Training the Ensemble Model

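We define a helper that cross-validates a candidate meta-model on the base models' out-of-fold predictions, so that different ensemble models can be compared easily.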

In [9]:
def ensemble(ensemble_model, submit = False, fit_params = None):
    """Cross-validate a meta-model stacked on the base models' predictions.

    For each of the NUM_FOLDS folds, a fresh clone of ``ensemble_model`` is
    fitted on the out-of-fold predictions of the other folds, scored (RMSE)
    on the held-out fold, and used to predict the test set. The test
    predictions of all folds are averaged.

    Reads the notebook-level frames ``out_of_fold`` (base-model predictions
    plus a 'kfold' column), ``predictions`` (base-model test predictions),
    ``train`` (for 'target') and ``test`` (for row count and 'id').

    Parameters
    ----------
    ensemble_model : estimator
        Unfitted scikit-learn style regressor; it is cloned per fold and
        never fitted itself.
    submit : bool, default False
        If True, write the averaged test predictions to a timestamped CSV
        under ../submissions/.
    fit_params : dict, optional
        Extra keyword arguments forwarded to ``model.fit``.

    Returns
    -------
    None
        Per-fold, average and worst RMSE are printed.
    """
    # None instead of a shared mutable {} default.
    fit_params = {} if fit_params is None else fit_params

    # Fold ids come from out_of_fold itself rather than from a global `X`
    # left behind by the LightGBM training loop.
    fold_ids = out_of_fold['kfold']
    features = out_of_fold.drop('kfold', axis = 1)
    target = train['target']
    # Same columns, in the same order, as the frame the model is fitted on.
    X_test = predictions[features.columns]

    preds = np.zeros((test.shape[0],))
    scores = np.zeros(NUM_FOLDS)

    for j in range(NUM_FOLDS):
        is_valid = fold_ids == j
        X_train, X_valid = features[~is_valid], features[is_valid]
        y_train, y_valid = target[~is_valid].copy(), target[is_valid].copy()

        model = clone(ensemble_model)
        model.fit(X_train, y_train, **fit_params)

        preds += model.predict(X_test) / NUM_FOLDS
        preds_valid = model.predict(X_valid)
        scores[j] = mean_squared_error(y_valid, preds_valid, squared=False)
        print("Fold", j ,"(RSME):", scores[j])

    print("Avg (RMSE):", round(scores.mean(),6))
    print("Max (RMSE):", round(scores.max(),6))

    if submit:
        output = pd.DataFrame({'id': test.id,'target': preds})
        timestr = time.strftime("%Y%m%d-%H%M%S")
        output.to_csv('../submissions/submission_ensemble_'+timestr+'.csv', index=False)

## 7. Testing Various Ensemble Models

In [10]:
# Baseline meta-model: plain LinearRegression on the base-model predictions.
ensemble(ensemble_model = LinearRegression())

Fold 0 (RSME): 0.7152415363083845
Fold 1 (RSME): 0.7162589814914677
Fold 2 (RSME): 0.7172545900532766
Fold 3 (RSME): 0.7176352205622318
Fold 4 (RSME): 0.7164015327728596
Avg (RMSE): 0.716558
Max (RMSE): 0.717635


In [11]:
# Ridge meta-model with its regularisation strength chosen by RidgeCV's defaults.
ensemble(ensemble_model = RidgeCV())

Fold 0 (RSME): 0.7152541888294607
Fold 1 (RSME): 0.7162585176557689
Fold 2 (RSME): 0.7172554943810194
Fold 3 (RSME): 0.7176317085696127
Fold 4 (RSME): 0.7164019401273162
Avg (RMSE): 0.71656
Max (RMSE): 0.717632


In [12]:
# Lasso meta-model with its regularisation strength chosen by LassoCV's defaults.
ensemble(ensemble_model = LassoCV())

Fold 0 (RSME): 0.7152525796544902
Fold 1 (RSME): 0.7162606568068188
Fold 2 (RSME): 0.7172591914556631
Fold 3 (RSME): 0.7176239113907984
Fold 4 (RSME): 0.7164003341049899
Avg (RMSE): 0.716559
Max (RMSE): 0.717624


In [13]:
# Non-linear meta-model: LGBMRegressor with default hyperparameters.
ensemble(ensemble_model = LGBMRegressor())

Fold 0 (RSME): 0.715378130542827
Fold 1 (RSME): 0.7163788369843227
Fold 2 (RSME): 0.7175185787761765
Fold 3 (RSME): 0.717782240526377
Fold 4 (RSME): 0.7165580011243365
Avg (RMSE): 0.716723
Max (RMSE): 0.717782


In [14]:
# Non-linear meta-model: XGBRegressor with default hyperparameters.
ensemble(ensemble_model = XGBRegressor())

Fold 0 (RSME): 0.7175716900497805
Fold 1 (RSME): 0.7185605582480795
Fold 2 (RSME): 0.7194318882334912
Fold 3 (RSME): 0.7200217278524934
Fold 4 (RSME): 0.7186498272618699
Avg (RMSE): 0.718847
Max (RMSE): 0.720022


In [15]:
# NOTE(review): identical to the earlier LassoCV call and submit is left False,
# so no file is written -- presumably meant as the final submission run; confirm.
ensemble(ensemble_model = LassoCV())

Fold 0 (RSME): 0.7152525796544902
Fold 1 (RSME): 0.7162606568068188
Fold 2 (RSME): 0.7172591914556631
Fold 3 (RSME): 0.7176239113907984
Fold 4 (RSME): 0.7164003341049899
Avg (RMSE): 0.716559
Max (RMSE): 0.717624
