Validate the idea of composition. Use LGB inside XGB

In [4]:


params_lgb = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "max_depth": -1,
    "verbose": -1,
    "seed": 42,
}

params_xgb = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.05,
    # "max_depth": 6,
    "verbosity": 0,
    "seed": 42,
}

# 2-layer model attempt (has data leak)

In [None]:
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import numpy as np

enc = LabelEncoder()
predictors = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp']

target = 'z'



# Prepare the train data
train = pd.read_csv('train.csv')
train['Sex'] = enc.fit_transform(train['Sex'])
train['z'] = np.log1p(train['Calories'])
X = train[predictors]
z = train[target]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, z, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# LGB
lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
# Enforce early stopping
callbacks = [lgb.early_stopping(stopping_rounds=10, verbose=1)]
lgb_model = lgb.train(params_lgb, lgb_train, num_boost_round=1000, valid_sets=lgb_val, callbacks=callbacks)
X_train['z_lgb'] = lgb_model.predict(X_train, num_iteration=lgb_model.best_iteration)
X_val['z_lgb'] = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
X_test['z_lgb'] = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

# HGB
hgb = HistGradientBoostingRegressor(max_iter=1000,
                                    learning_rate=0.05,
                                    random_state=42, 
                                    early_stopping=True,
                                    validation_fraction=0.1,
                                    n_iter_no_change=10
)

hgb.fit(X_train, y_train)
X_train['z_hgb'] = hgb.predict(X_train)
X_val['z_hgb'] = hgb.predict(X_val)
X_test['z_hgb'] = hgb.predict(X_test)


# Fuse the train and val sets
X_train, y_train = pd.concat([X_train, X_val], axis=0), pd.concat([y_train, y_val], axis=0)
predictors.append('z_lgb', 'z_hgb')

# Split the data into train and vals sets again
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=43)

# XGB
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_val = xgb.DMatrix(X_val, label=y_val)
xgb_model = xgb.train(params_xgb, xgb_train, num_boost_round=1000, evals=[(xgb_val, 'val')], early_stopping_rounds=100)

z_pred = xgb_model.predict(xgb.DMatrix(X_test), iteration_range=(0, xgb_model.best_iteration))

# Measure the RMSE
rmse = np.sqrt(mean_squared_error(y_test, z_pred))
print(f"RMSE: {rmse:.4f}")
# 



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[551]	valid_0's rmse: 0.0598906


TypeError: list.append() takes exactly one argument (2 given)

In [10]:
predictors += ['z_lgb', 'z_hgb']

# Split the data into train and vals sets again
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=43)

# XGB
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_val = xgb.DMatrix(X_val, label=y_val)
xgb_model = xgb.train(params_xgb, xgb_train, num_boost_round=1000, evals=[(xgb_val, 'val')], early_stopping_rounds=100)

z_pred = xgb_model.predict(xgb.DMatrix(X_test), iteration_range=(0, xgb_model.best_iteration))

# Measure the RMSE
rmse = np.sqrt(mean_squared_error(y_test, z_pred))
print(f"RMSE: {rmse:.4f}")

[0]	val-rmse:0.91256
[1]	val-rmse:0.86715
[2]	val-rmse:0.82402
[3]	val-rmse:0.78305
[4]	val-rmse:0.74415
[5]	val-rmse:0.70720
[6]	val-rmse:0.67210
[7]	val-rmse:0.63878
[8]	val-rmse:0.60714
[9]	val-rmse:0.57708
[10]	val-rmse:0.54855
[11]	val-rmse:0.52146
[12]	val-rmse:0.49575
[13]	val-rmse:0.47133
[14]	val-rmse:0.44815
[15]	val-rmse:0.42616
[16]	val-rmse:0.40528
[17]	val-rmse:0.38546
[18]	val-rmse:0.36666
[19]	val-rmse:0.34882
[20]	val-rmse:0.33190
[21]	val-rmse:0.31584
[22]	val-rmse:0.30062
[23]	val-rmse:0.28619
[24]	val-rmse:0.27251
[25]	val-rmse:0.25954
[26]	val-rmse:0.24726
[27]	val-rmse:0.23562
[28]	val-rmse:0.22460
[29]	val-rmse:0.21417
[30]	val-rmse:0.20430
[31]	val-rmse:0.19496
[32]	val-rmse:0.18613
[33]	val-rmse:0.17778
[34]	val-rmse:0.16990
[35]	val-rmse:0.16245
[36]	val-rmse:0.15542
[37]	val-rmse:0.14880
[38]	val-rmse:0.14256
[39]	val-rmse:0.13668
[40]	val-rmse:0.13114
[41]	val-rmse:0.12593
[42]	val-rmse:0.12104
[43]	val-rmse:0.11645
[44]	val-rmse:0.11215
[45]	val-rmse:0.1081

# 2-layer (with oof to prevent data leak)

In [20]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import numpy as np

import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm
import time
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

enc = LabelEncoder()
predictors = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp']

target = 'z'



# Prepare the train data
train = pd.read_csv('train.csv')
train['Sex'] = enc.fit_transform(train['Sex'])
train['z'] = np.log1p(train['Calories'])
X = train[predictors]
z = train[target]
# 1. Initial split to create true holdout test set
X_train_full, X_test, y_train_full, y_test = train_test_split(X, z, test_size=0.2, random_state=42)

# 2. Use cross-validation to create meta-features for X_train_full
# For each fold, train on part of the data, predict on the rest
X_train_meta = X_train_full.copy()
X_train_meta['z_lgb'] = np.zeros(len(X_train_full))
X_train_meta['z_hgb'] = np.zeros(len(X_train_full))

# K-fold CV for non-leaking meta-features
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in kf.split(X_train_full):
    # Split data for this fold
    X_fold_train, X_fold_val = X_train_full.iloc[train_idx], X_train_full.iloc[val_idx]
    y_fold_train, y_fold_val = y_train_full.iloc[train_idx], y_train_full.iloc[val_idx]
    
    # Train LGB on fold's training data
    lgb_fold = lgb.train(params_lgb, lgb.Dataset(X_fold_train, y_fold_train), num_boost_round=1000)
    # Predict on fold's validation data (these predictions haven't seen this data)
    (X_train_meta.iloc[val_idx])['z_lgb'] = lgb_fold.predict(X_fold_val)
    
    # Train HGB on fold's training data
    hgb_fold = HistGradientBoostingRegressor(max_iter=1000,
                                    learning_rate=0.05,
                                    random_state=42, 
                                    early_stopping=True,
                                    validation_fraction=0.1,
                                    n_iter_no_change=10).fit(X_fold_train, y_fold_train)
    # Predict on fold's validation data
    (X_train_meta.iloc[val_idx])[ 'z_hgb'] = hgb_fold.predict(X_fold_val)

# 3. Train full LGB and HGB models on ALL training data for test set predictions
lgb_full = lgb.train(params_lgb, lgb.Dataset(X_train_full, y_train_full), num_boost_round=1000)
hgb_full = HistGradientBoostingRegressor(max_iter=1000,
                                    learning_rate=0.05,
                                    random_state=42, 
                                    early_stopping=True,
                                    validation_fraction=0.1,
                                    n_iter_no_change=10).fit(X_train_full, y_train_full)

# 4. Create meta-features for test set using full models
X_test_meta = X_test.copy()
X_test_meta['z_lgb'] = lgb_full.predict(X_test)
X_test_meta['z_hgb'] = hgb_full.predict(X_test)

# 5. Train second-level model (XGB) using meta-features
predictors += ['z_lgb', 'z_hgb']
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_train_meta[predictors], y_train_full, test_size=0.2, random_state=43
)

# 6. Train XGB on clean meta-features
xgb_train = xgb.DMatrix(X_train_final, label=y_train_final)
xgb_val = xgb.DMatrix(X_val_final, label=y_val_final)
xgb_model = xgb.train(params_xgb, xgb_train, num_boost_round=1000, 
                     evals=[(xgb_val, 'val')], early_stopping_rounds=100)

# 7. Final prediction on true holdout test set
z_pred = xgb_model.predict(xgb.DMatrix(X_test_meta[predictors]), 
                          iteration_range=(0, xgb_model.best_iteration))

[0]	val-rmse:0.91454
[1]	val-rmse:0.86986
[2]	val-rmse:0.82743
[3]	val-rmse:0.78711
[4]	val-rmse:0.74888
[5]	val-rmse:0.71256
[6]	val-rmse:0.67809
[7]	val-rmse:0.64532
[8]	val-rmse:0.61424
[9]	val-rmse:0.58470
[10]	val-rmse:0.55666
[11]	val-rmse:0.53003
[12]	val-rmse:0.50476
[13]	val-rmse:0.48076
[14]	val-rmse:0.45802
[15]	val-rmse:0.43642
[16]	val-rmse:0.41594
[17]	val-rmse:0.39653
[18]	val-rmse:0.37804
[19]	val-rmse:0.36054
[20]	val-rmse:0.34394
[21]	val-rmse:0.32817
[22]	val-rmse:0.31322
[23]	val-rmse:0.29907
[24]	val-rmse:0.28565
[25]	val-rmse:0.27291
[26]	val-rmse:0.26085
[27]	val-rmse:0.24940
[28]	val-rmse:0.23855
[29]	val-rmse:0.22828
[30]	val-rmse:0.21858
[31]	val-rmse:0.20942
[32]	val-rmse:0.20072
[33]	val-rmse:0.19250
[34]	val-rmse:0.18472
[35]	val-rmse:0.17740
[36]	val-rmse:0.17043
[37]	val-rmse:0.16390
[38]	val-rmse:0.15770
[39]	val-rmse:0.15186
[40]	val-rmse:0.14637
[41]	val-rmse:0.14116
[42]	val-rmse:0.13625
[43]	val-rmse:0.13159
[44]	val-rmse:0.12717
[45]	val-rmse:0.1230

In [21]:
# Print RMSE
rmse = np.sqrt(mean_squared_error(y_test, z_pred))
print(f"RMSE: {rmse:.4f}")

RMSE: 0.0605


# Generate final prediction

In [28]:
test = pd.read_csv('test.csv')
test['Sex'] = enc.transform(test['Sex'])
test.drop(columns='id', inplace=True)

z_lgb = lgb_full.predict(test)
z_hgb = hgb_full.predict(test)


test['z_lgb'] = z_lgb
test['z_hgb'] = z_hgb
# predictors += ['z_lgb', 'z_hgb']

z_pred = xgb_model.predict(xgb.DMatrix(test), 
                          iteration_range=(0, xgb_model.best_iteration))

y_pred = np.clip(np.expm1(z_pred), 0, None)


# 2-model extension of 2-layer idea

In [5]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import numpy as np

import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm
import time
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

enc = LabelEncoder()
predictors = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp']

target = 'z'



# Prepare the train data
train = pd.read_csv('train.csv')
train['Sex'] = enc.fit_transform(train['Sex'])
train['z'] = np.log1p(train['Calories'])
X = train[predictors]
z = train[target]

# Prepare the test data
test = pd.read_csv('test.csv')
test['Sex'] = enc.transform(test['Sex'])
test.drop(columns='id', inplace=True)



# Separate datasets into male/female
# Train
indices_train = train['Sex']==0
X_train_f = X[indices_train]
y_train_f = z[indices_train]
X_train_m = X[~indices_train]
y_train_m = z[~indices_train]
# Test
indices_test = test['Sex']==0
X_test_f = test[indices_test]
X_test_m = test[~indices_test]

# Generate oof predictions for LGB and HGB in train set
print('-Creating oof predictions for LGB and HGB in train set')
print('--Female')
oof_lgb_f, oof_hgb_f = oof_generator(X_train_f, y_train_f)
print('--Male')
oof_lgb_m, oof_hgb_m = oof_generator(X_train_m, y_train_m)

# Generate meta-features for test set using full models
print('-Creating meta-features for test set using full models')
print('--Female')
lgb_test_f, hgb_test_f = generate_meta_test(X_train_f, y_train_f, X_test_f)
print('--Male')
lgb_test_m, hgb_test_m = generate_meta_test(X_train_m, y_train_m, X_test_m)

# Prepare the final datasets
# Train
X_train_meta_f = X_train_f.copy()
X_train_meta_f['z_lgb'] = oof_lgb_f
X_train_meta_f['z_hgb'] = oof_hgb_f

X_train_meta_m = X_train_m.copy()
X_train_meta_m['z_lgb'] = oof_lgb_m
X_train_meta_m['z_hgb'] = oof_hgb_m

# Test
X_test_meta_f = X_test_f.copy()
X_test_meta_f['z_lgb'] = lgb_test_f
X_test_meta_f['z_hgb'] = hgb_test_f

X_test_meta_m = X_test_m.copy()
X_test_meta_m['z_lgb'] = lgb_test_m
X_test_meta_m['z_hgb'] = hgb_test_m

# Train the second-level model (XGB) using meta-features
print('-Training the second-level model (XGB) using meta-features')
predictors += ['z_lgb', 'z_hgb']
print('--Female')
model_xgb_f = xgb.train(params_xgb, xgb.DMatrix(X_train_meta_f[predictors], label=y_train_f), num_boost_round=1000)
print('--Male')
model_xgb_m = xgb.train(params_xgb, xgb.DMatrix(X_train_meta_m[predictors], label=y_train_m), num_boost_round=1000)
# Predict on the test set
z_pred_f = model_xgb_f.predict(xgb.DMatrix(X_test_meta_f[predictors]))
z_pred_m = model_xgb_m.predict(xgb.DMatrix(X_test_meta_m[predictors]))

# Combine the predictions
z_pred = np.zeros(len(test))
z_pred[indices_test] = z_pred_f
z_pred[~indices_test] = z_pred_m
y_pred = np.clip(np.expm1(z_pred), 0, None)
print('Done')



-Creating oof predictions for LGB and HGB in train set
--Female
--Male
-Creating meta-features for test set using full models
--Female
--Male
-Training the second-level model (XGB) using meta-features
--Female
--Male
Done


In [6]:
y_pred[:5]

array([ 26.93469623, 109.09204994,  87.3721937 , 124.94670256,
        75.85638079])

In [2]:
def oof_generator(X_train, z_train):
    
    predictors = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp']

    # target = 'z'
    # 1. Initial split to create true holdout test set
    # X_train, X_test, y_train_full, y_test = train_test_split(X_train, z_train, test_size=0.2, random_state=42)


    # 2. Use cross-validation to create meta-features for X_train
    # For each fold, train on part of the data, predict on the rest
    # X_train_meta = X_train.copy()
    oof_lgb_train = np.zeros(len(X_train))
    oof_hgb_train = np.zeros(len(X_train))

    # K-fold CV for non-leaking meta-features
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, val_idx in kf.split(X_train):
        # Split data for this fold
        X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        z_fold_train, z_fold_val = z_train.iloc[train_idx], z_train.iloc[val_idx]
        
        # Train LGB on fold's training data
        lgb_fold = lgb.train(params_lgb, lgb.Dataset(X_fold_train, z_fold_train), num_boost_round=1000)
        # Predict on fold's validation data (these predictions haven't seen this data)
        oof_lgb_train[val_idx] = lgb_fold.predict(X_fold_val)
        
        # Train HGB on fold's training data
        hgb_fold = HistGradientBoostingRegressor(max_iter=1000,
                                        learning_rate=0.05,
                                        random_state=42, 
                                        early_stopping=True,
                                        validation_fraction=0.1,
                                        n_iter_no_change=10).fit(X_fold_train, z_fold_train)
        # Predict on fold's validation data
        oof_hgb_train[val_idx] = hgb_fold.predict(X_fold_val)

        return oof_lgb_train, oof_hgb_train
    

def generate_meta_test(X_train, y_train, X_test):
    # 3. Train full LGB and HGB models on ALL training data for test set predictions
    lgb_full = lgb.train(params_lgb, lgb.Dataset(X_train, y_train), num_boost_round=1000)
    hgb_full = HistGradientBoostingRegressor(max_iter=1000,
                                        learning_rate=0.05,
                                        random_state=42, 
                                        early_stopping=True,
                                        validation_fraction=0.1,
                                        n_iter_no_change=10).fit(X_train,y_train)

    # 4. Create meta-features for test set using full models
    # X_test_meta = X_test.copy()

    lgb_test = lgb_full.predict(X_test)
    hgb_test = hgb_full.predict(X_test)
    
    return lgb_test, hgb_test

# Submission

In [15]:
def generate_submission(y_pred, submission_name):
    
    df_submission = pd.read_csv('sample_submission.csv')
    df_submission['Calories'] = y_pred
    df_submission.to_csv(submission_name, index=False)

submission_name = '2_layer_plus_2_model'
generate_submission(y_pred, f'submissions\\{submission_name}.csv')

# Helper Functions

In [None]:
def interweave(array_1, array_2, indices):
    assert len(array_1) == indices.sum()
    output = np.zeros(len(array_1) + len(array_2))
    output[indices] = array_1
    output[~indices] = array_2
    return output