# Introduction

This notebook demonstrates simple model stacking. The level 1 models are XGBoost, LightGBM, and Hist Gradient Boosting Regression; the level 2 model is Ridge Regression. Each level 1 model is built with 5-fold cross validation, producing out-of-fold predictions for the training set and fold-averaged predictions for the test set. Those results are then fed into the level 2 Ridge model, where 5-fold cross validation is used again to make the predictions for the submission result. Basic feature engineering is employed. If you like the model, please consider upvoting!

# Credits

The following discussions and other kernels have all contributed to the additional feature engineering seen within this kernel. If you find this kernel useful, please visit these discussions and upvote information contained therein.

* Discussion - [A simple feature that improved my score](https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/273974) by [Carl McBride Ellis](https://www.kaggle.com/carlmcbrideellis)
* Kernel - [LGBM LOVER'S](https://www.kaggle.com/shivansh002/lgbm-lover-s) by [OnePunchMan](https://www.kaggle.com/shivansh002)

# Load Data

In [None]:
import pandas
import numpy

# Read the competition's train, test and sample-submission CSVs in one pass.
train, test, submission = map(
    pandas.read_csv,
    (
        "../input/ventilator-pressure-prediction/train.csv",
        "../input/ventilator-pressure-prediction/test.csv",
        "../input/ventilator-pressure-prediction/sample_submission.csv",
    ),
)
# Bare expression: in a notebook this renders the training frame as cell output.
train

# Define Features

In [None]:
# Regression target: the `pressure` column of the training frame.
target = train["pressure"]

# Feature-name lists. Both are plain lists because later cells append the
# names of engineered columns to them.
cat_features = ["R", "C"]
cont_features = ["time_step", "u_in"]

Given that this is time series information, we can use the previous 2 steps' worth of information for our machine learning model. Although not strictly a good idea, we can also give it 2 steps of information about the future, since we have the entire breathing sequence available to us. In addition, we'll calculate the step size (the time elapsed since the previous observation) and use it alongside the raw `time_step` value. Note that we do all of this per `breath_id`, so that no information crosses from one breath into another.

In [None]:
def _add_breath_features(df):
    """Add per-breath lag, lead, aggregate and step-size columns to ``df`` in place.

    Every grouped operation is keyed on ``breath_id`` so values never leak
    from one breath into another.

    Args:
        df: Frame containing ``breath_id``, ``time_step``, ``u_in``, ``u_out``,
            ``R`` and ``C`` columns. It is modified in place.
    """
    # Cast categorical columns to integer types
    for column in ("R", "C", "u_out"):
        df[column] = df[column].astype(numpy.int8)

    u_in_by_breath = df.groupby("breath_id")["u_in"]
    u_out_by_breath = df.groupby("breath_id")["u_out"]

    # Look two steps into the past for u_in (u_out doesn't matter)
    df["u_in_last_n1"] = u_in_by_breath.shift(1)
    df["u_in_last_n2"] = u_in_by_breath.shift(2)

    # Look two steps into the future
    df["u_in_future_n1"] = u_in_by_breath.shift(-1)
    df["u_in_future_n2"] = u_in_by_breath.shift(-2)
    df["u_out_future_n1"] = u_out_by_breath.shift(-1)
    df["u_out_future_n2"] = u_out_by_breath.shift(-2)

    # Fill missing values with zeros (the shifts leave NaN at breath edges)
    df.fillna(0, inplace=True)

    # u_out values should just be integers for categoricals; the shift above
    # turned them into floats in order to hold NaN.
    for column in ("u_out_future_n1", "u_out_future_n2"):
        df[column] = df[column].astype(numpy.int8)

    # Re-create the grouping after the in-place fill so it reflects the
    # current contents of the frame.
    u_in_by_breath = df.groupby("breath_id")["u_in"]

    # Grab the max and min values of u_in.
    # transform() broadcasts the per-breath aggregate back onto every row.
    # A plain .max()/.min()/.mean() returns a Series indexed by breath_id,
    # which pandas would align against the row index and so attach values to
    # the wrong rows (or NaN).
    df["u_in_max"] = u_in_by_breath.transform("max")
    df["u_in_min"] = u_in_by_breath.transform("min")

    # Differences between current, max, min, and mean values for u_in
    df["u_in_last_n1_diff"] = df["u_in"] - df["u_in_last_n1"]
    df["u_in_last_n2_diff"] = df["u_in"] - df["u_in_last_n2"]
    df["u_in_future_n1_diff"] = df["u_in"] - df["u_in_future_n1"]
    df["u_in_future_n2_diff"] = df["u_in"] - df["u_in_future_n2"]

    df["u_in_max_diff"] = df["u_in_max"] - df["u_in"]
    df["u_in_min_diff"] = df["u_in_min"] - df["u_in"]
    df["u_in_mean_diff"] = u_in_by_breath.transform("mean") - df["u_in"]

    # Grab cumulative sum of u_in
    df["u_in_cumulative"] = u_in_by_breath.cumsum()

    # Make a note of the previous time for each row
    df["last_time_step"] = df.groupby("breath_id")["time_step"].shift(1)

    # Calculate the size of the step for each row from the previous observation.
    # The first row of each breath has no predecessor, so both columns are NaN
    # there; this happens after the zero-fill above and is left as-is.
    df["step_size"] = df["time_step"] - df["last_time_step"]


# Apply exactly the same feature engineering to both frames.
_add_breath_features(train)
_add_breath_features(test)

# Add new features to the appropriate feature lists
cont_features.extend([
    "u_in_last_n1",
    "u_in_last_n2",
    "u_in_future_n1",
    "u_in_future_n2",
    "step_size",
    "u_in_max_diff",
    "u_in_min_diff",
    "u_in_mean_diff",
    "u_in_cumulative",
    "u_in_last_n1_diff",
    "u_in_last_n2_diff",
    "u_in_future_n1_diff",
    "u_in_future_n2_diff",
])
cat_features.extend([
    "u_out_future_n1",
    "u_out_future_n2",
])

Encode features depending on model type. Some of the categorical features must be encoded differently depending on the model.

In [None]:
from category_encoders import LeaveOneOutEncoder
from sklearn.preprocessing import LabelEncoder

# Per-model lists of categorical feature column names (populated further down).
xgb_cat_features, lgb_cat_features, cb_cat_features, hgbc_cat_features = (
    [], [], [], [],
)

# Names of the columns produced by each encoding scheme.
loo_features, le_features = [], []

def label_encode(train_df, test_df, column):
    """Integer-encode ``column`` consistently across both frames.

    A single ``LabelEncoder`` is fitted on the unique values found in either
    frame and then used to write a ``<column>_le`` column into each of them.

    Args:
        train_df: Training frame; gains the encoded column in place.
        test_df: Test frame; gains the encoded column in place.
        column: Name of the column to encode.

    Returns:
        The name of the newly created column.
    """
    encoded_name = "{}_le".format(column)
    # Fit on values from both frames so transform() never meets an unseen label.
    known_values = [
        *train_df[column].unique().tolist(),
        *test_df[column].unique().tolist(),
    ]
    encoder = LabelEncoder().fit(known_values)
    for frame in (train_df, test_df):
        frame[encoded_name] = encoder.transform(frame[column])
    return encoded_name

def loo_encode(train_df, test_df, column):
    """Encode ``column`` with a ``LeaveOneOutEncoder`` fitted on the training frame.

    The encoder is fitted on ``train_df[column]`` against
    ``train_df["pressure"]`` and its ``transform`` output is stored in a
    ``<column>_loo`` column of both frames.

    NOTE(review): the encoder is built with default arguments and
    ``transform`` is called without a target for the training frame too.
    Confirm against the category_encoders documentation that this gives the
    intended encoding for integer-typed columns.

    Args:
        train_df: Training frame; must contain ``pressure``. Modified in place.
        test_df: Test frame. Modified in place.
        column: Name of the column to encode.

    Returns:
        The name of the newly created column.
    """
    encoded_name = "{}_loo".format(column)
    encoder = LeaveOneOutEncoder()
    encoder.fit(train_df[column], train_df["pressure"])
    for frame in (train_df, test_df):
        frame[encoded_name] = encoder.transform(frame[column])
    return encoded_name

# Encode every categorical column both ways and remember the new column names.
for feature in cat_features:
    loo_name = loo_encode(train, test, feature)
    le_name = label_encode(train, test, feature)
    loo_features.append(loo_name)
    le_features.append(le_name)

# Choose which representation of the categoricals each model family receives.
xgb_cat_features += loo_features
lgb_cat_features += le_features
cb_cat_features += cat_features
hgbc_cat_features += loo_features

Finally, we'll define the dataset to train on. As specified by the competition evaluation page:

* The expiratory phase is not scored

This means we don't need to bother learning about pressures when `u_out` is `1`. 

In [None]:
# Train only on rows where u_out is 0; rows with u_out == 1 are dropped.
scored_rows = train["u_out"].eq(0)
new_train = train[scored_rows]
target = new_train["pressure"]

# Generate Level 1 Models

In [None]:
import gc
import warnings
warnings.filterwarnings("ignore")

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.model_selection import GroupKFold, KFold
from sklearn.metrics import mean_absolute_error

random_state = 2021
n_folds = 5
# Grouped folds; the split further down passes breath_id as the group key.
k_fold = GroupKFold(n_splits=n_folds)

n_train_rows = len(new_train.index)
n_test_rows = len(test.index)

# Out-of-fold predictions for the training rows, one buffer per model.
xgb_train_preds, lgb_train_preds, hgbc_train_preds = (
    numpy.zeros(n_train_rows) for _ in range(3)
)
# Fold-averaged predictions for the test rows, one buffer per model.
xgb_test_preds, lgb_test_preds, hgbc_test_preds = (
    numpy.zeros(n_test_rows) for _ in range(3)
)

# Column lists are the same for every fold, so build them once up front.
# Categorical columns come first in each list.
xgb_features = xgb_cat_features + cont_features
lgb_features = lgb_cat_features + cont_features
hgbc_features = hgbc_cat_features + cont_features

# Model hyperparameters, kept out of the loop body for readability.
xgb_params = dict(
    seed=random_state,
    n_estimators=10000,
    verbosity=1,
    eval_metric="mae",
    tree_method="gpu_hist",
    gpu_id=0,
    alpha=0.17318869995918917,
    colsample_bytree=0.867867790089098,
    gamma=0.38457003274587587,
    reg_lambda=9.642931533488165,
    learning_rate=0.09834191296472465,
    max_bin=696,
    max_depth=13,
    min_child_weight=1.8360422783281707,
    subsample=0.7191624126962364,
)
lgb_params = dict(
    # The categorical columns occupy the leading positions of lgb_features.
    cat_feature=list(range(len(lgb_cat_features))),
    random_state=random_state,
    early_stopping_round=200,
    metric="mae",
    n_estimators=100000,
    n_jobs=-1,
    verbose=-1,
    cat_l2=39.95489636774699,
    cat_smooth=35.550900375908554,
    colsample_bytree=0.8046962761788545,
    learning_rate=0.6023144602962871,
    max_bin=395,
    max_depth=16,
    min_child_samples=359,
    min_data_per_group=209,
    num_leaves=163,
    reg_alpha=5.832855074792787,
    reg_lambda=6.454871156617244,
    subsample=0.7264718486219024,
    subsample_freq=1,
)
hgbc_params = dict(
    random_state=2021,
    l2_regularization=0.226902431039134,
    learning_rate=0.4389644036579206,
    max_bins=127,
    max_depth=63,
    max_leaf_nodes=364,
)

fold_splits = k_fold.split(new_train, target, new_train["breath_id"])
for fold, (fit_rows, valid_rows) in enumerate(fold_splits):
    print("--> Fold {}".format(fold + 1))
    y_train, y_valid = target.iloc[fit_rows], target.iloc[valid_rows]

    # ---- XGBoost ----
    xgb_frame = new_train[xgb_features]
    xgb_x_train = xgb_frame.iloc[fit_rows]
    xgb_x_valid = xgb_frame.iloc[valid_rows]
    del xgb_frame

    xgb_model = XGBRegressor(**xgb_params)
    xgb_model.fit(
        xgb_x_train,
        y_train,
        eval_set=[(xgb_x_valid, y_valid)],
        verbose=0,
        early_stopping_rounds=200,
    )

    valid_preds = xgb_model.predict(xgb_x_valid)
    xgb_train_preds[valid_rows] = valid_preds
    # Each fold contributes 1/n_folds of the final test prediction.
    xgb_test_preds += xgb_model.predict(test[xgb_features]) / n_folds
    fold_mae = mean_absolute_error(y_valid, valid_preds)
    print(": XGB - MAE Score = {}".format(fold_mae))

    # Drop the fold's data and model before building the next one.
    del xgb_x_train, xgb_x_valid, xgb_model
    gc.collect()

    # ---- LightGBM ----
    lgb_frame = new_train[lgb_features]
    lgb_x_train = lgb_frame.iloc[fit_rows]
    lgb_x_valid = lgb_frame.iloc[valid_rows]
    del lgb_frame

    lgb_model = LGBMRegressor(**lgb_params)
    lgb_model.fit(
        lgb_x_train,
        y_train,
        eval_set=[(lgb_x_valid, y_valid)],
        verbose=0,
    )

    valid_preds = lgb_model.predict(lgb_x_valid)
    lgb_train_preds[valid_rows] = valid_preds
    lgb_test_preds += lgb_model.predict(test[lgb_features]) / n_folds
    fold_mae = mean_absolute_error(y_valid, valid_preds)
    print(": LGB - MAE Score = {}".format(fold_mae))

    del lgb_x_train, lgb_x_valid, lgb_model
    gc.collect()

    # ---- HistGradientBoosting ----
    hgbc_frame = new_train[hgbc_features]
    hgbc_x_train = hgbc_frame.iloc[fit_rows]
    hgbc_x_valid = hgbc_frame.iloc[valid_rows]
    del hgbc_frame

    hgbc_model = HistGradientBoostingRegressor(**hgbc_params)
    hgbc_model.fit(hgbc_x_train, y_train)

    valid_preds = hgbc_model.predict(hgbc_x_valid)
    hgbc_train_preds[valid_rows] = valid_preds
    hgbc_test_preds += hgbc_model.predict(test[hgbc_features]) / n_folds
    fold_mae = mean_absolute_error(y_valid, valid_preds)
    print(": HGBC - MAE Score = {}".format(fold_mae))

    del hgbc_x_train, hgbc_x_valid, hgbc_model
    gc.collect()

    print("")
    
# Score each model's complete set of out-of-fold predictions against the target.
print("--> Overall metrics")
for report_template, oof_preds in (
    (": XGB - MAE Score = {}", xgb_train_preds),
    (": LGB - MAE Score = {}", lgb_train_preds),
    (": HGBC - MAE Score = {}", hgbc_train_preds),
):
    print(report_template.format(mean_absolute_error(target, oof_preds)))

# Build Level 2 Model

In [None]:
from scipy.special import expit

random_state = 2021
n_folds = 5
# Plain shuffled K-fold for the level 2 model.
k_fold = KFold(n_splits=n_folds, random_state=random_state, shuffle=True)

# Level 2 inputs: one column per level 1 model.
features = ["xgb", "lgb", "hgbc"]
l1_train = pandas.DataFrame(
    data=dict(
        xgb=xgb_train_preds.tolist(),
        lgb=lgb_train_preds.tolist(),
        hgbc=hgbc_train_preds.tolist(),
    )
)
l1_test = pandas.DataFrame(
    data=dict(
        xgb=xgb_test_preds.tolist(),
        lgb=lgb_test_preds.tolist(),
        hgbc=hgbc_test_preds.tolist(),
    )
)

# Out-of-fold predictions for the training rows and averaged test predictions.
train_preds = numpy.zeros(len(l1_train.index))
test_preds = numpy.zeros(len(l1_test.index))

for fold, (fit_rows, valid_rows) in enumerate(k_fold.split(l1_train, target)):
    print("--> Fold {}".format(fold + 1))
    # l1_train has a fresh positional index, so target is sliced by position.
    y_train, y_valid = target.iloc[fit_rows], target.iloc[valid_rows]

    stack_inputs = l1_train[features]
    x_train = stack_inputs.iloc[fit_rows]
    x_valid = stack_inputs.iloc[valid_rows]

    model = Ridge(random_state=random_state)
    model.fit(x_train, y_train)

    valid_preds = model.predict(x_valid)
    train_preds[valid_rows] = valid_preds
    # Each fold contributes 1/n_folds of the final test prediction.
    test_preds += model.predict(l1_test[features]) / n_folds
    print(": MAE Score = {}".format(mean_absolute_error(y_valid, valid_preds)))
    print("")

print("--> Overall metrics")
overall_mae = mean_absolute_error(target, train_preds)
print(": MAE Score = {}".format(overall_mae))

# Save Predictions

In [None]:
# Overwrite the sample submission's pressure column with the stacked
# predictions and write the file without the index column.
final_pressures = test_preds.tolist()
submission["pressure"] = final_pressures
submission.to_csv(path_or_buf="submission.csv", index=False)