Ensemble learning has several common approaches:
 - Bagging
 - Boosting
 - Stacking 
 - Blending
 
This notebook will focus on Stacking

This notebook shows how to stack any number of first-stage models and train a second-stage LogisticRegression model 
on their meta-features. The goal is to build several weak models, collect their predictions, and then train a second-level model that makes its predictions based on the outputs of the first-level models. 

![](https://miro.medium.com/max/7932/1*CoauXirckomVXxw2Id2w_Q.jpeg)

In [None]:
# Standard python libraries
import os
import time
import re

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Imports from our package

from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier, plot_importance

import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
import datetime
import pandas as pd
import numpy as np
from time import time



from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

from catboost import CatBoostClassifier, Pool
from sklearn import preprocessing
from sklearn.ensemble import IsolationForest

import seaborn as sns


In [None]:
%%time

# Load the training table from the competition input directory.
train_data = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
# Drop the first six characters of every target string, parse what is left as
# an integer and shift it down by one.
train_data['target'] = train_data['target'].str[6:].astype(int).sub(1)
train_data.head()

In [None]:
# Load the test table from the competition input directory and preview it.
test_data = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-may-2021/test.csv',
)
test_data.head()

In [None]:
def create_gr_feats(data):
    """Placeholder feature-engineering hook.

    The function currently has no body: ``data`` is accepted but left
    untouched, and ``None`` is returned.
    """
    return None

class bcolors:
    """ANSI escape sequences for colouring and styling terminal output.

    Concatenate one of these before a piece of text and ``ENDC`` after it.
    """
    HEADER = '\033[95m'     # SGR 95: bright magenta foreground
    OKBLUE = '\033[94m'     # SGR 94: bright blue foreground
    OKGREEN = '\033[92m'    # SGR 92: bright green foreground
    WARNING = '\033[93m'    # SGR 93: bright yellow foreground
    FAIL = '\033[91m'       # SGR 91: bright red foreground
    ENDC = '\033[0m'        # SGR 0: reset all attributes
    BOLD = '\033[1m'        # SGR 1: bold
    UNDERLINE = '\033[4m'   # SGR 4: underline
    

# Stack train and test rows into one frame so that feature engineering is
# applied to both in a single pass.
all_df = pd.concat([train_data, test_data]).reset_index(drop = True)
create_gr_feats(all_df)
# Split back by position. The explicit .copy() makes each part an independent
# frame, so later column assignments (e.g. re-encoding the target) do not
# write into a slice of all_df and do not trigger SettingWithCopyWarning.
train_data, test_data = (
    all_df.iloc[:len(train_data)].copy(),
    all_df.iloc[len(train_data):].copy(),
)
print(train_data.shape, test_data.shape)

In [None]:
# Learn the set of target values, then replace the column with their
# integer codes.
le = LabelEncoder()
le.fit(train_data['target'])
train_data['target'] = le.transform(train_data['target'])
train_data  # bare expression in the middle of the cell; it has no effect here

# Label vector and feature matrix (identifier and label columns removed).
y = train_data['target']
X = train_data.drop(columns=['id', 'target'])

## Feature Engineering

### Missing Values

In [None]:
# Number of missing entries in each feature column.
X.isna().sum()

#### Cube Root Transform

In [None]:
# np.cbrt returns the real cube root for negative inputs as well, whereas
# X ** (1/3) evaluates to NaN for every negative float and would silently
# turn those entries into missing values.
X = np.cbrt(X)

In [None]:
# Histogram/density of feature_10; the title is set directly on the Axes
# returned by the plotting call.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm
# the installed version still provides it.
sns.distplot(X['feature_10']).set_title("Distribution plot after Cube transformation")
sns.despine()
plt.show()

In [None]:
# Box plot of feature_30; the title is set directly on the Axes returned by
# the plotting call.
sns.boxplot(X['feature_30']).set_title("Distribution plot after Cube transformation")
sns.despine()
plt.show()

In [None]:
# Replace missing entries in every column with that column's median
# (X.median() is computed per column and fillna aligns it by column label).
X = X.fillna(X.median())

## Stacking

In [None]:
class ModelStager:
    """Cross-validate a model with K-fold and collect out-of-fold predictions.

    Each call to ``fit`` trains the given model once per fold and stores the
    predictions made for that fold's held-out rows. The resulting frame can be
    used as input features for a next-stage model.
    """

    def __init__(self, penalty, n_folds,
                 verbose=1, shuffle=True, random_state=1):
        """Store the cross-validation settings.

        Args:
            penalty: Callable invoked as ``penalty(y_val, prediction)`` to
                score each fold.
            n_folds: Number of K-fold splits.
            verbose: When truthy, log the training shape and the score of
                every fold.
            shuffle: Whether ``KFold`` shuffles the rows before splitting.
            random_state: Seed passed to ``KFold``; only used when
                ``shuffle`` is true.
        """
        self._penalty = penalty
        self._n_folds = n_folds
        self._verbose = verbose
        self._random_state = random_state
        self._shuffle = shuffle

    def _print(self, input_str):
        """Print ``input_str`` behind a coloured, timestamped class tag."""
        # Named `timestamp` so the module-level `time` function is not shadowed.
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
        print(bcolors.HEADER + "[ModelStager | " + timestamp + "] " + bcolors.ENDC + str(input_str))

    def fit(self, X, y, model, model_type=None):
        """Train ``model`` on every fold and return its out-of-fold predictions.

        Args:
            X: Feature DataFrame; rows are selected positionally with ``iloc``.
            y: Target Series aligned with ``X``.
            model: Estimator exposing ``fit``/``predict_proba`` (or
                ``fit_predict``/``predict`` when ``model_type`` is
                ``'LightAutoML'``).
            model_type: Pass ``'LightAutoML'`` to use the alternative
                training/prediction calls; any other value uses the default
                ``fit``/``predict_proba`` path.

        Returns:
            DataFrame indexed like ``X`` with one column per distinct value of
            ``y``, holding the predictions made for each row while it was in
            the validation fold.
        """
        # KFold raises ValueError if a seed is supplied while shuffle is off,
        # so the seed is only forwarded when shuffling is requested.
        kfold = KFold(n_splits=self._n_folds, shuffle=self._shuffle,
                      random_state=self._random_state if self._shuffle else None)

        use_lightautoml = model_type == 'LightAutoML'

        cv_scores = []
        # Allocate as float (NaN-filled) instead of the default object dtype so
        # the stored predictions stay numeric for the next-stage model.
        oof_predictions = pd.DataFrame(index=X.index, columns=range(y.nunique()),
                                       dtype=float)

        for fold_idx, (tr_idx, val_idx) in enumerate(kfold.split(X), start=1):

            X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

            if self._verbose:
                self._print("Data_tr shape : " + str(X_tr.shape))

            t = time()

            if use_lightautoml:
                # NOTE(review): `roles` is not defined in this class; it is
                # looked up as a module-level name and must exist before this
                # branch is used.
                model.fit_predict(pd.concat([X_tr, y_tr], axis=1), roles = roles)
                validation_prediction = model.predict(X_val)
            else:
                model.fit(X_tr, y_tr)
                validation_prediction = model.predict_proba(X_val)

            oof_predictions.iloc[val_idx] = validation_prediction

            cv_score_model = self._penalty(y_val, validation_prediction)
            cv_scores.append(cv_score_model)

            if self._verbose:
                self._print("Fold %.0f : TEST %.5f | TIME %.2fm (1-fold)" %
                            (fold_idx, cv_score_model, (time() - t) / 60))

        self._print("TEST AVERAGE : %.5f" % (np.mean(cv_scores)))

        return oof_predictions

## CATBOOST

In [None]:
# Keyword arguments for CatBoostClassifier: numeric hyper-parameters first,
# then the fixed training settings.
cat_best_params_cb = {
    'learning_rate': 0.04421824097495285,
    'reg_lambda': 23.319572135686258,
    'subsample': 0.22509693846883988,
    'random_strength': 0.13972768817453876,
    'depth': 11,
    'min_data_in_leaf': 9,
    'num_leaves': 20,
    'leaf_estimation_iterations': 2,
}
cat_best_params_cb.update(
    loss_function='MultiClass',
    eval_metric='MultiClass',
    verbose=False,
    bootstrap_type='Bernoulli',
    leaf_estimation_method='Newton',
    random_state=42,
    n_estimators=1000,
    task_type='CPU',
    grow_policy='Lossguide',
)

## LightGBM

In [None]:
# Keyword arguments for LGBMClassifier: numeric hyper-parameters first, then
# the fixed training settings.
best_params_lgbm = {
    'reg_alpha': 1.968952683454436e-05,
    'reg_lambda': 23.573499535643215,
    'colsample_bytree': 0.6,
    'subsample': 0.06343012933691167,
    'learning_rate': 0.08143516756001878,
    'max_depth': 1,
    'num_leaves': 884,
    'min_child_samples': 185,
    'min_child_weight': 3.692224868094191e-05,
    'cat_smooth': 17,
    'cat_l2': 14,
}
best_params_lgbm['objective'] = 'multiclass'
best_params_lgbm['random_state'] = 42
best_params_lgbm['n_estimators'] = 1000
best_params_lgbm['metric'] = 'multi_logloss'
# This line used ':' instead of '=', which Python parses as an annotation
# statement, so the key was never stored in the dict.
best_params_lgbm['device_type'] = 'cpu'

## XGBClassifier

In [None]:
# Keyword arguments for XGBClassifier: numeric hyper-parameters first, then
# the fixed training settings.
best_params_xgb = {
    'learning_rate': 0.4456929987528251,
    'gamma': 1.6443478072941096,
    'max_depth': 29,
    'min_child_weight': 51.45867185135785,
    'max_delta_step': 3.54917148452682,
    'subsample': 0.7132967600600638,
    'colsample_bytree': 0.3802004057000849,
    'lambda': 15.516716769784777,
    'alpha': 16.618133595583096,
    'max_leaves': 31,
}
best_params_xgb.update(
    objective='multi:softprob',
    random_state=13,
    eval_metric='mlogloss',
    grow_policy='lossguide',
    tree_method='hist',
    n_estimators=1000,
    predictor='cpu_predictor',
)

## Training

In [None]:
# One stager is shared by every model: 5 folds, scored with log_loss.
stager = ModelStager(log_loss, 5)

# First stage: each base model yields a frame of out-of-fold predictions.
print("CatBoost model")
catboost_model = CatBoostClassifier(**cat_best_params_cb)
stage1_cat = stager.fit(X, y, catboost_model)

print("LightGBM model")
lightgbm_model = LGBMClassifier(**best_params_lgbm)
stage1_gbm = stager.fit(X, y, lightgbm_model)

print("XGB model")
model_xgb = XGBClassifier(**best_params_xgb)
stage_1_xgb = stager.fit(X, y, model_xgb)


# Second stage: the three prediction frames are placed side by side and used
# as features for a logistic regression. The message used to read
# "Stage 1 : (RF, ET)", which matched neither the models nor the stage.
# The variable name `stage1_rf_et` is kept so existing references still work.
print("Stage 2 : (CatBoost, LightGBM, XGB) -> logistic model")
stage1_rf_et = pd.concat([stage1_cat, stage1_gbm, stage_1_xgb], axis=1)
stager.fit(stage1_rf_et, y, LogisticRegression())

### Resources

[More info on Stacking](https://towardsdatascience.com/ensemble-learning-stacking-blending-voting-b37737c4f483)