# About this notebook

This notebook shows a hands-on approach to using stacking for tabular data. Stacking is a powerful method for extracting the last bit of performance from a group of already decent estimators.

I want to acknowledge [@abhishek](https://www.kaggle.com/abhishek) for his k-fold dataset (used for training in this notebook) and his [awesome playlist](https://www.youtube.com/playlist?list=PL98nY_tJQXZnP-k3qCDd1hljVSciDV9_N) on YouTube, which covers most of the prerequisites and concepts used here.

Note: I have directly used the optimised parameters for the L0 models here. I tuned the majority of them myself in a separate kernel, and found some of them in [this](https://www.kaggle.com/abhishek/blending-blending-blending) kernel shared by [@abhishek](https://www.kaggle.com/abhishek).

# Imports

In [None]:
# Aesthetics
import warnings
import sklearn.exceptions
from pandas.core.common import SettingWithCopyWarning
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=SettingWithCopyWarning)
warnings.filterwarnings('ignore', category=sklearn.exceptions.UndefinedMetricWarning)

# General
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import os
import random
import gc
import itertools
gc.enable()

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(style="whitegrid")

# Machine Learning
# Utils
from sklearn import preprocessing
import category_encoders as ce
# Regression Models
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
#Metrics
from sklearn.metrics import mean_squared_error

# Deep Learning
import torch

# Fixing Seed
RANDOM_SEED = 42

def seed_everything(seed=RANDOM_SEED):
    """Seed every random number generator this notebook imports.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator and, when CUDA is available, every CUDA device).

    Args:
        seed: Integer seed applied to all generators.
    """
    # Setting PYTHONHASHSEED at runtime only influences interpreters started
    # after this point (e.g. worker subprocesses), not the running process.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    # torch is imported by this notebook, so its generators are seeded too.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

seed_everything()

# Device Optimization
# True when PyTorch can see a CUDA device; used later to pick GPU settings.
GPU = torch.cuda.is_available()

print(f'GPU Available: {GPU}')

# Read Data

In [None]:
# Competition data directory (test set and sample submission live here).
data_dir = '../input/30-days-of-ml'

# The training file comes from a separate dataset that already carries a
# `kfold` column.
train_file_path = '../input/30days-folds/train_folds.csv'
test_file_path, sample_sub_file_path = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('test.csv', 'sample_submission.csv')
)

for label, path in (
    ('Train', train_file_path),
    ('Test', test_file_path),
    ('Sample Sub', sample_sub_file_path),
):
    print(f'{label} file: {path}')

In [None]:
# Load the fold-annotated training set, the test set and the sample submission.
train_df, test_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_file_path, test_file_path, sample_sub_file_path)
)

In [None]:
# Columns that must never be fed to a model as inputs.
not_features = ['id', 'kfold', 'target']
target = ['target']

# Every remaining column of the training frame is a model feature.
cols = train_df.columns.tolist()
features = list(filter(lambda name: name not in not_features, cols))

In [None]:
# dtypes treated as numeric; any feature with another dtype is categorical.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

numerical_features = [
    name for name in features if train_df[name].dtype in numerics
]
categorical_features = [
    name for name in features if train_df[name].dtype not in numerics
]

print(f'Numeric features: {numerical_features}')
print(f'Categorical features: {categorical_features}')

In [None]:
# Number of distinct training values per categorical feature.
cardinality = {name: train_df[name].nunique() for name in categorical_features}

# More than 5 distinct values counts as "high cardinality".
high_cardinality_cat_feat = [
    name for name, n_unique in cardinality.items() if n_unique > 5
]
low_cardinality_cat_feat = [
    name for name, n_unique in cardinality.items() if n_unique <= 5
]

print(f'High Cardinality Categorical Features: {high_cardinality_cat_feat}')
print(f'Low Cardinality Categorical Features:  {low_cardinality_cat_feat}')

# Feature Encoding

In [None]:
def ordinal_enc(train_df, valid_df, test_df, features):
    """Ordinal-encode ``features`` with an encoder fitted on the training frame.

    The encoder is fitted on ``train_df`` only and then applied to the
    validation and test frames. The inputs are copied first, so the caller's
    frames (often slices of a larger frame) are never written to.

    Args:
        train_df: Frame used to fit the encoder.
        valid_df: Validation frame, transformed with the fitted encoder.
        test_df: Test frame, transformed with the fitted encoder.
        features: Names of the columns to encode; may be empty.

    Returns:
        Tuple ``(train_df, valid_df, test_df)`` of new frames in which the
        ``features`` columns hold the encoded values.
    """
    train_df, valid_df, test_df = train_df.copy(), valid_df.copy(), test_df.copy()

    # Nothing to encode: skip fitting, which cannot work on zero columns.
    if len(features) == 0:
        return train_df, valid_df, test_df

    ord_enc = preprocessing.OrdinalEncoder()

    # NOTE(review): with the encoder's default settings a category that only
    # occurs in valid/test is expected to raise -- confirm this is acceptable.
    train_df[features] = ord_enc.fit_transform(train_df[features])
    valid_df[features] = ord_enc.transform(valid_df[features])
    test_df[features] = ord_enc.transform(test_df[features])

    return train_df, valid_df, test_df

In [None]:
def _dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder across scikit-learn versions."""
    try:
        # Newer scikit-learn spells the dense-output switch `sparse_output`.
        return preprocessing.OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older releases only know the `sparse` keyword.
        return preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')


def one_hot_enc(train_df, valid_df, test_df, features):
    """One-hot encode ``features`` with an encoder fitted on the training frame.

    The original ``features`` columns are dropped and replaced by indicator
    columns appended at the end of each frame. The new columns are labelled
    with consecutive integers starting at 0 and keep the row index of the
    frame they belong to. Categories that were not seen in ``train_df`` are
    encoded as all-zero indicator rows instead of raising.

    Args:
        train_df: Frame used to fit the encoder.
        valid_df: Validation frame, transformed with the fitted encoder.
        test_df: Test frame, transformed with the fitted encoder.
        features: Names of the columns to encode; may be empty.

    Returns:
        Tuple ``(train_df, valid_df, test_df)`` of new frames.
    """
    # Nothing to encode: skip fitting, which cannot work on zero columns.
    if len(features) == 0:
        return train_df.copy(), valid_df.copy(), test_df.copy()

    OH_enc = _dense_one_hot_encoder()
    OH_enc.fit(train_df[features])

    encoded_frames = []
    for frame in (train_df, valid_df, test_df):
        # Re-use the frame's index so the concat below aligns row by row.
        oh_cols = pd.DataFrame(OH_enc.transform(frame[features]), index=frame.index)
        encoded_frames.append(pd.concat([frame.drop(features, axis=1), oh_cols], axis=1))

    return tuple(encoded_frames)

In [None]:
def catboost_enc(train_df, valid_df, test_df, features):
    """Target-encode ``features`` with a CatBoost encoder fitted on ``train_df``.

    Each encoded column is added as ``<name>_cb`` and the original column is
    dropped. ``train_df`` must contain a ``'target'`` column; the validation
    and test frames are transformed without target information.

    Args:
        train_df: Frame used to fit the encoder (needs a ``'target'`` column).
        valid_df: Validation frame, transformed with the fitted encoder.
        test_df: Test frame, transformed with the fitted encoder.
        features: Names of the columns to encode; may be empty.

    Returns:
        Tuple ``(train_df, valid_df, test_df)`` of new frames.
    """
    # Nothing to encode: return untouched copies.
    if len(features) == 0:
        return train_df.copy(), valid_df.copy(), test_df.copy()

    cb_enc = ce.CatBoostEncoder(cols=features)
    cb_enc.fit(train_df[features], train_df['target'])

    # The training rows are transformed *with* their target so the encoder
    # treats them as training data. Without `y` they would be encoded like
    # unseen data, i.e. with statistics that include each row's own target.
    train_encoded = cb_enc.transform(train_df[features], train_df['target'])
    valid_encoded = cb_enc.transform(valid_df[features])
    test_encoded = cb_enc.transform(test_df[features])

    train_df = train_df.join(train_encoded.add_suffix('_cb')).drop(features, axis=1)
    valid_df = valid_df.join(valid_encoded.add_suffix('_cb')).drop(features, axis=1)
    test_df = test_df.join(test_encoded.add_suffix('_cb')).drop(features, axis=1)

    return train_df, valid_df, test_df

# L0 Models

## 1. XGBoost

In [None]:
# L0 model 1: XGBoost on ordinal (low-cardinality) + CatBoost-encoded
# (high-cardinality) categoricals. Writes out-of-fold predictions to
# train_pred_1.csv and fold-averaged test predictions to test_pred_1.csv.
n_folds = train_df['kfold'].nunique()

# Hyperparameters shared by both devices; only the device-specific keys are
# added below so the CPU and GPU configurations cannot drift apart.
xgb_params = {
    'learning_rate': 0.03336420518893778,
    'n_estimators': 9801,
    'reg_lambda': 1.861341724045207e-07,
    'reg_alpha': 41.069310465195315,
    'max_depth': 4,
    'subsample': 0.7889762678652061,
    'colsample_bytree': 0.11946885793267677,
    'booster': 'gbtree',
    'random_state': RANDOM_SEED,
    'verbosity': 0,
}
if GPU:
    xgb_params.update({
        'tree_method': 'gpu_hist',
        'gpu_id': 0,
        'predictor': 'gpu_predictor',
    })
else:
    xgb_params['n_jobs'] = -1

test_pred_all = None
valid_pred_all = {}
all_rmse = []

for i in tqdm(range(n_folds)):
    # Fold i is held out for validation; all other folds are used for training.
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]
    test = test_df.copy()

    train, valid, test = ordinal_enc(train, valid, test, low_cardinality_cat_feat)
    train, valid, test = catboost_enc(train, valid, test, high_cardinality_cat_feat)

    valid_ids = valid.id.values.tolist()

    # Encoding adds/renames columns, so the feature list is rebuilt per fold.
    cols = list(train.columns)
    features = [feat for feat in cols if feat not in not_features+target]

    reg = XGBRegressor(**xgb_params)
    # Early stopping monitors RMSE on the held-out fold.
    reg.fit(train[features].values, train[target].values,
            eval_set = [(valid[features].values, valid[target].values)],
            eval_metric = 'rmse',
            early_stopping_rounds = 300,
            verbose=False)

    valid_pred = reg.predict(valid[features].values)
    valid_pred_all.update(dict(zip(valid_ids, valid_pred)))
    test_pred = reg.predict(test[features].values)
    # RMSE as sqrt(MSE): same value for a single target, and it does not rely
    # on the `squared=` keyword that newer scikit-learn releases reject.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Sum the per-fold test predictions; they are averaged after the loop.
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions keyed by row id.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_1']
valid_pred_all.to_csv('train_pred_1.csv', index=False)

sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_1'] = test_pred_all
sub_2.to_csv('test_pred_1.csv', index=False)

## 2. LightGBM

In [None]:
# L0 model 2: LightGBM on one-hot (low-cardinality) + CatBoost-encoded
# (high-cardinality) categoricals. Writes out-of-fold predictions to
# train_pred_2.csv and fold-averaged test predictions to test_pred_2.csv.
n_folds = train_df['kfold'].nunique()

# Loop-invariant hyperparameters, built once.
lgbm_params = {
    'learning_rate': 0.02685940133369163,
    'n_estimators': 7981,
    'reg_lambda': 0.0014864508341972096,
    'reg_alpha': 0.011093263193854072,
    'max_depth': 2,
    'num_leaves': 106,
    'min_data_in_leaf': 543,
    'subsample_freq': 9,
    'bagging_fraction': 0.8639703294516574,
    'pos_bagging_fraction': 0.7927447639915055,
    'neg_bagging_fraction': 0.8037410861863784,
    'colsample_bytree': 0.9798011192356275,
    'objective': 'regression',
    'metric': 'l2',
    'verbose': -1,
    'n_jobs': -1,
    'random_state': RANDOM_SEED
}

test_pred_all = None
valid_pred_all = {}
all_rmse = []

for i in tqdm(range(n_folds)):
    # Fold i is held out for validation; all other folds are used for training.
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]
    test = test_df.copy()

    train, valid, test = one_hot_enc(train, valid, test, low_cardinality_cat_feat)
    train, valid, test = catboost_enc(train, valid, test, high_cardinality_cat_feat)

    valid_ids = valid.id.values.tolist()

    # Encoding adds/renames columns, so the feature list is rebuilt per fold.
    cols = list(train.columns)
    features = [feat for feat in cols if feat not in not_features+target]

    # Trained for the full n_estimators: no eval set / early stopping here.
    reg = LGBMRegressor(**lgbm_params)
    reg.fit(train[features].values, train[target].values)

    valid_pred = reg.predict(valid[features].values)
    valid_pred_all.update(dict(zip(valid_ids, valid_pred)))
    test_pred = reg.predict(test[features].values)
    # RMSE as sqrt(MSE): same value for a single target, and it does not rely
    # on the `squared=` keyword that newer scikit-learn releases reject.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Sum the per-fold test predictions; they are averaged after the loop.
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions keyed by row id.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_2']
valid_pred_all.to_csv('train_pred_2.csv', index=False)

sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_2'] = test_pred_all
sub_2.to_csv('test_pred_2.csv', index=False)

## 3. XGBoost 2 (Abhishek's Parameters)

In [None]:
# L0 model 3: XGBoost with all categoricals ordinal-encoded. Writes
# out-of-fold predictions to train_pred_3.csv and fold-averaged test
# predictions to test_pred_3.csv.
n_folds = train_df['kfold'].nunique()

# Hyperparameters shared by both devices; only the device-specific keys are
# added below so the CPU and GPU configurations cannot drift apart.
xgb_params = {
    'learning_rate': 0.03628302216953097,
    'n_estimators': 10000,
    'reg_lambda': 0.0008746338866473539,
    'reg_alpha': 23.13181079976304,
    'max_depth': 3,
    'subsample': 0.7875490025178415,
    'colsample_bytree': 0.11807135201147481,
    'booster': 'gbtree',
    'random_state': 1,
    'verbosity': 0,
}
if GPU:
    xgb_params.update({
        'tree_method': 'gpu_hist',
        'gpu_id': 0,
        'predictor': 'gpu_predictor',
    })
else:
    xgb_params['n_jobs'] = 4

test_pred_all = None
valid_pred_all = {}
all_rmse = []

for i in tqdm(range(n_folds)):
    # Fold i is held out for validation; all other folds are used for training.
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]
    test = test_df.copy()

    train, valid, test = ordinal_enc(train, valid, test, low_cardinality_cat_feat)
    train, valid, test = ordinal_enc(train, valid, test, high_cardinality_cat_feat)

    valid_ids = valid.id.values.tolist()

    cols = list(train.columns)
    features = [feat for feat in cols if feat not in not_features+target]

    reg = XGBRegressor(**xgb_params)
    # Early stopping monitors RMSE on the held-out fold.
    reg.fit(train[features].values, train[target].values,
            eval_set = [(valid[features].values, valid[target].values)],
            eval_metric = 'rmse',
            early_stopping_rounds = 300,
            verbose=False)

    valid_pred = reg.predict(valid[features].values)
    valid_pred_all.update(dict(zip(valid_ids, valid_pred)))
    test_pred = reg.predict(test[features].values)
    # RMSE as sqrt(MSE): same value for a single target, and it does not rely
    # on the `squared=` keyword that newer scikit-learn releases reject.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Sum the per-fold test predictions; they are averaged after the loop.
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions keyed by row id.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_3']
valid_pred_all.to_csv('train_pred_3.csv', index=False)

sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_3'] = test_pred_all
sub_2.to_csv('test_pred_3.csv', index=False)

## 4. XGBoost 3 (Abhishek's Parameters)

In [None]:
# L0 model 4: XGBoost with all categoricals ordinal-encoded and a different
# seed per fold. Writes out-of-fold predictions to train_pred_4.csv and
# fold-averaged test predictions to test_pred_4.csv.
n_folds = train_df['kfold'].nunique()

# Hyperparameters shared by both devices and all folds; `random_state` is
# filled in per fold inside the loop.
xgb_params = {
    'learning_rate': 0.07853392035787837,
    'n_estimators': 5000,
    'reg_lambda': 1.7549293092194938e-05,
    'reg_alpha': 14.68267919457715,
    'max_depth': 3,
    'subsample': 0.8031450486786944,
    'colsample_bytree': 0.170759104940733,
    'verbosity': 0,
}
if GPU:
    xgb_params.update({
        'tree_method': 'gpu_hist',
        'gpu_id': 0,
        'predictor': 'gpu_predictor',
    })
else:
    xgb_params['n_jobs'] = 4

test_pred_all = None
valid_pred_all = {}
all_rmse = []

for i in tqdm(range(n_folds)):
    # Fold i is held out for validation; all other folds are used for training.
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]
    test = test_df.copy()

    train, valid, test = ordinal_enc(train, valid, test, low_cardinality_cat_feat)
    train, valid, test = ordinal_enc(train, valid, test, high_cardinality_cat_feat)

    valid_ids = valid.id.values.tolist()

    cols = list(train.columns)
    features = [feat for feat in cols if feat not in not_features+target]

    # Each fold's model is seeded with the fold index.
    xgb_params['random_state'] = i
    reg = XGBRegressor(**xgb_params)
    # Early stopping monitors RMSE on the held-out fold.
    reg.fit(train[features].values, train[target].values,
            eval_set = [(valid[features].values, valid[target].values)],
            eval_metric = 'rmse',
            early_stopping_rounds = 300,
            verbose=False)

    valid_pred = reg.predict(valid[features].values)
    valid_pred_all.update(dict(zip(valid_ids, valid_pred)))
    test_pred = reg.predict(test[features].values)
    # RMSE as sqrt(MSE): same value for a single target, and it does not rely
    # on the `squared=` keyword that newer scikit-learn releases reject.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Sum the per-fold test predictions; they are averaged after the loop.
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions keyed by row id.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_4']
valid_pred_all.to_csv('train_pred_4.csv', index=False)

sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_4'] = test_pred_all
sub_2.to_csv('test_pred_4.csv', index=False)

## 5. Linear Regression

In [None]:
# L0 model 5: plain linear regression on ordinal-encoded categoricals.
# Writes out-of-fold predictions to train_pred_5.csv and fold-averaged test
# predictions to test_pred_5.csv.
n_folds = train_df['kfold'].nunique()

test_pred_all = None
valid_pred_all = {}
all_rmse = []

for i in tqdm(range(n_folds)):
    # Fold i is held out for validation; all other folds are used for training.
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]
    test = test_df.copy()

    train, valid, test = ordinal_enc(train, valid, test, low_cardinality_cat_feat)
    train, valid, test = ordinal_enc(train, valid, test, high_cardinality_cat_feat)

    valid_ids = valid.id.values.tolist()

    cols = list(train.columns)
    features = [feat for feat in cols if feat not in not_features+target]

    reg = LinearRegression()
    reg.fit(train[features].values, train[target].values)

    valid_pred = reg.predict(valid[features].values)
    valid_pred_all.update(dict(zip(valid_ids, valid_pred)))
    test_pred = reg.predict(test[features].values)
    # RMSE as sqrt(MSE): same value for a single target, and it does not rely
    # on the `squared=` keyword that newer scikit-learn releases reject.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Sum the per-fold test predictions; they are averaged after the loop.
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions keyed by row id.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_5']
valid_pred_all.to_csv('train_pred_5.csv', index=False)

sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_5'] = test_pred_all
sub_2.to_csv('test_pred_5.csv', index=False)

## 6. Random Forest

In [None]:
# L0 model 6: random forest on ordinal-encoded categoricals. Writes
# out-of-fold predictions to train_pred_6.csv and fold-averaged test
# predictions to test_pred_6.csv.
n_folds = train_df['kfold'].nunique()

test_pred_all = None
valid_pred_all = {}
all_rmse = []

for i in tqdm(range(n_folds)):
    # Fold i is held out for validation; all other folds are used for training.
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]
    test = test_df.copy()

    train, valid, test = ordinal_enc(train, valid, test, low_cardinality_cat_feat)
    train, valid, test = ordinal_enc(train, valid, test, high_cardinality_cat_feat)

    valid_ids = valid.id.values.tolist()

    cols = list(train.columns)
    features = [feat for feat in cols if feat not in not_features+target]

    reg = RandomForestRegressor(n_estimators=500, n_jobs=4)
    reg.fit(train[features].values, train[target].values)

    valid_pred = reg.predict(valid[features].values)
    valid_pred_all.update(dict(zip(valid_ids, valid_pred)))
    test_pred = reg.predict(test[features].values)
    # RMSE as sqrt(MSE): same value for a single target, and it does not rely
    # on the `squared=` keyword that newer scikit-learn releases reject.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Sum the per-fold test predictions; they are averaged after the loop.
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')
    # Drop the fitted forest before the next fold is trained.
    del reg
    gc.collect()

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions keyed by row id.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_6']
valid_pred_all.to_csv('train_pred_6.csv', index=False)

sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_6'] = test_pred_all
sub_2.to_csv('test_pred_6.csv', index=False)

## 7. ElasticNet

In [None]:
# L0 model 7: ElasticNet on ordinal-encoded categoricals. Writes out-of-fold
# predictions to train_pred_7.csv and fold-averaged test predictions to
# test_pred_7.csv.
n_folds = train_df['kfold'].nunique()

test_pred_all = None
valid_pred_all = {}
all_rmse = []

for i in tqdm(range(n_folds)):
    # Fold i is held out for validation; all other folds are used for training.
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]
    test = test_df.copy()

    train, valid, test = ordinal_enc(train, valid, test, low_cardinality_cat_feat)
    train, valid, test = ordinal_enc(train, valid, test, high_cardinality_cat_feat)

    valid_ids = valid.id.values.tolist()

    cols = list(train.columns)
    features = [feat for feat in cols if feat not in not_features+target]

    reg = ElasticNet(random_state=RANDOM_SEED)
    reg.fit(train[features].values, train[target].values)

    valid_pred = reg.predict(valid[features].values)
    valid_pred_all.update(dict(zip(valid_ids, valid_pred)))
    test_pred = reg.predict(test[features].values)
    # RMSE as sqrt(MSE): same value for a single target, and it does not rely
    # on the `squared=` keyword that newer scikit-learn releases reject.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Sum the per-fold test predictions; they are averaged after the loop.
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions keyed by row id.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_7']
valid_pred_all.to_csv('train_pred_7.csv', index=False)

sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_7'] = test_pred_all
sub_2.to_csv('test_pred_7.csv', index=False)

## 8. XGBoost 4 (Public Params)

Parameters taken from [this notebook](https://www.kaggle.com/aditidutta/tutorial-30days-rf-xgb-lgbm-catboost-eda).

In [None]:
# L0 model 8: XGBoost (public parameters) on ordinal-encoded categoricals.
# Writes out-of-fold predictions to train_pred_8.csv and fold-averaged test
# predictions to test_pred_8.csv.
n_folds = train_df['kfold'].nunique()

# Loop-invariant hyperparameters, built once. Each key appears exactly once:
# a dict literal keeps only the last value of a repeated key, so the
# regularisation values below are the ones that were actually in effect.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 10000,
    'learning_rate': 0.036,
    'subsample': 0.926,
    'colsample_bytree': 0.118,
    'grow_policy':'lossguide',
    'max_depth': 3,
    'booster': 'gbtree',
    'random_state': 42,
    'reg_lambda': 0.00087,
    'reg_alpha': 23.132,
    'n_jobs': 4
}

test_pred_all = None
valid_pred_all = {}
all_rmse = []

for i in tqdm(range(n_folds)):
    # Fold i is held out for validation; all other folds are used for training.
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]
    test = test_df.copy()

    train, valid, test = ordinal_enc(train, valid, test, low_cardinality_cat_feat)
    train, valid, test = ordinal_enc(train, valid, test, high_cardinality_cat_feat)

    valid_ids = valid.id.values.tolist()

    cols = list(train.columns)
    features = [feat for feat in cols if feat not in not_features+target]

    # Trained for the full n_estimators: no eval set / early stopping here.
    reg = XGBRegressor(**xgb_params)
    reg.fit(train[features].values, train[target].values)

    valid_pred = reg.predict(valid[features].values)
    valid_pred_all.update(dict(zip(valid_ids, valid_pred)))
    test_pred = reg.predict(test[features].values)
    # RMSE as sqrt(MSE): same value for a single target, and it does not rely
    # on the `squared=` keyword that newer scikit-learn releases reject.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Sum the per-fold test predictions; they are averaged after the loop.
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions keyed by row id.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_8']
valid_pred_all.to_csv('train_pred_8.csv', index=False)

sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_8'] = test_pred_all
sub_2.to_csv('test_pred_8.csv', index=False)

## 9. XGBoost 5 (Public Params)

Parameters taken from [this notebook](https://www.kaggle.com/aditidutta/tutorial-30days-rf-xgb-lgbm-catboost-eda).

In [None]:
# L0 model 9: XGBoost (public parameters) on ordinal-encoded categoricals.
# Writes out-of-fold predictions to train_pred_9.csv and fold-averaged test
# predictions to test_pred_9.csv.
n_folds = train_df['kfold'].nunique()

# Loop-invariant hyperparameters, built once.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 5000,
    'learning_rate': 0.12,
    'subsample': 0.96,
    'colsample_bytree': 0.12,
    'max_depth': 2,
    'booster': 'gbtree',
    'reg_lambda': 65.1,
    'reg_alpha': 15.9,
    'random_state':40
}

test_pred_all = None
valid_pred_all = {}
all_rmse = []

for i in tqdm(range(n_folds)):
    # Fold i is held out for validation; all other folds are used for training.
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]
    test = test_df.copy()

    train, valid, test = ordinal_enc(train, valid, test, low_cardinality_cat_feat)
    train, valid, test = ordinal_enc(train, valid, test, high_cardinality_cat_feat)

    valid_ids = valid.id.values.tolist()

    cols = list(train.columns)
    features = [feat for feat in cols if feat not in not_features+target]

    # Trained for the full n_estimators: no eval set / early stopping here.
    reg = XGBRegressor(**xgb_params)
    reg.fit(train[features].values, train[target].values)

    valid_pred = reg.predict(valid[features].values)
    valid_pred_all.update(dict(zip(valid_ids, valid_pred)))
    test_pred = reg.predict(test[features].values)
    # RMSE as sqrt(MSE): same value for a single target, and it does not rely
    # on the `squared=` keyword that newer scikit-learn releases reject.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Sum the per-fold test predictions; they are averaged after the loop.
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions keyed by row id.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_9']
valid_pred_all.to_csv('train_pred_9.csv', index=False)

sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_9'] = test_pred_all
sub_2.to_csv('test_pred_9.csv', index=False)

## 10. XGBoost 6 (Public Params)

Parameters taken from [this notebook](https://www.kaggle.com/aditidutta/tutorial-30days-rf-xgb-lgbm-catboost-eda).

In [None]:
# L0 model 10: XGBoost (public parameters) on ordinal-encoded categoricals.
# Writes out-of-fold predictions to train_pred_10.csv and fold-averaged test
# predictions to test_pred_10.csv.
n_folds = train_df['kfold'].nunique()

# Loop-invariant hyperparameters, built once.
xgb_params = {
    'random_state': 1,
    'n_jobs': 4,
    'booster': 'gbtree',
    'n_estimators': 10000,
    'learning_rate': 0.0362,
    'reg_lambda': 0.000874,
    'reg_alpha': 23.131,
    'subsample': 0.787,
    'colsample_bytree': 0.118,
    'max_depth': 3
}

test_pred_all = None
valid_pred_all = {}
all_rmse = []

for i in tqdm(range(n_folds)):
    # Fold i is held out for validation; all other folds are used for training.
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]
    test = test_df.copy()

    train, valid, test = ordinal_enc(train, valid, test, low_cardinality_cat_feat)
    train, valid, test = ordinal_enc(train, valid, test, high_cardinality_cat_feat)

    valid_ids = valid.id.values.tolist()

    cols = list(train.columns)
    features = [feat for feat in cols if feat not in not_features+target]

    # Trained for the full n_estimators: no eval set / early stopping here.
    reg = XGBRegressor(**xgb_params)
    reg.fit(train[features].values, train[target].values)

    valid_pred = reg.predict(valid[features].values)
    valid_pred_all.update(dict(zip(valid_ids, valid_pred)))
    test_pred = reg.predict(test[features].values)
    # RMSE as sqrt(MSE): same value for a single target, and it does not rely
    # on the `squared=` keyword that newer scikit-learn releases reject.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Sum the per-fold test predictions; they are averaged after the loop.
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions keyed by row id.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_10']
valid_pred_all.to_csv('train_pred_10.csv', index=False)

sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_10'] = test_pred_all
sub_2.to_csv('test_pred_10.csv', index=False)

## 11. LGBM (Public Params)

Parameters taken from [this notebook](https://www.kaggle.com/sumukhar/light-gbm-hyperparameters-tuned).

In [None]:
# L0 model 11: LightGBM (public parameters) on one-hot (low-cardinality) +
# CatBoost-encoded (high-cardinality) categoricals. Writes out-of-fold
# predictions to train_pred_11.csv and fold-averaged test predictions to
# test_pred_11.csv.
n_folds = train_df['kfold'].nunique()

# Loop-invariant hyperparameters, built once.
lgbm_params = {
    'max_depth': 5,
    'learning_rate': 0.05,
    'metric': 'rmse',
    'n_jobs': 4,
    'n_estimators': 10000,
    'reg_alpha': 17,
    'reg_lambda': 21,
    'colsample_bytree': 0.225,
    'subsample': 0.75,
    'num_leaves': 64,
    'min_child_samples': 15,
    'max_bin': 250
}

test_pred_all = None
valid_pred_all = {}
all_rmse = []

for i in tqdm(range(n_folds)):
    # Fold i is held out for validation; all other folds are used for training.
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]
    test = test_df.copy()

    train, valid, test = one_hot_enc(train, valid, test, low_cardinality_cat_feat)
    train, valid, test = catboost_enc(train, valid, test, high_cardinality_cat_feat)

    valid_ids = valid.id.values.tolist()

    # Encoding adds/renames columns, so the feature list is rebuilt per fold.
    cols = list(train.columns)
    features = [feat for feat in cols if feat not in not_features+target]

    reg = LGBMRegressor(**lgbm_params)
    # Early stopping is evaluated on the held-out fold.
    reg.fit(train[features].values, train[target].values, early_stopping_rounds=75,
            eval_set = [(valid[features].values, valid[target].values)], verbose=False)

    valid_pred = reg.predict(valid[features].values)
    valid_pred_all.update(dict(zip(valid_ids, valid_pred)))
    test_pred = reg.predict(test[features].values)
    # RMSE as sqrt(MSE): same value for a single target, and it does not rely
    # on the `squared=` keyword that newer scikit-learn releases reject.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Sum the per-fold test predictions; they are averaged after the loop.
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions keyed by row id.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_11']
valid_pred_all.to_csv('train_pred_11.csv', index=False)

sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_11'] = test_pred_all
sub_2.to_csv('test_pred_11.csv', index=False)

# Blending (L1 Models)

In [None]:
# Re-read the raw frames so the blending stage starts from the files on disk.
train_df, test_df, sub_df = map(
    pd.read_csv,
    (train_file_path, test_file_path, sample_sub_file_path),
)

In [None]:
# Column names of the training frame before any L0 predictions are merged in.
prev_features = train_df.columns.tolist()

In [None]:
# Out-of-fold predictions of the eleven L0 models, one frame per model.
(df1, df2, df3, df4, df5, df6,
 df7, df8, df9, df10, df11) = (
    pd.read_csv(f'train_pred_{model_no}.csv') for model_no in range(1, 12)
)

# Fold-averaged test predictions of the same eleven models.
(df_test1, df_test2, df_test3, df_test4, df_test5, df_test6,
 df_test7, df_test8, df_test9, df_test10, df_test11) = (
    pd.read_csv(f'test_pred_{model_no}.csv') for model_no in range(1, 12)
)

In [None]:
# Attach every model's out-of-fold prediction column to the training frame,
# matching rows on `id`.
for oof_preds in (df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11):
    train_df = train_df.merge(oof_preds, on='id', how='left')

# Same for the test-set predictions.
for test_preds in (df_test1, df_test2, df_test3, df_test4, df_test5, df_test6,
                   df_test7, df_test8, df_test9, df_test10, df_test11):
    test_df = test_df.merge(test_preds, on='id', how='left')

In [None]:
# The L1 models are trained only on columns whose name starts with 'pred'.
cols = train_df.columns.tolist()
blend_features = list(filter(lambda name: str(name).startswith('pred'), cols))
print(blend_features)

## 1. Linear Regression

In [None]:
# L1 model 1: linear regression fitted on the L0 prediction columns
# (`blend_features`). Writes out-of-fold predictions to L1_train_pred_1.csv
# and fold-averaged test predictions to L1_test_pred_1.csv.
n_folds = train_df['kfold'].nunique()

test_pred_all = None
valid_pred_all = {}
all_rmse = []

for i in tqdm(range(n_folds)):
    # Fold i is held out for validation; all other folds are used for training.
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]
    test = test_df.copy()

    valid_ids = valid.id.values.tolist()

    model = LinearRegression()
    model.fit(train[blend_features].values, train[target].values)

    valid_pred = model.predict(valid[blend_features].values)
    valid_pred_all.update(dict(zip(valid_ids, valid_pred)))
    test_pred = model.predict(test[blend_features].values)
    # RMSE as sqrt(MSE): same value for a single target, and it does not rely
    # on the `squared=` keyword that newer scikit-learn releases reject.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Sum the per-fold test predictions; they are averaged after the loop.
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions keyed by row id.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_L1_1']
valid_pred_all.to_csv('L1_train_pred_1.csv', index=False)

sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_L1_1'] = test_pred_all
sub_2.to_csv('L1_test_pred_1.csv', index=False)

## 2. Random Forest

In [None]:
# L1 model 2: RandomForestRegressor on the 'pred*' columns (blend_features).
# For every fold: fit on the other folds, predict the held-out fold (stored as
# out-of-fold predictions keyed by id) and predict the test set. Test
# predictions are averaged over folds and both outputs are written to CSV.
test_pred_all = None
valid_pred_all = {}
all_rmse = []

# Loop invariants: the fold count and the test feature matrix are the same for
# every fold, so build them once instead of copying test_df per iteration.
n_folds = train_df['kfold'].nunique()
test_features = test_df[blend_features].values

for i in tqdm(range(n_folds)):
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]

    valid_ids = valid['id'].tolist()

    model = RandomForestRegressor(n_estimators=500, n_jobs=4, max_depth=3)
    model.fit(train[blend_features].values, train[target].values)

    valid_pred = model.predict(valid[blend_features].values)
    valid_pred_all.update(zip(valid_ids, valid_pred))
    test_pred = model.predict(test_features)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` argument is deprecated and later removed in newer
    # scikit-learn releases, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Running sum of the per-fold test predictions (divided by n_folds below).
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions: one row per training id, column 'pred_L1_2'.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_L1_2']
valid_pred_all.to_csv('L1_train_pred_2.csv', index=False)

# Fold-averaged test predictions under the same column name.
sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_L1_2'] = test_pred_all
sub_2.to_csv('L1_test_pred_2.csv', index=False)

## 3. Gradient Boosting Regressor

In [None]:
# L1 model 3: GradientBoostingRegressor on the 'pred*' columns
# (blend_features). For every fold: fit on the other folds, predict the
# held-out fold (stored as out-of-fold predictions keyed by id) and predict the
# test set. Test predictions are averaged over folds and both outputs are
# written to CSV.
test_pred_all = None
valid_pred_all = {}
all_rmse = []

# Loop invariants: the fold count and the test feature matrix are the same for
# every fold, so build them once instead of copying test_df per iteration.
n_folds = train_df['kfold'].nunique()
test_features = test_df[blend_features].values

for i in tqdm(range(n_folds)):
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]

    valid_ids = valid['id'].tolist()

    model = GradientBoostingRegressor(n_estimators=500, max_depth=3)
    model.fit(train[blend_features].values, train[target].values)

    valid_pred = model.predict(valid[blend_features].values)
    valid_pred_all.update(zip(valid_ids, valid_pred))
    test_pred = model.predict(test_features)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` argument is deprecated and later removed in newer
    # scikit-learn releases, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Running sum of the per-fold test predictions (divided by n_folds below).
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions: one row per training id, column 'pred_L1_3'.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_L1_3']
valid_pred_all.to_csv('L1_train_pred_3.csv', index=False)

# Fold-averaged test predictions under the same column name.
sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_L1_3'] = test_pred_all
sub_2.to_csv('L1_test_pred_3.csv', index=False)

## 4. XGBoost

In [None]:
# L1 model 4: XGBRegressor on the 'pred*' columns (blend_features).
# For every fold: fit on the other folds with early stopping, predict the
# held-out fold (stored as out-of-fold predictions keyed by id) and predict the
# test set. Test predictions are averaged over folds and both outputs are
# written to CSV.
test_pred_all = None
valid_pred_all = {}
all_rmse = []

# Loop invariants: the fold count and the test feature matrix are the same for
# every fold, so build them once instead of copying test_df per iteration.
n_folds = train_df['kfold'].nunique()
test_features = test_df[blend_features].values

for i in tqdm(range(n_folds)):
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]

    valid_ids = valid['id'].tolist()

    model = XGBRegressor(
        random_state = 1,
        booster = 'gbtree',
        n_estimators = 7000,
        learning_rate = 0.03,
        max_depth = 2,
        n_jobs = 4
    )
    # Early stopping monitors the same held-out fold that is scored below, so
    # the reported fold RMSE is measured on data used to pick the stopping
    # round.
    # NOTE(review): recent XGBoost releases deprecate passing `eval_metric` and
    # `early_stopping_rounds` to fit() in favour of constructor arguments.
    # Left unchanged because older releases only honour them here - confirm
    # against the installed version.
    model.fit(train[blend_features].values, train[target].values,
              eval_set = [(valid[blend_features].values, valid[target].values)],
              eval_metric = 'rmse',
              early_stopping_rounds = 300,
              verbose=False)

    valid_pred = model.predict(valid[blend_features].values)
    valid_pred_all.update(zip(valid_ids, valid_pred))
    test_pred = model.predict(test_features)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` argument is deprecated and later removed in newer
    # scikit-learn releases, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Running sum of the per-fold test predictions (divided by n_folds below).
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Out-of-fold predictions: one row per training id, column 'pred_L1_4'.
valid_pred_all = pd.DataFrame.from_dict(valid_pred_all, orient='index').reset_index()
valid_pred_all.columns = ['id', 'pred_L1_4']
valid_pred_all.to_csv('L1_train_pred_4.csv', index=False)

# Fold-averaged test predictions under the same column name.
sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['pred_L1_4'] = test_pred_all
sub_2.to_csv('L1_test_pred_4.csv', index=False)

# Stacking (L2 Model)

In [None]:
# Re-read the train, test and sample-submission CSVs from their configured
# paths. This replaces the train_df/test_df that had prediction columns
# merged into them earlier.
train_df, test_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_file_path, test_file_path, sample_sub_file_path)
)

In [None]:
# Snapshot of the training column names as loaded, taken before the L1
# prediction columns are merged in below.
prev_features = [*train_df.columns]

In [None]:
# Out-of-fold training predictions saved by the four L1 models above.
df1, df2, df3, df4 = map(pd.read_csv, (
    'L1_train_pred_1.csv',
    'L1_train_pred_2.csv',
    'L1_train_pred_3.csv',
    'L1_train_pred_4.csv',
))

# Matching fold-averaged test predictions from the same four models.
df_test1, df_test2, df_test3, df_test4 = map(pd.read_csv, (
    'L1_test_pred_1.csv',
    'L1_test_pred_2.csv',
    'L1_test_pred_3.csv',
    'L1_test_pred_4.csv',
))

In [None]:
# Left-join the four L1 prediction frames onto train_df by 'id', in order
# df1 .. df4, so every training row is kept and gains a 'pred_L1_*' column.
for _train_preds in (df1, df2, df3, df4):
    train_df = train_df.merge(_train_preds, on='id', how='left')

# Same for the test frame with df_test1 .. df_test4.
for _test_preds in (df_test1, df_test2, df_test3, df_test4):
    test_df = test_df.merge(_test_preds, on='id', how='left')

In [None]:
# Pick out the columns whose name starts with 'pred'; these are the feature
# columns fed to the L2 models below.
cols = [*train_df.columns]
stack_features = [name for name in cols if str(name).startswith('pred')]
print(stack_features)

In [None]:
# L2 stacker 1: LinearRegression on the 'pred*' columns (stack_features).
# For every fold: fit on the other folds, score on the held-out fold and
# predict the test set. The fold-averaged test prediction is written out as a
# submission file.
test_pred_all = None
all_rmse = []

# Loop invariants: the fold count and the test feature matrix are the same for
# every fold, so build them once instead of copying test_df per iteration.
n_folds = train_df['kfold'].nunique()
test_features = test_df[stack_features].values

for i in tqdm(range(n_folds)):
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]

    model = LinearRegression()
    model.fit(train[stack_features].values, train[target].values)

    valid_pred = model.predict(valid[stack_features].values)
    test_pred = model.predict(test_features)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` argument is deprecated and later removed in newer
    # scikit-learn releases, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Running sum of the per-fold test predictions (divided by n_folds below).
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    # Only the first entry of model.coef_ is shown in the fold summary.
    print(f'Fold {i+1} RMSE: {round(rmse, 4)} Coeff: {model.coef_[0]}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Submission frame: test ids plus the fold-averaged prediction as 'target'.
sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['target'] = test_pred_all
sub_2.to_csv('Stacked_Submission_1.csv', index=False)

In [None]:
# L2 stacker 2: RandomForestRegressor on the 'pred*' columns (stack_features).
# For every fold: fit on the other folds, score on the held-out fold and
# predict the test set. The fold-averaged test prediction is written out as a
# submission file.
test_pred_all = None
all_rmse = []

# Loop invariants: the fold count and the test feature matrix are the same for
# every fold, so build them once instead of copying test_df per iteration.
n_folds = train_df['kfold'].nunique()
test_features = test_df[stack_features].values

for i in tqdm(range(n_folds)):
    train = train_df[train_df['kfold'] != i]
    valid = train_df[train_df['kfold'] == i]

    model = RandomForestRegressor(n_estimators=600, n_jobs=4, max_depth=5)
    model.fit(train[stack_features].values, train[target].values)

    valid_pred = model.predict(valid[stack_features].values)
    test_pred = model.predict(test_features)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` argument is deprecated and later removed in newer
    # scikit-learn releases, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(valid[target].values, valid_pred))
    all_rmse.append(rmse)

    # Running sum of the per-fold test predictions (divided by n_folds below).
    if test_pred_all is None:
        test_pred_all = test_pred
    else:
        test_pred_all += test_pred

    print(f'Fold {i+1} RMSE: {round(rmse, 4)}')

print('')
print(f'Average RMSE: {round(np.mean(all_rmse), 4)} Std: {round(np.std(all_rmse), 4)}')
test_pred_all /= n_folds

# Submission frame: test ids plus the fold-averaged prediction as 'target'.
sub_2 = pd.DataFrame()
sub_2['id'] = test_df['id']
sub_2['target'] = test_pred_all
sub_2.to_csv('Stacked_Submission_2.csv', index=False)