# Introduction
This notebook was prepared for the [Aug 2021 Playground Series competition](https://www.kaggle.com/c/tabular-playground-series-aug-2021). Questions, comments, and feedback are welcome!

## Overview
1. [Quick EDA](#1)
2. [Hyperparameter tuning with RandomizedSearchCV](#2)
3. [Stacking Ensemble using OOF predictions](#3)
    * XGB & LGBM regressors (base), Linear regression (meta)

In [None]:
# Numerical / tabular libraries
import numpy as np 
import pandas as pd

# Plotting; the two rcParams below turn off the top and right axes spines
# for every figure created afterwards
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
rcParams["axes.spines.top"] = False
rcParams["axes.spines.right"] = False

# scikit-learn: scaling, cross-validation / search utilities, metric, meta-model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Gradient-boosting base models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Silence all warnings for the rest of the notebook
import warnings
warnings.filterwarnings("ignore")

# Shared random seed: passed to the KFold splitters and the LightGBM models below
seed=3717


<a id="1"></a> <br>
### 1. EDA

In [None]:
# Read the competition CSVs, using the "id" column as the row index.
train_raw = pd.read_csv(
    "../input/tabular-playground-series-aug-2021/train.csv",
    index_col="id",
)
test_raw = pd.read_csv(
    "../input/tabular-playground-series-aug-2021/test.csv",
    index_col="id",
)

# Report (rows, columns) for each split.
print(f'Train shape: {train_raw.shape}')
print(f'Test shape: {test_raw.shape}')

In [None]:
# Combine train and test into one frame for joint pre-processing / EDA.
# The boundary ids are read from the frames themselves instead of being
# hard-coded (previously 249999 / 250000), so re-splitting train/test later
# keeps working if the input files change size.
train_end_idx = train_raw.index[-1]   # last id label of the training rows
test_start_idx = test_raw.index[0]    # first id label of the test rows
all_raw = pd.concat([train_raw, test_raw])
all_raw.head(3)

In [None]:
# Count missing cells across every column of each split.
train_null_total = train_raw.isnull().sum().sum()
test_null_total = test_raw.isnull().sum().sum()
print('Train data null count:', train_null_total)
print('Test data null count:', test_null_total)

In [None]:
# check feature datatypes: show the distinct column dtypes present in the
# training frame (the target column is included)
train_raw.dtypes.unique()

In [None]:
# Summary statistics with one row per feature (target excluded);
# display a random sample of 10 of those rows.
feature_summary = all_raw.drop(columns='loss').describe().T
feature_summary.sample(n=10)

In [None]:
# Histogram of every feature column (all columns up to and including 'f99'),
# drawn on a 20 x 5 grid of axes.
fig, axs = plt.subplots(nrows=20, ncols=5, figsize=(12, 40))
fig.suptitle('Feature Distributions')
feature_frame = all_raw.loc[:, :'f99']
for i, feat in enumerate(feature_frame):
    ax = axs.flat[i]
    sns.histplot(all_raw[feat], kde=False, ax=ax)
    # Hide the y axis and its spine on each panel.
    ax.get_yaxis().set_visible(False)
    ax.spines['left'].set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Kernel density estimate of the training target.
# `fill=True` replaces the deprecated `shade=True` keyword; it is available in
# the same seaborn release that introduced `histplot`, which this notebook
# already uses.
sns.kdeplot(train_raw['loss'], fill=True)
plt.show()

In [None]:
# Separate the target from the training features.
X = train_raw.drop(columns='loss')
y = train_raw['loss'].copy()

# Standardise the features: the scaler is fitted on the training features
# only and the same transform is then applied to the test features.
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(test_raw.copy())

<a id="2"></a> <br>
### 2. Hyperparameter Tuning

In [None]:
# Randomized hyperparameter search for the LightGBM regressor.
# `test_params` lists the candidate values; RandomizedSearchCV samples
# `n_iter` combinations from them and scores each by negative MSE.
test_params = {
    'num_iterations': [300,700,1000],
    'max_depth': [5,50,100],
    'learning_rate': [0.05, 0.1, 0.2],
    'reg_alpha': [1,10,20],
    'reg_lambda': [0,0.5,2,10]
}
lgbr = LGBMRegressor(device_type='gpu', seed=seed)
search_results = RandomizedSearchCV(
    estimator=lgbr,
    param_distributions=test_params,
    scoring='neg_mean_squared_error',
    n_iter=10,
    # Seed the candidate sampling so a re-run tries the same combinations.
    random_state=seed,
    verbose=1,
)
# The fit is left disabled (presumably because it is slow); uncomment to re-run.
#search_results.fit(X, y)

In [None]:
#print("Best parameters:", search_results.best_params_)
#print("Lowest RMSE: ", (-search_results.best_score_)**(1/2.0))

In [None]:
# best params are from randomized search
# Keyword arguments unpacked into XGBRegressor(...) by get_models()
best_xgb_params = {
    'n_estimators':1000,
    'max_depth':5,
    'learning_rate':0.05,
    'colsample_bytree':0.7,
    'min_child_weight':7,
    'alpha':0.5,
    'lambda':1.5
}

# Keyword arguments unpacked into LGBMRegressor(...) by get_models();
# every value is one of the candidates listed in `test_params`
best_lgb_params = {
    'reg_lambda': 0.5,
    'reg_alpha': 10,
    'num_iterations': 700,
    'max_depth': 5,
    'learning_rate': 0.05
}

In [None]:
# Build the base regressors used by the evaluation and stacking cells.
def get_models():
    """Return a dict of newly constructed, unfitted base regressors.

    Keys are 'xgb' (XGBRegressor configured from ``best_xgb_params``) and
    'lgb' (LGBMRegressor configured from ``best_lgb_params`` and ``seed``).
    """
    return {
        'xgb': XGBRegressor(tree_method='gpu_hist', **best_xgb_params),
        'lgb': LGBMRegressor(device_type='gpu', seed=seed, **best_lgb_params),
    }
 
# Cross-validated RMSE for a single model (5 folds unless told otherwise).
def cv_rmse(model, X, y, folds=5):
    """Return the per-fold RMSE of ``model`` under shuffled K-fold CV.

    Args:
        model: estimator handed to ``cross_val_score``.
        X: feature matrix.
        y: target values.
        folds: number of K-fold splits.

    Returns:
        Array with one RMSE value per fold.
    """
    splitter = KFold(n_splits=folds, shuffle=True, random_state=seed)
    neg_mse = cross_val_score(
        model, X, y, scoring="neg_mean_squared_error", cv=splitter,
    )
    # The scorer reports negated MSE, so flip the sign before the square root.
    return np.sqrt(-neg_mse)

# use cv_rmse to evaluate models and plot RMSE
def evaluate_models(folds=5):
    """Cross-validate every model from ``get_models()`` and box-plot the RMSEs.

    Reads the module-level ``X`` and ``y``. For each model it prints the mean
    and standard deviation of the per-fold RMSE, then shows one box per model.

    Args:
        folds: number of cross-validation folds passed through to ``cv_rmse``.
    """
    # get the models to evaluate
    models = get_models()

    # evaluate the models and store results
    results, names = [], []
    for name, model in models.items():
        scores = cv_rmse(model, X, y, folds=folds)
        results.append(scores)
        names.append(name)
        # Metric name was previously misspelled "RSME" in this message.
        print(f'Model: {name}, Mean RMSE: {np.mean(scores):.4f}, Std: {np.std(scores):.4f}')

    plt.boxplot(results, showmeans=True, labels=names)
    plt.show()
evaluate_models()

<a id="3"></a> <br>
### 3. Stacking Ensemble

In [None]:
# create 2 level stack and use oof predictions
# base estimators: xgb and lgb regressors
# meta estimator: linear regression
# base-model test predictions are averaged over the folds
def stacking_ensemble(n_splits=5):
    """Train a two-level stacking ensemble and return its test predictions.

    Level 1: the XGB and LGBM regressors from ``get_models()`` are fitted on
    each training fold; their predictions on the held-out fold are stored as
    out-of-fold (OOF) features, and their predictions on ``X_test`` are
    averaged over the folds.

    Level 2: a linear regression is fitted on the full OOF feature matrix and
    applied to the averaged base-model test predictions.

    The reported "Final LR RMSE" for a fold comes from a meta-model fitted on
    the OOF features of the *other* folds only, so it is a held-out score
    comparable to the base-model RMSEs (fitting and scoring the meta-model on
    the same fold would give an in-sample figure).

    Reads the module-level ``X``, ``y``, ``X_test`` and ``seed``.

    Args:
        n_splits: number of K-fold splits.

    Returns:
        1-D array of predictions for the rows of ``X_test``.
    """
    # Work on plain arrays so fold indices are always positional: ``y`` is a
    # pandas Series indexed by "id", where ``y[idx]`` would look up labels.
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)
    X_test_arr = np.asarray(X_test)

    models = get_models()
    base_estimators = [('XGB', models['xgb']), ('LGBM', models['lgb'])]
    base_rmse = {label: [] for label, _ in base_estimators}
    lr_rmse = []

    # One column per base model.
    oof_preds = np.zeros((len(X_arr), len(base_estimators)))
    test_base_preds = np.zeros((len(X_test_arr), len(base_estimators)))

    kfolds = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # Materialise the splits so the meta-model evaluation reuses the same folds.
    splits = list(kfolds.split(X_arr, y_arr))

    print(f'Stacking starting, {n_splits} total folds...')

    for fold, (train_idx, valid_idx) in enumerate(splits):

        # split train/validate
        X_train, y_train = X_arr[train_idx], y_arr[train_idx]
        X_valid, y_valid = X_arr[valid_idx], y_arr[valid_idx]

        # base models - fit, predict, and score
        for col, (label, estimator) in enumerate(base_estimators):
            estimator.fit(X_train, y_train)
            valid_preds = estimator.predict(X_valid)
            oof_preds[valid_idx, col] = valid_preds
            test_base_preds[:, col] += estimator.predict(X_test_arr) / n_splits

            # sqrt of MSE instead of `squared=False`, which newer
            # scikit-learn releases no longer accept.
            fold_rmse = float(np.sqrt(mean_squared_error(y_valid, valid_preds)))
            base_rmse[label].append(fold_rmse)
            print(f'Fold {fold+1}, Base {label} RMSE: {fold_rmse:.5f}')

    # Held-out score of the linear regression (meta) model, fold by fold.
    for fold, (train_idx, valid_idx) in enumerate(splits):
        fold_meta = LinearRegression()
        fold_meta.fit(oof_preds[train_idx], y_arr[train_idx])
        fold_meta_preds = fold_meta.predict(oof_preds[valid_idx])
        final_estimator_rmse = float(
            np.sqrt(mean_squared_error(y_arr[valid_idx], fold_meta_preds))
        )
        lr_rmse.append(final_estimator_rmse)
        print(f'Fold {fold+1}, Final LR RMSE: {final_estimator_rmse:.5f}')

    # get average RMSEs across all folds for all models
    xgb_rmse_avg = np.mean(base_rmse['XGB'])
    lgb_rmse_avg = np.mean(base_rmse['LGBM'])
    lr_rmse_avg = np.mean(lr_rmse)

    print(f'Average RMSEs - Base XGB:{xgb_rmse_avg:.5f}, Base LGBM:{lgb_rmse_avg:.5f}, Final LR:{lr_rmse_avg:.5f}')

    # Final meta-model: trained on every OOF row, applied to the fold-averaged
    # base-model predictions for the test set.
    final_estimator = LinearRegression()
    final_estimator.fit(oof_preds, y_arr)
    return final_estimator.predict(test_base_preds)

stack_preds = stacking_ensemble()

In [None]:
# Write the submission CSV: the test ids alongside the stacked predictions.
submission_columns = {'id': test_raw.index, 'loss': stack_preds}
output = pd.DataFrame(submission_columns)
output.to_csv('my_submission.csv', index=False)
print("Submission saved.")

In [None]:
# Sanity check of the output: density of the predicted loss values.
# `fill=True` replaces the deprecated `shade=True` keyword; it is available in
# the same seaborn release that introduced `histplot`, which this notebook
# already uses.
sns.kdeplot(output['loss'], fill=True)
plt.show()