Hello everybody! I have learnt so much from so many different people on Kaggle that I thought it only right to give back to the community by sharing my model development process. This is my first notebook, so any comments and upvotes are really appreciated!

# **Importing the Libraries**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import optuna 
from optuna.visualization import plot_optimization_history
from optuna import Trial, visualization
from optuna.samplers import TPESampler

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from statistics import mean

from vecstack import StackingTransformer

import warnings
# Silence every Python warning for the rest of the notebook so cell output stays readable.
# NOTE(review): this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')


# Notebook-wide random seed; passed as `random_state` to the data splits, the models and the stacker.
RS = 69 # :)

# **Importing & Shaping the Data**

In [None]:
# Load the training set and discard the `id` column in one step
train_path = "../input/tabular-playground-series-jan-2021/train.csv"
df = pd.read_csv(train_path).drop(columns='id')
df

In [None]:
# Remove Outliers: order the rows by ascending target and discard the two smallest targets
df = df.sort_values(by='target').iloc[2:, :]

In [None]:
# Split into Feat & Targets: the last column is the target, every column before it is a feature
features, target = df.iloc[:, :-1], df.iloc[:, -1]
X, y = features.values, target.values
X.shape, y.shape

In [None]:
# Train-Test-Validation Split, 60-20-20
# Step 1: hold out 20% of all rows as the test set.
X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.2, random_state=RS)
# Step 2: 25% of the remaining 80% (20% of the total) becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.25, random_state=RS)
# Validation pair in the list-of-tuples form that `eval_set` takes when fitting.
evals = [(X_val, y_val)]

# **Baseline RMSE with XGBoost**

In [None]:
# The XGBRegressor is built using GPU for accelerated computing!
# Drop `gpu_settings` from the call below and it will default to CPU.
gpu_settings = {'predictor': 'gpu_predictor', 'tree_method': 'gpu_hist'}
model = XGBRegressor(eval_metric='rmse', verbosity=1, **gpu_settings)

In [None]:
# Train Model with early stopping to prevent overfitting:
# the validation set in `evals` is scored with RMSE and early stopping is set to 15 rounds
early_stopping = {'eval_set': evals, 'eval_metric': 'rmse', 'early_stopping_rounds': 15}
model.fit(X_train, y_train, **early_stopping)

In [None]:
# Predict on both held-out splits
y_valpred = model.predict(X_val)
y_pred = model.predict(X_test)

# Compute Metrics (squared=False turns the mean squared error into an RMSE)
val_rmse = mean_squared_error(y_val, y_valpred, squared=False)
test_rmse = mean_squared_error(y_test, y_pred, squared=False)

# As a general rule of thumb expect your submission RMSE to be slightly higher than mean
base_rmse = mean((val_rmse, test_rmse))
print("The mean RMSE of the base model is {}".format(base_rmse))

# **Hyperparameter Tuning using Optuna**

In [None]:
# Optuna iterates through this function

def objective(trial: Trial, X, y) -> float:
    """Train one XGBoost candidate and return its mean validation/test RMSE.

    Optuna calls this function once per trial; the returned value is the
    quantity the study minimises.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix.
        y: Target values aligned with ``X``.

    Returns:
        The arithmetic mean of the RMSE on the validation split and the RMSE
        on the test split.
    """
    # Split into Train-Test-Validation, 60, 20, 20.
    # The seed is fixed, so every trial is trained and scored on the same rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RS)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=RS)
    evals = [(X_val, y_val)]

    # Assign Parameter Dict
    param = {
        # Lower bound is 1: a bound of 0 lets a trial request a model with no boosting rounds.
        'n_estimators': trial.suggest_int('n_estimators', 1, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 25),
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_int('reg_lambda', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 0, 5),
        'gamma': trial.suggest_int('gamma', 0, 5),
        # Sampled on a log scale so small learning rates are explored as densely as large ones.
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.5),
        # Fraction of columns per tree, in steps of 0.01.
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.1, 1, 0.01),
    }

    # Build Model. The seed matches the one given to the final stacked models, so a
    # trial is scored under the same column-sampling seed it will later be used with.
    model = XGBRegressor(**param,
                         predictor='gpu_predictor',
                         tree_method='gpu_hist',
                         eval_metric='rmse',
                         random_state=RS,
                         verbosity=1)

    # Fit Model, monitoring RMSE on the validation split for early stopping
    model.fit(X_train, y_train, eval_set=evals, eval_metric='rmse', early_stopping_rounds=15)

    # Predict
    y_pred = model.predict(X_test)
    y_valpred = model.predict(X_val)

    # Compute Metrics
    test_rmse = mean_squared_error(y_test, y_pred, squared=False)
    val_rmse = mean_squared_error(y_val, y_valpred, squared=False)

    # NOTE(review): the test split contributes to the objective, so hyperparameters are
    # partly selected on it and a later RMSE on the same split is an optimistic estimate.
    return mean((val_rmse, test_rmse))

In [None]:
%%time
# To conserve computing time I have limited the trials here to 10, but I used 100 to develop my model

study = optuna.create_study(study_name='Kaggle_Tabular_Compeition',
                            direction='minimize',
                            sampler=TPESampler())

# Iterates through the function
study.optimize(lambda trial : objective(trial, X, y), n_trials= 10)

print('Best trial: RMSE {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

In [None]:
# Dataframe of study: export the recorded trials and preview the first five rows
hist = study.trials_dataframe()
hist.head(n=5)

In [None]:
# Plots a curve of how the mean RMSE changed through the 'epochs' (shoutout to all my ML nerds)
# Deselect Objective Value to see the curve more clearly!
visualization.plot_optimization_history(study)

In [None]:
# Report the lowest objective value found and the hyperparameters that produced it
best = study.best_trial
print('Best mean trial: RMSE {},\nparams {}'.format(best.value, best.params))

# **Stacking the best XGBoost models**

**Model Parameter Dictionaries**

In [None]:
# Takes the 5 best optimized models to ensemble

# This was the threshold I used to develop my model, choose one which suits you best and comment below!
# bestTrials = hist[hist['value'] <= 0.699]

# Lowest objective value first, then keep the top five trials
bestTrials = hist.sort_values(by='value', ascending=True).head()
bestTrials

# Create one flat {hyperparameter: value} dictionary per selected trial.
# The hyperparameter columns are selected by their 'params_' prefix rather than by
# position, and the prefix is stripped so the keys match the names used when sampling
# (and therefore the XGBRegressor argument names).
PARAM_PREFIX = 'params_'
param_columns = [col for col in bestTrials.columns if col.startswith(PARAM_PREFIX)]
best_params = (bestTrials[param_columns]
               .rename(columns=lambda col: col[len(PARAM_PREFIX):])
               .to_dict('records'))

# Pad with empty dictionaries so the unpacking below still works when the study
# holds fewer than five trials.
best_params += [{} for _ in range(5 - len(best_params))]
param1, param2, param3, param4, param5 = best_params

In [None]:
# You now have to go through each param dictionary and manually edit it to work by copying, pasting and then reassigning! Please tell me there is a better way!
# Only the uncommented name is displayed; swap which line is uncommented to inspect another dictionary.
param1
# param2
# param3
# param4
# param5

In [None]:
# Hyperparameters of the five best trials, keyed by the XGBRegressor argument names.
# The 'params_' prefix found on the trials-dataframe column names is dropped here:
# with the prefix, `XGBRegressor(**paramN)` receives unknown keyword names instead of
# `max_depth`, `learning_rate`, etc., and the tuned values never reach the model.
param1 = {
    'colsample_bytree': 0.91,
    'gamma': 0,
    'learning_rate': 0.16424108802390555,
    'max_depth': 6,
    'min_child_weight': 2,
    'n_estimators': 172,
    'reg_alpha': 0,
    'reg_lambda': 5,
}

param2 = {
    'colsample_bytree': 0.49,
    'gamma': 3,
    'learning_rate': 0.07499692005931666,
    'max_depth': 10,
    'min_child_weight': 5,
    'n_estimators': 865,
    'reg_alpha': 2,
    'reg_lambda': 1,
}

param3 = {
    'colsample_bytree': 0.36,
    'gamma': 4,
    'learning_rate': 0.015149948444442753,
    'max_depth': 22,
    'min_child_weight': 5,
    'n_estimators': 932,
    'reg_alpha': 5,
    'reg_lambda': 0,
}

param4 = {
    'colsample_bytree': 0.98,
    'gamma': 0,
    'learning_rate': 0.18311988381440686,
    'max_depth': 9,
    'min_child_weight': 5,
    'n_estimators': 507,
    'reg_alpha': 1,
    'reg_lambda': 2,
}

param5 = {
    'colsample_bytree': 0.56,
    'gamma': 5,
    'learning_rate': 0.02768954031767648,
    'max_depth': 13,
    'min_child_weight': 4,
    'n_estimators': 421,
    'reg_alpha': 0,
    'reg_lambda': 2,
}

**Building the Models**

In [None]:
print("Building Models...")

# Settings shared by all five regressors: GPU training/prediction, fixed seed, silent logging
shared_settings = {
    'predictor': 'gpu_predictor',
    'tree_method': 'gpu_hist',
    'random_state': RS,
    'verbosity': 0,
}

# One regressor per hyperparameter dictionary, in the order param1..param5
xgboost1, xgboost2, xgboost3, xgboost4, xgboost5 = [
    XGBRegressor(**candidate, **shared_settings)
    for candidate in (param1, param2, param3, param4, param5)
]

# Notice how xgboost5 is not within the stack, that is for a reason:
# it is kept aside and fitted on the stacked features as the final predictor
models = [
    ('XGB1', xgboost1),
    ('XGB2', xgboost2),
    ('XGB3', xgboost3),
    ('XGB4', xgboost4),
]

print("Built Models!")

In [None]:
# I used 10 folds but to save Compute time it is again only 2 here
stack_settings = dict(
    regression=True,
    metric=mean_squared_error,
    n_folds=2,
    shuffle=True,
    random_state=RS,
    verbose=2,
)
stack = StackingTransformer(estimators=models, **stack_settings)

In [None]:
%%time
# Fit the stacking transformer on the training split only; the %%time magic reports how long the cell takes.
print("Training Stack...")
# The result of fit() is assigned back to `stack`, which the following cells use for transform().
stack = stack.fit(X_train, y_train)
print("Stack trained!")

In [None]:
# Create Stacked Train-Test: run the training split, then the test split, through the fitted stack
S_train, S_test = [stack.transform(split) for split in (X_train, X_test)]

In [None]:
# Train Final Predictor: xgboost5 is fitted on the stacked training features
xgboost5 = xgboost5.fit(S_train, y_train)
y_pred = xgboost5.predict(S_test)

# Compute the stacked test RMSE once and reuse it for both messages
final_rmse = mean_squared_error(y_test, y_pred, squared=False)
print('Final RMSE: %.6f' % final_rmse)

# Relative change versus the best Optuna trial: positive when the stack has the lower RMSE.
# The ratio has to be final/best; with best/final the sign flips and a worse stack is
# reported as a decrease.
# NOTE(review): the best-trial value is a mean of validation and test RMSE, while
# final_rmse is test-only, so the two figures are not measured on identical data.
best_rmse = study.best_trial.value
print("Stacking and HyperParam tuning decreased RMSE by {}%".format(100 - (final_rmse / best_rmse * 100)))

# Obviously I have scaled the model down here to save on compute time, but I urge you to experiment with the parameters and see what you get!

# **Submitting Results**

In [None]:
# Load the competition test set that the submission predictions are made for
test_path = "../input/tabular-playground-series-jan-2021/test.csv"
test_df = pd.read_csv(test_path)

In [None]:
# Start the submission frame from the first column of the test set
# (presumably `id`, since the frame is later indexed by that name)
submission = test_df.iloc[:, 0].to_frame()
# Feature matrix for prediction: every column except `id`, as an array
X = test_df.drop(columns='id').values

In [None]:
# Pass the competition features through the fitted stack, then let the final predictor score them.
# NOTE: this rebinds `X_test`, which earlier held the hold-out split of the training data.
X_test = stack.transform(X)
preds = xgboost5.predict(X_test)

In [None]:
# Attach the predictions as the `target` column and index the frame by `id`
submission = submission.assign(target=preds).set_index('id')

In [None]:
# Display the finished submission table for a visual check before it is written to disk
submission

In [None]:
# Press Save All & Run All, after that has finished go to the submissions page on the tournament and you should be able to submit the version of the notebook you saved!
# `index` is an on/off switch, so pass True rather than a column name. `id` is already
# the frame's named index, so writing the index emits it as the `id` column.
submission.to_csv('submission.csv', index=True)