# XGBoost Parameter Tuning with Hyperopt

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc #garbage collection
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
from tqdm import tqdm

In [None]:
# Load the training data and index it by the 'id' column
path = '../input/tabular-playground-series-jan-2021/'
train = pd.read_csv(path + 'train.csv').set_index('id', drop=True)
# Row 284103 (label = 0.0) looks like an error/outlier, so it is removed
train = train.drop(284103)

In [None]:
# train.head()
# train.isna().sum() 
# found no missing values

In [None]:
# Name of the target column, and every column whose name contains 'cont'
# as the list of model inputs
label = 'target'
features = [name for name in train.columns if 'cont' in name]

In [None]:
# Histogram (100 bins) of the target column
target_values = train[label]
plt.hist(target_values, bins=100)
plt.title('Distribution of target')
plt.show()

In [None]:
# One histogram per feature on a two-column grid, filled column by column.
n_cols = 2
# Ceiling division so an odd number of features still gets a slot for the last one
n_rows = max(1, -(-len(features) // n_cols))
# squeeze=False keeps `ax` two-dimensional even when there is a single row
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20, 20), squeeze=False)
fig.subplots_adjust(hspace=0.5)

# ax.T.flat walks the grid column-major: first column top to bottom, then the second
column_major_axes = ax.T.flat
for axis, feature in zip(column_major_axes, features):
    axis.hist(train[feature], bins=100)
    axis.set_title('Distribution of feature ' + str(feature), fontsize=12)

# Hide any grid slots left without a feature
for axis in ax.T.flat[len(features):]:
    axis.set_visible(False)

In [None]:
# Clear the contents of the current `fig` and close it
# (presumably to release its memory before training starts — confirm)
fig.clear()
plt.close(fig)

In [None]:
# Train/validation split (80/20). A fixed random_state keeps the hold-out set
# identical across runs, so the base score and the tuned scores stay comparable.
X_train, X_valid, y_train, y_valid = train_test_split(
    train[features], train[label], test_size=0.2, random_state=0
)

# Wrap both splits in DMatrix objects for the native xgboost training API
d_tr = xgb.DMatrix(X_train, label=y_train)
d_val = xgb.DMatrix(X_valid, label=y_valid)

In [None]:
# Untuned base-case estimator, used as a reference score for the tuned models
params_base = {
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',
    'random_state': 0,
}
# Up to 1000 rounds; training stops once the validation metric has not
# improved for 10 consecutive rounds. Progress is printed every 20 rounds.
base_model = xgb.train(
    params=params_base,
    dtrain=d_tr,
    num_boost_round=1000,
    evals=[(d_val, 'eval')],
    early_stopping_rounds=10,
    verbose_eval=20,
)
# NOTE(review): depending on the xgboost version, Booster.predict may use all
# trained trees rather than only those up to the best iteration — confirm.
y_pred_base = base_model.predict(d_val)
# RMSE computed as sqrt(MSE): the `squared=` keyword of mean_squared_error is
# deprecated in newer scikit-learn releases, and this matches the final sanity check.
base_score = np.sqrt(mean_squared_error(y_valid, y_pred_base))
print(base_score)

In [None]:
# Simple hold-out validation score (RMSE on the validation split) as the function to be optimised

def score(params):
    """Train one xgboost model for a sampled configuration and return its validation RMSE.

    Serves as the objective passed to hyperopt's ``fmin``; lower is better.

    Args:
        params: dict providing 'learning_rate', 'max_depth', 'gamma',
            'min_child_weight', 'subsample', 'colsample_bytree' and
            'n_round' (the maximum number of boosting rounds).

    Returns:
        RMSE of the predictions on ``d_val`` measured against ``y_valid``.
    """
    tuned_keys = (
        'learning_rate',
        'max_depth',
        'gamma',
        'min_child_weight',
        'subsample',
        'colsample_bytree',
    )
    # Sampled hyper-parameters first, then the settings shared by every trial
    ps = {key: params[key] for key in tuned_keys}
    ps.update({
        'verbosity': 1,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'gpu_hist',
        'random_state': 27,
    })

    # Train for at most n_round rounds, stopping once the validation RMSE
    # has not improved for 10 consecutive rounds
    model = xgb.train(
        ps,
        d_tr,
        params['n_round'],
        [(d_val, 'eval')],
        early_stopping_rounds=10,
        verbose_eval=False,
    )
    y_pred = model.predict(d_val)

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the keyword
    # is deprecated in newer scikit-learn releases
    rmse = float(np.sqrt(mean_squared_error(y_valid, y_pred)))

    return rmse

In [None]:
# Search space handed to hyperopt: one entry per tunable hyper-parameter.
# quniform samples are wrapped in scope.int so they are passed on as integers.
param_space = dict(
    # continuous learning rate
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    # maximum number of boosting rounds, in steps of 100
    n_round=scope.int(hp.quniform('n_round', 200, 3000, 100)),
    # integer tree depth
    max_depth=scope.int(hp.quniform('max_depth', 5, 16, 1)),
    # continuous values sampled from [0, 10]
    gamma=hp.uniform('gamma', 0, 10),
    min_child_weight=hp.uniform('min_child_weight', 0, 10),
    # row / column sampling fractions
    subsample=hp.uniform('subsample', 0.1, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.1, 1),
)

In [None]:
# Run optimiser with tpe
%time
trials = Trials()

hopt = fmin(fn = score,
            space = param_space, 
            algo = tpe.suggest, 
            max_evals = 1000, ## 100
            trials = trials, 
           )

In [None]:
# params_best is the very same dict object as hopt, so the edits below are
# visible through both names.
params_best = hopt
# The number of rounds is passed to xgb.train separately, so take it out of the dict
n_rounds_best = int(params_best.pop('n_round'))
# Cast max_depth to a plain int
params_best['max_depth'] = int(params_best['max_depth'])
print(params_best)
print(n_rounds_best)

In [None]:
# score(params_best)

In [None]:
# # The trials object stores the results of all trials
# trials.trials

In [None]:
# Scatter the sampled value of selected hyper-parameters against the trial id,
# one subplot per parameter stacked vertically.
params_to_plot = ['n_round', 'learning_rate', 'max_depth', 'min_child_weight']
fig, ax = plt.subplots(len(params_to_plot), 1, figsize=(10, 15))
fig.subplots_adjust(hspace=0.5)

# The trial ids are the same for every parameter, so collect them once
xs = [trial['tid'] for trial in trials.trials]
for axis, param in zip(ax, params_to_plot):
    ys = [trial['misc']['vals'][param] for trial in trials.trials]
    axis.scatter(xs, ys, s=20, linewidth=0.01, alpha=0.75)
    axis.set_title(f'{param} vs t ', fontsize=18)
    axis.set_xlabel('id', fontsize=16)
    axis.set_ylabel(f'{param}', fontsize=16)

In [None]:
# Scatter the loss of each trial against its trial id
history = trials.trials
xs = [entry['tid'] for entry in history]
ys = [entry['result']['loss'] for entry in history]

f, ax = plt.subplots(1)
ax.scatter(xs, ys, s=20, linewidth=0.01, alpha=0.75)
ax.set_title('loss over time', fontsize=18)
ax.set_xlabel('trials', fontsize=16)
ax.set_ylabel('loss', fontsize=16)

In [None]:
%time
# Train with full dataset and best params
params_best['tree_method'] = 'gpu_hist'
d = xgb.DMatrix(train[features], train[label])
xgb_final = xgb.train(params_best,d,n_rounds_best)

In [None]:
# Sanity check: RMSE of the final model on the very data it was trained on
y_pred_final = xgb_final.predict(d)
train_mse = mean_squared_error(train[label], y_pred_final)
score_final = np.sqrt(train_mse)
print(score_final)

In [None]:
# Load the test data, index it by 'id' and wrap the feature columns in a DMatrix
test = pd.read_csv(path + 'test.csv').set_index('id', drop=True)
d_tst = xgb.DMatrix(test[features])
# test.head()

In [None]:
# Predictions for test data: train one model per seed and average them
models = []

for seed in range(10):
    params_best['seed'] = seed
    xgb_final = xgb.train(params_best, d, num_boost_round=n_rounds_best)
    models.append(xgb_final)

# Average the predictions of every seeded model; predicting with only the
# last model trained would discard the other nine.
xgb_pred = np.mean([model.predict(d_tst) for model in models], axis=0)

In [None]:
# Write the test predictions, keyed by the test ids, to submission.csv
ids = test.index
submission_columns = {'id': ids, 'target': xgb_pred}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)