In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm

!pip install --upgrade xgboost
import xgboost as xgb
xgb.__version__

# Introduction


This notebook summarizes my process of moving from a 67th-percentile ranking to a 21st-percentile ranking.


1) Simple EDA

2) Initial Gradient Boosted Tree Model

3) Hyperparameter Tuning on GPU

4) Cross Validation

5) Repeat

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Competition files: labelled training rows, unlabelled test rows, and the
# submission template whose `target` column is overwritten later on.
data = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2021/train.csv")
final_test = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2021/test.csv")
sub = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2021/sample_submission.csv")

# Simple EDA

In [None]:
# Summary statistics of the training frame
data.describe()

We see that all the values are numeric. No encoding or dummy variables are required.

In [None]:
# Per-column count of missing values, first for the training frame and then
# for the test frame, with two blank lines between the two reports.
print('Training Data')
print(data.isnull().sum(), end='\n\n\n')

print('Testing Data')
print(final_test.isnull().sum())

There are no missing values in the training or testing data.


Next we'll take a look at the distributions of the features and target variables.

In [None]:
# Feature names are every test-file column except the first one
# (presumably the row id - verify against the CSV header).
columns = final_test.columns[1:]

# Split the labelled data into the target series and the feature frame.
target = data.loc[:, 'target']
train = data.loc[:, columns]

Before plotting, let's note that we do not need to normalize our data since:

1) Decision trees don't require normalization

2) We will be using a decision tree ensemble algorithm

In [None]:
# `sns.distplot` is deprecated and scheduled for removal from seaborn;
# `histplot` with a KDE overlay on a density scale is its documented
# replacement for a histogram-plus-density view.
ax = sns.histplot(target, kde=True, stat='density')
ax.set_title('Target')
plt.show()

We see that the target looks like a mixture of two normal distributions. I'm not sure how to handle Gaussian mixture models yet, but this is definitely something to look into in the future.

In [None]:
# Pairwise correlations between the feature columns
feature_corr = train.corr()

# Boolean mask that is True on and above the diagonal, so only the lower
# triangle of the symmetric matrix is drawn
upper_mask = np.triu(np.ones(feature_corr.shape, dtype=bool))

# Custom diverging colour map
diverging = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(feature_corr, mask=upper_mask, cmap=diverging)

We see that `cont13` is strongly correlated with the largest number of variables, and that `cont6` is very strongly correlated with `cont13`, `cont12`, `cont11`, `cont10`, and `cont9`.

# Initial Model

In [None]:
# Hold out 20% of the labelled rows (fixed seed) to score the baseline
x_train, x_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size=0.20,
    random_state=2021,
)

# Baseline: XGBoost regressor with library-default hyperparameters,
# fitted on the 80% split (`fit` returns the estimator itself)
xgb_initial = xgb.XGBRegressor().fit(x_train, y_train)

# Predictions for the held-out rows
initial_preds = xgb_initial.predict(x_test)

In [None]:
# Root-mean-squared error of the baseline on the hold-out split.
# The square root is taken explicitly because the `squared=False` argument of
# `mean_squared_error` is deprecated and absent from recent scikit-learn
# releases; this form works on every version.
# Value recorded by the author: 0.7044111055876526
np.sqrt(mean_squared_error(y_test, initial_preds))

The current RMSE is approximately 0.7044. We will use Optuna to help us find the best parameters for our ensemble tree model. The GPU accelerator option must be turned on.


In this next part, I couldn't find the parameters I had originally used, so I chose new random search ranges and ran the objective function to find our best parameters. The outcome was slightly better than my final submission for this competition.

You can read more about the parameters in the [xgboost documentation.](https://xgboost.readthedocs.io/en/stable/parameter.html)

# Hyperparameter Tuning

In [None]:
# The objective function defines what we want to optimize


def objective(trial, X_data = train, Y_data = target):
    """Optuna objective: hold-out RMSE of an XGBoost regressor for one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    X_data : feature frame, defaults to the notebook-level ``train``.
    Y_data : target values aligned with ``X_data``, defaults to ``target``.

    Returns
    -------
    float
        Root-mean-squared error on the 20% hold-out split (lower is better).
    """
    # Same split for every trial (fixed seed), so trials are comparable.
    x_train, x_test, y_train, y_test = train_test_split(
        X_data, Y_data, random_state=2021, test_size=0.20)

    param = {
        'tree_method': 'gpu_hist',     # use GPU for train
        'predictor': 'gpu_predictor',  # use GPU for predict
        'learning_rate': trial.suggest_discrete_uniform('learning_rate', 0.01, 0.50, 0.05),
        'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.01, 0.91, 0.1),
        # Each sampled parameter needs its own Optuna name. Reusing
        # 'colsample_bytree' here made Optuna hand back the very same value
        # for both settings, so colsample_bynode was never tuned on its own.
        'colsample_bynode': trial.suggest_discrete_uniform('colsample_bynode', 0.01, 0.91, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.01, 0.91, 0.1),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.20, 1, 0.05),
        'n_estimators': trial.suggest_categorical('n_estimators', [4000, 5000, 6000, 7000]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 401, step=2),
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 10.0),
        'random_state': trial.suggest_categorical('random_state', [2000, 3000, 4000]),
        'gamma': trial.suggest_discrete_uniform('gamma', 0.01, 2.01, 0.1),
    }

    model = xgb.XGBRegressor(**param)

    # NOTE(review): the early-stopping eval set is the training data itself,
    # so the stopping rule tracks training error rather than generalisation.
    # Left unchanged to keep the scores comparable with the recorded results;
    # consider a separate validation split.
    model.fit(x_train, y_train, eval_set=[(x_train, y_train)],
              early_stopping_rounds=100, verbose=False)

    preds = model.predict(x_test)

    # Explicit square root instead of `squared=False`, which recent
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_test, preds))

    return rmse


In [None]:
# Create a study that minimises the objective (hold-out RMSE).
# The sampler is seeded so that a fresh "Restart & Run All" explores the same
# sequence of trials; TPESampler is the sampler Optuna uses by default.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=2021),
)
study.optimize(objective, n_trials=100)

In [None]:
# Report how the search went: trial count, winning parameters, best score.
finished_trials = len(study.trials)
winning_params = study.best_trial.params
print('Number of finished trials:', finished_trials)
print('Best trial:', winning_params)
print('Best objective value:', study.best_value)


In [None]:
# Take the winning hyperparameters and re-attach the two GPU settings, which
# the objective fixes as constants rather than sampling.
best_trial = study.best_trial.params
best_trial.update(tree_method='gpu_hist', predictor='gpu_predictor')

In [None]:
# best_trial= {'learning_rate': 0.01,
#  'colsample_bylevel': 0.51,
#  'colsample_bytree': 0.91,
#  'max_depth': 10,
#  'subsample': 0.7,
#  'n_estimators': 4000,
#  'min_child_weight': 245,
#  'lambda': 6.089455795135218e-05,
#  'alpha': 0.0019736061458465663,
#  'random_state': 2000,
#  'gamma': 1,
#  'tree_method': 'gpu_hist',
#  'predictor': 'gpu_predictor'}

In [None]:
# Plot the objective value (hold-out RMSE) of every trial to see how the search progressed
optuna.visualization.plot_optimization_history(study)

In [None]:
# Which hyperparameters had the largest impact on this study's objective value?
optuna.visualization.plot_param_importances(study)

Learning rate was the most important parameter. If we fix the learning rate, the study may be able to capture the importance of the other parameters. But first, let's see how the current model performs with cross validation.

# 1st Cross Validation

In [None]:
# Hyperparameters recorded from an earlier run of the study, pinned here so
# the cross validation below does not depend on re-running the search.
# 'n_estimators' (5000 in that run) is deliberately left out: the number of
# boosting rounds is passed to xgb.train separately.
best_trial = {
    # GPU execution
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    # boosting / tree shape
    'learning_rate': 0.01,
    'max_depth': 10,
    'min_child_weight': 67,
    'gamma': 1,
    # row and column sampling
    'subsample': 0.8,
    'colsample_bylevel': 0.6100000000000001,
    'colsample_bytree': 0.91,
    # regularisation
    'lambda': 0.012157425362490908,
    'alpha': 7.278941365308569e-08,
    # seed
    'random_state': 3000,
}



In [None]:
# Convert the test features to a DMatrix for the native xgb.train API.
# The name is reused, so guard the conversion: without the check, running
# this cell a second time would try to index a DMatrix by column names and fail.
if not isinstance(final_test, xgb.DMatrix):
    final_test = xgb.DMatrix(final_test[columns])

In [None]:
# Out-of-fold prediction buffer, one slot per training row (sized from the
# data instead of a hard-coded row count), and the running average of the
# per-fold test predictions.
train_oof = np.zeros(len(train))
test_preds = 0
train_oof.shape

We will use cross validation here to get a more accurate RMSE estimate for our model, and we will also average the predictions of the fold models to produce our test predictions. This is an interesting way of using cross validation to reduce overfitting that I learned from this [Notebook](http://www.kaggle.com/tunguz/tps-01-21-feature-importance-with-xgboost-and-shap).

In [None]:
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=0)

# Start from clean accumulators so that re-running this cell does not stack a
# second set of fold predictions on top of the first.
train_oof = np.zeros(len(train))
test_preds = 0
fold_rmse = []

for fold, (train_ind, val_ind) in tqdm(enumerate(kf.split(train, target)), total=NUM_FOLDS):
    # KFold yields positional row numbers, so rows and labels are both taken
    # with .iloc; plain target[train_ind] is a label lookup and only happens
    # to work while the index is the default 0..n-1 range.
    val_target = target.iloc[val_ind]
    fold_train = xgb.DMatrix(train.iloc[train_ind], label=target.iloc[train_ind])
    fold_val = xgb.DMatrix(train.iloc[val_ind], label=val_target)

    model = xgb.train(best_trial, fold_train, num_boost_round=2000)
    temp_oof = model.predict(fold_val)
    # `final_test` must already be an xgb.DMatrix at this point.
    temp_test = model.predict(final_test)

    # Keep the out-of-fold predictions and add this fold's share of the
    # averaged test prediction.
    train_oof[val_ind] = temp_oof
    test_preds += temp_test / NUM_FOLDS

    # RMSE of this fold. Arguments are in (y_true, y_pred) order, and the
    # explicit square root avoids `squared=False`, which recent scikit-learn
    # releases no longer accept.
    rmse_iter = np.sqrt(mean_squared_error(val_target, temp_oof))
    print(rmse_iter)
    fold_rmse.append(rmse_iter)

In [None]:
# Validation RMSE of each fold, in fold order
fold_rmse

In [None]:
# Average RMSE across the folds; dividing by the number of recorded folds
# keeps this correct if NUM_FOLDS is changed.
sum(fold_rmse) / len(fold_rmse)

We can expect our predictions to have an RMSE of about 0.696118, based on our 10-fold cross validation.

In [None]:
# Put the fold-averaged test predictions into the submission template and
# write it out without the index column.
submission_path = 'submission2_post_competition.csv'
sub['target'] = test_preds
sub.to_csv(submission_path, index=False)

# Repeat: Fix Learning Rate

In [None]:
# the objective function defines what we want to optimize
def objective_2(trial, X_data = train, Y_data = target):
    """Optuna objective with the learning rate fixed at 0.01.

    Same search as the first objective, except that ``learning_rate`` is a
    constant instead of a sampled value.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    X_data : feature frame, defaults to the notebook-level ``train``.
    Y_data : target values aligned with ``X_data``, defaults to ``target``.

    Returns
    -------
    float
        Root-mean-squared error on the 20% hold-out split (lower is better).
    """
    # Same split for every trial (fixed seed), so trials are comparable.
    x_train, x_test, y_train, y_test = train_test_split(
        X_data, Y_data, random_state=2021, test_size=0.20)

    param = {
        'tree_method': 'gpu_hist',     # use GPU for train
        'predictor': 'gpu_predictor',  # use GPU for predict
        'learning_rate': 0.01,         # fixed, not sampled
        'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.01, 0.91, 0.1),
        # Each sampled parameter needs its own Optuna name. Reusing
        # 'colsample_bytree' here made Optuna hand back the very same value
        # for both settings, so colsample_bynode was never tuned on its own.
        'colsample_bynode': trial.suggest_discrete_uniform('colsample_bynode', 0.01, 0.91, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.01, 0.91, 0.1),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.20, 1, 0.05),
        'n_estimators': trial.suggest_categorical('n_estimators', [4000, 5000, 6000, 7000]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 401, step=2),
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 10.0),
        'random_state': trial.suggest_categorical('random_state', [2000, 3000, 4000]),
        'gamma': trial.suggest_discrete_uniform('gamma', 0.01, 2.01, 0.1),
    }

    model = xgb.XGBRegressor(**param)

    # NOTE(review): the early-stopping eval set is the training data itself,
    # so the stopping rule tracks training error rather than generalisation.
    # Left unchanged to keep the scores comparable with the recorded results;
    # consider a separate validation split.
    model.fit(x_train, y_train, eval_set=[(x_train, y_train)],
              early_stopping_rounds=100, verbose=False)

    preds = model.predict(x_test)

    # Explicit square root instead of `squared=False`, which recent
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_test, preds))

    return rmse


In [None]:
# Second study: minimise the fixed-learning-rate objective.
# The sampler is seeded so that a fresh "Restart & Run All" explores the same
# sequence of trials; TPESampler is the sampler Optuna uses by default.
study_2 = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=2021),
)
study_2.optimize(objective_2, n_trials=100)

In [None]:
# Report how the second search went: trial count, winning parameters, best score.
finished_trials_2 = len(study_2.trials)
winning_params_2 = study_2.best_trial.params
print('Number of finished trials:', finished_trials_2)
print('Best trial:', winning_params_2)
print('Best objective value:', study_2.best_value)


In [None]:
# Take the winning hyperparameters and re-attach the settings the second
# objective keeps constant: the GPU options and the fixed learning rate.
best_trial_2 = study_2.best_trial.params
best_trial_2.update(
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    learning_rate=0.01,
)

In [None]:
# Plot the objective value (hold-out RMSE) of every trial in the second study
optuna.visualization.plot_optimization_history(study_2)

In [None]:
# Which hyperparameters had the largest impact once the learning rate was fixed?
optuna.visualization.plot_param_importances(study_2)

# 2nd Cross Validation 

Note, we have already converted the final_test data to a DMatrix in the first cross validation.

In [None]:
# Fresh out-of-fold buffer (sized from the data instead of a hard-coded row
# count) and a fresh running average for the second model's test predictions.
train_oof = np.zeros(len(train))
test_preds_2 = 0
train_oof.shape

In [None]:
# Hyperparameters recorded from an earlier run of the second study, pinned so
# the cross validation below does not depend on re-running the search.
# 'n_estimators' (6000 in that run) is deliberately left out: the number of
# boosting rounds is passed to xgb.train separately.
best_trial_2 = {
    # GPU execution
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    # boosting / tree shape
    'learning_rate': 0.01,
    'max_depth': 10,
    'min_child_weight': 21,
    'gamma': 1.51,
    # row and column sampling
    'subsample': 0.5,
    'colsample_bylevel': 0.91,
    'colsample_bytree': 0.6100000000000001,
    # regularisation
    'lambda': 2.4118345076896113e-05,
    'alpha': 3.234942680594196e-08,
    # seed
    'random_state': 3000,
}

In [None]:
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=0)

# Start from clean accumulators so that re-running this cell does not stack a
# second set of fold predictions on top of the first.
train_oof = np.zeros(len(train))
test_preds_2 = 0
fold_rmse_2 = []

for fold, (train_ind, val_ind) in tqdm(enumerate(kf.split(train, target)), total=NUM_FOLDS):
    # KFold yields positional row numbers, so rows and labels are both taken
    # with .iloc; plain target[train_ind] is a label lookup and only happens
    # to work while the index is the default 0..n-1 range.
    val_target = target.iloc[val_ind]
    fold_train = xgb.DMatrix(train.iloc[train_ind], label=target.iloc[train_ind])
    fold_val = xgb.DMatrix(train.iloc[val_ind], label=val_target)

    model = xgb.train(best_trial_2, fold_train, num_boost_round=2000)
    temp_oof = model.predict(fold_val)
    # `final_test` must already be an xgb.DMatrix at this point.
    temp_test = model.predict(final_test)

    # Keep the out-of-fold predictions and add this fold's share of the
    # averaged test prediction.
    train_oof[val_ind] = temp_oof
    test_preds_2 += temp_test / NUM_FOLDS

    # RMSE of this fold. Arguments are in (y_true, y_pred) order, and the
    # explicit square root avoids `squared=False`, which recent scikit-learn
    # releases no longer accept.
    rmse_iter = np.sqrt(mean_squared_error(val_target, temp_oof))
    print(rmse_iter)
    fold_rmse_2.append(rmse_iter)

In [None]:
# Validation RMSE of each fold for the second model, in fold order
fold_rmse_2

In [None]:
# Average RMSE across the folds; dividing by the number of recorded folds
# keeps this correct if NUM_FOLDS is changed.
sum(fold_rmse_2) / len(fold_rmse_2)

We can expect our model to have an RMSE of about 0.696246, based on our cross validation. Even though this RMSE is slightly higher than the previous RMSE of 0.696118, I'll make the submission to see whether overfitting has been reduced.

In [None]:
# Write the second model's fold-averaged test predictions to a new submission
# file, so its leaderboard score can be compared with the previous one.
submission_path = 'submission3_post_competition.csv'
sub['target'] = test_preds_2
sub.to_csv(submission_path, index=False)

In [None]:
# Mean cross-validation RMSE of the first and second model, side by side.
# Dividing by the number of recorded folds avoids a hard-coded fold count.
print(sum(fold_rmse) / len(fold_rmse))
print(sum(fold_rmse_2) / len(fold_rmse_2))

The initial submission from this notebook is shown below.
![image.png](attachment:image.png)

By simply fixing the learning rate and then searching for the other optimal parameters, we see that both the private (left) and public (right) scores have improved.
![image.png](attachment:image.png)

Fixing the most important parameter during optimization gives the parameters that previously appeared unimportant greater importance and influence in the tuning. This helped find a better model and reduced overfitting.