## TPS August 2021

This notebook is mostly based on the following notebook for Optuna tuning and KFold CV.
https://www.kaggle.com/michael127001/xgbregressor-with-optuna-tuning/notebook

The idea to use the Yeo-Johnson transformation came from this discussion.
https://www.kaggle.com/c/tabular-playground-series-aug-2021/discussion/266321

I'm very new to working with high-dimensional features, so please don't hesitate to share feedback :)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import optuna

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition train and test CSVs, using the `id` column as the DataFrame index
# so that it is kept out of the data columns.
train_data = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv', index_col=['id'])
test_data = pd.read_csv('../input/tabular-playground-series-aug-2021/test.csv', index_col=['id'])
# Show the training frame as the cell output
train_data

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# RMSE helper used as the score for hyperparameter tuning and cross-validation
def my_rmse(y_actual, y_predicted):
    """Return the root mean squared error between targets and predictions.

    The square root is taken explicitly instead of passing ``squared=False`` to
    ``mean_squared_error``: that flag is deprecated in newer scikit-learn
    releases, so relying on it makes the notebook fail once it is removed.

    Args:
        y_actual: Ground-truth target values (array-like).
        y_predicted: Predicted target values (array-like, same number of rows).

    Returns:
        float: The RMSE. For multi-output targets this is the unweighted mean
        of the per-output RMSE values, matching what ``squared=False`` returned.
    """
    # 'raw_values' yields one MSE per output column, so the root can be taken per output
    per_output_mse = mean_squared_error(y_actual, y_predicted, multioutput='raw_values')
    return float(np.mean(np.sqrt(per_output_mse)))

## EDA

In [None]:
# Summary statistics for every column of the training data
# (author's observation: the columns span a wide range of values)
train_data.describe(include='all')

In [None]:
# Count nulls per column and keep only the non-zero counts
# (an empty list means no column contains missing values)
null_counts = train_data.isnull().sum()
[count for count in null_counts if count != 0]

In [None]:
# Show the column names of the training frame
train_data.columns

In [None]:
# Feature columns are the ones whose name starts with 'f'; the target column is 'loss'
features = list(filter(lambda column: column.startswith('f'), train_data.columns))
label = ['loss']

In [None]:
# Histogram of every feature column (author's observation: many are right-skewed).
# NOTE(review): a 101 x 4 layout reserves 404 subplot slots — confirm this matches the feature count.
feature_histograms = train_data[features].hist(
    figsize=(120, 160),
    layout=(101, 4),
    bins=50,
    grid=False,
    xlabelsize=8,
    ylabelsize=8,
)

In [None]:
# Histogram of the target column (`loss`) using 10 bins
loss_histogram = train_data[label].hist(
    figsize=(8, 6),
    bins=10,
    grid=False,
    xlabelsize=8,
    ylabelsize=8,
)

In [None]:
# Pairwise correlation matrix over the columns of train_data, drawn as a heatmap.
# Author's observation from the plot: no correlation between features.
correlations = train_data.corr()
sns.heatmap(data=correlations)

## Data cleaning

In [None]:
# Extract NumPy arrays for modelling.
# `label` is a one-element list, so `y` comes out as a 2-D column of shape (n_rows, 1).
X = train_data[features].values
y = train_data[label].values
# Select the test columns through the same `features` list as the training matrix,
# so both matrices always have identical columns in identical order.
X_test = test_data[features].values

In [None]:
# Scale feature data to be roughly the same range
# Only fit to train data to avoid data leakage
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training matrix only
scaler = StandardScaler().fit(X)

# Apply the fitted scaling to both the training and the test matrix
X, X_test = scaler.transform(X), scaler.transform(X_test)

X_test

In [None]:
from sklearn.preprocessing import PowerTransformer

# Use yeo johnson to deal with skewed data (also deals with negative values)
# Only fit to train data to avoid data leakage
# Fit the Yeo-Johnson transform on the training matrix only (no re-standardisation afterwards)
pt = PowerTransformer(method='yeo-johnson', standardize=False).fit(X)

# Apply the fitted transform to both the training and the test matrix
X, X_test = pt.transform(X), pt.transform(X_test)

X

In [None]:
# Wrap the transformed training matrix in a DataFrame with the feature names as columns
# so it can be plotted; no index is passed, so the original `id` index is not carried over.
unskewed_train_data = pd.DataFrame(data=X, columns=features)
unskewed_train_data

In [None]:
# Feature histograms after the skew correction
# (author's observation: more, but not all, now look normally distributed)
unskewed_features = unskewed_train_data.hist(
    figsize=(120, 160),
    layout=(101, 4),
    bins=50,
    grid=False,
    xlabelsize=8,
    ylabelsize=8,
)

## Hyperparameter Tuning + Modelling

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Define objective (what to optimize)
def objective(trial):
    """Optuna objective: train one XGBRegressor and return its validation RMSE.

    Each call draws a fresh random 60/40 train/validation split of the
    module-level ``X`` / ``y`` arrays, samples a hyperparameter set from
    ``trial``, fits a Tweedie-objective XGBoost regressor on the training part
    and scores it on the validation part.

    Args:
        trial: The ``optuna`` trial object used to sample hyperparameters.

    Returns:
        The RMSE of the fitted model on the validation split (lower is better).
    """
    # Split dataset for each trial (larger test size to prevent overfitting)
    X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.4)

    # Search space. `suggest_float` is used for every float parameter:
    # `step=` replaces the deprecated `suggest_discrete_uniform`, and
    # `log=True` replaces the deprecated `suggest_loguniform`.
    param_grid = {
        'n_estimators': trial.suggest_int('n_estimators', 400, 4000, step=400),
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        'eta': trial.suggest_float('eta', 0.007, 0.01),
        'subsample': trial.suggest_float('subsample', 0.2, 0.9, step=0.1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 0.9, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9, step=0.1),
        # NOTE(review): XGBoost documents this parameter's range as (1, 2) — confirm the endpoints are accepted
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.0, 2.0, step=0.1),
        'gamma': trial.suggest_float('gamma', 1e-4, 1e4, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 1e4, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1e4, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1e4, log=True),
    }

    # GPU histogram tree building / GPU prediction to speed up each trial
    reg = XGBRegressor(
        objective='reg:tweedie',
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        # Split job for 4 CPUs
        n_jobs=4,
        # Use set of params generated by param grid
        **param_grid
    )

    reg.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse', verbose=False)

    return my_rmse(y_val, reg.predict(X_val))

In [None]:
# A Study is Optuna's handle for one optimization run.
# direction='minimize' because a lower RMSE is better; the sampler is set explicitly to TPESampler.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(), study_name='XGBRegressor')
# Keep running trials of `objective` until the 1-hour time budget (60 s * 60 min) is used up
study.optimize(objective, timeout=60*60)

# best_trial carries the best objective value found and the parameters that produced it
trial = study.best_trial
print('Best root mean squared error: {}'.format(trial.value))
print('Best trial\'s parameters: ')
for key, value in trial.params.items():
    print('{}: {}'.format(key, value))

In [None]:
# Plot the objective value (validation RMSE) of each trial in the study
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot hyperparameter importances for the study; used to decide which of the
# suggested ranges in `objective` are worth tweaking
optuna.visualization.plot_param_importances(study)

## KFold Cross Validation on Best Model

In [None]:
from sklearn.model_selection import KFold

# Start from the tuned hyperparameters and add the fixed model settings.
# Work on a copy so the parameters stored on `trial` are left untouched.
best_params = dict(trial.params)
best_params['objective'] = 'reg:tweedie'
best_params['tree_method'] = 'gpu_hist'
best_params['predictor'] = 'gpu_predictor'
best_params['n_jobs'] = 4

n_splits = 10
test_preds = None
rmse_list = []

# Train one model per fold; each fold's validation RMSE estimates generalisation error,
# and the test predictions of all fold models are averaged for the submission.
kfold = KFold(n_splits=n_splits, shuffle=True)
for fold_num, (train_index, val_index) in enumerate(kfold.split(X, y), start=1):
    X_train, y_train = X[train_index], y[train_index]
    X_val, y_val = X[val_index], y[val_index]

    model = XGBRegressor(**best_params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse', verbose=False)

    val_predictions = model.predict(X_val)
    rmse_val = my_rmse(y_val, val_predictions)
    print('Fold {fold_num} RMSE: {rmse:.4f}'.format(fold_num=fold_num, rmse=rmse_val))
    rmse_list.append(rmse_val)

    # Accumulate this fold's test predictions into the running sum
    fold_test_preds = model.predict(X_test)
    test_preds = fold_test_preds if test_preds is None else test_preds + fold_test_preds

# Average over the number of folds actually trained. The previous code divided by a
# counter that started at 1 and was incremented after every fold, i.e. by n_splits + 1,
# which scaled every prediction down.
test_preds /= n_splits
print('Average KFold rmse: {avg_rmse:.4f}'.format(avg_rmse = np.mean(np.array(rmse_list))))

## Submission

In [None]:
# Assemble the submission frame: one `loss` prediction per test row, indexed by the test ids
submission_df = pd.DataFrame(data=test_preds, columns=['loss'], index=test_data.index)
submission_df

In [None]:
# Write the submission (index plus the `loss` column) as CSV text, relative to the working directory.
# NOTE(review): the file name has no `.csv` extension — confirm the submission step accepts it.
submission_df.to_csv('TPS August Submission')