In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from fastai.tabular.all import *
import optuna
from optuna.integration import FastAIV2PruningCallback

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Please upvote if you found this useful 

# Get data and split into continuous and categorical variables

Note that for this dataset we'll treat all the features as **continuous**

In [None]:
# Folder holding the competition CSV files.
path = Path('/kaggle/input/tabular-playground-series-aug-2021')
# NOTE(review): BASE_PATH is presumably fastai's hook for displaying paths
# relative to this folder - confirm against the fastai docs.
Path.BASE_PATH = path

# Read the whole training table in one pass, then preview the first rows.
df = pd.read_csv(path.joinpath('train.csv'), low_memory=False)
df.head()

In [None]:
# Column the model has to predict.
dep_var = 'loss'

# Let fastai sort the remaining columns into continuous / categorical lists.
cont, cat = cont_cat_split(df, dep_var=dep_var)

# 'id' must not be used as a feature (presumably it is just a row identifier).
cont.remove('id')

(len(cont), len(cat))

# Helper functions

We'll use RMSE (Root Mean Squared Error) for the loss function.
Let's also make helper functions to create `DataLoaders` and to train our Neural Net (NN)

In [None]:
def rmse(y_hat, y):
    """
    Root Mean Squared Error: sqrt(mean((y_hat - y)**2)).

    The squared errors are averaged over all elements (`torch.mean`, i.e. the
    sum is divided by the number of elements) before the square root is taken.

    Args:
        y_hat: tensor of predictions.
        y: tensor of targets.

    Returns:
        A 0-dim tensor holding the RMSE.

    Note: when both tensors hold the same number of elements but have
    different shapes (e.g. `(n, 1)` predictions vs `(n,)` targets) they are
    flattened first. Without this the subtraction would broadcast to an
    `(n, n)` matrix and silently produce a wrong loss.
    """
    if y_hat.shape != y.shape and y_hat.numel() == y.numel():
        y_hat, y = y_hat.reshape(-1), y.reshape(-1)
    return torch.sqrt(torch.mean((y_hat - y)**2))

In [None]:
def create_dls(df, bs = 2048, valid_pct = 0.25, seed = None):
    """
    Build fastai `DataLoaders` for the tabular regression task.

    Relies on the module-level `dep_var`, `cont` and `cat` names for the
    target column and the continuous / categorical feature lists.

    Args:
        df: DataFrame holding the features and the target column.
        bs: batch size passed to `dataloaders`.
        valid_pct: fraction of rows randomly held out for validation.
        seed: optional seed forwarded to `RandomSplitter` so the
            train/validation split can be reproduced. `None` (the default)
            keeps the split unseeded.

    Returns:
        The `DataLoaders` produced by `TabularPandas.dataloaders`.
    """
    splits = RandomSplitter(valid_pct, seed=seed)(range_of(df))
    to = TabularPandas(df,
                       procs=[Normalize],
                       y_names=dep_var,
                       cont_names=cont,
                       cat_names=cat,
                       splits=splits,
                       reduce_memory=False)
    return to.dataloaders(bs=bs)

In [None]:
# Early-stopping callback shared by both training phases of `train_nn`:
# it monitors `valid_loss` with patience=2 and min_delta=0.01.
early_stopping_cbs = [
    EarlyStoppingCallback(monitor='valid_loss', patience=2, min_delta=0.01),
]

def train_nn(dls, layers, ps, wd, y_range_eps, cbs):
    """
    Build a fastai tabular learner and train it in two one-cycle phases.

    Phase 1 runs 2 epochs at lr 1e-3; phase 2 runs up to 55 epochs at
    lr 1e-3 / 100. Both phases use `early_stopping_cbs` plus `cbs`.

    Args:
        dls: DataLoaders to train on.
        layers: list of hidden layer sizes.
        ps: dropout probabilities, forwarded to the model config as `ps`.
        wd: weight decay passed to the learner.
        y_range_eps: margin subtracted from the minimum / added to the
            maximum training target to form the learner's `y_range`.
        cbs: extra callbacks used in addition to `early_stopping_cbs`
            (`None` is treated as "no extra callbacks").

    Returns:
        The trained learner.
    """
    y_train = dls.train.y
    learn = tabular_learner(dls, layers=layers,
                            y_range=(y_train.min() - y_range_eps, y_train.max() + y_range_eps),
                            config={'ps': ps, 'act_cls': nn.LeakyReLU(inplace=True)},
                            wd=wd,
                            loss_func=rmse)
    # The learning rates below are fixed, so no LR-finder pass is run: its
    # suggestion would go unused while costing extra time on every call.
    all_cbs = early_stopping_cbs + list(cbs or [])
    learn.fit_one_cycle(2, 1e-3, cbs=all_cbs)
    learn.fit_one_cycle(55, 1e-3 / 100, cbs=all_cbs)
    return learn

# Optuna Hyperparameter Tuning
I just learned about Optuna and it's been a game changer. Now I don't have to painstakingly fiddle with hyperparameters. Instead I just specify a range of values and Optuna takes care of the rest. 

It's honestly so convenient it almost feels like a cheat code. 

Notice how in our `objective` function, we always decrease the layer sizes. This is because it doesn't really make sense to increase the layer sizes through the network: increasing them means we're asking the model to generate more numbers from fewer numbers. Instead, it makes sense for the layer sizes to **decrease**, because then we're asking the model to take a large number of features and turn them into a smaller, more feature-rich representation. 

In [None]:
dls = create_dls(df, bs=8192, valid_pct=0.5)

def objective(trial: optuna.Trial, y_range_eps: float):
    """
    Optuna objective: sample a network configuration, train it, and return
    the resulting validation loss.

    Sampled parameters:
        num_layers: number of hidden layers (1-3).
        wd: weight decay in [0.0, 1.0].
        layer_0..layer_2: candidate layer widths from {512, 256, 128}.
        dropout_0..dropout_2: dropout in [0.0, 0.95] in steps of 0.05.

    Only the first `num_layers` widths and dropouts are used. The used widths
    are sorted in descending order so the network never widens from one layer
    to the next; as a consequence `layer_i` in the trial parameters is not
    necessarily the width of the i-th layer.

    Args:
        trial: the Optuna trial used to sample the parameters and to report
            intermediate values to the pruning callback.
        y_range_eps: forwarded to `train_nn`.

    Returns:
        The first value returned by `learn.validate()` (the validation loss).
    """
    max_layers = 3
    num_layers = trial.suggest_int('num_layers', 1, max_layers)
    wd = trial.suggest_float('wd', 0.0, 1.0)
    layer_sizes = [
        trial.suggest_categorical(f'layer_{i}', [512, 256, 128])
        for i in range(max_layers)
    ]
    # `suggest_float(..., step=)` replaces the deprecated `suggest_discrete_uniform`.
    ps = [
        trial.suggest_float(f'dropout_{i}', 0.0, 0.95, step=0.05)
        for i in range(max_layers)
    ]
    # Enforce non-increasing widths through the network.
    layers = sorted(layer_sizes[:num_layers], reverse=True)
    learn = train_nn(dls, layers, ps[:num_layers], wd,
                     y_range_eps=y_range_eps,
                     cbs=[FastAIV2PruningCallback(trial, monitor='valid_loss')])
    return learn.validate()[0]

In [None]:
# The hyperparameter search can take up to two hours, so it is disabled by
# default. Set RUN_OPTUNA_STUDY = True to run it.
RUN_OPTUNA_STUDY = False

# Stays None unless the search is actually run.
study = None
if RUN_OPTUNA_STUDY:
    timeout = 3600 * 2  # seconds
    study = optuna.create_study(direction='minimize',
                                pruner=optuna.pruners.MedianPruner(n_warmup_steps=4, interval_steps=2))
    study.optimize(lambda trial: objective(trial, y_range_eps=0.00),
                   n_trials=35, timeout=timeout)

In [None]:
# `study` is only available when the Optuna search has actually been run.
if globals().get('study') is not None:
    optuna.visualization.plot_optimization_history(study).show()
else:
    print('Optuna study has not been run - skipping the optimization history plot.')

In [None]:
# `study` is only available when the Optuna search has actually been run.
if globals().get('study') is not None:
    optuna.visualization.plot_param_importances(study).show()
else:
    print('Optuna study has not been run - skipping the parameter importance plot.')

In [None]:
# `study` is only available when the Optuna search has actually been run.
if globals().get('study') is not None:
    print('Number of finished trials:', len(study.trials))
    print('Best trial parameters:', study.best_trial.params)
    print('Best score:', study.best_value)
else:
    print('Optuna study has not been run - no tuning results to report.')

# Optuna Results

For this dataset I have not been able to get below ~7.90 on the validation set using a NN, nor below 7.94 on the final evaluation set used by Kaggle.

In a way this is expected, as XGBRegressor and CatBoostRegressor will often perform better on tabular datasets. Exactly why NNs cannot achieve similar performance is something I'm still looking into. With the XGB and CatBoost regressors, I've been able to achieve 7.87 but still cannot hit the SOTA of 7.85. If you have any suggestions, tips, or comments, please share them below! 

# Train Model using Optuna Hyperparameters

In [None]:
# Train the final model with fixed hyperparameters and no extra callbacks.
learn = train_nn(
    dls,
    layers=[512, 256],
    ps=[0.6, 0.7],
    wd=0.373,
    y_range_eps=0.0,
    cbs=[],
)

# Create Submission

In [None]:
# Load the test table and wrap it in a DataLoader derived from the learner's
# own DataLoaders.
test_df = pd.read_csv(path.joinpath('test.csv'))
test_dl = learn.dls.test_dl(test_df)

# Preview one batch of the test DataLoader.
test_dl.show_batch()

In [None]:
# get_preds returns a pair; only its first element (the predictions) is used.
# It is converted to a flat 1-D NumPy array.
preds = learn.get_preds(dl=test_dl)[0].numpy().flatten()

In [None]:
# Pair every test id with its predicted loss, then preview the result.
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'loss': preds,
})
submission_df.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)

## Please upvote if you found this useful