# Table of Contents
* [Exploration of Target](#1)
* [Exploration of Features](#2)
* [Target vs Features](#3)
* [TabNet Model](#4)

In [None]:
# install TabNet first
!pip install pytorch-tabnet

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# TabNet and ML tools
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.model_selection import KFold
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [None]:
# read the competition's training set and preview its first rows
train_csv_path = '../input/tabular-playground-series-jan-2021/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()

<a id='1'></a>
# Exploration of Target

In [None]:
# distribution of the target as a 50-bin histogram
ax = df_train['target'].plot.hist(bins=50)
ax.set_title('Target - Histogram')
ax.grid()
plt.show()

In [None]:
# smoothed density estimate of the target
ax = df_train['target'].plot.kde()
ax.set_title('Target - Kernel Density Estimator')
ax.grid()
plt.show()

In [None]:
# box plot of the target, used to spot outliers
ax = df_train['target'].plot.box()
ax.set_title('Target - Boxplot')
ax.grid()
plt.show()

#### Check for the zero value

In [None]:
# rows whose target is exactly zero
is_zero_target = df_train['target'] == 0
df_zero = df_train[is_zero_target]
df_zero

#### This is just one of 300,000 rows, so let's remove it.

In [None]:
# Keep only rows with a strictly positive target, which removes the zero-target row.
# The explicit .copy() makes the filtered frame independent of the frame it was
# sliced from, so columns added to df_train in later cells cannot raise pandas'
# SettingWithCopyWarning.
df_train = df_train[df_train.target > 0].copy()
df_train.target.describe()

<a id='2'></a>
# Exploration of Features

In [None]:
# names of the 14 continuous feature columns: cont1 ... cont14
features = ['cont' + str(k) for k in range(1, 15)]

In [None]:
# summary statistics of every feature column
feature_frame = df_train[features]
feature_frame.describe()

### Feature distributions

In [None]:
# one 100-bin histogram per feature, each in its own figure
for feat_name in features:
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.hist(df_train[feat_name], bins=100)
    ax.set_title(feat_name)
    ax.grid()
    plt.show()

### Correlations

In [None]:
# feature-to-feature correlation matrices, once with Pearson and once with Spearman
feature_values = df_train[features]
corr_pearson = feature_values.corr(method='pearson')
corr_spearman = feature_values.corr(method='spearman')

In [None]:
# annotated heatmap of the Pearson correlation matrix
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_pearson, annot=True, cmap="RdYlGn", ax=ax)
ax.set_title('Pearson Correlation')
plt.show()

In [None]:
# annotated heatmap of the Spearman correlation matrix
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_spearman, annot=True, cmap="RdYlGn", ax=ax)
ax.set_title('Spearman Correlation')
plt.show()

In [None]:
# scatter matrix over all feature pairs (slow to render);
# alpha=0.01 keeps dense regions readable despite heavy overplotting
pair_grid = sns.pairplot(data=df_train[features], kind='scatter',
                         plot_kws=dict(alpha=0.01))
plt.show()

<a id='3'></a>
# Target vs Features

### Scatter Plot

In [None]:
# scatter of target against each feature, with the Pearson correlation in the title
for feat_name in features:
    pearson_r = np.round(
        df_train[feat_name].corr(df_train['target'], method='pearson'), 4)
    fig, ax = plt.subplots(figsize=(7, 7))
    ax.scatter(df_train[feat_name], df_train['target'], alpha=0.01)
    ax.set_title(f'Target vs {feat_name} / corr = {pearson_r!s}')
    ax.set_xlabel(feat_name)
    ax.set_ylabel('Target')
    ax.grid()
    plt.show()

### Visualization based on binned features

In [None]:
# box plot of the target within 10 bins of each feature
for feat_name in features:
    bin_col = feat_name + '_bin'
    # stores the binned feature as an additional '<feature>_bin' column on df_train
    df_train[bin_col] = pd.cut(df_train[feat_name], bins=10, include_lowest=True)
    fig, ax = plt.subplots(figsize=(7, 7))
    sns.boxplot(data=df_train, x=bin_col, y='target', ax=ax)
    plt.xticks(rotation=90)  # interval labels are long; rotate so they do not overlap
    plt.grid()
    plt.show()

<a id='4'></a>
# TabNet Model

Thanks to the following notebook for a quick introduction: [https://www.kaggle.com/elvinagammed/tabnet-regression-baseline](https://www.kaggle.com/elvinagammed/tabnet-regression-baseline).

In [None]:
# read the competition's test set and preview its first rows
test_csv_path = '../input/tabular-playground-series-jan-2021/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head()

In [None]:
# (rows, columns) of the test set
n_test_rows, n_test_cols = df_test.shape
(n_test_rows, n_test_cols)

In [None]:
# model inputs as plain NumPy arrays, with columns ordered as in `features`
X = df_train.loc[:, features].to_numpy()
X_test = df_test.loc[:, features].to_numpy()

# regression target as a 2-D column vector of shape (n_samples, 1)
y = df_train['target'].to_numpy()[:, np.newaxis]

In [None]:
# seeds for the CV split (rnd_seed_cv) and for the regressor (rnd_seed_reg)
rnd_seed_cv = rnd_seed_reg = 1234

In [None]:
# 7-fold cross validation with shuffled, seeded splits
n_cv = 7
kf = KFold(
    n_splits=n_cv,
    shuffle=True,
    random_state=rnd_seed_cv,
)

In [None]:
# train: fit one TabNet model per CV fold and collect per-fold results


def build_regressor():
    """Return a fresh, untrained TabNetRegressor with this notebook's hyper-parameters."""
    return TabNetRegressor(
        n_d=16,             # default: 8
        n_a=16,             # default: 8
        n_steps=4,          # default: 3
        n_independent=2,    # default: 2
        n_shared=2,         # default: 2
        lambda_sparse=0,    # default: 1e-3
        optimizer_params=dict(lr=1e-2, weight_decay=1e-5),  # default: dict(lr=2e-2)
        mask_type='entmax',
        scheduler_fn=ReduceLROnPlateau,
        scheduler_params=dict(mode='min',
                              patience=5,
                              min_lr=1e-4,
                              factor=0.8),
        verbose=1,
        seed=rnd_seed_reg,
    )


# per-fold collectors: score, predictions on the full training matrix,
# predictions on the test matrix, and the training history
CVs, preds_train, preds_test, hists = [], [], [], []

t1 = time.time()
for fit_idx, valid_idx in kf.split(X):
    # rows used for fitting vs. rows held out for validation in this fold
    X_train, y_train = X[fit_idx], y[fit_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    regressor = build_regressor()
    regressor.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric=['rmse'],
        max_epochs=100,
        patience=15,
        batch_size=1024,
    )

    # note: the train predictions cover all of X, i.e. fitted and held-out rows alike
    CVs.append(regressor.best_cost)
    preds_train.append(regressor.predict(X))
    preds_test.append(regressor.predict(X_test))
    hists.append(regressor.history)
t2 = time.time()

print('\nElapsed time [s]: ', np.round(t2-t1,3))

#### For details about the model parameters, see [https://pypi.org/project/pytorch-tabnet/](https://pypi.org/project/pytorch-tabnet/).

In [None]:
# loss history of every fold, one curve per fold
fig, ax = plt.subplots()
for fold in range(n_cv):
    fold_history = hists[fold]
    ax.plot(fold_history['loss'], label=fold)
ax.set_title('CV loss')
ax.set_ylim(0.4, 0.6)  # fixed y-range
ax.grid()
ax.legend(loc='lower left')
plt.show()

In [None]:
# validation RMSE history of every fold, one curve per fold
fig, ax = plt.subplots()
for fold in range(n_cv):
    fold_history = hists[fold]
    ax.plot(fold_history['val_0_rmse'], label=fold)
ax.set_title('CV RMSE')
ax.grid()
ax.legend(loc='lower left')
plt.show()

In [None]:
# learning-rate history of every fold, one curve per fold
fig, ax = plt.subplots()
for fold in range(n_cv):
    fold_history = hists[fold]
    ax.plot(fold_history['lr'], label=fold)
ax.set_title('Learning Rates')
ax.grid()
ax.legend(loc='lower left')
plt.show()

In [None]:
# per-fold scores followed by their mean and standard deviation
cv_scores = np.asarray(CVs)
print(CVs)
print()
print('Mean CV performance [RMSE]:  ', np.round(cv_scores.mean(axis=0), 8))
print('Stdev CV performance [RMSE]: ', np.round(cv_scores.std(axis=0), 8))

In [None]:
# spread of the per-fold predictions on the first n_show training rows
n_show = 100   # number of leading rows to display
my_alpha = 0.5
fig, ax = plt.subplots(figsize=(18, 5))
row_positions = range(n_show)
for fold in range(n_cv):
    ax.scatter(row_positions, preds_train[fold][:n_show], alpha=my_alpha)
ax.grid()
plt.show()

In [None]:
# spread of the per-fold predictions on the first n_show test rows
n_show = 100   # number of leading rows to display
my_alpha = 0.5
fig, ax = plt.subplots(figsize=(18, 5))
row_positions = range(n_show)
for fold in range(n_cv):
    ax.scatter(row_positions, preds_test[fold][:n_show], alpha=my_alpha)
ax.grid()
plt.show()

In [None]:
# final predictions: element-wise mean over the per-fold predictions
pred_train = np.asarray(preds_train).mean(axis=0)
pred_test = np.asarray(preds_test).mean(axis=0)

In [None]:
# side-by-side histograms of the averaged predictions (training data left, test set right)
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(10, 4))

ax_train.hist(pred_train, bins=50)
ax_train.set_title('Predictions on Training Data')
ax_train.grid()

ax_test.hist(pred_test, bins=50)
ax_test.set_title('Predictions on Test Set')
ax_test.grid()

plt.show()

In [None]:
# add predictions to training data
# np.ravel flattens the averaged predictions to 1-D in case they have shape (n, 1),
# so the column assignment does not depend on pandas' implicit handling of 2-D input
# (it is a no-op for input that is already 1-D)
df_train['prediction'] = np.ravel(pred_train)

In [None]:
# joint plot of averaged prediction against actual target on the training data
joint_grid = sns.jointplot(x='target', y='prediction', data=df_train,
                           joint_kws=dict(alpha=0.1))
plt.show()

In [None]:
# prepare submission: reuse the sample file's layout and fill in the averaged test predictions
df_sub = pd.read_csv('../input/tabular-playground-series-jan-2021/sample_submission.csv')
# Bracket assignment always writes the 'target' column; attribute-style assignment
# (df_sub.target = ...) would only set a plain Python attribute if the column were missing.
# np.ravel flattens the predictions to 1-D in case they have shape (n, 1).
df_sub['target'] = np.ravel(pred_test)
df_sub.head()

In [None]:
# write the submission file without the DataFrame index column
df_sub.to_csv(path_or_buf='submission.csv', index=False)