---
# [Tabular Playground Series - Mar 2022][1]
---
**References**:

1: [fast.ai tutorials][2]

2: [Practical Deep Learning for Coders][3]

---
**Comments:** Thanks to the authors of the following excellent notebooks, which this one builds on.

1: [[TPS MAR] EDA+Modeling with Optuna][4]

---
[1]: https://www.kaggle.com/c/tabular-playground-series-mar-2022
[2]: https://docs.fast.ai/
[3]: https://course.fast.ai/
[4]: https://www.kaggle.com/arootda/tps-mar-eda-modeling-with-optuna

## 0. Settings

In [None]:
# Import dependencies 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline

import seaborn as sns

import os
import pathlib
import gc
import sys
import re
import math 
import random
import time 
import datetime as dt
from tqdm import tqdm 
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

import sklearn
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error, mean_absolute_error

import fastai
from fastai.tabular.all import *

import torch
import torch.nn as nn
import torch.nn.functional as F

print('import done!')

In [None]:
# For reproducible results
def seed_all(s):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch,
    and exports ``PYTHONHASHSEED`` to the process environment.

    Args:
        s: Integer seed applied to all generators.
    """
    random.seed(s)
    np.random.seed(s)
    # Weight initialisation, dropout and batch shuffling draw from PyTorch's
    # generator, so it has to be seeded as well.
    torch.manual_seed(s)
    os.environ['PYTHONHASHSEED'] = str(s)
    print('Seeds setted!')


global_seed = 42
seed_all(global_seed)

## 1. Data Preprocessing

### 1.1 Data Check

In [None]:
# Locations of the competition files inside the Kaggle input directory.
data_config = {
    'train_csv_path': '../input/tabular-playground-series-mar-2022/train.csv',
    'test_csv_path': '../input/tabular-playground-series-mar-2022/test.csv',
    'sample_submission_path': '../input/tabular-playground-series-mar-2022/sample_submission.csv',
}

# Read the three CSV files in the order train, test, sample submission.
train_df, test_df, submission_df = (
    pd.read_csv(data_config[path_key])
    for path_key in ('train_csv_path', 'test_csv_path', 'sample_submission_path')
)

train_df.head()

In [None]:
# Null Value Check
# DataFrame.info() writes its report itself and returns None, so it is called
# on its own instead of being wrapped in print() (which printed "None").
for frame_name, frame in (('train_df', train_df), ('test_df', test_df)):
    print(f'{frame_name}.info()')
    frame.info()
    print()

In [None]:
def print_unique_category(df, column):
    """Print the number of distinct values in ``df[column]`` and the values.

    Args:
        df: DataFrame to inspect.
        column: Name of the column whose unique values are reported.
    """
    print(f'unique_category_number: {df[column].nunique()}')
    print(f'categories: {df[column].unique()}')
    print()

# Report the distinct values of each location/direction column in train_df.
for col in ('direction', 'x', 'y'):
    print_unique_category(train_df, col)

In [None]:
# Report the distinct values of each location/direction column in test_df.
for col in ('direction', 'x', 'y'):
    print_unique_category(test_df, col)
test_df.head()

In [None]:
# Preview the first rows of the sample submission loaded from
# 'sample_submission_path'.
submission_df.head()

### 1.2 EDA

In [None]:
# X and Y: one count plot per coordinate, each bar annotated with its count
# and its share of all rows.
f, ax = plt.subplots(1, 2, figsize=(15, 5))
for i, p in enumerate(['x', 'y']):
    # Pass the column by keyword: positional data arguments are deprecated in
    # seaborn and are interpreted differently by newer releases.
    sns.countplot(x=train_df[p], ax=ax[i], edgecolor='black', linewidth=2)
    ax[i].set_xlabel(p, size=15)
    ax[i].set_ylabel('count', size=15)
    # The denominator is the same for every bar, so compute it once per plot.
    total_cnt = train_df[p].count()
    for patch in ax[i].patches:
        x, height, width = patch.get_x(), patch.get_height(), patch.get_width()
        ax[i].text(x + width / 2, height + 5, f'{height} / {height / total_cnt * 100:2.2f}%', va='center', ha='center', size=8, bbox={'facecolor': 'white', 'boxstyle': 'round'})
f.suptitle('Count by X or Y', size=15)
plt.show()

In [None]:
# Directions: count plot of the 'direction' column, each bar annotated with
# its count and its share of all rows.
f, ax = plt.subplots(1, figsize=(15, 5))
# Draw on the axes created above and pass the column by keyword.
sns.countplot(x=train_df['direction'], ax=ax, edgecolor='black', linewidth=2)
ax.set_xlabel('direction', size=15)
ax.set_ylabel('count', size=15)
# Use the plotted column for the denominator instead of a loop variable left
# over from another cell.
total_cnt = train_df['direction'].count()
for patch in ax.patches:
    x, height, width = patch.get_x(), patch.get_height(), patch.get_width()
    ax.text(x + width / 2, height + 5, f'{height} / {height / total_cnt * 100:2.2f}%', va='center', ha='center', size=10, bbox={'facecolor': 'white', 'boxstyle': 'round'})

f.suptitle('Distribution of direction', size=15)
plt.show()

In [None]:
# Congestion: histogram, violin plot and box plot of the target side by side.
f, ax = plt.subplots(1, 3, figsize=(35, 10))
sns.histplot(data=train_df, x='congestion', element='step', ax=ax[0])
# Pass the values as x= explicitly: positional data arguments are deprecated
# in seaborn and are interpreted differently by newer releases.
sns.violinplot(x=train_df.congestion, edgecolor='black', linewidth=2, ax=ax[1])
sns.boxplot(x=train_df.congestion, ax=ax[2])
f.suptitle("congestion's distribution", weight='bold', size=25)
plt.show()

### 1.3 Feature Engineering

In [None]:
# Remove the 'row_id' column from both frames.
train_df, test_df = (
    frame.drop(columns=['row_id']) for frame in (train_df, test_df)
)

In [None]:
dep_var = 'congestion'

# Min-max scale the target to [0, 1]; y_min / y_max are kept so that
# predictions can be mapped back to the original scale later.
y_min, y_max = train_df[dep_var].min(), train_df[dep_var].max()
train_df[dep_var] = train_df[dep_var].sub(y_min).div(y_max - y_min)

train_df.head()

In [None]:
# to handle date
def make_date_features(df, column='time', drop_column=True):
    """Return a copy of ``df`` with calendar features derived from ``column``.

    Adds ``month``, ``day``, ``dayofweek``, ``hour``, ``minute``,
    ``ismonthstart`` and ``ismonthend``. The input frame is left untouched and
    no temporary column is written, so an existing ``datetime`` column is not
    overwritten.

    Args:
        df: Source DataFrame containing ``column``.
        column: Name of the column parsed with ``pd.to_datetime``.
        drop_column: When true, ``column`` is removed from the result.

    Returns:
        A new DataFrame with the derived feature columns appended.
    """
    stamps = pd.to_datetime(df[column])
    # Work on a copy so the caller's frame is not modified as a side effect.
    out = df.copy()
    out['month'] = stamps.dt.month
    out['day'] = stamps.dt.day
    out['dayofweek'] = stamps.dt.dayofweek
    out['hour'] = stamps.dt.hour
    out['minute'] = stamps.dt.minute
    out['ismonthstart'] = stamps.dt.is_month_start
    out['ismonthend'] = stamps.dt.is_month_end
    if drop_column:
        out = out.drop(columns=[column])
    return out

# Replace the raw timestamp column with calendar features in both frames.
train_df, test_df = (make_date_features(frame) for frame in (train_df, test_df))

train_df.head()

In [None]:
# Let fastai split the feature columns into continuous and categorical names.
cont_nn, cat_nn = cont_cat_split(train_df, dep_var=dep_var, max_card=5_000)
print(f'continuous: {cont_nn}')
print(f'categorical: {cat_nn}')
train_df[cat_nn].nunique()

In [None]:
# Batch size shared by every DataLoaders object built in this notebook.
dl_config = {'batch_size': 1024,
            }

# fastai preprocessing steps handed to TabularPandas.
procs_nn = [Categorify, FillMissing, Normalize] 

# Keep an untouched copy (including month 9) for the final re-training step.
train_all = train_df.copy()
# Drop month 9, then re-number the rows so the index labels collected below
# are positions within the filtered frame.
train_df = train_df.query('month!=9').reset_index(drop=True)
# Hold-out split: rows from month 8, day 20 onwards form the validation set;
# every other remaining row is used for training.
train_idx = train_df.query('month!=8 or day<20').index
valid_idx = train_df.query('month==8 and day>=20').index
print(len(train_all), len(train_idx), len(valid_idx))
splits = (list(train_idx),list(valid_idx))


to_nn = TabularPandas(train_df, procs_nn, cat_names=cat_nn, cont_names=cont_nn, 
                      y_names=dep_var, splits=splits)

dls = to_nn.dataloaders(bs=dl_config['batch_size'])

# Sanity check: pull one batch and print the tensor shapes.
cat_x, cont_x, y = dls.one_batch()
print(cat_x.shape, cont_x.shape, y.shape)

dls.show_batch()

## 2. Model Training

### 2.1 fastai TabularLearner

In [None]:
# Hyperparameters of the tabular network and of the one-cycle training run.
model_config = dict(
    epochs=4,
    layers=[500, 250, 100],
    dropout_ps=[0., 0.1, 0.2],
    embed_dropout_p=0.,
    lr_max=1e-3,
    weight_decay=0.1,
    y_range=(0, 1),
)

# Dropout settings are passed to the learner through a fastai tabular_config.
learner_config = tabular_config(
    ps=model_config['dropout_ps'],
    embed_p=model_config['embed_dropout_p'],
)

learn = tabular_learner(
    dls,
    layers=model_config['layers'],
    config=learner_config,
    y_range=model_config['y_range'],
    n_out=1,
    loss_func=F.mse_loss,
    metrics=mae,
)

learn.model  # Check the model architecture

### 2.2 Embedding Customizing

In [None]:
# When True, the next cell rebuilds `learn` with hand-picked embedding sizes.
emb_customize_flg = True

In [None]:
if emb_customize_flg:
    # Hand-picked embedding widths keyed by column name, so the assignment
    # does not depend on the order of dls.cat_names.
    custom_emb_szs = {
        'x': 30,
        'y': 30,
        'direction': 100,
        'month': 30,
        'day': 100,
        'dayofweek': 30,
        'hour': 100,
        'minute': 40,
        'ismonthstart': 20,
        'ismonthend': 20,
    }
    # Columns without an entry are simply left out of the dict.
    # NOTE(review): fastai is expected to fall back to its default sizing rule
    # for names missing from emb_szs -- confirm against get_emb_sz.
    sz_dict = {cat_name: custom_emb_szs[cat_name]
               for cat_name in dls.cat_names if cat_name in custom_emb_szs}

    learn = tabular_learner(dls, y_range=model_config['y_range'],
                        emb_szs=sz_dict,
                        n_out=1, loss_func=F.mse_loss, metrics=mae,
                        layers=model_config['layers'], config=learner_config)
    print(learn.model)

### 2.3 Training

In [None]:
# Run fastai's learning-rate finder on the current learner.
learn.lr_find() # finding the proper learning rate

In [None]:
# One-cycle training using the epoch count, peak learning rate and weight
# decay stored in model_config.
learn.fit_one_cycle(model_config['epochs'],
                    lr_max=model_config['lr_max'],
                    wd=model_config['weight_decay']) # training

# Plot the losses recorded during the run above.
learn.recorder.plot_loss()

In [None]:
# Show the hyperparameter schedule recorded during training.
learn.recorder.plot_sched() # plotting the learning rate during training

In [None]:
# Predictions and targets returned by the learner (no dataloader is passed).
preds, targs = learn.get_preds()
# sklearn's signature is (y_true, y_pred).
valid_mae = mean_absolute_error(targs, preds)
print(valid_mae)
# The target was min-max scaled, so absolute errors grow by (y_max - y_min)
# when mapped back to the original congestion scale.
print(f'MAE in original congestion units: {valid_mae * (y_max - y_min):.4f}')

learn.show_results()

### 2.4 Hyperparameter Tuning with Optuna

In [None]:
# Switch for the Optuna embedding-size search below.
optuna_flg = True
# Number of epochs each trial trains for.
opt_epochs = 3
# Number of trials passed to study.optimize.
n_trials = 100

In [None]:
if optuna_flg:
    
    import optuna
    
    # Tune on months 6-8 only; the index is reset so the labels collected
    # below are positions within this subset.
    opt_train_df = train_df.query('month==6 or month==7 or month==8').reset_index(drop=True)
    # Rows from month 8, day 25 onwards are held out for trial validation.
    opt_train_idx = opt_train_df.query('month!=8 or day<25').index
    opt_valid_idx = opt_train_df.query('month==8 and day>=25').index
    print(len(opt_train_idx), len(opt_valid_idx))
    # NOTE: this rebinds the module-level `splits` created for the main
    # dataloaders.
    splits = (list(opt_train_idx),list(opt_valid_idx))
    
    to_opt = TabularPandas(opt_train_df,
                           procs_nn,
                           cat_names=cat_nn,
                           cont_names=cont_nn, 
                           y_names=dep_var,
                           splits=splits)
    
    opt_dls = to_opt.dataloaders(bs=dl_config['batch_size'])

In [None]:
# Embedding size tuning with Optuna
def objective(trial):
    """Train one learner with trial-suggested embedding sizes and score it.

    For each categorical column an embedding width is sampled between the
    width fastai derives for ``dls`` and a fixed upper bound. A learner is
    then trained on ``opt_dls`` for ``opt_epochs`` epochs.

    Args:
        trial: Optuna trial used to sample the embedding sizes.

    Returns:
        The last value returned by ``learn.validate()`` (the ``mae`` metric
        configured on the learner).
    """
    # Largest embedding width the search may pick for each column.
    upper_bounds = {
        'x': 50,
        'y': 50,
        'direction': 100,
        'month': 50,
        'day': 100,
        'dayofweek': 50,
        'hour': 100,
        'minute': 50,
        'ismonthstart': 50,
        'ismonthend': 50,
    }

    # Look the default widths up once per trial and key them by column name
    # instead of relying on hard-coded positions.
    # NOTE(review): the lower bounds come from `dls` while the model is built
    # on `opt_dls`; kept as in the original search space -- confirm intent.
    default_szs = {name: sz
                   for name, (_, sz) in zip(dls.cat_names, get_emb_sz(dls.train_ds))}

    # Clamp the lower bound so it can never exceed the upper bound.
    sz_dict = {name: trial.suggest_int(name, min(default_szs[name], high), high)
               for name, high in upper_bounds.items()}

    learn = tabular_learner(opt_dls, layers=model_config['layers'],
                            config=learner_config, emb_szs=sz_dict,
                            y_range=model_config['y_range'],
                            n_out=1, loss_func=F.mse_loss, metrics=mae,)

    # Keep the notebook output readable across many trials.
    with learn.no_bar():
        with learn.no_logging():
            learn.fit_one_cycle(opt_epochs,
                                lr_max=model_config['lr_max'],
                                wd=model_config['weight_decay'])

    score = learn.validate()[-1]
    return score

In [None]:
if optuna_flg:
    # Seed the sampler so the sequence of suggested embedding sizes is
    # repeatable, in line with the seeding done at the top of the notebook.
    sampler = optuna.samplers.TPESampler(seed=global_seed)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print(study.best_params)
    print(study.best_value)
    print(study.best_trial)

In [None]:
# `optuna` and `study` only exist when tuning was enabled, so guard the plot.
if optuna_flg:
    # Inside a block the figure is no longer the cell's last expression, so it
    # has to be shown explicitly.
    optuna.visualization.plot_optimization_history(study).show()

In [None]:
# `optuna` and `study` only exist when tuning was enabled, so guard the plot.
if optuna_flg:
    # Inside a block the figure is no longer the cell's last expression, so it
    # has to be shown explicitly.
    optuna.visualization.plot_slice(study).show()

In [None]:
def build_tabular_learner(dls, params):
    """Create a tabular learner whose embedding sizes come from ``params``.

    Args:
        dls: DataLoaders the learner is built on; its ``cat_names`` decide
            which entries of ``params`` are read.
        params: Mapping from categorical column name to embedding size.

    Returns:
        The learner created by ``tabular_learner`` using the notebook-level
        ``model_config`` and ``learner_config``.
    """
    emb_sizes = {}
    for name in dls.cat_names:
        emb_sizes[name] = params[name]
    return tabular_learner(
        dls,
        layers=model_config['layers'],
        config=learner_config,
        emb_szs=emb_sizes,
        y_range=model_config['y_range'],
        n_out=1,
        loss_func=F.mse_loss,
        metrics=mae,
    )

# Rebuild the learner on the tuning dataloaders with the best embedding sizes
# found by the study and print its architecture.
if optuna_flg:
    learn = build_tabular_learner(opt_dls, study.best_params)
    print(learn.model)

### 2.5 Re-training with all data

In [None]:
procs_nn = [Categorify, FillMissing, Normalize]

# Keep only months 6-9 of the full training copy and re-number the rows.
train_all = train_all[train_all['month'].isin([6, 7, 8, 9])].reset_index(drop=True)
# No splits are passed here, unlike the earlier TabularPandas objects.
to_all_nn = TabularPandas(
    train_all,
    procs_nn,
    cat_names=cat_nn,
    cont_names=cont_nn,
    y_names=dep_var,
)
print(len(to_all_nn.train))
all_dls = to_all_nn.dataloaders(bs=dl_config['batch_size'])

# Use the tuned embedding sizes when the Optuna search ran; otherwise build a
# learner without an explicit emb_szs argument.
learn = (
    build_tabular_learner(all_dls, study.best_params)
    if optuna_flg
    else tabular_learner(
        all_dls,
        y_range=model_config['y_range'],
        n_out=1,
        loss_func=F.mse_loss,
        metrics=mae,
        layers=model_config['layers'],
        config=learner_config,
    )
)
print(learn.model)
learn.lr_find()

In [None]:
# Train the re-built learner with the same one-cycle settings as before
# (epochs, peak learning rate and weight decay from model_config).
learn.fit_one_cycle(model_config['epochs'],
                    lr_max=model_config['lr_max'],
                    wd=model_config['weight_decay'])

## 3. Prediction and Submission

In [None]:
# Build a test dataloader from test_df via the learner's DataLoaders.
test_dl = learn.dls.test_dl(test_df)
# Get predictions for that dataloader; element 0 holds the predictions used
# in the next cell.
pred = learn.get_preds(dl=test_dl)
pred

In [None]:
# Undo the min-max scaling applied to the target, then drop the trailing
# axis of size 1.
rescaled = pred[0] * (y_max - y_min) + y_min
test_pred = np.array(rescaled).squeeze(axis=1)

# Predictions are submitted unrounded; the rounded-integer variant
# (test_pred.round().astype(int).tolist()) is intentionally not used.
submission_df['congestion'] = test_pred

submission_df.to_csv('nn_submission.csv', index=False)
submission_df.head()