# Fast.ai on Feb Playground

This notebook uses fast.ai to train a neural network on the data of the Kaggle Tabular Playground Series (February 2021).
All data analysis is done in a separate notebook.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import packages and load data

In [None]:
import random
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor

from fastai import *
from fastai.tabular.all import *
import torch

# Use seaborn's "white" style for every plot in this notebook.
sns.set_theme(style="white")
# Diverging colormap (hues 230 and 20 at the two ends); used for the correlation heatmap in the EDA section.
cmap_div = sns.diverging_palette(230, 20, as_cmap=True)

In [None]:
# Fix the seed of every random number generator used in this notebook.
SEED = 42

# Seed Python, NumPy and PyTorch (CPU, current CUDA device, all CUDA devices), in that order.
for seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_rng(SEED)

# Make cuDNN pick deterministic algorithms and switch off its benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
!ls ../input/tabular-playground-series-feb-2021

In [None]:
# Directory holding the competition files.
dpath = Path('../input/tabular-playground-series-feb-2021')

# Read the three competition CSV files (sample submission, test set, training set).
sample_sub, test_raw, train_raw = (
    pd.read_csv(dpath / csv_name)
    for csv_name in ('sample_submission.csv', 'test.csv', 'train.csv')
)

## EDA

Just a very brief look at the data as a sanity check.

In [None]:
# Create one DataFrame from deep copies of the train and test frames
# for an easier overview of the data.
full = pd.concat([raw.copy(deep=True) for raw in (train_raw, test_raw)])

In [None]:
full.head()

In [None]:
# Quite a few continuous features are strongly correlated.
# Correlation is only defined for numeric columns, so restrict the frame to them
# explicitly: pandas >= 2.0 raises on non-numeric columns (presumably the cat*
# features here) instead of silently skipping them as older versions did.
numeric_cols = full.drop(columns=['id']).select_dtypes(include='number')
sns.heatmap(numeric_cols.corr(), cmap=cmap_div);

In [None]:
# Luckily no columns have NA values.
# Lists only the columns with at least one NA, together with their NA count.
# NOTE(review): `target` may be listed here, presumably because `full` also holds
# the test rows, which carry no target - confirm against the output.
full.isna().sum().to_frame(name='NA').query('NA > 0')

In [None]:
# No duplicated entries: counts the rows whose values (ignoring `id` and `target`)
# repeat those of an earlier row.
full.drop(columns=['id', 'target']).duplicated().sum()

### TODOs
* look for temporal correlation
* ...


# Feature Engineering

For fast.ai no (or almost no) feature engineering is done.

In [None]:
def one_hot_encoding(df):
    """One-hot encode the categorical columns of ``df``.

    Delegates to ``pandas.get_dummies`` with its default settings and
    returns the encoded DataFrame.
    """
    return pd.get_dummies(df)

In [None]:
def split_data(df):
    """Separate a frame into features, target and ids.

    The index is reset to 0 .. n-1 first, so the result does not depend on
    any transformation applied before.

    Returns a tuple ``(x, y, id)``: ``x`` holds all columns except ``id`` and
    ``target``, ``y`` is the ``target`` column (``None`` when the frame has
    none) and ``id`` is the ``id`` column.
    """
    frame = df.reset_index(drop=True)
    ids = frame['id']
    features = frame.drop(columns=['id'])

    # Unlabeled data (e.g. the test set): nothing to split off.
    if 'target' not in features.columns:
        return (features, None, ids)

    return (features.drop(columns=['target']), features['target'], ids)

In [None]:
def split_data_fastai(df):
    """Split the ``id`` column off a frame and keep everything else together.

    The index is reset to 0 .. n-1 first, so the result does not depend on
    any transformation applied before.

    Returns a tuple ``(frame_without_id, id_column)``; a ``target`` column,
    if present, stays in the frame.
    """
    frame = df.reset_index(drop=True)
    return (frame.drop(columns=['id']), frame['id'])

In [None]:
def apply_all(df, funs, debug=False):
    """Pipe a DataFrame through a sequence of functions.

    Each function in ``funs`` is called with the result of the previous one,
    starting with ``df``. With ``debug=True`` the name of every function is
    printed before it is applied. Returns the result of the last function
    (``df`` itself when ``funs`` is empty).
    """
    result = df
    for step in funs:
        if debug:
            print(f'Apply {step.__name__}')
        result = step(result)
    return result

## Naive Model

Here I calculate the performance of a naive model, which always predicts the median of the target.

In [None]:
# Work on a deep copy of the raw training frame.
train = train_raw.copy(deep=True)


def prep_triv(df):
    """One-hot encode ``df`` and split it into features, target and ids."""
    return apply_all(df, [one_hot_encoding, split_data])


# The ids are bound to `ids` (not `id`) so that Python's built-in id() is not
# shadowed for the rest of the notebook.
x, y, ids = prep_triv(train)

# Baseline that always predicts the median of the target it was fitted on.
triv_model = DummyRegressor(strategy='median')

# 5-fold cross-validation; the scorer reports the *negated* RMSE.
scores = cross_validate(triv_model, x, y, cv=5,
                        scoring='neg_root_mean_squared_error',
                        n_jobs=-1)
# Also fit the estimator itself on the full training data.
triv_model.fit(x, y)


# Flip the sign back to report a positive RMSE.
print(f'triv_model - RMSE: {np.mean(scores["test_score"]*-1)}')

## fast.ai

Since the dataset is relatively large, the employed model can also be quite big.
The data contains no missing values, so there is no need for much preprocessing here.
I've used a small dropout probability for the embedding and linear layers. Setting the dropout to zero leads to very unstable training.

In [None]:
# Work on a deep copy of the raw training frame.
train = train_raw.copy(deep=True)


def prep_nn1(df):
    """Prepare a raw frame for fast.ai: returns ``(frame without id, id column)``."""
    return apply_all(df, [split_data_fastai])


train, train_ids = prep_nn1(train)

# No explicit device setup here: a bare `torch.device('cuda')` call only builds a
# device object and enables nothing.
# NOTE(review): fast.ai is expected to pick the GPU on its own when one is
# available - confirm with `dls.device` if in doubt.

cont_names = [f'cont{i}' for i in range(14)]  # the continuous variables
cat_names = [f'cat{i}' for i in range(10)]  # the categorical variables
procs = [Categorify, Normalize]  # fast.ai preprocessing steps
dep_var = 'target'  # our target variable

# To validate the results, a random 25% of the training rows is held out.
# The notebook-wide SEED keeps the split reproducible.
splits = RandomSplitter(valid_pct=0.25, seed=SEED)(train.index)

# Small dropout probability for the embedding (embed_p) and linear (ps) layers.
cfg = tabular_config(embed_p=0.10, ps=0.10)

# NOTE(review): bs=2056 is kept as found; possibly 2048 was intended.
dls = TabularPandas(train,
                    cont_names=cont_names,
                    cat_names=cat_names,
                    procs=procs,
                    y_names=dep_var,
                    splits=splits).dataloaders(bs=2056)


# Save the model to 'model_triv_best' whenever the monitored value gets smaller
# (np.less) by at least min_delta.
# NOTE(review): '_rmse' is presumably the name under which fast.ai records the
# `rmse` metric - confirm against the training output columns.
callbacks = [SaveModelCallback(min_delta=0.001, monitor='_rmse', comp=np.less, fname='model_triv_best')]

learn = tabular_learner(dls, layers=[2000,500], metrics=[rmse], config=cfg)

In [None]:
learn.lr_find()

In [None]:
learn.fit_one_cycle(45, lr_max=2e-2, cbs=callbacks)

In [None]:
learn.lr_find()

In [None]:
# fast.ai only reads the start and stop of a slice passed as lr_max (to spread the
# learning rate across parameter groups); a slice *step* is not used. A
# three-argument `slice(1e-6, 1e-6, 1e-2)` therefore amounts to a flat 1e-6, so
# the scalar is passed directly.
# NOTE(review): if a range up to 1e-2 was intended, use `slice(1e-6, 1e-2)` instead.
learn.fit_one_cycle(45, lr_max=1e-6, cbs=callbacks)

In [None]:
learn.recorder.plot_loss()

The final result is not much better than the trivial model. Nevertheless, I submit these first results.

In [None]:
# Prepare a deep copy of the raw test frame the same way as the training frame.
test, test_id = prep_nn1(test_raw.copy(deep=True))

# Build a test DataLoader from the learner's DataLoaders and predict on it.
test_dl = learn.dls.test_dl(test)
preds, _ = learn.get_preds(dl=test_dl)
preds = preds.numpy()[:, 0]  # first output column as a 1-D array

# One row per test id with its predicted target, written as the submission file.
submission = pd.DataFrame({'id': test_id, 'target': preds})
submission.to_csv('submission_trivial_nn.csv', index=False)

Finally, also run the model on the full training dataset and store the predictions for the training and test sets for later use.

In [None]:
# Predict on every row of the prepared training frame (`train`).
full_train_dl = learn.dls.test_dl(train)
preds, _ = learn.get_preds(dl=full_train_dl)
preds = preds.numpy()[:, 0]  # first output column as a 1-D array

# One row per training id with its predicted target.
full_train_results = pd.DataFrame({'id': train_ids, 'target': preds})

In [None]:
!mkdir -p '/kaggle/working/Feb2021Playground/FastAi'

In [None]:
# Write the test and training predictions as CSV files (without the index column).
for frame, csv_path in (
    (submission, '/kaggle/working/Feb2021Playground/FastAi/test_results_fastai.csv'),
    (full_train_results, '/kaggle/working/Feb2021Playground/FastAi/train_results_fastai.csv'),
):
    frame.to_csv(csv_path, index=False)