## Introduction
This notebook experiments with the TabTransformer, which learns contextual embeddings to achieve higher prediction accuracy.  
In a supervised-only setting, the architecture performs on par with GBDTs on benchmarks.  
  
Improvements so far include:
 - Discretising continuous features to pass to transformer blocks
 - Training with a flat learning rate
 - This version experiments with feeding only column embeddings to the transformer 
 
So far the best local CV is ~*85* AUROC, using 242 continuous columns, and 242 (discretised cont) + 43 (cat) categorical features.    
Thanks for reading
 

In [None]:
import numpy as np
import pandas as pd
import scipy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import QuantileTransformer, KBinsDiscretizer
from sklearn.model_selection import StratifiedKFold
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR
from tqdm import tqdm
from torchmetrics import AUROC
import matplotlib.pyplot as plt
import gc, sys, random, warnings
gc.enable()
warnings.filterwarnings("ignore")

In [None]:
!pip install tab-transformer-pytorch
from tab_transformer_pytorch import TabTransformer

In [None]:
def seed_all(seed_value):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch's CPU generator.
    When CUDA is available, the GPU generators are seeded as well and cuDNN
    is switched to deterministic mode.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    # Nothing GPU-related to configure on a CPU-only session.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True


seed_all(123)

In [None]:
# Only the first 500k training rows are read: the full file causes memory
# issues on a Kaggle CPU session.
train_df = pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv', nrows = 500000)
test_df =  pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')

# Drop the `id` column before any feature processing. The axis is given by
# keyword (`columns=`): passing it positionally, as in `drop('id', 1)`, is
# rejected by pandas 2.x.
train_df.drop(columns='id', inplace=True)
test_df.drop(columns='id', inplace=True)

## Exploration and Feature Engineering

`train_df` has a mix of continuous and potentially one-hot encoded categorical features.  
Integer features start at index 242. We'll save the columns for preprocessing/splitting later

In [None]:
# Positional split of the columns: the first 242 are treated as continuous,
# the rest (excluding the final column, presumably `target`) as categorical.
_first_int_col = 242
cont_cols = train_df.columns[:_first_int_col]
cat_cols = train_df.columns[_first_int_col:-1]

We'll scale the data here with a Gauss Rank transform, with code originally from the MOA competition:  
  https://www.kaggle.com/kushal1506/moa-pytorch-0-01859-rankgauss-pca-nn

In [None]:
train_scaled, test_scaled = train_df.copy(), test_df.copy()
for col in cont_cols:
    # A fresh transformer per column, fitted on the training values only and
    # then applied to both the train and the test column.
    transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")

    # scikit-learn expects 2-D input, so each column becomes an (n, 1) array.
    train_column = train_scaled[col].values.reshape(-1, 1)
    test_column = test_scaled[col].values.reshape(-1, 1)

    transformer.fit(train_column)
    train_scaled[col] = transformer.transform(train_column).ravel()
    test_scaled[col] = transformer.transform(test_column).ravel()

In [None]:
# Compare the distribution of feature f0 before and after the transform,
# one histogram per subplot.
fig, axs = plt.subplots(2)
panels = (
    ('F0 dist before Gauss Rank', train_df.f0),
    ('F0 dist after Gauss Rank', train_scaled.f0),
)
for ax, (title, values) in zip(axs, panels):
    ax.title.set_text(title)
    ax.hist(values, bins = 100)

plt.tight_layout()
plt.show()

In [None]:
# The unscaled frames are not used after this point; release them.
del train_df, test_df
gc.collect()

## Discretising Continuous Features 
Here we discretise the scaled features using a `KBinsDiscretizer`.  
For this version we'll replace the continuous features in place.

In [None]:
# 50 equal-width bins per continuous column, encoded as ordinal bin indices.
# The bin edges are learned from the training data and reused for the test data.
disc = KBinsDiscretizer(n_bins=50, encode='ordinal', strategy='uniform')
disc.fit(train_scaled[cont_cols])
train_scaled[cont_cols] = disc.transform(train_scaled[cont_cols])
test_scaled[cont_cols] = disc.transform(test_scaled[cont_cols])

In [None]:
# Separate the labels from the features and switch from DataFrames to plain
# NumPy arrays for the Dataset classes below.
y_train = train_scaled.target.values
# `columns=` replaces the positional axis of `drop('target', 1)`, which
# pandas 2.x rejects.
train_scaled = train_scaled.drop(columns='target').values
test_scaled = test_scaled.values

## Utilities

In [None]:
class AverageMeter:
    """Keeps the most recent value and a running weighted mean of a series.

    Attributes:
        val: the last value passed to ``update``.
        sum: weighted sum of all values seen since the last ``reset``.
        count: total weight seen since the last ``reset``.
        avg: ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
class EarlyStopping:
    """Stop training once the monitored score stops improving.

    The object is called once per epoch with the latest validation score.
    Whenever the score is at least as good as the best one seen so far, the
    model weights are written to ``model_path``. After ``patience``
    consecutive epochs without such a score, ``early_stop`` is set to True.

    Args:
        patience: number of consecutive non-improving epochs tolerated.
        mode: ``"max"`` if a larger score is better, ``"min"`` if smaller is.
        delta: stored on the instance but not used in the comparison.
        verbose: when truthy, print progress messages.
    """

    def __init__(self, patience=7, mode="max", delta=0.001, verbose = None):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        self.verbose = verbose
        # `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
        self.val_score = np.inf if self.mode == "min" else -np.inf

    def __call__(self, epoch_score, model, model_path):
        """Register the score of one epoch.

        Args:
            epoch_score: scalar validation score (number or 0-d tensor/array).
            model: module whose ``state_dict`` is saved on improvement.
            model_path: destination file for the checkpoint.
        """
        value = float(epoch_score)
        # Internally a larger score is always better.
        score = -value if self.mode == "min" else value

        # A NaN/inf score can never count as an improvement. Without this
        # guard a NaN would become `best_score`, and since every comparison
        # against NaN is False the counter would reset on every later epoch.
        no_improvement = not np.isfinite(score) or (
            self.best_score is not None and score < self.best_score
        )

        if no_improvement:
            self.counter += 1
            if self.verbose:
                print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
            return

        self.best_score = score
        self.save_checkpoint(epoch_score, model, model_path)
        self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        """Save the model weights if ``epoch_score`` is finite, then remember the score."""
        # `np.isfinite` catches every NaN; a membership test against a list
        # containing `np.nan` only matches that exact object.
        if np.isfinite(float(epoch_score)):
            if self.verbose:
                print('Validation score improved ({:.4f} --> {:.4f}). Saving model!'.format(self.val_score, epoch_score))

            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

In [None]:
# def split_df(df):
#     _cont = df.filter(cont_cols).values
#     _cat = df.filter(cat_cols).values
#     return _cont, _cat

# train_cont, train_cat = split_df(train_scaled)
# test_cont, test_cat = split_df(test_scaled)

## Dataset

In [None]:
# Updated to pass a constant cont value in this version
class TabDataset(Dataset):
    """Dataset that serves one row of categorical codes per item.

    Each item is a dict with:
        'cont': a one-element tensor of ones (placeholder continuous input),
        'cat': the row ``cat[idx]`` converted to an int64 tensor,
        'target': float tensor holding ``target[idx]`` (only when a target
            was supplied).
    """

    def __init__(self, cat, target = None):
        super().__init__()
        self.cat = cat
        self.target = target

    def __len__(self):
        return len(self.cat)

    def __getitem__(self, idx):
        sample = {
            'cont': torch.ones(1),
            'cat': torch.as_tensor(self.cat[idx], dtype = torch.long),
        }

        # Unlabelled data (e.g. the test set) has no 'target' entry.
        if self.target is None:
            return sample

        label = self.target[idx].item()
        sample['target'] = torch.tensor(label, dtype = torch.float)
        return sample

## Trainer
The Trainer manages context and training for single epochs  
  
A `metric` object is created in evaluation to compute AUROC, which is used for saving weights/early stopping  

In [None]:
class Trainer:
    """Runs single training and evaluation epochs for a model.

    Args:
        model: module called as ``model(cat, cont)``.
        device: device that every batch is moved to.
        loss_fn: callable ``loss_fn(output, target)`` returning a scalar loss.
        opt: optimizer stepped once per training batch.
        scheduler: optional LR scheduler, stepped once per training batch.
    """

    def __init__(self, model, device, loss_fn, opt, scheduler = None):
        self.model = model
        self.device = device
        self.loss_fn = loss_fn
        self.opt = opt
        self.scheduler = scheduler

    def _forward(self, batch):
        """Move one batch to the device and return ``(output, target, batch_size)``."""
        cont = batch["cont"].to(self.device)
        cat = batch['cat'].to(self.device)
        target = batch['target'].to(self.device)
        # Drop the trailing output dimension so the output lines up with `target`.
        out = self.model(cat, cont).squeeze(-1)
        return out, target, cont.size(0)

    def fit_one_epoch(self, dl):
        """Train for one pass over ``dl``.

        Returns:
            The sample-weighted mean training loss of the epoch.
        """
        self.model.train()
        losses = AverageMeter()
        prog_bar = tqdm(dl, total = len(dl), file=sys.stdout, leave = False)

        for d in prog_bar:
            # Clear gradients before the backward pass so stale gradients
            # from any earlier call can never leak into this step.
            self.opt.zero_grad()

            out, target, batch_size = self._forward(d)
            loss = self.loss_fn(out, target)
            prog_bar.set_description('loss: {:.2f}'.format(loss.item()))
            losses.update(loss.item(), batch_size)
            loss.backward()
            self.opt.step()

            if self.scheduler:
                self.scheduler.step()

        return losses.avg

    def eval_one_epoch(self, dl, **kwargs):
        """Evaluate on ``dl`` and print the validation loss and AUROC.

        Args:
            dl: validation data loader.
            **kwargs: optional ``fold`` and ``epoch``, used only in the
                printed message.

        Returns:
            The AUROC over the whole loader, as a CPU tensor.
        """
        self.model.eval()
        losses = AverageMeter()
        # NOTE(review): recent torchmetrics releases appear to require a
        # `task` argument here - confirm against the installed version.
        metric = AUROC()
        prog_bar = tqdm(dl, total = len(dl), file=sys.stdout, leave = False)

        with torch.no_grad():
            for d in prog_bar:
                out, target, batch_size = self._forward(d)
                loss = self.loss_fn(out, target)
                # Only accumulate state here; the metric is computed once,
                # over all batches, after the loop.
                metric.update(out, target.int())
                losses.update(loss.item(), batch_size)

        auroc = metric.compute()
        fold, epoch = kwargs.get('fold', '?'), kwargs.get('epoch', '?')
        print(f"F{fold} E{str(epoch):2s}"\
              f"  Valid Loss: {losses.avg:.4f}  AUROC Score: {auroc:.4f}")
        return auroc.cpu()

## Model Architecture

<img align="left" src=https://raw.githubusercontent.com/lucidrains/tab-transformer-pytorch/main/tab.png>

The TabTransformer uses both categorical and continuous features to model tabular data.  
Only the categorical features/column embeddings are passed to the self-attention layers with this architecture.   
  
In our case, only features $F242 ... F284$ are passed to the transformer blocks.  
  It could be interesting to experiment with discretising the continuous features and passing them to the transformer as well

## Training

In [None]:
class cfg:
    """Training configuration, used as a namespace of class attributes."""

    bs = 400        # batch size of the data loaders
    n_splits = 5    # number of cross-validation folds
    seed = 2021     # random_state of the fold splitter
    epochs = 3      # epochs trained per fold
    lr = 2e-5       # optimizer learning rate

    @staticmethod
    def checkpoint(fold):
        """Return the checkpoint file name for ``fold``.

        A staticmethod rather than a lambda attribute, so the call works on
        the class and on an instance alike.
        """
        return f'full_cat_{fold}.pt'
    
# Shuffled, stratified folds; the (train_idx, valid_idx) pairs are
# materialised once so every later cell sees the same split.
kfold = StratifiedKFold(n_splits=cfg.n_splits, shuffle=True, random_state=cfg.seed)
splits = list(kfold.split(train_scaled, y_train))

In [None]:
# Keyword arguments for the TabTransformer constructor.
transformer_cfg = dict(
    # number of unique values per categorical feature:
    # 242 discretised columns (50 bins each) followed by 43 binary columns
    categories=[50] * 242 + [2] * 43,
    # continuous dimensions in data
    num_continuous=1,
    # hidden dim, paper set at 32
    dim=32,
    # binary prediction
    dim_out=1,
    # depth, paper recommended 6
    depth=3,
    # heads, paper recommends 8
    heads=6,
    # post-attention dropout
    attn_dropout=0.1,
    # feed forward dropout
    ff_dropout=0.1,
    # relative multiples of each hidden dimension of the last mlp to logits
    mlp_hidden_mults=(4, 2),
    # activation for final mlp, defaults to relu
    mlp_act=nn.GELU(),
    # normalize the continuous values before layer norm (optional)
    continuous_mean_std=torch.randn(1, 2),
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
def create_dataloaders(fold):
    """Build the train and validation loaders for one cross-validation fold.

    Args:
        fold: index into the precomputed ``splits``.

    Returns:
        ``(train_dl, valid_dl)``; only the training loader shuffles.
    """
    train_idx, valid_idx = splits[fold]

    def _loader(indices, shuffle):
        # Slice features and labels with the same indices so they stay aligned.
        dataset = TabDataset(cat = train_scaled[indices], target = y_train[indices])
        return DataLoader(dataset, batch_size = cfg.bs, shuffle = shuffle)

    return _loader(train_idx, True), _loader(valid_idx, False)

In [None]:
def train_fold(fold, epochs = 20):
    """Train a fresh TabTransformer on one fold.

    After every epoch the model is evaluated on the fold's validation data
    and the resulting score is handed to an ``EarlyStopping`` instance, which
    saves the weights to ``cfg.checkpoint(fold)`` and may end training early.

    Args:
        fold: index of the cross-validation fold.
        epochs: maximum number of epochs to run.
    """
    train_dl, valid_dl = create_dataloaders(fold)
    stopper = EarlyStopping(patience = 7, mode = "max", verbose = False)

    model = TabTransformer(**transformer_cfg).to(device)
    trainer = Trainer(
        model,
        device,
        loss_fn = nn.BCEWithLogitsLoss(),
        opt = torch.optim.AdamW(model.parameters(), lr = cfg.lr),
        scheduler = None,
    )
    checkpoint_path = cfg.checkpoint(fold)

    for epoch in range(epochs):
        trainer.fit_one_epoch(train_dl)
        # eval_one_epoch returns the validation AUROC, hence mode="max" above.
        valid_auroc = trainer.eval_one_epoch(valid_dl, fold = fold, epoch = epoch)

        stopper(valid_auroc, trainer.model, model_path = checkpoint_path)
        if stopper.early_stop:
            break

In [None]:
# Train one model per fold, releasing cached GPU memory and garbage
# between folds.
for fold in range(cfg.n_splits):
    train_fold(fold, epochs = cfg.epochs)
    torch.cuda.empty_cache()
    gc.collect()

## Prediction

In [None]:
# Test-time inference: average the raw model outputs of the per-fold
# checkpoints into `y_pred` (one row per test sample).
y_pred = torch.zeros(len(test_scaled), 1).to(device)
test_ds = TabDataset(cat = test_scaled)
test_dl = DataLoader(test_ds, batch_size = cfg.bs, shuffle = False)

with torch.no_grad():
    for fold in range(cfg.n_splits):
        model = TabTransformer(**transformer_cfg).to(device)
        checkpoint_path = cfg.checkpoint(fold)
        # map_location lets weights saved on a GPU load on a CPU-only
        # session (and vice versa) instead of failing in torch.load.
        model.load_state_dict(torch.load(checkpoint_path, map_location = device))
        model.eval()

        # shuffle=False above keeps the batch order aligned with the test rows.
        preds = [model(d['cat'].to(device), d["cont"].to(device)) for d in test_dl]

        # Each fold contributes an equal share of the final prediction.
        y_pred += torch.vstack(preds) / cfg.n_splits

In [None]:
sub = pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv')
# `y_pred` has shape (n, 1); flatten it and convert to a NumPy array so the
# column assignment does not depend on pandas coercing a 2-D torch tensor.
sub.iloc[:, 1] = y_pred.squeeze(-1).cpu().numpy()
sub = sub.set_index('id')
sub.to_csv('submission.csv')