This notebook is based on `lukaszborecki`'s great work in this competition:  
https://www.kaggle.com/lukaszborecki/pytorch-fork-of-tps-09-nn  

  This notebook plays around with:
   - Adding scheduler/boilerplate
   - Adding a Trainer object for training/evaluation
   - Concatenating the x and x_bin inputs
   - Adding multi-sample dropout

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast
from sklearn.preprocessing import QuantileTransformer, KBinsDiscretizer
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from fastai.losses import LabelSmoothingCrossEntropy, LabelSmoothingCrossEntropyFlat
from fastai.layers import Mish
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR
from tqdm import tqdm
from torchmetrics import AUROC
import gc, sys, random
gc.enable()

In [None]:
# Read the competition train/test CSVs and index both frames by their `id` column.
train_df = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv').set_index('id')
test_df = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv').set_index('id')

In [None]:
def seed_all(seed_value):
    """Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available the CUDA generators are seeded as well and cuDNN
    is switched to deterministic mode with benchmarking disabled.
    """
    # CPU-side generators: Python stdlib, NumPy, then PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return

    # GPU-side generators (current device, then all devices).
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_all(2021)

## Exploration and Feature Engineering

In [None]:
# Show how many training rows fall under each value of the `claim` column.
train_df.claim.value_counts()

In [None]:
# Per-column count of missing values; null values in this generated data
# have been shown to be useful.
train_df.isna().sum()

In [None]:
# Add the per-row number of missing values as an extra `sum_na` column
# on both the training and the test frame.
for _frame in (train_df, test_df):
    _frame['sum_na'] = _frame.isna().sum(axis=1)

## Model Architecture

In [None]:
class TabModel(nn.Module):
    """Two-branch tabular network with multi-sample dropout.

    One branch embeds the binned (ordinal) version of every feature and
    projects the flattened embeddings to `hidden` units; the other projects
    the continuous features directly. Both branches are concatenated, mixed
    by one more linear layer, and fed through `dropout_num` dropout layers
    that share a single output head. The averaged head output is passed
    through a sigmoid.

    Args:
        act_fn: activation module applied after every hidden linear layer.
            The default instance is created once at class-definition time and
            is therefore shared by all models that rely on the default.
        dropout_num: number of dropout samples averaged before the sigmoid;
            must be at least 1.
        n_features: number of input columns (for both `x` and `x_bin`).
        n_bins: number of distinct bin ids accepted by the embedding.
        emb_dim: embedding width per binned feature.
        hidden: width of the hidden layers.
        dropout_p: drop probability of every dropout layer.

    Raises:
        ValueError: if `dropout_num` is smaller than 1.
    """

    def __init__(self, act_fn = nn.SiLU(), dropout_num = 1, n_features = 119,
                 n_bins = 96, emb_dim = 18, hidden = 30, dropout_p = 0.4):
        super().__init__()
        if dropout_num < 1:
            # With no dropout layers forward() would have nothing to average.
            raise ValueError(f"dropout_num must be >= 1, got {dropout_num}")

        self.n_features = n_features
        self.emb_dim = emb_dim

        # Layer creation order is kept as-is so seeded initialisation
        # consumes random numbers in the same sequence.
        self.emb = nn.Embedding(n_bins, emb_dim)
        self.fc = nn.Linear(n_features * emb_dim, hidden)
        self.dropouts = nn.ModuleList([nn.Dropout(dropout_p)
                                       for _ in range(dropout_num)])
        self.fc1 = nn.Linear(n_features, hidden)
        self.fc2 = nn.Linear(hidden * 2, hidden)  # consumes the concatenated branches
        self.out = nn.Linear(hidden, 1)
        self.act_fn = act_fn

        # Xavier-normal re-initialisation of every weight matrix.
        for layer in (self.out, self.emb, self.fc, self.fc1, self.fc2):
            torch.nn.init.xavier_normal_(layer.weight)

    def forward(self, x_bin, x):
        """Return sigmoid outputs of shape (batch, 1).

        Args:
            x_bin: integer bin ids, one per feature.
            x: continuous feature values, one per feature.
        """
        # Binned branch: embed, flatten per row, project.
        x_bin = self.emb(x_bin)
        x_bin = x_bin.reshape(-1, self.n_features * self.emb_dim)
        x_bin = self.act_fn(self.fc(x_bin))

        # Continuous branch, then merge both branches.
        x = self.act_fn(self.fc1(x))
        x = torch.cat([x_bin, x], -1)
        x = self.act_fn(self.fc2(x))

        # Multi-sample dropout: average the shared head over every dropout mask.
        out = sum(self.out(dropout(x)) for dropout in self.dropouts)
        out = out / len(self.dropouts)

        return torch.sigmoid(out)

## Utilities

In [None]:
def preprocess_dataset(x, x_test, target = None, n_quantiles = 96, n_bins = 96):
    """Impute, quantile-transform and bin the train and test features.

    All three transformers are fitted on the training data only and then
    applied to the test data.

    Args:
        x: training DataFrame (may still contain the target column).
        x_test: test DataFrame with the same feature columns.
        target: name of the column to drop from `x` before fitting; nothing
            is dropped when it is falsy.
        n_quantiles: number of quantiles for the `QuantileTransformer`.
        n_bins: number of uniform-width bins for the `KBinsDiscretizer`.

    Returns:
        Tuple `(x, x_bin, x_test, x_test_bin)`: the transformed continuous
        features and their ordinal bin ids, for train and test respectively.
    """
    if target:
        # `drop` already returns a new frame, so the caller's data is left
        # untouched; the keyword form is required by pandas 2.x, which no
        # longer accepts the axis as a positional argument.
        x = x.drop(columns=target)

    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    qt = QuantileTransformer(n_quantiles=n_quantiles, output_distribution='normal')
    bin_cat = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')

    # Fit on train: median imputation -> normal quantile transform -> binning.
    x = imp.fit_transform(x)
    x = qt.fit_transform(x)
    x_bin = bin_cat.fit_transform(x)

    # Re-use the fitted transformers on the test set.
    x_test = imp.transform(x_test)
    x_test = qt.transform(x_test)
    x_test_bin = bin_cat.transform(x_test)

    return x, x_bin, x_test, x_test_bin

In [None]:
class AverageMeter:
    """Keeps the latest value together with a running weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += n * val
        self.avg = self.sum / self.count

In [None]:
class EarlyStopping:
    """Stop training when the monitored score stops improving.

    Every call compares the new epoch score with the best one seen so far.
    An equal or better score saves the model's `state_dict` and resets the
    patience counter; a worse or non-finite (NaN/inf) score increments the
    counter, and `early_stop` becomes True once it reaches `patience`.

    Args:
        patience: number of non-improving epochs tolerated before stopping.
        mode: "min" if lower scores are better, anything else means higher
            is better.
        delta: stored for API compatibility; it is NOT applied to the
            comparison.
        verbose: when truthy, progress messages are printed.
    """

    def __init__(self, patience=7, mode="max", delta=0.001, verbose = None):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        self.verbose = verbose
        # `np.inf` instead of the `np.Inf` alias, which NumPy 2.0 removed.
        self.val_score = np.inf if self.mode == "min" else -np.inf

    def __call__(self, epoch_score, model, model_path):
        """Register `epoch_score` and checkpoint `model` if it improved."""
        raw_score = float(epoch_score)
        # Internally "larger is better"; flip the sign for minimised metrics.
        score = -raw_score if self.mode == "min" else raw_score

        # A NaN/inf score must never become the reference: every later
        # comparison against NaN is False, which would disable stopping.
        improved = bool(np.isfinite(score)) and (
            self.best_score is None or score >= self.best_score
        )

        if improved:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0
            return

        self.counter += 1
        if self.verbose:
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, epoch_score, model, model_path):
        """Save `model.state_dict()` to `model_path` for a finite score."""
        # `np.isfinite` catches computed NaNs, which a list-membership test
        # against `np.nan` does not (NaN never compares equal to itself).
        if np.isfinite(float(epoch_score)):
            if self.verbose:
                print('Validation score improved ({:.4f} --> {:.4f}). Saving model!'.format(self.val_score, epoch_score))

            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

In [None]:
# Run the preprocessing on the training frame (with the `claim` column
# dropped) and on the test frame, then keep the `claim` labels as a
# separate array.
x_train, x_bin, x_test, x_test_bin = preprocess_dataset(train_df, test_df, target = 'claim')
y_train = train_df.claim.values

## Dataset

In [None]:
class TabDataset(Dataset):
    """Dataset serving one row of continuous and binned features at a time.

    Each item is a dict with a float tensor under 'x' and a long tensor
    under 'x_bin'; when a target array was supplied, a float tensor
    under 'target' is included as well.
    """

    def __init__(self, x, x_bin, target = None):
        super().__init__()
        self.x, self.x_bin, self.target = x, x_bin, target

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        sample = {
            'x': torch.tensor(self.x[idx, :], dtype = torch.float),
            'x_bin': torch.tensor(self.x_bin[idx, :], dtype = torch.long),
        }

        # Unlabelled data (e.g. the test set) carries no 'target' entry.
        if self.target is None:
            return sample

        label = self.target[idx].item()
        sample['target'] = torch.tensor(label, dtype = torch.float)
        return sample

## Trainer

In [None]:
class Trainer:
    """Bundles a model with its loss, optimizer and optional LR scheduler.

    Args:
        model: network called as `model(x_bin, x)`.
        device: device every batch is moved to.
        loss_fn: callable taking `(predictions, target)`.
        opt: optimizer stepped once per batch.
        scheduler: optional scheduler, stepped once per batch after `opt`.
    """

    def __init__(self, model, device, loss_fn, opt, scheduler = None):
        self.model = model
        self.device = device
        self.loss_fn = loss_fn
        self.opt = opt
        self.scheduler = scheduler

    def _unpack(self, batch):
        """Move the 'x', 'x_bin' and 'target' tensors of a batch to the device."""
        return (batch['x'].to(self.device),
                batch['x_bin'].to(self.device),
                batch['target'].to(self.device))

    def fit_one_epoch(self, dl):
        """Train for one pass over `dl` and return the sample-weighted mean loss."""
        self.model.train()
        losses = AverageMeter()
        prog_bar = tqdm(enumerate(dl), total = len(dl), file=sys.stdout, leave = False)

        for _, d in prog_bar:
            x, x_bin, target = self._unpack(d)

            out = self.model(x_bin, x)
            loss = self.loss_fn(out.squeeze(-1), target)
            prog_bar.set_description('loss: {:.2f}'.format(loss.item()))
            losses.update(loss.item(), x.size(0))
            loss.backward()
            self.opt.step()

            # The scheduler advances per batch, not per epoch.
            if self.scheduler is not None:
                self.scheduler.step()

            self.opt.zero_grad()

        return losses.avg

    def eval_one_epoch(self, dl, **kwargs):
        """Evaluate on `dl`, print a summary line and return the AUROC on CPU.

        Optional `fold` and `epoch` keyword arguments are only used to label
        the printed line; '?' is shown for any that is missing.
        """
        self.model.eval()
        losses = AverageMeter()
        # Keep the metric on the same device as the predictions it receives.
        # NOTE(review): newer torchmetrics releases may require an explicit
        # `task` argument for AUROC - confirm against the installed version.
        metric = AUROC().to(self.device)
        prog_bar = tqdm(enumerate(dl), total = len(dl), file=sys.stdout, leave = False)

        with torch.no_grad():
            for _, d in prog_bar:
                x, x_bin, target = self._unpack(d)

                preds = self.model(x_bin, x).squeeze(-1)
                loss = self.loss_fn(preds, target)
                # Only accumulate state; the per-batch AUROC is never used.
                metric.update(preds, target.int())
                losses.update(loss.item(), x.size(0))

        auroc = metric.compute()
        fold = kwargs.get('fold', '?')
        epoch = kwargs.get('epoch', '?')
        print(f"F{fold} E{epoch}  Valid Loss: {losses.avg:.4f}  AUROC Score: {auroc:.4f}")
        return auroc.cpu()

## Training

In [None]:
class cfg:
    """Run configuration shared by the training and prediction cells."""

    bs = 1024       # batch size used by every DataLoader
    n_splits = 8    # number of cross-validation folds
    seed = 2021     # random_state of the fold splitter
    epochs = 4      # epochs trained per fold
    lr = 1e-4       # learning rate passed to the optimizer

    @staticmethod
    def checkpoint(fold):
        """Return the checkpoint file name for `fold`.

        A staticmethod (rather than a lambda attribute) so the call works
        on the class and on an instance alike.
        """
        return f'model_{fold}.pt'
    
# Shuffled, stratified K-fold; the (train_idx, valid_idx) pairs are
# materialised once so each fold can be looked up by position.
kfold = StratifiedKFold(n_splits = cfg.n_splits, shuffle = True, random_state = cfg.seed)
splits = list(kfold.split(x_train, y_train))

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
def create_dataloaders(fold):
    """Build the (train, valid) DataLoaders for the given fold index.

    The fold's index arrays are taken from the module-level `splits`; the
    training loader shuffles, the validation loader does not.
    """
    def _subset(indices):
        # Slice features, binned features and labels with the same indices.
        return TabDataset(x = x_train[indices],
                          x_bin = x_bin[indices],
                          target = y_train[indices])

    train_idx, valid_idx = splits[fold]

    train_dl = DataLoader(_subset(train_idx), batch_size = cfg.bs, shuffle = True)
    valid_dl = DataLoader(_subset(valid_idx), batch_size = cfg.bs, shuffle = False)

    return train_dl, valid_dl

In [None]:
def train_fold(fold, epochs):
    """Train a fresh TabModel on one fold, checkpointing on validation AUROC.

    After each epoch the validation score returned by the trainer is handed
    to an EarlyStopping tracker, which saves the model to the fold's
    checkpoint path and may end training early.
    """
    train_dl, valid_dl = create_dataloaders(fold)
    stopper = EarlyStopping(patience = 7, mode="max", verbose = False)

    net = TabModel(dropout_num = 1).to(device)
    optimizer = torch.optim.AdamW(net.parameters(), lr = cfg.lr)
    # One-cycle schedule spanning every batch of every epoch.
    one_cycle = OneCycleLR(optimizer,
                           max_lr=1e-3,
                           steps_per_epoch=len(train_dl),
                           epochs = epochs)

    trainer = Trainer(net,
                      device,
                      loss_fn=nn.BCELoss(),
                      opt = optimizer,
                      scheduler = one_cycle)

    for epoch in range(epochs):
        trainer.fit_one_epoch(train_dl)
        # Despite the name used elsewhere, this is the AUROC (higher is better).
        valid_score = trainer.eval_one_epoch(valid_dl, fold = fold, epoch = epoch)

        stopper(valid_score, trainer.model, model_path = cfg.checkpoint(fold))

        if stopper.early_stop:
            break

In [None]:
# Train one model per cross-validation fold; each call handles a single fold.
for fold in range(cfg.n_splits):
    train_fold(fold, cfg.epochs)

## Prediction

In [None]:
# Average the test-set outputs of the checkpointed model of every fold.
y_pred = torch.zeros(len(x_test), 1).to(device)
test_ds = TabDataset(x_test, x_test_bin)
test_dl = DataLoader(test_ds, batch_size = cfg.bs, shuffle = False)

with torch.no_grad():
    for fold in range(cfg.n_splits):
        model = TabModel(dropout_num = 1).to(device)
        checkpoint_path = cfg.checkpoint(fold)
        # map_location lets checkpoints written on a GPU load on a CPU-only run.
        model.load_state_dict(torch.load(checkpoint_path, map_location = device))
        model.eval()

        # Batches are consumed inside the comprehension so the module-level
        # `x_bin` (training bins) is not overwritten by a test batch.
        preds = [model(d['x_bin'].to(device), d['x'].to(device)) for d in test_dl]

        # Each fold contributes an equal share to the final prediction.
        y_pred += torch.vstack(preds) / cfg.n_splits

In [None]:
# Fill the second column of the sample submission with the averaged
# predictions and write the result, indexed by `id`, to submission.csv.
sub = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')
# Hand pandas an explicit 1-D NumPy array instead of a 2-D (N, 1) torch tensor.
sub.iloc[:, 1] = y_pred.squeeze(-1).cpu().numpy()
sub = sub.set_index('id')
sub.to_csv('submission.csv')