In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import torch
import numpy as np
from datetime import datetime
#from util import AverageMeter
#from model import SwapNoiseMasker, TransformerAutoEncoder
#from data import get_data, SingleDataset
from torch.utils.data import DataLoader
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import pandas as pd
import gc
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import Dataset

If you find this notebook useful, please visit the original discussion post (the #1 solution in the Feb 2021 competition, which is where I took this code from) and upvote it.
    
https://www.kaggle.com/c/tabular-playground-series-feb-2021/discussion/222745

And here is the winning solution from the Jan 2021 competition:

https://www.kaggle.com/c/tabular-playground-series-jan-2021/discussion/216037

In [None]:
class CFG:
    """Training settings read by the pre-training loop further down."""

    # Rows per mini-batch handed to the DataLoader.
    batch_size = 384
    # Initial learning rate given to the Adam optimizer.
    init_lr = 3e-4
    # Multiplicative decay (gamma) applied by ExponentialLR on each scheduler step.
    lr_decay = 0.998
    # Number of passes over the data.
    max_epochs = 5
    # A checkpoint is written whenever `epoch % save_freq == 0`.
    save_freq = 50

In [None]:
# Directory holding train.csv / test.csv.
PATH = '/kaggle/input/tabular-playground-series-mar-2021/'

# Feature names are generated rather than typed out: cat0..cat18 and cont0..cont10.
fts_categorical = [f'cat{idx}' for idx in range(19)]
fts_continuous = [f'cont{idx}' for idx in range(11)]

# Number of distinct values per categorical column, listed in the same order as
# fts_categorical. The counts should be taken over train PLUS test combined.
unique_counts = [
    2, 15, 19, 13, 20, 84, 16, 51, 61, 19, 307,  # cat0 .. cat10
    2, 2, 2, 2,                                  # cat11 .. cat14
    4, 4, 4, 4,                                  # cat15 .. cat18
]

for label, names in (('Categorical Features', fts_categorical),
                     ('Continuous Features', fts_continuous)):
    print(label, names)

print('Categorical Feature Count', len(fts_categorical))
print('Continuous Feature Count', len(fts_continuous))

In [None]:
def get_data():
    """Load train/test CSVs from PATH and build one combined feature matrix.

    Columns are selected by position: columns 1..19 of both files are treated
    as categorical, and the columns from position 20 onward as continuous
    (for the train file the last column is excluded from the features).

    The continuous block is standardised using the mean/std of the stacked
    train+test rows; the categorical block is one-hot encoded with an encoder
    fitted on the stacked train+test rows.

    Returns:
        X: 2-D array, one-hot columns followed by standardised numeric columns;
           train rows first, then test rows.
        y: the train file's 'target' column as an (n_train, 1) array.
        n_cat_columns: number of one-hot columns in X.
        n_num_columns: number of numeric columns in X.
    """
    train_data = pd.read_csv(PATH+'train.csv')
    test_data = pd.read_csv(PATH+'test.csv')

    # Combine train and test data vertically.
    X_nums = np.vstack([
        train_data.iloc[:, 20:-1].to_numpy(),
        test_data.iloc[:, 20:].to_numpy()
    ])
    X_nums = (X_nums - X_nums.mean(0)) / X_nums.std(0)  # normalize per column

    # Stack the categorical data.
    X_cat = np.vstack([
        train_data.iloc[:, 1:20].to_numpy(),
        test_data.iloc[:, 1:20].to_numpy()
    ])

    # Encode the categoricals as a dense one-hot matrix.
    # scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
    # removed the old name, so try the new spelling first and fall back for
    # older installations (an unknown keyword raises TypeError).
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    X_cat = encoder.fit_transform(X_cat)

    # Join the categorical and continuous data horizontally.
    X = np.hstack([X_cat, X_nums])
    y = train_data['target'].to_numpy().reshape(-1, 1)
    # The two widths tell the caller where the one-hot block ends and the numeric block begins.
    return X, y, X_cat.shape[1], X_nums.shape[1]

class SingleDataset(Dataset):
    """Dataset that serves one feature row per index (no targets).

    Args:
        x: row-indexable matrix exposing `.astype` and `.shape`; it is cast to
            float32 once at construction.
        is_sparse: when true, each fetched row is densified with
            `.toarray().squeeze()` before being returned.
    """

    def __init__(self, x, is_sparse=False):
        self.is_sparse = is_sparse
        self.x = x.astype('float32')

    def __len__(self):
        n_rows = self.x.shape[0]
        return n_rows

    def __getitem__(self, index):
        row = self.x[index]
        if not self.is_sparse:
            return row
        # Sparse rows are converted to a dense array on access.
        return row.toarray().squeeze()

In [None]:
# Short aliases for the two functional losses used by TransformerAutoEncoder.loss:
# mean-squared error for the numeric columns, and binary cross-entropy on
# logits for the one-hot columns and the predicted corruption mask.
mse = torch.nn.functional.mse_loss
bce_logits = torch.nn.functional.binary_cross_entropy_with_logits

# Constructor arguments of torch.nn.MultiheadAttention, quoted from the torch docs:

#embed_dim – total dimension of the model.
#num_heads – parallel attention heads.
#dropout – a Dropout layer on attn_output_weights. Default: 0.0.
#bias – add bias as module parameter. Default: True.
#add_bias_kv – add bias to the key and value sequences at dim=0.
#add_zero_attn – add a new batch of zeros to the key and value sequences at dim=1.
#kdim – total number of features in key. Default: None.
#vdim – total number of features in value. Default: None.

class TransformerEncoder(torch.nn.Module):
    """One post-norm transformer block: self-attention followed by a 2-layer MLP.

    Args:
        embed_dim: size of the last dimension of the input and output.
        num_heads: number of attention heads.
        dropout: dropout rate passed to MultiheadAttention.
        feedforward_dim: width of the hidden layer of the feed-forward part.
    """

    def __init__(self, embed_dim, num_heads, dropout, feedforward_dim):
        super().__init__()
        nn = torch.nn
        # Sub-modules are created in a fixed order under fixed names, which
        # determines both the state_dict keys and the initialisation RNG draws.
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        self.linear_1 = nn.Linear(embed_dim, feedforward_dim)
        self.linear_2 = nn.Linear(feedforward_dim, embed_dim)
        self.layernorm_1 = nn.LayerNorm(embed_dim)
        self.layernorm_2 = nn.LayerNorm(embed_dim)

    def forward(self, x_in):
        """Return a tensor of the same shape as `x_in`.

        `x_in` is used as query, key and value of the attention layer; each
        sub-layer output is added to its input and then layer-normalised.
        """
        attended, _attn_weights = self.attn(x_in, x_in, x_in)
        after_attn = self.layernorm_1(x_in + attended)

        hidden = torch.relu(self.linear_1(after_attn))
        projected = self.linear_2(hidden)
        return self.layernorm_2(after_attn + projected)


class TransformerAutoEncoder(torch.nn.Module):
    """Denoising auto-encoder built from three stacked TransformerEncoder blocks.

    A corrupted input row is projected to `hidden_size`, cut into
    `num_subspaces` chunks of `embed_dim` values, passed through the three
    encoder blocks and flattened again. Two linear heads then predict
    (a) which input positions were corrupted and (b) the uncorrupted row.

    Args:
        num_inputs: width of an input row (one-hot columns + numeric columns).
        n_cats: number of leading columns treated as one-hot/categorical.
        n_nums: number of trailing columns treated as numeric.
        hidden_size: width after the first projection; must equal
            `embed_dim * num_subspaces`.
        num_subspaces: number of chunks the hidden vector is cut into.
        embed_dim: width of each chunk.
        num_heads: attention heads per encoder block.
        dropout: dropout rate for the attention layers.
        feedforward_dim: hidden width of each encoder block's MLP.
        emphasis: loss weight for corrupted positions; uncorrupted positions
            get `1 - emphasis`.
        task_weights: pair (categorical, numeric) of relative loss weights,
            normalised to sum to 1.
        mask_loss_weight: multiplier of the mask-prediction loss.
    """

    def __init__(
            self,
            num_inputs,
            n_cats,
            n_nums,
            hidden_size=1024,
            num_subspaces=8,
            embed_dim=128,
            num_heads=8,
            dropout=0,
            feedforward_dim=512,
            emphasis=.75,
            task_weights=[len(fts_categorical), len(fts_continuous)],
            mask_loss_weight=2,
        ):
        super().__init__()
        assert hidden_size == embed_dim * num_subspaces

        # Plain (non-parameter) settings.
        self.n_cats, self.n_nums = n_cats, n_nums
        self.num_subspaces = num_subspaces
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.emphasis = emphasis
        self.task_weights = np.array(task_weights) / sum(task_weights)
        self.mask_loss_weight = mask_loss_weight

        # Learnable layers; creation order and attribute names are fixed.
        encoder_args = (embed_dim, num_heads, dropout, feedforward_dim)
        self.excite = torch.nn.Linear(in_features=num_inputs, out_features=hidden_size)
        self.encoder_1 = TransformerEncoder(*encoder_args)
        self.encoder_2 = TransformerEncoder(*encoder_args)
        self.encoder_3 = TransformerEncoder(*encoder_args)

        self.mask_predictor = torch.nn.Linear(in_features=hidden_size, out_features=num_inputs)
        # The reconstructor sees the hidden vector concatenated with the mask logits.
        self.reconstructor = torch.nn.Linear(in_features=hidden_size + num_inputs, out_features=num_inputs)

    def divide(self, x):
        """Reshape (batch, hidden) to (num_subspaces, batch, embed_dim)."""
        chunked = x.reshape((x.shape[0], self.num_subspaces, self.embed_dim))
        return chunked.permute((1, 0, 2))

    def combine(self, x):
        """Inverse of `divide`: (subspaces, batch, embed) back to (batch, -1)."""
        batch_major = x.permute((1, 0, 2))
        return batch_major.reshape((x.shape[1], -1))

    def forward(self, x):
        """Return ((x1, x2, x3), (reconstruction, predicted_mask)).

        x1..x3 are the outputs of the three encoder blocks (still in the
        divided layout); the second tuple holds the two heads' outputs.
        """
        hidden = torch.relu(self.excite(x))

        x1 = self.encoder_1(self.divide(hidden))
        x2 = self.encoder_2(x1)
        x3 = self.encoder_3(x2)
        flat = self.combine(x3)

        predicted_mask = self.mask_predictor(flat)
        head_input = torch.cat([flat, predicted_mask], dim=1)
        reconstruction = self.reconstructor(head_input)
        return (x1, x2, x3), (reconstruction, predicted_mask)

    def split(self, t):
        """Split columns of `t` into the (categorical, numeric) parts."""
        sizes = [self.n_cats, self.n_nums]
        return torch.split(t, sizes, dim=1)

    def feature(self, x):
        """Concatenate the flattened outputs of all three encoder blocks."""
        layer_outputs, _heads = self.forward(x)
        flattened = [self.combine(layer_out) for layer_out in layer_outputs]
        return torch.cat(flattened, dim=1)

    def loss(self, x, y, mask, reduction='mean'):
        """Weighted reconstruction loss plus mask-prediction loss.

        Args:
            x: (corrupted) input rows fed to the network.
            y: target rows the reconstruction is compared against.
            mask: 1.0 where `x` was corrupted, 0.0 elsewhere.
            reduction: 'mean' returns a single summed scalar; any other value
                returns `[reconstruction_loss, mask_loss]`, where only 'none'
                keeps the reconstruction loss element-wise.
        """
        _, (reconstruction, predicted_mask) = self.forward(x)

        rec_cats, rec_nums = self.split(reconstruction)
        tgt_cats, tgt_nums = self.split(y)

        # Corrupted positions are weighted by `emphasis`, clean ones by its complement.
        position_weights = mask * self.emphasis + (1 - mask) * (1 - self.emphasis)
        w_cats, w_nums = self.split(position_weights)

        cat_loss = self.task_weights[0] * (w_cats * bce_logits(rec_cats, tgt_cats, reduction='none'))
        num_loss = self.task_weights[1] * (w_nums * mse(rec_nums, tgt_nums, reduction='none'))

        if reduction == 'none':
            reconstruction_loss = torch.cat([cat_loss, num_loss], dim=1)
        else:
            reconstruction_loss = cat_loss.mean() + num_loss.mean()

        mask_loss = self.mask_loss_weight * bce_logits(predicted_mask, mask, reduction=reduction)

        if reduction == 'mean':
            return reconstruction_loss + mask_loss
        return [reconstruction_loss, mask_loss]


class SwapNoiseMasker(object):
    """Corrupt a batch by swapping entries with those of a shuffled copy.

    Args:
        probas: per-column swap probabilities (one value per column of the
            batches later passed to `apply`).
    """

    def __init__(self, probas):
        proba_array = np.array(probas)
        self.probas = torch.from_numpy(proba_array)

    def apply(self, X):
        """Return `(corrupted_X, mask)` for the batch `X`.

        Each entry is selected for swapping with its column's probability and
        then takes the value found at the same column of a randomly permuted
        row order of `X`. `mask` is 1.0 only where the value actually changed.
        """
        device = X.device
        # Broadcast the per-column probabilities to the full batch shape.
        swap_probs = self.probas.to(device) * torch.ones((X.shape)).to(device)
        should_swap = torch.bernoulli(swap_probs)

        donor_rows = X[torch.randperm(X.shape[0])]
        corrupted_X = torch.where(should_swap == 1, donor_rows, X)

        mask = (corrupted_X != X).float()
        return corrupted_X, mask


def test_tf_encoder():
    """Smoke test: a TransformerEncoder block preserves the input shape."""
    encoder = TransformerEncoder(4, 2, .1, 16)
    # 32 rows of 8 values -> 2 chunks of width 4, chunk axis first.
    batch = torch.rand((32, 8)).reshape((32, 2, 4)).permute((1, 0, 2))
    expected_shape = torch.Size([2, 32, 4])
    assert encoder(batch).shape == expected_shape


def test_dae_model():
    """Smoke test: feature width is 3 x hidden_size and the loss call runs."""
    dae = TransformerAutoEncoder(5, 2, 3, 16, 4, 4, 2, .1, 4, .75)
    # Two binary columns followed by three uniform-random columns.
    binary_part = torch.randint(0, 2, (5, 2))
    real_part = torch.rand((5, 3))
    x = torch.cat([binary_part, real_part], dim=1)

    features = dae.feature(x)
    assert features.shape == torch.Size([5, 16 * 3])

    # Only checks that the loss computation executes; the value is not inspected.
    loss = dae.loss(x, x, (x > .2).float())


def test_swap_noise():
    """Print the requested swap probabilities next to the observed change rates."""
    probas = [.2, .5, .8]
    masker = SwapNoiseMasker(probas)

    change_rates = []
    for _ in range(1000):
        clean = torch.rand((32, 3))
        noisy, _mask = masker.apply(clean)
        changed = (clean != noisy).float()
        # Per-column fraction of changed entries, kept as a (1, 3) row.
        change_rates.append(changed.mean(0).unsqueeze(0))

    observed = torch.cat(change_rates, 0).mean(0)
    print('specified : ', probas, ' - actual : ', observed)


# Run the three smoke tests defined above whenever this code executes as the
# main module; they exercise the encoder block, the auto-encoder and the noise masker.
if __name__ == '__main__':
    test_tf_encoder()
    test_dae_model()
    test_swap_noise()

In [None]:
class AverageMeter(object):
    """Keeps the latest value and a running weighted average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, the average, the weighted sum and the count."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [None]:
# Hyper-params
model_params = dict(
    hidden_size=1024,
    num_subspaces=8,
    embed_dim=128,
    num_heads=8,
    dropout=0,
    feedforward_dim=512,
    emphasis=.75,
    mask_loss_weight=2
)

# One swap probability is needed per INPUT column. A categorical feature expands
# into `unique_counts[i]` one-hot columns, so its probability is repeated that
# many times; every continuous feature occupies exactly one column.
repeats = list(unique_counts) + [1] * len(fts_continuous)

# Probabilities are just set to 0.5 for now.
probas = [0.5] * (len(fts_categorical) + len(fts_continuous))

print(len(repeats), len(probas))
swap_probas = [p for p, r in zip(probas, repeats) for _ in range(r)]

# Get data
X, Y, n_cats, n_nums = get_data()

# Fail early with a readable message if the hard-coded unique_counts do not
# match what the one-hot encoder actually produced.
assert len(swap_probas) == X.shape[1], (
    f'swap_probas has {len(swap_probas)} entries but X has {X.shape[1]} columns; '
    'check unique_counts against the encoded data'
)

# Use the GPU when one is present, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_dl = DataLoader(
    dataset=SingleDataset(X),
    batch_size=CFG.batch_size,
    shuffle=True,
    pin_memory=(device.type == 'cuda'),  # pinning only helps host-to-GPU copies
    drop_last=True
)

# Setup model
model = TransformerAutoEncoder(
    num_inputs=X.shape[1],
    n_cats=n_cats,
    n_nums=n_nums,
    **model_params
).to(device)
model_checkpoint = 'model_checkpoint.pth'

print(model)

noise_maker = SwapNoiseMasker(swap_probas)
optimizer = torch.optim.Adam(model.parameters(), lr=CFG.init_lr)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=CFG.lr_decay)


def _save_checkpoint(path):
    """Write optimizer, scheduler and model state dicts to `path`."""
    torch.save({
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "model": model.state_dict()
        }, path
    )


# Train model
for epoch in range(CFG.max_epochs):
    t0 = datetime.now()
    model.train()
    meter = AverageMeter()
    for x in train_dl:
        x = x.to(device)
        x_corrupted, mask = noise_maker.apply(x)
        optimizer.zero_grad()
        # The network sees the corrupted rows and is scored against the clean ones.
        loss = model.loss(x_corrupted, x, mask)
        loss.backward()
        optimizer.step()

        meter.update(loss.item())

    # total_seconds() keeps the fractional part and does not wrap at one day,
    # unlike the `.seconds` attribute.
    delta = (datetime.now() - t0).total_seconds()
    scheduler.step()  # learning rate decays once per epoch

    print('epoch {:5d} - loss {:.6f} - {:4.6f} sec per epoch'.format(epoch, meter.avg, delta))

    if epoch % CFG.save_freq == 0:
        print('Saving to checkpoint')
        model_checkpoint = f'model_checkpoint_{epoch}.pth'
        _save_checkpoint(model_checkpoint)


# Save the final state, then reload it into the model from disk.
model_checkpoint = f'model_checkpoint_final.pth'
_save_checkpoint(model_checkpoint)
model_state = torch.load(model_checkpoint, map_location=device)
model.load_state_dict(model_state['model'])