# Denoising Autoencoder

This is a PyTorch implementation of the same architecture I used for the 1st place submission to the Tabular Playground Series (TPS) June 2022 competition. To keep the notebook fast, I am using a small model here. For the actual submission, I used a larger model and averaged 3 model runs (in PyTorch and in TensorFlow).

In [None]:
import math
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import torch
import torch.nn.functional as F
import torch.utils.data
from torch import nn

from tqdm import tqdm

In [None]:
# Load the competition table; later cells select feature columns from `data`
data = pd.read_csv(
    '../input/tabular-playground-series-jun-2022/data.csv'
)

In [None]:
# Group the column names of `data` by their three-character feature prefix
prefixes = ('F_1', 'F_2', 'F_3', 'F_4')

f1_col, f2_col, f3_col, f4_col = (
    [col for col in data if col[:3] == prefix]
    for prefix in prefixes
)

# Flat list of every grouped column, in prefix order
features = [*f1_col, *f2_col, *f3_col, *f4_col]

In [None]:
# Define masks


# Training mask: one forced drop per row plus independent random drops
def random_mask(shape, binomial_p=0.05):
    """Build a random 0/1 mask in which 0 marks an entry to hide.

    Every row gets exactly one column zeroed at a uniformly random position,
    and each entry is additionally zeroed independently with probability
    ``binomial_p``.

    Args:
        shape: ``(rows, columns)`` of the mask to create.
        binomial_p: Per-entry probability of an additional zero.

    Returns:
        Float array of the requested shape containing only 0.0 and 1.0.
    """
    n_rows, n_cols = shape

    # Guarantee at least one hidden entry in every row
    forced_cols = np.random.randint(0, n_cols, n_rows)
    forced = np.ones((n_rows, n_cols))
    forced[np.arange(n_rows), forced_cols] = 0

    # A draw of 1 (probability 1 - binomial_p) keeps the entry visible
    extra = np.random.binomial(1, 1 - binomial_p, (n_rows, n_cols))

    return forced * extra


# Mask for validation - fixed n missing per row
def mask_n_rows(shape, n_missing):
    """Build a 0/1 mask with exactly ``n_missing`` zeros in every row.

    Args:
        shape: ``(rows, columns)`` of the mask to create.
        n_missing: Number of entries to zero in each row.

    Returns:
        Float array of the requested shape; 0 marks a hidden entry, 1 a
        visible one.

    Raises:
        ValueError: If ``n_missing`` is negative or larger than the number
            of columns.
    """
    n, k = shape
    if not 0 <= n_missing <= k:
        raise ValueError(
            f"n_missing must be between 0 and {k}, got {n_missing}"
        )

    # Argsorting uniform noise yields an independent random permutation of
    # the column indices for each row; its first n_missing entries are the
    # columns to hide.
    col_idx = np.random.rand(n, k).argsort(1)[:, :n_missing].ravel()
    row_idx = np.arange(n).repeat(n_missing)

    mask = np.ones((n, k))
    mask[row_idx, col_idx] = 0
    return mask

In [None]:
# Defining the train and validation set

# True for rows that have at least one missing F_4 value
is_missing_bool = data[f4_col].isna().any(axis=1)

# Define subsets of the data with row-wise missing values
X_complete = data.loc[~is_missing_bool, f4_col].values
X_missing = data.loc[is_missing_bool, f4_col].values

# Split data that has no missing to use for eval set
X_train_complete, X_valid = train_test_split(X_complete, random_state=6) # Same as previous

# Build train set from complete and missing data
X_train = np.concatenate([X_train_complete, X_missing], axis=0)

# Mask of train values that are missing in the source data (1 = missing),
# row-aligned with X_train: complete rows first, then rows with gaps
srce_nan_train = np.concatenate([
    np.zeros(X_train_complete.shape),
    np.isnan(X_missing).astype(np.uint8)
])

# Feature scaling
scaler = StandardScaler()

scaler.fit(data[f4_col].values)

# Fill the NaNs left after scaling with 0.0. The fill value must be passed
# as `nan=`; the second positional argument of nan_to_num is `copy`.
X_train = np.nan_to_num(scaler.transform(X_train), nan=0.0)
X_valid = scaler.transform(X_valid)

In [None]:
# Build Model

class MLP(nn.Module):
    """Dense block: linear projection, Mish activation, then layer norm.

    Args:
        input_size: Width of the incoming features.
        output_size: Width of the produced features.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.dense = nn.Linear(in_features=input_size, out_features=output_size)
        self.act = nn.Mish()
        self.layernorm = nn.LayerNorm(normalized_shape=output_size, eps=1e-6)

    def forward(self, x):
        """Apply dense -> activation -> layer normalization to ``x``."""
        activated = self.act(self.dense(x))
        return self.layernorm(activated)
    
# Masked autoencoder model
class MaskedAutoencoder(nn.Module):
    """MLP autoencoder that predicts the hidden entries of each input row.

    Each scalar input and its hidden-indicator (``1 - mask``) are projected
    to ``emb_dim`` features, summed, flattened and layer-normalized. The
    result goes through a stack of ``MLP`` blocks in which every block after
    the first receives the concatenation of the two most recent activations.
    A final linear layer maps the last two activations back to ``n_columns``
    values.

    Args:
        n_columns: Number of values per input row.
        emb_dim: Embedding width per column.
        units: Output width of each MLP block, in order; must be non-empty.

    Raises:
        ValueError: If ``units`` is empty.
    """

    def __init__(self, n_columns, emb_dim=16,
                 units=(512, 512, 512, 512, 512, 128)):
        super().__init__()
        units = list(units)
        if not units:
            raise ValueError("units must contain at least one layer width")
        self.n_columns = n_columns

        # Embedding
        self.inp_proj = nn.Linear(1, emb_dim)
        self.mask_proj = nn.Linear(1, emb_dim)
        self.emb_norm = nn.LayerNorm(n_columns * emb_dim, eps=1e-6)

        # MLP with skip connection.
        # `widths` mirrors the activation list built in forward(): the
        # embedding first, then one entry per MLP block. Deriving every
        # input size from it keeps __init__ and forward() in agreement for
        # any number of blocks, including a single one.
        widths = [n_columns * emb_dim]
        self.mlp_layers = nn.ModuleList([])
        for output_size in units:
            input_size = widths[-1] + (widths[-2] if len(widths) > 1 else 0)
            self.mlp_layers.append(
                MLP(input_size=input_size, output_size=output_size)
            )
            widths.append(output_size)

        self.final_dense = nn.Linear(widths[-1] + widths[-2], self.n_columns)

    def forward(self, inputs: torch.Tensor, mask: torch.Tensor):
        """Reconstruct the hidden entries of ``inputs``.

        Args:
            inputs: Tensor of shape ``(batch, n_columns)``.
            mask: Tensor of the same shape; entries equal to 1 are passed
                through unchanged, entries equal to 0 are replaced by the
                model's prediction.

        Returns:
            Tensor of shape ``(batch, n_columns)``.
        """
        # Embeddings
        input_embedding = self.inp_proj(torch.unsqueeze(inputs, 2))
        mask_embedding = self.mask_proj(torch.unsqueeze(1 - mask, 2))
        embedding = torch.flatten(input_embedding + mask_embedding, 1)
        x = [self.emb_norm(embedding)]

        # MLP: the first block sees the embedding only, later blocks see the
        # two most recent activations concatenated
        for i, layer in enumerate(self.mlp_layers):
            z = x[0] if i == 0 else torch.cat((x[-1], x[-2]), 1)
            x.append(layer(z))

        x = torch.cat((x[-1], x[-2]), 1)
        x = self.final_dense(x)

        # Output modification - predict only masked values, otherwise use inputs
        outputs = torch.mul(inputs, mask) + torch.mul(1 - mask, x)

        return outputs

In [None]:
# Helper validation method
def validate(model, valid_mask, batch_size=4096):
    """Return the RMSE of ``model`` on the hidden entries of ``X_valid``.

    Reads the module-level ``X_valid``, ``scaler`` and ``device``. The model
    is run in eval mode without gradients and its previous train/eval mode
    is restored before returning.

    Args:
        model: Callable as ``model(x_masked, mask)`` returning a tensor with
            the same shape as its inputs.
        valid_mask: Array with the same shape as ``X_valid``; 0 marks an
            entry that is hidden from the model and scored, 1 a visible one.
        batch_size: Number of rows evaluated per forward pass.

    Returns:
        RMSE over the hidden entries, computed after undoing the feature
        scaling.

    Raises:
        ValueError: If ``valid_mask`` and ``X_valid`` differ in shape.
    """
    if valid_mask.shape != X_valid.shape:
        raise ValueError(
            f"valid_mask shape {valid_mask.shape} does not match "
            f"X_valid shape {X_valid.shape}"
        )

    # Remember the current mode so the caller's training loop is unaffected
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            ps = []
            # Stepping by batch_size never produces an empty trailing batch
            for start in range(0, X_valid.shape[0], batch_size):
                stop = start + batch_size
                x = torch.tensor(X_valid[start:stop].astype(np.float32)).to(device)
                mask = torch.tensor(valid_mask[start:stop].astype(np.float32)).to(device)
                x_masked = x * mask

                p = model(x_masked, mask).cpu().numpy()
                ps.append(p)
    finally:
        model.train(was_training)

    p = np.vstack(ps)
    mask_bool = (1 - valid_mask).astype(bool)
    rmse = np.sqrt(mean_squared_error(
        scaler.inverse_transform(p)[mask_bool],
        scaler.inverse_transform(X_valid)[mask_bool]
    ))
    return rmse

In [None]:
# Loss function to mask NaNs in the original data
class MaskedMSELoss(nn.Module):
    """Mean squared error that gives zero weight to flagged entries.

    ``mask`` is 1 for an entry to exclude and 0 for an entry to score. The
    mean is taken over all entries, so excluded entries still count in the
    denominator.
    """

    def __init__(self):
        super().__init__()
        self.loss = nn.MSELoss(reduction='none')

    def forward(self, inputs, target, mask):
        """Return the scalar loss between ``inputs`` and ``target``."""
        per_entry = self.loss(inputs, target)
        weighted = per_entry * (1 - mask)
        return torch.mean(weighted)

In [None]:
# Defining model parameters and learning rate schedule

EPOCHS = 300
LR_START = 0.001
LR_END = 0.00005
BATCH_SIZE = 4096


# This cosine decay function is borrowed from AmbrosM in last month's TPS
def cosine_decay(epoch):
    """Return the learning rate for ``epoch`` on a half-cosine schedule.

    The rate starts at ``LR_START`` for epoch 0 and reaches ``LR_END`` at
    epoch ``EPOCHS - 1``. When ``EPOCHS`` is 1 or less, ``LR_START`` is
    returned.
    """
    if EPOCHS <= 1:
        # A single epoch has no room to decay
        weight = 1
    else:
        weight = (1 + math.cos(epoch / (EPOCHS - 1) * math.pi)) / 2
    # Blend the two rates; weight goes from 1 down to 0 over the run
    return weight * LR_START + (1 - weight) * LR_END

In [None]:
# Build model

def init_weights(m):
    """Initialize an ``nn.Linear`` module in place; ignore other modules.

    Weights are drawn with Xavier normal initialization and the bias, when
    the layer has one, is filled with 0.01. Intended to be passed to
    ``nn.Module.apply``.

    Args:
        m: Any module; only instances of ``nn.Linear`` are modified.
    """
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_normal_(m.weight)
        # Layers created with bias=False have no bias tensor to fill
        if m.bias is not None:
            m.bias.data.fill_(0.01)

# Build model
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Final model uses units = [2048, 2048, 2048, 1024, 512, 256, 128], but I use a smaller model for this notebook
# The input width is taken from the F_4 column list that the training data is built from
model = MaskedAutoencoder(len(f4_col), units=[512, 512, 512, 512, 512, 256, 128]).to(device)
model.apply(init_weights)
# Base lr is 1 because LambdaLR multiplies it by cosine_decay(epoch), which
# already returns the absolute learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=1)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=cosine_decay)
loss_fn = MaskedMSELoss()

In [None]:
# Training loop

np.random.seed(6)

n = X_train.shape[0]
batch_size = BATCH_SIZE
# Ceiling division: no empty trailing batch when n is a multiple of batch_size
n_batches = math.ceil(n / batch_size)
index = np.arange(n)

# Run validation every `valid_per` epochs
valid_per = 5

# Validation masks: mask i hides exactly i+1 values in every row
validation_masks = [mask_n_rows(X_valid.shape, i+1) for i in range(5)]

# Share of incomplete rows that have exactly 1, 2, ... missing values.
# Looked up by missing-count rather than by value_counts() frequency order,
# so the weights always line up with validation_masks.
n_missing_per_row = data[f4_col].isna().sum(axis=1)
missing_counts = n_missing_per_row[n_missing_per_row > 0].value_counts()
validation_prob = [
    missing_counts.get(n_hidden, 0) / missing_counts.sum()
    for n_hidden in range(1, len(validation_masks) + 1)
]

# Per-mask and combined validation scores, indexed by epoch (0 = not evaluated)
c_scores = [np.zeros(EPOCHS) for _ in validation_masks]
f_scores = np.zeros(EPOCHS)

# Training loop
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1} LR {optimizer.param_groups[0]['lr']}")

    # Make sure the model is in training mode at the start of every epoch
    # (validation switches it to eval mode)
    model.train()

    np.random.shuffle(index)
    losses = 0.0
    for i in tqdm(range(n_batches)):
        batch_idx = index[i*batch_size:(i+1)*batch_size]
        # Create batch train data
        srce_mask = torch.tensor(srce_nan_train[batch_idx].astype(np.float32)).to(device)
        x = torch.tensor(X_train[batch_idx].astype(np.float32)).to(device)
        mask_init = torch.tensor(random_mask(x.shape, binomial_p=0.05).astype(np.float32)).to(device)
        # Entries missing in the source data are always hidden from the model
        mask = mask_init - srce_mask * mask_init
        x_masked = x * mask

        # Forward and backward pass
        optimizer.zero_grad()
        p = model(x_masked, mask)
        loss = loss_fn(p, x, srce_mask)
        loss.backward()
        optimizer.step()

        # Accumulate a plain float so no autograd history is kept alive
        losses += loss.item()
    scheduler.step()
    print(f'Train loss {losses / max(n_batches, 1)}')

    # Validation step
    if (epoch + 1) % valid_per == 0:
        scores = [validate(model, v_mask) for v_mask in validation_masks]
        for per_mask_scores, score in zip(c_scores, scores):
            per_mask_scores[epoch] = score

        # Combine the per-mask RMSEs, weighting each squared score by the
        # share of rows with that many missing values
        final_score = math.sqrt(sum(
            [score**2 * prob for score, prob in zip(scores, validation_prob)]
        ))
        f_scores[epoch] = final_score

        for n_hidden, score in enumerate(scores, start=1):
            print(f'RMSE ({n_hidden} rows) {score}')
        print(f'RMSE (TDGP) {final_score}')