This is my denoising autoencoder (DAE) experimentation code from last month. I did not have as much time as I would have liked to experiment with it myself, so feel free to take it from here and see if you can get it working.

In [None]:
import os
import torch
import random
import pandas as pd
import numpy as np
from operator import lt, gt
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder


def seed_everything(seed=1127):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and the current
    CUDA device), exports ``PYTHONHASHSEED`` and asks cuDNN to use
    deterministic algorithms.

    Args:
        seed: Integer seed shared by all generators.

    Note:
        Python reads ``PYTHONHASHSEED`` at interpreter start-up, so exporting
        it here does not alter string hashing in the already-running process.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


# Seed all random number generators once, up front, with the default seed (1127).
seed_everything()


class TrainDataset(Dataset):
    """Map-style dataset that yields ``(features, target)`` pairs.

    Both arrays are converted to ``float32`` once, at construction time.
    """

    def __init__(self, x, y):
        """Store float32 copies of the feature array ``x`` and target array ``y``."""
        self.x = x.astype('float32')
        self.y = y.astype('float32')

    def __len__(self):
        """Return the number of rows in the feature array."""
        return self.x.shape[0]

    def __getitem__(self, index):
        """Return the feature row and the target stored at ``index`` as a tuple."""
        features = self.x[index]
        target = self.y[index]
        return features, target


class PredictDataset(Dataset):
    """Map-style dataset that yields feature rows only (no targets).

    The feature array is converted to ``float32`` once, at construction time.
    """

    def __init__(self, x):
        """Store a float32 copy of the feature array ``x``."""
        features = x.astype('float32')
        self.x = features

    def __len__(self):
        """Return the number of rows in the feature array."""
        return self.x.shape[0]

    def __getitem__(self, index):
        """Return the feature row stored at ``index``."""
        row = self.x[index]
        return row


class AverageMeter(object):
    """Keeps the most recent value and a count-weighted running mean.

    Attributes are ``val`` (last value seen), ``sum`` (weighted total),
    ``count`` (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, counted ``n`` times in the mean."""
        contribution = val * n
        self.val = val
        self.sum += contribution
        self.count += n
        self.avg = self.sum / self.count


class EarlyStopping(object):
    """Signals when a monitored metric has stopped improving.

    Call :meth:`step` once per epoch with the latest metric value; it returns
    ``True`` once the metric has failed to improve for ``patience``
    consecutive calls.

    Args:
        mode: ``'min'`` if lower metric values are better; any other value is
            treated as "higher is better".
        min_delta: Minimum change that counts as an improvement. An absolute
            amount, or a percentage of the best value when ``percentage`` is
            true.
        percentage: Interpret ``min_delta`` as a percentage of the best value.
        patience: Number of consecutive non-improving steps before stopping.
        initial_bad: Starting value of the non-improving step counter.
        initial_best: Starting value of the best score. ``NaN`` (the default)
            or ``None`` means "no score yet": the first non-NaN metric then
            becomes the best.
        verbose: When truthy, print a message when stopping is triggered.
    """

    def __init__(self, mode='min', min_delta=0, percentage=False, patience=10, initial_bad=0, initial_best=np.nan, verbose=0):
        self.mode = mode
        self.patience = patience
        # Honour a caller-supplied starting score (previously this argument was
        # accepted but ignored). Without one, start from the worst possible
        # value so that the first real metric always counts as an improvement.
        if initial_best is None or np.isnan(initial_best):
            self.best = float('inf') if mode == 'min' else float('-inf')
        else:
            self.best = initial_best
        self.num_bad_epochs = initial_bad
        self.is_better = self._init_is_better(mode, min_delta, percentage)
        self.verbose = verbose
        self._stop = False

    def step(self, metric):
        """Record one metric value and return whether training should stop."""
        if self.is_better(metric, self.best):
            self.num_bad_epochs = 0
            self.best = metric
        else:
            self.num_bad_epochs += 1

        # Comparisons against NaN are always false, so a NaN best (for example
        # one assigned to ``self.best`` from outside) would never be replaced
        # by the branch above; adopt the first real metric instead.
        if np.isnan(self.best) and (not np.isnan(metric)):
            self.num_bad_epochs = 0
            self.best = metric

        self._stop = self.num_bad_epochs >= self.patience
        if self.verbose and self._stop:
            print('Early Stopping Triggered, best score is: ', self.best)
        return self._stop

    def _init_is_better(self, mode, min_delta, percentage):
        """Build the ``(new, best) -> bool`` improvement test for this configuration."""
        comparator = lt if mode == 'min' else gt
        if not percentage:
            def _is_better(new, best):
                # Absolute margin: beat the best by more than min_delta.
                target = best - min_delta if mode == 'min' else best + min_delta
                return comparator(new, target)
        else:
            def _is_better(new, best):
                # Relative margin: beat the best by more than min_delta percent of it.
                target = best * (1 - (min_delta / 100)) if mode == 'min' else best * (1 + (min_delta / 100))
                return comparator(new, target)
        return _is_better

    
def add_swap_noise_torch(X, ratio=.15):
    """Corrupt ``X`` by swapping individual cells with values from other rows.

    Each cell is selected independently with probability ``ratio``. Selected
    cells take the value found in the same column of a row-shuffled copy of
    ``X``; one row permutation is shared by all columns.

    Args:
        X: 2-D tensor to corrupt.
        ratio: Per-cell probability of being swapped.

    Returns:
        Tuple ``(obfuscated_X, obfuscation_mask)``; the mask has the shape of
        ``X`` and holds 1.0 where a cell was selected and 0.0 elsewhere.
    """
    # The mask is drawn first and the permutation second, so the random
    # stream is consumed in a fixed order.
    swap_probability = ratio * torch.ones(X.shape)
    obfuscation_mask = torch.bernoulli(swap_probability).to(X.device)
    row_order = torch.randperm(X.shape[0])
    donor_rows = X[row_order]
    is_swapped = obfuscation_mask == 1
    obfuscated_X = torch.where(is_swapped, donor_rows, X)
    return obfuscated_X, obfuscation_mask


class MLPModel(torch.nn.Module):
    """Two-hidden-layer MLP regressor whose output is squashed into a fixed range.

    Args:
        num_inputs: Width of the input feature vector.
        hidden_size: Width of both hidden layers.
        input_dropout: Dropout probability applied to the raw input; when it
            is 0 no input-dropout layer is created.
        dropout_rate: Dropout probability applied after each hidden activation.
        lower_bound: Smallest value the model can output.
        upper_bound: Largest value the model can output.
    """

    def __init__(self, num_inputs, hidden_size=512, input_dropout=.1, dropout_rate=.25, lower_bound=0, upper_bound=10.5):
        super().__init__()
        nn = torch.nn
        self.use_input_dropout = input_dropout > 0
        if input_dropout:
            self.input_dropout = nn.Dropout(input_dropout)
        self.linear_1 = nn.Linear(num_inputs, hidden_size)
        self.dropout_1 = nn.Dropout(dropout_rate)
        self.linear_2 = nn.Linear(hidden_size, hidden_size)
        self.dropout_2 = nn.Dropout(dropout_rate)
        self.last_linear = nn.Linear(hidden_size, 1)
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound

    def net(self, x):
        """Return the unbounded single-column output for the input batch ``x``."""
        if self.use_input_dropout:
            x = self.input_dropout(x)
        hidden_stages = (
            (self.linear_1, self.dropout_1),
            (self.linear_2, self.dropout_2),
        )
        for linear, dropout in hidden_stages:
            x = dropout(torch.nn.functional.relu(linear(x)))
        return self.last_linear(x)

    def forward(self, x):
        """Return predictions mapped into ``[lower_bound, upper_bound]`` via a sigmoid."""
        span = self.upper_bound - self.lower_bound
        squashed = torch.sigmoid(self.net(x))
        return squashed * span + self.lower_bound


class DeepStackDAE(torch.nn.Module):
    """Denoising autoencoder over mixed categorical / continuous tabular rows.

    Input rows hold ``num_cats`` integer-coded categorical columns followed by
    ``num_conts`` continuous columns. Categorical columns are one-hot encoded
    inside the model, passed through three equally wide hidden layers and
    decoded back to the one-hot + continuous layout.

    Args:
        cards: Cardinality (number of classes) of each categorical column, in
            column order. Any sequence of ints is accepted.
        hidden_size: Width of each of the three hidden layers.
        num_cats: Number of leading categorical columns in the input.
        num_conts: Number of trailing continuous columns in the input.
        emphasis: Loss weight given to cells flagged in the noise mask;
            unflagged cells are weighted by ``1 - emphasis``.
        lower_bound: Stored on the instance; not used by this class itself.
        upper_bound: Stored on the instance; not used by this class itself.
    """

    def __init__(self, cards, hidden_size=1500, num_cats=10, num_conts=14, emphasis=1, lower_bound=0, upper_bound=10.5):
        super().__init__()
        self.cards = cards
        self.num_cats = num_cats
        self.num_conts = num_conts

        # Width of a row once every categorical column is one-hot encoded.
        post_encoding_input_size = num_conts + sum(cards)

        self.linear_1 = torch.nn.Linear(in_features=post_encoding_input_size, out_features=hidden_size)
        self.linear_2 = torch.nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.linear_3 = torch.nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.linear_4 = torch.nn.Linear(in_features=hidden_size, out_features=post_encoding_input_size)

        self.emphasis = emphasis
        self.upper_bound = upper_bound
        self.lower_bound = lower_bound

    def one_hot_encoding(self, x):
        """Return the one-hot encodings of the categorical columns, concatenated column-wise."""
        encoded = torch.cat([
            torch.nn.functional.one_hot(x[:, i].long(), num_classes=self.cards[i])
            for i in range(self.num_cats)
        ], dim=1)
        return encoded

    def forward(self, x):
        """Return the three hidden activations followed by the reconstruction.

        Returns:
            Tuple ``(act_1, act_2, act_3, out)`` where ``out`` uses the
            one-hot + continuous layout accepted by :meth:`split`.
        """
        # Cast the integer one-hot block to the input dtype explicitly rather
        # than relying on implicit type promotion inside ``torch.cat``.
        one_hot = self.one_hot_encoding(x).to(x.dtype)
        x_post_encoded = torch.cat([one_hot, x[:, self.num_cats:]], dim=1)
        act_1 = torch.nn.functional.relu(self.linear_1(x_post_encoded))
        act_2 = torch.nn.functional.relu(self.linear_2(act_1))
        act_3 = torch.nn.functional.relu(self.linear_3(act_2))
        out = self.linear_4(act_3)
        return act_1, act_2, act_3, out

    def feature(self, x):
        """Return the three hidden activations concatenated along the feature axis."""
        return torch.cat(self.forward(x)[:-1], dim=1)

    def split(self, t):
        """Split a decoded tensor into one chunk per categorical column plus one continuous chunk."""
        # list(...) lets ``cards`` be any sequence (e.g. a tuple), not only a list.
        return torch.split(t, list(self.cards) + [self.num_conts], dim=1)

    def loss(self, x, y, mask=None, weights=(10, 14)):
        """Return the mean weighted reconstruction loss of ``x`` against the clean rows ``y``.

        Each categorical column contributes a per-row cross-entropy and each
        continuous column a per-row squared error, giving one loss value per
        input cell. Cells where ``mask`` is 1 are weighted by ``emphasis`` and
        the remaining cells by ``1 - emphasis``.

        Args:
            x: Possibly corrupted input rows.
            y: Clean target rows in the same column layout as ``x``.
            mask: Tensor shaped like ``x`` flagging corrupted cells; all ones
                when omitted.
            weights: Unused; retained for interface compatibility.
        """
        if mask is None:
            mask = torch.ones(x.shape, device=x.device)
        loss_weights = mask * self.emphasis + (1 - mask) * (1 - self.emphasis)

        out = self.split(self.forward(x)[-1])
        cat_losses = torch.cat([
            torch.nn.functional.cross_entropy(out[i], y[:, i].long(), reduction='none').reshape(-1, 1)
            for i in range(self.num_cats)
        ], dim=1)
        # Slice from the front, as forward() does: a negative slice would
        # select every column when num_conts is 0.
        cont_losses = torch.nn.functional.mse_loss(out[-1], y[:, self.num_cats:], reduction='none')
        unweighted_loss = torch.cat([cat_losses, cont_losses], dim=1)
        weighted_loss = loss_weights * unweighted_loss
        return weighted_loss.mean()

In [None]:
# prepare data
# Load the competition train / test tables.
train_data = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv')
test_data = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv')

# Stack train rows (first and last columns dropped) on top of test rows
# (first column dropped) so both share one encoding.
# NOTE(review): presumably the dropped columns are the id and the target - confirm.
X = np.vstack((
    train_data.iloc[:, 1:-1].to_numpy(),
    test_data.iloc[:, 1:].to_numpy(),
))

# Integer-encode the first 10 columns in place and record each cardinality.
cards = []
for col in range(10):
    column = X[:, col]
    encoder = LabelEncoder().fit(column)
    X[:, col] = encoder.transform(column)
    cards.append(len(encoder.classes_))
embeded_dims = [int(card ** (2/3)) for card in cards]

# Targets as an (n_rows, 1) column vector.
y = train_data['target'].to_numpy().reshape(-1, 1)

In [None]:
# hyperparams
# --- denoising autoencoder (DeepStackDAE) ---
# Width of each DAE hidden layer.
dae_hidden_size = 1500
# Per-cell swap probability handed to add_swap_noise_torch.
dae_noise_ratio = .3
dae_batch_size = 512 
# Initial Adam learning rate for the DAE.
dae_init_lr = 3e-4
# Multiplicative per-epoch decay used by the ExponentialLR scheduler.
dae_lr_gamma = .995
# Loss weight on corrupted cells (passed as `emphasis`); other cells get 1 - this value.
dae_denoise_emphesis = .8

# --- downstream regressor (MLPModel) ---
# Width of both MLP hidden layers.
mlp_hidden_size = 512
mlp_batch_size = 512
# Initial Adam learning rate for the MLP.
mlp_init_lr = 5e-5
# 0 means MLPModel creates no input-dropout layer.
mlp_input_dropout = 0
# Dropout probability after each hidden activation.
mlp_dropout = .4
# Passed to Adam as weight_decay.
mlp_l2_reg = 2e-3

In [None]:
# Train the denoising autoencoder on all rows (train and test features
# together): each batch is corrupted with swap noise and the network learns
# to reconstruct the clean rows.
dae_dl = DataLoader(dataset=PredictDataset(X), batch_size=dae_batch_size, shuffle=True, pin_memory=True, drop_last=True)
dae = DeepStackDAE(
    cards,
    hidden_size=dae_hidden_size,
    emphasis=dae_denoise_emphesis
).cuda()
optimizer = torch.optim.Adam(
    dae.parameters(),
    lr=dae_init_lr
)
scheduler = torch.optim.lr_scheduler.ExponentialLR(
    optimizer,
    gamma=dae_lr_gamma
)
# Early stopping monitors the epoch-average training (reconstruction) loss.
earlystopper = EarlyStopping(mode='min', min_delta=1e-7, patience=200, percentage=False, verbose=0)

for epoch in range(3000):
    dae.train()
    meter = AverageMeter()
    for x in dae_dl:
        x = x.cuda()
        noisy_x, mask = add_swap_noise_torch(x, dae_noise_ratio)
        optimizer.zero_grad()
        loss = dae.loss(noisy_x, x, mask)
        loss.backward()
        optimizer.step()
        # .item() yields a Python float, so the running average is accumulated
        # in double precision instead of as 0-d float32 NumPy arrays; this
        # matters because early stopping compares averages with a 1e-7 margin.
        meter.update(loss.item())
    scheduler.step()
    if epoch % 100 == 0:
        print(epoch, meter.avg)
    if earlystopper.step(meter.avg):
        break
    

In [None]:
# Train the MLP regressor on frozen DAE features, holding out the last 10% of
# the training rows for validation.
n_total = len(train_data)
cut_off = int(n_total * .9)
train_dl = DataLoader(dataset=TrainDataset(X[:cut_off], y[:cut_off]), batch_size=mlp_batch_size, shuffle=True, pin_memory=True, drop_last=True)
valid_dl = DataLoader(dataset=TrainDataset(X[cut_off:n_total], y[cut_off:]), batch_size=mlp_batch_size, shuffle=False, pin_memory=True, drop_last=False)
model = MLPModel(
    3 * dae_hidden_size,  # dae.feature concatenates the three hidden activations
    hidden_size=mlp_hidden_size,
    input_dropout=mlp_input_dropout,
    dropout_rate=mlp_dropout
).cuda()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=mlp_init_lr,
    weight_decay=mlp_l2_reg
)
# `verbose` is left at its default (off): the argument is deprecated in recent
# PyTorch releases, and omitting it keeps the behaviour identical.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=1/3, patience=10, cooldown=2, min_lr=1e-7)
earlystopper = EarlyStopping(mode='min', min_delta=1e-7, patience=20, percentage=False, verbose=0)

for epoch in range(777):
    model.train()
    for x, target in train_dl:
        x, target = x.cuda(), target.cuda()
        # The DAE is a fixed feature extractor here; no gradients flow into it.
        with torch.no_grad():
            x = dae.feature(x)
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(x), target)
        loss.backward()
        optimizer.step()

    model.eval()
    predictions = []
    with torch.no_grad():
        for x, _ in valid_dl:
            x = dae.feature(x.cuda())
            prediction = model(x)
            predictions.append(prediction.detach().cpu().numpy())
    predictions = np.concatenate(predictions)
    # RMSE computed as sqrt(MSE): the `squared=False` keyword was deprecated
    # and later removed from scikit-learn's mean_squared_error.
    valid_rmse = np.sqrt(mean_squared_error(valid_dl.dataset.y, predictions))
    scheduler.step(valid_rmse)
    if epoch % 20 == 0:
        print(epoch, valid_rmse)
    if earlystopper.step(valid_rmse):
        break

In [None]:
# Predict the test rows (everything after the first n_total rows of X) and
# write the submission file.
predictions = []
test_dl = DataLoader(dataset=PredictDataset(X[n_total:]), batch_size=mlp_batch_size, shuffle=False, pin_memory=True, drop_last=False)
# Put both networks in eval mode explicitly instead of relying on the state
# the training cell happened to leave them in (e.g. after an interrupted run,
# dropout in the MLP would otherwise still be active).
dae.eval()
model.eval()
with torch.no_grad():
    for x in test_dl:
        x = dae.feature(x.cuda())
        prediction = model(x)
        predictions.append(prediction.detach().cpu().numpy())
# Flatten the (n_rows, 1) model output to 1-D before assigning it as a column.
predictions = np.concatenate(predictions).ravel()

sub = pd.read_csv('../input/tabular-playground-series-feb-2021/sample_submission.csv')
sub['target'] = predictions
sub.to_csv('submission.csv', index=False)

In [None]:
# Show the first rows of the submission frame as a quick sanity check.
sub.head()