# Variational Autoencoder on Tabular Data

I use the [wine dataset](https://archive.ics.uci.edu/ml/datasets/wine) to show how a Variational Autoencoder (VAE) built with PyTorch works on tabular data. I use the VAE to reduce the dimensionality of the dataset, in this case down to 3 variables (embeddings). I then plot the embeddings in a 3D graph to show how a VAE is similar to PCA but works in a non-linear way. (Note: the code cells below load `Data/diabetes.csv`.)

# Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

import pandas as pd
import numpy as np
from sklearn import preprocessing
from scipy.io import arff

In [2]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

# Define Path to Dataset

In [3]:
# Location of the CSV file that is handed to DataBuilder further down.
DATA_PATH = 'Data/diabetes.csv'

# Define Functions

In [4]:
# Preview the raw data, hiding the first and the last column of the CSV.
# The path comes from DATA_PATH so the preview and the loader cannot drift apart.
df = pd.read_csv(DATA_PATH)
df.iloc[:, 1:-1]

Unnamed: 0,plas,pres,skin,insu,mass,pedi,age
0,148,72,35,0,33.6,0.627,50
1,85,66,29,0,26.6,0.351,31
2,183,64,0,0,23.3,0.672,32
3,89,66,23,94,28.1,0.167,21
4,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63
764,122,70,27,0,36.8,0.340,27
765,121,72,23,112,26.2,0.245,30
766,126,60,0,0,30.1,0.349,47


In [5]:
def load_diabetes_data(path):
    """Load the CSV at ``path`` and return standardized features and targets.

    Every column except the last is treated as a feature; the last column is
    returned untouched as the target.

    Returns:
        A tuple ``(x_train, standardizer, df_target)``: the standardized
        features as a tensor moved to the module-level ``device``, the fitted
        ``StandardScaler`` (needed to undo the scaling later), and the last
        column as a NumPy array.
    """
    frame = pd.read_csv(path, sep=',')
    features = frame.iloc[:, :-1]
    df_target = frame.iloc[:, -1].values

    # Cast the feature matrix to float32 before standardizing it.
    n_features = features.shape[1]
    raw = features.values.reshape(-1, n_features).astype('float32')

    # Standardize the values.
    standardizer = preprocessing.StandardScaler()
    scaled = standardizer.fit_transform(raw)

    x_train = torch.from_numpy(scaled).to(device)
    return x_train, standardizer, df_target

# Build DataLoader

In [6]:
from torch.utils.data import Dataset, DataLoader
class DataBuilder(Dataset):
    """Dataset that serves rows of the standardized feature tensor.

    Args:
        path: Location of the CSV file, forwarded to ``load_diabetes_data``.
    """

    def __init__(self, path):
        # x: standardized feature tensor, one row per sample.
        # standardizer: fitted scaler returned by load_diabetes_data.
        # wine: target column; the attribute name is kept because other
        #       cells read ``data_set.wine``.
        # Load from the path the caller passed in, not the global DATA_PATH.
        self.x, self.standardizer, self.wine = load_diabetes_data(path)
        self.len = self.x.shape[0]

    def __getitem__(self, index):
        """Return the feature row(s) at ``index``."""
        return self.x[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len

In [7]:
# Wrap the CSV in a Dataset and serve it in batches of 32 rows.
data_set = DataBuilder(DATA_PATH)
trainloader = DataLoader(data_set, batch_size=32)

In [8]:
data_set.x

tensor([[ 0.6399,  0.8483,  0.1496,  ...,  0.2040,  0.4685,  1.4260],
        [-0.8449, -1.1234, -0.1605,  ..., -0.6844, -0.3651, -0.1907],
        [ 1.2339,  1.9437, -0.2639,  ..., -1.1033,  0.6044, -0.1056],
        ...,
        [ 0.3430,  0.0033,  0.1496,  ..., -0.7352, -0.6852, -0.2758],
        [-0.8449,  0.1598, -0.4707,  ..., -0.2402, -0.3711,  1.1707],
        [-0.8449, -0.8730,  0.0462,  ..., -0.2021, -0.4738, -0.8714]],
       device='cuda:0')

# Build Model and train it

In [9]:
class Autoencoder(nn.Module):
    """Variational autoencoder built from fully connected layers.

    The encoder maps ``D_in`` input features through hidden layers of width
    ``H`` and ``H2`` to a ``latent_dim``-dimensional mean and log-variance;
    the decoder maps a latent vector back to ``D_in`` features.

    Args:
        D_in: Number of input features.
        H: Width of the first encoder / last decoder hidden layer.
        H2: Width of the inner hidden layers.
        latent_dim: Size of the latent embedding.
    """

    def __init__(self, D_in, H=50, H2=12, latent_dim=3):
        super().__init__()

        # Encoder
        self.linear1 = nn.Linear(D_in, H)
        self.lin_bn1 = nn.BatchNorm1d(num_features=H)
        self.linear2 = nn.Linear(H, H2)
        self.lin_bn2 = nn.BatchNorm1d(num_features=H2)
        self.linear3 = nn.Linear(H2, H2)
        self.lin_bn3 = nn.BatchNorm1d(num_features=H2)

        # Latent heads: fc21 produces mu, fc22 produces the log-variance
        self.fc1 = nn.Linear(H2, latent_dim)
        self.fc21 = nn.Linear(latent_dim, latent_dim)
        self.fc22 = nn.Linear(latent_dim, latent_dim)

        # Layers applied to the sampled latent vector before the decoder
        self.fc3 = nn.Linear(latent_dim, latent_dim)
        self.fc4 = nn.Linear(latent_dim, H2)

        # Decoder
        self.linear4 = nn.Linear(H2, H2)
        self.lin_bn4 = nn.BatchNorm1d(num_features=H2)
        self.linear5 = nn.Linear(H2, H)
        self.lin_bn5 = nn.BatchNorm1d(num_features=H)
        self.linear6 = nn.Linear(H, D_in)
        self.lin_bn6 = nn.BatchNorm1d(num_features=D_in)

        self.gelu = nn.GELU()

    def encode(self, x):
        """Return ``(mu, logvar)`` of the latent distribution for ``x``."""
        lin1 = self.gelu(self.lin_bn1(self.linear1(x)))
        lin2 = self.gelu(self.lin_bn2(self.linear2(lin1)))
        lin3 = self.gelu(self.lin_bn3(self.linear3(lin2)))

        fc1 = F.relu(self.fc1(lin3))

        mu = self.fc21(fc1)
        # forward() treats this output as the log-variance, not as sigma.
        logvar = self.fc22(fc1)

        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * std`` while training; return ``mu`` otherwise."""
        if not self.training:
            return mu
        # logvar = log(sigma^2), hence sigma = exp(logvar / 2).
        std = torch.exp(0.5 * logvar)
        # Noise with the same shape, dtype and device as std.
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map the latent vector ``z`` back to ``D_in`` features."""
        # NOTE: the original author was unsure why fc3 and fc4 are required.
        fc3 = self.gelu(self.fc3(z))
        fc4 = self.gelu(self.fc4(fc3))

        lin4 = self.gelu(self.lin_bn4(self.linear4(fc4)))
        lin5 = self.gelu(self.lin_bn5(self.linear5(lin4)))
        return self.lin_bn6(self.linear6(lin5))

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the batch ``x``."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        # The decoded z is what the training loop calls recon_batch.
        return self.decode(z), mu, logvar

    def embed(self, x):
        """Return the latent vector for ``x`` (equal to ``mu`` in eval mode)."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return z
    

In [10]:
class customLoss(nn.Module):
    """VAE objective: summed squared reconstruction error plus a KL term."""

    def __init__(self):
        super().__init__()
        self.mse_loss = nn.MSELoss(reduction="sum")

    def forward(self, x_recon, x, mu, logvar):
        """Return the combined loss for one batch.

        ``x_recon`` is the recon_batch produced by the model's forward pass,
        ``x`` is the original batch, and ``mu`` / ``logvar`` are the latent
        mean and log-variance returned alongside it.
        """
        reconstruction = self.mse_loss(x_recon, x)
        kld_terms = 1 + logvar - mu.pow(2) - logvar.exp()
        divergence = -0.5 * torch.sum(kld_terms)
        return reconstruction + divergence

In [11]:
# One input unit per feature column of the loaded data.
D_in = data_set.x.shape[1]
H, H2 = 50, 12
model = Autoencoder(D_in, H=H, H2=H2).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [12]:
# Despite the name, this is customLoss: summed MSE plus the KL divergence term.
loss_mse = customLoss()

# Train

In [13]:
# Number of passes over trainloader.
epochs = 1000
# Not written to by any of the cells shown in this notebook.
val_losses = []
# train() appends the average per-sample loss here on the epochs it reports.
train_losses = []

In [14]:
def train(epoch):
    """Run one optimization pass over ``trainloader``.

    On every 200th epoch the average per-sample loss is printed and appended
    to ``train_losses``; on all other epochs nothing is recorded.
    """
    model.train()
    running_loss = 0
    for batch in trainloader:
        batch = batch.to(device)
        optimizer.zero_grad()
        reconstruction, mu, logvar = model(batch)
        batch_loss = loss_mse(reconstruction, batch, mu, logvar)
        batch_loss.backward()
        running_loss += batch_loss.item()
        optimizer.step()

    if epoch % 200 != 0:
        return

    n_samples = len(trainloader.dataset)
    print('====> Epoch: {} Average loss: {:.4f}'.format(
        epoch, running_loss / n_samples))
    train_losses.append(running_loss / n_samples)

In [15]:
# Epochs are numbered from 1, so train() reports on epochs 200, 400, ...
for epoch in range(1, epochs + 1):
    train(epoch)

====> Epoch: 200 Average loss: 5.9565
====> Epoch: 400 Average loss: 5.8488
====> Epoch: 600 Average loss: 5.7728
====> Epoch: 800 Average loss: 5.6425
====> Epoch: 1000 Average loss: 5.6685


# Evaluate

In [16]:
# Scaler fitted while loading the data; used below to undo the standardization.
standardizer = trainloader.dataset.standardizer

In [17]:
# Put the model in eval mode and run every batch through it once.
model.eval()
test_loss = 0
# no_grad() means we use the previously learned weights and do not update them
with torch.no_grad():
    for i, data in enumerate(trainloader):
        data = data.to(device)
        # After the loop, data / recon_batch / mu / logvar hold the last batch.
        recon_batch, mu, logvar = model(data)

In [19]:
data.shape

torch.Size([32, 8])

In [22]:
index = 3
# Slice one row out of the batch; slicing keeps the leading batch dimension.
sample = data[index:index + 1]
recon_batch, mu, logvar = model(sample)

In [23]:
# Undo the standardization on the original row and on its reconstruction.
original_row = data[index].cpu().numpy().reshape(1, -1)
reconstructed_row = recon_batch.detach().cpu().numpy().reshape(1, -1)
print(standardizer.inverse_transform(original_row))
print(standardizer.inverse_transform(reconstructed_row))

[[ 1.0000001e+00  1.0200000e+02  7.4000000e+01 -6.3578290e-07
   2.5431316e-06  3.9500000e+01  2.9300001e-01  4.2000000e+01]]
[[  3.2997031 128.95108    75.40324     2.5731156  10.135608   32.635094
    0.3530952  32.513977 ]]


In [25]:
# embed() encodes the sample and passes the result through reparameterize().
latent = model.embed(sample)
print(latent)

tensor([[-0.0117, -0.1629,  1.0278]], device='cuda:0',
       grad_fn=<AddmmBackward0>)


# Get Embeddings

In [None]:
mu_output = []
logvar_output = []

# Make this cell independent of earlier ones: in eval mode reparameterize()
# returns mu, so the collected embeddings involve no random sampling.
model.eval()
with torch.no_grad():
    for data in trainloader:
        data = data.to(device)
        recon_batch, mu, logvar = model(data)
        mu_output.append(mu)
        logvar_output.append(logvar)

# Concatenate once after the loop instead of re-joining the list every batch.
mu_result = torch.cat(mu_output, dim=0)
logvar_result = torch.cat(logvar_output, dim=0)

In [None]:
mu_result.shape

In [None]:
mu_result[1:5,:]

# Plot Embeddings

In [None]:
from mpl_toolkits import mplot3d

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

In [None]:
ax = plt.axes(projection='3d')
# Colour each point by its target value (stored on the dataset as ``wine``).
winetype = data_set.wine
# Latent column 0 goes on the z axis, column 1 on x and column 2 on y.
zdata, xdata, ydata = mu_result.cpu().numpy().T
ax.scatter3D(xdata, ydata, zdata, c=winetype);