# Variational Autoencoder on Tabular Data

I use [wine dataset](https://archive.ics.uci.edu/ml/datasets/wine) to show how Variational Autoencoder (VAE) with PyTorch on tabular data works. I use the VAE to reduce the dimensionality of dataset, in this case don to 3 Variables (embeddings). I then plot the embeddings in a 3D graph to show how VAE is similar to a PCA but works in a non-linear way.

# Imports

In [84]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

import pandas as pd
import numpy as np
from sklearn import preprocessing
from scipy.io import arff

In [85]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

# Define Path to Dataset

In [86]:
DATA_PATH = 'Data/diabetes.csv'

# Define Functions

In [87]:
df = pd.read_csv('Data/diabetes.csv')
df.iloc[:, 1:-1]

Unnamed: 0,plas,pres,skin,insu,mass,pedi,age
0,148,72,35,0,33.6,0.627,50
1,85,66,29,0,26.6,0.351,31
2,183,64,0,0,23.3,0.672,32
3,89,66,23,94,28.1,0.167,21
4,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63
764,122,70,27,0,36.8,0.340,27
765,121,72,23,112,26.2,0.245,30
766,126,60,0,0,30.1,0.349,47


In [262]:
def load_diabetes_data(path, sep=","):
    # read in from csv
    df = pd.read_csv(path, sep=sep)
    df_base = df.iloc[:, :-1]
    df_target = df.iloc[:,-1].values
    x = df_base.values.reshape(-1, df_base.shape[1]).astype('float32')
    # stadardize values
    standardizer = preprocessing.StandardScaler()
    x_train = standardizer.fit_transform(x)
    x_train = torch.from_numpy(x_train).to(device)
    return x_train, standardizer, df_target

# Build DataLoader

In [263]:
from torch.utils.data import Dataset, DataLoader
class DataBuilder(Dataset):
    def __init__(self, path, sep=","):
        self.x, self.standardizer, self.outcome = load_diabetes_data(path, sep)
        self.len=self.x.shape[0]
    def __getitem__(self,index):
        return self.x[index]
    def __len__(self):
        return self.len

In [198]:
data_set=DataBuilder(DATA_PATH)
print(data_set.outcome[0])
trainloader=DataLoader(dataset=data_set,batch_size=32)

b'tested_positive'


In [91]:
data_set.x

tensor([[ 0.6399,  0.8483,  0.1496,  ...,  0.2040,  0.4685,  1.4260],
        [-0.8449, -1.1234, -0.1605,  ..., -0.6844, -0.3651, -0.1907],
        [ 1.2339,  1.9437, -0.2639,  ..., -1.1033,  0.6044, -0.1056],
        ...,
        [ 0.3430,  0.0033,  0.1496,  ..., -0.7352, -0.6852, -0.2758],
        [-0.8449,  0.1598, -0.4707,  ..., -0.2402, -0.3711,  1.1707],
        [-0.8449, -0.8730,  0.0462,  ..., -0.2021, -0.4738, -0.8714]])

# Build Model and train it

In [92]:
class Autoencoder(nn.Module):
    def __init__(self,D_in,H=50,H2=12,latent_dim=3):

        #Encoder
        super(Autoencoder,self).__init__()
        self.linear1=nn.Linear(D_in,H)
        self.lin_bn1 = nn.BatchNorm1d(num_features=H)
        self.linear2=nn.Linear(H,H2)
        self.lin_bn2 = nn.BatchNorm1d(num_features=H2)
        self.linear3=nn.Linear(H2,H2)
        self.lin_bn3 = nn.BatchNorm1d(num_features=H2)

#         # Latent vectors mu and sigma
        self.fc1 = nn.Linear(H2, latent_dim)
#        self.bn1 = nn.BatchNorm1d(num_features=latent_dim)
        self.fc21 = nn.Linear(latent_dim, latent_dim)
        self.fc22 = nn.Linear(latent_dim, latent_dim)

#         # Sampling vector
        self.fc3 = nn.Linear(latent_dim, latent_dim)
#         self.fc_bn3 = nn.BatchNorm1d(latent_dim)
        self.fc4 = nn.Linear(latent_dim, H2)
#         self.fc_bn4 = nn.BatchNorm1d(H2)

#         # Decoder
        self.linear4=nn.Linear(H2,H2)
        self.lin_bn4 = nn.BatchNorm1d(num_features=H2)
        self.linear5=nn.Linear(H2,H)
        self.lin_bn5 = nn.BatchNorm1d(num_features=H)
        self.linear6=nn.Linear(H,D_in)
        self.lin_bn6 = nn.BatchNorm1d(num_features=D_in)

        self.gelu = nn.GELU()

    def encode(self, x):
        lin1 = self.gelu(self.lin_bn1(self.linear1(x)))
        lin2 = self.gelu(self.lin_bn2(self.linear2(lin1)))
        lin3 = self.gelu(self.lin_bn3(self.linear3(lin2)))

        fc1 = F.relu(self.fc1(lin3))

        r1 = self.fc21(fc1) # Generating mu
        r2 = self.fc22(fc1) # Generating sigma

        return r1, r2

    def reparameterize(self, mu, logvar):
        if self.training:
            std = logvar.mul(0.5).exp_() # Convert it to std deviation
            eps = Variable(std.data.new(std.size()).normal_()) # Generate a noise of same size as std
            return eps.mul(std).add_(mu) # Perform reparameterization
        else:
            return mu

    def decode(self, z):
        fc3 = self.gelu(self.fc3(z)) # Not sure why these two are required. 
        fc4 = self.gelu(self.fc4(fc3))#.view(128, -1)

        lin4 = self.gelu(self.lin_bn4(self.linear4(fc4)))
        lin5 = self.gelu(self.lin_bn5(self.linear5(lin4)))
        return self.lin_bn6(self.linear6(lin5))



    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        # self.decode(z) ist später recon_batch, mu ist mu und logvar ist logvar
        return self.decode(z), mu, logvar
    
    def embed(self,x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return z
    

In [93]:
class customLoss(nn.Module):
    def __init__(self):
        super(customLoss, self).__init__()
        self.mse_loss = nn.MSELoss(reduction="sum")

    # x_recon is the reconstruction batch created in the forward pass of the model, x is the original x batch, mu is mu, and logvar is logvar
    def forward(self, x_recon, x, mu, logvar):
        loss_MSE = self.mse_loss(x_recon, x)
        loss_KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

        return loss_MSE + loss_KLD

In [94]:
D_in = data_set.x.shape[1]
H = 50
H2 = 12
model = Autoencoder(D_in, H, H2).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [95]:
loss_mse = customLoss()

# Train

In [96]:
epochs = 1000
val_losses = []
train_losses = []

In [97]:
def train(epoch):
    model.train()
    train_loss = 0
    for batch_idx, data in enumerate(trainloader):
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_mse(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    if epoch % 200 == 0:
        print('====> Epoch: {} Average loss: {:.4f}'.format(
            epoch, train_loss / len(trainloader.dataset)))
        train_losses.append(train_loss / len(trainloader.dataset))

In [98]:
for epoch in range(1, epochs + 1):
    train(epoch)

====> Epoch: 200 Average loss: 5.9511
====> Epoch: 400 Average loss: 5.8387
====> Epoch: 600 Average loss: 5.8386
====> Epoch: 800 Average loss: 5.7032
====> Epoch: 1000 Average loss: 5.6341


# Evaluate

In [99]:
standardizer = trainloader.dataset.standardizer

In [106]:
model.eval()
test_loss = 0
# no_grad() bedeutet wir nehmen die vorher berechneten Gewichte und erneuern sie nicht
with torch.no_grad():
    for i, data in enumerate(trainloader):
        data = data.to(device)
        recon_batch, mu, logvar = model(data)

In [107]:
data[0].unsqueeze(0)

tensor([[-1.1419,  0.1598,  0.8734,  0.4054,  0.3491, -0.5829,  0.1302, -1.0415]])

In [108]:
index = 3
sample = data[index].unsqueeze(0) # Sample one of the data and add batch dimension
recon_batch, mu, logvar = model(sample)

In [109]:
print(standardizer.inverse_transform(data[index].cpu().numpy().reshape(1,-1)))
print(standardizer.inverse_transform(recon_batch.cpu().detach().numpy().reshape(1,-1)))

[[ 1.0000001e+00  1.0200000e+02  7.4000000e+01 -6.3578290e-07
   2.5431316e-06  3.9500000e+01  2.9300001e-01  4.2000000e+01]]
[[  4.1282735  120.91098     77.64493      3.7382195   11.428401
   31.841553     0.34282812  32.976517  ]]


In [110]:
latent = model.embed(sample)
print(latent)

tensor([[ 0.7332,  0.0037, -0.6617]], grad_fn=<AddmmBackward0>)


In [41]:
trainloader

<torch.utils.data.dataloader.DataLoader at 0x129e0fb50>

In [32]:
latent.detach().numpy()

array([[-0.50417924, -0.01486496,  1.1199479 ]], dtype=float32)

# Get Diabeted Latent

In [130]:
df = load_diabetes_data('Data/diabetes.csv')
actual_data = df[0]
outcomes = df[2]

In [131]:
outcomes_numeric = [1 if outcome == "b'tested_positive'" else 0 for outcome in outcomes]
outcomes_numeric

[1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,


In [142]:
latents = []
outcomes = []

In [143]:
with torch.no_grad():
    for idx, e in enumerate(actual_data):
        sample = e.unsqueeze(0)  # Add batch dimension
        latent = model.embed(sample)  # Get the latent representation
        latents.append(latent.squeeze().cpu().numpy())

In [144]:
latents_df = pd.DataFrame(latents)
outcomes_df = pd.DataFrame(outcomes_numeric)
outcome_original = pd.DataFrame(df[2])
# Save DataFrame to a CSV file
data_with_outcomes = pd.concat([latents_df, outcomes_df, outcome_original], axis=1)

data_with_outcomes.to_csv('latent_representations_with_outcomes.csv', index=False)


# Testing Reconstruction Loss

In [None]:
decoder = model()

# Get Embeddings

In [23]:
mu_output = []
logvar_output = []

with torch.no_grad():
    for i, (data) in enumerate(trainloader):
            data = data.to(device)
            optimizer.zero_grad()
            recon_batch, mu, logvar = model(data)


            mu_tensor = mu
            mu_output.append(mu_tensor)
            mu_result = torch.cat(mu_output, dim=0)

            logvar_tensor = logvar
            logvar_output.append(logvar_tensor)
            logvar_result = torch.cat(logvar_output, dim=0)

In [24]:
mu_result.shape

torch.Size([768, 3])

In [25]:
mu_result[1:5,:]

tensor([[-6.1445e-01, -2.6897e-03, -4.6281e-02],
        [ 2.7454e-01, -1.3039e-02,  1.1808e+00],
        [-6.1821e-01, -2.8625e-03, -3.4618e-02],
        [-6.1318e-01,  3.4801e-02, -3.0064e+00]])

# Plot Embeddings

In [27]:
from mpl_toolkits import mplot3d

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

# Resemblance Measure

In [19]:
import numpy as np
from scipy.stats import pearsonr, ks_2samp
from scipy.spatial.distance import jensenshannon
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBClassifier

In [20]:
def compute_categorical_similarity(col_real, col_synthetic):
    # Compute Theil's U for categorical features
    p_real = pd.Series(col_real).value_counts(normalize=True)
    p_synthetic = pd.Series(col_synthetic).value_counts(normalize=True)
    u = (p_real * np.log(p_real / p_synthetic)).sum()
    return 1 - u

In [186]:
def column_similarity(real_data, synthetic_data):
    similarities = []
    for col_real, col_synthetic in zip(real_data, synthetic_data):
        correlation, _ = pearsonr(col_real, col_synthetic)
        similarity = correlation
        similarities.append(similarity)
    return np.mean(similarities)

In [22]:
def correlation_similarity(real_data, synthetic_data):
    real_corr = np.corrcoef(real_data, rowvar=False)
    synthetic_corr = np.corrcoef(synthetic_data, rowvar=False)
    correlation, _ = pearsonr(real_corr.flatten(), synthetic_corr.flatten())
    return correlation

In [23]:
def jensen_shannon_similarity(real_data, synthetic_data):
    similarities = []
    for col_real, col_synthetic in zip(real_data.T, synthetic_data.T):
        # Compute probability distributions and Jensen-Shannon divergence
        p_real = np.histogram(col_real, bins=10, density=True)[0]
        p_synthetic = np.histogram(col_synthetic, bins=10, density=True)[0]
        similarity = 1 - jensenshannon(p_real, p_synthetic)
        similarities.append(similarity)
    return np.mean(similarities)

In [24]:
def kolmogorov_smirnov_similarity(real_data, synthetic_data):
    similarities = []
    for col_real, col_synthetic in zip(real_data.T, synthetic_data.T):
        # Compute cumulative distributions and Kolmogorov-Smirnov distance
        _, p_value = ks_2samp(col_real, col_synthetic)
        similarity = 1 - p_value
        similarities.append(similarity)
    return np.mean(similarities)

In [25]:
def propensity_mean_absolute_similarity(real_data, synthetic_data):
    # Train XGBoost classifier to discriminate between real and synthetic samples
    X = np.vstack([real_data, synthetic_data])
    y = np.concatenate([np.ones(len(real_data)), np.zeros(len(synthetic_data))])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifier = XGBClassifier()
    classifier.fit(X_train, y_train)
    # Compute mean absolute error of classifier probabilities
    y_pred_proba = classifier.predict_proba(X_test)[:, 1]
    error = mean_absolute_error(y_test, y_pred_proba)
    return 1 - error

In [41]:
real_data = np.random.randn(100, 5)
synthetic_data = np.random.randn(100, 5)
resemblance_score = (
    column_similarity(real_data, synthetic_data) +
    correlation_similarity(real_data, synthetic_data) +
    jensen_shannon_similarity(real_data, synthetic_data) +
    kolmogorov_smirnov_similarity(real_data, synthetic_data) +
    propensity_mean_absolute_similarity(real_data, synthetic_data)
) / 5
print("Resemblance Score:", resemblance_score)

Resemblance Score: 0.5531516708702601


In [174]:
column_similarity(real_data, real_data)

1.0

In [47]:
jensen_shannon_similarity(real_data, synthetic_data)

1.0

In [63]:
gen_data_x = np.load('X_num_train.npy')

In [65]:
gen_data_x

array([[-8.70613469e-01, -3.26594508e-03,  2.72276446e-01],
       [ 1.72692685e+00, -4.62443238e-03, -2.61707506e+00],
       [-1.46701814e+00,  3.29327470e-03,  1.81796738e+00],
       ...,
       [-3.07056424e-01, -3.36094633e-03, -2.15171718e-01],
       [-7.13104756e-01,  1.57992833e-03,  1.07577687e+00],
       [-9.54200109e-01,  4.73620546e-03,  1.78563590e+00]])

In [51]:
gen_data_y = np.load('y_diabetes_train.npy')

In [52]:
gen_data_y

array([0, 1, 0, ..., 0, 1, 1])

In [61]:
propensity_mean_absolute_similarity(real_data, synthetic_data)

0.5329577771015466

In [62]:
kolmogorov_smirnov_similarity(real_data, synthetic_data)

0.4982160155142396

# Generate Data From Decoder

In [114]:
gen_latent = np.load('X_num_train.npy')

In [115]:
gen_latent

array([[-8.70613469e-01, -3.26594508e-03,  2.72276446e-01],
       [ 1.72692685e+00, -4.62443238e-03, -2.61707506e+00],
       [-1.46701814e+00,  3.29327470e-03,  1.81796738e+00],
       ...,
       [-3.07056424e-01, -3.36094633e-03, -2.15171718e-01],
       [-7.13104756e-01,  1.57992833e-03,  1.07577687e+00],
       [-9.54200109e-01,  4.73620546e-03,  1.78563590e+00]])

In [116]:
gen_torch_data = torch.from_numpy(gen_latent).float()

In [117]:
gen_torch_data[0].unsqueeze(0)

tensor([[-0.8706, -0.0033,  0.2723]])

In [120]:
z = model.decode(gen_torch_data)

In [123]:
z.shape

torch.Size([10000, 8])

In [128]:
generated_data = standardizer.inverse_transform(z.cpu().detach().numpy())

In [130]:
generated_data

array([[  1.3103098 , 110.829765  ,  66.52012   , ...,  33.312458  ,
          0.4907393 ,  24.706215  ],
       [  1.9702326 ,  81.967125  ,  46.508427  , ...,  17.81031   ,
          0.8155537 ,  61.53886   ],
       [  1.2547129 , 157.32109   ,  80.13958   , ...,  39.85358   ,
          0.9414892 ,  25.419695  ],
       ...,
       [  2.2281542 , 106.60665   ,  66.78925   , ...,  28.889374  ,
          0.4007322 ,  25.88699   ],
       [  1.787898  , 138.67224   ,  71.98399   , ...,  36.225372  ,
          0.45762414,  26.242487  ],
       [  0.92643625, 148.66544   ,  79.21351   , ...,  41.56632   ,
          0.43674093,  25.877264  ]], dtype=float32)

In [126]:
gen_latent_y = np.load('y_diabetes_train.npy')

In [127]:
gen_latent_y

array([0, 1, 0, ..., 0, 1, 1])

In [132]:
x_real = load_diabetes_data("Data/diabetes.csv")

In [133]:
x_real[0]

tensor([[ 0.6399,  0.8483,  0.1496,  ...,  0.2040,  0.4685,  1.4260],
        [-0.8449, -1.1234, -0.1605,  ..., -0.6844, -0.3651, -0.1907],
        [ 1.2339,  1.9437, -0.2639,  ..., -1.1033,  0.6044, -0.1056],
        ...,
        [ 0.3430,  0.0033,  0.1496,  ..., -0.7352, -0.6852, -0.2758],
        [-0.8449,  0.1598, -0.4707,  ..., -0.2402, -0.3711,  1.1707],
        [-0.8449, -0.8730,  0.0462,  ..., -0.2021, -0.4738, -0.8714]])

In [134]:
x_real_real = standardizer.inverse_transform(x_real[0].cpu().detach().numpy())

In [135]:
x_real_real

array([[  6.       , 148.       ,  72.       , ...,  33.6      ,
          0.627    ,  50.       ],
       [  1.0000001,  85.       ,  66.       , ...,  26.6      ,
          0.351    ,  31.       ],
       [  8.       , 183.       ,  64.       , ...,  23.3      ,
          0.672    ,  32.       ],
       ...,
       [  5.       , 121.       ,  72.       , ...,  26.2      ,
          0.245    ,  30.       ],
       [  1.0000001, 126.       ,  60.       , ...,  30.1      ,
          0.349    ,  47.       ],
       [  1.0000001,  93.       ,  70.       , ...,  30.4      ,
          0.315    ,  23.       ]], dtype=float32)

In [155]:
generated_data[:769].shape

(769, 8)

In [185]:
column_similarity(generated_data[:768], x_real_real)

[  1.3103098 110.829765   66.52012    31.02438    97.52567    33.312458
   0.4907393  24.706215 ]
***********
[6.0000000e+00 1.4800000e+02 7.2000000e+01 3.5000000e+01 2.5431316e-06
 3.3599998e+01 6.2699997e-01 5.0000000e+01]
[ 1.9702326 81.967125  46.508427   2.013879  -9.124097  17.81031
  0.8155537 61.53886  ]
***********
[1.0000001e+00 8.5000000e+01 6.6000000e+01 2.9000000e+01 2.5431316e-06
 2.6600000e+01 3.5100001e-01 3.1000000e+01]
[  1.2547129 157.32109    80.13958    40.5078    346.85416    39.85358
   0.9414892  25.419695 ]
***********
[ 8.0000000e+00  1.8300000e+02  6.4000000e+01 -6.3578290e-07
  2.5431316e-06  2.3299999e+01  6.7199999e-01  3.2000000e+01]
[  5.443246   142.55363     79.4045       0.78184825  -1.5882543
  32.132923     0.3180279   52.077732  ]
***********
[ 1.0000001  89.         66.         23.         94.         28.1
  0.16700001 21.        ]
[  2.6073902  107.47725     70.59845     12.094096    29.020742
  28.23395      0.37615764  26.35084   ]
***********


0.7536006864475144

In [None]:
correlation_similarity(generated_data, x_real_real)

In [187]:
resemblance_score = (
    column_similarity(generated_data[:768], x_real_real) +
    correlation_similarity(generated_data[:768], x_real_real) +
    jensen_shannon_similarity(generated_data[:768], x_real_real) +
    kolmogorov_smirnov_similarity(generated_data[:768], x_real_real) +
    propensity_mean_absolute_similarity(generated_data[:768], x_real_real)
) / 5
print("Resemblance Score:", resemblance_score)

Resemblance Score: 0.8307471509439492


# Cardio Dataset

In [233]:
cadio_df = pd.read_csv('Data/cardio_train.csv', delimiter=";")
df_base = cadio_df.iloc[:, :-1]
df_target = cadio_df.iloc[:,-1].values

In [236]:
df_base

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,0,18393,2,168,62.0,110,80,1,1,0,0,1
1,1,20228,1,156,85.0,140,90,3,1,0,0,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0


In [225]:
cardio_data = load_diabetes_data("Data/cardio_train.csv", sep=";")

In [211]:
cardio_data[0].shape

torch.Size([70000, 12])

In [265]:
data_set_2=DataBuilder("Data/cardio_train.csv", sep=";")
trainloader=DataLoader(dataset=data_set,batch_size=32)

In [270]:
def train(epoch):
    model.train()
    train_loss = 0
    for batch_idx, data in enumerate(trainloader):
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_mse(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    if epoch % 200 == 0:
        print('====> Epoch: {} Average loss: {:.4f}'.format(
            epoch, train_loss / len(trainloader.dataset)))
        train_losses.append(train_loss / len(trainloader.dataset))

In [271]:
for epoch in range(1, epochs + 1):
    train(epoch)

====> Epoch: 5 Average loss: 5.4874
====> Epoch: 10 Average loss: 5.5752
====> Epoch: 15 Average loss: 5.5025
====> Epoch: 20 Average loss: 5.6022
====> Epoch: 25 Average loss: 5.5200
====> Epoch: 30 Average loss: 5.6080
====> Epoch: 35 Average loss: 5.5140
====> Epoch: 40 Average loss: 5.5755
====> Epoch: 45 Average loss: 5.5081
====> Epoch: 50 Average loss: 5.5166
====> Epoch: 55 Average loss: 5.4810
====> Epoch: 60 Average loss: 5.5351
====> Epoch: 65 Average loss: 5.5991
====> Epoch: 70 Average loss: 5.4943
====> Epoch: 75 Average loss: 5.4810
====> Epoch: 80 Average loss: 5.5693
====> Epoch: 85 Average loss: 5.5784
====> Epoch: 90 Average loss: 5.4453
====> Epoch: 95 Average loss: 5.5090
====> Epoch: 100 Average loss: 5.5136
====> Epoch: 105 Average loss: 5.4572
====> Epoch: 110 Average loss: 5.5596
====> Epoch: 115 Average loss: 5.5373
====> Epoch: 120 Average loss: 5.5564
====> Epoch: 125 Average loss: 5.5562
====> Epoch: 130 Average loss: 5.4625
====> Epoch: 135 Average loss: 5

In [290]:
model.eval()
test_loss = 0
# no_grad() bedeutet wir nehmen die vorher berechneten Gewichte und erneuern sie nicht
with torch.no_grad():
    for i, data in enumerate(trainloader):
        data = data.to(device)
        recon_batch, mu, logvar = model(data)

In [292]:
data[0]
index = 3
sample = data[index].unsqueeze(0) # Sample one of the data and add batch dimension
recon_batch, mu, logvar = model(sample)


In [306]:
data.shape

torch.Size([32, 8])

In [300]:
index = 1
sample = data[index].unsqueeze(0)

latent = model.embed(sample)
print(latent)

tensor([[0.7591, 0.0073, 0.3315]], grad_fn=<AddmmBackward0>)


# Real Cardio Latent

In [303]:
df = load_diabetes_data('Data/cardio_train.csv', sep=";")
actual_data = df[0]
actual_labels = df[2]

latents = []
outcomes = []

df

(tensor([[-1.7321, -0.4361,  1.3641,  ..., -0.3109, -0.2384,  0.4942],
         [-1.7320,  0.3077, -0.7331,  ..., -0.3109, -0.2384,  0.4942],
         [-1.7320, -0.2480, -0.7331,  ..., -0.3109, -0.2384, -2.0236],
         ...,
         [ 1.7339, -0.1633,  1.3641,  ..., -0.3109,  4.1949, -2.0236],
         [ 1.7339,  1.2006, -0.7331,  ..., -0.3109, -0.2384, -2.0236],
         [ 1.7340,  0.4341, -0.7331,  ..., -0.3109, -0.2384,  0.4942]]),
 StandardScaler(),
 array([0, 1, 1, ..., 1, 1, 0]))

In [309]:
with torch.no_grad():
    for idx, e in enumerate(actual_data):
        sample = e.unsqueeze(0)  # Add batch dimension
        latent = model.embed(sample)  # Get the latent representation
        latents.append(latent.squeeze().cpu().numpy())

tensor([[-1.7321, -0.4361,  1.3641,  0.4435, -0.8479, -0.1222, -0.0882, -0.5393,
         -0.3957, -0.3109, -0.2384,  0.4942]])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x12 and 8x50)

In [280]:
latents_df = pd.DataFrame(latents)
outcomes_df = pd.DataFrame(actual_labels)
outcome_original = pd.DataFrame(df[2])
data_with_outcomes = pd.concat([latents_df, outcomes_df, outcome_original], axis=1)

data_with_outcomes.to_csv('cardio_latent_representations_with_outcomes.csv', index=False)