In [1]:
import math
import numpy as np
import pandas as pd
import torch
from torch import nn, Tensor
from torch.nn.functional import softplus
from torch.distributions import Distribution

## Gaussian Distribution

In [2]:
# From Deep Learning course week 7
class ReparameterizedDiagonalGaussian(Distribution):
    """
    A distribution `N(y | mu, sigma I)` compatible with the reparameterization trick given `epsilon ~ N(0, 1)`.
    """

    def __init__(self, mu: Tensor, log_sigma: Tensor):
        assert (
            mu.shape == log_sigma.shape
        ), f"Tensors `mu` : {mu.shape} and ` log_sigma` : {log_sigma.shape} must be of the same shape"
        self.mu = mu
        self.sigma = log_sigma.exp()

    def sample_epsilon(self) -> Tensor:
        """`\eps ~ N(0, I)`"""
        return torch.empty_like(self.mu).normal_()

    def sample(self) -> Tensor:
        """sample `z ~ N(z | mu, sigma)` (without gradients)"""
        with torch.no_grad():
            return self.rsample()

    def rsample(self) -> Tensor:
        """sample `z ~ N(z | mu, sigma)` (with the reparameterization trick) """
        return self.mu + self.sigma * self.sample_epsilon()

    def log_prob(self, z: Tensor) -> Tensor:
        """return the log probability: log `p(z)`"""
        return torch.distributions.normal.Normal(self.mu, self.sigma).log_prob(z)

## Dataset: Titanic

In [3]:
train = pd.read_csv('../data/titanic/train.csv')
test = pd.read_csv('../data/titanic/test.csv')

In [10]:
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision.transforms import ToTensor
from functools import reduce

batch_size = 64
eval_batch_size = 100
# The loaders perform the actual work
train_loader = DataLoader(train, batch_size=batch_size,
                          sampler=stratified_sampler(train.train_labels))
test_loader  = DataLoader(test, batch_size=eval_batch_size, 
                          sampler=stratified_sampler(test.test_labels))

NameError: name 'stratified_sampler' is not defined

## Building the model

In [5]:
class VariationalAutoencoder(nn.Module):
    """A Variational Autoencoder with
    * a Gaussian observation model `p(x|z)`
    * a Gaussian prior `p(z) = N(z | 0, I)`
    * a Gaussian posterior `q(z|x) = N(z | \mu(x), \sigma(x))`
    """
    def __init__(self, input_dimension, latent_features, output_dimension = None):

        super().__init__()

        self.input_dimension = input_dimension
        self.latent_features = latent_features
        self.observation_features = np.prod(input_dimension)

        if output_dimension is None:
            self.output_dimension = input_dimension
        else:
            self.output_dimension = output_dimension
        
        # Encode the observation `x` into the parameters of the posterior distribution
        self.encoder = nn.Sequential(
            nn.Linear(in_features=self.observation_features, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64), 
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=32), 
            nn.ReLU(),
            # A Gaussian is fully characterised by its mean \mu and variance \sigma**2
            nn.Linear(in_features=32, out_features=2*latent_features) # <- note the 2*latent_features
        )

        # Decode the latent sample `z` into the parameters of the observation model
        self.decoder = nn.Sequential(
            nn.Linear(in_features=latent_features, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=512), 
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=self.observation_features)
        )

        # Define the parameters of the prior, chosen as p(z) = N(0, I)
        self.register_buffer('prior_params', torch.zeros(torch.Size([1, 2*latent_features])))

        def posterior(self, x: Tensor) -> Distribution:
            """return the distribution `q(z|x) = N(z | \mu(x), \sigma(x))`"""
            # Compute the parameters of the posterior
            h_x = self.encoder(x)
            mu, log_sigma = h_x.chunk(2, dim=-1)

            # Return a distribution `q(z|x) = N(z | \mu(x), \sigma(x))`
            return ReparameterizedDiagonalGaussian(mu, log_sigma)
        
        def prior(self, batch_size: int=1) -> Distribution:
            """return the distribution `p(z)`"""
            prior_params = self.prior_params.expand(batch_size, *self.prior_params.shape[-1:])
            mu, log_sigma = prior_params.chunk(2, dim=-1)

            # return the distribution `p(z)`
            return ReparameterizedDiagonalGaussian(mu, log_sigma)
        
        def observation_model(self, z: Tensor) -> Distribution:
            """return the distribution `p(x|z)`"""
            h_z = self.decoder(z, batch_sizes)
            mu, log_sigma = h_z.data.chunk(2, dim=-1)

            return ReparameterizedDiagonalGaussian(mu, log_sigma)

        def forward(self, x):
            # define the posterior q(z|x) / encode x into q(z|x)
            qz = self.posterior(x)
        
            # define the prior p(z)
            pz = self.prior(batch_size=x.size(0))
        
            # sample the posterior using the reparameterization trick: z ~ q(z | x)
            z = qz.rsample()
        
            # define the observation model p(x|z) = B(x | g(z))
            px = self.observation_model(z)
        
            return {'px': px, 'pz': pz, 'qz': qz, 'z': z}

latent_features = 2
vae = VariationalAutoencoder(train.shape, latent_features)
print(vae)

VariationalAutoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=10692, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Linear(in_features=32, out_features=4, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=2, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=10692, bias=True)
  )
)


## Training and evaluation

In [7]:
from collections import defaultdict

latent_features = 2
vae = VariationalAutoencoder(train.shape, latent_features)

# The Adam optimizer works really well with VAEs.
optimizer = torch.optim.Adam(vae.parameters(), lr=1e-3)

# define dictionary to store the training curves
training_data = defaultdict(list)
validation_data = defaultdict(list)

epoch = 0