# Variational Autoencoder in PyTorch

I thought it would be fun to try a variational autoencoder approach and see what the latent space looks like. This is a demo using 15 randomly selected landmark classes, drawn from the classes that have at least 50 training images.

https://github.com/hsinyilin19/ResNetVAE/blob/master/ResNetVAE_reconstruction.ipynb

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from PIL import Image
import torch
from torch import optim
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
import torch.utils.data as data
import torchvision
from torch.utils.data import Dataset, DataLoader, TensorDataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import os
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm
from sklearn.manifold import TSNE
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training labels and the sample submission template.
train = pd.read_csv(
    '../input/landmark-recognition-2020/train.csv',
)
submission = pd.read_csv(
    '../input/landmark-recognition-2020/sample_submission.csv',
)

## How many landmarks are there?
This could influence our choice for the dimensionality of the latent space.

There is considerable class imbalance: the rarest landmarks have fewer than 10 images, while the most common have thousands.

In [None]:
# Number of distinct landmark ids in the training labels.
len(pd.unique(train['landmark_id']))

In [None]:
# Distribution of class sizes: one value per landmark id, equal to the number
# of training images carrying that id. Plotting the raw landmark_id column
# would instead show how the id *values* are spread, which is not what the
# title describes.
images_per_landmark = train.landmark_id.value_counts()
plt.hist(images_per_landmark, bins=100)
plt.xscale('log')
plt.xlabel('Images per landmark')
plt.ylabel('Number of landmarks')
plt.title('Histogram of number of images per landmark id')

## Dataset
The dataset is relatively simple for this challenge (with large parts borrowed from https://www.kaggle.com/rhtsingh/pytorch-training-inference-efficientnet-baseline). Each time it is called for an item we read the relevant file, apply some transformations, and resize to the relevant size for the base model (here a resnet50).

In [None]:
class RetrievalDataset(Dataset):
    """Landmark image dataset that reads JPEGs from disk on demand.

    Params
    -------
        ids: positional row indices into `df` (used with `df.iloc`).
        df: frame with an `id` column; a `landmark_id` column is required
            when `train` is True.
        train (bool): when True, images are read from the train directory,
            random augmentations are applied and labels are returned.
            When False, images are read from the test directory and only
            resize/normalise transforms are applied.

    Each item is a dict with key 'image' (the transformed tensor) and, in
    train mode, 'label' (the landmark id).
    """

    def __init__(self, ids, df, train=True):
        self.train = train
        if self.train:
            self.root = '../input/landmark-recognition-2020/train/'
        else:
            self.root = '../input/landmark-recognition-2020/test/'
        self.df = df.iloc[ids]
        self.image_ids = self.df.id.values
        # Unlabelled (test) frames have no landmark_id column; reading it
        # unconditionally would raise before any image is loaded. In train
        # mode the column is still required and a missing one fails here.
        has_labels = 'landmark_id' in self.df.columns
        self.landmark_ids = self.df.landmark_id.values if (self.train or has_labels) else None

        # ImageNet channel statistics, shared by both pipelines.
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        if self.train:
            # Increase image size from (64,64) to higher resolution,
            # Make sure to change in RandomResizedCrop as well.
            # NOTE(review): newer torchvision releases rename `resample` to
            # `interpolation` -- confirm against the installed version.
            transforms_list = [
                transforms.Resize((224, 224)),
                transforms.RandomHorizontalFlip(),
                transforms.RandomChoice([
                    transforms.RandomResizedCrop(224),
                    transforms.ColorJitter(0.2, 0.2, 0.2, 0.2),
                    transforms.RandomAffine(degrees=15, translate=(0.2, 0.2),
                                            scale=(0.8, 1.2), shear=15,
                                            resample=Image.BILINEAR)
                ]),
                transforms.ToTensor(),
                normalize,
            ]
        else:
            transforms_list = [
                # Keep this resize same as train
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                normalize,
            ]
        self.transforms = transforms.Compose(transforms_list)

    def __len__(self):
        return len(self.df)

    def _image_path(self, image_id):
        """Return the file path for `image_id`.

        Images are sharded into nested folders named after the first three
        characters of the id: <root>/<c0>/<c1>/<c2>/<id>.jpg
        """
        return f'{self.root}{image_id[0]}/{image_id[1]}/{image_id[2]}/{image_id}.jpg'

    def __getitem__(self, idx):
        image_id = self.image_ids[idx]
        # The context manager closes the file handle once the pixels are read.
        # convert('RGB') guarantees three channels, which the 3-channel
        # Normalize above requires (grayscale/CMYK JPEGs would otherwise fail).
        with Image.open(self._image_path(image_id)) as img:
            img = self.transforms(img.convert('RGB'))
        if self.train:
            return {'image': img, 'label': self.landmark_ids[idx]}
        return {'image': img}

## The Metric
This is an implementation of the competition metric, Global Average Precision (GAP). The version below is simplified to a single prediction per sample (GAP@1).

In [None]:
def GAP(predicts: torch.Tensor, confs: torch.Tensor, targets: torch.Tensor):
    ''' Simplified GAP@1 metric: only one prediction per sample is supported

    Params
    -------
        predicts: 1-D tensor of predicted labels, one per sample.
        confs: 1-D tensor of confidences for those predictions.
        targets: 1-D tensor of ground-truth labels.

    Return
    --------
        float: sum over ranks (by descending confidence) of
        precision-at-rank for the correct predictions, divided by the number
        of samples whose target is non-zero. Returns 0.0 when that count is
        zero (including empty input) instead of dividing by zero.

    NOTE(review): targets equal to 0 are excluded from the denominator,
    presumably because 0 encodes "no landmark" -- confirm with the label
    encoding used by the caller.
    '''
    assert len(predicts.shape) == 1
    assert len(confs.shape) == 1
    assert len(targets.shape) == 1
    assert predicts.shape == confs.shape and confs.shape == targets.shape

    # Rank every sample by confidence, most confident first.
    _, order = torch.sort(confs, descending=True)
    ranked_predicts = predicts[order].cpu().numpy()
    ranked_targets = targets[order].cpu().numpy()

    num_relevant = ranked_targets.shape[0] - int((ranked_targets == 0).sum())
    if num_relevant <= 0:
        return 0.0

    hits = ranked_predicts == ranked_targets
    # precision@i = (# correct among the top i) / i, counted only at hits.
    precision_at_rank = np.cumsum(hits) / np.arange(1, hits.shape[0] + 1)
    return float((precision_at_rank * hits).sum() / num_relevant)

## The Loss function
The variational autoencoder loss function has two parts. The reconstruction term ensures that the model produces faithful reconstructions; the code below uses binary cross-entropy, with a mean squared error alternative left commented out. The Kullback-Leibler divergence term pulls the latent distribution towards a standard multivariate Gaussian.

In [None]:
def vae_loss_function(recon_x, x, mu, logvar):
    """Return the summed VAE loss: reconstruction term plus KL divergence.

    Params
    -------
        recon_x: decoder output, compared element-wise against `x`.
        x: reconstruction target, same shape as `recon_x`.
        mu: latent means produced by the encoder.
        logvar: latent log-variances produced by the encoder.

    Both terms use a sum reduction, so the result scales with batch size.

    NOTE(review): binary cross-entropy expects targets in [0, 1] -- confirm
    that `x` is scaled accordingly by the data pipeline.
    """
    # Alternative reconstruction term: F.mse_loss(recon_x, x, reduction='sum')
    reconstruction = F.binary_cross_entropy(recon_x, x, reduction='sum')
    # Closed-form KL(N(mu, sigma^2) || N(0, I)), summed over all elements.
    kl_elementwise = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_divergence = -0.5 * torch.sum(kl_elementwise)
    return reconstruction + kl_divergence

## Functions for training and validating the model for an epoch

In [None]:
def train_model(model,train_loader,optimizer=None):
    """Train `model` for one pass over `train_loader`.

    Params
    -------
        model: VAE whose forward pass returns (reconstruction, z, mu, logvar).
        train_loader: iterable of batches; each batch is a dict with an
            'image' tensor.
        optimizer: optional optimizer to step. Pass the same instance on
            every call to keep its state across epochs. When None, a new
            Adam optimizer with default settings is created for this call,
            which starts Adam's moment estimates from scratch.

    Return
    --------
        (model, epoch_loss): the trained model and the sum of the per-batch
        losses as a Python float.
    """
    if optimizer is None:
        optimizer = optim.Adam(model.parameters())
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(train_loader):
        # Move the batch to the device once and reuse it as the target.
        images = batch['image'].to(device)
        optimizer.zero_grad()
        X_reconst, z, mu, logvar = model(images)  # VAE
        loss = vae_loss_function(X_reconst, images, mu, logvar)
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        epoch_loss += batch_loss
        print("Batch loss ",i,": ",batch_loss)
    return model,epoch_loss

In [None]:
def validate_model(model,val_loader):
    """Evaluate `model` on `val_loader` and collect its latent outputs.

    Params
    -------
        model: VAE whose forward pass returns (reconstruction, z, mu, logvar).
        val_loader: iterable of batches; each batch is a dict with 'image'
            and 'label' tensors.

    Return
    --------
        (epoch_loss, all_y, all_z, all_mu, all_logvar): the summed loss as a
        Python float, followed by lists with one NumPy entry per sample for
        the labels, latent codes, latent means and latent log-variances.
    """
    all_y, all_z, all_mu, all_logvar = [], [], [], []
    model.eval()
    epoch_loss = 0
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for batch in val_loader:
            images = batch['image'].to(device)
            X_reconst, z, mu, logvar = model(images)  # VAE
            loss = vae_loss_function(X_reconst, images, mu, logvar)
            epoch_loss += loss.item()
            print(loss)
            all_y.extend(batch['label'].cpu().numpy())
            all_z.extend(z.cpu().numpy())
            all_mu.extend(mu.cpu().numpy())
            all_logvar.extend(logvar.cpu().numpy())
    return epoch_loss, all_y, all_z, all_mu, all_logvar

## Function for prediction

In [None]:
def inference(data_loader, model):
    """Run `model` over `data_loader` and collect its top-k class predictions.

    Params
    -------
        data_loader: loader whose dataset exposes a boolean `train`
            attribute; batches are dicts with 'image' and, for labelled
            datasets, 'label'.
        model: classifier returning one score per class for each image.

    Return
    --------
        (predicts, confs, targets): concatenated top-k class indices, their
        softmax confidences, and the ground-truth labels (None when the
        dataset is unlabelled).

    NOTE(review): `tqdm`, `IN_KERNEL` and `NUM_TOP_PREDICTS` are not defined
    in the visible part of this file and must be provided before calling.
    NOTE(review): `ResNet_VAE.forward` in this file returns a tuple rather
    than class scores, so that model cannot be passed here directly.
    """
    model.eval()
    activation = nn.Softmax(dim=1)
    all_predicts, all_confs, all_targets = [], [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, disable=IN_KERNEL):
            image = batch['image']
            # RetrievalDataset stores labels under 'label' and only returns
            # them in train mode.
            target = batch['label'] if data_loader.dataset.train else None

            output = model(image.to(device))
            output = activation(output)

            confs, predicts = torch.topk(output, NUM_TOP_PREDICTS)
            all_confs.append(confs)
            all_predicts.append(predicts)

            if target is not None:
                all_targets.append(target)

    predicts = torch.cat(all_predicts)
    confs = torch.cat(all_confs)
    targets = torch.cat(all_targets) if len(all_targets) else None

    return predicts, confs, targets

## Function for generating submissions

In [None]:
def generate_submission(test_loader, model, label_encoder):
    """Predict landmarks for the test set and write `submission.csv`.

    Params
    -------
        test_loader: loader over the test dataset; its dataset must expose
            the underlying frame as `.df` with an `id` column.
        model: model passed through to `inference`.
        label_encoder: object with an `inverse_transform` method mapping
            predicted class indices back to landmark ids.

    The predictions are merged into the sample submission by `id`, so rows
    without a prediction keep their sample value. The result is written to
    'submission.csv' in the working directory; nothing is returned.
    """
    sample_sub = pd.read_csv('../input/landmark-recognition-2020/sample_submission.csv')

    predicts_gpu, confs_gpu, _ = inference(test_loader, model)
    predicts, confs = predicts_gpu.cpu().numpy(), confs_gpu.cpu().numpy()

    labels = [label_encoder.inverse_transform(pred) for pred in predicts]
    print('labels')
    print(np.array(labels))
    print('confs')
    print(np.array(confs))

    # Work on a copy: the dataset's frame is a slice of the caller's frame
    # and must not gain a column as a side effect of writing a submission.
    sub = test_loader.dataset.df.copy()

    def concat(label: np.ndarray, conf: np.ndarray) -> str:
        # One "<landmark> <confidence>" pair per prediction, space separated.
        return ' '.join([f'{L} {c}' for L, c in zip(label, conf)])
    sub['landmarks'] = [concat(label, conf) for label, conf in zip(labels, confs)]

    sample_sub = sample_sub.set_index('id')
    sub = sub.set_index('id')
    sample_sub.update(sub)

    sample_sub.to_csv('submission.csv')

## The Model

### Since this competition is offline, we need to load pre-trained models using kaggle datasets.

In [None]:
def get_pretrained_model(model_name):
    """Retrieve a pre-trained model from torchvision

    The architecture is built without downloading anything and its weights
    are loaded from a local Kaggle dataset file. All parameters are frozen.

    Params
    -------
        model_name (str): name of the model (currently only accepts vgg16 and resnet50)

    Return
    --------
        model (PyTorch model): cnn

    Raises
    --------
        ValueError: if `model_name` is not one of the supported names.

    """
    weight_paths = {
        'vgg16': '../input/vgg16/vgg16.pth',
        'resnet50': '../input/resnet50/resnet50.pth',
    }
    # Fail with a clear message instead of falling through to an unbound
    # `model` variable at the return statement.
    if model_name not in weight_paths:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of {sorted(weight_paths)}"
        )

    constructors = {'vgg16': models.vgg16, 'resnet50': models.resnet50}
    # Called without arguments the constructors build randomly initialised
    # networks, which avoids the deprecated `pretrained=` keyword.
    model = constructors[model_name]()
    model.load_state_dict(torch.load(weight_paths[model_name]))

    # Freeze every layer; only layers added on top will be trained.
    for param in model.parameters():
        param.requires_grad = False

    return model

### The VAE
The variational autoencoder is similar to the autoencoder. It has an encoder (here the resnet50) and a decoder (another CNN). However, the variational autoencoder encodes the latent variables as the means and variances of a Gaussian distribution rather than as deterministic variables. When we decode during training, we take a sample from this multivariate distribution as the input to the decoder; in evaluation mode the mean is used directly. This sampling takes place in the reparameterize step.

In [None]:
class ResNet_VAE(nn.Module):
    """Variational autoencoder with a frozen pre-trained CNN as encoder trunk.

    Params
    -------
        fc_hidden1 (int): width of the first fully connected encoder layer.
        fc_hidden2 (int): width of the second fully connected encoder layer
            and of the first decoder layer.
        drop_p (float): dropout probability. It is stored on the instance,
            but dropout is currently not applied anywhere in the network.
        CNN_embed_dim (int): dimensionality of the latent space.
        model_name (str): backbone name passed to `get_pretrained_model`.
            The backbone must expose an `fc` layer (as resnet50 does),
            because its `in_features` sizes the first encoder layer.

    `forward` returns (x_reconst, z, mu, logvar); the reconstruction is
    resized to 224x224 with values in [0, 1] from the final sigmoid.
    """

    def __init__(self, fc_hidden1=1024, fc_hidden2=768, drop_p=0.3, CNN_embed_dim=256,model_name='resnet50'):
        super(ResNet_VAE, self).__init__()

        self.fc_hidden1, self.fc_hidden2, self.CNN_embed_dim = fc_hidden1, fc_hidden2, CNN_embed_dim
        # Kept so the configured value is not silently discarded.
        self.drop_p = drop_p

        # CNN architectures
        self.ch1, self.ch2, self.ch3, self.ch4 = 16, 32, 64, 128
        self.k1, self.k2, self.k3, self.k4 = (5, 5), (3, 3), (3, 3), (3, 3)      # 2d kernel size
        self.s1, self.s2, self.s3, self.s4 = (2, 2), (2, 2), (2, 2), (2, 2)      # 2d strides
        self.pd1, self.pd2, self.pd3, self.pd4 = (0, 0), (0, 0), (0, 0), (0, 0)  # 2d padding

        # encoding components
        resnet = get_pretrained_model(model_name)
        modules = list(resnet.children())[:-1]      # delete the last fc layer.
        self.resnet_modules = modules
        self.resnet = nn.Sequential(*modules)
        self.fc1 = nn.Linear(resnet.fc.in_features, self.fc_hidden1)
        self.bn1 = nn.BatchNorm1d(self.fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(self.fc_hidden1, self.fc_hidden2)
        self.bn2 = nn.BatchNorm1d(self.fc_hidden2, momentum=0.01)
        # Latent vectors mu and sigma
        self.fc3_mu = nn.Linear(self.fc_hidden2, self.CNN_embed_dim)      # output = CNN embedding latent variables
        self.fc3_logvar = nn.Linear(self.fc_hidden2, self.CNN_embed_dim)  # output = CNN embedding latent variables

        # Sampling vector
        self.fc4 = nn.Linear(self.CNN_embed_dim, self.fc_hidden2)
        self.fc_bn4 = nn.BatchNorm1d(self.fc_hidden2)
        self.fc5 = nn.Linear(self.fc_hidden2, 64 * 4 * 4)
        self.fc_bn5 = nn.BatchNorm1d(64 * 4 * 4)
        self.relu = nn.ReLU(inplace=True)

        # Decoder
        self.convTrans6 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=64, out_channels=32, kernel_size=self.k4, stride=self.s4,
                               padding=self.pd4),
            nn.BatchNorm2d(32, momentum=0.01),
            nn.ReLU(inplace=True),
        )
        self.convTrans7 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=32, out_channels=8, kernel_size=self.k3, stride=self.s3,
                               padding=self.pd3),
            nn.BatchNorm2d(8, momentum=0.01),
            nn.ReLU(inplace=True),
        )

        self.convTrans8 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=8, out_channels=3, kernel_size=self.k2, stride=self.s2,
                               padding=self.pd2),
            nn.BatchNorm2d(3, momentum=0.01),
            nn.Sigmoid()    # y = (y1, y2, y3) \in [0 ,1]^3
        )

    def encode(self, x):
        """Map a batch of images to the latent mean and log-variance."""
        x = self.resnet(x)  # ResNet
        x = x.view(x.size(0), -1)  # flatten output of conv

        # FC layers
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.relu(self.bn2(self.fc2(x)))
        # Dropout is intentionally not applied here; see `drop_p` above.
        mu, logvar = self.fc3_mu(x), self.fc3_logvar(x)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Sample z = mu + eps * std in training mode; return mu otherwise."""
        if not self.training:
            return mu
        std = torch.exp(0.5 * logvar)
        # Noise with the same shape, dtype and device as std; gradients
        # flow through mu and std only.
        eps = torch.randn_like(std)
        return eps * std + mu

    def decode(self, z):
        """Decode latent codes into 3-channel 224x224 reconstructions."""
        x = self.relu(self.fc_bn4(self.fc4(z)))
        x = self.relu(self.fc_bn5(self.fc5(x))).view(-1, 64, 4, 4)
        x = self.convTrans6(x)
        x = self.convTrans7(x)
        x = self.convTrans8(x)
        # align_corners=False is the default; passing it explicitly avoids
        # the warning PyTorch emits when it is left unspecified.
        x = F.interpolate(x, size=(224, 224), mode='bilinear', align_corners=False)
        return x

    def forward(self, x):
        """Return (x_reconst, z, mu, logvar) for a batch of images."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        x_reconst = self.decode(z)
        return x_reconst, z, mu, logvar

In [None]:
# Encoder architecture: widths of the two fully connected layers that sit
# between the CNN backbone and the latent layer.
CNN_fc_hidden1 = 1024
CNN_fc_hidden2 = 1024
# Size of the latent vector produced by the encoder.
CNN_embed_dim = 256
# Side length, in pixels, of the images fed to the ResNet.
res_size = 224
# Dropout probability handed to the model constructor.
dropout_p = 0.2


# Training parameters.
epochs = 100            # number of training epochs
batch_size = 50         # samples per batch
learning_rate = 0.001   # optimizer step size
log_interval = 10       # interval for displaying training info

In [None]:
# Build the VAE from the hyperparameters above, then move it to the
# selected device.
model = ResNet_VAE(
    fc_hidden1=CNN_fc_hidden1,
    fc_hidden2=CNN_fc_hidden2,
    drop_p=dropout_p,
    CNN_embed_dim=CNN_embed_dim,
)
model = model.to(device)

In [None]:
# Adam over every parameter of the model, using the configured learning rate.
# NOTE: the training loop in this file calls train_model(model, train_loader)
# without passing this optimizer.
model_params = [*model.parameters()]
optimizer = optim.Adam(params=model_params, lr=learning_rate)

In [None]:
# Keep only the landmarks that have enough training images.
MIN_SAMPLES_PER_CLASS = 50
counts = train['landmark_id'].value_counts()
frequent_enough = counts >= MIN_SAMPLES_PER_CLASS
selected_classes = counts[frequent_enough].index
num_classes = len(selected_classes)
print('classes with at least N samples:', num_classes)
keep_rows = train['landmark_id'].isin(selected_classes)
train = train.loc[keep_rows]
print(train.shape)


### For this demo select a small subset of the classes to train on

In [None]:
# Demo mode: train on a random subset of the remaining classes.
demo=True
num_classes_demo=15
if demo:
    available_classes = train.landmark_id.unique()
    # Sampling without replacement fails when more classes are requested
    # than exist, so never ask for more than are available.
    n_demo_classes = min(num_classes_demo, len(available_classes))
    random_class_subset=np.random.choice(available_classes,n_demo_classes,replace=False)
    train = train.loc[train.landmark_id.isin(random_class_subset)]
else:
    # Later cells read random_class_subset unconditionally; outside demo
    # mode it covers every class that is still in the frame.
    random_class_subset = train.landmark_id.unique()

In [None]:
def plot_examples(train,random_class_subset,n_examples):
    """Show a grid of example images: one row per class, one column per example.

    Params
    -------
        train: frame with `id` and `landmark_id` columns.
        random_class_subset: sequence of landmark ids to display.
        n_examples (int): number of images to show per landmark. Cells stay
            blank when a landmark has fewer images than this.
    """
    root='../input/landmark-recognition-2020/train/'
    n_classes = len(random_class_subset)
    if n_classes == 0 or n_examples <= 0:
        return  # nothing to draw; plt.subplots rejects an empty grid

    # squeeze=False keeps axs two-dimensional even for a single row/column.
    fig, axs = plt.subplots(n_classes, n_examples, figsize=(50,100), squeeze=False)
    for row, landmark in enumerate(random_class_subset):
        class_ids = train.loc[train.landmark_id == landmark].id.values[:n_examples]
        for col in range(n_examples):
            ax = axs[row, col]
            # Hide the ticks of this particular cell; plt.axis would only
            # affect whichever axes happens to be current.
            ax.axis('off')
            if col >= len(class_ids):
                continue
            image_id = class_ids[col]
            image_path = root+image_id[0]+'/'+image_id[1]+'/'+image_id[2]+'/'+image_id+'.jpg'
            with Image.open(image_path) as img:
                ax.imshow(img)
            ax.set_title('Landmark '+ str(landmark)+': Example '+ str(col))
    plt.tight_layout()
    plt.show()

plot_examples(train,random_class_subset,3)

In [None]:
batch_size = 32

# Shuffle the row positions, then split them 90/10 into train/validation.
ids = np.arange(len(train))
np.random.shuffle(ids)
ids = ids.copy()

split_at = int(round(0.9 * len(ids), 0))
train_ids, val_ids = np.split(ids, [split_at])

# NOTE: both datasets use the default train=True, so the validation set is
# read from the train directory and goes through the training transforms.
train_dataset = RetrievalDataset(train_ids, train)
val_dataset = RetrievalDataset(val_ids, train)

loader_kwargs = dict(batch_size=batch_size, shuffle=True, num_workers=5)
train_loader = DataLoader(train_dataset, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)

## Benchmarking Loader

In [None]:
import time
batch_size=32
workers=4
k=30
# Use a dedicated loader for the benchmark so that train_loader (shuffled)
# is left untouched for the training cell below.
bench_loader=DataLoader(train_dataset, batch_size=batch_size,
                              shuffle=False, num_workers=workers,pin_memory=True)
t0=time.time()
batches_timed=0
# Fetch at most k batches. `_` avoids rebinding the imported `data` module.
for batches_timed, _ in enumerate(bench_loader, start=1):
    if batches_timed==k:
        break
elapsed=time.time()-t0

if batches_timed:
    # Normalise by the number of batches actually fetched. The last batch
    # may be smaller than batch_size, so this is a slight underestimate.
    print("Time per image: " , elapsed/(batches_timed*batch_size), " seconds")
    print("Time per epoch: " , elapsed*len(bench_loader)/(batches_timed*60), " minutes")
else:
    print("Loader produced no batches; nothing to time")

In [None]:
epochs=50
import time

# Per-epoch loss history, so the training loss is not overwritten by the
# validation loss and progress can be inspected afterwards.
train_losses, val_losses = [], []

t0=time.time()
for epoch in range(epochs):
    model,train_loss=train_model(model,train_loader)
    # The outputs of the last epoch's validation pass (all_y, all_z, ...)
    # are what the plotting cells below read.
    epoch_loss, all_y, all_z, all_mu, all_logvar = validate_model(model,val_loader)
    train_losses.append(train_loss)
    val_losses.append(epoch_loss)
    print("Epoch ",epoch,": train loss ",train_loss,", validation loss ",epoch_loss)

print(time.time()-t0)

In [None]:
# Landmark ids iterated over by the latent-space plots below; each id gets
# its own colour and legend entry.
classes = random_class_subset

In [None]:
# Labels and latent codes collected by the last validate_model call.
y_train = np.array(all_y)
z_train = np.array(all_z)

fig = plt.figure(figsize=(12, 10))
plots = []
for i, c in enumerate(classes):
    # Boolean mask selecting the samples of class c. The former `or`
    # fallback was dead code that referenced an undefined name `N`.
    ind = y_train == c
    color = cm.jet([i / len(classes)] * int(ind.sum()))
    # Plot the latent coordinates at index 1 and 2 directly, without any
    # dimensionality reduction.
    plots.append(plt.scatter(z_train[ind, 1], z_train[ind, 2], c=color, s=8, label=i))

plt.axis('off')
plt.legend(plots, classes, fontsize=14, loc='upper right')
plt.title('direct projection:  2-dim')
#plt.savefig("./ResNetVAE_{}_direct_plot.png".format(exp), bbox_inches='tight', dpi=600)
plt.show()

In [None]:
# Embed the latent codes into two dimensions with t-SNE.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version.
tsne_2d = TSNE(n_components=2, n_iter=12000)
z_embed = tsne_2d.fit_transform(z_train)

fig = plt.figure(figsize=(12, 10))
plots = []
n_classes = len(classes)
for i, c in enumerate(classes):
    # One scatter call per class so that each class gets a legend handle.
    ind = (y_train == c).tolist()
    n_points = sum(ind)
    color = cm.jet([i / n_classes] * n_points)
    handle = plt.scatter(z_embed[ind, 0], z_embed[ind, 1], c=color, s=8, label=i)
    plots.append(handle)

plt.axis('off')
plt.legend(plots, classes, fontsize=14, loc='upper right')
plt.title('t-SNE: 2-dim')
#plt.savefig("./ResNetVAE_{}_embedded_plot.png".format(exp), bbox_inches='tight', dpi=600)
plt.show()

In [None]:
# Embed the latent codes into three dimensions with t-SNE.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version.
z_embed3D = TSNE(n_components=3, n_iter=12000).fit_transform(z_train)

fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

plots = []
for i, c in enumerate(classes):
    ind = (y_train == c).tolist()
    color = cm.jet([i / len(classes)] * sum(ind))
    # Plot each category one at a time, passing all three embedded
    # coordinates, and keep the handle so the legend has entries.
    plots.append(ax.scatter(z_embed3D[ind, 0], z_embed3D[ind, 1], z_embed3D[ind, 2],
                            c=color, s=8, label=i))

ax.axis('on')

# Symmetric limits that cover the whole embedding on every axis.
r_max = float(np.abs(z_embed3D).max())
r_min = -r_max

ax.set_xlim(r_min, r_max)
ax.set_ylim(r_min, r_max)
ax.set_zlim(r_min, r_max)
ax.set_xlabel('z-dim 1')
ax.set_ylabel('z-dim 2')
ax.set_zlabel('z-dim 3')
ax.set_title('t-SNE: 3-dim')
ax.legend(plots, classes, fontsize=14, loc='upper right')
#plt.savefig("./ResNetVAE_{}_embedded_3Dplot.png".format(exp), bbox_inches='tight', dpi=600)
plt.show()