# 0- Introduction
This notebook runs a Supervised MLP, a self supervised VIME based on MLP and a semi-supervised VIME on MNIST dataset

In [1]:
import torch
from utils import mask_generator, pretext_generator
from data_loader import load_mnist_data
import torch.optim as optim  # For all Optimization algorithms, SGD, Adam, etc.
import torchvision.transforms as transforms  # Transformations we can perform on our dataset
import torchvision
import os
import pandas as pd
from torch.utils.data import (
    Dataset,
    DataLoader,
)

In [2]:
torch.manual_seed(0)

<torch._C.Generator at 0x212eef56050>

In [4]:
from data_sets import TabLabDataset, TabSemiUnlabDataset, TabUnlabDataset, ConcatDataset

In [5]:
from vim_self_pytorch import VimPretext
from mlp import MLP


We use only a subset of 1000  examples as labeled and the rest as unlabeled

In [6]:
# load the data as numpy arrays first
label_data_rate = 0.1  # size of unlabeled = 90% * 60000 = 54000
p_m = 0.3 # binomial distribution parameter
x_train, y_train, x_unlab, x_test, y_test = load_mnist_data(label_data_rate)
label_no = 1000     
# Use subset of labeled data
x_train = x_train[:label_no, :]
y_train = y_train[:label_no, :]  

# 1- Self supervised VIME (working)

Let's corrupt the unlabeled data

In [7]:
# corrupt x_unlab
m_unlab = mask_generator(p_m, x_unlab)
m_label, x_tilde = pretext_generator(m_unlab, x_unlab)

In [8]:
x_train.shape, x_unlab.shape, x_tilde.shape

((1000, 784), (54000, 784), (54000, 784))

In [9]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
learning_rate = 3e-4
batch_size = 128
num_epochs = 10
# Load Data
dataset = TabUnlabDataset(x_unlab, x_tilde, m_label)

In [10]:
#train_set, test_set = torch.utils.data.random_split(dataset, [int(0.7*x_unlab.shape[0]), x_unlab.shape[0] - int(0.7*x_unlab.shape[0])])
train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, drop_last = True)
#test_loader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=True, drop_last = True)

Let's train the self supervised model

In [11]:
model = VimPretext()
model.to(device)
model.train()
loss_function = torch.nn.MSELoss()
# Loss and optimizer
criterion =torch.nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
# Train Network
for epoch in range(num_epochs):
    losses = []
    for batch_idx,(x, x_tilde, mask) in enumerate(train_loader):
        # Get data to cuda if possible
        x = x.squeeze().float().to(device=device)
        x_tilde = x_tilde.squeeze().float().to(device=device)
        mask = mask.squeeze().float().to(device=device)
        # forward
        mask_lab, feature, encoder = model(x_tilde.float())
        loss_rec = loss_function(x.float(), feature)
        loss_mask = criterion(mask_lab,mask.float())
        loss = 2 * loss_rec + loss_mask
        losses.append(loss.item())
        # backward
        optimizer.zero_grad()
        loss.backward()
        # gradient descent or adam step
        optimizer.step()

    print(f"Cost at epoch {epoch} is {sum(losses)/len(losses)}")

Cost at epoch 0 is 0.3419393597617568
Cost at epoch 1 is 0.2715295735843957
Cost at epoch 2 is 0.2576055600354337
Cost at epoch 3 is 0.24999438661577583
Cost at epoch 4 is 0.24558537253433055
Cost at epoch 5 is 0.24316236560814738
Cost at epoch 6 is 0.2417228785022704
Cost at epoch 7 is 0.240718988369876
Cost at epoch 8 is 0.23993116921597024
Cost at epoch 9 is 0.2392337374619237


In [12]:
torch.save(model.state_dict(), "self_vim.pth")

In [13]:
model.eval()

VimPretext(
  (fc0): Sequential(
    (0): Linear(in_features=784, out_features=784, bias=True)
    (1): ReLU()
  )
  (fc1): Sequential(
    (0): Linear(in_features=784, out_features=784, bias=True)
    (1): Sigmoid()
  )
  (fc2): Sequential(
    (0): Linear(in_features=784, out_features=784, bias=True)
    (1): Sigmoid()
  )
)

# 2 - Semi-Supervised VIME (not working correctly)

We will train on two datasets jointly : the labeled and the unlabeled that's why we will use ConcatDataset class

The semi supervised training doesn't work as expected. So I still need to debug this part. I'm suspecting the problem comes from the dataloader

In [18]:
# Load Data
train_data = TabLabDataset(x_train, y_train)
test_data = TabLabDataset(x_test, y_test)
unlab_data = TabSemiUnlabDataset(x_unlab)

In [61]:
class ConcatDataset(torch.utils.data.Dataset):
    def __init__(self, *datasets):
        self.datasets = datasets

    def __getitem__(self, i):
        return tuple(d[i %len(d)] for d in self.datasets)

    def __len__(self):
        return max(len(d) for d in self.datasets)

In [62]:
train_loader = torch.utils.data.DataLoader(
             ConcatDataset(
                 train_data,
                 unlab_data
             ),
             batch_size=128, shuffle=True, drop_last = True)

In [63]:
test_loader = torch.utils.data.DataLoader(test_data,
             batch_size=128, shuffle=False, drop_last = True)

Inspect the elements of the train_loader : 

In [64]:
for i, t in enumerate(train_loader):
    print(t[0][0].squeeze().shape)
    print(t[0][1].squeeze().shape)
    print(t[1].squeeze().shape)
    break

torch.Size([128, 784])
torch.Size([128, 10])
torch.Size([128, 784])


Let's train the network in a semi supervised fashion

In [78]:
predictor = MLP(28* 28, 10)

In [79]:
predictor = predictor.to(device)
predictor.train()
loss_function = torch.nn.MSELoss()
# Loss and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(predictor.parameters(), lr=learning_rate, weight_decay=1e-5)

# Train Network
for epoch in range(num_epochs):
    losses = []

    for batch_idx,(t) in enumerate(train_loader):
        # Get data to cuda if possible
        x_train = t[0][0].squeeze().float().to(device=device)
        y_train = t[0][1].squeeze().float().to(device=device)
        x_unlab_tr = t[1].squeeze().float().to(device=device)

        # encode labeled data
        x_encoded = model(torch.tensor(x_train).float())[-1]
        y_hat = predictor(x_encoded)
        # supervised loss
        loss_sup = criterion(y_hat,y_train)
        loss_unsup = 0
        for i in range(5):
            # corrupt x_unlab
            m_unlab = mask_generator(p_m, x_unlab_tr)
            m_label, x_tilde = pretext_generator(m_unlab, x_unlab_tr)
            x_tilde_encoded = model(torch.tensor(x_tilde).float())[-1]
            y_unlab_hat = predictor(x_tilde_encoded)


            x_un_encoded = model(torch.tensor(x_unlab_tr).float())[-1]
            y_unlab = predictor(x_un_encoded)
            loss_unsup += loss_function(y_unlab,y_unlab_hat)
        # unsupervised loss
        loss_unsup_mean = loss_unsup/5
        loss = loss_sup + 2* loss_unsup_mean
        losses.append(loss.item())

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient descent or adam step
        optimizer.step()

    print(f"Cost at epoch {epoch} is {sum(losses)/len(losses)}")

  x_encoded = model(torch.tensor(x_train).float())[-1]
  x_tilde_encoded = model(torch.tensor(x_tilde).float())[-1]
  x_un_encoded = model(torch.tensor(x_unlab_tr).float())[-1]


Cost at epoch 0 is 0.009099787882044734
Cost at epoch 1 is 0.009009868660919598
Cost at epoch 2 is 0.008987508652258372
Cost at epoch 3 is 0.008969726242266614
Cost at epoch 4 is 0.008936467197792263
Cost at epoch 5 is 0.008905681883230487
Cost at epoch 6 is 0.008882651286920855
Cost at epoch 7 is 0.008825121690782402


# 3 - Supervised MLP for benchmark (working)

We need to train a supervised MLP on the labeled data : 

In [26]:
supervised = MLP(28* 28, 10)

In [27]:
supervised = supervised.to(device)
supervised.train()
# Loss and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(supervised.parameters(), lr=learning_rate, weight_decay=1e-5)

# Train Network
for epoch in range(num_epochs):
    losses = []

    for batch_idx,(t) in enumerate(train_loader):
        # Get data to cuda if possible
        x_train = t[0][0].squeeze().float().to(device=device)
        y_train = t[0][1].squeeze().float().to(device=device)

        y_hat = supervised(x_train)
        loss = criterion(y_hat,y_train)

        losses.append(loss.item())

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient descent or adam step
        optimizer.step()

    print(f"Cost at epoch {epoch} is {sum(losses)/len(losses)}")



Cost at epoch 0 is 0.008954781772834914
Cost at epoch 1 is 0.008669431321322918
Cost at epoch 2 is 0.008353438095322676
Cost at epoch 3 is 0.008019167863364731
Cost at epoch 4 is 0.007654338942042419
Cost at epoch 5 is 0.007310390538935151
Cost at epoch 6 is 0.006951497601611274
Cost at epoch 7 is 0.006588164783482041
Cost at epoch 8 is 0.006263115177197116
Cost at epoch 9 is 0.005957594806594508


We will train a supervised MLP using the encoding given by the self supervised training approach :

In [28]:
self_supervised = MLP(28* 28, 10)
self_supervised = self_supervised.to(device)
self_supervised.train()
# Loss and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(self_supervised.parameters(), lr=learning_rate, weight_decay=1e-5)

# Train Network
for epoch in range(num_epochs):
    losses = []

    for batch_idx,(t) in enumerate(train_loader):
        # Get data to cuda if possible
        x_train = t[0][0].squeeze().float().to(device=device)
        y_train = t[0][1].squeeze().float().to(device=device)

        # encode x_train
        
        x_encoded = model(torch.tensor(x_train).float())[-1]
        y_hat = self_supervised(x_encoded)
        loss = criterion(y_hat,y_train)

        losses.append(loss.item())

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient descent or adam step
        optimizer.step()

    print(f"Cost at epoch {epoch} is {sum(losses)/len(losses)}")



  x_encoded = model(torch.tensor(x_train).float())[-1]


Cost at epoch 0 is 0.008675807820899146
Cost at epoch 1 is 0.007869995731328214
Cost at epoch 2 is 0.0071040018062506404
Cost at epoch 3 is 0.006337571223931653
Cost at epoch 4 is 0.005657581612467766
Cost at epoch 5 is 0.0050347282418182916
Cost at epoch 6 is 0.004467035510710308
Cost at epoch 7 is 0.004025617381557822
Cost at epoch 8 is 0.003681020312277334
Cost at epoch 9 is 0.0033261486927845646


# Final comparison

Now Let's compare the performances of the three following models on the same test set : <br>
    - Supervised MLP <br>
    - Self supervised only MLP with VIME approach <br>
    - Semi-supervised MLP with VIME approach

In [48]:
# Check accuracy on training & test to see how good our model
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.squeeze().float().to(device=device)
            y = y.squeeze().float().to(device=device)
            

            scores = model(x.float())
            _, predictions = scores.max(1)
            num_correct += (predictions == y.argmax(-1)).sum()
            num_samples += predictions.size(0)

    model.train()
    return num_correct / num_samples

In [49]:
# Check accuracy on training & test to see how good our model
def check_accuracy_self(loader, model, encoder):
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.squeeze().float().to(device=device)
            y = y.squeeze().float().to(device=device)
            x_encoded = encoder(torch.tensor(x).float())[-1]
            scores = model(x_encoded.float())
            _, predictions = scores.max(1)
            num_correct += (predictions == y.argmax(-1)).sum()
            num_samples += predictions.size(0)

    model.train()
    return num_correct / num_samples

In [71]:
# Check accuracy on training & test to see how good our model
def check_accuracy_semi(loader, model, encoder):
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.squeeze().float().to(device=device)
            y = y.squeeze().float().to(device=device)
            x_encoded = encoder(torch.tensor(x).float())[-1]
            scores = model(x_encoded.float())
            _, predictions = scores.max(1)
            num_correct += (predictions == y.argmax(-1)).sum()
            num_samples += predictions.size(0)

    model.train()
    return num_correct / num_samples

In [None]:
print(f"Accuracy on test set for the semi supervised approach: {check_accuracy_semi(test_loader, predictor,model)*100:.2f}")
print(f"Accuracy on test set for the supervised only approach: {check_accuracy(test_loader, supervised)*100:.2f}")
print(f"Accuracy on test set for the self supervised approach: {check_accuracy_self(test_loader, self_supervised, model)*100:.2f}")


  x_encoded = encoder(torch.tensor(x).float())[-1]


Accuracy on test set for the semi supervised approach: 59.41
Accuracy on test set for the supervised only approach: 76.76
Accuracy on test set for the self supervised approach: 83.36
