# Practical assignment on Sparse Variational Dropout
Idea of the assignment and code: Arsenii Ashukha;
Author of the tutorial: Nadezhda Chirkova

- [Link to a tutorial](https://github.com/nadiinchi/BayesianSparsificationTutorial/blob/master/BayesianSparsificationTutorial.pdf)
- [Cheating link](https://github.com/nadiinchi/BayesianSparsificationTutorial/blob/master/Bayesian_sparsification_tutorial_solution.ipynb)
- Variational Dropout Sparsifies Deep Neural Networks https://arxiv.org/abs/1701.05369


In this task, we will train a SparseVD model for a simple fully-connected network on the MNIST dataset. We will start with a short reminder about training neural networks in PyTorch and then implement the necessary functions for SparseVD and incorporate them into model training.

## Training fully-connected network in pytorch

In [1]:
# Logger
# if you have problems with this import
# check that you are working with python3
# and downloaded logger.py file to the folder with this notebook
from logger import Logger

In [None]:
# run if you don't have pytorch or torchvision
!pip install torch
!pip install torchvision

In [2]:
import torch
import numpy as np

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from logger import Logger
from torch.nn import Parameter
from torchvision import datasets, transforms

Load a dataset:

In [3]:
batch_size = 100

# Preprocessing pipeline: convert each image to a tensor, then normalise it
# with the given mean/std (presumably the MNIST pixel statistics -- verify).
tranform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Train and test splits, downloaded into '../data' when not already there.
mnist_train = datasets.MNIST('../data', train=True, download=True,
                             transform=tranform)
mnist_test = datasets.MNIST('../data', train=False, download=True,
                            transform=tranform)

# Batch generators over both splits (both are shuffled).
train_loader = torch.utils.data.DataLoader(
    mnist_train, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    mnist_test, batch_size=batch_size, shuffle=True)

Define a network:

In [4]:
class Net(nn.Module):
    """Fully-connected classifier with one hidden layer: 784 -> 300 -> 10."""

    def __init__(self,):
        super(Net, self).__init__()
        # Layers are registered in the order they are applied.
        self.fc1 = nn.Linear(28 * 28, 300)
        self.fc2 = nn.Linear(300, 10)

    def forward(self, x):
        """Return log-probabilities over the 10 outputs for each row of ``x``."""
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

In [5]:
# define device: use the GPU when CUDA is available, otherwise the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
# make an instance of the Net and move its parameters to the selected device
model = Net().to(device)

Define loss:

In [7]:
# NOTE(review): Net.forward already returns log_softmax outputs, and
# F.cross_entropy applies log_softmax to its input again. Re-normalising
# log-probabilities should leave them unchanged, so the loss value is
# presumably unaffected -- confirm before swapping in F.nll_loss.
lossfun = F.cross_entropy

Define optimizer:

In [8]:
# Adam over all parameters of the model, with learning rate 1e-3
optimizer = optim.Adam(model.parameters(), lr=1e-3)

Create logger instance:

In [9]:
# Per-column format specs passed to the Logger. The key 'tr_los' (sic) is
# the same name train() logs the training loss under, so keep the spelling.
# NOTE(review): how Logger uses `fmt` is defined in logger.py, not shown here.
fmt = {'tr_los': '3.1e', 'te_loss': '3.1e'}
logger = Logger('fc_net', fmt=fmt)

Define the training procedure:

In [10]:
def train(train_loader, test_loader, model, lossfun, logger, device,
          optimizer, scheduler=None, epochs = 100, sparse=False):
    """Train ``model`` and log train/test metrics after every epoch.

    Args:
        train_loader: iterable of ``(data, target)`` batches that also exposes
            a sized ``dataset`` attribute (used to normalise the metrics).
        test_loader: same interface as ``train_loader``, used for evaluation.
        model: module to train; every batch is flattened to
            ``(batch, features)`` before being passed to it.
        lossfun: callable ``lossfun(output, target)`` returning a scalar
            tensor. When ``sparse`` is true its ``kl_weight`` attribute is
            overwritten at the start of every epoch.
        logger: object with ``add_scalar(epoch, name, value)`` and
            ``iter_info()`` methods.
        device: device the batches are moved to.
        optimizer: optimizer that updates ``model``.
        scheduler: optional LR scheduler, stepped once at the start of every
            epoch; the resulting learning rate is logged as ``'lr'``.
        epochs: number of epochs to run.
        sparse: enable KL-weight annealing and per-layer sparsity logging
            (``'sp_<i>'``) for children that define ``kl_reg``; this also
            reads ``model.threshold``.

    Logged per epoch: ``'tr_los'``, ``'tr_acc'``, ``'te_loss'``, ``'te_acc'``.
    Losses are the sum of per-batch losses divided by the dataset size, and
    accuracies are percentages.
    """
    # KL annealing state, only needed for sparsification
    if sparse:
        kl_weight = 0.02

    for epoch in range(1, epochs + 1):
        if sparse:
            # increase the KL weight linearly until it reaches 1
            kl_weight = min(kl_weight + 0.02, 1)
            logger.add_scalar(epoch, 'kl', kl_weight)
            lossfun.kl_weight = kl_weight
        if scheduler is not None:
            # Stepping stays at the start of the epoch so the learning-rate
            # schedule is the same as before.
            scheduler.step()
            # get_lr() is not meant to be called outside step() in recent
            # PyTorch and can misreport the rate at milestones; prefer
            # get_last_lr() and fall back to get_lr() on old versions.
            read_lr = getattr(scheduler, 'get_last_lr', None)
            if read_lr is None:
                read_lr = scheduler.get_lr
            logger.add_scalar(epoch, 'lr', read_lr()[0])

        ### iterate over training batches, perform forward and backward pass and count metrics
        model.train()
        train_loss, train_acc = 0.0, 0
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            # flatten every sample to a vector (28*28 for MNIST)
            data = data.view(data.size(0), -1)
            optimizer.zero_grad()

            output = model(data)
            pred = output.detach().max(1)[1]
            loss = lossfun(output, target)
            loss.backward()
            optimizer.step()

            # Accumulate a Python float: summing the loss tensors themselves
            # keeps every batch attached to the autograd graph and makes the
            # logged value a tensor.
            train_loss += float(loss)
            train_acc += int((pred == target).sum())

        logger.add_scalar(epoch, 'tr_los', train_loss / len(train_loader.dataset))
        logger.add_scalar(epoch, 'tr_acc', train_acc / len(train_loader.dataset) * 100)

        ### iterate over testing batches, perform forward pass and count metrics
        model.eval()
        test_loss, test_acc = 0.0, 0
        # no gradients are needed for evaluation
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                data = data.view(data.size(0), -1)
                output = model(data)
                test_loss += float(lossfun(output, target))
                pred = output.max(1)[1]
                test_acc += int((pred == target).sum())

        logger.add_scalar(epoch, 'te_loss', test_loss / len(test_loader.dataset))
        logger.add_scalar(epoch, 'te_acc', test_acc / len(test_loader.dataset) * 100)

        if sparse:
            # fraction of weights whose log_alpha exceeds the threshold,
            # per child that defines kl_reg
            for i, c in enumerate(model.children()):
                if hasattr(c, 'kl_reg'):
                    above = (c.log_alpha.detach() > model.threshold).float().mean()
                    logger.add_scalar(epoch, 'sp_%s' % i, float(above))

        logger.iter_info()

In [11]:
# Train the plain fully-connected network for 20 epochs, without a
# learning-rate scheduler and without the sparsification extras.
train(train_loader, test_loader, model, lossfun, logger, device,
          optimizer, scheduler=None, epochs = 20, sparse=False)

  epoch    tr_los    tr_acc    te_loss    te_acc
-------  --------  --------  ---------  --------
      1   2.4e-03      92.7    1.2e-03      96.6
      2   1.0e-03      97.0    9.3e-04      97.0
      3   6.6e-04      98.0    8.2e-04      97.5
      4   4.8e-04      98.5    7.4e-04      97.8
      5   3.7e-04      98.8    6.8e-04      97.8
      6   2.7e-04      99.2    8.2e-04      97.6
      7   2.3e-04      99.3    8.9e-04      97.6
      8   1.9e-04      99.4    8.0e-04      97.8
      9   1.6e-04      99.5    7.5e-04      98.2
     10   1.5e-04      99.5    9.4e-04      97.7
     11   1.3e-04      99.6    7.7e-04      98.0
     12   1.3e-04      99.5    8.6e-04      97.8
     13   1.2e-04      99.6    8.8e-04      98.1
     14   8.2e-05      99.7    1.0e-03      97.8
     15   9.9e-05      99.6    1.1e-03      97.8
     16   9.8e-05      99.6    1.1e-03      97.8
     17   6.8e-05      99.8    1.1e-03      98.0
     18   1.1e-04      99.6    1.0e-03      98.0
     19   8.6e-05   

## Pytorch cheatsheet

* Main entity in pytorch is a tensor. It can be stored either on cpu or gpu
* Use x.cpu() and x.cuda() (or x.to(device)) to switch between devices
* Type conversion works in the same way, for example x.float()
* Get shape: x.shape, change shape: x.reshape(...)
* To create a new empty tensor on the same device as x and with the same shape use .new: y = x.new(x.shape)
* Matrix multiplication: A.matmul(B) or A @ B (A.dot(B) only works for 1-D tensors)
* Element-wise operations: +, -, *, /, torch.sqrt, torch.abs, torch.exp, torch.log, < , >
* Fill tensor x with some values: x.fill\_(5), x.zero\_(), x.normal\_() (there is no x.ones\_(); use x.fill\_(1))
* Clip tensor x so that values are between A and B: torch.clamp(x, A, B)
* If you want some tensor to be trained, wrap it in nn.Parameter: p = nn.Parameter(x). You can access tensor using p.data

## Sparse Variational Dropout

![image with formulas 1](https://github.com/nadiinchi/BayesianSparsificationTutorial/raw/master/images/slide1.png)
![image with formulas 2](https://github.com/nadiinchi/BayesianSparsificationTutorial/raw/master/images/slide2.png)



## Implementation of SparseVD

To implement SparseVD, we need to modify two items:
* fully-connected layer
* loss

You need to implement LinearSVDO layer that is inserted into the Net instead of nn.Linear:

In [None]:
class NetSVDO(nn.Module):
    """Same 784 -> 300 -> 10 architecture as Net, built from LinearSVDO layers.

    ``threshold`` is passed to both layers and also kept on the network.
    """

    def __init__(self, threshold):
        super(NetSVDO, self).__init__()
        self.threshold = threshold
        # LinearSVDO takes the place of nn.Linear in both layers
        self.fc1 = LinearSVDO(28 * 28, 300, threshold)
        self.fc2 = LinearSVDO(300, 10, threshold)

    def forward(self, x):
        """Return log-probabilities over the 10 outputs for each row of ``x``."""
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

Implement the layer:

In [None]:
class LinearSVDO(nn.Module):
    """Linear layer skeleton for Sparse Variational Dropout (exercise).

    The lines assigned ``...`` are placeholders to be completed by the reader;
    the class does not run until they are filled in.

    Attributes set in ``__init__``:
        in_features, out_features: layer dimensions.
        threshold: stored on the instance (per the hints below, it is meant
            for pruning weights by log alpha at test time).
        W: Parameter of shape (out_features, in_features).
        log_sigma: placeholder; should become a Parameter storing log sigma.
        bias: Parameter of shape (1, out_features).

    The ``bias`` constructor argument is not read by the code shown here;
    a bias Parameter is always created.
    """
    def __init__(self, in_features, out_features, threshold, bias=True):
        ### hyperparameters
        super(LinearSVDO, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.threshold = threshold

        # allocated uninitialised; values are set in reset_parameters()
        self.W = Parameter(torch.Tensor(out_features, in_features))
        ###########################################################
        ########        Your code should be here         ##########
        # Create a Parameter to store log sigma
        self.log_sigma = ...
        ###########################################################
        self.bias = Parameter(torch.Tensor(1, out_features))
        
        self.reset_parameters()

    def reset_parameters(self):
        """Initialise bias to 0, W from N(0, 0.02) and log_sigma to -5."""
        self.bias.data.zero_()
        self.W.data.normal_(0, 0.02)
        self.log_sigma.data.fill_(-5)        
        
    def forward(self, x): 
        """Apply the layer to ``x``.

        In training mode the output is ``lrt_mean + lrt_std * eps``; otherwise
        log alpha is computed and stored on ``self.log_alpha`` and the output
        is ``F.linear(x, W) + self.bias`` with the pruned ``W``.
        """
        # On training stage, we do LRT, i. e. sample preactivation
        # On testing stage, we zero out weights and perform forward pass with mean weights
        ###########################################################
        ########        Your code should be here         ##########
        if self.training:
            lrt_mean = ... # Compute activation's mean. Do not forget about bias!
            lrt_std = ...  # Compute activation's std from its variance. Do not forget to add 1e-9 before sqrt!
            eps = ... # Sample random noise. Use <some existing tensor>.new function
            return lrt_mean + lrt_std * eps
        else:
            ########         If not training        ##########
            self.log_alpha = ... # Eval log alpha as a function(log_sigma, W)
            # Do not forget to add 1e-15 before log for numerical stability
            self.log_alpha = ... # Clip log alpha to be in [-10, 10] for numerical stability 
            W = ... # Prune out redundant weights 
            return F.linear(x, W) + self.bias
            ###########################################################
        
    def kl_reg(self):
        """Return the approximate KL divergence term for this layer.

        The constants k1, k2, k3 are moved to the module-level ``device``.
        """
        ###########################################################
        ########        Your code should be here         ##########
        ########  Eval Approximation of KL Divergence    ##########
        # use torch.log1p for numerical stability. You can also use torch.sigmoid
        log_alpha = ... # Eval log alpha as a function(log_sigma, W)
        log_alpha = ... # Clip log alpha to be in [-10, 10] for numerical stability 
        k1, k2, k3 = torch.Tensor([0.63576]).to(device), \
                     torch.Tensor([1.8732]).to(device), \
                     torch.Tensor([1.48695]).to(device)
        KL  = ...
        return KL 
        ########  Return a KL divergence, a Tensor 1x1   ##########
        ###########################################################    

In [None]:
# make an instance of the sparse network (threshold 3) and move it to the device
model = NetSVDO(threshold=3).to(device)

Implement ELBO (loss):

In [None]:
# Define a new Loss Function -- SGVLB 
class SGVLB(nn.Module):
    """Stochastic Gradient Variational Lower Bound loss skeleton (exercise).

    The line assigning ``SGVLB = ...`` is a placeholder to be completed by
    the reader.

    Attributes:
        train_size: size of the training set, stored from the constructor.
        net: network whose children are scanned for ``kl_reg`` methods.
        kl_weight: weight for the KL term, initialised to 1.0 (train()
            overwrites it every epoch when sparse=True).
    """
    def __init__(self, net, train_size):
        super(SGVLB, self).__init__()
        self.train_size = train_size
        self.net = net
        self.kl_weight = 1.0

    def forward(self, input, target):
        """Return the loss for predictions ``input`` and labels ``target``."""
        assert not target.requires_grad
        # sum the KL terms of every child of the network that defines kl_reg
        kl = 0
        for module in self.net.children():
            if hasattr(module, 'kl_reg'):
                kl = kl + module.kl_reg()
        ###########################################################
        ########        Your code should be here         ##########    
        # Compute Stochastic Gradient Variational Lower Bound
        # Hint: you will need F.cross_entropy
        # Do not forget to scale up Data term to N/M,
        # where N is a size of the dataset and M is a size of minibatch
        # Remember that F.cross_entropy returns loss averaged across data points
        SGVLB = ...
        return SGVLB # a Tensor 1x1 
        ###########################################################

In [None]:
# SGVLB loss for the sparse model; the second argument is the number of
# training examples.
sgvlbloss = SGVLB(model, len(train_loader.dataset)).to(device)

## Training SparseVD model

Define optimizer for a new model and create a new logger:

In [None]:
# Fresh Adam optimizer for the sparse model's parameters
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Step schedule with gamma 0.2 at milestones 50, 60, 70 and 80
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50,60,70,80], gamma=0.2)
# Format specs for the columns train() logs when sparse=True; 'sp_0' and
# 'sp_1' are the sparsity columns of the two layers.
fmt = {'tr_los': '3.1e', 'te_loss': '3.1e', 'sp_0': '.3f', 'sp_1': '.3f', 'lr': '3.1e', 'kl': '.2f'}
logger = Logger('sparse_vd', fmt=fmt)

Train SparseVD model:

In [None]:
# Train the sparse model for 100 epochs with the SGVLB loss, the LR
# scheduler, and KL-weight annealing / sparsity logging enabled.
train(train_loader, test_loader, model, sgvlbloss, logger, device,
    optimizer, scheduler=scheduler, epochs = 100, sparse=True)

Compute compression rate:

In [None]:
# Count all weights and the weights kept after pruning (log_alpha below the
# model threshold) over every layer of the model.
all_w, kep_w = 0, 0

for c in model.children():
    layer_log_alpha = c.log_alpha.data.cpu().numpy()
    kep_w += (layer_log_alpha < model.threshold).sum()
    all_w += layer_log_alpha.size

# all_w / kep_w is "total weights per kept weight", i.e. the compression
# ratio, not the fraction of kept weights -- the label now says so.
print('compression ratio (all / kept weights) =', all_w/kep_w)

# Good result should be like
#   epoch    kl       lr    tr_los    tr_acc    te_loss    te_acc    sp_0    sp_1
#  -------  ----  -------  --------  --------  ---------  --------  ------  ------
#      100     1  1.6e-06  -1.4e+03      98.0   -1.4e+03      98.3   0.969   0.760
# compression ratio (all / kept weights) = 30.109973454683352

## Visualization

Visualize part of the sparsified weight matrix:

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# mask zero weights
log_alpha = (model.fc1.log_alpha.detach().cpu().numpy() < 3).astype(np.float)
W = model.fc1.W.detach().cpu().numpy()

# visualize part of the matrix
plt.figure(figsize=(20, 10))
plt.imshow((log_alpha * W)[:, 200:400], cmap='RdBu_r', interpolation=None)
plt.colorbar()
plt.title("Part of the weight matrix of the 1st layer")

Visualize sparsified weights reshaped to the image shape:

In [None]:
# Absolute values of the masked first-layer weights, computed once.
abs_w = np.abs(log_alpha * W)

# 10 x 10 grid of 28 x 28 tiles, one tile per hidden unit.
z = np.zeros((28*10, 28*10))

for i in range(10):
    for j in range(10):
        # Tile (i, j) shows hidden unit i*10 + j, so the grid covers units
        # 0..99. (Incrementing a counter before indexing skipped unit 0.)
        z[i*28:(i+1)*28, j*28:(j+1)*28] = abs_w[i*10 + j].reshape(28, 28)

# draw grid lines between tiles and a border around the image
z[::28] = 2
z[:, ::28] = 2
z[-1] = 2
z[:, -1] = 2

plt.figure(figsize=(20, 10))
plt.imshow(z, cmap='hot_r')
plt.colorbar()
plt.axis('off')
plt.title("Weights reshaped to the image shape")

Visualize $\log |\mu|$ and $\log \sigma$ of the weights (do it yourself):

In [None]:
###########################################################
########        Your code should be here         ##########    
# Choose a submatrix of the weight matrix of the 1-st FC layer (about 100 x 100 elements)
# Scatter their log abs mu and log sigma (use plt.scatter)



###########################################################

## Compression with Sparse Matrices

Let's check how efficient weight compression is when using sparse matrix formats:

In [None]:
import scipy
import numpy as np
from scipy.sparse import csc_matrix, csr_matrix, coo_matrix, dok_matrix

# Weights and log_alpha of the first layer as NumPy arrays.
first_layer = list(model.children())[0]
M = first_layer.W.data.cpu().numpy()
LA = first_layer.log_alpha.data.cpu().numpy()

# Coordinates and values of the kept weights (log_alpha below the model
# threshold), in row-major order.
keep = LA < model.threshold
row, col = np.nonzero(keep)
data = M[keep]

# Same non-zero entries in three sparse formats. Mcsr is now really CSR;
# csr_matrix was never imported and csc_matrix was used for it instead.
Mcsr = csr_matrix((data, (row, col)), shape=M.shape)
Mcsc = csc_matrix((data, (row, col)), shape=M.shape)
Mcoo = coo_matrix((data, (row, col)), shape=M.shape)

In [None]:
# Dense baseline: the full first-layer weight matrix, compressed by NumPy.
np.savez_compressed('M_w', M)

# Each sparse variant goes to its own .npz file.
for file_name, sparse_mat in (('Mcsr_w', Mcsr),
                              ('Mcsc_w', Mcsc),
                              ('Mcoo_w', Mcoo)):
    scipy.sparse.save_npz(file_name, sparse_mat)

In [None]:
# List directory entries whose line contains "_w", with human-readable
# sizes, to compare the files saved above.
ls -lah | grep _w

### If you have time left
* Implement _structured_ Bayesian sparsification (slides 83-86)
* Compare SparseVD and ARD (slides 60-64) in terms of quality/compression ratio
* Play with hyperparameters like KL weight and threshold