# 
<center><h1> Homework 3: ConvNet
<center> Apoorv Sharma </center>
    <center> DATA 598 (Winter 2022), University of Washington </center>

# 1. The Effect of BatchNorm on a ConvNet

In this exercise, we will combine both the topics we covered in class this week. The goal of this exercise is to
visualize the effective smoothness of a covolutional neural network with and without batch normalization.

Let $\phi(.; \omega) : \mathbb{R}^{28x28} \rightarrow \mathbb{R}^{10}$ denote a convolution neural network with parameters $\omega$ which takes in an image of size $28 × 28$ and returns a score for 10 output classes (All the MLPs and ConvNets we have considered so far fit this input-output description of $\phi$, upto a reshaping of the images). Consider the
objective function:

$$
f(\omega) = \frac{1}{n} \sum_{i=1}^{n} l(y_i, \phi(x_i,\omega))
$$

Concretely, your task is as follows:
* Use the FashionMNIST dataset. Perform the same preprocessing as in previous homeworks.
* Code up a ConvNet module with two convolutional layers with the following structure (the input has 1 channel, so we write the image as 1 × 28 × 28):

In [179]:
import torch
import numpy as np

from torchvision.datasets import FashionMNIST
from torch.nn.functional import cross_entropy, relu

import pickle
import os

import matplotlib.pyplot as plt 
%matplotlib inline

In [180]:
path = './models'
if not os.path.exists(path):
  os.makedirs(path)

#### CovNet1 Specification

* k denotes the kernel/filter size and “#filters” denotes the number of filters
* In PyTorch, the convolutions and pooling operations on images are called “Conv2d” and “Max-Pool2d” respectively.
* For the first conv layer, the specification asks you to use a kernel size of 5, and a padding of 2. The number of input channels is the same as the number of channels from the preceding layer (here, it is 1 since the preceding layer is just the image with 1 channel). Finally, the number of output channels is the same as the number of filters (here, 16). The second conv layer is constructed in a similarly; the number of input channels is the same as the number of outputs channels of the first conv layer (since ReLU and MaxPool do not change the number of channels). When not specified, we take the stride to be 1.
* The last “Linear” layer takes in the output of the second MaxPool and flattens it down to a vector of a certain size S. You are to figure out this size by running a dummy input through these layers and analyzing the output size, as we have done in the lab. The linear layer then maps this S-dimensional input to a 10-dimensional output, one for each class.

In [181]:
class CovNetNoBatchNorm(torch.nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.conv_ensemble_1 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, padding=2),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, stride=1))
        self.conv_ensemble_2 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, padding=2),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, stride=1))
        
        # cov_ensemble_2 has shape: torch.Size([1, 32, 26, 26])
        self.fully_connected_layer = torch.nn.Linear(26*26*32, num_classes)
        
    def forward(self, x):
        x = x.view(-1, 1, 28, 28)  # reshape input; convolutions need a channel
        out = self.conv_ensemble_1(x)  # first convolution + relu + pooling
        out = self.conv_ensemble_2(out) # second convolution + relu + pooling
        out = out.view(out.shape[0], -1)  # flatten output
        out = self.fully_connected_layer(out)  # output layer
        return out

In [182]:
image_size = 28
random_image = torch.randn(1, 1, image_size, image_size)

In [183]:
class CovNetBatchNorm(torch.nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.conv_ensemble_1 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, padding=2),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, stride=1),
            torch.nn.BatchNorm2d(16))
        self.conv_ensemble_2 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, padding=2),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, stride=1),
            torch.nn.BatchNorm2d(32))
        
        # cov_ensemble_2 has shape: torch.Size([1, 32, 26, 26])
        self.fully_connected_layer = torch.nn.Linear(26*26*32, num_classes)
        
    def forward(self, x):
        x = x.view(-1, 1, 28, 28)  # reshape input; convolutions need a channel
        out = self.conv_ensemble_1(x)  # first convolution + relu + pooling
        out = self.conv_ensemble_2(out) # second convolution + relu + pooling
        out = out.view(out.shape[0], -1)  # flatten output
        out = self.fully_connected_layer(out)  # output layer
        return out

Test the models and ensure that they work

In [184]:
m1 = CovNetNoBatchNorm(num_classes=10)

In [185]:
m2 = CovNetBatchNorm(num_classes=10)

In [186]:
output = m1(random_image)
print(output)

tensor([[ 0.0470, -0.0932,  0.0651,  0.3884,  0.3967, -0.1737, -0.2109, -0.2891,
          0.0209, -0.1509]], grad_fn=<AddmmBackward0>)


In [187]:
output = m2(random_image)
print(output)

tensor([[ 0.5358, -0.4206, -0.7428, -0.3545,  0.7004, -0.1301, -1.1664, -0.8595,
          0.3647,  0.7489]], grad_fn=<AddmmBackward0>)


In [188]:
# download dataset (~117M in size)
train_dataset = FashionMNIST('./data', train=True, download=True)
X_train = train_dataset.data # torch tensor of type uint8
y_train = train_dataset.targets # torch tensor of type Long
test_dataset = FashionMNIST('./data', train=False, download=True)
X_test = test_dataset.data
y_test = test_dataset.targets

# choose a subsample of 10% of the data:
idxs_train = torch.from_numpy(
    np.random.choice(X_train.shape[0], replace=False, size=X_train.shape[0]//10))
X_train, y_train = X_train[idxs_train], y_train[idxs_train]
# idxs_test = torch.from_numpy(
#     np.random.choice(X_test.shape[0], replace=False, size=X_test.shape[0]//10))
# X_test, y_test = X_test[idxs_test], y_test[idxs_test]

print(f'X_train.shape = {X_train.shape}')
print(f'n_train: {X_train.shape[0]}, n_test: {X_test.shape[0]}')
print(f'Image size: {X_train.shape[1:]}')

# Normalize dataset: pixel values lie between 0 and 255
# Normalize them so the pixelwise mean is zero and standard deviation is 1

X_train = X_train.float()  # convert to float32
X_train = X_train.view(-1, 784)  # flatten into a (n, d) shape
mean, std = X_train.mean(axis=0), X_train.std(axis=0)
X_train = (X_train - mean[None, :]) / (std[None, :] + 1e-6)  # avoid divide by zero

X_test = X_test.float()
X_test = X_test.view(-1, 784)
X_test = (X_test - mean[None, :]) / (std[None, :] + 1e-6)

n_class = np.unique(y_train).shape[0]

X_train.shape = torch.Size([6000, 28, 28])
n_train: 6000, n_test: 10000
Image size: torch.Size([28, 28])


In [189]:
def compute_objective(model, X, y):
    """ Compute the multinomial logistic loss. 
        model is a module
        X of shape (n, d) and y of shape (n,)
    """
    # send 
    score = model(X)
    # PyTorch's function cross_entropy computes the multinomial logistic loss
    return cross_entropy(input=score, target=y, reduction='mean') 

@torch.no_grad()
def compute_accuracy(model, X, y):
    """ Compute the classification accuracy
        ws is a list of tensors of consistent shapes 
        X of shape (n, d) and y of shape (n,)
    """
    is_train = model.training  # if True, model is in training mode
    model.eval()  # use eval mode for accuracy
    score = model(X)
    predictions = torch.argmax(score, axis=1)  # class with highest score is predicted
    if is_train:  # switch back to train mode if appropriate
        model.train()
    return (predictions == y).sum() * 1.0 / y.shape[0]

@torch.no_grad()
def compute_logs(model, verbose=False):
    is_train = model.training  # if True, model is in training mode
    model.eval()  # switch to eval mode
    train_loss = compute_objective(model, X_train, y_train)
    test_loss = compute_objective(model, X_test, y_test)
    train_accuracy = compute_accuracy(model, X_train, y_train)
    test_accuracy = compute_accuracy(model, X_test, y_test)
    if verbose:
        print(('Train Loss = {:.3f}, Train Accuracy = {:.3f}, ' + 
               'Test Loss = {:.3f}, Test Accuracy = {:.3f}').format(
                train_loss.item(), train_accuracy.item(), 
                test_loss.item(), test_accuracy.item())
    )
    if is_train:  # switch back to train mode if appropriate
        model.train()
    return (train_loss, train_accuracy, test_loss, test_accuracy)

In [190]:
def minibatch_sgd_one_pass(model, X, y, learning_rate, batch_size, verbose=False):
    model.train()
    num_examples = X.shape[0]
    average_loss = 0.0
    num_updates = int(round(num_examples / batch_size))
    L_hats = []
    for i in range(num_updates):
        idxs = np.random.choice(X.shape[0], size=(batch_size,)) # draw `batch_size` many samples
        model.train()  # make sure we are in train mode
        # compute the objective. 
        objective = compute_objective(model, X[idxs], y[idxs]) 
        average_loss = 0.99 * average_loss + 0.01 * objective.item()
        if verbose and (i+1) % 100 == 0:
            print(average_loss)
        
        # compute the gradient using automatic differentiation
        gradients = torch.autograd.grad(outputs=objective, inputs=model.parameters())
        
        # Get the current objective
        model.eval()
        w_old = model.parameters()
        delta_f_w = torch.autograd.grad(outputs=compute_objective(model, X, y) , inputs=w_old)
        
        # perform SGD update. IMPORTANT: Make the update inplace!
        with torch.no_grad():
            for (w, g) in zip(model.parameters(), gradients):
                w -= learning_rate * g
        
        u = model.parameters() - w_old
        delta_f_uw = torch.autograd.grad(outputs=compute_objective(model, X, y) , inputs=model.parameters())
        
        L_hat = 0
        for old, new in zip(delta_f_w, delta_f_uw):
            L_hat += torch.linalg.norm((new - old), ord=2) / torch.linalg.norm(u, ord=2)
        
        L_hats.append(L_hat)
        
    return model, L_hats

In [191]:
batch_logs = []
learning_rate = 0.01
num_passes = 10
batch_size = 32

m1 = CovNetNoBatchNorm(num_classes=10)
m2 = CovNetBatchNorm(num_classes=10)

for i, model in enumerate([m1, m2]):
    logs = []
    logs.append( list(compute_logs(model, verbose=True)) + [0] )
    
    for _ in range(num_passes):
        model, L_hat = minibatch_sgd_one_pass(model, X_train, y_train, learning_rate, 
                                       batch_size=batch_size, verbose=True)
        
        log = list(compute_logs(model, True)) + [L_hat]
        logs.append(log)
    
    # done training this mode - append logs
    batch_logs.append(logs)
    
    # save the model parms
    torch.save(model.state_dict(), f'./models/{type(model).__name__}.pt')

with open('./models/logs.pkl', 'wb') as f:
    pickle.dump(batch_logs, f)

Train Loss = 2.304, Train Accuracy = 0.114, Test Loss = 2.306, Test Accuracy = 0.119


TypeError: unsupported operand type(s) for -: 'generator' and 'generator'

In [None]:
# idxs = np.random.choice(X_train.shape[0], size=(batch_size,)) # draw `batch_size` many samples
# objective = compute_objective(m1, X_train[idxs], y_train[idxs])
# print(objective)