# Computer Vision and Deep Learning - Laboratory 3

In this laboratory session, we'll be diving into deep convolutional neural networks.

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt


import torch
import torchvision
import torchvision.transforms as transforms

In [None]:
!wget https://www.math.hkust.edu.hk/~masyleung/Teaching/CAS/MATLAB/image/images/cameraman.jpg

# Warm-up


Let's start by implementing the basic blocks of a convolutional neural network: the convolution and (optionally) the pooling operations. This will be the last "low-level" implementation that you'll do for this class.

## Convolutions

The convolutional layer is the main building block of a convolutional neural network. These layers contain a set of learnable filters, which will learn which features are relevant for the classification problem based on the training data.
During the forward pass, each filter (which __must__ have the same depth as the input volume) is slid over the spatial dimensions of the input volume. At each position we compute an element-wise multiplication between the filter weights and the region of interest in the input volume that lies beneath the filter, sum the products, and add the filter's bias to obtain one value of the output activation map.

The hyperparameters of a convolutional layer are:
- the filter size F (usually this is an odd value);
- the padding amount which will be added to the input volume P;
- the stride S (or the step used when sliding across the input volume);
- the number of filters k; the depth of each filter must match the depth of the input volume;

Given an input volume of shape  ($H_i$, $W_i$, $D$), the convolutional layer will produce an output of shape ($H_o$, $W_o$, $k$), where:

\begin{equation}
W_o = \frac{W_i - F + 2P}{S} + 1
\end{equation}

\begin{equation}
H_o = \frac{H_i - F + 2P}{S} + 1
\end{equation}

<img src="https://lh6.googleusercontent.com/gZxwFH6mQ5tPjz6LzVbOaNeVuR1NC-BnuemIWO41qnn7r1PvP4qzwXRWC1OJgo2_PD08qaqJ2-VCF3q9laeK885IJwK-dHhpLDkvRZrx4vxrbLDTsKD2iZYM5SFRq4A6XTklk7_h"/>

In [None]:
def zero_pad(X, pad):
  """
  Surround every image of a batch with a border of zeros.

  :param X batch of images; a rank 4 array laid out as (batch_size, height, width, channels)
  :param pad number of zeros added before and after the data along the height and the width axes
  :return the padded batch; the batch and channel axes keep their original size
  """
  untouched = (0, 0)
  border = (pad, pad)
  # one (before, after) pair per axis: only the two spatial axes receive a border
  pad_width = (untouched, border, border, untouched)
  return np.pad(X, pad_width, mode="constant", constant_values=0)

# load the image as a single-channel (grayscale) array using OpenCV
img = cv2.imread('cameraman.jpg', cv2.IMREAD_GRAYSCALE)
img = np.asarray(img)

# TODO your code here
# pad and display the cameraman.jpg image
# (if you are using matplotlib to display the image, use cmap='gray' in the imshow function)
# zero_pad expects a rank 4 batch, so a batch axis and a channel axis are added first;
# [0] then takes the single padded sample back out of the batch for display
img_padded = zero_pad(img[np.newaxis, :, :, np.newaxis], 50)[0]
plt.imshow(img_padded, cmap='gray')

In [None]:
def convolution(X, W, bias, pad, stride):
  """
  Apply a bank of filters to a batch of images (forward pass of a convolutional layer).

  Every filter is slid over the zero-padded input. At each position the filter weights are
  multiplied element-wise with the region underneath, the products are summed and the
  bias of that filter is added.

  :param X - input of shape (num_samples, iH, iW, iC)
  :param W - weights, numpy array of shape (fs, fs, iC, k), where fs is the filter size,
    iC is the depth of the input volume and k is the number of filters applied on the image
  :param bias - one bias per filter, e.g. a numpy array of shape (1, 1, 1, k)
  :param pad - hyperparameter, the amount of padding to be applied
  :param stride - hyperparameter, the stride of the convolution
  :return activation map of shape (num_samples, oH, oW, k), with
    oH = (iH - fs + 2 * pad) // stride + 1 and oW = (iW - fs + 2 * pad) // stride + 1
  :raises ValueError if the filter depth differs from the input depth, if stride < 1,
    or if bias does not hold exactly k values
  """

  # 0. compute the size of the output activation map and initialize it with zeros
  num_samples, iH, iW, iC = X.shape
  f = W.shape[0]
  oC = W.shape[3]

  # without this check a depth of 1 on either side would silently broadcast
  if W.shape[2] != iC:
    raise ValueError(f"filter depth {W.shape[2]} does not match input depth {iC}")
  if stride < 1:
    raise ValueError("stride must be a positive integer")

  oW = (iW - f + 2 * pad) // stride + 1
  oH = (iH - f + 2 * pad) // stride + 1
  activation_map = np.zeros((num_samples, oH, oW, oC))

  # flatten to (k,) so the biases broadcast over the sample axis; raises if the size is not k
  bias_per_filter = np.reshape(bias, (oC,))

  # 1. pad the samples in the input
  X_padded = zero_pad(X, pad)

  # 2. loop over the spatial positions of the output; samples and filters are handled
  #    together by one tensor contraction per position
  for y in range(oH):
    # tl_y / br_y - vertical extent of the current region in the padded input
    tl_y = y * stride
    br_y = tl_y + f

    for x in range(oW):
      # tl_x / br_x - horizontal extent of the current region in the padded input
      tl_x = x * stride
      br_x = tl_x + f

      # region under the filter for every sample: (num_samples, f, f, iC)
      roi = X_padded[:, tl_y: br_y, tl_x: br_x, :]

      # contract the (f, f, iC) axes of roi with those of W: this is the element-wise
      # product followed by the sum, for each sample/filter pair -> (num_samples, k)
      responses = np.tensordot(roi, W, axes=3)

      activation_map[:, y, x, :] = responses + bias_per_filter
  return activation_map



# fixed seed so the printed numbers can be compared with the expected output
np.random.seed(10)
# 100 samples of shape (13, 21, 4)
X = np.random.randn(100, 13, 21, 4)

# 8 filters (last dimension), each of shape (3, 3, 4), and one bias per filter
W = np.random.randn(3, 3, 4, 8)
b = np.random.randn(1, 1, 1, 8)

am = convolution(X, W, b, pad=1, stride=2)
print("am's mean =\n", np.mean(am))
# NOTE(review): the printed label says am[1, 2, 3] but the value shown is am[3, 2, 1]
# (sample 3, row 2, column 1, all 8 filter responses) - confirm which of the two the
# expected output was generated from before changing either the label or the index
print("am[1, 2, 3] =\n", am[3,2,1])


Expected output:

am's mean =
 -0.42841306

am[1, 2, 3] =
 [ 1.780819  -6.5181394 -4.3581524 -2.9115834  1.8401672 -3.722643
 -8.327618  -3.227787 ]

Now let's analyse the effect of applying some well known filters used in image processing.

### Low pass filters
Low pass filters are used to keep the low frequency information within an image, while reducing the high frequency information. These filters are the basis of image smoothing.

Two well known low pass filters are the _mean filter_ and the _Gaussian filter_.

In [None]:
# read the test image as grayscale and append a trailing channel axis
image = cv2.imread('cameraman.jpg', cv2.IMREAD_GRAYSCALE)
image = np.expand_dims(image, axis=-1)

# X contains a single image sample: a leading batch axis is added in front of the image,
# giving the rank 4 layout that convolution() takes
X = np.expand_dims(image, axis=0)

plt.imshow(image, cmap='gray')

In [None]:
############################################################
# MEAN FILTER
############################################################

# every convolution in this cell applies a single filter with a zero bias
bias = np.asarray([0]).reshape((1, 1, 1, 1))

# box kernels: all weights are equal and add up to 1
mean_filter_3 = np.ones(shape=(3, 3, 1, 1), dtype=np.float32) / 9.0
mean_filter_9 = np.ones(shape=(9, 9, 1, 1), dtype=np.float32) / 81.0

mean_3x3 = convolution(X, mean_filter_3, bias, pad=0, stride=1)
mean_9x9 = convolution(X, mean_filter_9, bias, pad=0, stride=1)


def show_next_to_original(figure_id, filtered, caption):
    """Draw the original image (left) and one filtered result (right) in figure `figure_id`."""
    plt.figure(figure_id)
    panels = (
        (image[:, :, 0], 'Original image'),
        (filtered[0, :, :, 0], caption),
    )
    for slot, (picture, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.imshow(picture, cmap='gray')
        plt.title(heading)


show_next_to_original(0, mean_3x3, 'mean filter 3x3')
show_next_to_original(2, mean_9x9, 'mean filter 9x9')


############################################################
# GAUSSIAN FILTER
############################################################

# 3x3 binomial approximation of a Gaussian; the weights add up to 16 before scaling
gaussian_weights = [[1, 2, 1],
                    [2, 4, 2],
                    [1, 2, 1]]
gaussian_filter = np.asarray(gaussian_weights, dtype=np.float32).reshape(3, 3, 1, 1) / 16.0

gaussian_smoothed = convolution(X, gaussian_filter, bias, pad=0, stride=1)

show_next_to_original(3, gaussian_smoothed, 'Gaussian filtered')


__Optional__: Now load a color image and apply the mean filtering and Gaussian filtering on this color image.
Not much changes at the call of the convolution operation, you just need to "play" with the convolutional kernels configurations.

In [None]:
# TODO your code here
![ -f "cute_cat.jpg" ] || wget "https://docs.google.com/uc?export=download&id=1zjltpYscUqnDSP6eUlU-gecadGXvQtTz" -O cute_cat.jpg

cat_img = cv2.imread('cute_cat.jpg')
cat_img = cv2.cvtColor(cat_img, cv2.COLOR_BGR2RGB)
cat_img = cv2.resize(cat_img, (cat_img.shape[1] // 4, cat_img.shape[0] // 4))

rgb_eye = np.eye(3)[:, :, None, None]
rgb_filters = []
rgb_bias = np.zeros((1, 1, 1, 3))

to_rgb_filter = lambda x: np.transpose(rgb_eye * x, (2, 3, 0, 1))

for i in [3, 9, 16]:
    mean_filter_i = np.ones((i, i), dtype=np.float32)
    mean_filter_i /= (i * i * 1.0)
    mean_filter_i = to_rgb_filter(mean_filter_i)
    rgb_filters.append({ "name": f"mean filter {i}x{i}", "mat": mean_filter_i })

gaussian_filter = np.array([[1, 2, 1],
     [2, 4, 2],
     [1, 2, 1]],
     dtype=np.float32
) / 16.0
gaussian_filter = to_rgb_filter(gaussian_filter)
rgb_filters.append({ "name": "Gaussian filtered", "mat": gaussian_filter })

cat_img = cat_img[np.newaxis, :]

for idx, filter in enumerate(rgb_filters, start=1):
    plt.figure(idx)
    plt.subplot(1, 2, 1)
    plt.imshow(cat_img[0])
    plt.title("Original image")
    cat_filtered = convolution(cat_img, filter["mat"], rgb_bias, pad=10, stride=1)
    cat_filtered = np.array(np.clip(cat_filtered, 0.0, 255.0), dtype=np.uint8)
    plt.subplot(1, 2, 2)
    plt.imshow(cat_filtered[0])
    plt.title(filter["name"])

### High pass filters

On the other hand, high pass filters are used to highlight the high frequency information in an image (edges, abrupt changes in intensities).

One of the most commonly used high pass filters is the Sobel kernel (depicted below). These filters can be seen as discrete differentiation operators, and they compute an approximation of the gradient (on the horizontal or vertical direction) of the image intensity function.

<img src="https://i.ytimg.com/vi/W7OpxFbrD84/maxresdefault.jpg" width=300px/>

In [None]:
# Sobel kernel responding to intensity changes along the horizontal direction;
# its transpose responds to changes along the vertical direction
sobel_horiz = np.asarray([[-1, 0, 1],
                          [-2, 0, 2],
                          [-1, 0, 1]])

sobel_vert = sobel_horiz.T

# (3, 3) -> (3, 3, 1, 1): one input channel, one filter
sobel_horiz = np.reshape(sobel_horiz, (3, 3, 1, 1))
sobel_vert = np.reshape(sobel_vert, (3, 3, 1, 1))

sobel_x = convolution(X, sobel_horiz, bias, 0, 1)
sobel_y = convolution(X, sobel_vert, bias, 0, 1)


def to_display_range(response):
    """
    Map a signed filter response to [0, 255] through its absolute value.

    The scale factor is the largest *absolute* response; dividing by the largest signed
    response would push values above 255 whenever the strongest edge is a negative one.
    """
    magnitude = np.abs(response)
    peak = magnitude.max()
    if peak == 0:
        # perfectly flat image: nothing to scale, and dividing would produce NaNs
        return magnitude
    return magnitude / peak * 255


panels = (
    (image[:, :, 0], 'Original image'),
    (to_display_range(sobel_x[0, :, :, 0]), 'Sobel X'),
    (to_display_range(sobel_y[0, :, :, 0]), 'Sobel Y'),
)
for slot, (picture, caption) in enumerate(panels, start=1):
    plt.subplot(1, 3, slot)
    plt.imshow(picture, cmap='gray')
    plt.title(caption)
plt.tight_layout()


# Convolutional Neural Networks in pytorch


In this laboratory, you'll be using convolutional neural networks (CNNs) to perform image classification in torch.
You'll follow two approaches:
- First, you'll design, implement, and train a simple network **from scratch**. However, in practice, you won't get to train an entire CNN from scratch (with random initialization), because it is relatively rare to have a dataset of sufficient size. Instead, the norm is to pre-train a CNN on a very large dataset and then use these weights as an initialization or a fixed feature extractor for the task of interest (transfer learning).
- Therefore, in the second part you'll use **transfer learning** to fine-tune an already trained model on your dataset. Transfer learning is a machine learning technique where a model pre-trained on one task is adapted to a different, but related, task. This approach leverages the knowledge and features learned during the initial training to improve performance and reduce data requirements for the new task, making it more efficient and effective.


The main pipeline when training a neural network model is:
1. "Become one with the data": analyze your input images.
2. Define the model (start with something simple in the beginning)
3. Define the training setup
4. Train the model
5. Test and *analyze* the results.
*Repeat the steps 2-5*


# 1. Datasets and data loaders


You will be working with the [Oxford-IIIT Pet Dataset](https://www.robots.ox.ac.uk/~vgg/data/pets/), which is a 37 category pet dataset with roughly 200 images for each class. The images have large variations in scale, pose, and lighting, and they are annotated with the breed of the pet (37 classes), the head ROI, and pixel-level trimap segmentation.


*Datasets* and *DataLoader* are the core pytorch data structures for interacting with your data. Ideally, you would want your data handling code to be completely decoupled from the model training and testing code (you'll often need to evaluate your model on different datasets).


``torch.utils.data.Dataset`` stores the actual information about the dataset (the samples and their corresponding ground truth labels), while ``torch.utils.data.DataLoader`` wraps an iterable around the dataset, allowing easy access to the data, automatic batching, and multi-process data loading.


For now, you'll use the OxfordPets dataset implementation from [torchvision](https://pytorch.org/vision/stable/generated/torchvision.datasets.OxfordIIITPet.html), but next time you'll be learning how you can create your own custom dataset and how to configure data loaders.
torchvision is a popular package that comprises popular datasets, model architectures, and common image transformations for computer vision.


Transforms are common image transformations available in the ``torchvision.transforms`` module and can be used to preprocess and augment the input data. They can be chained together using *Compose*.
You can also use it to augment your data.
Image augmentation generates similar but distinct training examples after a series of random changes to the training images, and can help reduce overfitting.




```
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])


```

In [None]:
# TODO you code here
# - create an object of type torchvision.datasets.OxfordIIITPet, download it
# - torch.utils.data.DataLoader object
# - display some samples
from torch.utils.data import DataLoader


IMG_SHAPE = (256, 256, 3)
# per-channel statistics handed to Normalize; named because they are needed a second
# time to undo the normalisation before displaying a sample
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]
transform = transforms.Compose([
    transforms.Resize(IMG_SHAPE[0:2]),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)
])

dataset = torchvision.datasets.OxfordIIITPet(root='./data', transform=transform, download=True)
pet_classes = dataset.classes

# 80% / 20% random split of the downloaded samples
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_set, test_set = torch.utils.data.random_split(dataset, [train_size, test_size])


batch_size = 128
plot_size = 4
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)


def denormalize(image):
    """
    Turn a normalised (C, H, W) tensor back into an (H, W, C) array in [0, 1] for imshow.

    Normalize computes (x - mean) / std per channel, so the inverse is x * std + mean.
    Showing the normalised values directly (only clipped) distorts colours and contrast.
    """
    pixels = np.transpose(image.numpy(), (1, 2, 0))
    return np.clip(pixels * NORM_STD + NORM_MEAN, 0, 1)


# display the first `plot_size` samples of the first `plot_size` training batches
for i, (images, labels) in enumerate(train_loader, start=1):
    fig = plt.figure(i)
    for j, (image, label) in enumerate(zip(images[:plot_size], labels[:plot_size]), start=1):
        plt.subplot(1, plot_size, j)
        plt.imshow(denormalize(image))
        # .item() gives a plain Python int, the proper type for indexing a list
        plt.title(f"{pet_classes[label.item()]}")
    fig.tight_layout()
    if i == plot_size:
        break




# 2. The Convolutional Neural Network

## Convolutional Neural Networks from scratch

Check the tutorial from reference [[2]](#scrollTo=my1Fk-G5KKmz&line=2&uniqifier=1).

You'll define your convolutional neural network by extending the [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) class, which is the base class for all the neural network modules.
In the constructor, you define the layers (and their properties) that comprise your module. ``torch.nn`` [package](https://pytorch.org/docs/stable/nn.html) provides classes for the basic layers of a CNN.

The function that you need to override is the _forward()_ function in which you specify computation performed at every call (i.e. how are layers chained and how does the data flow over the computational graph). In other words, this defines the forward pass through your model.



In [None]:
# TODO your code here: define a simple CNN model, pass a single example through the model
import torch.nn as nn


class MyModel(nn.Module):
    """
    A small CNN classifier: two 3x3 convolutions followed by a single linear layer.

    The convolutions use padding=1, so the spatial size of the input is preserved and
    the linear layer receives img_shape[0] * img_shape[1] * 5 features per sample.

    :param img_shape - (height, width, channels) of the input images
    :param num_classes - number of output classes

    forward() takes a batch of shape (N, channels, height, width) and returns class
    probabilities of shape (N, num_classes).
    NOTE: nn.CrossEntropyLoss applies log-softmax itself and expects raw scores; to
    train this model with that loss, feed it the output of `self.linear` instead.
    """

    def __init__(self, img_shape, num_classes):
        super().__init__()
        self.convs = nn.Sequential(
            nn.Conv2d(in_channels=img_shape[2], out_channels=20, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=20, out_channels=5, kernel_size=3, padding=1),
        )
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(img_shape[0] * img_shape[1] * 5, num_classes)
        # explicit class axis: omitting `dim` triggers the implicit-dimension deprecation warning
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # a ReLU after every convolution: two convolutions applied back to back without a
        # non-linearity are equivalent to one linear operator. The activation is applied
        # here rather than inserted into `self.convs` so the parameter names stay unchanged.
        for conv in self.convs:
            x = nn.functional.relu(conv(x))
        x = self.flatten(x)
        x = self.linear(x)
        return self.softmax(x)


# instantiate the network for the dataset's image shape and number of classes
model = MyModel(IMG_SHAPE, len(pet_classes))
# smoke test: only the first training batch is pushed through the model (note the break);
# the output must have one row per sample and one column per class
for images, labels in train_loader:
    output = model(images)
    assert output.shape[0] == labels.shape[0] and output.shape[1] == len(pet_classes)
    break
print("Model works! :)")

## Transfer learning


Check the tutorial from reference [[3]](#scrollTo=my1Fk-G5KKmz&line=2&uniqifier=1).


The ``torchvision`` module provides the implementation and pre-trained weights for common neural network architectures.
For example, to load the resnet18 architecture and its weights (after training on ImageNet), you can use:


```python
from torchvision.models import resnet18, ResNet18_Weights


# Using pretrained weights:
model = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
```

Remember from the first lab that, when using a pre-trained model, you must preprocess the input images in the same way as the images used for training the model. Using the correct preprocessing method is critical and failing to do so may lead to decreased accuracy or incorrect outputs. Each architecture uses a different preprocessing technique, so there is no standard way to achieve this.


#### Note (transfer learning training)
 In the tutorial, you will notice that the authors use model.train() and model.eval() in the training loop. These functions "tell" the model how to act when it is being run. In the next lectures, you will learn that some layers (such as dropout, batch normalization, and so on) behave differently during train and evaluation, and hence the model will produce unexpected results if run in the wrong mode. So don't forget these steps.


 To freeze the weights of the model and train only the rest, you can set requires_grad of the parameters you want to freeze to False.
```
for param in model.features.parameters():
    param.requires_grad = False
```


On the other hand, the ``torch.no_grad()`` context manager that we used in the previous lab is used to prevent calculating gradients in the code block that follows it. Usually it is used when you evaluate your model and don’t need to call backward() to calculate the gradients and update the corresponding parameters. In this mode, the result of every computation will have ``requires_grad=False``, even when the inputs have ``requires_grad=True``.



In [None]:
# TODO : your code here
# get a pretrained torchvision module, change the last layer,  pass a single example through the model
from torchvision.models import resnet18, ResNet18_Weights


def init_model():
    """
    Build an ImageNet-pretrained ResNet-18 prepared for transfer learning.

    All pretrained parameters are frozen and the final fully connected layer is replaced
    by a new one with len(pet_classes) outputs, so only that layer is trainable.
    The function then checks that exactly `fc.weight` and `fc.bias` require gradients and
    that a random batch of IMG_SHAPE images produces an output of shape
    (batch, len(pet_classes)).

    :return the model, in the same train/eval mode in which resnet18() created it
    """
    model = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)

    # Freezing all the parameters initially
    for param in model.parameters():
        param.requires_grad = False

    # Replacing the fully connected layer; the input width is read from the layer being
    # replaced instead of being hard-coded. New layers require gradients by default.
    model.fc = nn.Linear(in_features=model.fc.in_features, out_features=len(pet_classes))

    # Checking the parameters which require gradients
    req_grad_params = [name for name, param in model.named_parameters() if param.requires_grad]
    assert req_grad_params == ["fc.weight", "fc.bias"], req_grad_params

    # Passing an artificial input to check if the model produces the desired shape.
    # This is done in eval mode and without autograd: in train mode the BatchNorm layers
    # would blend the statistics of this random noise into their pretrained running
    # mean/variance.
    samples = 3
    height, width, channels = IMG_SHAPE
    image = torch.rand(samples, channels, height, width)
    was_training = model.training
    model.eval()
    with torch.no_grad():
        out = model(image)
    model.train(was_training)
    assert out.shape == (samples, len(pet_classes),)
    return model

# build the transfer-learning model; init_model() asserts its own sanity checks
model = init_model()
print("Model works! :)")

# Using the GPU

``torch`` is designed to allow for computation both on CPU and on GPU.
If your system has a GPU and the required libraries configured for torch compatibility, the cell below will print information about its state.

If you are running your code on colab, you can enable GPU computation from: Runtime->Change Runtime type -> T4 GPU

In [None]:
import torch
# print the GPU status when CUDA is usable from torch
if torch.cuda.is_available():
    # IPython shell escape: runs the nvidia-smi command line tool (notebook-only syntax)
    !nvidia-smi
else:
    print("NO GPU ☹️")

In [None]:
# use the GPU when torch can see one, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# moves the parameters and buffers of the model to the selected device
model.to(device)

Now we can start to use GPU acceleration.
You now need to explicitly specify on which device your tensors reside. You can
move all of the model's parameters `.to` a certain device (the GPU)
and also move the data to the same device
before applying the model and calculating the loss.

# 3. Training the model


For training, we need to define a loss function and an optimizer. We'll cover optimizers next time, in this laboratory we'll just stick to stochastic gradient descent.


Let's first define some concepts:
- epoch: an epoch defines a pass through the entire training dataset. The number of epochs (passes of the entire training dataset the machine learning algorithm has completed) is a hyperparameter of your model. An epoch consists of one or more batches.
- batch:  a batch defines how many samples your model "sees" before updating its weights. In other words, the batch size is the number of samples that will be passed through to the network at one time during its training.
- sample: a sample is just a single training example.


As you saw in the previous laboratory, a typical training loop looks like this:
```


optimizer - the chosen optimizer. It holds the current state of the model and will update the parameters based on the computed gradients. Notice that in the constructor of the optimizer you need to pass the parameters of your model and the learning rate.
criterion - the chosen loss function.


for epoch in range(num_epochs):  # num_epochs is a hyperparameter that specifies how many passes over the training set are performed


    running_loss = 0.0
    for i, data in enumerate(dataloader, 0): # iterate over the dataset, now we use data loaders
        # get a batch of data (inputs and their corresponding labels)
        inputs, labels = data


        # IMPORTANT! set the gradients of the tensors to 0. by default torch accumulates the gradients on subsequent backward passes
        # if you omit this step, the gradient would be a combination of the old gradient, which you have already used to update the parameters
        optimizer.zero_grad()


        # perform the forward pass through the network
        outputs = net(inputs)
       
        # apply the loss function to determine how your model performed on this batch
        loss = criterion(outputs, labels)

        # start the backprop process. it will compute the gradient of the loss with respect to the graph leaves
        loss.backward()


        # update the model parameters by calling the step function
        optimizer.step()


```



In [None]:
# TODO code to train your model
from tqdm import trange


# number of passes over the training set performed by train()
NUM_EPOCHS = 25

def train(model, optimizer, criterion, scheduler=None):
    """
    Train `model` for NUM_EPOCHS epochs and evaluate it on the test data after each epoch.

    Uses the module-level `train_loader`, `test_loader` and `device`.

    :param model - the network to train; expected to already reside on `device`
    :param optimizer - optimizer built over the parameters of `model`
    :param criterion - loss function called as criterion(outputs, labels)
    :param scheduler - optional learning rate scheduler, stepped once at the end of every epoch
    :return (losses, accuracies): two lists with one dict per epoch, each holding the keys
      "train" and "test". Losses are the mean of the per-batch losses, accuracies are the
      fraction of correctly classified samples. All values are Python floats.

    The model is left in evaluation mode when the function returns.
    """
    losses = []
    accuracies = []

    for _ in trange(NUM_EPOCHS, desc="Epoch"):
        # training mode: layers such as dropout and batch normalization behave
        # differently while training and while evaluating
        model.train()
        train_loss = 0.0
        train_correct = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_correct += (outputs.argmax(dim=1) == labels).sum().item()
        train_loss /= len(train_loader)
        train_acc = train_correct / len(train_loader.dataset)

        # evaluation mode and no gradient tracking for the test pass
        model.eval()
        test_loss = 0.0
        test_correct = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                test_loss += criterion(outputs, labels).item()
                test_correct += (outputs.argmax(dim=1) == labels).sum().item()
        test_loss /= len(test_loader)
        # normalise by the size of the *test* set, not of the training set
        test_acc = test_correct / len(test_loader.dataset)

        losses.append({ "train": train_loss, "test": test_loss })
        accuracies.append({ "train": train_acc, "test": test_acc })

        if scheduler is not None:
            scheduler.step()

    return losses, accuracies


def plot_losses(losses):
    """
    Plot the train and test loss curves on the current matplotlib axes.

    :param losses - list with one dict per epoch, each with the keys "train" and "test"
    """
    # the x axis follows the length of the history instead of the global NUM_EPOCHS, so
    # a history of any length can be plotted
    epochs = np.arange(1, len(losses) + 1)
    # float() accepts Python numbers as well as NumPy / torch scalars
    plt.plot(epochs, [float(loss["train"]) for loss in losses], label="Train loss")
    plt.plot(epochs, [float(loss["test"]) for loss in losses], label="Test loss")
    plt.legend()


def plot_accuracies(accuracies):
    """
    Plot the train and test accuracy curves on the current matplotlib axes.

    :param accuracies - list with one dict per epoch, each with the keys "train" and "test"
    """
    # the x axis follows the length of the history instead of the global NUM_EPOCHS, so
    # a history of any length can be plotted
    epochs = np.arange(1, len(accuracies) + 1)
    # float() accepts Python numbers as well as NumPy / torch scalars
    plt.plot(epochs, [float(acc["train"]) for acc in accuracies], label="Train accuracy")
    plt.plot(epochs, [float(acc["test"]) for acc in accuracies], label="Test accuracy")
    plt.legend()


criterion = torch.nn.CrossEntropyLoss()
learning_rates = [1e-2, 1e-3, 1e-4]

# 2 x 3 grid: one column per learning rate, losses on the first row, accuracies on the second
for idx, learning_rate in enumerate(learning_rates, start=1):
    # every run starts from a freshly initialised model
    model = init_model().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    losses, accuracies = train(model, optimizer, criterion)

    curves = (
        (idx, plot_losses, losses, f"loss lr {learning_rate} w/o sched"),
        (3 + idx, plot_accuracies, accuracies, f"acc lr {learning_rate} w/o sched"),
    )
    for cell, draw, history, caption in curves:
        plt.subplot(2, 3, cell)
        draw(history)
        plt.title(caption)

plt.subplots_adjust(wspace=1.6, hspace=0.4)



Now let's examine the effect of the learning rate over the training process.

- First, create two plots: one in which you plot, for each epoch, the loss values on the training and the test data (two series on the same graph), and another one in which you plot, for each epoch, the accuracy values on the training and the test data.
- Experiment with different values for the learning rate.
- Then, experiment with a torch.optim.lr_scheduler to adjust the learning rate during the training process ([doc](https://pytorch.org/docs/stable/optim.html)).

```
optimizer = SGD(model, lr)
scheduler = ExponentialLR(optimizer, gamma=0.9)

for epoch in range(num_epochs):
    for input, target in dataset:
        optimizer.zero_grad()
        output = model(input)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
    # apply the learning rate scheduler
    scheduler.step()
```

Plot the learning curves for all the training that you performed.
Fill in the table to compare the accuracy of your trained models.

| Model              | lr config            | accuracy  train| accuracy test |
| -----------        | -----------          | ------         | -----         |
| Model              | lr info              |   acc          |acc            |
| Model              | lr info              |   acc          |acc            |


You can work in teams and each team will train the model with a different setup.



In [None]:
# same sweep as before, this time with the learning rate multiplied by 0.9 after every epoch
for idx, learning_rate in enumerate(learning_rates, start=1):
    # every run starts from a freshly initialised model
    model = init_model().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    losses, accuracies = train(model, optimizer, criterion, scheduler)

    curves = (
        (idx, plot_losses, losses, f"loss lr {learning_rate} with sched"),
        (3 + idx, plot_accuracies, accuracies, f"acc lr {learning_rate} with sched"),
    )
    for cell, draw, history, caption in curves:
        plt.subplot(2, 3, cell)
        draw(history)
        plt.title(caption)
plt.subplots_adjust(wspace=1.6, hspace=0.4)

# Useful references

- [1] [a "recipe"](http://karpathy.github.io/2019/04/25/recipe/) to follow when you start training artificial neural networks;
- [2] [Defining a CNN](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html) in torch;
- [3] [Transfer learning](https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html) in torch;
- [4] [model debugging](https://developers.google.com/machine-learning/testing-debugging/common/overview).

# <font color='red'> Optional </font>  
## Pooling

The pooling layer is used to reduce the spatial dimension of the activation maps, and thus the computational burden. It has no learnable parameters and it operates individually across each input channel and resizes it spatially.

The two most common types of pooling are max pooling and average pooling.


The hyperparameters of a pooling layer are:
- the filter size F (usually this is an odd value);
- the stride S (or the step used when sliding across the input volume);

Given an input volume of shape  ($H_i$, $W_i$, $D$), the pooling layer will produce an output of shape ($H_o$, $W_o$, $D$), where:

\begin{equation}
W_o = \frac{W_i - F}{S} + 1
\end{equation}

\begin{equation}
H_o = \frac{H_i - F}{S} + 1
\end{equation}

An illustration of the pooling operation is depicted in the image below:

![picture](https://www.researchgate.net/profile/Alla-Eddine-Guissous/publication/337336341/figure/fig15/AS:855841334898691@1581059883782/Example-for-the-max-pooling-and-the-average-pooling-with-a-filter-size-of-22-and-a.jpg)

In [None]:
# TODO your code here
# apply the pooling operation on a grayscale image and on a color image
# try different values for the stride and filter size. What do you observe?

def pooling(X, filter_size, stride, type):
    """
    Implements the pooling operation.

    Each channel is pooled independently: a filter_size x filter_size window is slid over
    the height and width axes and replaced by its maximum or its average.

    :param X - input volume of shape (num_samples, iH, iW, iC)
    :param filter_size - the size of the pooling
    :param stride - the stride of the pooling operation
    :param type - can be 'max' or 'avg'; the type of the pooling operation to apply

    Returns the output of the pooling operation, a float64 array of shape
    (num_samples, oH, oW, iC) with oH = (iH - filter_size) // stride + 1 and
    oW = (iW - filter_size) // stride + 1.
    Raises ValueError when type is neither 'max' nor 'avg'.
    """
    if type not in ('max', 'avg'):
        raise ValueError("Invalid type!")
    reduce_window = np.max if type == 'max' else np.mean

    num_samples, iH, iW, iC = X.shape

    # height comes from iH and width from iW, which also covers non-square inputs
    oH = (iH - filter_size) // stride + 1
    oW = (iW - filter_size) // stride + 1

    # pooling keeps the depth of the input: one output channel per input channel
    activation_map = np.zeros((num_samples, oH, oW, iC), dtype=np.float64)

    for y in range(oH):
        tl_y = y * stride
        br_y = tl_y + filter_size

        for x in range(oW):
            tl_x = x * stride
            br_x = tl_x + filter_size

            # window for every sample and channel: (num_samples, filter_size, filter_size, iC)
            roi = X[:, tl_y: br_y, tl_x: br_x, :]
            # reduce only the two spatial axes -> (num_samples, iC)
            activation_map[:, y, x, :] = reduce_window(roi, axis=(1, 2))

    return activation_map

# small 4x4 example, shaped as a batch with one sample and one channel
x = np.array([[2, 3, 2, 0], [5, -2, 2, 8], [-1, -6, 7, 3], [-4, -5, 4, 2]], dtype=np.float64)
x = x.reshape(1, 4, 4, 1)
for heading, kind in (("Max-Pooling:", 'max'), ("Average-Pooling:", 'avg')):
    print(heading)
    print(pooling(x, filter_size=2, stride=2, type=kind))

In [None]:
# grayscale image with a batch axis in front and a channel axis at the end
image = cv2.imread('cameraman.jpg', cv2.IMREAD_GRAYSCALE)[np.newaxis, :, :, np.newaxis]

# non-overlapping windows: the stride always equals the filter size
pools = [{ "filter": size, "stride": size } for size in (2, 4, 8, 16)]

for idx, pool in enumerate(pools, start=1):
    max_pooled_img = pooling(image, pool["filter"], pool["stride"], "max")
    avg_pooled_img = pooling(image, pool["filter"], pool["stride"], "avg")

    # one figure per configuration: original, max pooled and average pooled side by side
    panels = (
        (image, "Original Image"),
        (max_pooled_img, f"Max Pool F={pool['filter']} S={pool['stride']}"),
        (avg_pooled_img, f"Avg Pool F={pool['filter']} S={pool['stride']}"),
    )
    fig = plt.figure(idx)
    for slot, (picture, caption) in enumerate(panels, start=1):
        plt.subplot(1, 3, slot)
        plt.imshow(picture[0], cmap='gray')
        plt.title(caption)

    fig.tight_layout()