In [6]:
# The CIFAR-10 data-set is needed later in the exercise and the download may take a while,
# so run this cell right away to start fetching it into the local `data` directory.
import torchvision
import torch as tc
torchvision.datasets.CIFAR10(root='data', download=True)

Files already downloaded and verified


Dataset CIFAR10
    Number of datapoints: 50000
    Root location: data
    Split: Train

# Exercise 1:

### Exercise 1.1: Convolve manually

Perform the following calculations by hand, and write the result below. You decide if you want to make a convolution or correlation.


<img src="https://nextcloud.theailab.dk/s/fCF7XZktJDwYA4C/download/inputKernel.jpg" alt="Kernel" style="width:300px;"/>


1. Manually convolve the input with the kernel, and write down the result. Use no padding and stride of 1.
1. Perform $2\times2$ max pooling on the output of the convolution. Use a stride of 2.

**Answer:**

\begin{array}{|c|c|}
\hline
     ~ & ~ \\\hline
     ~ & ~ \\\hline
\end{array}




In [47]:
def pad(_input: tc.Tensor,
        _pad_sizes: tuple) -> tc.Tensor:
    """Zero-pad a 2D tensor symmetrically along both axes.

    Args:
        _input: 2D tensor of shape (height, width).
        _pad_sizes: (h_pad, w_pad) - number of zero rows added above and below,
            and number of zero columns added to the left and right.

    Returns:
        Tensor of shape (height + 2*h_pad, width + 2*w_pad) holding `_input`
        in its centre, with the same dtype and device as `_input`.

    Raises:
        AssertionError: if either pad size is negative.
    """
    h_pad, w_pad = _pad_sizes
    assert 0 <= h_pad and 0 <= w_pad
    h_in, w_in = _input.shape
    # new_zeros inherits dtype and device from _input; a plain tc.zeros would
    # silently downcast e.g. a float64 input to the default float32.
    _result = _input.new_zeros((h_in + 2 * h_pad, w_in + 2 * w_pad))
    _result[h_pad:h_pad + h_in, w_pad:w_pad + w_in] = _input
    return _result

In [43]:
# 3x3 example matrix used to sanity-check pad().
test = tc.tensor(data=[[1,1,1],
                         [2,1,1],
                         [3,2,2]],dtype=tc.float64)

# (h_pad, w_pad): one ring of zeros on every side, so the padded result is 5x5.
pads = (1,1)
pad(test,pads)

tensor([[0., 0., 0., 0., 0.],
        [0., 1., 1., 1., 0.],
        [0., 2., 1., 1., 0.],
        [0., 3., 2., 2., 0.],
        [0., 0., 0., 0., 0.]])

In [44]:
#### Note that: (h_out,w_out) = floor(((h_in,w_in)-(h_window,w_window))/(stride_h,stride_w)) + 1

#### task 1) ####
def correlation_2d(_input: tc.Tensor,
                   _kernel: tc.Tensor,
                   _stride: tuple) -> tc.Tensor:
    """Compute the 2D cross-correlation of `_input` with `_kernel` (no padding).

    The kernel is slid over the input without being flipped; every output
    element is the sum of the element-wise product of the kernel and the
    input window underneath it.

    Args:
        _input: 2D tensor of shape (h_in, w_in).
        _kernel: 2D tensor of shape (h_win, w_win), no larger than `_input`.
        _stride: (row stride, column stride), both >= 1.

    Returns:
        Tensor of shape ((h_in - h_win) // stride_h + 1,
        (w_in - w_win) // stride_w + 1). Its dtype is the one produced by the
        windowed multiply-and-sum, so float64 inputs give a float64 result.

    Raises:
        AssertionError: if a stride is < 1 or the kernel is larger than the input.
    """
    stride_h, stride_w = _stride
    h_win, w_win = _kernel.shape
    h_in, w_in = _input.shape
    # Only two things make the operation invalid: a non-positive stride, or a
    # kernel that does not fit. A stride larger than (input - kernel) is fine
    # and simply yields a single output along that axis.
    assert 1 <= stride_h and 1 <= stride_w
    assert h_win <= h_in and w_win <= w_in

    h_out = (h_in - h_win) // stride_h + 1
    w_out = (w_in - w_win) // stride_w + 1

    # Row-major list of window responses; stacking keeps their dtype/device
    # instead of forcing everything into a default float32 buffer.
    values = [
        (_input[top:top + h_win, left:left + w_win] * _kernel).sum()
        for top in range(0, h_out * stride_h, stride_h)
        for left in range(0, w_out * stride_w, stride_w)
    ]
    return tc.stack(values).reshape(h_out, w_out)

# 3x3 kernel for the manual exercise (presumably the one shown in the Exercise 1.1 figure - confirm).
Kernel = tc.tensor(data=[[0,0,0],
                         [2,1,0],
                         [3,2,2]],dtype=tc.float64)

# 4x4 input for the manual exercise (presumably the one shown in the Exercise 1.1 figure - confirm).
Input = tc.tensor(data=[[1,0,0,0],
                        [3,0,0,0],
                        [3,3,0,0],
                        [4,2,2,2]],dtype=tc.float64)
# (row stride, column stride); with no padding the 3x3 kernel on a 4x4 input gives a 2x2 output.
Stride = (1,1)
correlation_2d(Input,Kernel,Stride)

tensor([[21.,  9.],
        [29., 20.]])

In [38]:
#### task 2) ####
def pooling_2d(_input: tc.Tensor,
               _window_size: tuple,
               _stride: tuple,
               _pooling_func) -> tc.Tensor:
    """Apply a pooling function over sliding 2D windows of `_input` (no padding).

    Args:
        _input: 2D tensor of shape (h_in, w_in).
        _window_size: (h_win, w_win) of the pooling window, no larger than `_input`.
        _stride: (row stride, column stride), both >= 1.
        _pooling_func: callable that reduces one (h_win, w_win) window to a
            single value, e.g. `tc.amax`.

    Returns:
        Tensor of shape ((h_in - h_win) // stride_h + 1,
        (w_in - w_win) // stride_w + 1). Its dtype follows the values returned
        by `_pooling_func`, so `tc.amax` on a float64 input stays float64.

    Raises:
        AssertionError: if a stride is < 1 or the window is larger than the input.
    """
    stride_h, stride_w = _stride
    h_win, w_win = _window_size
    h_in, w_in = _input.shape
    # A window as large as the input is valid (one output), and so is a stride
    # larger than (input - window); only non-positive strides and oversized
    # windows are rejected.
    assert 1 <= stride_h and 1 <= stride_w
    assert h_win <= h_in and w_win <= w_in

    h_out = (h_in - h_win) // stride_h + 1
    w_out = (w_in - w_win) // stride_w + 1

    # as_tensor accepts both tensors and plain Python numbers returned by the
    # pooling function; reshape(()) normalises single-element results to 0-d.
    pooled = [
        tc.as_tensor(
            _pooling_func(_input[top:top + h_win, left:left + w_win])
        ).reshape(())
        for top in range(0, h_out * stride_h, stride_h)
        for left in range(0, w_out * stride_w, stride_w)
    ]
    return tc.stack(pooled).reshape(h_out, w_out)


# 2x2 pooling window moved with a stride of 2 in both directions, reduced with max (tc.amax).
# NOTE(review): Exercise 1.1 step 2 asks for max pooling on the *output of the convolution*,
# but this call pools the raw `Input` tensor - confirm which one is intended.
window_size = (2,2)
stride = (2,2)
pooling_2d(Input,window_size,stride,tc.amax)

tensor([[3., 0.],
        [4., 2.]])


### Exercise 1.2: Calculate output sizes of convolution output

In the following list, you will be given a 3D tensor and some filters. Based on their sizes, calculate the size of the output (if valid). We will use PyTorch notation of dimensions. This means that dimensions are given as channel-first. I.e.: `(channel, height, width)`.

The size of the 3D tensor input is given as `(channel, height, width)`.
A number `(channels_out)` of filters each have a size of `(channels_in, filter_height, filter_width)`, a stride of `(height, width)` and a padding of `(height, width)`.


1. input tensor with dimensionality (1, 28, 28) and 16 filters of size (1, 5, 5) with stride (1, 1) and padding (0, 0)
2. input tensor with dimensionality (3, 32, 32) and 24 filters of size (3, 3, 3) with stride (1, 1) and padding (0, 0)
3. input tensor with dimensionality (40, 32, 32) and 3 filters of size (40, 2, 2) with stride (2, 2) and padding (0, 0)
4. input tensor with dimensionality (11, 8, 16) and 7 filters of size (11, 3, 3) with stride (2, 2) and padding (1, 1)
5. input tensor with dimensionality (128, 256, 256) and 112 filters of size (128, 3, 3) with stride (1, 1) and padding (1, 1)


1:  (16, 24, 24), i.e. a (24, 24) feature map for each of the 16 filters, since $(28 - 5)/1 + 1 = 24$.
2:  (24, 30, 30), i.e. a (30, 30) feature map for each of the 24 filters, since $(32 - 3)/1 + 1 = 30$.
