In [None]:
import torch.optim as optim 
from torchvision import datasets, transforms 
import torch 
import torch.nn as nn 
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
data_path = "./datasets/"

# Both splits use the same preprocessing: convert each image to a tensor, then
# standardize every RGB channel with the fixed mean/std values given here.
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2470, 0.2435, 0.2616)),
])

# Training split (train=True). download=False: no download is attempted, so the
# files must already be present under data_path.
cifar10 = datasets.CIFAR10(root=data_path, train=True, download=False, transform=preprocess)

# Held-out split (train=False), used in this notebook as validation data.
cifar10_val = datasets.CIFAR10(root=data_path, train=False, download=False, transform=preprocess)

In [None]:
# Two-class subset: original labels 0 and 2 are remapped to 0 and 1, which are
# the positions of the corresponding names in class_names.
class_names = ['airplane', 'bird']
label_map = {0: 0, 2: 1}

# Keep only the samples whose label is a key of label_map, storing the remapped label.
cifar2 = [
    (image, label_map[target])
    for image, target in cifar10
    if target in label_map
]
cifar2_val = [
    (image, label_map[target])
    for image, target in cifar10_val
    if target in label_map
]

In [None]:
# Mini-batch loaders over the two-class subsets: 64 samples per batch,
# shuffled on every pass, with pinned host memory requested.
train_loader = DataLoader(cifar2, batch_size=64, shuffle=True, pin_memory=True)
val_loader = DataLoader(cifar2_val, batch_size=64, shuffle=True, pin_memory=True)

At a minimum, the arguments we provide to the `Conv2d` layer are the number of input features (the channels of our multichannel images), the number of output features, and the size of the kernel. The more channels in the output image, the greater the capacity of the network. Let's start with a kernel size of 3x3. It's common to have kernel sizes that are the same in all directions, so a single value for the kernel argument will be interpreted as such. 

In [None]:
# 3 input channels -> 16 output channels, using a square 3 x 3 kernel.
conv = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3)
conv

What do we expect for the shape of the weight tensor? 

- Kernel size is `3 x 3` so we want the weight to consist of `3 x 3` parts. 
- For a single output pixel value, our kernel would consider `3` input channels. 
- Therefore the weight component for a single output pixel value would be `in_ch * 3 * 3`
- We have as many of these as we have output channels, which we have specified as `16`
- The shape of our weight tensor will be `out_ch * in_ch * 3 * 3`
- Finally, we have a bias term for each output channel. Remember this is a constant. 

In [None]:
# Display the sizes of the layer's weight tensor and bias vector.
conv.weight.size(), conv.bias.size()

Great, now we have smaller models looking for local patterns, whose weights are optimized across the entire image. 

A 2d convolution pass produces a 2D image as output, whose pixels are a weighted sum over neighborhoods of the input image. Let's pass an image through this layer just as we did when first introduced to linear layers. Note, `nn.Conv2d` expects the following input dimensions (B x C x H x W)

In [None]:
# nn.Conv2d expects B x C x H x W input, so the single image is given a
# leading batch axis of size one before it goes through the layer.
img, label = cifar2[0]
output = conv(img[None])
img[None].shape, output.shape

In [None]:
# Show output channel 0 of batch entry 0 in grayscale; detach() drops the
# autograd graph so the tensor can be handed to matplotlib.
plt.imshow(output[0,0].detach(), cmap='gray')

In [None]:
# Show the first channel of the input image in grayscale, for comparison.
plt.imshow(img[0].detach(), cmap="gray")

In [None]:
# Same experiment with padding: a one-pixel border around the input lets a
# 3 x 3 kernel produce an output with the same height and width as the input.
img, label = cifar2[0]
img = img[None]  # add the batch axis once, up front

conv = nn.Conv2d(in_channels=3, out_channels=1, kernel_size=3, padding=1)
output = conv(img)
img.shape, output.shape

Experimenting with deliberately set weights and biases within our convolution layer. 

- bias=0
- weights= constant value, so each pixel in the output gets the mean of its neighbors. 

In [None]:
# Set the parameters by hand, outside autograd tracking: zero bias and every
# weight equal to 1/9.
with torch.no_grad():
    conv.bias.zero_()
    conv.weight.fill_(1.0 / 9.0)

output = conv(img)
plt.imshow(output[0, 0].detach(), cmap="gray")

Remember, every pixel of the output is the average of a neighborhood of the input, so pixels in the output are correlated and change more smoothly, thus a blurred image. 

Next we construct a vertical edge detection kernel. 

- 'detection' meaning the output has a high magnitude

In [None]:
# Vertical edge detector: -1 down the left column, 0 in the middle, +1 down the
# right column, so the response is large where intensity changes left-to-right.
conv_e = nn.Conv2d(in_channels=3, out_channels=1, kernel_size=3, padding=1)
with torch.no_grad():
    # The 3 x 3 pattern is broadcast over the (out_ch, in_ch) axes of the weight.
    conv_e.weight.copy_(torch.tensor([[-1.0, 0.0, 1.0]] * 3))
    conv_e.bias.zero_()

In [None]:
# Pass the batched image through the hand-set edge kernel layer, then plot
# channel 0 of the first batch entry in grayscale under a printed caption.
output = conv_e(img) 
print("OUTPUT VERTICAL EDGE DETECTION KERNEL")
plt.imshow(output[0,0].detach(), cmap='gray')
plt.show()

In [None]:
# Plot the first channel of the input image for side-by-side comparison.
# NOTE: output is recomputed here but is not used by the plot below.
output = conv_e(img) 
print("INPUT")
plt.imshow(img[0,0].detach(), cmap='gray')
plt.show()

### Downsampling

- Convolutions have helped solve the translation invariance issue, but we need to introduce downsampling and pooling, which will help us recognize not only subsections of the image, but the full image itself. 
- Combining convolutions and downsampling can help us recognize larger structures. 

In [None]:
# Max pooling with a 2 x 2 window; compare the pooled shape with the input shape.
pool = nn.MaxPool2d(kernel_size=2)
output = pool(img)
output.shape, img.shape

In [None]:
# Small CNN for the two-class task. The shape comments assume a 3 x 32 x 32 input.
model = nn.Sequential(

    # First conv layer produces 16 independent (output) features that operate to
    # (hopefully) discriminate low-level features. padding=1 keeps the 32 x 32 size.
    nn.Conv2d(3, 16, kernel_size=3, padding=1),

    # activation
    nn.Tanh(),

    # 2 x 2 max pooling halves each spatial side:
    # 16-channel 32 x 32 -> 16-channel 16 x 16
    nn.MaxPool2d(2),

    # Produces an 8-channel 16 x 16 output (intended to extract higher-level features).
    nn.Conv2d(16, 8, kernel_size=3, padding=1),

    # activation
    nn.Tanh(),

    # 2 x 2 max pooling again: 8-channel 16 x 16 -> 8-channel 8 x 8
    nn.MaxPool2d(2),

    # Reshape from 8-channel 8 x 8 to a flat vector: (batch, 8, 8, 8) -> (batch, 512).
    # Without this step nn.Linear receives a 4-D tensor whose last dimension is 8
    # and the forward pass fails with a shape mismatch.
    nn.Flatten(),

    nn.Linear(8 * 8 * 8, 32),  # 512 -> 32
    nn.Tanh(),
    nn.Linear(32, 2),  # 32 -> 2 class scores
)

In [None]:
# Count the elements of each parameter tensor, then report the total and the
# per-tensor breakdown.
numel_list = [param.numel() for param in model.parameters()]
print(
    "Total Parameters: ",
    sum(numel_list),
    "\nParameters Per Layer: ",
    numel_list,
)

In [5]:
import torch.nn as nn 

class Net(nn.Module):
    """Small CNN: two conv/tanh/max-pool stages followed by two linear layers.

    The layer sizes assume 3-channel 32 x 32 inputs, for which the second
    pooling stage yields 8 * 8 * 8 = 512 features per sample. ``forward``
    returns a ``batch x 2`` tensor of class scores.
    """

    def __init__(self) -> None:
        super().__init__()
        # input 3C 32x32
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)  # -> 16C 32x32
        self.act1 = nn.Tanh()
        # No trailing comma here: a trailing comma would store a one-element
        # tuple, which is neither registered as a submodule nor callable.
        self.pool1 = nn.MaxPool2d(2)  # -> 16C 16x16
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)  # -> 8C 16x16
        self.act2 = nn.Tanh()
        self.pool2 = nn.MaxPool2d(2)  # -> 8C 8x8
        # view(-1, 512) see forward()
        # -> batch x 512
        self.fc1 = nn.Linear(8 * 8 * 8, 32)  # -> batch x 32
        self.act3 = nn.Tanh()
        self.fc2 = nn.Linear(32, 2)  # -> batch x 2

    def forward(self, x):
        """Map a ``batch x 3 x 32 x 32`` tensor to ``batch x 2`` scores."""
        out = self.pool1(self.act1(self.conv1(x)))
        out = self.pool2(self.act2(self.conv2(out)))
        # Flatten each sample's 8 x 8 x 8 feature map for the linear layers.
        out = out.view(-1, 8 * 8 * 8)
        out = self.act3(self.fc1(out))
        out = self.fc2(out)
        return out
# Instantiate the network and show its total parameter count alongside the
# element count of each parameter tensor.
model = Net()
numel_list = [param.numel() for param in model.parameters()]
sum(numel_list), numel_list

(18090, [432, 16, 1152, 8, 16384, 32, 64, 2])

Note that when we register submodules in the constructor of our class, we are making sure we have access to their parameters. But what about submodules that have no parameters? (activation and pooling layers). There is no need to register these as submodules, rather we can call them in the forward using `torch.nn.functional`. By 'functional' we mean having no internal state. In other words, the output is strictly dependent on the input, unlike layers with parameters. 

In [None]:
import torch.nn as nn 
import torch.nn.functional as F

class Net(nn.Module):
    """Small CNN that registers only the layers that own parameters.

    Activations and pooling hold no state, so they are applied through
    functional calls inside ``forward`` instead of being stored as submodules.
    The layer sizes assume 3-channel 32 x 32 inputs; ``forward`` returns a
    ``batch x 2`` tensor of class scores.
    """

    def __init__(self) -> None:
        super().__init__()
        # input 3C 32x32
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)  # -> 16C 32x32
        # activation function here
        # pool here -> 16C 16x16
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)  # -> 8C 16x16
        # activation function here
        # pool here -> 8C 8x8
        # view(-1, 512) -> batch x 512
        self.fc1 = nn.Linear(8 * 8 * 8, 32)  # -> batch x 32
        # activation function here
        self.fc2 = nn.Linear(32, 2)  # -> batch x 2

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a ``batch x 3 x 32 x 32`` tensor to ``batch x 2`` scores."""
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        # The second stage must consume the first stage's output, not the raw
        # input: conv2 expects 16 channels, while x has only 3.
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        # Flatten each sample's 8 x 8 x 8 feature map for the linear layers.
        out = out.view(-1, 8 * 8 * 8)
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        return out
    