In [2]:
import w0d3Remnant as fromYesterday
import torch as t
from torch import nn as nn
import utils
import typing

# Assembling ResNet

This is a combination of my day four *and* five work.

Day Four - Spent two hours, eight minutes on this so far. Going to revisit it later, cannot focus on it
Day Five - Spent three hours and thirty three minutes. Finished it. LETS GOOOOOO

### Batch Normalization in CNNs

Batch Norm is a method to address overfitting and slow training issues in Deep Neural Networks.

Normalization itself is just a pre-processing technique used to standardize data. This is often done to make features balanced - if they have differences in ranges, different features could recieve inflated importances. Normalizing your features helps models learn better.

You can instead normalize batches of data inside the network itself, done between layers. Each neuron will first apply its weights (not biases!): $$z = g(w,x);$$

But before applying the activation function, a batch norm is applied to each neurons output across the layer.

$$z^N = (\frac{z - m_z}{s_z}) \cdot \gamma + \beta$$

where $z^N$ is the output of the Batch Norm, $m_z$ is the mean of the neuron's output, $s_z$ is the standard deviation of the output of the neurons, and $\gamma$ and $\beta$ are learnable parameters which eventually represent the standard deviation and the mean of the outputs, respectively.

The output of batch norm is then fed into the activation function

$$a = f(z^N)$$

Why does this work? Firstly, think of it intuitively as the same thing you are doing to the features. But secondly, it reduces the "internal covariate shift of the network" - it reduces the shift due to a change in data distribution. Here, it the data recieved from the previous layer, which is constantly changing.

Batch norm also has a regularization effect, speeds up computation!

When applying this to convolutional layers, each feature map ends up having a single mean and standard deviation.



In [3]:
class Sequential(nn.Module):
    def __init__(self, *modules: nn.Module):
        super().__init__()
        for i, mod in enumerate(modules):
            self.add_module(str(i), mod)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''Chain each module together, with the output from one feeding into the next one.'''
        for mod in self._modules.values():
            x = mod(x)
        return x

model = Sequential(fromYesterday.Conv2d(3,10,3,1,0),
fromYesterday.ReLU(),
nn.Linear(10,5))

print(model)

Sequential(
  (0): Conv2d(weights shape = torch.Size([10, 3, 3, 3]) stride = (1, 1) padding = (0, 0) )
  (1): ReLU()
  (2): Linear(in_features=10, out_features=5, bias=True)
)


In [4]:
from math import gamma
from operator import truediv
import einops
from tkinter.tix import MAIN


class BatchNorm2d(nn.Module):
    running_mean: t.Tensor         # shape: (num_features,)
    running_var: t.Tensor          # shape: (num_features,)
    num_batches_tracked: t.Tensor  # shape: ()

    def __init__(self, num_features: int, eps=1e-05, momentum=0.1):
        '''Like nn.BatchNorm2d with track_running_stats=True and affine=True.

        Name the learnable affine parameters `weight` and `bias` in that order.
        '''
        super().__init__()
        #print("num features is " + str(num_features))
        running_mean = t.zeros(num_features)
        running_var = t.ones(num_features)
        num_batches_tracked = t.tensor(0)
        
        self.weight = nn.Parameter(t.ones(num_features)) # tracks gamma
        self.bias = nn.Parameter(t.zeros(num_features)) # tracks beta
        self.register_buffer('running_mean', running_mean) # holds mean across channels
        self.register_buffer('running_var', running_var) # holds variance across channels
        self.register_buffer('num_batches_tracked', num_batches_tracked) # stores num batches tracked
        self.num_features = num_features
        self.momentum = momentum
        self.eps = eps


    def forward(self, x: t.Tensor) -> t.Tensor:
        '''Normalize each channel.

        Compute the variance using `torch.var(x, unbiased=False)`
        Hint: you may also find it helpful to use the argument `keepdim`.

        x: shape (batch, channels, height, width)
        Return: shape (batch, channels, height, width)
        '''
        if self.training:
            # update running mean
            mean = t.mean(x, (0,2,3), keepdim = False)
            self.running_mean = (self.momentum) * mean + (1 - self.momentum) * self.running_mean
            # update running variance
            var = t.var(x, (0,2,3), keepdim = False, unbiased=False)
            self.running_var = (self.momentum) * var + (1 - self.momentum) * self.running_var
            self.num_batches_tracked += 1
        else:
            mean = self.running_mean
            var = self.running_var

        mean = einops.rearrange(mean, 'c -> 1 c 1 1')
        var = einops.rearrange(var, 'c -> 1 c 1 1')
        gamma = einops.rearrange(self.weight, 'c -> 1 c 1 1')
        beta = einops.rearrange(self.bias, 'c -> 1 c 1 1')

        returnX = ((x - mean) / (t.sqrt(var + self.eps))) * gamma + beta
        return returnX

       
    # credit to callum for this
    def extra_repr(self) -> str:
        return ", ".join([f"{key}={getattr(self, key)}" for key in ["num_features", "eps", "momentum"]])



if MAIN:
    utils.test_batchnorm2d_module(BatchNorm2d)
    utils.test_batchnorm2d_forward(BatchNorm2d)
    utils.test_batchnorm2d_running_mean(BatchNorm2d)

All tests in `test_batchnorm2d_module` passed!
All tests in `test_batchnorm2d_forward` passed!
All tests in `test_batchnorm2d_running_mean` passed!


## Watching a paper review of ResNet

A key thing to realize is that when they were contemplating why accuracy wasn't increasing with model depth, the issue was not overfitting!

Each chunk of the model only has to learn the difference in how they wish for the data to change, rather than also having to learn an identiy function, too. Regluarization pushes against identity since it pushes everything down to zero.

In [5]:
class AveragePool(nn.Module):
    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, channels, height, width)
        Return: shape (batch, channels)
        '''

        return t.mean(x,(2,3))


## Residual Block

In [6]:
class ResidualBlock(nn.Module):
    def __init__(self, in_feats: int, out_feats: int, first_stride=1):
        '''A single residual block with optional downsampling.

        For compatibility with the pretrained model, declare the left side branch first using a `Sequential`.

        If first_stride is > 1, this means the optional (conv + bn) should be present on the right branch. Declare it second using another `Sequential`.
        '''

        super().__init__()
        
        left_branch = nn.Sequential(
            fromYesterday.Conv2d(in_feats, out_feats, 3, first_stride, 1), # TODO: why is padding 1? not zero?
            BatchNorm2d(out_feats),
            fromYesterday.ReLU(),
            fromYesterday.Conv2d(out_feats, out_feats, 3, 1, 1),
            BatchNorm2d(out_feats)
        )

        if first_stride > 1:
            right_branch = nn.Sequential(
                fromYesterday.Conv2d(in_feats, out_feats, 1, first_stride, 0),
                BatchNorm2d(out_feats)
            )
        else:
            right_branch = nn.Sequential(
                nn.Identity()
            )

        self.left = left_branch
        self.right = right_branch
        self.relu = fromYesterday.ReLU()
        

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''Compute the forward pass.

        x: shape (batch, in_feats, height, width)

        Return: shape (batch, out_feats, height / first_stride, width / first_stride)
        '''
        leftOutput = self.left.forward(x)
        rightOutput = self.right.forward(x)
        return self.relu.forward(leftOutput + rightOutput)

In [7]:
class BlockGroup(nn.Module):
    def __init__(self, n_blocks: int, in_feats: int, out_feats: int, first_stride=1):
        '''An n_blocks-long sequence of ResidualBlock where only the first block uses the provided stride.'''
        super().__init__()
        self.firstRes = ResidualBlock(in_feats,out_feats,first_stride)
        self.otherRes = nn.ModuleList([ResidualBlock(out_feats, out_feats,1) for i in range(n_blocks - 1)])
        # TODO: convert back to normal list if this doesn't work!
        

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''Compute the forward pass.
        x: shape (batch, in_feats, height, width)

        Return: shape (batch, out_feats, height / first_stride, width / first_stride)
        '''
        result = self.firstRes.forward(x)
        for l in self.otherRes:
            result = l(result)
        
        return result

In [8]:
import pwd
class ResNet34(nn.Module):
    def __init__(
        self,
        n_blocks_per_group=[3, 4, 6, 3],
        out_features_per_group=[64, 128, 256, 512],
        first_strides_per_group=[1, 2, 2, 2],
        n_classes=1000,
    ):
        super().__init__()
        BlockGroups = []
        for i, j in enumerate(n_blocks_per_group):
            n_blocks = j
            if i > 1:
                in_feats = out_features_per_group[i-1]
            else:
                in_feats = 64
            out_feats = out_features_per_group[i]
            first_stride = first_strides_per_group[i]
            BlockGroups.append(BlockGroup(n_blocks, in_feats, out_feats, first_stride))


        self.preBlockGroups = nn.Sequential(
            fromYesterday.Conv2d(3,64,7,2,3),
            BatchNorm2d(64),
            fromYesterday.ReLU(),
            fromYesterday.MaxPool2d(3,2),
        )
        self.layers = nn.Sequential(
            *BlockGroups
        )

        self.postBlockGroups = nn.Sequential(
            AveragePool(),
            fromYesterday.Flatten(),
            fromYesterday.Linear(out_features_per_group[-1], 1000)
        )

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, channels, height, width)

        Return: shape (batch, n_classes)
        '''
        #print("HERE")
        y = self.preBlockGroups(x)
        #print("went through pre block groups")
        y = self.layers(y)
        #print("went through post block groups")
        y = self.postBlockGroups(y)
        return y


In [9]:
import torchvision
from torchvision.models import resnet34



In [10]:
def copy_weights(myresnet: ResNet34, pretrained_resnet: torchvision.models.resnet.ResNet) -> ResNet34:
    '''Copy over the weights of `pretrained_resnet` to your resnet.'''

    mydict = myresnet.state_dict()
    pretraineddict = pretrained_resnet.state_dict()

    # Check the number of params/buffers is correct
    assert len(mydict) == len(pretraineddict), "Number of layers is wrong. Have you done the prev step correctly?"

    # Initialise an empty dictionary to store the correct key-value pairs
    state_dict_to_load = {}

    for (mykey, myvalue), (pretrainedkey, pretrainedvalue) in zip(mydict.items(), pretraineddict.items()):
        state_dict_to_load[mykey] = pretrainedvalue

    myresnet.load_state_dict(state_dict_to_load)

    return myresnet

utils.compare_my_resnet_to_pytorch(ResNet34())
officialresnet = resnet34()
myresnet = copy_weights(ResNet34(), officialresnet)

Unnamed: 0,their name,their shape,your name,your shape
0,conv1.weight,"(64, 3, 7, 7)",preBlockGroups.0.weight,"(64, 3, 7, 7)"
1,bn1.weight,"(64,)",preBlockGroups.1.weight,"(64,)"
2,bn1.bias,"(64,)",preBlockGroups.1.bias,"(64,)"
3,bn1.running_mean,"(64,)",preBlockGroups.1.running_mean,"(64,)"
4,bn1.running_var,"(64,)",preBlockGroups.1.running_var,"(64,)"
5,bn1.num_batches_tracked,(),preBlockGroups.1.num_batches_tracked,()
6,layer1.0.conv1.weight,"(64, 64, 3, 3)",layers.0.firstRes.left.0.weight,"(64, 64, 3, 3)"
7,layer1.0.bn1.weight,"(64,)",layers.0.firstRes.left.1.weight,"(64,)"
8,layer1.0.bn1.bias,"(64,)",layers.0.firstRes.left.1.bias,"(64,)"
9,layer1.0.bn1.running_mean,"(64,)",layers.0.firstRes.left.1.running_mean,"(64,)"


In [11]:
from torchvision import transforms
from pathlib import Path
from PIL import Image 
import torch as t

IMAGE_FILENAMES = [
    "chimpanzee.jpg",
    "golden_retriever.jpg",
    "platypus.jpg",
    "frogs.jpg",
    "fireworks.jpg",
    "astronaut.jpg",
    "iguana.jpg",
    "volcano.jpg",
    "goofy.jpg",
    "dragonfly.jpg",
]

IMAGE_FOLDER = Path("./resnet_inputs")

images = [Image.open(IMAGE_FOLDER / filename) for filename in IMAGE_FILENAMES]
# ImageNet transforms copied from solution:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def prepare_data(images: list[Image.Image]) -> t.Tensor:
    '''
    Return: shape (batch=len(images), num_channels=3, height=224, width=224)
    '''
    return t.stack([transform(i) for i in images], dim = 0)

prepared_images = prepare_data(images)

def predict(model, images):
    logits = model(images)
    return logits.argmax(dim=1)

In [12]:
prepared_images.shape

torch.Size([10, 3, 224, 224])

In [13]:
predict(myresnet, prepared_images)

tensor([707, 707, 707, 707, 864,  71, 707, 707, 864, 707])

In [14]:
import json
with open("imagenet_labels.json") as f:
    imagenet_labels = list(json.load(f).values())

In [15]:
predict(officialresnet, prepared_images)

tensor([707, 707, 707, 707, 864,  71, 707, 707, 864, 707])

In [16]:
len(prepared_images)

10

In [17]:
myresnet

ResNet34(
  (preBlockGroups): Sequential(
    (0): Conv2d(weights shape = torch.Size([64, 3, 7, 7]) stride = (2, 2) padding = (3, 3) )
    (1): BatchNorm2d(num_features=64, eps=1e-05, momentum=0.1)
    (2): ReLU()
    (3): MaxPool2d(kernel_size = (3, 3) stride = (2, 2) padding = (1, 1))
  )
  (layers): Sequential(
    (0): BlockGroup(
      (firstRes): ResidualBlock(
        (left): Sequential(
          (0): Conv2d(weights shape = torch.Size([64, 64, 3, 3]) stride = (1, 1) padding = (1, 1) )
          (1): BatchNorm2d(num_features=64, eps=1e-05, momentum=0.1)
          (2): ReLU()
          (3): Conv2d(weights shape = torch.Size([64, 64, 3, 3]) stride = (1, 1) padding = (1, 1) )
          (4): BatchNorm2d(num_features=64, eps=1e-05, momentum=0.1)
        )
        (right): Sequential(
          (0): Identity()
        )
        (relu): ReLU()
      )
      (otherRes): ModuleList(
        (0): ResidualBlock(
          (left): Sequential(
            (0): Conv2d(weights shape = torch.Si

# Finetuning ResNet

day 6 begins here

I'll admit. at a first passthrough, I'm basically copying much of the tutorial code after writing out some pseudocode. 

In [18]:
model_name = "resenet"
data_dir = Path("./hymenoptera_data")
num_classes = 2
batch_size = 8
num_epochs = 6
feature_extract = False

Helper functioin to do train the model

In [19]:
from cmath import sin
import time
import copy
def train_model(model, dataloaders, criterion, optimizer, nun_epochs=25, is_inception = False):
    since = time.time()
    val_acc_history = []
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # for each epoch, go through train and validation
        for phase in ['train', 'val']:
            running_loss = 0.0
            running_corrects = 0.0 #TODO: may need to be int, and add .double() later

            if phase == 'train':
                model.train()
            else:
                model.eval()

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                with t.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs) #do a forward pass
                    loss = criterion(outputs, labels)

                    _, preds = t.max(outputs, 1)
                    optimizer.step()
            
                running_loss += loss.item() * inputs.size(0)
                running_corrects += t.sum(preds == labels.data)
        
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects / len(dataloaders[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    return model, val_acc_history

In [20]:
def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False

In [21]:
from torchvision import datasets, models, transforms
def initialize_model(model_name, num_classes, feature_extract, use_pretrained = True):
    my_model = ResNet34()
    input_size = 0

    if model_name == "resnet":
        pure_model_ft = models.resnet34(pretrained = use_pretrained)
        copy_weights(my_model, pure_model_ft)

        set_parameter_requires_grad(my_model, feature_extract)
        
        num_ftrs = my_model.postBlockGroups[2].weight.shape[1]
        my_model.postBlockGroups[2] = fromYesterday.Linear(num_ftrs, num_classes,True)
        input_size = 224 # yo? hello? the tutorial just threw it down but idk where it came from
    else:
        print('yeah i got nothing for you')

    return my_model, input_size


In [22]:
# WE NEED TO MAKE SURE THIS MATCHES
myresnet.postBlockGroups[2]

Linear(weight.shape = torch.Size([1000, 512]) bias.shape = torch.Size([1000]))

In [23]:
my_model, input_size = initialize_model("resnet", 2, True)



In [24]:
# CHECK WITH ABOVE TO MAKE SURE IT IS THE SAME!
my_model.postBlockGroups

Sequential(
  (0): AveragePool()
  (1): Flatten(start_dim = 1 end_dim = -1)
  (2): Linear(weight.shape = torch.Size([2, 512]) bias.shape = torch.Size([2]))
)

In [25]:
import os

data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# create train and validation datasets
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
data_transforms[x]) for x in ['train', 'val']}

# Create training and validation dataloaders
dataloaders_dict = {x: t.utils.data.DataLoader(image_datasets[x],
batch_size=batch_size, shuffle=True, num_workers=4) for x in ['train', 'val']}

In [26]:
dataloaders_dict

{'train': <torch.utils.data.dataloader.DataLoader at 0x1657ed970>,
 'val': <torch.utils.data.dataloader.DataLoader at 0x1657edd60>}

In [27]:
device = t.device("cuda:0" if t.cuda.is_available() else "cpu")

In [28]:
my_model = my_model.to(device)

In [29]:
from torch import optim

params_to_update = my_model.parameters()
print("Params to learn:")
if feature_extract:
    params_to_update = []
    print("THIS IS NOT FINISHED! I AM NOT FEATURE EXTRACTING")
else:
    for name, param in my_model.named_parameters():
        if param.requires_grad == True:
            print("\t", name)

optimizer_ft = t.optim.Adam(params_to_update)


Params to learn:
	 postBlockGroups.2.weight
	 postBlockGroups.2.bias


In [30]:
criterion = nn.CrossEntropyLoss()

#model_ft, hist = train_model(my_model, dataloaders_dict, criterion, optimizer_ft, num_epochs, is_inception=(model_name=="inception"))

# Random debugging stuff
Cheating, again

In [None]:
from einops import rearrange
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm_notebook
import PIL

epochs = 3
loss_fn = nn.CrossEntropyLoss()
batch_size = 128

MODEL_FILENAME = "./w1d2_convnet_mnist.pt"
device = "cuda" if t.cuda.is_available() else "cpu"

trainset = datasets.MNIST(root="./data", train=True, transform=transform, download=True)
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
testset = datasets.MNIST(root="./data", train=False, transform=transform, download=True)
testloader = DataLoader(testset, batch_size=64, shuffle=True)

def train_convnet(trainloader: DataLoader, testloader: DataLoader, epochs: int, loss_fn) -> list:
    """
    Defines a ResNet using our previous code, and trains it on the data in trainloader.
    
    Returns tuple of (loss_list, accuracy_list), where accuracy_list contains the fraction of accurate classifications on the test set, at the end of each epoch.
    """
    
    my_model, input_size = initialize_model("resnet", 10, True)
    model = my_model.to(device).train()

    model = resnet34().to(device).train()
    optimizer = t.optim.Adam(model.parameters())
    loss_list = []
    accuracy_list = []
    
    for epoch in tqdm_notebook(range(epochs)):
        
        for (x, y) in tqdm_notebook(trainloader, leave=False):
            
            x = x.to(device)
            y = y.to(device)
            
            optimizer.zero_grad()
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            loss.backward()
            optimizer.step()
            
            loss_list.append(loss.item())
        
        with t.inference_mode():
            
            accuracy = 0
            total = 0
            
            for (x, y) in testloader:

                x = x.to(device)
                y = y.to(device)

                y_hat = model(x)
                y_predictions = y_hat.argmax(1)
                accuracy += (y_predictions == y).sum().item()
                total += y.size(0)

            accuracy_list.append(accuracy/total)
            
        print(f"Epoch {epoch+1}/{epochs}, train loss is {loss:.6f}, accuracy is {accuracy}/{total}")
    
    print(f"Saving model to: {MODEL_FILENAME}")
    t.save(model, MODEL_FILENAME)
    return loss_list, accuracy_list

loss_list, accuracy_list = train_convnet(trainloader, testloader, epochs, loss_fn)