### Imports 

In [2]:
import torch 
import torchvision
import torch.nn as nn 
import torch.nn.functional as F
from IPython.display import Image 
from torchvision import transforms
import matplotlib.pyplot as plt
import random
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed = 12345
random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7efe84c06648>

#### Mount your google drive so you can save model checkpoints, and report your test results on the final best model after hyperparameter tuning

In [3]:
from google.colab import drive
drive.mount('/content/drive')
!ls "/content/drive/My Drive"

Mounted at /content/drive
'2017 Fall Semester Schedule.gsheet'
'2018 Fall Semester Schedule.gsheet'
'2018 Spring Semester Schedule.gsheet'
'av parking'
'Colab Notebooks'
 Control_System_Engineering_Norman_S._Nis.pdf
 DataStructuresAndAlgorithmAnalysisInCpp_2014.pdf
'Downtown Parking Model.gslides'
 FAV_log_YilangHao.gdoc
 foo.txt
'homework investment theory-7.doc'
 housing
 HW#7.gdoc
'KCF Algorithm.pdf'
'MA 224 Probability and Statistical Inference Student Manual.pdf'
 new-doc-2018-11-29-19.51.03_20181129195230.pdf
 nyu
'Physics 9th Edition.pdf'
 Study.rar
'Traffic Flow Theory.gslides'
 VID_20151118_165741.mp4
 VID_20180510_172439~2.mp4
 VID_20180510_172439.mp4
 YilangHao_HW1_ece335.pdf
 YilangHao_HW3_ece335.m
 YilangHao_HW4.pdf
 YilangHao_HW5_ece335.pdf
 YilangHao_HW5_ece335_revised.pdf
 YilangHao_MengxueGao_HW.pdf
'Yilang Hao_Resume_20170921.docx'
'Yilang Hao_Resume_20170921.pdf'
'Yilang Hao_Senior_Resume_20180926.docx'
'Yilang Hao_Senior_Resume_20181002.pdf'
 报告.gdoc
'数据结构与算法分析 C++语

### Data loading 

##### Run the following cells to load the dataset. Setting download=True will download it for you.

In [4]:
cifar10_train = torchvision.datasets.CIFAR10(root='./cifar10', 
                                             train=True, 
                                             transform=None, 
                                             target_transform=None,
                                             download=True)
cifar10_test = torchvision.datasets.CIFAR10(root='./cifar10', 
                                             train=False, 
                                             transform=None, 
                                             target_transform=None,
                                             download=True)

# Divides the dataset into train and val so that we can use the val to choose our hyperparameters
train_dataset, val_dataset = torch.utils.data.random_split(cifar10_train, [40000, 10000], 
                                                           generator=torch.Generator().manual_seed(12345))
test_dataset = cifar10_test

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar10/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./cifar10/cifar-10-python.tar.gz to ./cifar10
Files already downloaded and verified


In [5]:
# Helper code to support adding different transforms on the dataset lazily after downloading the dataset
# From https://discuss.pytorch.org/t/apply-different-transform-data-augmentation-to-train-and-validation/63580/5
class MapDataset(torch.utils.data.Dataset):
    """
    Given a dataset, creates a dataset which applies a mapping function
    to its items (lazily, only when an item is called).

    Note that data is not cloned/copied from the initial dataset.
    """

    def __init__(self, dataset, map_fn):
        self.dataset = dataset
        self.map = map_fn

    def __getitem__(self, index):
        if self.map:     
            x = self.map(self.dataset[index][0]) 
        else:     
            x = self.dataset[index][0]  
        y = self.dataset[index][1]         
        return x, y

    def __len__(self):
        return len(self.dataset)

#### Standard transforms to apply on images - Convert to tensors and normalize with mean and std. These are the basic transforms that you will always apply. The mean and std have been pre calculated on the training set. 

In [6]:
# Notice that we apply the same mean and std normalization calculated on train, to both the train and test datasets.
test_transform = transforms.Compose([
                                     transforms.ToTensor(),
                                     transforms.Normalize(
                                         [0.4373, 0.4434, 0.4725],
                                         [0.1201, 0.1231, 0.1052])
                                     ])

train_transform = transforms.Compose([
                                      transforms.ToTensor(),
                                      transforms.Normalize(
                                          [0.4373, 0.4434, 0.4725],
                                          [0.1201, 0.1231, 0.1052])
                                      ])


In [7]:
train_dataset_w_transform  = MapDataset(train_dataset, train_transform)
val_dataset_w_transform = MapDataset(val_dataset, test_transform)
test_dataset_w_transform = MapDataset(test_dataset, test_transform)

In [8]:
bs = 128 #batch size
torch.backends.cudnn.benchmark = True
train_loader = DataLoader(train_dataset_w_transform, batch_size=bs, shuffle=True, drop_last=False,num_workers=10, pin_memory=True)
val_loader = DataLoader(val_dataset_w_transform, batch_size=bs, shuffle=False, drop_last=False,num_workers=10, pin_memory=True)
test_loader = DataLoader(test_dataset_w_transform, batch_size=bs, shuffle=False, drop_last=False,num_workers=10, pin_memory=True)

### Q 2.1 Training loop

In this question, fill in the missing parts to build a generic training loop that returns the train and validation losses and accuracies. The #TODOs will guide you through the key points and you should fill some code for each #TODO. You might need to add some additional code for bookkeeping the losses and accuracies

In [44]:
def train_loop(model, criterion, optimizer,  train_loader, val_loader):
    """
    Generic training loop

    Parameters
    ----------
    model : Object instance of your model class 
    criterion : Loss function 
    optimizer : Instance of optimizer class of your choice 
    train_loader : Training data loader 
    val_loader : Validation data loader

    Returns
    -------
    train_losses : List with train loss on dataset per epoch
    train_accuracies : List with train accuracy on dataset per epoch
    val_losses : List with validation loss on dataset per epoch
    val_accuracies : List with validation accuracy on dataset per epoch

    """
    best_val = 0.0
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []
    max_patience = 5
    patience_counter = 0


    
    # Training
    for t in tqdm(range(50)):

        # TODO : Set the model to train mode   
        model.train()   

        correct = 0
        correct_v = 0 
        loss_v = 0 
        val_loss = 0

        # TODO: Loop over the training set 
        for batch_idx, (data, target) in enumerate(train_loader):

            # TODO: Put the inputs and targets on the write device
            data = data.to(device) 
            target = target.to(device)
            # TODO: Feed forward to get the logits           
            output = model(data)

            # TODO: Compute the loss and accuracy
            loss = criterion(output, target)
            train_losses.append(loss.item())
            pred = output.data.max(1, keepdim=True)[1] 
            correct += pred.eq(target.data.view_as(pred)).cpu().sum().item()
            accuracy = 100. * correct / len(test_loader.dataset)
            train_accuracies.append(accuracy)


            # TODO: zero the gradients before running
            # the backward pass.
            optimizer.zero_grad()

            # TODO: Backward pass to compute the gradient
            # of loss w.r.t our learnable params. 
            loss.backward()

            # TODO: Update params
            optimizer.step()
            # TODO: Keep track of accuracy and loss
            if batch_idx % 100 == 0:
              print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, Accuracy: ({:.0f}%)\n'.format(
                t, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item(), accuracy))            

        
        # TODO: Switch the model to eval mode
        model.eval()


        with torch.no_grad():
            # TODO: Loop over the validation set 
            for data_v, target_v in val_loader:
            
                # TODO: Put the inputs and targets on the write device
                data_v, target_v = data.to(device), target.to(device)
            
                # TODO: Feed forward to get the logits 
                output_v = model(data_v)

                # TODO: Compute the loss and accuracy
                loss_v += criterion(output_v, target_v).item()
                val_loss /= len(val_loader.dataset)
                val_losses.append(val_loss)

                pred_v = output_v.data.max(1, keepdim=True)[1] 
                correct_v += pred_v.eq(target_v.data.view_as(pred_v)).cpu().sum().item()
                accuracy_v = 100. * correct_v / len(val_loader.dataset)
                val_accuracies.append(accuracy_v)

                # TODO: Keep track of accuracy and loss
                print('\nValidation set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
                    val_loss, correct, len(val_loader.dataset),
                    accuracy_v))

        if val_accuracies[-1] > best_val:
          best_val = val_accuracies[-1]
          patience_counter = 0

          # TODO: Save best model, optimizer, epoch_number
          state = {
            'epoch': t,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()}
          path = '/content/'
          torch.save(state, path) 
          #did not select a specific PATH
          
        else:
          patience_counter += 1    
          if patience_counter > max_patience: 
            break

        print("[EPOCH]: %i, [TRAIN LOSS]: %.6f, [TRAIN ACCURACY]: %.3f" % (t, train_losses[-1], train_accuracies[-1]))
        print("[EPOCH]: %i, [VAL LOSS]: %.6f, [VAL ACCURACY]: %.3f \n" % (t, val_losses[-1] ,val_accuracies[-1]))

    return train_losses, train_accuracies, val_losses, val_accuracies

### Q 2.2 Shallow convolutional network. 

In [10]:
class View(nn.Module):
    def __init__(self, shape):
      super().__init__()
      self.shape = shape

    def forward(self, x):
        return x.view(*self.shape)

In [11]:

ShallowNet =  nn.Sequential(
      nn.Conv2d(in_channels = 3, out_channels = 64, kernel_size=5, padding=2),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=2),
      nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size=3, padding=1),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=2),
      nn.Conv2d(in_channels = 128, out_channels = 256, kernel_size=3, padding=1),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=8),
      View((-1,256)),
      nn.Linear(256, 10),
)

#### Write the object oriented version of ShallowNet

In [12]:
class ShallowConvnet(nn.Module):
    def __init__(self, input_channels, num_classes):
        """

        Parameters
        ----------
        input_channels : Number of input channels
        num_classes : Number of classes for the final prediction 
        """
        
        # TODO
        super(ShallowConvnet, self).__init__()
        self.num_classes = num_classes
        self.conv1 = nn.Conv2d(in_channels = 3, out_channels = 64, kernel_size = 5, padding =2)
        self.conv2 = nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels = 128, out_channels = 256, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(256, num_classes)


    def forward(self, x):
        """

        Parameters
        ----------
        x

        Returns
        -------
        output : Result after running through the model
        """
        
        # TODO
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size = 2)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size = 2)
        x = self.conv3(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size = 8)
        x = x.view(*(-1,256))
        output = self.fc1(x)


        return output


        

### Q2.3 Instantiate the model and run this using an SGD optimizer, with the appropriate loss function for classification

Report the learning curves (training and validation accuracy vs number of epochs)

In [45]:
# TODO : Initialize the model and cast to correct device
model_23 = ShallowConvnet(3,10)
model_23.to(device)
# TODO : Initialize the criterion
criterion = nn.CrossEntropyLoss()
# TODO : Initialize the SGD optimizer with lr 1e-3
optimizer = torch.optim.SGD(model_23.parameters(), lr = 0.001)
# TODO : Run the training loop using this model
train_losses, train_accuracies, val_losses, val_accuracies = train_loop(model_23, criterion, optimizer, train_loader, val_loader)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))






Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (0%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (0%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (0%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (0%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (1%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (1%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (1%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (1%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (1%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (1%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (1%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (1%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (2%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (2%)


Validation set: Average loss: 0.0000, Accuracy: 5435/10000 (2%)


Valid

IsADirectoryError: ignored

In [43]:
!pwd
!path
!ls
!cd drive
!pwd

/content
/bin/bash: path: command not found
cifar10  drive	sample_data
/content


### Q2.4 Simple convolution network - 

Design a convolutional neural network with the following specification: 
For each convolution layer, use appropriate padding such that it maintains the resolution of the image. The resolution should be changing only when you introduce maxpooling layers. Each convolution layer should be followed by a relu non-linearity. The first two blocks containing 3 convolutional layers are each followed by a maxpooling layer that halves the resolution of the image. After the third block, use maxpooling to get a resolution of 1 X 1. Finally, apply a linear transformation to project to the number of classes. 

Structure of the convolution layers of the model:
1. Number of input channels to the model = 3
2. First convolution layer of kernel size 5 with filter size 64 and padding such that it maintains the resolution of the image.
3. Followed by a block of 3 convolution layers of kernel size 3, filter size 64 and padding such that it maintains the resolution of the image.
5. Followed by 3 convolution layers of kernel size 3, filter size 128 and padding such that it maintains the resolution of the image.
6. Followed by 3 convolution layers of kernel size 3, filter size 256 and padding such that it maintains the resolution of the image. 




In [34]:
# Use the description of the structure of the model and the hints given below 

class SimpleConvnet(nn.Module):
    def __init__(self, input_channels, num_classes):
        super(SimpleConvnet, self).__init__()
        # TODO
        self.num_classes = num_classes
        self.input_channels = input_channels
        self.conv1 = nn.Conv2d(in_channels= input_channels, out_channels= 64, kernel_size= 5, padding= 2)
        self.conv2 = nn.Conv2d(in_channels = 64, out_channels= 64, kernel_size= 3, padding= 1)
        self.conv3 = nn.Conv2d(in_channels = 64, out_channels= 64, kernel_size= 3, padding= 1)
        self.conv4 = nn.Conv2d(in_channels = 64, out_channels= 64, kernel_size= 3, padding= 1)
        self.conv5 = nn.Conv2d(in_channels = 64, out_channels= 128, kernel_size= 3, padding= 1)
        self.conv6 = nn.Conv2d(in_channels = 128, out_channels= 128, kernel_size= 3, padding= 1)
        self.conv7 = nn.Conv2d(in_channels = 128, out_channels= 128, kernel_size= 3, padding= 1)
        self.conv8 = nn.Conv2d(in_channels = 128, out_channels= 256, kernel_size= 3, padding= 1)
        self.conv9 = nn.Conv2d(in_channels = 256, out_channels= 256, kernel_size= 3, padding= 1)
        self.conv10 = nn.Conv2d(in_channels = 256, out_channels= 256, kernel_size= 3, padding= 1)
        self.fc1 = nn.Linear(256,num_classes)

    
    def forward(self, x):
       
        # TODO 

        # HINT: first conv layer 
        x = F.relu(self.conv1(x))


        # HINT: block of 3 conv 
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = F.max_pool2d(x, kernel_size = 2)


        # HINT : block of 3 conv 
        x = F.relu(self.conv5(x))
        x = F.relu(self.conv6(x))
        x = F.relu(self.conv7(x))
        x = F.max_pool2d(x, kernel_size = 2)


        # HINT: block of 3 conv 
        x = F.relu(self.conv8(x))
        x = F.relu(self.conv9(x))
        x = F.relu(self.conv10(x))
        x = F.max_pool2d(x, kernel_size = 8)


        # HINT: projection 
        x = x.view(-1,256)
        output = self.fc1(x)

        
        return output
        

In [30]:
X = torch.tensor([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]])
X = torch.reshape(torch.arange(16, dtype=torch.float32), (1, 1, 4, 4))
X = torch.cat((X, X + 1), 1)
print(X)
y = F.max_pool2d(X, kernel_size= 2)
print(y)
z = y.view(*(-1,2))
print(z)
w = nn.Linear(2,1)(z)
print(w)

tensor([[[[ 0.,  1.,  2.,  3.],
          [ 4.,  5.,  6.,  7.],
          [ 8.,  9., 10., 11.],
          [12., 13., 14., 15.]],

         [[ 1.,  2.,  3.,  4.],
          [ 5.,  6.,  7.,  8.],
          [ 9., 10., 11., 12.],
          [13., 14., 15., 16.]]]])
tensor([[[[ 5.,  7.],
          [13., 15.]],

         [[ 6.,  8.],
          [14., 16.]]]])
tensor([[ 5.,  7.],
        [13., 15.],
        [ 6.,  8.],
        [14., 16.]])
tensor([[2.0862],
        [2.7867],
        [2.1737],
        [2.8743]], grad_fn=<AddmmBackward>)


In [1]:
# TODO : Initialize the model and cast to correct device
input_channels = 3
num_classes = 10
model_24 = SimpleConvnet(input_channels,num_classes)
model_24.to(device)

# TODO : Initialize the criterion 
criterion = nn.CrossEntropyLoss()
# TODO : Initialize the optimizer 
optimizer = torch.optim.SGD(model_24.parameters(), lr = 0.001)
# TODO : Run the training loop using this model
train_losses, train_accuracies, val_losses, val_accuracies = train_loop(model_24, criterion, optimizer, train_loader, val_loader)

NameError: ignored

### Q 2.5 Report results of training using SGD optimizer for both ShallowNet and SimpleConvnet. What do you observe?

### Q 2.6 Add batch normalization

#### Q2.6 a After each relu layer, add a batch normalization layer to the network SimpleConvnet you created above

In [None]:
class SimpleConvnet2(nn.Module):
    def __init__(self, input_channels, num_classes):
        super(SimpleConvnet2, self).__init__()
        
        # TODO
        self.num_classes = num_classes
        self.input_channels = input_channels
        self.conv1 = nn.Conv2d(in_channels= input_channels, out_channels= 64, kernel_size= 5, padding= 2)
        self.conv2 = nn.Conv2d(in_channels = 64, out_channels= 64, kernel_size= 3, padding= 1)
        self.conv3 = nn.Conv2d(in_channels = 64, out_channels= 64, kernel_size= 3, padding= 1)
        self.conv4 = nn.Conv2d(in_channels = 64, out_channels= 64, kernel_size= 3, padding= 1)
        self.conv5 = nn.Conv2d(in_channels = 64, out_channels= 128, kernel_size= 3, padding= 1)
        self.conv6 = nn.Conv2d(in_channels = 128, out_channels= 128, kernel_size= 3, padding= 1)
        self.conv7 = nn.Conv2d(in_channels = 128, out_channels= 128, kernel_size= 3, padding= 1)
        self.conv8 = nn.Conv2d(in_channels = 128, out_channels= 256, kernel_size= 3, padding= 1)
        self.conv9 = nn.Conv2d(in_channels = 256, out_channels= 256, kernel_size= 3, padding= 1)
        self.conv10 = nn.Conv2d(in_channels = 256, out_channels= 256, kernel_size= 3, padding= 1)
        self.fc1 = nn.Linear(256,num_classes)
        self.bn1 = nn.BatchNorm2d(64)
        self.bn2 = nn.BatchNorm2d(128)
        self.bn3 = nn.BatchNorm2d(256)
    
    def forward(self, x):
       
        # TODO
        x = F.relu(self.conv1(x))
        x = self.bn1(x)

        x = F.relu(self.conv2(x))
        x = self.bn1(x)
        x = F.relu(self.conv3(x))
        x = self.bn1(x)
        x = F.relu(self.conv4(x))
        x = self.bn1(x)
        x = F.max_pool2d(x, kernel_size = 2)

        x = F.relu(self.conv5(x))
        x = self.bn2(x)
        x = F.relu(self.conv6(x))
        x = self.bn2(x)
        x = F.relu(self.conv7(x))
        x = self.bn2(x)
        x = F.max_pool2d(x, kernel_size = 2)

        x = F.relu(self.conv8(x))
        x = self.bn3(x)
        x = F.relu(self.conv9(x))
        x = self.bn3(x)
        x = F.relu(self.conv10(x))
        x = self.bn3(x)
        x = F.max_pool2d(x, kernel_size = 8)

        x = x.view(-1,256)
        output = self.fc1(x)

        
        return output
        

In [None]:
# TODO : Initialize the model and cast to correct device
input_channels = 3
num_classes = 10
model_26 = SimpleConvnet2(input_channels,num_classes)
model_26.to(device)
# TODO : Initialize the criterion 
criterion = nn.CrossEntropyLoss()
# TODO : Initialize the optimizer 
optimizer = torch.optim.SGD(model_26.parameters(), lr = 0.001)
# TODO : Run the training loop using this model
train_losses_bn, train_accuracies_bn, val_losses_bn, val_accuracies_bn = train_loop(model_26, criterion, optimizer, train_loader, val_loader)

#### Q2.6 b Plot the the training curves (training loss vs \# epochs, training accuracy vs # epochs) using SGD (lr 1e-3) with and without batch normalization. Comment on the difference. 

#### Q2.6 c. Try running the same two networks with an Adam optimizer (lr 1e-4). Plot the the training curves (training loss vs \# epochs, training accuracy vs # epochs) with and without batch normalization. Comment on the difference. 

#### Q2.6 d Once you choose an optimizer and see that it does train, make sure your model has enough capacity by overfitting on one batch of the data set. You should be able to get 100% train accuracy. 


### Q 2.7 Add residual connections

#### Residual connections help stabilise training and lead to faster convergence. In this question you will introduce residual connections into the SimpleConvnet2 model that you built above. 

We will add residual connections after each block of 3 convolutional layers. Lets consider the first block of three convolutional layers. The input to this block, the so called residual, is added to the output of the block before the final batch normalization layer of that block. 


IMP NOTE: You will notice that the number of filters of these two summands are not the same. For this, you will need to use a convolution layer on the residual component, which changes the number of filters while keeping the rest of the dimensions the same. This can be achieved with a careful selection of the input_channels, output_channels, kernel_size and padding parameters. 

In [None]:
class ResidualConvnet(nn.Module):
    def __init__(self, input_channels, num_classes):
        super(ResidualConvnet, self).__init__()
        
        # TODO
    
    def forward(self, x):
        
        # TODO 
        return output
        

In [None]:
# TODO : Initialize the model and cast to correct device

# TODO : Initialize the criterion 

# TODO : Initialize the optimizer 

# TODO : Run the training loop using this model

### Q 2.8 Plot the training curves with and without the residual connection. Comment on the difference. 

### Q2.9 Reducing overfiting 



In the previous questions, you might have observed that there is a large difference between the training and validation losses. This is a sign that the model is overfitting. One way to combat this is by adding random transformations to the input data to make your model more robust and prevent it from memorizing the input data. 

Torchvision provides several transforms that you can readily apply to your data. Experiment with adding a few transforms and report your results in terms of learning curves to see if the gap between the training and validation loss reduces and try to achieve better perfomance on the validation set. 

In [None]:
test_transform = transforms.Compose([
                                     transforms.ToTensor(),
                                     transforms.Normalize(
                                         [0.4373, 0.4434, 0.4725],
                                         [0.1201, 0.1231, 0.1052])
                                     ])

train_transform = transforms.Compose([
                                      # TODO: Add more transforms here 
                                      transforms.ToTensor(),
                                      transforms.Normalize(
                                          [0.4373, 0.4434, 0.4725],
                                          [0.1201, 0.1231, 0.1052])
                                      ])


In [None]:
train_dataset_w_transform  = MapDataset(train_dataset, train_transform)
val_dataset_w_transform = MapDataset(val_dataset, test_transform)
test_dataset_w_transform = MapDataset(test_dataset, test_transform)

In [None]:
bs = 128
train_loader = DataLoader(train_dataset_w_transform, batch_size=bs, shuffle=True, drop_last=False)
val_loader = DataLoader(val_dataset_w_transform, batch_size=bs, shuffle=False, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=False, drop_last=False)

#### Use the residual network that you built above and use data augmentation to reduce the overfitting 

In [None]:
# TODO : Initialize the model and cast to correct device

# TODO : Initialize the criterion 

# TODO : Initialize the optimizer 

# TODO : Run the training loop using this model

### Q 2.10  Effect of learning rate decay 

#### Here you need to change the training loop to have one additional operation - add the scheduler step at the end of each epoch. Experiment with different learning rate schedulers provided by pytorch. Report results using atleast StepLR. 

In [None]:
from torch.optim.lr_scheduler import StepLR

In [None]:
def train_loop2(model, criterion, optimizer, scheduler,  train_loader, val_loader):
    """
    Generic training loop

    Parameters
    ----------
    model : Object instance of your model class 
    criterion : Loss function 
    optimizer : Instance of optimizer class of your choice 
    scheduler : Instance of scheduler class of your choice 
    train_loader : Training data loader 
    val_loader : Validation data loader

    Returns
    -------
    train_losses : List with train loss on dataset per epoch
    train_accuracies : List with train accuracy on dataset per epoch
    val_losses : List with validation loss on dataset per epoch
    val_accuracies : List with validation accuracy on dataset per epoch

    """
    best_val = 0.0
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []
    max_patience = 5
    patience_counter = 0

    # Training
    for t in tqdm(range(50)):

        # TODO : Set the model to train mode        

        # TODO: Loop over the training set 

            # TODO: Put the inputs and targets on the write device
            
            # TODO: Feed forward to get the logits

            # TODO: Compute the loss and accuracy

            # TODO: zero the gradients before running
            # the backward pass.

            # TODO: Backward pass to compute the gradient
            # of loss w.r.t our learnable params. 

            # TODO: Update params
            
            # TODO: Keep track of accuracy and loss

        
        # Switch the model to eval mode
        # TODO


        with torch.no_grad():
            # TODO: Loop over the validation set 
            
                # TODO: Put the inputs and targets on the write device
            
                # TODO: Feed forward to get the logits

                # TODO: Compute the loss and accuracy

                # TODO: Keep track of accuracy and loss

        if val_accuracies[-1] > best_val:
          best_val = val_accuracies[-1]
          patience_counter = 0

          # TODO: Save best model, optimizer, epoch_number
          
        else:
          patience_counter += 1    
          
          if patience_counter > max_patience: 
            break

        print("[EPOCH]: %i, [TRAIN LOSS]: %.6f, [TRAIN ACCURACY]: %.3f" % (t, train_losses[-1], train_accuracies[-1]))
        print("[EPOCH]: %i, [VAL LOSS]: %.6f, [VAL ACCURACY]: %.3f \n" % (t, val_losses[-1] ,val_accuracies[-1]))

        # TODO : scheduler step

    return train_losses, train_accuracies, val_losses, val_accuracies

In [None]:
# TODO : Initialize the model and cast to correct device

# TODO : Initialize the criterion 

# TODO : Initialize the optimizer 

# TODO : Initialize the 

# TODO : Run the training loop using this model

### Q2.11 Hyper parameter tuning 


#### Experiment with a range of learning rates and optimizers, as well as the parameter in the learning rate scheduler for StepLR. Report the following plots: 

1. Learning curves (training and validation loss for 5 different learning rate with SGD optimizer)
2. Learning curves (training and validation loss for 5 different learning rate with Adam optimizer)
3. Learning curves (training and validation loss for 5 different gamma parameter for the StepLR)





### Q2.12 Load the model that gave you best validation accuracy and report results on the test set. 