<a href="https://colab.research.google.com/github/saikat-roy/Vision-Systems-Lab/blob/master/Assignment4/Assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 4: CudaVision
------
### Group Members:
__1.__ Saikat Roy

__2.__ Albert Gubaidullin

## Import Dependencies
------

In [0]:
import numpy as np

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import dataloader, random_split

from torchvision import datasets, transforms

from torchsummary import summary

import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid

import time

## Softmax Regression MLP object
------
The `SoftmaxRegressionMLP` class is a basic object to create a MLP with ReLU hidden units and a linear output unit (argmax(x) is the same as argmax(softmax(x)). Options while initializing include those of type of non-linearity and the number of layer and number of hidden units.

In [62]:
class ConvNet(nn.Module):

    def __init__(self, n_input, n_output, n_hidden_layers=None, h_units=None, pool='max', 
                 dropout=0.3):
        """
        Initialization for a simply softmax regression MLP model with ReLU activations in hidden layers
        :param n_input (int): Number of input units to network
        :param n_hidden_layers (int): Number of hidden layers in network
        :param n_output (int): Number of output units of network
        :param h_units (int or list): hidden unit count or list of hidden units in each hidden layer of network
        """
        super(ConvNet, self).__init__()
        self.n_out = n_output

        layers = []  
        
        assert type(h_units) is list and n_hidden_layers is None
        
        # Add input layers
        if type(h_units) is list:
            n_hidden_layers = len(h_units)
            layers.extend(self.add_convlayer_and_act(n_input, h_units[0], 'relu'))
        else:
            layers.extend(self.add_convlayer_and_act(n_input, h_units, 'relu'))
        
        # Add hidden layers
        for i in range(1, n_hidden_layers):
            if type(h_units) is list:
                layers.extend(self.add_convlayer_and_act(h_units[i - 1], 
                                            h_units[i], 'relu', pool))
                #layers.extend([nn.Dropout(p=dropout)])  # Dropout added here after each hidden layer
            else:
                layers.extend(self.add_convlayer_and_act(h_units, 
                                            h_units, 'relu', pool))
                #layers.extend([nn.Dropout(p=dropout)])  # Dropout added here
       
        self.block = nn.Sequential(*layers) 
        
        # Add output layer
        if type(h_units) is list:
          self.last = h_units[-1]
        else:
          self.last = h_units
        self.drop = nn.Dropout(dropout)
        self.out = nn.Linear(self.last, n_output)
    
    def forward(self, x):
        """
        Simple forward pass
        :param x:
        :return:
        """
        x = self.block(x)
        # Global Average Pooling
#         print("Size before global average pooling: {}".format(x.size()))
        x = F.avg_pool2d(x, x.size()[2:])
#         print("Size after global average pooling: {}".format(x.size()))
        
        x = x.view(-1,self.last) # Converting into vector of 
                                 # size (batch_size, n_last_hidden)
        x = self.drop(x)
        x = self.out(x)
        return x
    
    def add_convlayer_and_act(self, n_inp, n_out, nl_type, pool='max'):
        """
        
        """
        layer_list = []
        layer_list.extend([nn.Conv2d(n_inp, n_out, kernel_size=(3,3), padding=1), 
                               nn.BatchNorm2d(n_out), self.non_lin(nl_type)])
#         layer_list.extend([nn.Conv2d(n_out, n_out, kernel_size=(3,3)), 
#                                nn.BatchNorm2d(n_out), self.non_lin(nl_type)])
        
        if pool is not None:
          layer_list.append(self.add_pool(pool))
          
        return layer_list
    
    def non_lin(self, nl_type='sigmoid'):
        """
        Simply plugs in a predefined non-linearity from a dictionary to be used throughout the network
        :param nl_type: type based on predefined types. Defaults to sigmoid on wrong type.
        :return:
        """
        nl = {'sigmoid': nn.Sigmoid(), 'relu': nn.ReLU(), 'softmax': nn.Softmax(self.n_out)}
        try:
            return nl[nl_type]
        except:
            print("non linearity type not found. Defaulting to sigmoid.")
            return nl['sigmoid']
          
    def add_pool(self, pool_type):
        """

        """
        p = {"max": nn.MaxPool2d(2), "mean": nn.AvgPool2d(2)}
        assert pool_type in p.keys() # Only max and mean pool supported
        return p[pool_type]


m = ConvNet(3,10,None,[4,5,6]).cuda()
print(summary(m, input_size=(3, 256, 256), batch_size=1))


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [1, 4, 256, 256]             112
       BatchNorm2d-2           [1, 4, 256, 256]               8
              ReLU-3           [1, 4, 256, 256]               0
         MaxPool2d-4           [1, 4, 128, 128]               0
            Conv2d-5           [1, 5, 128, 128]             185
       BatchNorm2d-6           [1, 5, 128, 128]              10
              ReLU-7           [1, 5, 128, 128]               0
         MaxPool2d-8             [1, 5, 64, 64]               0
            Conv2d-9             [1, 6, 64, 64]             276
      BatchNorm2d-10             [1, 6, 64, 64]              12
             ReLU-11             [1, 6, 64, 64]               0
        MaxPool2d-12             [1, 6, 32, 32]               0
          Dropout-13                     [1, 6]               0
           Linear-14                   

## Method for calculating accuracy
------
The `acc` method simply calculates the predictive accuracy of a model on the given dataloader. It returns the true labels, predicted labels and the accuracy as a float in the range [0,1]

In [0]:
def acc(dataloader):
    """
    Calculate accuracy of predictions from model for dataloader.
    :param dataloader: dataloader to evaluate
    :return:
    """
    acc = 0.0
    true_y = []
    pred_y = []
    total = 0.0
    model.eval()
    with torch.no_grad():
        for batch_id, (x, y) in enumerate(dataloader):
            x = x.cuda()
            y = y.cuda()
            #x = x.view(batch_size, -1)
            # print(x[0])
            preds = model(x)
            preds = torch.argmax(preds, dim=1)
            acc += ((preds==y).sum().item())
            total+= y.size(0)

            true_y.extend(list(preds.view(-1).cpu().numpy()))
            pred_y.extend(list(y.view(-1).cpu().numpy()))

        acc/=total
    return true_y, pred_y, acc

## Method for training model in PyTorch
------
The `train` method takes a dataloader object and trains the model on it for the specified amount of iterations and returns the loss per iteration as a list. Also calculates the accuracy on the training validation set per epoch.

In [0]:
def train(train_dataloader, valid_dataloader, iters = 20, suppress_output=False,
         model_save_path = "best.pth"):
    """
    Trains the model on the given dataloader and returns the loss per epoch
    :param dataloader: The autoencoder is trained on the dataloader
    :param iters: iterations for training
    :return:
    """
    loss_l = []
    train_acc_l = []
    valid_acc_l = []
    best_valid_acc = 0.0
    equiv_train_acc = 0.0
    for itr in range(iters):
        av_itr_loss = 0.0
        model.train()
        for batch_id, (x, y) in enumerate(train_dataloader):
            optimizer.zero_grad()
            x = x.cuda()
            y = y.cuda()
            #x = (x>0.5).float() * 1
            #x = x.view(batch_size, -1)
            # print(x[0])
            preds = model(x)
            # print((z==1).sum())
            batch_loss = loss(preds, y)
            batch_loss.backward()
            optimizer.step()
            av_itr_loss += (1/y.size(0))*batch_loss.item()
        loss_l.append(av_itr_loss)
        _, _, train_acc = acc(train_dataloader)
        _, _, valid_acc = acc(valid_dataloader)
        if not suppress_output:
          if itr%1 == 0 or itr==iters-1:
            print("Epoch {}: Loss={}, Training Accuracy:{}, Validation Accuracy:{}"
                  .format(itr, av_itr_loss, train_acc, valid_acc))
        train_acc_l.append(train_acc)
        valid_acc_l.append(valid_acc)
        if valid_acc>best_valid_acc:
          best_valid_acc = valid_acc
          equiv_train_acc = train_acc
          torch.save(model.state_dict(), model_save_path)
    
    model.load_state_dict(torch.load(model_save_path))
    
#   return loss_l, train_acc_l, valid_acc_l
    return loss_l, equiv_train_acc, best_valid_acc


## Method for plotting confusion matrix
------
__Please note that this function has been borrowed from the sklearn tutorial regarding the visualization of confusion matrices__. The function has however had minor modification to shorten its outputs and increase the figure size.

https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

In [0]:
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
"""
THIS CONFUSION MATRIX FUNCTION HAS BEEN BORROWED FROM THE SCIKIT-LEARN TUTORIAL ON GENERATING VISUAL CONFUSION MATRIX
PLOTS. REINVENTING THE WHEEL IN THIS CASE SEEMED TO BE EXTREMELY REDUNDANT.
"""

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
#     print(unique_labels(y_true, y_pred))
#     classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

#   print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    
    # Resize the subplots
    plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0)

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
#     fig.tight_layout()
    return ax

## Initializing Hyperparameters and Datasets
-------
The base Datasets, the test Dataloaders and some training hyperparameters are initialized. The images are resized to `64x64` to allow for a suitably deep CNN to be used for classification . Different types of non-linearities are tested later on the best model from here.

In [66]:
batch_size = 256
n_itr = 25
lr = 0.001

transform_list = transforms.Compose([transforms.Resize(64),
                                     transforms.ToTensor()])
trainset = datasets.CIFAR10('./data', train=True, download=True, transform=transform_list)
testset = datasets.CIFAR10('./data', train=False, download=True, transform=transform_list)

test_dataloader  = torch.utils.data.DataLoader(testset,  batch_size=batch_size, shuffle=False, drop_last=True)

Files already downloaded and verified
Files already downloaded and verified


## Training the model with 1 convolution  before pooling
------
The model is trained using models with different number of filters per layer training algorithms. The architecture is kept basic with sequences of `Conv2d-BatchNorm2d-ReLU-Max/AvgPool2d` blocks.  We use both the generic uniform number of convolutions filters per layers as well as the `VGG` way of doubling filters after downsampling.

###   Generic Way (All layers with uniform feature maps)

Here we train with 64 filters in each layer.

In [67]:
# torch.set_default_tensor_type(torch.cuda.FloatTensor)
loss_type = "Cross_Entropy"
model_layers = [5]
h_units = 64
optim = torch.optim.Adam
lr = 0.0005

# print(axes)

train_split, valid_split = random_split(trainset, [ int(len(trainset)*0.8), 
                                        int(len(trainset)-(len(trainset)*0.8))])
train_dataloader = torch.utils.data.DataLoader(train_split, batch_size=batch_size,
                                               shuffle=True, drop_last=True)
valid_dataloader = torch.utils.data.DataLoader(valid_split, batch_size=batch_size,
                                               shuffle=True, drop_last=True)


for i in range(len(model_layers)):
    t1 = time.time()
    
    model_layer = model_layers[i]
    #color = colors[i]

    print("\nTraining using {} layers:".format(model_layer))
    
    h_unit_list = [int(h_units*1**(i+1)) for i in range(model_layer)]
    #h_unit_list = [h_units for i in range(1,model_layer+1)]
    print(h_unit_list)
    model = ConvNet(3, 10, None, h_unit_list).cuda()
#   print(model)
    summary(model, (3,64,64))
    model.train()
    loss = nn.CrossEntropyLoss()
    optimizer = optim(model.parameters(), lr=lr)
    loss_list, train_acc, valid_acc = train(train_dataloader, valid_dataloader, 
                                            iters=n_itr, model_save_path="model1.pth")
    _,_, test_acc = acc(test_dataloader)
    t2 = time.time()
    print("Time to converge: {} sec".format(t2-t1))
    print("Best train accuracy={}, valid accuracy={} (based on the later)".format(train_acc, valid_acc))
    print("Test accuracy on best model={}".format(test_acc))


Training using 5 layers:
[64, 64, 64, 64, 64]
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 64, 64]           1,792
       BatchNorm2d-2           [-1, 64, 64, 64]             128
              ReLU-3           [-1, 64, 64, 64]               0
         MaxPool2d-4           [-1, 64, 32, 32]               0
            Conv2d-5           [-1, 64, 32, 32]          36,928
       BatchNorm2d-6           [-1, 64, 32, 32]             128
              ReLU-7           [-1, 64, 32, 32]               0
         MaxPool2d-8           [-1, 64, 16, 16]               0
            Conv2d-9           [-1, 64, 16, 16]          36,928
      BatchNorm2d-10           [-1, 64, 16, 16]             128
             ReLU-11           [-1, 64, 16, 16]               0
        MaxPool2d-12             [-1, 64, 8, 8]               0
           Conv2d-13             [-1, 64, 8, 8]         

### Doubling filters after convolution 

Here we train with `16-32-64-128` filters in each layer. 

In [68]:
# torch.set_default_tensor_type(torch.cuda.FloatTensor)
loss_type = "Cross_Entropy"
model_layers = [5]
h_units = 16
optim = torch.optim.Adam
lr = 0.0005

# print(axes)

train_split, valid_split = random_split(trainset, [ int(len(trainset)*0.8), 
                                        int(len(trainset)-(len(trainset)*0.8))])
train_dataloader = torch.utils.data.DataLoader(train_split, batch_size=batch_size,
                                               shuffle=True, drop_last=True)
valid_dataloader = torch.utils.data.DataLoader(valid_split, batch_size=batch_size,
                                               shuffle=True, drop_last=True)


for i in range(len(model_layers)):
    t1 = time.time()
    
    model_layer = model_layers[i]
    #color = colors[i]

    print("\nTraining using {} layers:".format(model_layer))
    
    h_unit_list = [int(h_units*2**(i)) for i in range(model_layer)]
    #h_unit_list = [h_units for i in range(1,model_layer+1)]
    print(h_unit_list)
    model = ConvNet(3, 10, None, h_unit_list).cuda()
#   print(model)
    summary(model, (3,64,64))
    model.train()
    loss = nn.CrossEntropyLoss()
    optimizer = optim(model.parameters(), lr=lr)
    loss_list, train_acc, valid_acc = train(train_dataloader, valid_dataloader, 
                                            iters=n_itr, model_save_path="model2.pth")
    _, _, test_acc = acc(test_dataloader)
    t2 = time.time()
    print("Time to converge: {} sec".format(t2-t1))
    print("Best train accuracy={}, valid accuracy={} (based on the later)".format(train_acc, valid_acc))
    print("Test accuracy on best model={}".format(test_acc))


Training using 5 layers:
[16, 32, 64, 128, 256]
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 64, 64]             448
       BatchNorm2d-2           [-1, 16, 64, 64]              32
              ReLU-3           [-1, 16, 64, 64]               0
         MaxPool2d-4           [-1, 16, 32, 32]               0
            Conv2d-5           [-1, 32, 32, 32]           4,640
       BatchNorm2d-6           [-1, 32, 32, 32]              64
              ReLU-7           [-1, 32, 32, 32]               0
         MaxPool2d-8           [-1, 32, 16, 16]               0
            Conv2d-9           [-1, 64, 16, 16]          18,496
      BatchNorm2d-10           [-1, 64, 16, 16]             128
             ReLU-11           [-1, 64, 16, 16]               0
        MaxPool2d-12             [-1, 64, 8, 8]               0
           Conv2d-13            [-1, 128, 8, 8]       

## Training the model with 2 convolutions  before pooling
------
The model is trained using models with different number of filters per layer training algorithms. The architecture is kept basic with sequences of `Conv2d-BatchNorm2d-ReLU-Conv2d-BatchNorm2d-ReLU-Max/AvgPool2d` blocks.  We use both the generic uniform number of convolutions filters per layers as well as the `VGG` way of doubling filters after downsampling.


### Redefining ConvNet object
-----
Only `add_convlayer_and_act` function has been overridden.

In [0]:
class ConvNet2(ConvNet):
 
  def add_convlayer_and_act(self, n_inp, n_out, nl_type, pool='max'):
    
    """

    """
    layer_list = []
    layer_list.extend([nn.Conv2d(n_inp, n_out, kernel_size=(3,3), padding=1), 
                           nn.BatchNorm2d(n_out), self.non_lin(nl_type)])
    layer_list.extend([nn.Conv2d(n_out, n_out, kernel_size=(3,3), padding=1), 
                           nn.BatchNorm2d(n_out), self.non_lin(nl_type)])

    if pool is not None:
      layer_list.append(self.add_pool(pool))

    return layer_list 

### Generic Way (All layers with uniform feature maps)

Here we train with 64 filters in each layer.

In [70]:
# torch.set_default_tensor_type(torch.cuda.FloatTensor)
loss_type = "Cross_Entropy"
model_layers = [5]
h_units = 64
optim = torch.optim.Adam
lr = 0.0005

# print(axes)

train_split, valid_split = random_split(trainset, [ int(len(trainset)*0.8), 
                                        int(len(trainset)-(len(trainset)*0.8))])
train_dataloader = torch.utils.data.DataLoader(train_split, batch_size=batch_size,
                                               shuffle=True, drop_last=True)
valid_dataloader = torch.utils.data.DataLoader(valid_split, batch_size=batch_size,
                                               shuffle=True, drop_last=True)


for i in range(len(model_layers)):
    t1 = time.time()
    
    model_layer = model_layers[i]
    #color = colors[i]

    print("\nTraining using {} layers:".format(model_layer))
    
    h_unit_list = [int(h_units*1**(i+1)) for i in range(model_layer)]
    #h_unit_list = [h_units for i in range(1,model_layer+1)]
    print(h_unit_list)
    model = ConvNet2(3, 10, None, h_unit_list).cuda()
#   print(model)
    summary(model, (3,64,64))
    model.train()
    loss = nn.CrossEntropyLoss()
    optimizer = optim(model.parameters(), lr=lr)
    loss_list, train_acc, valid_acc = train(train_dataloader, valid_dataloader,
                                            iters=n_itr, model_save_path="model3.pth")
    _,_,test_acc = acc(test_dataloader)
    t2 = time.time()
    print("Time to converge: {} sec".format(t2-t1))
    print("Best train accuracy={}, valid accuracy={} (based on the later)".format(train_acc, valid_acc))
    print("Test accuracy on best model={}".format(test_acc))


Training using 5 layers:
[64, 64, 64, 64, 64]
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 64, 64]           1,792
       BatchNorm2d-2           [-1, 64, 64, 64]             128
              ReLU-3           [-1, 64, 64, 64]               0
            Conv2d-4           [-1, 64, 64, 64]          36,928
       BatchNorm2d-5           [-1, 64, 64, 64]             128
              ReLU-6           [-1, 64, 64, 64]               0
         MaxPool2d-7           [-1, 64, 32, 32]               0
            Conv2d-8           [-1, 64, 32, 32]          36,928
       BatchNorm2d-9           [-1, 64, 32, 32]             128
             ReLU-10           [-1, 64, 32, 32]               0
           Conv2d-11           [-1, 64, 32, 32]          36,928
      BatchNorm2d-12           [-1, 64, 32, 32]             128
             ReLU-13           [-1, 64, 32, 32]         

### Doubling filters after convolution 

Here we train with `16-32-64-128` filters in each layer. 

In [71]:
# torch.set_default_tensor_type(torch.cuda.FloatTensor)
loss_type = "Cross_Entropy"
model_layers = [5]
h_units = 16
optim = torch.optim.Adam
lr = 0.0005

train_split, valid_split = random_split(trainset, [ int(len(trainset)*0.8), 
                                        int(len(trainset)-(len(trainset)*0.8))])
train_dataloader = torch.utils.data.DataLoader(train_split, batch_size=batch_size,
                                               shuffle=True, drop_last=True)
valid_dataloader = torch.utils.data.DataLoader(valid_split, batch_size=batch_size,
                                               shuffle=True, drop_last=True)


for i in range(len(model_layers)):
    t1 = time.time()
    
    model_layer = model_layers[i]
    #color = colors[i]

    print("\nTraining using {} layers:".format(model_layer))
    
    h_unit_list = [int(h_units*2**(i)) for i in range(model_layer)]
    #h_unit_list = [h_units for i in range(1,model_layer+1)]
    print(h_unit_list)
    model = ConvNet(3, 10, None, h_unit_list).cuda()
#   print(model)
    summary(model, (3,64,64))
    model.train()
    loss = nn.CrossEntropyLoss()
    optimizer = optim(model.parameters(), lr=lr)
    loss_list, train_acc, valid_acc = train(train_dataloader, valid_dataloader, 
                                            iters=n_itr, model_save_path="model4.pth")
    _, _, test_acc = acc(test_dataloader)
    t2 = time.time()
    print("Time to converge: {} sec".format(t2-t1))
    print("Best train accuracy={}, valid accuracy={} (based on the later)".format(train_acc, valid_acc))
    print("Test accuracy on best model={}".format(test_acc))


Training using 5 layers:
[16, 32, 64, 128, 256]
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 64, 64]             448
       BatchNorm2d-2           [-1, 16, 64, 64]              32
              ReLU-3           [-1, 16, 64, 64]               0
         MaxPool2d-4           [-1, 16, 32, 32]               0
            Conv2d-5           [-1, 32, 32, 32]           4,640
       BatchNorm2d-6           [-1, 32, 32, 32]              64
              ReLU-7           [-1, 32, 32, 32]               0
         MaxPool2d-8           [-1, 32, 16, 16]               0
            Conv2d-9           [-1, 64, 16, 16]          18,496
      BatchNorm2d-10           [-1, 64, 16, 16]             128
             ReLU-11           [-1, 64, 16, 16]               0
        MaxPool2d-12             [-1, 64, 8, 8]               0
           Conv2d-13            [-1, 128, 8, 8]       

## Average Pooling on Best Model of Max Pooling (Overall Best Model)
------
Average Pooling is used to replace all the subsampling layers in our best model where the accuracy is `0.8118`. The average pooling version gives an accuracy of `0.8220`. Therefore in our case, average pooling seems to work better than max pooling and forms the best model we have.

In [74]:
model_layers = [5]
h_units = 64
optim = torch.optim.Adam
lr = 0.0005

# print(axes)

train_split, valid_split = random_split(trainset, [ int(len(trainset)*0.8), 
                                        int(len(trainset)-(len(trainset)*0.8))])
train_dataloader = torch.utils.data.DataLoader(train_split, batch_size=batch_size,
                                               shuffle=True, drop_last=True)
valid_dataloader = torch.utils.data.DataLoader(valid_split, batch_size=batch_size,
                                               shuffle=True, drop_last=True)


for i in range(len(model_layers)):
    t1 = time.time()
    
    model_layer = model_layers[i]
    #color = colors[i]

    print("\nTraining using {} layers:".format(model_layer))
    
    h_unit_list = [int(h_units*1**(i+1)) for i in range(model_layer)]
    #h_unit_list = [h_units for i in range(1,model_layer+1)]
    print(h_unit_list)
    model = ConvNet2(3, 10, None, h_unit_list, pool='mean').cuda()
#   print(model)
    summary(model, (3,64,64))
    model.train()
    loss = nn.CrossEntropyLoss()
    optimizer = optim(model.parameters(), lr=lr)
    loss_list, train_acc, valid_acc = train(train_dataloader, valid_dataloader,
                                            iters=n_itr, model_save_path="model5.pth")
    _,_,test_acc = acc(test_dataloader)
    t2 = time.time()
    print("Time to converge: {} sec".format(t2-t1))
    print("Best train accuracy={}, valid accuracy={} (based on the later)".format(train_acc, valid_acc))
    print("Test accuracy on best model={}".format(test_acc))



Training using 5 layers:
[64, 64, 64, 64, 64]
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 64, 64]           1,792
       BatchNorm2d-2           [-1, 64, 64, 64]             128
              ReLU-3           [-1, 64, 64, 64]               0
            Conv2d-4           [-1, 64, 64, 64]          36,928
       BatchNorm2d-5           [-1, 64, 64, 64]             128
              ReLU-6           [-1, 64, 64, 64]               0
         MaxPool2d-7           [-1, 64, 32, 32]               0
            Conv2d-8           [-1, 64, 32, 32]          36,928
       BatchNorm2d-9           [-1, 64, 32, 32]             128
             ReLU-10           [-1, 64, 32, 32]               0
           Conv2d-11           [-1, 64, 32, 32]          36,928
      BatchNorm2d-12           [-1, 64, 32, 32]             128
             ReLU-13           [-1, 64, 32, 32]         