In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#An observation I made while doing this was that the Adam Optimiser requires more hyperparameter tuning than the SGD optimiser.

In [2]:
import torch
from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim

In [3]:
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize( (0.5, 0.5, 0.5), (0.5, 0.5, 0.5) )
                               ])

In [4]:
# Training and test datasets
trainingData = datasets.CIFAR10('data', train=True,
                              download=True, transform=transform)
testData = datasets.CIFAR10('data', train=False,
                             download=True, transform=transform)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting data/cifar-10-python.tar.gz to data
Files already downloaded and verified


In [5]:
dataIndices = np.arange(len(trainingData))
np.random.shuffle(dataIndices)
print(f"Training Images:{len(trainingData)}")
print(f"Test images: {len(testData)}")
print(dataIndices)
#Taking a random 20% of training images for the purpose of Validation
validationIndices, trainingIndices = dataIndices[:10000], dataIndices[10000:]
print(f"Images for validation {len(validationIndices)}")

Training Images:50000
Test images: 10000
[36771  7369 35524 ... 48317 36180 24438]
Images for validation 10000


In [6]:
indexToClass = {n:l for l, n in trainingData.class_to_idx.items()}
#A hash map that has a mapping between the labels and indices

for index, label in indexToClass.items():
    print(label)
#The different classes in alphabetical order

def classDistribution(dataset):
    freqDict = {l:0 for l,n in dataset.class_to_idx.items()}
    #empty frequency dictionary
    
    for element in dataset:
        label = element[1]
        label = indexToClass[label]
        freqDict[label] += 1

    return freqDict

print(f"Frequency of image classes {classDistribution(trainingData)}")

airplane
automobile
bird
cat
deer
dog
frog
horse
ship
truck
Frequency of image classes {'airplane': 5000, 'automobile': 5000, 'bird': 5000, 'cat': 5000, 'deer': 5000, 'dog': 5000, 'frog': 5000, 'horse': 5000, 'ship': 5000, 'truck': 5000}


In [7]:
#Creating samplers for training and validation
validationSampler = SubsetRandomSampler(validationIndices)
trainingSampler = SubsetRandomSampler(trainingIndices)

dataLoader_train = torch.utils.data.DataLoader(trainingData, batch_size = 4, sampler = trainingSampler)
dataLoader_valid = torch.utils.data.DataLoader(trainingData, batch_size = 4, sampler = validationSampler)  
dataLoader_test  = torch.utils.data.DataLoader(testData    , batch_size = 4)

In [8]:
#A utility funtion to un-normalise and convert back from a tensor
def imshow(image):
    image = (image/2) + 0.5 #as currently it has values in the range [-1,1]
    plt.imshow(np.transpose(image, (1, 2, 0)))
    
#Utility function for correct number of labels
def correctLabels(preds, labels):
    return preds.argmax(dim = 1).eq(labels).sum().item()

In [9]:
#Defining layers as class attributes after extending the nn.Module base class
#We have to call the super class constructor to take advantage of the pytorch nn.Functional module
#Using the layers that come in the pytorch library in the class
#A CNN with 2 convolutional layers
#CNN layers get bigger and the FC Layers get smaller
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
    # def __init__(self): #Dubund init
    #     super(Network, self).__init__()
    #     self.conv1 = nn.Conv2d(in_channels = 3, out_channels = 16, kernel_size = 3, padding = 1)
    #     self.conv2 = nn.Conv2d(in_channels = 16, out_channels = 32, kernel_size = 3, padding = 1)
    #     self.conv3 = nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = 3, padding = 1)        
        
    #     self.fc1 = nn.Linear(in_features = 64 * 4 * 4, out_features = 512)
    #     self.fc2 = nn.Linear(in_features = 512, out_features = 64)
    #     self.out = nn.Linear(in_features = 64, out_features = 10)
    #     self.pool = nn.MaxPool2d(kernel_size = 2, stride = 2) #2x2 kernel for max Pooling
    #     # self.dropout = nn.Dropout(0.5) #Hyper parameter
        
    # def forward(self, t):
    #     t = self.pool(F.relu(self.conv1(t)))
    #     t = self.pool(F.relu(self.conv2(t)))
    #     t = self.pool(F.relu(self.conv3(t)))
        
    #     #print(t.shape)
    #     t = t.view(-1, 64 * 4 * 4)
    #     #t = t.view(32 * 4 * 4, -1)
    #     #We want 32 * 4 * 4 input features, letting pytorch handle the number of batches
        
    #     t = self.dropout(t)
    #     t = F.relu(self.fc1(t))
    #     t = self.dropout(t)
    #     t = F.relu(self.fc2(t))
    #     t = self.dropout(t)
    #     t = self.out(t)
    #     #Usually use a softmax layer at the end but the loss that we will be using is 
    #     #the cross entropy loss which implicity performs the softmax operation
        
    #     return t
    
    

#    def __repr__(self): #Overriding a python function
#        return "A simple python class"

In [10]:
Net = Network()
print(Net)
#We get a nice text description as we have extended the nn.Module class

Network(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [11]:
Net.conv1.weight.shape
#A tensor dimension has all the information that we need about a layer
#It is like packaging all our filters into a tensor
#We can see the first axis is representative of our output channels
#The second axis is the number of input channels (depth to each of the filters)
#The third and four axis represent the height and width of the kernel
#On printing we see that this is a rank 4 tensor

torch.Size([6, 3, 5, 5])

In [12]:
Net.fc1.weight.shape
#We see that linear layers can be parametrised by flattened, rank 1 tensors.
#The input is transformed to the output by a rank 2 tensor of weights (weight tensor)

torch.Size([120, 400])

In [13]:
for Name, Param in Net.named_parameters():
    print(Name, '\t\t', Param.shape)

conv1.weight 		 torch.Size([6, 3, 5, 5])
conv1.bias 		 torch.Size([6])
conv2.weight 		 torch.Size([16, 6, 5, 5])
conv2.bias 		 torch.Size([16])
fc1.weight 		 torch.Size([120, 400])
fc1.bias 		 torch.Size([120])
fc2.weight 		 torch.Size([84, 120])
fc2.bias 		 torch.Size([84])
fc3.weight 		 torch.Size([10, 84])
fc3.bias 		 torch.Size([10])


In [14]:
print(Net.conv1.weight.grad)

None


In [15]:
def weights_init(model):
    for m in model.modules():
        if isinstance(m, nn.Linear):
            # initialize the weight tensor, here we use a normal distribution
            m.weight.data = torch.randn(m.weight.data.shape)
            #print(m.weight.data.shape)
            #exit()
        if isinstance(m, nn.Conv2d):
            m.weight.data = torch.randn(m.weight.data.shape)

weights_init(Net)

In [18]:
optimizer = optim.SGD(Net.parameters(), lr = 0.1)

#1 epoch is training the network over all the batches
maxEpoch = 500

for epoch in range(1, 1 + maxEpoch):
    trainingLoss = 0
    validationLoss = 0
    correctClassifications = 0

    for batch in dataLoader_train:        
        images, labels = batch

        predictions = Net(images)
        loss = F.cross_entropy(predictions, labels)

        optimizer.zero_grad() #As pytorch adds to the gradients, we need to initalise it to zero
        loss.backward()
        optimizer.step()

        trainingLoss += loss.item()
        correctClassifications += correctLabels(predictions, labels)

    # for tester in dataLoader_valid:
    #     validationImage, validationLabel = tester
    #     predictions = Net(validationImage)
    #     loss2 = F.cross_entropy(predictions, labels)
    #     validationLoss += loss2.item()
 

    print(f"epoch {epoch}, total correct: {correctClassifications}, loss: {trainingLoss}, accuracy: ~{np.floor(100 * (correctClassifications / len(trainingData)))}")


epoch 1, total correct: 4021, loss: 23079.83822798729, accuracy: ~8.0
epoch 2, total correct: 3895, loss: 23086.77760028839, accuracy: ~7.0
epoch 3, total correct: 3972, loss: 23081.49997997284, accuracy: ~7.0
epoch 4, total correct: 4122, loss: 23082.018139362335, accuracy: ~8.0
epoch 5, total correct: 3998, loss: 23083.186648607254, accuracy: ~7.0
epoch 6, total correct: 3994, loss: 23082.97275543213, accuracy: ~7.0
epoch 7, total correct: 3997, loss: 23078.023597955704, accuracy: ~7.0
epoch 8, total correct: 3994, loss: 23086.685501098633, accuracy: ~7.0
epoch 9, total correct: 4018, loss: 23083.770142793655, accuracy: ~8.0
epoch 10, total correct: 4027, loss: 23079.61675429344, accuracy: ~8.0
epoch 11, total correct: 4029, loss: 23082.91792654991, accuracy: ~8.0
epoch 12, total correct: 3948, loss: 23083.479597330093, accuracy: ~7.0
epoch 13, total correct: 4059, loss: 23086.380138397217, accuracy: ~8.0
epoch 14, total correct: 4014, loss: 23085.42527461052, accuracy: ~8.0
epoch 15

KeyboardInterrupt: ignored

In [19]:
correctClassifications / len(trainingData)

0.07472

In [20]:
print(len(trainingData))

50000


In [21]:
#Testing accuracy
totalLoss = 0.0
correctClassification = list(0. for i in range(10))
totalClassification = list(0. for i in range(10))

for batch in dataLoader_test:
    data, labels = batch
    predictions = Net(data)
    loss = F.cross_entropy(predictions, labels)
    print(loss.item())


2.2838497161865234
2.3075742721557617
2.2341480255126953
2.3242387771606445
2.3355541229248047
2.311145067214966
2.362720251083374
2.375558853149414
2.3799033164978027
2.2755792140960693
2.372445583343506
2.3083839416503906
2.3304896354675293
2.2838497161865234
2.323220729827881
2.2981622219085693
2.3451199531555176
2.3505477905273438
2.336235284805298
2.2910022735595703
2.1707024574279785
2.3966822624206543
2.296116828918457
2.355114221572876
2.3083910942077637
2.3388330936431885
2.188410520553589
2.2988781929016113
2.292734384536743
2.341233253479004
2.2691519260406494
2.2838497161865234
2.3056674003601074
2.2914493083953857
2.361529588699341
2.353261709213257
2.3605778217315674
2.3707494735717773
2.3298778533935547
2.3799033164978027
2.3263471126556396
2.317103147506714
2.3357455730438232
2.335597038269043
2.297025680541992
2.3959219455718994
2.3023176193237305
2.3203649520874023
2.270110845565796
2.294640064239502
2.250002861022949
2.2392122745513916
2.3119053840637207
2.3234784603