In [1]:
import torch
import torchvision
import torch.nn as nn
import torchvision.transforms as transforms
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [2]:
# Hyperparameter

num_epochs = 10
batch_size = 5
learning_rate = 0.01

In [3]:
# Device Configuration

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# DataLoader & Data Transforms

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))])

train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform = transform)

test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size = batch_size, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [5]:
# Hardcoded classes {will be used for later verification individual accuracy of the classes}

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


In [6]:
class ConvNet(nn.Module):

  def __init__(self):

    super(ConvNet, self).__init__()

    self.conv1 = nn.Conv2d(3,6,5) # '3' --> no. of input channels {color channels, so '3' colour channels}, '6' --> no. of o/p channels
                                  # '6' --> no. of o/p channels
                                  # '5' is kernel size of image ==> meaning 5x5 is image array size

    self.pool = nn.MaxPool2d(2,2) # First '2' --> Kernel size of image filter ==> so 2x2 is filter size.
                                  # Second '2' --> Size of the stride: no. of jumping steps.

    self.conv2 = nn.Conv2d(6, 16, 5) # '6' --> Input from the previous output '6' of conv1
                                     # '16' is arbitrary o/p size, '5': same as in conv1

    self.fc1 = nn.Linear(16*5*5, 120) # '16x5x5' obtained from a formula [see note below]: mandatory to be fixed acc. to that rule
                                      # '120' is arbitrary o/p size [can be played with]

    self.fc2 = nn.Linear(120, 84) # '120' from o/p of fc1, '84' is arbitrary o/p size [can be played with]

    self.fc3 = nn.Linear(84, 10)  # '84' from o/p of fc2, '10' is no. of classes which should always be the o/p of the last layer

    # Note: Since we implement a Cross Entropy Loss function, we shouldn't implement the softmax activation function in the last layer.

  def forward(self,x):

    x = self.pool(F.relu(self.conv1(x))) # First Conv+ReLu+Pool Layer
    x = self.pool(F.relu(self.conv2(x))) # Second Conv+ReLu+Pool Layer

    # Now before proceeding on to the fully connected layers, we've to flatten the existing layers

    x = x.view(-1, 16*5*5) # Resizing, hence flattening

    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = F.relu(self.fc3(x))

    return x






#### **Formula that was being referred to above: (W-F+2P)/S + 1. W=Image Width (Kernel Size of Image), F = Filter Size (Size of Image Filter), P = Size of Padding, S = Stride Size**

#### So, if we substitute the o/p generated by the first conv layer --> which is 32x32, W=32, F=5, P=0, S=1; so new size is: (32-5+0)/1 = 28, so 28x28.

#### Another key aspect to be noted here is: why the input to the fully connected first layer is 16x5x5? It is important to understand that before the first fully connected layer, this layer undergoes four different operations that changes the tensor size over and over again:



#### 1.   The first layer of Convolution which will alter the size of image from 32x32 to 28x28. ***Size before Conv1 : [4,3,32,32]***; ***size after Conv1: [4,6,28,28]***. Immediately after this a ReLU activation function is applied; but this doesn't change the dimensions.

#### 2.   Then comes a maxpooling layer which will cut the image size by half (as kernel and stride size = 2). ***Size before MaxPool1 = [4,6,28,28]; size after MaxPool1 = [4,6,14,14]***.

#### 3. Similarly, comes the second layer of convolution which will alter the output from MaxPool1 to: ***[4,16,10,10]*** , followed by a ReLU activation function, which again has no effect.

#### 4. Lastly, a maxpooling layer is applied; which will ***finally make the dimensions to be: [4,16,5,5]***. The softmax applied by default by the CrossEntropyLoss will not the change the dimensions. So, it is from this that the input "16x5x5" comes to fully connected layer.



In [7]:
model = ConvNet().to(device)

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

n_total_steps = len(train_loader)

for epoch in range(num_epochs):

  for i, (images,labels) in enumerate(train_loader):

    images = images.to(device)
    labels = labels.to(device)

    # Forward Pass

    outputs = model(images)

    loss = criterion(outputs, labels)

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()

    if (i+1)%200:

      print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')









[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch [10/10], Step [4975/10000], Loss: 1.5835
Epoch [10/10], Step [4976/10000], Loss: 0.5101
Epoch [10/10], Step [4977/10000], Loss: 1.6525
Epoch [10/10], Step [4978/10000], Loss: 0.9682
Epoch [10/10], Step [4979/10000], Loss: 0.9728
Epoch [10/10], Step [4980/10000], Loss: 1.1840
Epoch [10/10], Step [4981/10000], Loss: 0.1435
Epoch [10/10], Step [4982/10000], Loss: 1.1827
Epoch [10/10], Step [4983/10000], Loss: 0.8571
Epoch [10/10], Step [4984/10000], Loss: 0.6611
Epoch [10/10], Step [4985/10000], Loss: 0.6935
Epoch [10/10], Step [4986/10000], Loss: 1.7228
Epoch [10/10], Step [4987/10000], Loss: 0.3582
Epoch [10/10], Step [4988/10000], Loss: 0.6669
Epoch [10/10], Step [4989/10000], Loss: 0.7665
Epoch [10/10], Step [4990/10000], Loss: 0.5136
Epoch [10/10], Step [4991/10000], Loss: 0.9733
Epoch [10/10], Step [4992/10000], Loss: 1.1233
Epoch [10/10], Step [4993/10000], Loss: 1.4936
Epoch [10/10], Step [4994/10000], Loss: 0.

In [8]:
with torch.no_grad():

  n_correct = 0
  n_samples = 0

  n_class_correct = [0 for i in range(10)]
  n_class_samples = [0 for i in range(10)]

  for images, labels in test_loader:

    images = images.to(device)
    labels = labels.to(device)

    outputs = model(images)

    _, predicted = torch.max(outputs,1)
    n_samples += labels.shape[0]
    n_correct += (predicted==labels).sum().item()

    for i in range(batch_size):

      label = labels[i]
      pred = predicted[i]

      if(label == pred):

        n_class_correct[label] +=1

      n_class_samples[label] +=1

acc = 100.0 * n_correct/n_samples

print(f'Accuracy of the whole network: {acc:.8f} %')

for i in range(10):

  acc2 = 100.0 * n_class_correct[i]/n_class_samples[i]

  print(f'Accuracy of {classes[i]}: {acc2} %')

Accuracy of the whole network: 62.98000000 %
Accuracy of plane: 51.1 %
Accuracy of car: 75.4 %
Accuracy of bird: 55.7 %
Accuracy of cat: 44.7 %
Accuracy of deer: 55.0 %
Accuracy of dog: 48.9 %
Accuracy of frog: 74.6 %
Accuracy of horse: 70.2 %
Accuracy of ship: 82.3 %
Accuracy of truck: 71.9 %
