# Image classification: Convolutional Neural Network

In this notebook, we implement a convolutional neural network for image classification. We use the very popular CIFAR-10 dataset, which has images belonging to ten classes: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck. 

In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

import matplotlib.pyplot as plt
%matplotlib inline

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Dataset

dataset has PILImage images of range [0, 1], which we transform to tensors of normalized range (-1, 1)

In [2]:
transform = transforms.Compose([transforms.ToTensor(), 
                                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

train_dataset = torchvision.datasets.CIFAR10(root = './data',
                                            train = True,
                                            download = True,
                                            transform = transform)

test_dataset = torchvision.datasets.CIFAR10(root = './data',
                                           train = False,
                                           transform = transform)

Files already downloaded and verified


In [3]:
print(train_dataset)
print('--------------------')
print(test_dataset)

Dataset CIFAR10
    Number of datapoints: 50000
    Root location: ./data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
           )
--------------------
Dataset CIFAR10
    Number of datapoints: 10000
    Root location: ./data
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
           )


In [4]:
print(len(train_dataset))

image, label = train_dataset[0]
print(image.size(), label)

50000
torch.Size([3, 32, 32]) 6


In [5]:
batch_size = 5

train_loader = torch.utils.data.DataLoader(train_dataset,
                                          batch_size = batch_size,
                                          shuffle = True)

test_loader = torch.utils.data.DataLoader(test_dataset,
                                         batch_size = batch_size,
                                         shuffle = False)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 
           'dog', 'frog', 'horse', 'ship', 'truck')

## Model and Training

The size of an image after employing convolution is (W-F+2P)/S + 1, where W is input width, F is filter size and P is padding and S is stride.
<br>Thus, if we apply convolution on an image of size 5 * 5 (W = 5), by 3 * 3 filter (F = 3), given no padding (P = 0) and stride (S = 1), the width of the output convoluted image is (5 - 3 + 0)/1 + 1 = 3, i.e, we have 3 * 3 image.


Let's first define the ConvNet model we want to train

In [6]:
class ConvNet(nn.Module):
    
    def __init__(self, ):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)   # input channel size = 3, output channel size = 6, kernel size = 5 (5 by 5) 
        self.pool = nn.MaxPool2d(2, 2)    # kernel size = 2 (2 by 2 kernel), stride = 2
        self.conv2 = nn.Conv2d(6, 16, 5)  # input channel size must be equal to last output channel size
        self.fc1 = nn.Linear(16*5*5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
           
    def forward(self, x):
        out = torch.relu(self.conv1(x))      # first convolutional layer
        out = self.pool(out)                 # pooling layer
        out = torch.relu(self.conv2(out))    # second convolutional layer
        out = self.pool(out)                 # pooling layer
        out = out.view(-1, 16*5*5)           # flattening the tensor
        out = torch.relu(self.fc1(out))      # first fully connected layer
        out = torch.relu(self.fc2(out))      # second fully connected layer
        out = self.fc3(out)                  # third fully connected layer
        return out

In [7]:
learning_rate = 0.001

model = ConvNet().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

n_epochs = 20
total_batches = len(train_loader)
print(total_batches)

for epoch in range(n_epochs):
    
    for i, (images, labels) in enumerate(train_loader):
        
        images = images.to(device)
        labels = labels.to(device)
        
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        if (i+1) % 1000 == 0:
            print(f'epoch: {epoch+1}/{n_epochs}; step: {i+1}/{total_batches}; loss: {loss.item():.4f}')
        
        
with torch.no_grad():
    
    n_correct = 0
    n_samples = 0
    n_class_correct = [0 for i in range(10)]
    n_class_samples = [0 for i in range(10)]
    
    for images, labels in test_loader:
        
        images = images.to(device)
        labels = labels.to(device)
        
        outputs = model(images)
        _, predictions = torch.max(outputs, 1)
        n_correct += (predictions == labels).sum().item()
        n_samples += labels.shape[0]
        
        for i in range(batch_size):
            
            label = labels[i]
            pred = predictions[i]
            
            if label == pred:       
                n_class_correct[label] += 1
            
            n_class_samples[label] += 1
    
    overall_accuracy = 100.0 * n_correct / n_samples
    print('Overall accuracy %.4f' % (overall_accuracy))

10000
epoch: 1/20; step: 1000/10000; loss: 2.3043
epoch: 1/20; step: 2000/10000; loss: 2.2993
epoch: 1/20; step: 3000/10000; loss: 2.3186
epoch: 1/20; step: 4000/10000; loss: 2.3009
epoch: 1/20; step: 5000/10000; loss: 2.3042
epoch: 1/20; step: 6000/10000; loss: 2.3120
epoch: 1/20; step: 7000/10000; loss: 2.2874
epoch: 1/20; step: 8000/10000; loss: 2.2889
epoch: 1/20; step: 9000/10000; loss: 2.3277
epoch: 1/20; step: 10000/10000; loss: 2.2163
epoch: 2/20; step: 1000/10000; loss: 2.2970
epoch: 2/20; step: 2000/10000; loss: 2.1748
epoch: 2/20; step: 3000/10000; loss: 2.3356
epoch: 2/20; step: 4000/10000; loss: 2.1196
epoch: 2/20; step: 5000/10000; loss: 2.3641
epoch: 2/20; step: 6000/10000; loss: 1.8763
epoch: 2/20; step: 7000/10000; loss: 1.6302
epoch: 2/20; step: 8000/10000; loss: 2.0208
epoch: 2/20; step: 9000/10000; loss: 1.9638
epoch: 2/20; step: 10000/10000; loss: 1.7376
epoch: 3/20; step: 1000/10000; loss: 1.7287
epoch: 3/20; step: 2000/10000; loss: 1.3165
epoch: 3/20; step: 3000/

epoch: 19/20; step: 5000/10000; loss: 0.6810
epoch: 19/20; step: 6000/10000; loss: 0.4845
epoch: 19/20; step: 7000/10000; loss: 0.5913
epoch: 19/20; step: 8000/10000; loss: 1.3311
epoch: 19/20; step: 9000/10000; loss: 0.5690
epoch: 19/20; step: 10000/10000; loss: 0.9449
epoch: 20/20; step: 1000/10000; loss: 0.9597
epoch: 20/20; step: 2000/10000; loss: 1.4032
epoch: 20/20; step: 3000/10000; loss: 1.2400
epoch: 20/20; step: 4000/10000; loss: 0.2377
epoch: 20/20; step: 5000/10000; loss: 0.5256
epoch: 20/20; step: 6000/10000; loss: 2.0739
epoch: 20/20; step: 7000/10000; loss: 1.7049
epoch: 20/20; step: 8000/10000; loss: 1.3427
epoch: 20/20; step: 9000/10000; loss: 1.2717
epoch: 20/20; step: 10000/10000; loss: 0.7615
Overall accuracy 63.1000
