In [75]:
#import everything
import torch
import torchvision
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader
from torchvision.datasets import CIFAR10
from torchvision.transforms import ToTensor
from torch.utils.data import random_split

# What to do differently
1. Use the test set as the validation set.
2. Normalise the data channel-wise, so that each channel has mean 0 and standard deviation 1.
3. Apply random transformations to the training data for data augmentation.

In [76]:
import torchvision.transforms as tt

In [77]:
# Per-channel statistics: the first tuple holds the means, the second the
# standard deviations. Normalize computes (value - mean) / deviation per channel.
stats = ((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

# Training pipeline: random augmentation first, then tensor conversion and normalisation.
train_transforms = tt.Compose([
    tt.RandomCrop(32, padding=4, padding_mode='reflect'),
    tt.RandomHorizontalFlip(),
    tt.ToTensor(),
    tt.Normalize(mean=stats[0], std=stats[1], inplace=True),
])

# Validation pipeline: no augmentation, only tensor conversion and normalisation.
validation_transforms = tt.Compose([
    tt.ToTensor(),
    tt.Normalize(mean=stats[0], std=stats[1]),
])

In [78]:
# Training split: downloaded into datasets/ if missing, with augmentation applied.
train_dataset = CIFAR10(
    root='datasets/',
    train=True,
    download=True,
    transform=train_transforms,
)
# Held-out split used for validation: read from the same root, no augmentation.
valid_dataset = CIFAR10(
    root='datasets/',
    train=False,
    transform=validation_transforms,
)

Files already downloaded and verified


In [79]:
# Show the class names exposed by the dataset.
train_dataset.classes

['airplane',
 'automobile',
 'bird',
 'cat',
 'deer',
 'dog',
 'frog',
 'horse',
 'ship',
 'truck']

In [80]:
def get_default_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(backend)

def to_device(data,device):
    """Move `data` to `device`.

    A list or tuple is processed element by element (recursively) and is
    returned as a list; anything else is moved with a non-blocking `.to()` call.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    return [to_device(item, device) for item in data]

class DeviceDataLoader():
    """Thin wrapper around a data loader that moves every batch to a device.

    Batches are transferred lazily, one at a time, while the wrapper is iterated.
    """
    def __init__(self,data_loader,device):
        # Only store the pieces; nothing is moved until iteration starts.
        self.device = device
        self.data_loader = data_loader

    def __len__(self):
        """Number of batches, as reported by the wrapped loader."""
        return len(self.data_loader)

    def __iter__(self):
        """Yield each batch of the wrapped loader after passing it through to_device."""
        yield from (to_device(item, self.device) for item in self.data_loader)

In [81]:
# Pick the device once; the loaders and models below are moved onto it.
device = get_default_device()
device

device(type='cuda')

In [82]:
batch_size = 400

# Training batches are reshuffled every epoch and moved to `device` on iteration.
train_loader = DeviceDataLoader(
    DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
               num_workers=0, pin_memory=True),
    device,
)
# Validation uses twice the batch size and keeps the dataset order.
valid_loader = DeviceDataLoader(
    DataLoader(valid_dataset, batch_size=batch_size*2,
               num_workers=0, pin_memory=True),
    device,
)

The new ingredient is a residual block, which adds the original input back to the output feature map,
<br>so the block computes f(x) + x instead of f(x).

In [83]:
class SimpleResidualBlock(nn.Module):
    """Minimal residual block: conv -> ReLU -> conv -> ReLU, plus the input.

    Both convolutions map 3 channels to 3 channels with a 3x3 kernel, stride 1
    and padding 1, so neither the spatial size nor the channel count changes
    and the input can be added directly to the result.
    """
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3,3,kernel_size=3,stride=1,padding=1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(3,3,kernel_size=3,stride=1,padding=1)
        self.relu2 = nn.ReLU()
        #notice that neither the size of image nor the number of channels change here

    def forward(self,x):
        """Return f(x) + x, where f is the conv/ReLU stack applied to `x`."""
        # Each stage must consume the previous stage's output; feeding `x`
        # into every stage would drop the convolutions from the result.
        out = self.conv1(x)
        out = self.relu1(out)
        out = self.conv2(out)
        return self.relu2(out) + x #relu can be applied before or after adding input

In [84]:
# Instantiate the toy residual block and move it to the selected device.
simple_resnet = to_device(SimpleResidualBlock(),device)

In [85]:
# Sanity check: run a single training batch through the block and print the
# output shape, then stop after the first batch.
for images, labels in train_loader:
    out = simple_resnet(images)
    print(out.shape)
    break

torch.Size([400, 3, 32, 32])


In [86]:
# Free GPU memory: drop every reference created by the sanity check. The
# output tensor `out` is included, because it would otherwise stay referenced
# and its memory could not be released by empty_cache().
del simple_resnet, images, labels, out
torch.cuda.empty_cache()

This small change greatly improves the model.
<br>In addition, a batch normalization layer is added after each convolutional layer.
<br>Learn more about the architecture here: https://myrtle.ai/learn/how-to-train-your-resnet/

In [87]:
class ImageBase(nn.Module):
    """Base class adding per-batch training and validation steps to a classifier.

    Subclasses are expected to implement `forward`, returning one row of class
    scores per input sample.
    """
    def train_step(self,X_batch):
        """Return the cross-entropy loss for one `(images, labels)` batch."""
        images,labels = X_batch
        # Call the module itself rather than .forward() so registered hooks run.
        out = self(images)
        loss = F.cross_entropy(out,labels)
        return loss

    def valid_step(self,X_batch):
        """Return `{'loss': ..., 'acc': ...}` for one `(images, labels)` batch.

        `loss` is the cross-entropy loss; `acc` is the fraction of samples
        whose highest-scoring class equals the label, as a new tensor.
        """
        images,labels = X_batch
        out = self(images)
        loss = F.cross_entropy(out,labels)
        # Predicted class = index of the largest score along dim 1.
        _,preds = torch.max(out,dim=1)
        acc = torch.tensor(torch.sum(preds==labels).item()/len(preds))
        return {'loss':loss,'acc':acc}

In [88]:
def conv_block(in_channels,out_channels,pool=False):
    """Build a Conv2d -> BatchNorm2d -> ReLU stack, optionally followed by pooling.

    The convolution uses a 3x3 kernel with stride 1 and padding 1. When `pool`
    is truthy, a `MaxPool2d(2)` layer is appended. Returns an `nn.Sequential`.
    """
    stages = [
        nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    ]
    stages += [nn.MaxPool2d(2)] if pool else []
    return nn.Sequential(*stages)

In [89]:
class ResNet9(ImageBase):
    """Convolutional classifier built from conv blocks with two residual stages.

    Layout: conv1 -> conv2 (pooled) -> res1 (+ skip) -> conv3 (pooled) ->
    conv4 (pooled) -> res2 (+ skip) -> classifier head.
    """
    def __init__(self,in_channels,num_classes):
        super().__init__()
        # First stage: widen to 128 channels, then a 128-channel residual pair.
        self.conv1 = conv_block(in_channels, 64)
        self.conv2 = conv_block(64, 128, pool=True)
        self.res1 = nn.Sequential(
            conv_block(128, 128),
            conv_block(128, 128),
        )

        # Second stage: widen to 512 channels, then a 512-channel residual pair.
        self.conv3 = conv_block(128, 256, pool=True)
        self.conv4 = conv_block(256, 512, pool=True)
        self.res2 = nn.Sequential(
            conv_block(512, 512),
            conv_block(512, 512),
        )

        # Head: pool, flatten, dropout, then a linear layer fed 512 features.
        head = [
            nn.MaxPool2d(4),
            nn.Flatten(),
            nn.Dropout(0.2),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self,X_batch):
        """Run a batch through the network and return the classifier output."""
        stem = self.conv2(self.conv1(X_batch))
        stem = self.res1(stem) + stem      # first skip connection
        deep = self.conv4(self.conv3(stem))
        deep = self.res2(deep) + deep      # second skip connection
        return self.classifier(deep)

In [90]:
# Build the network for 3 input channels and 10 output classes, then move it to the device.
model = to_device(ResNet9(3,10),device)

In [91]:
# Display the module tree to inspect the layer layout.
model

ResNet9(
  (conv1): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (conv2): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (res1): Sequential(
    (0): Sequential(
      (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (1): Sequential(
      (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=Tr

The training process is also a little different now:
<br>1. The learning rate is changed after every batch instead of staying fixed: it is increased for about the first 30% of training and then gradually decreased to a very low value for the rest.
<br>2. Weight decay regularization.
<br>3. Gradient clipping of large gradient values.

In [92]:
@torch.no_grad()
def evaluate(model,loader):
    """Print the mean validation loss and accuracy of `model` over `loader`.

    Switches the model to eval mode, runs `model.valid_step` on every batch
    with gradients disabled, and averages the per-batch 'loss' and 'acc'
    values. Nothing is returned.
    """
    model.eval()
    losses, accuracies = [], []
    for batch in loader:
        result = model.valid_step(batch)
        losses.append(result['loss'])
        accuracies.append(result['acc'])
    epoch_loss = torch.stack(losses).mean()
    epoch_acc = torch.stack(accuracies).mean()
    print(f"loss is {epoch_loss} and acc is {epoch_acc}")

def get_lr(optimizer):
    """Return the 'lr' of the optimizer's first parameter group (None if it has no groups)."""
    return next((group['lr'] for group in optimizer.param_groups), None)

def fit_one_cycle(epochs,max_lr,model,train_loader,valid_loader,weight_decay=0,grad_clip=None,optim_func=torch.optim.SGD):
    """Train `model` for `epochs` epochs with a one-cycle learning-rate schedule.

    A fresh optimizer (built by `optim_func`) and a `OneCycleLR` scheduler are
    created on every call. The scheduler is stepped after every batch, and the
    model is evaluated on `valid_loader` at the end of each epoch.

    Args:
        epochs: number of passes over `train_loader`.
        max_lr: learning rate passed to the optimizer and used as the schedule's peak.
        model: module exposing `train_step(batch)` that returns a loss tensor.
        train_loader: iterable of training batches; must support `len()`.
        valid_loader: iterable of batches handed to `evaluate`.
        weight_decay: forwarded to the optimizer constructor.
        grad_clip: if truthy, gradient values are clipped to this magnitude.
        optim_func: optimizer class/factory called as
            `optim_func(params, max_lr, weight_decay=...)`.
    """
    torch.cuda.empty_cache()
    optimizer = optim_func(model.parameters(), max_lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr,
        epochs=epochs,
        steps_per_epoch=len(train_loader),
    )

    for _ in range(epochs):
        model.train()
        for batch in train_loader:
            batch_loss = model.train_step(batch)
            batch_loss.backward()

            # Clip after backward and before the optimizer consumes the gradients.
            if grad_clip:
                nn.utils.clip_grad_value_(model.parameters(), grad_clip)

            optimizer.step()
            optimizer.zero_grad()
            # The schedule advances once per batch, not once per epoch.
            scheduler.step()

        evaluate(model, valid_loader)

In [93]:
# Evaluate the untrained model to get a baseline on the validation loader.
evaluate(model,valid_loader)

loss is 2.308152675628662 and acc is 0.10673077404499054


In [94]:
# Hyperparameters for the first training run (passed to fit_one_cycle below).
epochs = 8                      # number of passes over the training loader
max_lr = 0.01                   # learning rate handed to the optimizer and the one-cycle scheduler
grad_clip = 0.1                 # gradient values are clipped to this magnitude
weight_decay = 1e-4             # forwarded to the optimizer constructor
optim_func = torch.optim.Adam   # optimizer class used instead of the SGD default

In [95]:
# First training run with the hyperparameters defined above; prints validation metrics after each epoch.
fit_one_cycle(epochs,max_lr,model,train_loader,valid_loader,weight_decay,grad_clip,optim_func)

loss is 1.2002620697021484 and acc is 0.5823076963424683
loss is 1.417062759399414 and acc is 0.5799999833106995
loss is 0.7173216342926025 and acc is 0.7549999952316284
loss is 0.6920413970947266 and acc is 0.7737500667572021
loss is 0.5628334283828735 and acc is 0.8066346049308777
loss is 0.4335850477218628 and acc is 0.8489423990249634
loss is 0.315218985080719 and acc is 0.8934614658355713
loss is 0.2876301407814026 and acc is 0.9033653736114502


In [96]:
# Second run on the same model: 5 more epochs with max_lr 0.001.
# fit_one_cycle builds a new optimizer and scheduler on every call.
fit_one_cycle(5,0.001,model,train_loader,valid_loader,weight_decay,grad_clip,optim_func)

loss is 0.3243520259857178 and acc is 0.8898076415061951
loss is 0.4771917164325714 and acc is 0.8474999666213989
loss is 0.34957972168922424 and acc is 0.8843269348144531
loss is 0.28438901901245117 and acc is 0.9109615683555603
loss is 0.26964375376701355 and acc is 0.914711594581604
