In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Note:** It is easier to redownload the dataset than to figure out how to extract .7z files in Kaggle.

In [None]:
# Download the CIFAR-10 archive into the working directory and unpack it into ./data
import os
import tarfile

from torchvision.datasets.utils import download_url

dataset_url = "http://files.fast.ai/data/cifar10.tgz"
download_url(dataset_url, '.')

# Extract from archive. Skip the work when ./data/cifar10 already exists so that
# re-running the cell is cheap; delete ./data to force a fresh extraction.
if not os.path.isdir('./data/cifar10'):
    with tarfile.open('./cifar10.tgz', 'r:gz') as tar:
        # The archive arrives over plain HTTP, so prefer the 'data' extraction
        # filter (blocks absolute paths and links escaping the target directory)
        # on Python versions that provide it.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path='./data', filter='data')
        else:
            tar.extractall(path='./data')

In [None]:
# List the contents of the extracted data/cifar10 folder
!ls data/cifar10

In [None]:
# Load the label table from the Kaggle CIFAR-10 input and preview its first rows
labels = pd.read_csv('/kaggle/input/cifar-10/trainLabels.csv')
labels.head()

## Are Datasets these days this Unbiased?

The CIFAR-10 dataset has an *equal number* of images for each class (i.e., it is a balanced dataset), which is often good for training :). The number of images for each label (a.k.a. class) is printed below. We see there are 5000 images for each of the 10 classes.

In [None]:
# Number of rows for each distinct value of the 'label' column
labels['label'].value_counts()

# Reading Datasets in PyTorch

In [None]:
import tqdm
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor

In [None]:
# Build the dataset from the extracted train folder; ToTensor() is applied to each loaded image
dataset = ImageFolder('./data/cifar10/train', transform=ToTensor())

## Split data into Train + Validation

- An **80 : 20** split seems fair to start with. This means 80% of the data will be used for training, while 20% will be kept aside and used to determine how well the model performs on unseen data. 
    
    It is important to keep training our model as long as both the training and validation loss decrease (this prevents underfitting). When we see that our validation loss starts increasing after a certain point, while training loss decreases - we must STOP training immediately. This prevents overfitting and enables our model to generalize well on unseen data.  


- You may also choose splits of 60:40 or 90:10 or <*insert favorite_number : 100 - fav_num*>

> ### Calculating Split Sizes

In [None]:
# Fraction of the dataset held out for validation
val_split = 0.20
# Truncate to a whole number of samples; everything else goes to training
val_size = int(len(dataset) * val_split)
train_size = len(dataset) - val_size
print(train_size, val_size)

### Performing the Stratified Split

In the previous cell, we calculated the sizes of our train and validation splits. Now we actually split the dataset.

It would make sense to ensure our train and validation splits have the same class proportions as the original balanced dataset. Stratified Splitting does exactly this!

Imagine what would happen during a random split where **nearly all images of a particular class** got into your validation set. Since almost no image from that class would be in the train set, your model would almost surely have a huge validation error - and you'd be left wondering if there's a bug in your code xP

In [None]:
# Collect Indices and their targets("labels"). 
# We later feed this to sklearn.train_test_split ready-made function to get the indices to be put in the train_set and those in the validation_set
# Targets ("labels") of every sample in dataset order, plus one positional index per sample.
# Both arrays are handed to sklearn's train_test_split, which decides which indices
# belong to the train set and which to the validation set.
targets = np.array(dataset.targets)
indices = np.arange(len(dataset))
(indices.shape, targets.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on the targets so both splits keep the class proportions of the full dataset.
# A fixed random_state makes the split identical on every run, so validation metrics
# stay comparable across kernel restarts.
train_idx, val_idx, train_labels, val_labels = train_test_split(
    indices, targets, test_size=val_size, stratify=targets, random_state=42
)

We see our 80:20 stratified split ensures there are 4000 train images per class and 1000 validation images per class.

In [None]:
# Print the per-class sample counts of each split
for header, split_targets in (("Train Split: \n", train_labels), ("Validation Split: \n", val_labels)):
    print(header, pd.Series(split_targets).value_counts())

### Sampling the Train and Validation set from Torch.dataset using Indices

In the previous section, we generated `train_idx` and `val_idx` as arrays containing the train indices and validation indices from the whole dataset. We now use these indices to retrieve the actual train_data and val_data from the torch.dataset (CIFAR 10 read by Torch API) with the help of the `SubsetRandomSampler`.

We set the batch_size to 64. This parameter is restricted by the memory limitations of your GPU. 

In [None]:
batch_size = 64

# Short alias for the torch data utilities used below
data_utils = torch.utils.data

# Training: draw only from the train indices, in random order
train_sampler = data_utils.SubsetRandomSampler(train_idx)
train_loader = data_utils.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)

# Validation: same underlying dataset, restricted to the validation indices
val_sampler = data_utils.SubsetRandomSampler(val_idx)
val_loader = data_utils.DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)

# Defining our CNN Model

We experiment with the performance of a tiny Inception-based network with dilations. We first code the inception block below, where we replace the traditional conv5x5 with a dilated conv3x3 in the third branch.


In [None]:
class InceptionBlock(nn.Module):
    def __init__(self, in_channels, o1, o2, o3, o4, **kwargs):
        """
        Four parallel branches applied to the same input; their ReLU-activated
        outputs are concatenated along the channel dimension.

        @Params:
            in_channels: Channels of the incoming tensor (previous layer / inception block)
            o1(int): Output channels of branch_1
            o2(tuple): Channels of branch_2 as (after conv1x1, after conv3x3)
            o3(tuple): Channels of branch_3 as (after conv1x1, after dilated conv3x3)
            o4(int): Output channels of branch_4

        Every branch uses stride 1 with padding chosen so the spatial size is kept,
        which is what allows the channel-wise concatenation.

        Output has (o1 + o2[1] + o3[1] + o4) channels.
        """
        super().__init__(**kwargs)

        # branch_1: pointwise convolution
        self.b1 = nn.Conv2d(in_channels, o1, 1)

        # branch_2: pointwise reduction, then a regular 3x3 convolution
        self.b2_0 = nn.Conv2d(in_channels, o2[0], 1)
        self.b2_1 = nn.Conv2d(o2[0], o2[1], 3, padding=1)

        # branch_3: pointwise reduction, then a 3x3 convolution with dilation 2
        # (padding 2 compensates for the wider receptive field)
        self.b3_0 = nn.Conv2d(in_channels, o3[0], 1)
        self.b3_1 = nn.Conv2d(o3[0], o3[1], 3, padding=2, dilation=2)

        # branch_4: size-preserving 3x3 max pool, then a pointwise convolution
        self.b4_0 = nn.MaxPool2d(3, stride=1, padding=1)
        self.b4_1 = nn.Conv2d(in_channels, o4, 1)

    def forward(self, x):
        out_1 = F.relu(self.b1(x))

        reduced_2 = F.relu(self.b2_0(x))
        out_2 = F.relu(self.b2_1(reduced_2))

        reduced_3 = F.relu(self.b3_0(x))
        out_3 = F.relu(self.b3_1(reduced_3))

        pooled_4 = self.b4_0(x)
        out_4 = F.relu(self.b4_1(pooled_4))

        # Stack the four branch outputs along the channel axis
        return torch.cat([out_1, out_2, out_3, out_4], dim=1)

We can now build our TinyNet model using the `InceptionBlock` from above.

In [None]:
class TinyNet(nn.Module):
    """
    Small inception-style classifier built from `InceptionBlock`s.

    Channel flow: input -> 32 -> 192 -> (256 -> 480) -> (512 -> 512) -> num_classes.
    Two stride-2 max pools (end of block_1 and end of inception_1) each halve the
    spatial size; the final adaptive max pool collapses whatever is left to 1x1, so
    the classifier always sees a 512-long vector per image.

    @Params:
        input_channels(int): Channels of the input images (3 by default).
        num_classes(int): Number of output logits (10 by default).
    """

    def __init__(self, input_channels=3, num_classes=10):
        super().__init__()

        # Stem: 5x5 convolution and a stride-1 pool, both size-preserving
        self.input_block = nn.Sequential(
            nn.Conv2d(input_channels, 32, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        )

        # NOTE(review): there is no ReLU between the 3x3 convolution and the pool
        # below - confirm this is intended.
        self.block_1 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(64, 192, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        # Each block's in_channels equals the previous block's concatenated output:
        # 64 + 128 + 32 + 32 = 256, then 128 + 192 + 96 + 64 = 480
        self.inception_1 = nn.Sequential(
            InceptionBlock(192, 64, (96, 128), (16, 32), 32),
            InceptionBlock(256, 128, (128, 192), (32, 96), 64),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        # 192 + 208 + 48 + 64 = 512, then 160 + 224 + 64 + 64 = 512
        self.inception_2 = nn.Sequential(
            InceptionBlock(480, 192, (96, 208), (16, 48), 64),
            InceptionBlock(512, 160, (112, 224), (24, 64), 64),
            nn.AdaptiveMaxPool2d((1,1)),
            nn.Flatten()
        )

        # Linear head producing one raw score (logit) per class
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, x):
        """Run the blocks in sequence and return the classifier's raw outputs."""
        x = self.input_block(x)
        x = self.block_1(x)
        x = self.inception_1(x)
        x = self.inception_2(x)
        x = self.classifier(x)
        return x
        
            

Instantiate the model we defined above. Also, we tell pytorch to use a GPU (if available) as it speeds up training time.

In [None]:
# Prefer the first CUDA device when one is visible, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Build the network and move its parameters to the selected device
model = TinyNet()
model = model.to(device)
# print(model)

Let us look at the output dimensions of each layer. This helps to keep track of the dimensions while creating the model and also gives us an idea of the number of parameters to learn.

In [None]:
!pip install torchsummary

In [None]:
from torchsummary import summary

# Per-sample input size passed to summary(); (3, 32, 32) is presumably (channels, height, width) - confirm
sample_input_size = (3, 32, 32)
summary(model, sample_input_size)

**More than 1 million parameters to learn!** Why would anyone call this a "TinyNet"? :)

# Training the Model

## Choosing an Optimizer

A good optimizer changes the learning_rate and weights of the network to minimize the loss function("criterion").

I chose to try AdaBound [ICLR 2019] , an optimizer that claims to behave like Adam at the beginning of training, and gradually transforms to SGD at the end. See more here :  [Adaptive Gradient Methods with Dynamic Bound of Learning Rate](https://openreview.net/forum?id=Bkg3g2R9FX)

You may also want to see Rectified Adam. 

In [None]:
!pip install adabound

In [None]:
import adabound
# AdaBound over all model parameters, starting from lr=1e-3.
# NOTE(review): final_lr=0.1 is presumably the SGD-like rate the optimizer moves toward - confirm against the adabound docs.
optimizer = adabound.AdaBound(model.parameters(), lr=1e-3, final_lr=0.1)

## Define a Loss Function

We use the standard cross entropy loss for this image classification problem. 

In [None]:
# Loss applied to the model's raw outputs and the integer class labels of each batch
criterion = nn.CrossEntropyLoss()

## Training Loop

`num_epochs`: An epoch is a run over the entire training dataset. Remember, that we defined `batch_size = 64` and, therefore, only look at 64 images per iteration over the training set.

`history`: Dictionary that stores train_acc, train_loss, val_acc, val_loss for each epoch.

For every epoch, we first train our model on the train_set only. We then run predictions on the validation_set, and print the scores to see how well our model is training. 

In [None]:
num_epochs = 50
# Per-epoch metrics: one value is appended to every list after each epoch
history = {
    'train': { 'acc': [], 'loss': [] },
    'val'  : { 'acc': [], 'loss': [] }
}


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """
    Run one full pass over `loader` and return (accuracy, mean loss per sample).

    @Params:
        model: Network to run.
        loader: Iterable yielding (images, labels) batches.
        criterion: Loss function; assumed to return the MEAN loss over the batch
            (the default reduction of nn.CrossEntropyLoss).
        device: Device the batches are moved to before the forward pass.
        optimizer: When given, the model is put in train mode and updated after
            every batch. When None, the model is put in eval mode and no
            gradients are tracked.

    Raises ValueError if the loader yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum, correct, seen = 0.0, 0, 0
    with torch.set_grad_enabled(is_training):
        # `batch_labels` (not `labels`) so the `labels` DataFrame loaded earlier is not overwritten
        for images, batch_labels in loader:
            images, batch_labels = images.to(device), batch_labels.to(device)  # Shift to GPU

            if is_training:
                optimizer.zero_grad()  # Zero Parameter Gradients!

            # Forward Pass (-> Back Propagation -> Optimize when training)
            outputs = model(images)
            batch_loss = criterion(outputs, batch_labels)
            if is_training:
                batch_loss.backward()
                optimizer.step()

            # The criterion returns a batch mean, so weight it by the batch size.
            # Dividing by the sample count at the end then gives a true per-sample
            # mean, even when the last batch is smaller than the others.
            n_batch = batch_labels.size(0)
            loss_sum += batch_loss.item() * n_batch
            correct += (outputs.argmax(dim=1) == batch_labels).sum().item()
            seen += n_batch

    if seen == 0:
        raise ValueError("loader yielded no samples")
    return correct / seen, loss_sum / seen


print("TinyNet training...")

for epoch in range(num_epochs):

    # Train on the train split, then score the validation split without updating weights
    train_acc, train_loss = _run_epoch(model, train_loader, criterion, device, optimizer)
    val_acc, val_loss = _run_epoch(model, val_loader, criterion, device)

    # Save Epoch Statistics (append, so the whole training curve is kept)
    history['train']['acc'].append(train_acc)
    history['train']['loss'].append(train_loss)
    history['val']['acc'].append(val_acc)
    history['val']['loss'].append(val_loss)
    print(f"Epoch {epoch}: Train_acc: {train_acc} , \t Train_loss: {train_loss} , \t Val_acc: {val_acc} , \t Val_loss: {val_loss}")

    