# Image classification with Convolutional NN.


## Import all the packages required.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# import time for timekeeping
import time
# io allows reading and writing image from disk
# from skimage import io


# Pytorch (Our Deep Learning Framework)
import torch

# Torch Data Loader (this will be helpful to load images)
from torch.utils.data import Dataset, DataLoader

# datasets provides MNIST; if using custom images, import io from skimage instead
from torchvision import datasets, transforms, utils

# stores different optimizers like SGD
import torch.optim as optim

# Some torch functions that are used multiple times
import torch.nn.functional as F
import torch.nn as nn

# Use the GPU only when CUDA is actually available, so the notebook also runs
# on CPU-only machines instead of failing at the first .cuda() call.
# Set to False to force CPU execution even when a GPU is present.
FLAG_GPU = torch.cuda.is_available()



## Here is the Multi Layer Perceptron definition you saw.
* Any network has an `__init__` function that initializes all the layers of the NN that have learnable parameters.
* An MLP is a stack of fully connected layers. In this example we use three fully connected layers named `fc0`, `fc1` and `fc2`.
* Note that each fully connected layer has a number of input neurons that connect to a number of output neurons.
* These input and output dimensions are specified when the fc layers are initialized.
* If one fully connected layer connects to another, its output size = the input size of the fully connected layer that follows.
* The number of parameters in any fully connected layer is #Input x #Output (and 1 bias per output).

## How do we write a forward function?
* torch.flatten(x, start_dim=dim) converts an image-like entity to a vector.
* Remember that you need activations after every fc layer. In this case ReLU.
* Notice the log_softmax layer at the end. As the name suggests, this is a softmax activation function followed by a log function.

In [None]:
class MLPNet(nn.Module):
    """Multi-layer perceptron with three fully connected layers.

    Each sample is flattened to a 784-element vector (e.g. a 28x28 image) and
    passed through fc0 -> ReLU -> fc1 -> ReLU -> fc2. The 10 outputs are
    returned as log-probabilities.
    """

    def __init__(self):
        super().__init__()

        # 28x28 image -> 784 inputs.
        # Weights: 784 * 256 = 200704 (plus 256 biases, one per output).
        self.fc0 = nn.Linear(784, 256)
        # Hidden layer followed by the final 10-way output layer.
        self.fc1 = nn.Linear(256, 84)
        self.fc2 = nn.Linear(84, 10)

    def forward(self, x):
        """Return log-probabilities of shape (batch, 10) for the batch ``x``."""
        # Keep the batch dimension, collapse everything else into a vector.
        flat = x.flatten(start_dim=1)

        # Fully connected layers, each hidden one followed by a ReLU.
        hidden = F.relu(self.fc0(flat))
        hidden = F.relu(self.fc1(hidden))
        logits = self.fc2(hidden)

        # log_softmax = softmax followed by log, taken over the class axis.
        return F.log_softmax(logits, dim=1)

# Our task today is to replace this with a convolutional NN.

## The Lecun Net we want to implement should look like the one in this figure:

![alt text](https://cdn-images-1.medium.com/max/1200/1*1TI1aGBZ4dybR6__DI9dzA.png)

* Our network now has two blocks, each of them has the structure 'convolution followed by relu followed by max pooling'.
* These two blocks replace the 'fc0'+relu layer in the example MLP. 
* Read the inline TODO comments to change the model into a convolutional net for training.

**Conv2d is 2D convolutional layer:**
   * Initialization requires the kernel/filter size, the number of input channels and the number of filters (defining the size of the output).
   * First block has 5x5 convolutional filters. We use 6 of them. Convolutional layer takes a 28x28 image of one channel as input.
   * *TODO* What do you think will be the number of parameters needed for adding this layer?
   * What will be the size after the first 5x5 convolution? Why?
   * Second convolution is again 5x5 but this time we use 16 filters as the data we want to encode is more complex.
   * Remember to add activation after every convolution!
    
**MaxPooling2D does subsampling**
   * y = F.max_pool2d(x, k) command is used to perform kxk max pooling of some data x to create a smaller y. 
   * If the input images to pooling are 2Mx2N, then you will get MxN size output.
   * We will use 2x2 max pooling after every convolution-relu in this exercise.

**We will keep the 'fc1' and 'fc2' from the MLP as they are**

# Your job here is to put conv-relu-pooling layers in appropriate order to write a forward function.
* **Remember that torch.flatten() converts images to vectors, where will you put the flatten layer now?**
* **Think about the number of parameters that you saved by replacing the fc0 of the MLP in this case**

In [None]:
class ConvNet(nn.Module):
    """LeNet-style CNN: two conv-ReLU-maxpool blocks, then two FC layers.

    For a 1x28x28 input (no padding) the feature-map sizes are:
    conv1 -> 6x24x24, pool -> 6x12x12, conv2 -> 16x8x8, pool -> 16x4x4,
    which flattens to the 256 features that fc1 expects.
    The 10 outputs are returned as log-probabilities.
    """

    def __init__(self):
        super().__init__()

        # Signature: nn.Conv2d(in_channels, out_channels, kernel_size, stride)
        # Block 1: single-channel input, 6 filters of size 5x5, stride 1.
        # Parameters: 1*6*5*5 = 150 weights + 6 biases (one per filter).
        self.conv1 = nn.Conv2d(1, 6, 5, 1)
        # Block 2: 6 input channels, 16 filters of size 5x5, stride 1.
        # Parameters: 6*16*5*5 = 2400 weights + 16 biases.
        self.conv2 = nn.Conv2d(6, 16, 5, 1)

        # Signature: nn.Linear(in_features, out_features)
        # 16 channels * 4 * 4 spatial positions = 256 flattened features.
        self.fc1 = nn.Linear(256, 84)
        # One output per digit class; after softmax they sum to one.
        self.fc2 = nn.Linear(84, 10)

    def forward(self, x):
        """Return log-probabilities of shape (batch, 10) for the batch ``x``."""
        # The convolutions work on the image layout, so nothing is flattened yet.
        # Block 1: 5x5 convolution -> ReLU -> 2x2 max pooling.
        feats = F.max_pool2d(F.relu(self.conv1(x)), 2)

        # Block 2: 5x5 convolution -> ReLU -> 2x2 max pooling.
        feats = F.max_pool2d(F.relu(self.conv2(feats)), 2)

        # Without padding the maps are now 16x4x4; flatten them (keeping the
        # batch dimension) so they can feed the fully connected layers.
        flat = feats.flatten(start_dim=1)

        # Fully connected head, unchanged from the MLP.
        hidden = F.relu(self.fc1(flat))
        logits = self.fc2(hidden)

        # Outputs are log(p): softmax followed by log over the class axis.
        return F.log_softmax(logits, dim=1)

# The rest of the code to train can be used as it is.
# We initialize an instance of ConvNet instead of the MLP and train it!

## Initializing an instance of the defined network here.
* Note that putting a network on the GPU is as simple as calling .cuda() on the instance.
* The same is true for a variable. In this notebook the code inside the "if FLAG_GPU" statements shows all the modifications you need to run your code on a GPU.

In [None]:
# Build the CNN and, when requested, move its parameters to the GPU.
net = ConvNet()
if FLAG_GPU:
    net.cuda()
# The layer summary is printed regardless of the device in use.
print(net)

## Dataloaders and Transforms.
* datasets.MNIST in torchvision has functionality to download and process MNIST data.
* The DataLoader class allows loading parts of the training and test data in minibatches.
* It can use some simple transformations implemented in the transforms module that assist training, for example normalizing, resizing or cropping images.
* Functionality is usually added to the dataset, transforms and dataloader classes to suit new data and the training procedure related to the problem at hand.

In [None]:
# Preprocessing applied to every image: convert to a tensor, then normalize
# with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# Training split (downloaded to ../data if missing), reshuffled every epoch.
trainset = datasets.MNIST(
    root='../data', train=True, download=True, transform=transform)
trainloader = DataLoader(
    trainset, batch_size=32, shuffle=True, num_workers=2)

# Test split, always read in the same order.
testset = datasets.MNIST(
    root='../data', train=False, download=True, transform=transform)
testloader = DataLoader(
    testset, batch_size=32, shuffle=False, num_workers=2)

## Here we see sample usage of loading some MNIST training data.
* What does our training minibatch look like?
* At times, simple visualization and print statements allow for effective understanding/debugging.

In [None]:
def imshow(img, l, ncols=8):
    """Display an image tensor and print its labels in rows.

    Args:
        img: Image tensor in channel-first layout (C, H, W) whose values were
            normalized with mean 0.5 and std 0.5; it is un-normalized here.
        l: Tensor of labels belonging to the displayed images.
        ncols: Number of labels printed per row. The default of 8 keeps the
            original layout (presumably chosen to match the row width of
            ``utils.make_grid``'s default grid).
    """
    img = img / 2 + 0.5     # unnormalize
    # detach()/cpu() make this work for CUDA or grad-tracking tensors as well.
    npimg = img.detach().cpu().numpy()
    # matplotlib expects channel-last (H, W, C) layout.
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()
    print('Labels were:')
    labels_np = l.detach().cpu().numpy().reshape(-1)
    if labels_np.size and labels_np.size % ncols == 0:
        # Full rows only: print as a 2-D array, exactly as before.
        print(labels_np.reshape(-1, ncols))
    else:
        # The label count does not fill whole rows (e.g. a short last batch):
        # print row by row instead of failing on the reshape.
        for start in range(0, labels_np.size, ncols):
            print(labels_np[start:start + ncols])

# Pull a single mini-batch from the training loader.
dataiter = iter(trainloader)
batch = next(dataiter)
images, labels = batch
print('shape of images', images.shape)

# Tile the batch into one grid image and show it together with its labels.
grid = utils.make_grid(images)
imshow(grid, labels)

## Loss function for learning.
* NLLLoss: The abbreviation NLL stands for negative log likelihood. It is, however, a bit of a misnomer, as the log is not included in the loss itself but is part of the network definition above.
* NOTE: When you want to get the probability/likelihood of an image being of a particular class, you need to remove the log from the forward function and use a simple softmax activation at test time. Alternatively, simply use the ''exp'' function from torch to invert the log and leave the forward function as it is.

## Optimizer
* PyTorch has various optimization routines (beyond SGD) pre-implemented.
* The optim module takes care of the weight updates for these different optimizers, as long as the network definition with an appropriate forward function is written correctly.
* Here we just use SGD. with learning rate 0.001 and momentum 0.9.

In [None]:
# Negative log-likelihood loss; expects the log-probabilities the network returns.
criterion = nn.NLLLoss()
if FLAG_GPU:
    criterion = criterion.cuda()

# Plain SGD with momentum over all learnable parameters of the network.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

## This cell of the notebook is now training a network.

* The first for loop goes through the entire data 5 times (we run 5 epochs for our training).
* The simple steps for training a NN with PyTorch are:
    * Load data in minibatches.
    * Set the gradients of all the network parameters to zero (don't forget this).
    * Pass data to the NN by calling net(inputs), which runs net.forward() to compute the output layer by layer.
        * Intermediate outputs can be returned as extra variables in the forward function.
    * Compute the loss from the output (remember it is defined above).
    * Use loss.backward() to compute all the gradients by appropriately applying the chain rule!
        * It actually knows how to differentiate things!!!
    * Use optimizer.step() to update the weights.
    
## At the end of every epoch usually we check if NN generalizes.
* Generalization is critical in learning.
* We evaluate the performance of our NN on new data, for which the NN loss was not minimized.
* The torch.no_grad() context forces the following code to not keep track of the gradients, as we don't need them for testing.
* As no gradients are maintained, the code runs faster!
* It is a very good practice to make use of no_grad to ensure that we don't accidentally minimize the loss on the data we are testing the performance on.


 


In [None]:
# One device handle so the train and test paths share a single code path
# instead of duplicating every statement in GPU and CPU branches.
device = torch.device('cuda' if FLAG_GPU else 'cpu')

for epoch in range(5):  # loop over the dataset multiple times

    running_loss = 0.0

    # Training mode: matters for layers such as dropout / batch norm, and is
    # harmless for networks that have none.
    net.train()

    # Simply for time keeping
    start_time = time.time()
    # Loop over all training data
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients (they accumulate otherwise)
        optimizer.zero_grad()

        # forward pass and loss, with the batch moved to the network's device
        outputs = net(inputs.to(device))
        loss = criterion(outputs, labels.to(device))

        # Backpropagation: compute the gradient of the loss w.r.t. all weights
        loss.backward()
        # Weight update using the gradients just computed
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 100 == 99:    # print every 100 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0
    # all mini-batches processed: the epoch is finished
    end_time = time.time()

    # test the network every epoch on the test examples
    correct = 0
    total = 0

    # Evaluation mode for the test pass (counterpart of net.train() above).
    net.eval()
    # No gradients are needed for testing, so do not track them.
    with torch.no_grad():
        for data in testloader:
            # load images and labels
            images, labels = data

            # Bring the outputs back to the CPU, where the labels live.
            outputs = net(images.to(device)).cpu()
            # The predicted class is the one with the largest log-probability.
            _, predicted = torch.max(outputs, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print('Epoch', epoch+1, 'took', end_time-start_time, 'seconds')
    print('Accuracy of the network after', epoch+1, 'epochs is' , 100*correct/total)

print('Finished Training')