# Introduction
Author: Aaron Chen <br>
I used Google Colab Notebook to run CNN with CUDA initially(free for basic computation power). <br>
You can download this notebook to run it on Google Colab. 

In [None]:
### Author: Aaron Chen ###
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
# this step mount your personal Google Drive to the Google Colab directory
# from google.colab import drive
# drive.mount('/content/drive')

# 1. Create Pytorch custom dataset
The first step is to make a Pytorch custom dataset. Pytorch website has a nice tutorial here: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html <br>
The basic idea and purpose of the torch.Dataset is to read some data (i.e. Pandas Dataframe from csv file), make a `__getitem__()` class function so that the Pytorch Dataloader can load your original data in a batch fashion. <ba>
The MNIST dataset are 28x28, one channel images. These pixels are formatted in the csv file so that each row represents one image, and pixels of this image are flattened in one dimension (1x784). Therefore, we need to reshape each row to 1x28x28. In many cases where we load real images directly (instead of formatted csv file), images pixels are Width x Height x Channels. However, Pytorch requires the input tensors formatted as Channels x Height x Width. This is just a small detail to be noticed.

In [None]:
# df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Digit Recognizer/train.csv')
# df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Digit Recognizer/test.csv')
df = pd.read_csv('../input/digit-recognizer/train.csv')
df_test = pd.read_csv('../input/digit-recognizer/test.csv')

class MyMNIST(Dataset):
    # reference from https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
    
    def __init__(self, df, transform = None, test = False):
        """
        Args:
            df (Dataframe): Dataframe of specific test, validation or test.
            transform (callable, optional): Optional transform to be applied
                on a sample.
            test (Boolean): is this dataframe a training set or test set
        """
        self.df = df
        self.transform = transform
        self.test = test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if not self.test:
            # here x is the pixel (without the label) so [idx, 1:].
            # we reshape it to 1x28x28 and divided by 255 so that all the pixel values are between [0, 1]
            # I specifies the dtype = np.float32 because otherwise Pytorch throws an error later I think
            x = torch.tensor(self.df.iloc[idx,1:].to_numpy(dtype=np.float32).reshape(1,28,28)/255)
            # for the training/validation set, we need to preserve the label information as well
            y = torch.tensor(self.df.iloc[idx,0])
        else:
            # for the test set, we don't have the label information so return pixels directly
            x = torch.tensor(self.df.iloc[idx].to_numpy(dtype=np.float32).reshape(1,28,28)/255)
            if self.transform:
                x = self.transform(x)
            return x
            
        # Here you can specify what transforms to use on each data set. 
        if self.transform:
            x = self.transform(x)

        return (x, y)

# 2. Data prepossessing and data augmentation 
Here, I want to train the model on the first 90% of the data and use the last 10% for the validation. <br>
In the custom datset, I devide all the pixel values by 255 to make them between 0 and 1. During the transform, I set the mean and std of the normalization to be 0.5 and 0.5 so that the normalized pixel values will be between -1 and 1. This normalization is applied to all training, validation and test dataset. For the training set, I applied the data augmentation of random horizontal/vertical flip and random rotation with a probability of 0.5. <br>
<br>
Then I load the training and validation set using `DataLoader` with a batch size of 100. The batch size is a hyperparameter you can choose but larger the batch size, larger the memory requirement; if it's too small, the optimization of weight and bias might not be stable.<br>
<br>
I plotted an image from the training set for visualization purpose. Because of the random transform, the ouput image might be flipped or rotated.

In [None]:
split = len(df)//10 * 9
df_train = df.iloc[:split]
df_val = df.iloc[split:]
# mean, std = 0.5, 0.5 to normalize the pixels in the range of [-1, 1]
mean, std = 0.5, 0.5
# 1. normalization and 2. simple data augmentation with random flips and rotation
train_transform = transforms.Compose([transforms.RandomHorizontalFlip(0.5),
                                    transforms.RandomVerticalFlip(0.5),
                                    transforms.RandomApply([transforms.RandomRotation((-25,25))], p=0.5),
                                    transforms.Normalize((mean,),(std,))])
# for validation and testing, we only applies normalization
val_transform = transforms.Normalize((mean,),(std,))
test_transform = transforms.Normalize((mean,),(std,))
# create train, val, test MyMNIST dataset
train = MyMNIST(df_train, train_transform, test  =False)
val = MyMNIST(df_val, val_transform, test = False)
test = MyMNIST(df_test, test_transform, test = True)
# Load train, val dataset in a batch size of 100
# when you loop throught them, the dimension would be 
# [100, 1, 28, 28] == [batch_size, num_of_channel, H, W]
train_loader = DataLoader(train, batch_size = 100, shuffle = True)
val_loader = DataLoader(val, batch_size = 100, shuffle = True)
# Load test datset, since we just need to make prediction, we can load them one by one
# you can definitely load all of them, but it would simply requires a lot more memory usage
test_loader = DataLoader(test, batch_size = 1, shuffle = False)
# plot one sample from the training set along with its label
sample_x, sample_y = next(iter(train_loader))
print(sample_x.shape, sample_x.view(-1,28,28).shape)
plt.imshow(sample_x[0][0].cpu().data.numpy(), cmap='gray')
plt.show()
print('label:', sample_y[0])
# show pixel distribution if you want to
show_pixel_distribution = True
if show_pixel_distribution:
    pixel = []  
    for x, y in train_loader:
        pixel.append(x.numpy())
    plt.hist(np.array(x).ravel(), bins=30, density=True)
    plt.xlabel("pixel values")
    plt.ylabel("relative frequency")
    plt.title("distribution of pixels")         

# 3. Build the model and calculate the dimension in the CNN layers
I just want to build a relatively simple CNN with downsampling of 32 and 64 CNN layers. Some people went aggresive with multiple 128, 256 layers or even more advanced CNN backbone (such as VGG or ResNet) designed for ImageNet classification. In my humble opinion, I don't think it's necessary (although yes, this code will only achieve ~96% accuracy). MNIST is a small and simple dataset and we are not even training on the whole dataset. With a extremely large CNN, the model might overfit; sometimes the CNN has such high capacity that it "remembers" rather than learns the training set. <br>
<br>
To implement CNN, we do need to have a basic understanding of how it affects the dimension of the input. Stanford CS231n is an awesome class to look at (http://cs231n.stanford.edu/). I will try to use their materials here to at least show you how to calculate the dimension (I hope).<br>
## **Conv2D layer** 
Input image/data/etc: n_channels (D1), height (H1), width (W1) <br>
Conv2D layer parameters: in_channels (D1), out_channels (aka number of CNN filters K), kernel_size (F), stride (S), padding (P) etc (skip dilation here). <br>
Output image/data/etc after this Conv2D layer:<br>
D2 = K<br>
H2 = (H1−F+2P)/S+1<br>
W2 = (W1−F+2P)/S+1<br>
For example, the input batch is 100x1x28x28 where D1 = 1, H1 = 28, W1 = 28. After self.conv1 (defined below in the code):
D2 = 32<br>
H2 = (28-3+2*1)/1+1 = 28<br>
W2 = (28-3+2*1)/1+1 = 28<br>
so that the output batch is 100x32x28x28<br>
You can certainly play with the Conv2D parameters and there are tons of research what hyperparameters to use. Here (also common setting) is F = 3, S = 1, P = 1. The padding is nice because we don't lose the boundary information and it keeps the height and width the same after the Conv2D.<br>
## **MaxPool2d**
Max Pooling layer reduces the spatial size and keep the pixel info with the highest activation value. For example, a 2x2 MaxPool2d takes the highest value from adjacent 2x2 block. To calculate the output dimension:<br>
Input image/data/etc: n_channels (D1), height (H1), width (W1) <br>
MaxPool2d layer parameters: kernel_size (F), stride (S). <br>
Output image/data/etc after this MaxPool2d layer: <br>
D2 = D1<br>
H2 = (H1-F)/S+1<br>
W2 = (W1-F)/S+1<br>
For example, the input batch is 100x32x28x28 where D1 = 32, H1 = 28, W1 = 28. After self.pool (defined below in the code):
D2 = 32<br>
H2 = (28-2)/2+1 = 14<br>
W2 = (28-2)/2+1 = 14<br>
so that the output batch is 100x32x14x14<br>
## **ReLu** 
ReLu acts as the non-linear layer in the series linear transformation in the CNN. <br>
## **Drop out layer**
Drop out layer acts as a regularization method to "drop" some node in the neural network to curb overfitting issue. I referenced this website of the choice of the drop out layer position and probability. https://stats.stackexchange.com/questions/240305/where-should-i-place-dropout-layers-in-a-neural-network<br>
## **BatchNorm2d**
Batch normalization makes the model more robust (some sort of regularization) and allow faster training (higher learning rate). I am not gonna pretend that I know a lot about the batch normalization but it has became a common practice nowadays. Note: Batch normalization is placed right after the Conv2d/Linear layer and before the activation function. You may read the original paper if you are interested: https://arxiv.org/pdf/1502.03167.pdf <br>
## **Linear layer aka fully connected layer** 
Here the linear layer works as "fully connected" layer to represent and transforms the high dimensional pixel information to our final prediction (i.e. which class or which number the input image belongs to) <br>

In [None]:
class CNN(torch.nn.Module):

    def __init__(self, num_classes):
        super(CNN, self).__init__()

        ##forward layers:
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1) 
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.ff1 = nn.Linear(64*7*7, 128) 
        self.out = nn.Linear(128, num_classes)    

        ##activations:
        self.relu = nn.ReLU()

        #other activations:
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        # pooling layer:
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        
        #dropout:
        self.do1 = nn.Dropout(0.1)
        self.do2 = nn.Dropout(0.25)

        #batch-normalization:
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)

    def forward(self, x):
        # conv 1
        x = self.conv1(x) # 28*28*1 >(32 3*3 pad 1)> 28*28*32
        x = self.bn1(x)
        x = self.relu(x)
        x = self.do1(x)
        x = self.pool(x)
        # conv 2
        x = self.conv2(x) # 14*14*32 >(64 3*3 pad 1)> 14*14*64
        x = self.bn2(x)
        x = self.relu(x)
        x = self.do1(x)
        x = self.pool(x) # 7*7*64 
        # fc and output
        x = x.view(-1,7*7*64)
        x = self.ff1(x)
        x = self.do2(x)
        out = self.out(x)

        return out #returns class probabilities for each image

# 4. Initalize the model/loss function and use GPU if you can
Here I printed the dimesion of the parameters for each layer. For example, the first Conv2d layer has 32*1*3*3 (weight) + 32 (bias) parameters which is 320

In [None]:
model = CNN(num_classes=10)
for p in model.parameters():
    print(p.size())
gpu_boole = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(gpu_boole, device)
model = model.to(device)

I Use CrossEntropyLoss() for the Loss function. Use Adam optimizer for the model's parameter optimization. (I used the default setting for the Adam) <br>
I also made train_eval, val_eval and test_eval (if necessary, not actually used here) for online evaluation of the model each epoch. I don't want to print or record the loss or accuracy of each batch but each epoch. <br>
A few things to pay attention: <br>
1. If you use GPU, your data and label need to be on GPU as well. data.cuda() does the trick.
2. model.train() tells Pytorch it should run model in the training mode. Sometimes we use random dropout during training but we don't want to have these drop out during validation or prediction. model.eval() tells Pytorch it should run the model in the evaluation mode.

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

def train_eval():
    
    model.train()

    correct = 0
    total = 0
    loss_sum = 0
    # forward pass
    with torch.no_grad():
        for data, label in train_loader:
            if gpu_boole:
                data, label = data.cuda(), label.cuda()
            output = model(data)
            # calculate loss and error rate
            loss_sum += criterion(output, label)
            _, pred_label = torch.max(output.data, 1)
            total += label.size(0)
            correct += (pred_label.float() == label.float()).sum()

    # return average loss and average accuracy
    # after looping through the whole dataset
    return loss_sum.cpu().data.numpy().item()/total, 100.*correct/total 

def val_eval():

    model.eval()

    correct = 0
    total = 0
    loss_sum = 0
    # forward pass
    with torch.no_grad():
        for data, label in val_loader:
            if gpu_boole:
                data, label = data.cuda(), label.cuda()
            output = model(data)
            # calculate loss and error rate
            loss_sum += criterion(output, label)
            _, pred_label = torch.max(output.data, 1)
            total += label.size(0)
            correct += (pred_label.float() == label.float()).sum()

    # return average loss and average accuracy
    # after looping through the whole dataset
    return loss_sum.cpu().data.numpy().item()/total, 100.*correct/total 

def test_eval():

    model.eval()

    correct = 0
    total = 0
    loss_sum = 0
    # forward pass
    with torch.no_grad():
        for data, label in test_loader:
            if gpu_boole:
                data, label = data.cuda(), label.cuda()
            output = model(data)
            # calculate loss and error rate
            loss_sum += criterion(output, label)
            _, pred_label = torch.max(output.data, 1)
            total += label.size(0)
            correct += (pred_label.float() == label.float()).sum()

    # return average loss and average accuracy
    # after looping through the whole dataset
    return loss_sum.cpu().data.numpy().item()/total, 100.*correct/total 

# 5. Run the model
You can initialize the weights in your model. Alternatively, you can make this a class function inside your model. You can also use many other initialization but I don't think it would make a huge difference for this project.<br>
Similarly, tells Pytorch you are in training model by calling model.train() and make sure your model and data are on the GPU. 

In [None]:
def init_weights(model):
    # reference: https://stackoverflow.com/a/49433937
    if type(model) == nn.Linear or type(model) == nn.Conv2d:
        torch.nn.init.xavier_normal_(model.weight)
model.apply(init_weights)

results = []
loss_per_epoch = []
n_epochs = 50
n_epochs_per_eval = 5
best_train_acc = 0.
best_val_acc = 0.
for epoch in range(n_epochs):
    model.train()
    for i,(data,label) in enumerate(train_loader):
        if gpu_boole:
            data, label = data.cuda(), label.cuda()
        # calculate loss and gradient
        if i > 0 or epoch > 0:
            optimizer.zero_grad()
        output = model.forward(data)
        loss = criterion(output, label)
        loss.backward()

        # call the optimizer
        optimizer.step()
    
    if epoch % n_epochs_per_eval == 0:
        train_loss, train_acc = train_eval()
        val_loss, val_acc = val_eval()
        if train_acc > best_train_acc:
            best_train_acc = train_acc
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            print('Step {:5d}: We have a best validation accuracy of {:.3f}%.'.format(epoch, val_acc))
        results.append([epoch,train_loss,train_acc,val_loss, val_acc])

In [None]:
fig, ax = plt.subplots(2,1,sharex=True)
epoch, train_loss, train_acc, val_loss, val_acc = zip(*results)
ax[0].plot(epoch, train_loss, epoch, val_loss)
ax[0].legend(['Train loss', 'Validation Loss'])
ax[1].plot(epoch, train_acc, epoch, val_acc)
ax[1].legend(['Train accuracy', 'Validation accuracy'])
print ('best train accuracy: ', best_train_acc,'best validation accuracy: ',best_val_acc)

Compared with my Version 1, you don't really see overfitting happening, which means that adding drop out layer and more data augmentation helps. However, the loss curve goes kind of flat and perhaphs we can tune the Adam optimizer for a better optimization result.

In [None]:
### make a prediction on the test set
model.eval()
id = []
label = []
with torch.no_grad():
    for i, data in enumerate(test_loader):
        if gpu_boole:
            data = data.cuda()
        output = model(data)
        _, pred_label = torch.max(output.data, 1)
        id.append(i+1)
        label.append(pred_label.cpu().item())
df_pred = pd.DataFrame({'ImageID':id, 'Label': label})
# df_pred.to_csv('/content/drive/MyDrive/Colab Notebooks/Digit Recognizer/prediction.csv', index = False)
df_pred.to_csv('prediction.csv', index = False)

What if I use all the data from train.csv for the model training? Spoiler alert, it doesn't improve too much. Honestly I think it's nice to use some data for the validation purpose so that you can monitor your model performance better and identify overfitting early.

In [None]:
train = MyMNIST(df, train_transform, test = False)
train_loader = DataLoader(train, batch_size = 100, shuffle = True)

model.apply(init_weights)

results = []
loss_per_epoch = []
n_epochs = 50
n_epochs_per_eval = 5
best_train_acc = 0.
for epoch in range(n_epochs):
    model.train()
    for i,(data,label) in enumerate(train_loader):
        if gpu_boole:
            data, label = data.cuda(), label.cuda()
        # calculate loss and gradient
        if i > 0 or epoch > 0:
            optimizer.zero_grad()
        output = model.forward(data)
        loss = criterion(output, label)
        loss.backward()

        # call the optimizer
        optimizer.step()

    if epoch % n_epochs_per_eval == 0:
        train_loss, train_acc = train_eval()
        if train_acc > best_train_acc:
            best_train_acc = train_acc
            print('Step {:5d}: We have a best training accuracy of {:.3f}%.'.format(epoch, train_acc))
            
        results.append([epoch,train_loss,train_acc])
fig, ax = plt.subplots(2,1,sharex=True)
epoch, train_loss, train_acc = zip(*results)
ax[0].plot(epoch, train_loss)
ax[0].legend(['Train loss'])
ax[1].plot(epoch, train_acc)
ax[1].legend(['Train accuracy'])
print ('best train accuracy: ', best_train_acc)

model.eval()
id = []
label = []
with torch.no_grad():
    for i, data in enumerate(test_loader):
        if gpu_boole:
            data = data.cuda()
        output = model(data)
        _, pred_label = torch.max(output.data, 1)
        id.append(i+1)
        label.append(pred_label.cpu().item())
df_pred = pd.DataFrame({'ImageID':id, 'Label': label})
# df_pred.to_csv('/content/drive/MyDrive/Colab Notebooks/Digit Recognizer/prediction_v2.csv', index = False)
df_pred.to_csv('prediction_with_all.csv', index = False)