In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms

# LeNet-style CNN used for MNIST digit classification
class inout(nn.Module):
    """LeNet-5 style convolutional classifier for 28x28 images.

    The widths 6, 16 and 120 (feature maps) and 84 (hidden units) are the
    ones used by the original LeNet model.

    Args:
        in_channels: Number of channels of the input images. The default 1
            matches single-channel (grayscale) input.
        num_classes: Number of output logits. The default 10 matches the ten
            digit classes.

    Note:
        ``fc1`` expects exactly 120 flattened features, so the output of
        ``conv3`` must be 1x1 spatially. With the layers below that means the
        input images must be 28x28.
    """

    def __init__(self, in_channels=1, num_classes=10):
        super().__init__()

        # Output size of a convolution along one dimension:
        #   out = (in + 2 * padding - kernel) // stride + 1
        # padding=2 with a 5x5 kernel preserves the input resolution:
        #   (28 + 4 - 5) // 1 + 1 = 28
        # The 6 filters pick up low-level features such as edges and corners.
        self.conv1 = nn.Conv2d(in_channels, 6, kernel_size=5, stride=1, padding=2)

        # The stride is the number of pixels the kernel moves at each step
        # (1 = it visits every position). No padding from here on, so each
        # 5x5 convolution shrinks the feature map by 4 pixels per dimension.
        # The 16 filters capture mid-level features such as textures.
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)

        # 120 filters over the remaining 5x5 map: high-level features such as
        # digit shapes and structures.
        self.conv3 = nn.Conv2d(16, 120, kernel_size=5, stride=1, padding=0)

        self.fc1 = nn.Linear(120, 84)           # 120 features in, 84 out
        self.fc2 = nn.Linear(84, num_classes)   # 84 features in, one logit per class out

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: Tensor of shape (batch, in_channels, 28, 28).

        Returns:
            Tensor of shape (batch, num_classes) with raw scores; no softmax
            is applied here.
        """
        # ReLU after each convolution introduces the non-linearity.
        x = torch.relu(self.conv1(x))                       # (batch, 6, 28, 28)
        # Max pooling keeps the largest value of every 2x2 window, halving
        # the resolution while retaining the strongest activations.
        x = torch.max_pool2d(x, kernel_size=2, stride=2)    # (batch, 6, 14, 14)

        x = torch.relu(self.conv2(x))                       # (batch, 16, 10, 10)
        x = torch.max_pool2d(x, kernel_size=2, stride=2)    # (batch, 16, 5, 5)

        x = torch.relu(self.conv3(x))                       # (batch, 120, 1, 1)

        # Flatten everything except the batch dimension.
        x = x.view(x.size(0), -1)                           # (batch, 120)

        # y = W1 . x + b1 followed by ReLU: 120 features -> 84 features
        x = torch.relu(self.fc1(x))

        # z = W2 . y + b2: 84 features -> num_classes logits
        x = self.fc2(x)
        return x


# Loading dataset
# Compose chains several preprocessing steps. ToTensor turns each image into a
# float tensor scaled to [0, 1]; Normalize with mean 0.5 and std 0.5 then
# computes (x - 0.5) / 0.5, which re-centres the values into [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# Arguments shared by both MNIST splits (files go to ./data, downloaded if missing)
mnist_kwargs = dict(root='./data', download=True, transform=transform)
train_dataset = torchvision.datasets.MNIST(train=True, **mnist_kwargs)
val_dataset = torchvision.datasets.MNIST(train=False, **mnist_kwargs)

# Loaders hand out mini-batches; only the training data is shuffled
train_dl = DataLoader(dataset=train_dataset, batch_size=512, shuffle=True)
val_dl = DataLoader(dataset=val_dataset, batch_size=1000, shuffle=False)


# Function for training the model
def train(numb_epoch, lr=0.001, device='cuda'):
    """Train a fresh ``inout`` network on ``train_dl`` and return it.

    After every epoch the model is evaluated on ``val_dl`` with ``validate``
    and the mean training loss plus the validation accuracy are printed.

    Args:
        numb_epoch: Number of full passes over the training data.
        lr: Learning rate handed to the Adam optimizer.
        device: Device (string or ``torch.device``) that the model and every
            batch are moved to.

    Returns:
        The trained ``inout`` model, located on ``device``.
    """
    cnn = inout().to(device)

    criterion = nn.CrossEntropyLoss()                   # cost function comparing logits with labels
    optimizer = optim.Adam(cnn.parameters(), lr=lr)     # Adam updates the weights after each batch

    for epoch in range(numb_epoch):
        # validate() leaves the model in eval mode, so switch back every epoch.
        cnn.train()
        running_loss = 0.0                              # sum of the batch losses of this epoch

        for images, labels in train_dl:
            # Inputs must live on the same device as the model.
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()                       # clear gradients left over from the previous batch
            outputs = cnn(images)                       # forward pass
            loss = criterion(outputs, labels)
            loss.backward()                             # backpropagation: gradients of the loss w.r.t. the parameters
            optimizer.step()                            # parameter update

            running_loss += loss.item()

        # Evaluate once per epoch on the validation data.
        accuracy = float(validate(cnn, val_dl, device))

        print(f"Epoch {epoch+1}/{numb_epoch}, Loss: {running_loss/len(train_dl)}, Accuracy: {accuracy}")

    return cnn

# Function for validating the model
def validate(model, data, device):
    """Return the fraction of samples in ``data`` that ``model`` classifies correctly.

    Args:
        model: Module that maps a batch of inputs to per-class scores, with
            the classes along dimension 1.
        data: Iterable of ``(images, labels)`` batches, e.g. a ``DataLoader``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``correct / total`` as a float in [0, 1], or ``0.0`` when ``data``
        yields no samples.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    model.eval()                                        # evaluation mode for inference
    correct = 0
    total = 0
    # Only predictions are needed, so gradient tracking is switched off.
    with torch.no_grad():
        for images, labels in data:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            # The index of the largest score along dim 1 is the predicted class.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard against an empty dataset instead of dividing by zero.
    if total == 0:
        return 0.0
    return correct / total

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Train for 5 epochs with a learning rate of 0.015 and keep the resulting model
trained_model = train(numb_epoch=5, lr=0.015, device=device)


Epoch 1/5, Loss: 0.3359263442393582, Accuracy: 0.9791
Epoch 2/5, Loss: 0.05947585028233165, Accuracy: 0.9855
Epoch 3/5, Loss: 0.04359694044658188, Accuracy: 0.9829
Epoch 4/5, Loss: 0.04381925125716854, Accuracy: 0.9814
Epoch 5/5, Loss: 0.033892418676198036, Accuracy: 0.9881
