In [None]:
# Import torch, the main PyTorch library. It provides tensor computation (like NumPy) with strong GPU acceleration and deep neural networks built on a tape-based autograd system.
import torch

# Import nn module from PyTorch. This module provides a way to create and train neural networks. It includes a wide range of layer types, activation functions, and utilities for building deep learning models.
import torch.nn as nn

# Import functional module from PyTorch. This module contains functions for operations used in building neural networks, like activation functions (e.g., relu, sigmoid), pooling operations, and loss functions.
import torch.nn.functional as F

# Import optim module from PyTorch. This module includes various optimization algorithms for adjusting the parameters of your neural networks, such as SGD, Adam, etc.
import torch.optim as optim

# Import datasets and transforms from torchvision. 'torchvision' is a package in the PyTorch project that provides utilities for working with image data. It includes common datasets and transformations you can use to preprocess your data.
from torchvision import datasets, transforms

# This command uses the pip package installer to install the torchsummary package. torchsummary provides a simple way to see the details of your PyTorch model, such as the number of parameters and the shape of the output at each layer.
!pip install torchsummary

# Import the summary function from the torchsummary package. This function is used to display the model architecture in a clear and concise manner, including information on the output shapes of each layer and the number of parameters.
from torchsummary import summary




In [None]:
# torch is imported again here so this cell can be run on its own.
import torch

# Ask PyTorch whether a CUDA-capable GPU is usable in this session.
use_cuda = torch.cuda.is_available()

# Pick the compute device: the GPU when CUDA is available, the CPU otherwise.
device = torch.device("cuda") if use_cuda else torch.device("cpu")

# `device` is only assigned here; nothing is displayed by this cell.
# It is passed to `.to(device)` later to place the model and each batch.


In [None]:
# Number of samples per mini-batch, used by both loaders.
batch_size = 128

# Shared preprocessing for train and test so the two splits can never drift apart:
# convert each image to a tensor, then standardise it with the MNIST
# mean (0.1307) and standard deviation (0.3081).
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Training loader. The dataset is downloaded to '../data' if it is not already there.
# Shuffling gives every epoch a different batch composition, which is what
# stochastic gradient descent expects.
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True, transform=mnist_transform),
    batch_size=batch_size,
    shuffle=True)

# Test loader (train=False selects the held-out split).
# download=True makes this statement work even if it is run before the training
# loader has fetched the files. Evaluation metrics do not depend on sample order,
# so the data is not shuffled; this keeps evaluation deterministic.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, download=True, transform=mnist_transform),
    batch_size=batch_size,
    shuffle=False)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 90412052.80it/s]


Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 84888362.88it/s]


Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 30724595.60it/s]


Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 13156442.52it/s]

Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw






# Some Notes on our naive model

We are going to write a network based on what we have learnt so far.

The size of the input image is 28x28x1. We are going to add as many layers as required to reach a receptive field (RF) of at least 32.

In [None]:
class FirstDNN(nn.Module):
  """Naive fully convolutional digit classifier: seven 3x3 convolutions and two max-pools.

  The comment above each layer tracks the feature-map side (n), the jump (j) and
  the receptive field (r) for a 28x28 input, using

      n_out = floor((n_in + 2*p - k) / s) + 1
      j_out = j_in * s
      r_out = r_in + (k - 1) * j_in
  """
  def __init__(self):
    super(FirstDNN, self).__init__()
    # r_in:1 , n_in:28 , j_in:1 , s:1 , r_out:3 , n_out:28 , j_out:1
    self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
    # r_in:3 , n_in:28 , j_in:1 , s:1 , r_out:5 , n_out:28 , j_out:1
    self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
    # r_in:5 , n_in:28 , j_in:1 , s:2 , r_out:6 , n_out:14 , j_out:2
    self.pool1 = nn.MaxPool2d(2, 2)
    # r_in:6 , n_in:14 , j_in:2 , s:1 , r_out:10 , n_out:14 , j_out:2
    self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
    # r_in:10 , n_in:14 , j_in:2 , s:1 , r_out:14 , n_out:14 , j_out:2
    self.conv4 = nn.Conv2d(128, 256, 3, padding = 1)
    # r_in:14 , n_in:14 , j_in:2 , s:2 , r_out:16 , n_out:7 , j_out:4
    self.pool2 = nn.MaxPool2d(2, 2)
    # r_in:16 , n_in:7 , j_in:4 , s:1 , r_out:24 , n_out:5 , j_out:4
    self.conv5 = nn.Conv2d(256, 512, 3)
    # r_in:24 , n_in:5 , j_in:4 , s:1 , r_out:32 , n_out:3 , j_out:4
    self.conv6 = nn.Conv2d(512, 1024, 3)
    # r_in:32 , n_in:3 , j_in:4 , s:1 , r_out:40 , n_out:1 , j_out:4
    self.conv7 = nn.Conv2d(1024, 10, 3)
# Reference table of receptive-field values:
# https://user-images.githubusercontent.com/498461/238034116-7db4cec0-7738-42df-8b67-afa971428d39.png
  def forward(self, x):
    """Return per-class log-probabilities of shape (N, 10).

    Assumes `x` is a batch of shape (N, 1, 28, 28); with that size conv7 produces
    a 1x1 map, so the `view` below simply drops the two spatial dimensions.
    """
    x = self.pool1(F.relu(self.conv2(F.relu(self.conv1(x)))))
    x = self.pool2(F.relu(self.conv4(F.relu(self.conv3(x)))))
    x = F.relu(self.conv6(F.relu(self.conv5(x))))
    # No ReLU after the last convolution: clamping the class scores at zero would
    # discard every negative logit right before the softmax.
    x = self.conv7(x)
    x = x.view(-1, 10)
    # The class dimension is given explicitly; relying on the implicit choice is
    # deprecated and triggers a warning.
    return F.log_softmax(x, dim=1)


In [None]:
class FirstDNN(nn.Module):
  """Seven-layer convolutional classifier that emits 10 log-probabilities per image.

  Layer layout: two padded 3x3 convs, 2x2 max-pool, two more padded 3x3 convs,
  2x2 max-pool, then three unpadded 3x3 convs. Each unpadded conv trims the
  feature map by two pixels per side.
  """
  def __init__(self):
    # Register this object with nn.Module before any sub-module is attached.
    super().__init__()

    # Block 1: 1 -> 32 -> 64 channels; padding=1 keeps the spatial size.
    self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
    self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
    # 2x2 window with stride 2 halves height and width.
    self.pool1 = nn.MaxPool2d(2, 2)

    # Block 2: 64 -> 128 -> 256 channels; padding=1 keeps the spatial size.
    self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
    self.conv4 = nn.Conv2d(128, 256, 3, padding=1)
    # Second halving of height and width.
    self.pool2 = nn.MaxPool2d(2, 2)

    # Block 3: unpadded 3x3 convs, so each one shrinks the feature map.
    self.conv5 = nn.Conv2d(256, 512, 3)
    self.conv6 = nn.Conv2d(512, 1024, 3)
    # Final conv maps 1024 channels to the 10 class scores.
    self.conv7 = nn.Conv2d(1024, 10, 3)

  def forward(self, x):
    """Run the network on `x` and return log-probabilities reshaped to (-1, 10)."""
    # Block 1: conv -> ReLU -> conv -> ReLU -> pool.
    x = self.conv1(x)
    x = F.relu(x)
    x = self.conv2(x)
    x = F.relu(x)
    x = self.pool1(x)

    # Block 2: conv -> ReLU -> conv -> ReLU -> pool.
    x = self.conv3(x)
    x = F.relu(x)
    x = self.conv4(x)
    x = F.relu(x)
    x = self.pool2(x)

    # Block 3: conv -> ReLU -> conv -> ReLU.
    x = self.conv5(x)
    x = F.relu(x)
    x = self.conv6(x)
    x = F.relu(x)

    # Class scores; deliberately left without ReLU so negative logits survive.
    x = self.conv7(x)

    # Collapse to one row of 10 scores per sample, then normalise over that row.
    scores = x.view(-1, 10)
    return F.log_softmax(scores, dim=-1)

In [None]:
# Instantiate the network and move its parameters to the selected device (GPU or CPU).
model = FirstDNN().to(device)

In [None]:
# Print a per-layer table of output shapes and parameter counts for a single 1x28x28 input.
summary(model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 28, 28]             320
            Conv2d-2           [-1, 64, 28, 28]          18,496
         MaxPool2d-3           [-1, 64, 14, 14]               0
            Conv2d-4          [-1, 128, 14, 14]          73,856
            Conv2d-5          [-1, 256, 14, 14]         295,168
         MaxPool2d-6            [-1, 256, 7, 7]               0
            Conv2d-7            [-1, 512, 5, 5]       1,180,160
            Conv2d-8           [-1, 1024, 3, 3]       4,719,616
            Conv2d-9             [-1, 10, 1, 1]          92,170
Total params: 6,379,786
Trainable params: 6,379,786
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 1.51
Params size (MB): 24.34
Estimated Total Size (MB): 25.85
-------------------------------------

In [None]:
from tqdm import tqdm

def train(model, device, train_loader, optimizer, epoch):
    """Train `model` for one pass over `train_loader`.

    Args:
        model: Network to optimise. Its output is fed to `F.nll_loss`, so it is
            expected to return log-probabilities.
        device: Device that each batch is moved to before the forward pass.
        train_loader: Iterable yielding `(data, target)` batches.
        optimizer: Optimizer holding the parameters of `model`.
        epoch: Epoch number; used only to label the progress bar.

    Returns:
        None. Progress is reported through the tqdm bar.
    """
    model.train()  # Training mode (matters for layers such as dropout / batch norm).

    pbar = tqdm(train_loader)
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)

        # Gradients accumulate by default, so reset them before each backward pass.
        optimizer.zero_grad()

        output = model(data)
        loss = F.nll_loss(output, target)

        loss.backward()
        optimizer.step()

        # Show the epoch so bars from different epochs can be told apart, and use
        # the same 4-decimal loss format as the evaluation report.
        pbar.set_description(
            desc=f'epoch={epoch} loss={loss.item():.4f} batch_id={batch_idx}')


In [None]:
def test(model, device, test_loader):
    model.eval()  # Sets the model to evaluation mode (affects dropout, batch normalization, etc.).

    test_loss = 0  # Initializes the total test loss to 0.
    correct = 0  # Initializes the count of correct predictions to 0.

    with torch.no_grad():  # Disables gradient calculation to save memory and computations, since gradients are not needed for evaluation.
        for data, target in test_loader:  # Loops over batches of test data.
            data, target = data.to(device), target.to(device)  # Moves data and target tensors to the specified device.

            output = model(data)  # Feeds the input data through the model to get the output predictions.

            test_loss += F.nll_loss(output, target, reduction='sum').item()  # Sums up the loss for each batch.

            pred = output.argmax(dim=1, keepdim=True)  # Finds the predicted class with the highest probability.

            correct += pred.eq(target.view_as(pred)).sum().item()  # Counts the number of correct predictions.

    test_loss /= len(test_loader.dataset)  # Calculates the average loss per data point.

    # Prints the test set's average loss and accuracy.
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


In [None]:
# Optimizer: stochastic gradient descent over all model parameters.
#   lr=0.01      - step size of each parameter update.
#   momentum=0.9 - carries a fraction of the previous update into the next one.
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Number of full passes over the training set.
num_epochs = 2

# Epochs are numbered from 1. Each epoch trains on the training loader and then
# evaluates on the test loader.
for epoch in range(1, num_epochs + 1):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)


loss=0.062166977673769 batch_id=468: 100%|██████████| 469/469 [00:33<00:00, 14.16it/s]



Test set: Average loss: 0.0643, Accuracy: 9791/10000 (98%)



loss=0.07963237166404724 batch_id=468: 100%|██████████| 469/469 [00:31<00:00, 14.71it/s]



Test set: Average loss: 0.0417, Accuracy: 9864/10000 (99%)

