In [1]:
# Mount Google Drive

from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os

os.chdir("gdrive/MyDrive")

In [3]:
%cd ./'Colab Notebooks'

/content/gdrive/MyDrive/Colab Notebooks


In [4]:
# setup, importing libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, confusion_matrix

# Checking availability of GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [5]:
# loading/preprocessing data

# Define dataset paths
train_file_path = '/content/gdrive/MyDrive/data/mnist_train.csv'
test_file_path = '/content/gdrive/MyDrive/data/mnist_test.csv'

# Read the CSV files
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

# Print first 5 rows of the train and test data to check the upload
print("First 5 rows of training data:")
print(train_data.head())

print("First 5 rows of test data:")
print(test_data.head())

# Separating features and labels
X_train = train_data.iloc[:, 1:].values / 255.0  # Normalizing pixel values
Y_train = train_data.iloc[:, 0].values

X_test = test_data.iloc[:, 1:].values / 255.0  # Normalizing pixel values
Y_test = test_data.iloc[:, 0].values

# tensor : Multi-dimensional array that is a generalization of vectors and matrices to higher dimensions
# Fundamental data structure in PyTorch (and other DL frameworks), enabling efficient computation on GPU and CPU
# 0-D tensor (scalar): A single number, 1-D tensor (vector): A 1D array of numbers,
# 2-D tensor (matrix): A 2D array of numbers, N-D tensor: An N-dimensional array of numbers

# converting numpy array to PyTorch tensors for usage in models and computations
# feature tensors : torch.float32
# label tensors : torch.int64 or torch.long

X_train = torch.tensor(X_train, dtype=torch.float32)
Y_train = torch.tensor(Y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
Y_test = torch.tensor(Y_test, dtype=torch.long)

# Creating PyTorch datasets
# TensorDataset combines the feature tensors and label tensors into a single dataset object
# This object can be passed to DataLoader for iteration
train_dataset = TensorDataset(X_train, Y_train)
test_dataset = TensorDataset(X_test, Y_test)

# train_loader / test_loader are iterators created to efficiently iterate over the datasets
# batch_size=32 : each batch will contain 32 samples
# shuffle=True : ensures that the model sees the training data in a different order each epoch, which helps with generalization.
# shuffle=False : ensures that the model maintains the order of testing data for evaluation purposes.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Print the shape of the datasets
print("Shape of X_train:", X_train.shape)
print("Shape of Y_train:", Y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of Y_test:", Y_test.shape)

First 5 rows of training data:
   label  1x1  1x2  1x3  1x4  1x5  1x6  1x7  1x8  1x9  ...  28x19  28x20  \
0      5    0    0    0    0    0    0    0    0    0  ...      0      0   
1      0    0    0    0    0    0    0    0    0    0  ...      0      0   
2      4    0    0    0    0    0    0    0    0    0  ...      0      0   
3      1    0    0    0    0    0    0    0    0    0  ...      0      0   
4      9    0    0    0    0    0    0    0    0    0  ...      0      0   

   28x21  28x22  28x23  28x24  28x25  28x26  28x27  28x28  
0      0      0      0      0      0      0      0      0  
1      0      0      0      0      0      0      0      0  
2      0      0      0      0      0      0      0      0  
3      0      0      0      0      0      0      0      0  
4      0      0      0      0      0      0      0      0  

[5 rows x 785 columns]
First 5 rows of test data:
   label  1x1  1x2  1x3  1x4  1x5  1x6  1x7  1x8  1x9  ...  28x19  28x20  \
0      7    0    0    0  

In [6]:
# defining the neural network model
# The neural network model is defined by creating a subclass of nn.Module, (base class for all
# neural network modules in PyTorch. The class NeuralNetwork represents the architecture of the neural network.
# super() call (nn.Module) is necessary to initialize the base class properly.
# nn.Linear(input_dimensions, output_dimensions) for individual layers
# activation functions (for non-linearity) -
#   nn.Relu() : sets all negative values to zero and keeps positive values unchanged.
#   nn.Softmax(dim=1) :
#     converts the raw output scores into probability distribution by normalizing the values so that they sum to 1
#     dim=1 indicates that the softmax function is applied along the dimension corresponding to the classes.
#     typically used for the output layer of a neural network, especially in multi-class classification problems.
#     output of Softmax function can be interpreted as the probability that the input belongs to each class.
#     ease of interpretability for which is most likely class by selecting the one with the highest probability.

class NeuralNetwork(nn.Module):
  def __init__(self):
    super(NeuralNetwork, self).__init__()
    self.fc1 = nn.Linear(784, 128)
    self.fc2 = nn.Linear(128, 64)
    self.fc3 = nn.Linear(64, 10)
    self.relu = nn.ReLU()
    self.softmax = nn.Softmax(dim=1)

  # forward() method defines how the input data flows through the network layers during the forward pass
  # x : training tensor X. It is processed through the layers
  # the result of passing through a layer while applying its respective activation functions is stored back in x,
  # the process is repeated, and the final output x contains the probabilities for each of the 10 classes.
  def forward(self, x):
    x = self.relu(self.fc1(x))
    x = self.relu(self.fc2(x))
    x = self.softmax(self.fc3(x))
    return x

# creates an instance of NeuralNetwork class and .to(device) moves the model to the specified device ('device'). If
# GPU is available, the model will moves to the GPU for faster computation. If not, it remains on the CPU.
model = NeuralNetwork().to(device)

# commonly used for classification tasks. combines LogSoftmax and NLLLoss (Negative Log Likelihood Loss) in one single class.
criterion = nn.CrossEntropyLoss()

# defining the optimizer to use for updating the model's weights during training.
# Adam is an optimization algorithm that adjusts the learning rate based on the training process.
# model.parameters(): retrieves all the parameters of the model that need to be updated.
# lr=0.001 : setting learning rate (how much the model's weights are adjusted w.r.t. loss gradients)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Printing model summary
print(model)

NeuralNetwork(
  (fc1): Linear(in_features=784, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=10, bias=True)
  (relu): ReLU()
  (softmax): Softmax(dim=1)
)


In [7]:
# training the model

# each epoch is one complete pass over the training data set
num_epochs = 20

for epoch in range(num_epochs):
  # set the model to training mode. Important for certain layers that behave differently during
  # training and evaluation (e.g., dropout, batch normalization)
  model.train()

  # variable to calculate the accumulated loss over each batch in the current epoch. helps to calculate average loss for the epoch.
  running_loss = 0.0

  for X_batch, Y_batch in train_loader:
    # Move the batch of data to the specified device (CPU or GPU) for computation to
    # ensure that both the input data and labels are on the same device as the model.
    X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)

    # Clearing the gradients of all optimized parameters (important because gradients accumulate by default in PyTorch)
    optimizer.zero_grad()

    # conducting a forward pass through the model to obtain predictions for the current batch
    # outputs : raw output scores (logits) from the model for the current batch
    outputs = model(X_batch)

    # calculating the loss between the model's predictions and the true labels
    # criterion: The loss function defined earlier (nn.CrossEntropyLoss())
    # loss: The computed loss value for the current batch
    loss = criterion(outputs, Y_batch)

    # performing a backward pass to compute the gradients of the loss with respect to the model's parameters
    # pytorch.backward() is called on the loss tensor
    # computes the gradients of the loss with respect to each parameter (weight and bias) in the network using the chain rule
    # of calculus. These gradients are stored in the .grad attribute of each parameter.
    loss.backward()

    # updating the model's parameters using the computed gradients. optimizer performs a step of gradient descent optimization.
    optimizer.step()

    # accumulating the loss for the current batch
    # loss.item(): converts the loss tensor to a Python scalar and adds it to running_loss
    running_loss += loss.item()


  print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

print("Training complete")

# Summary of Training Process
# Set Number of Epochs: Define how many times the model will pass through the entire training dataset.
# Epoch Loop: Iterate over the specified number of epochs.
# Set Training Mode: Ensure the model is in training mode.
# Batch Loop: Iterate over batches of data.
# Move Data to Device: Transfer the data to the appropriate device (CPU/GPU).
# Zero Gradients: Clear the accumulated gradients.
# Forward Pass: Compute the model's predictions for the current batch.
# Compute Loss: Calculate the loss between predictions and true labels.
# Backward Pass: Perform backpropagation to compute gradients.
# Update Parameters: Adjust the model's parameters based on the gradients.
# Accumulate Loss: Keep track of the total loss for the current epoch.
# Print Epoch Loss: Output the average loss for the current epoch.
# End of Training: Indicate that the training process is finished.



Epoch [1/20], Loss: 1.6007
Epoch [2/20], Loss: 1.5182
Epoch [3/20], Loss: 1.5052
Epoch [4/20], Loss: 1.4970
Epoch [5/20], Loss: 1.4922
Epoch [6/20], Loss: 1.4887
Epoch [7/20], Loss: 1.4854
Epoch [8/20], Loss: 1.4841
Epoch [9/20], Loss: 1.4821
Epoch [10/20], Loss: 1.4801
Epoch [11/20], Loss: 1.4802
Epoch [12/20], Loss: 1.4783
Epoch [13/20], Loss: 1.4788
Epoch [14/20], Loss: 1.4785
Epoch [15/20], Loss: 1.4768
Epoch [16/20], Loss: 1.4764
Epoch [17/20], Loss: 1.4759
Epoch [18/20], Loss: 1.4756
Epoch [19/20], Loss: 1.4748
Epoch [20/20], Loss: 1.4746
Training complete


In [8]:
# evaluating the model

# Set the model to evaluation mode. This is important because certain layers, such as dropout and batch normalization,
# behave differently during training and evaluation.
# model.eval(): Disables dropout and uses running statistics for batch normalization instead of batch statistics.
model.eval()

# Disabling gradient computation for efficiency. During evaluation, gradients are not needed,
# so disabling them saves memory and computation time.
# torch.no_grad(): Context manager that disables gradient calculation.
with torch.no_grad():
  # initializing counters for training data
  correct_train = 0
  total_train = 0

  # initializing counters for test data
  correct_test = 0
  total_test = 0

  for X_batch, Y_batch in train_loader:
    # Move the input features and labels to the appropriate device (CPU/GPU)
    X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)

    outputs = model(X_batch)

    # torch.max(outputs.data, 1) returns the index of the maximum value along the specified dimension (1 in this case,
    # which corresponds to the class probabilities). The result is the predicted class for each sample in the batch
    _, predicted = torch.max(outputs.data, 1)

    # updating counters
    total_train +=Y_batch.size(0)

    correct_train += (predicted == Y_batch).sum().item()

  train_accuracy = 100 * correct_train / total_train

  # repeating process for test data

  correct_test = 0
  total_test = 0
  for X_batch, Y_batch in test_loader:
    X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
    outputs = model(X_batch)
    _, predicted = torch.max(outputs.data, 1)
    total_test += Y_batch.size(0)
    correct_test += (predicted == Y_batch).sum().item()

  test_accuracy = 100 * correct_test / total_test

  print(f"Training Accuracy: {train_accuracy:.2f}%")
  print(f"Test Accuracy: {test_accuracy:.2f}%")


Training Accuracy: 98.59%
Test Accuracy: 97.18%


In [9]:
# Cell 6: Generate Predictions
model.eval()
with torch.no_grad():
  train_predictions = []
  train_true_labels = []
  for X_batch, Y_batch in train_loader:
    X_batch = X_batch.to(device)
    outputs = model(X_batch)
    _, predicted = torch.max(outputs.data, 1)

    # Convert the predicted class labels to a NumPy array and append them to train_predictions._.cpu()
    # ensures the data is moved back to the CPU before converting to a NumPy array.
    train_predictions.extend(predicted.cpu().numpy())

    # Convert the true labels to a NumPy array and append them to train_true_labels.
    train_true_labels.extend(Y_batch.numpy())

  test_predictions = []
  test_true_labels = []
  for X_batch, Y_batch in test_loader:
    X_batch = X_batch.to(device)
    outputs = model(X_batch)
    _, predicted = torch.max(outputs.data, 1)
    test_predictions.extend(predicted.cpu().numpy())
    test_true_labels.extend(Y_batch.numpy())

# Convert the lists of predictions and true labels to NumPy arrays for further analysis and evaluation.
# np.array(): Converts a list to a NumPy array.
train_predictions = np.array(train_predictions)
train_true_labels = np.array(train_true_labels)
test_predictions = np.array(test_predictions)
test_true_labels = np.array(test_true_labels)

# Print the first 5 predictions and their corresponding true labels for training and test data
print("First 5 training predictions:", train_predictions[:5])
print("First 5 training true labels:", train_true_labels[:5])
print("First 5 test predictions:", test_predictions[:5])
print("First 5 test true labels:", test_true_labels[:5])

First 5 training predictions: [1 2 6 6 3]
First 5 training true labels: [1 2 6 6 3]
First 5 test predictions: [7 2 1 0 4]
First 5 test true labels: [7 2 1 0 4]


In [10]:
# evaluation metrics for both datasets

# Classification Report and Confusion Matrix for Training Data
print("Training Classification Report:")
print(classification_report(train_true_labels, train_predictions))

print("Training Confusion Matrix:")
print(confusion_matrix(train_true_labels, train_predictions))

# Classification Report and Confusion Matrix for Test Data
print("Test Classification Report:")
print(classification_report(test_true_labels, test_predictions))

print("Test Confusion Matrix:")
print(confusion_matrix(test_true_labels, test_predictions))

Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      5923
           1       0.99      0.99      0.99      6742
           2       0.99      0.98      0.98      5958
           3       0.99      0.97      0.98      6131
           4       0.99      0.99      0.99      5842
           5       0.96      0.99      0.98      5421
           6       1.00      0.99      0.99      5918
           7       0.98      0.99      0.99      6265
           8       0.98      0.98      0.98      5851
           9       0.98      0.98      0.98      5949

    accuracy                           0.99     60000
   macro avg       0.99      0.99      0.99     60000
weighted avg       0.99      0.99      0.99     60000

Training Confusion Matrix:
[[5850    1    9    2    3    9    9   22   14    4]
 [   1 6690   11    4    4    2    2    5    7   16]
 [   5   27 5834   22    8    1    3   24   24   10]
 [   0    1   26 5975  