## IMPORT LIBRARIES

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

## LOAD TRAINING IMAGES AND LABELS

In [None]:
# Read the raw pixel bytes of the training images. The IDX image file starts
# with a 16-byte header, so reading begins at byte offset 16.
with open("data/MNIST/raw/train-images-idx3-ubyte","rb") as f:
    f.seek(16)
    data = f.read()

# Interpret the bytes as unsigned 8-bit pixel intensities (one byte per pixel).
pixels = np.frombuffer(data, dtype=np.uint8)

# Shape (N, 1, 28, 28): -1 lets NumPy infer the image count, 1 is the single
# grayscale channel, 28x28 is the image size. Casting to float32 and dividing
# by 255 rescales the intensities from 0-255 to [0, 1].
images = pixels.reshape(-1, 1, 28, 28).astype(np.float32) / 255.0

# Wrap the NumPy array as a PyTorch tensor for the dataset built below.
images_tensor = torch.from_numpy(images)

In [None]:
# Read the training labels. The IDX label file starts with an 8-byte header,
# followed by one unsigned byte per label.
with open("data/MNIST/raw/train-labels-idx1-ubyte","rb") as f:
    f.read(8)  # skip the 8-byte header
    labels_data = f.read()

# np.frombuffer gives a read-only view over the bytes object; .copy() yields a
# writable array so torch.from_numpy can wrap it without a non-writable warning.
labels = np.frombuffer(labels_data, dtype=np.uint8).copy()

# nn.CrossEntropyLoss documents class-index targets as int64 (torch.long).
# uint8 targets are not accepted by every PyTorch version, so cast explicitly.
labels_tensor = torch.from_numpy(labels).long()

## CREATING TENSOR DATASET AND LOADER

In [None]:

# Pair every image with its label so that indexing the dataset yields an
# (image, label) tuple.
train_dataset = TensorDataset(images_tensor, labels_tensor)

# Mini-batch feeder for the training loop: 64 samples per batch, with the
# sample order reshuffled at the start of every epoch.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
)

## BUILDING THE NETWORK (28×28 PIXEL IMAGES FLATTENED TO A 784-ELEMENT VECTOR)

In [None]:
# Fully connected classifier for 28x28 grayscale digits.
# Data flow: (batch, 1, 28, 28) -> flatten to (batch, 784) -> 128 -> 64 -> 10.
# The final layer emits 10 raw logits, one per digit class 0-9 (no softmax
# here; the loss function applies it).
model = nn.Sequential(
    nn.Flatten(),                                       # (batch, 1, 28, 28) -> (batch, 784)
    nn.Linear(in_features=28 * 28, out_features=128),   # hidden layer 1
    nn.ReLU(),
    nn.Linear(in_features=128, out_features=64),        # hidden layer 2
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=10),         # output layer: one logit per digit
)

### Input layer: 784 neurons (flattened pixels).

### Hidden layer 1: 128 neurons + ReLU.

### Hidden layer 2: 64 neurons + ReLU.

### Output layer: 10 neurons (one per digit).

## LOSS FUNCTION AND OPTIMIZER

In [None]:
# Loss: cross-entropy computed directly on the raw logits (softmax is applied
# inside the loss, so the model itself has no softmax layer).
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam (Adaptive Moment Estimation) over all model parameters with
# a learning rate of 1e-3. It combines momentum-style smoothing with
# per-parameter adaptive learning rates.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

### criterion: Measures how wrong the model is (loss).

### optimizer: Adjusts the model’s parameters to reduce that loss.

In [24]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the model's weights and biases onto the chosen device. Kept as the
# cell's last expression so the notebook displays the model structure.
model.to(device)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=128, bias=True)
  (2): ReLU()
  (3): Linear(in_features=128, out_features=64, bias=True)
  (4): ReLU()
  (5): Linear(in_features=64, out_features=10, bias=True)
)

## TRAINING LOOP

In [25]:
NUM_EPOCHS = 25  # number of full passes over the training set

# Ensure training mode in case model.eval() was called before this cell ran
# (it makes no difference for this Flatten/Linear/ReLU network, but is the safe default).
model.train()

# NOTE: the model is not re-initialised in this cell, so re-running it continues
# training from the current weights instead of starting from scratch.
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0  # sum of per-batch losses for this epoch

    # Batch variables get their own names so they do not overwrite the
    # `images` / `labels` NumPy arrays created by the data-loading cells.
    for batch_images, batch_labels in train_loader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()                    # clear gradients; PyTorch accumulates them otherwise
        outputs = model(batch_images)            # forward pass -> logits
        loss = criterion(outputs, batch_labels)  # cross-entropy against the true digits
        loss.backward()                          # backpropagation: gradients w.r.t. all parameters
        optimizer.step()                         # Adam parameter update

        running_loss += loss.item()              # .item() converts the scalar tensor to a Python float

    # Report the mean batch loss for the epoch.
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")

Epoch 1, Loss: 0.0107
Epoch 2, Loss: 0.0108
Epoch 3, Loss: 0.0084
Epoch 4, Loss: 0.0098
Epoch 5, Loss: 0.0102
Epoch 6, Loss: 0.0058
Epoch 7, Loss: 0.0067
Epoch 8, Loss: 0.0109
Epoch 9, Loss: 0.0062
Epoch 10, Loss: 0.0045
Epoch 11, Loss: 0.0086
Epoch 12, Loss: 0.0087
Epoch 13, Loss: 0.0051
Epoch 14, Loss: 0.0026
Epoch 15, Loss: 0.0087
Epoch 16, Loss: 0.0048
Epoch 17, Loss: 0.0074
Epoch 18, Loss: 0.0052
Epoch 19, Loss: 0.0015
Epoch 20, Loss: 0.0095
Epoch 21, Loss: 0.0052
Epoch 22, Loss: 0.0059
Epoch 23, Loss: 0.0056
Epoch 24, Loss: 0.0059
Epoch 25, Loss: 0.0033


## LOADING TEST IMAGES AND LABELS

In [None]:
# Test images: pixel bytes start after the 16-byte IDX image header.
with open("data/MNIST/raw/t10k-images-idx3-ubyte","rb") as f:
    f.seek(16)
    test_data = f.read()

# uint8 pixels -> (N, 1, 28, 28) -> float32 scaled to [0, 1] -> tensor.
test_pixels = np.frombuffer(test_data, dtype=np.uint8)
test_images = test_pixels.reshape(-1, 1, 28, 28).astype(np.float32) / 255.0
test_images_tensor = torch.from_numpy(test_images)

# Test labels: label bytes start after the 8-byte IDX label header.
with open("data/MNIST/raw/t10k-labels-idx1-ubyte","rb") as f:
    f.seek(8)
    test_labels_data = f.read()

# .copy() turns the read-only frombuffer view into a writable array before it
# is wrapped as a tensor.
test_labels = np.frombuffer(test_labels_data, dtype=np.uint8).copy()
test_labels_tensor = torch.from_numpy(test_labels)

In [20]:
# Switch the model to evaluation (inference) mode before measuring test performance.
# This matters for layers that behave differently at inference time (e.g. Dropout/BatchNorm);
# this network contains only Flatten/Linear/ReLU, so here it is a safeguard.
model.eval() # returns the model, which the notebook displays as this cell's output

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=128, bias=True)
  (2): ReLU()
  (3): Linear(in_features=128, out_features=64, bias=True)
  (4): ReLU()
  (5): Linear(in_features=64, out_features=10, bias=True)
)

## METRICS

In [21]:
from sklearn.metrics import classification_report, f1_score


# Batched, unshuffled loader over the test set (64 samples per batch).
test_loader = DataLoader(TensorDataset(test_images_tensor, test_labels_tensor), batch_size=64)

# Accumulators for the predicted and the true digit of every test sample.
all_preds = []
all_labels = []

with torch.no_grad():  # evaluation only: skip gradient tracking to save memory and time
    for batch_images, batch_labels in test_loader:
        # Flatten each image to a 784-element row and move the batch to the model's device.
        flat_batch = batch_images.flatten(start_dim=1).to(device)
        logits = model(flat_batch)
        # The predicted digit is the index of the largest logit in each row.
        preds = logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        # Labels are only compared on the CPU, so they never need to leave it.
        all_labels.extend(batch_labels.numpy())

# Per-class precision / recall / F1 and support.
print(classification_report(all_labels, all_preds))

# Macro F1: unweighted mean of the per-class F1 scores.
print("F1 Score (macro):", f1_score(all_labels, all_preds, average='macro'))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       980
           1       0.99      0.99      0.99      1135
           2       0.96      0.98      0.97      1032
           3       0.97      0.98      0.97      1010
           4       0.98      0.98      0.98       982
           5       0.99      0.97      0.98       892
           6       0.99      0.97      0.98       958
           7       0.98      0.98      0.98      1028
           8       0.96      0.97      0.97       974
           9       0.98      0.97      0.98      1009

    accuracy                           0.98     10000
   macro avg       0.98      0.98      0.98     10000
weighted avg       0.98      0.98      0.98     10000

F1 Score (macro): 0.9786076560237223


## INFERENCE

In [26]:
def prepare_image(image_path, device="cpu", invert=False):
    """Load an image file and turn it into a single-sample input tensor.

    The image is converted to grayscale, resized to 28x28, scaled to [0, 1],
    flattened to 784 values and given a leading batch dimension.

    Args:
        image_path: Path of the image file to load.
        device: Device the returned tensor is moved to (default "cpu").
        invert: If True, flip intensities (1 - x) after scaling. MNIST digits
            are bright strokes on a dark background, so set this for
            dark-on-light images such as pen on paper. Default False keeps the
            original behaviour.

    Returns:
        A float32 tensor of shape (1, 784) on `device`.
    """
    # Imported locally so the function does not depend on another notebook
    # cell having imported PIL first.
    from PIL import Image

    # Open inside a context manager so the file handle is released promptly.
    # 'L' is PIL's single-channel 8-bit grayscale mode.
    with Image.open(image_path) as source:
        img = source.convert('L').resize((28, 28))

    # Scale the 0-255 pixel intensities to [0, 1], matching the training data.
    img_array = np.array(img, dtype=np.float32) / 255.0

    if invert:
        img_array = 1.0 - img_array

    # Flatten to a 784-element vector, then add the batch dimension -> (1, 784).
    img_flat = img_array.flatten()
    img_tensor = torch.tensor(img_flat, dtype=torch.float32).unsqueeze(0).to(device)

    return img_tensor


In [None]:
from PIL import Image
# Image to classify. Swap the file name for any digit 0-9 to try the others,
# e.g. "data/images/1.jpg" or "data/images/3.jpg".
image_path = "data/images/9.jpg"

# Load the file and convert it into a (1, 784) input tensor on the model's device.
img_tensor = prepare_image(image_path, device=device)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    output = model(img_tensor)  # raw logits, one per digit class

# The predicted digit is the index of the largest logit.
predicted_class = output.argmax(dim=1).item()

print(f"Predicted digit: {predicted_class}")


Predicted digit: 9


## REFERENCE

Each MNIST dataset file (`train-images-idx3-ubyte`, `train-labels-idx1-ubyte`, etc.) follows a specific binary structure.

---

### 1. **Image file (`*-images-idx3-ubyte`)**

The first **16 bytes** are the header:

| Offset (bytes) | Meaning             | Size |
| -------------- | ------------------- | ---- |
| 0–3            | Magic number (2051) | 4    |
| 4–7            | Number of images    | 4    |
| 8–11           | Rows per image (28) | 4    |
| 12–15          | Cols per image (28) | 4    |

* **16 bytes total header** → skipped with `f.read(16)` before reading actual pixel data.
* Each pixel is 1 byte (unsigned 8-bit integer).
* Total pixel data size: `num_images × 28 × 28` bytes.

---

### 2. **Label file (`*-labels-idx1-ubyte`)**

The first **8 bytes** are the header:

| Offset (bytes) | Meaning             | Size |
| -------------- | ------------------- | ---- |
| 0–3            | Magic number (2049) | 4    |
| 4–7            | Number of labels    | 4    |

* **8 bytes total header** → skipped with `f.read(8)`.
* Each label is 1 byte (values 0–9).

---

### 3. **Data normalization**

* MNIST pixel values range **0–255**.
* Dividing by `255.0` scales them to **[0, 1]**, which keeps gradient-based training numerically stable.

  ```python
  images = images.astype(np.float32)/255.0
  ```

---

### 4. **TensorDataset and DataLoader**

* `TensorDataset(images_tensor, labels_tensor)` pairs **images with labels**.
* `DataLoader(train_dataset, batch_size=64, shuffle=True)`:

  * Provides **mini-batches** for training.
  * Shuffling prevents the model from learning the order of samples.

---

### 5. **Model layers**

* **Feedforward network (MLP)**:

  ```python
  nn.Flatten(),
  nn.Linear(28*28,128),
  nn.ReLU(),
  nn.Linear(128,64),
  nn.ReLU(),
  nn.Linear(64,10)
  ```
* Fully connected layers + ReLU activations.
* Input: 784 pixels → Hidden: 128 → 64 → Output: 10 logits (digits 0–9).

---

### 6. **Loss and optimizer**

* `CrossEntropyLoss`: measures classification error for multi-class tasks.
* `Adam optimizer`: adaptive gradient descent algorithm, updates model parameters.

---

### 7. **Device management**

* `device = "cuda" if torch.cuda.is_available() else "cpu"` → selects GPU if available, else CPU.
* `model.to(device)` moves the model to the selected device.
* Data must also be moved to the same device for training/inference.

---

### 8. **Evaluation**

* `model.eval()` → sets the model to **evaluation mode** (important if using Dropout/BatchNorm).
* `torch.no_grad()` → disables gradient computation during testing → saves memory and speeds up inference.
* Predictions and metrics:

  ```python
  classification_report(all_labels, all_preds)
  f1_score(all_labels, all_preds, average='macro')
  ```

---

### 9. **Inference**

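* `prepare_image` → loads the image, converts it to grayscale, resizes it to 28×28, converts it to a NumPy array normalized to [0, 1], flattens it to 784 values, and wraps it in a tensor with a batch dimension.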
* `torch.no_grad()` → disables gradient computation during inference → saves memory and speeds up inference.
* Predictions:

  ```python
    predicted_class = torch.argmax(output, dim=1).item() # actual prediction

  ```

---

### 10. **Reference for MNIST Format**

* Official MNIST description by Yann LeCun: [https://yann.lecun.org/exdb/mnist/index.html](https://yann.lecun.org/exdb/mnist/index.html)