## Simple MNIST convnet

## Setup

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

## Prepare the data

In [1]:
# Model / data parameters
num_classes = 10
input_shape = (1, 28, 28)  # (channels, height, width): PyTorch is channels-first

# Preprocessing pipeline applied to every image:
#   1. ToTensor   -> tensor with pixel values scaled to [0, 1]
#   2. Normalize  -> (x - 0.5) / 0.5, i.e. values end up in [-1, 1]
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

# Build the train split first, then the test split (downloading on first use)
train_dataset, test_dataset = (
    datasets.MNIST(root="./data", train=is_train, transform=transform, download=True)
    for is_train in (True, False)
)

# Mini-batch iterators: only the training data is shuffled
batch_size = 128
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Sanity check: report how many samples each split holds
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of testing samples: {len(test_dataset)}")

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 55.7MB/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 1.94MB/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz





Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 12.1MB/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 13.5MB/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw

Number of training samples: 60000
Number of testing samples: 10000





## Build the model

In [6]:
import torch.nn.functional as F

class MNISTConvNet(nn.Module):
    """Small convolutional classifier for single-channel 28x28 images.

    Architecture: two (3x3 conv -> ReLU -> 2x2 max-pool) stages, then
    flatten -> dropout -> one linear layer.

    ``forward`` returns raw, unnormalized class scores (logits). This is the
    input ``nn.CrossEntropyLoss`` expects, since that loss applies
    log-softmax internally. Apply ``F.softmax(logits, dim=1)`` (or
    ``F.log_softmax``) explicitly when probabilities are needed.

    Args:
        num_classes: Number of output classes (size of the last dimension
            of the returned tensor).
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Convolutional feature extractor (no padding, stride 1).
        # Spatial size for a 28x28 input: 28 -> 26 -> 13 -> 11 -> 5.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.pool1 = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.pool2 = nn.MaxPool2d(kernel_size=2)

        # Classifier head: dropout followed by a single linear projection.
        self.dropout = nn.Dropout(0.5)
        # 64 channels * 5 * 5 spatial positions = 1600 input features.
        self.fc1 = nn.Linear(64 * 5 * 5, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: Tensor of shape (batch, 1, 28, 28); the linear layer's input
                size is fixed for this spatial resolution.

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalized logits.
        """
        x = self.pool1(F.relu(self.conv1(x)))  # conv -> ReLU -> max-pool
        x = self.pool2(F.relu(self.conv2(x)))  # conv -> ReLU -> max-pool

        x = torch.flatten(x, 1)  # keep the batch dimension, flatten the rest
        x = self.dropout(x)      # only active in training mode
        # Return logits: the loss function performs the (log-)softmax itself,
        # so normalizing here would be redundant.
        return self.fc1(x)

# Instantiate the model using the class count from the data-parameters cell
# so the two cannot drift apart.
model = MNISTConvNet(num_classes=num_classes)

# Print model summary
print(model)

# Smoke-test the model with a dummy input to verify the output shape.
# No gradients are needed for a shape check, so skip building the autograd graph.
dummy_input = torch.randn(1, *input_shape)  # Batch size of 1
with torch.no_grad():
    output = model(dummy_input)
print("Output shape:", output.shape)  # Expected shape: [1, num_classes]
assert output.shape == (1, num_classes), f"unexpected output shape {tuple(output.shape)}"

MNISTConvNet(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=1600, out_features=10, bias=True)
)
Output shape: torch.Size([1, 10])


## Train the model

In [7]:
from torch.utils.data import random_split
from tqdm import tqdm

# Define hyperparameters
batch_size = 128
epochs = 15
learning_rate = 0.001
split_seed = 42  # fixes the train/validation partition across re-runs of this cell

# Define loss function and optimizer.
# CrossEntropyLoss applies log-softmax internally before the negative log-likelihood.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Split train dataset into training (90%) and validation (10%) sets.
# The split is seeded: an unseeded split would be different every time this
# cell runs, so continuing to train an already-trained model would move
# samples it has seen into the validation set.
train_size = int(0.9 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, val_subset = random_split(
    train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# Create DataLoaders (only the training subset is shuffled)
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Training loop
for epoch in range(epochs):
    # ---- Training step ----
    model.train()  # enables dropout
    train_loss = 0.0
    train_correct = 0
    total = 0

    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()  # Clear gradients from the previous step
        outputs = model(images)  # Forward pass
        loss = criterion(outputs, labels)  # Mean loss over this batch
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

        # Weight the batch-mean loss by the batch size so the epoch figure is
        # a true per-sample mean even though the last batch is smaller.
        train_loss += loss.item() * labels.size(0)
        preds = outputs.argmax(dim=1)  # Predicted class per sample
        train_correct += (preds == labels).sum().item()
        total += labels.size(0)

    train_accuracy = train_correct / total
    train_loss /= total

    # ---- Validation step ----
    model.eval()  # disables dropout
    val_loss = 0.0
    val_correct = 0
    total = 0

    with torch.no_grad():  # No gradients needed for validation
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * labels.size(0)  # sample-weighted, see above
            preds = outputs.argmax(dim=1)
            val_correct += (preds == labels).sum().item()
            total += labels.size(0)

    val_accuracy = val_correct / total
    val_loss /= total

    # Print epoch results
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

Epoch 1/15: 100%|██████████| 422/422 [00:16<00:00, 24.88it/s]


Epoch 1/15
Train Loss: 0.2912, Train Accuracy: 0.9127
Val Loss: 0.0882, Val Accuracy: 0.9762


Epoch 2/15: 100%|██████████| 422/422 [00:14<00:00, 29.44it/s]


Epoch 2/15
Train Loss: 0.0945, Train Accuracy: 0.9724
Val Loss: 0.0602, Val Accuracy: 0.9825


Epoch 3/15: 100%|██████████| 422/422 [00:13<00:00, 31.06it/s]


Epoch 3/15
Train Loss: 0.0729, Train Accuracy: 0.9775
Val Loss: 0.0486, Val Accuracy: 0.9855


Epoch 4/15: 100%|██████████| 422/422 [00:13<00:00, 30.66it/s]


Epoch 4/15
Train Loss: 0.0619, Train Accuracy: 0.9805
Val Loss: 0.0441, Val Accuracy: 0.9865


Epoch 5/15: 100%|██████████| 422/422 [00:15<00:00, 28.12it/s]


Epoch 5/15
Train Loss: 0.0545, Train Accuracy: 0.9829
Val Loss: 0.0411, Val Accuracy: 0.9875


Epoch 6/15: 100%|██████████| 422/422 [00:13<00:00, 30.79it/s]


Epoch 6/15
Train Loss: 0.0500, Train Accuracy: 0.9851
Val Loss: 0.0392, Val Accuracy: 0.9880


Epoch 7/15: 100%|██████████| 422/422 [00:13<00:00, 30.96it/s]


Epoch 7/15
Train Loss: 0.0469, Train Accuracy: 0.9851
Val Loss: 0.0373, Val Accuracy: 0.9887


Epoch 8/15: 100%|██████████| 422/422 [00:14<00:00, 29.49it/s]


Epoch 8/15
Train Loss: 0.0419, Train Accuracy: 0.9871
Val Loss: 0.0365, Val Accuracy: 0.9882


Epoch 9/15: 100%|██████████| 422/422 [00:13<00:00, 30.32it/s]


Epoch 9/15
Train Loss: 0.0406, Train Accuracy: 0.9873
Val Loss: 0.0386, Val Accuracy: 0.9895


Epoch 10/15: 100%|██████████| 422/422 [00:13<00:00, 30.97it/s]


Epoch 10/15
Train Loss: 0.0372, Train Accuracy: 0.9883
Val Loss: 0.0353, Val Accuracy: 0.9897


Epoch 11/15: 100%|██████████| 422/422 [00:13<00:00, 31.30it/s]


Epoch 11/15
Train Loss: 0.0377, Train Accuracy: 0.9876
Val Loss: 0.0331, Val Accuracy: 0.9905


Epoch 12/15: 100%|██████████| 422/422 [00:13<00:00, 31.10it/s]


Epoch 12/15
Train Loss: 0.0336, Train Accuracy: 0.9894
Val Loss: 0.0330, Val Accuracy: 0.9902


Epoch 13/15: 100%|██████████| 422/422 [00:13<00:00, 31.10it/s]


Epoch 13/15
Train Loss: 0.0324, Train Accuracy: 0.9894
Val Loss: 0.0336, Val Accuracy: 0.9913


Epoch 14/15: 100%|██████████| 422/422 [00:14<00:00, 28.93it/s]


Epoch 14/15
Train Loss: 0.0323, Train Accuracy: 0.9901
Val Loss: 0.0330, Val Accuracy: 0.9915


Epoch 15/15: 100%|██████████| 422/422 [00:13<00:00, 30.94it/s]


Epoch 15/15
Train Loss: 0.0288, Train Accuracy: 0.9901
Val Loss: 0.0316, Val Accuracy: 0.9920


## Evaluate the trained model

In [8]:
# Set the model to evaluation mode (disables dropout)
model.eval()

# Running totals over the whole test set
test_loss = 0.0
correct = 0
total = 0

# Define the loss function (returns the mean loss over each batch)
criterion = nn.CrossEntropyLoss()

# Disable gradient computation for evaluation
with torch.no_grad():
    for images, labels in test_loader:
        # Move data to the appropriate device (CPU or GPU)
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model(images)

        # Accumulate the loss weighted by batch size: the final batch is
        # smaller than the rest, so averaging per-batch means would
        # over-weight its samples.
        loss = criterion(outputs, labels)
        test_loss += loss.item() * labels.size(0)

        # Compute predictions and count the correct ones
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

# Per-sample average loss and overall accuracy
test_loss /= total
test_accuracy = correct / total

# Print results
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Loss: 0.0235
Test Accuracy: 0.9923
