## Training a feed forward neural network with pytorch

### Imports

In [47]:
import os
import torch
from torchvision import datasets, transforms
import kagglehub
import pandas as pd

### dataset

In [8]:
path = kagglehub.dataset_download("hojjatk/mnist-dataset")

Downloading from https://www.kaggle.com/api/v1/datasets/download/hojjatk/mnist-dataset?dataset_version_number=1...


100%|██████████| 22.0M/22.0M [00:03<00:00, 7.21MB/s]

Extracting files...





In [9]:
print(path)

/Users/simon/.cache/kagglehub/datasets/hojjatk/mnist-dataset/versions/1


In [11]:
# Define the file paths for the training and testing data
train_image = os.path.join(path, 'train-images.idx3-ubyte')  # Path to the training images file
train_label = os.path.join(path, 'train-labels.idx1-ubyte')  # Path to the training labels file

test_image = os.path.join(path, 't10k-images.idx3-ubyte')  # Path to the test images file
test_label = os.path.join(path, 't10k-labels.idx1-ubyte')  # Path to the test labels file

In [12]:
import struct
from array import array
import numpy as np

class MnistDataLoader:
    
    def __init__(self, train_image, train_label, test_image, test_label):
        self.train_image = train_image
        self.train_label = train_label
        self.test_image = test_image
        self.test_label = test_label

    def read_images_labels(self, img_path, label_path):

        # read images
        with open(label_path, 'rb') as file:
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError(f'Magic number mismatch, expected 2049, got {magic}')
            labels = np.array(array("B", file.read()))
        
        # read labels
        with open(img_path, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError(f'Magic number mismatch, expected 2051, got {magic}')
            image_data = np.array(array("B", file.read())).reshape(size, rows, cols)

        return image_data, labels
    
    def load_data(self):
        x_train, y_train = self.read_images_labels(self.train_image, self.train_label)
        x_test, y_test = self.read_images_labels(self.test_image, self.test_label)

        return (x_train, y_train), (x_test, y_test)

In [90]:
dataloader = MnistDataLoader(
    train_image,
    train_label,
    test_image,
    test_label,
)
(x_train, y_train), (x_test, y_test) = dataloader.load_data()

In [35]:
def create_batches(dataset, batch_size=32):
    x_train, y_train = dataset

    dataset_batches = []
    cnt = 0
    while True:
        start = cnt * batch_size
        end = (cnt + 1) * batch_size
        single_input_batch = x_train[start:end]
        single_output_batch = y_train[start:end]
        dataset_batches.append((single_input_batch, single_output_batch))
        if start >= len(x_train) or end >= len(x_train):
            break
        cnt += 1
    return dataset_batches

train_batches = create_batches((x_train, y_train))
test_batches = create_batches((x_test, y_test))

### model architecture numpy 

In [167]:
class NeuralNetwork:

    def __init__(self, input_size: int = 784, hidden_size: int = 512, output_size: int = 10):
        self.W1 = np.random.randn(input_size, hidden_size) * 1e-2
        self.b1 = np.zeros((1, hidden_size))

        self.W2 = np.random.randn(hidden_size, output_size) * 1e-2
        self.b2 = np.zeros((1, output_size))

    def relu_activation(self, x):
        return np.maximum(0, x)
    
    def forward(self, x):
        self.z1 = np.dot(x, self.W1) + self.b1
        self.a1 = self.relu_activation(self.z1)
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        return self.z2
    
# test
nn = NeuralNetwork()
sample_input = np.random.randn(1, 784)
logits = nn.forward(sample_input)
print(logits.shape)

(1, 10)


In [168]:
def categorical_xentropy(prediction, ground_truth):
    reg_prediction = np.exp(prediction - np.max(prediction))
    softmax = reg_prediction / np.sum(reg_prediction, axis=1, keepdims=True)
    loss = - np.sum(ground_truth * np.log(softmax + 1e-9))
    return softmax, loss

In [169]:
def backprop(x, ground_truth, W2, b2, a2, W1, b1, a1, alpha):

    dZ2 = a2 - ground_truth # 1 x output_size
    dW2 = np.dot(a1.T, dZ2) # hidden_size x 1 \cdot 1 x output_size = hidden_size x output_size
    db2 = np.sum(dZ2, axis=0, keepdims=True) # 1 x output_size

    dA1 = np.dot(dZ2, W2.T) # 1 x output_size \cdot output_size x hidden_size = 1 x hidden_size
    dZ1 = dA1 * (a1 > 0) # 1 x hidden_size
    dW1 = np.dot(x.T, dZ1) # input_size x 1 \cdot 1 x hidden_size = input_size x hidden_size
    db1 = np.sum(dZ1, axis=0, keepdims=True) # 1 x hidden_size
    
    return dW1, db1, dW2, db2

In [170]:
def optimizer_step(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W2 -= alpha * dW2
    b2 -= alpha * db2
    W1 -= alpha * dW1
    b1 -= alpha * db1
    return W2, b2, W1, b1

In [175]:
alpha = 1e-3
nb_epochs = 10
num_classes = 10
nn = NeuralNetwork()

for epoch in range(nb_epochs):

    avg_train_loss = 0
    for xtrain, ytrain in train_batches:

        xtrain = xtrain.reshape(-1, 28*28)
        xtrain = xtrain / 255
        ytrain = np.eye(num_classes)[ytrain]
        
        logits = nn.forward(xtrain)
        softm, loss = categorical_xentropy(logits, ytrain)
        avg_train_loss += loss
        dW1, db1, dW2, db2 = backprop(xtrain, ytrain, nn.W2, nn.b2, softm, nn.W1, nn.b1, nn.a1, alpha)
        nn.W2, nn.b2, nn.W1, nn.b1 = optimizer_step(nn.W1, nn.b1, nn.W2, nn.b2, dW1, db1, dW2, db2, alpha)
    
    avg_train_loss = avg_train_loss / len(train_batches)
    print(f"avg training loss: {avg_train_loss}")

    avg_test_loss = 0
    for xtest, ytest in test_batches:

        xtest = xtest.reshape(-1, 28*28)
        xtest = xtest / 255
        ytest = np.eye(num_classes)[ytest]

        test_logits = nn.forward(xtest)
        _, test_loss = categorical_xentropy(test_logits, ytest)
        avg_test_loss += test_loss
    avg_test_loss = avg_test_loss / len(test_batches)
    print(f"\taverage test loss: {avg_test_loss}")

avg training loss: 18.660388666293496
	average test loss: 9.817301256231262
avg training loss: 8.780908444943584
	average test loss: 7.6572189720979855
avg training loss: 6.968116733521247
	average test loss: 6.234337221322157
avg training loss: 5.737704256329014
	average test loss: 5.257933173340119
avg training loss: 4.850015461446986
	average test loss: 4.5640817617804865
avg training loss: 4.187423748747253
	average test loss: 4.0616120757158525
avg training loss: 3.6754391777321
	average test loss: 3.6880161506656166
avg training loss: 3.268530342195639
	average test loss: 3.4063118621479385
avg training loss: 2.936017501851357
	average test loss: 3.183390209780591
avg training loss: 2.658184733879685
	average test loss: 2.9966770547911143


### model architecture torch

In [255]:
class TorchNeuralNetwork:

    def __init__(self, input_size=784, hidden_size=512, output_size=10):
        
        torch.manual_seed(42)

        self.W1 = torch.randn(input_size, hidden_size, dtype=torch.float32) * 0.01
        self.b1 = torch.zeros(1, hidden_size, dtype=torch.float32)
        
        self.W2 = torch.randn(hidden_size, output_size, dtype=torch.float32) * 0.01
        self.b2 = torch.zeros(1, output_size, dtype=torch.float32)

    def activation(self, x):
        return torch.relu(x)
    
    def forward(self, x):
        self.z1 = torch.matmul(x, self.W1) + self.b1
        self.a1 = self.activation(self.z1)
        self.z2 = torch.matmul(self.a1, self.W2) + self.b2
        
        return self.z2


In [256]:
def torch_xentropy(logits, ground_truth):
    reg_logits = torch.exp(logits - torch.max(logits))
    softmax = reg_logits / torch.sum(reg_logits, dim=1, keepdim=True)
    loss = - torch.sum(ground_truth * torch.log(softmax + 1e-9))
    return softmax, loss

In [257]:
def torch_backprop(x, y, W1, b1, a1, W2, b2, a2):

    dZ2 = a2 - y
    dW2 = torch.matmul(a1.T, dZ2) # hidden_size x output_sze
    db2 = torch.sum(dZ2, dim=0, keepdims=True)

    da1 = torch.matmul(dZ2, W2.T) # 1 x  hidden_size
    dZ1 = da1 * (a1 > 0) # 1 x hidden_size
    dW1 = torch.matmul(x.T, dZ1) # input_size x hidden_size
    db1 = torch.sum(dZ1, dim=0, keepdims=True) # 1 x hidden_size

    return dW1, db1, dW2, db2

In [258]:
torch_nn = TorchNeuralNetwork()

for epoch in range(nb_epochs):

    avg_train_loss = 0
    for xtrain, ytrain in train_batches:

        # train dataset pre-processing
        xtrain = xtrain.reshape(-1, 28*28)
        xtrain = xtrain / 255
        ytrain = np.eye(num_classes)[ytrain]
        xtrain = torch.tensor(xtrain, dtype=torch.float32)
        ytrain = torch.tensor(ytrain, dtype=torch.float32)

        logits = torch_nn.forward(xtrain)
        softmax, loss = torch_xentropy(logits, ytrain)
        dW1, db1, dW2, db2 = torch_backprop(xtrain, ytrain, torch_nn.W1, torch_nn.b1, torch_nn.a1, torch_nn.W2, torch_nn.b2, softmax)
        torch_nn.W2, torch_nn.b2, torch_nn.W1, torch_nn.b1 = optimizer_step(torch_nn.W1, torch_nn.b1, torch_nn.W2, torch_nn.b2, dW1, db1, dW2, db2, alpha)

        avg_train_loss += loss
    avg_train_loss = avg_train_loss / len(train_batches)
    print(f"avg training loss: {avg_train_loss}")

    avg_test_loss = 0
    for xtest, ytest in test_batches:

        xtest = xtest.reshape(-1, 28*28)
        xtest = xtest / 255
        ytest = np.eye(num_classes)[ytest]
        xtest = torch.tensor(xtest, dtype=torch.float32)
        ytest = torch.tensor(ytest, dtype=torch.float32)

        logits = torch_nn.forward(xtest)
        softmax, test_loss = torch_xentropy(logits, ytest)
        
        avg_test_loss += test_loss

    avg_test_loss = avg_test_loss / len(test_batches)
    print(f"\tavg test loss: {avg_test_loss}")

avg training loss: 18.75467872619629
	avg test loss: 9.77198600769043
avg training loss: 8.728156089782715
	avg test loss: 7.543931484222412
avg training loss: 6.894233703613281
	avg test loss: 6.1518659591674805
avg training loss: 5.678622245788574
	avg test loss: 5.215864181518555
avg training loss: 4.814214706420898
	avg test loss: 4.5432000160217285
avg training loss: 4.1673994064331055
	avg test loss: 4.057517051696777
avg training loss: 3.6642508506774902
	avg test loss: 3.686331272125244
avg training loss: 3.2620186805725098
	avg test loss: 3.401247024536133
avg training loss: 2.931161642074585
	avg test loss: 3.177666425704956
avg training loss: 2.654737949371338
	avg test loss: 3.0041232109069824


### for a generic number of layers

In [303]:
class TorchNeuralNetworkGeneric:

    def __init__(self, sizes):
        
        torch.manual_seed(42)
        self.sizes = sizes
        self.parameters = {}
        self.activations = {}
        for l in range(1, len(sizes)):
            self.parameters[f"W{l}"] = torch.randn(sizes[l-1], sizes[l], dtype=torch.float32) * 0.01
            self.parameters[f"b{l}"] = torch.randn(1, sizes[l], dtype=torch.float32)

    def activation(self, x):
        return torch.relu(x)
    
    def forward(self, x):
        
        self.activations["A0"] = x
        for l in range(1, len(self.sizes)):
            Z = torch.matmul(self.activations[f"A{l-1}"], self.parameters[f"W{l}"]) + self.parameters[f"b{l}"]
            A = self.activation(Z)
            self.activations[f"Z{l}"], self.activations[f"A{l}"] = Z, A

        self.activations.pop(f"A{len(self.sizes) - 1}")
        return Z, self.activations

# test
nn = TorchNeuralNetworkGeneric([784, 512, 128, 10])
sample_input = torch.randn(1, 784, dtype=torch.float32)
logits, _ = nn.forward(sample_input)
print(logits.size())

torch.Size([1, 10])


In [313]:
def torch_backprop_generic(y, softmax, parameters, activations):
    
    grads = {}

    dZ = softmax - y

    for l in reversed(range(1, len(parameters) // 2 + 1)):
        # print(activations[f"A{l-1}"].T.dtype, dZ.dtype)
        grads[f"dW{l}"] = torch.matmul(activations[f"A{l-1}"].T, dZ)
        grads[f"db{l}"] = torch.sum(dZ, dim=0, keepdim=True)

        if l > 1:
            dA = torch.matmul(dZ, parameters[f"W{l}"].T)
            dZ = dA * (activations[f"A{l-1}"] > 0)
    return grads

In [314]:
def torch_optimizer_generic(parameters, grads, alpha):
    for l in range(1, len(parameters) // 2 + 1):
        parameters[f"W{l}"] -= alpha * grads[f"dW{l}"]
        parameters[f"b{l}"] -= alpha * grads[f"db{l}"]
    return parameters

In [315]:
torch_nn_generic = TorchNeuralNetworkGeneric([784, 256, 10])

for epoch in range(nb_epochs):

    avg_train_loss = 0
    for xtrain, ytrain in train_batches:

        #train set preprocessing
        xtrain = xtrain.reshape(-1, 784)
        xtrain = xtrain / 255
        xtrain = torch.tensor(xtrain, dtype=torch.float32)
        ytrain = np.eye(num_classes)[ytrain]
        ytrain = torch.tensor(ytrain, dtype=torch.float32)

        logits, _ = torch_nn_generic.forward(xtrain)
        softmax, loss = torch_xentropy(logits, ytrain)
        avg_train_loss += loss
        grads = torch_backprop_generic(ytrain, softmax, torch_nn_generic.parameters, torch_nn_generic.activations)
        torch_nn_generic.parameters = torch_optimizer_generic(torch_nn_generic.parameters, grads, alpha)
    
    avg_train_loss = avg_train_loss / len(train_batches)
    print(f"avg training loss:  {avg_train_loss}")

    avg_test_loss = 0
    for xtest, ytest in test_batches:

        #test set preprocessing
        xtest = xtest.reshape(-1, 784)
        xtest = xtest / 255
        xtest = torch.tensor(xtest, dtype=torch.float32)
        ytest = np.eye(num_classes)[ytest]
        ytest = torch.tensor(ytest, dtype=torch.float32)

        logits, _ = torch_nn_generic.forward(xtest)
        softmax, loss = torch_xentropy(logits, ytest)
        avg_test_loss += loss

    avg_test_loss = avg_test_loss / len(test_batches)
    print(f"\tavg test loss:  {avg_test_loss}")


avg training loss:  19.74854850769043
	avg test loss:  10.386972427368164
avg training loss:  9.35973834991455
	avg test loss:  8.373201370239258
avg training loss:  7.700314998626709
	avg test loss:  7.005049705505371
avg training loss:  6.455615997314453
	avg test loss:  5.962541103363037
avg training loss:  5.526224613189697
	avg test loss:  5.2019877433776855
avg training loss:  4.821743011474609
	avg test loss:  4.631757736206055
avg training loss:  4.2646660804748535
	avg test loss:  4.198936462402344
avg training loss:  3.8180737495422363
	avg test loss:  3.8690662384033203
avg training loss:  3.4525017738342285
	avg test loss:  3.6049489974975586
avg training loss:  3.149456739425659
	avg test loss:  3.395559787750244
