### Vanilla **Backpropagation Algorithm** on the MNIST dataset using a Feed-Forward Neural Network

#### Importing *MNIST* dataset

In [None]:
import torch
from torchvision import datasets
from torchvision.transforms import ToTensor

# Build the training split first and the test split second; both are stored
# under ./data, downloaded when missing, and converted with ToTensor().
mnist_train, mnist_test = (
    datasets.MNIST(
        root="data",
        train=is_training_split,
        download=True,
        transform=ToTensor(),
    )
    for is_training_split in (True, False)
)

In [None]:
# Show the first training image exactly as stored in the dataset's `.data` tensor.
mnist_train.data[0]

tensor([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,  18,
          18,  18, 126, 136, 175,  26, 166, 255, 247, 127,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   

#### Defining the Neural Network and Backpropagation Algorithm


In [None]:
from torch.nn.functional import one_hot

class FCNeuralNetwork():
    """Fully connected sigmoid network trained with hand-written backpropagation.

    Every layer, including the output layer, applies a sigmoid.  Samples are
    processed one at a time, so a "batch" of intermediate results is a Python
    list holding one entry per sample.

    Attributes:
        lr: Learning rate used by ``gradient_descent``.
        layer_sizes: Units per layer, input layer first.
        w: One weight matrix per non-input layer, shaped
            ``(layer_sizes[k], layer_sizes[k - 1])``.
        b: One bias vector per non-input layer, shaped ``(layer_sizes[k],)``.
        batch_size: Number of samples in the batch most recently passed to
            ``forward_pass`` (kept for callers that read it; the training
            methods themselves derive the size from their arguments).
    """

    def __init__(self, layer_sizes, lr):
        """Create the weights and biases.

        Args:
            layer_sizes: Sequence of layer widths, input layer first.
            lr: Learning rate.
        """
        self.lr = lr
        self.layer_sizes = layer_sizes
        self.w = []
        self.b = []
        for fan_in, fan_out in zip(self.layer_sizes, self.layer_sizes[1:]):
            # Small Gaussian weights with zero biases.
            self.w.append(0.01 * torch.randn(fan_out, fan_in))
            self.b.append(torch.zeros(fan_out))

    def cost_func(self, pred, y):
        """Return half the squared Euclidean distance between ``pred`` and ``y``."""
        err = pred - y
        return 0.5 * torch.dot(err, err)

    def cost_func_deriv(self, pred, y):
        """Return the derivative of ``cost_func`` with respect to ``pred``."""
        return pred - y

    def σ(self, z):
        """Apply the logistic sigmoid element-wise."""
        return torch.sigmoid(z)

    def z(self, a):
        """Invert the sigmoid (logit): recover ``z`` from ``a = σ(z)``."""
        return torch.log(a / (1 - a))

    def σ_prime(self, z):
        """Return the sigmoid derivative ``σ(z) * (1 - σ(z))``."""
        activation = torch.sigmoid(z)
        return activation * (1 - activation)

    def img_preprocessing(self, tensor):
        """Flatten one image to a vector and min-max scale it to ``[0, 1]``.

        A constant image (``max == min``) maps to all zeros instead of the
        NaN vector that the 0/0 division would otherwise produce.
        """
        flat = torch.reshape(torch.squeeze(tensor), (-1,))
        lowest = flat.min()
        value_range = flat.max() - lowest
        # Avoid 0/0: with a zero range the numerator is all zeros, so dividing
        # by one yields the desired all-zero vector.
        value_range = torch.where(
            value_range == 0, torch.ones_like(value_range), value_range
        )
        return (flat - lowest) / value_range

    def one_hot_encode(self, label):
        """Return a one-hot vector for ``label`` sized to the output layer.

        Args:
            label: Class index as a one-element tensor or a plain integer.
        """
        encoded = torch.zeros(self.layer_sizes[-1])
        encoded[int(label)] = 1
        return encoded

    def forward_pass(self, feature_batch, label_batch):
        """Run every sample of a batch through the network.

        Args:
            feature_batch: Indexable batch of images; each one is flattened
                and rescaled by ``img_preprocessing``.
            label_batch: Class index for each image, indexable in step with
                ``feature_batch``.

        Returns:
            A tuple ``(mean_cost, predictions, batch_a, batch_z)``:
            ``mean_cost`` is ``cost_func`` averaged over the batch,
            ``predictions`` holds the output activation per sample,
            ``batch_a`` holds per sample the list of activations (the
            preprocessed input first, then one entry per layer), and
            ``batch_z`` holds per sample the list of pre-activations.
        """
        self.batch_size = len(feature_batch)
        batch_a = []
        batch_z = []
        predictions = []
        total_cost = 0
        for idx in range(self.batch_size):
            layer_input = self.img_preprocessing(feature_batch[idx])
            activations = [layer_input]
            zs = []
            for w_i, b_i in zip(self.w, self.b):
                z_i = w_i @ layer_input + b_i
                layer_input = self.σ(z_i)
                zs.append(z_i)
                activations.append(layer_input)
            batch_a.append(activations)
            batch_z.append(zs)
            # After the loop `layer_input` is the output-layer activation.
            predictions.append(layer_input)
            y_target = self.one_hot_encode(label_batch[idx])
            total_cost += self.cost_func(layer_input, y_target)
        return total_cost / self.batch_size, predictions, batch_a, batch_z

    def backprop(self, batch_a, predictions, label_batch, batch_z):
        """Compute the per-layer error terms (deltas) for every sample.

        The output-layer delta is ``pred - y`` without a ``σ'`` factor, which
        is the gradient of a cross-entropy loss on sigmoid outputs; note that
        ``cost_func``, reported by ``forward_pass``, is the half squared error.
        Hidden-layer deltas follow the usual recursion
        ``δ_l = (W_{l+1}^T δ_{l+1}) * σ'(z_l)``.

        Args:
            batch_a: Per-sample activations from ``forward_pass``.  Not read
                here (``σ'`` is evaluated from ``batch_z``); kept so existing
                callers keep working.
            predictions: Output activation per sample.
            label_batch: Class index per sample.
            batch_z: Per-sample pre-activations from ``forward_pass``.

        Returns:
            One list per sample holding the deltas ordered from the first
            non-input layer to the output layer.
        """
        hidden_steps = len(self.layer_sizes) - 1
        batch_δ = []
        for prediction, label, zs in zip(predictions, label_batch, batch_z):
            y_target = self.one_hot_encode(label)
            δ = [self.cost_func_deriv(prediction, y_target)]
            for j in range(1, hidden_steps):
                # δ[0] is always the delta of the layer just above the one
                # being processed, and self.w[-j] maps this layer to it.
                δ_i = (self.w[-j].T @ δ[0]) * self.σ_prime(zs[-j - 1])
                δ.insert(0, δ_i)
            batch_δ.append(δ)
        return batch_δ

    def gradient_descent(self, batch_δ, batch_a):
        """Apply one averaged gradient step to all weights and biases in place.

        Args:
            batch_δ: Per-sample deltas as returned by ``backprop``.
            batch_a: Per-sample activations as returned by ``forward_pass``;
                ``batch_a[j][l]`` is the input to layer ``l`` for sample ``j``.
        """
        n_samples = len(batch_δ)
        if n_samples == 0:
            # Nothing to average over; leave the parameters untouched.
            return
        step = self.lr / n_samples
        for l in range(len(self.w)):
            bias_grad = sum(sample_δ[l] for sample_δ in batch_δ)
            weight_grad = torch.zeros_like(self.w[l])
            for sample_δ, sample_a in zip(batch_δ, batch_a):
                weight_grad += torch.outer(sample_δ[l], sample_a[l])
            self.b[l] -= step * bias_grad
            self.w[l] -= step * weight_grad



#### Initializing the Model and Data Loader

In [None]:
from torch.utils.data import DataLoader

# Hyperparameters for the training run.
epochs = 5
batch_size = 64
learning_rate = 0.5
# 28*28 flattened input pixels -> 128 hidden units -> 10 outputs (one per digit class).
layers = [28*28, 128, 10]
# Reshuffle the training set each time the loader is iterated.
train_dataloader = DataLoader(mnist_train, batch_size=batch_size, shuffle=True)
# NOTE(review): this iterator is not referenced by any cell visible here;
# confirm nothing else relies on it before removing.
iter_tensor_train = iter(train_dataloader)

# NOTE: `nn` is the model instance here, not the `torch.nn` module.
nn = FCNeuralNetwork(layers, learning_rate)

#### Training the Model

In [None]:
# Mini-batch training: for every batch run the forward pass, report the mean
# cost, back-propagate the errors and take one gradient-descent step.
for epoch in range(epochs):
    for i, batch in enumerate(train_dataloader):
        feature_batch, label_batch = batch
        forward_result = nn.forward_pass(feature_batch, label_batch)
        cost, predictions, batch_activations, batch_z = forward_result
        print(f"epoch #{epoch+1} batch #{i+1}, cost = {cost}")
        batch_δ = nn.backprop(
            batch_activations, predictions, label_batch, batch_z
        )
        nn.gradient_descent(batch_δ, batch_activations)

epoch #1 batch #1, cost = 1.2600024938583374
epoch #1 batch #2, cost = 0.49835315346717834
epoch #1 batch #3, cost = 0.4813790023326874
epoch #1 batch #4, cost = 0.46436452865600586
epoch #1 batch #5, cost = 0.45931899547576904
epoch #1 batch #6, cost = 0.45957010984420776
epoch #1 batch #7, cost = 0.45819219946861267
epoch #1 batch #8, cost = 0.45516660809516907
epoch #1 batch #9, cost = 0.45450952649116516
epoch #1 batch #10, cost = 0.4538078308105469
epoch #1 batch #11, cost = 0.45343509316444397
epoch #1 batch #12, cost = 0.45935407280921936
epoch #1 batch #13, cost = 0.45401492714881897
epoch #1 batch #14, cost = 0.45772814750671387
epoch #1 batch #15, cost = 0.45056354999542236
epoch #1 batch #16, cost = 0.4613364636898041
epoch #1 batch #17, cost = 0.4544985294342041
epoch #1 batch #18, cost = 0.44973641633987427
epoch #1 batch #19, cost = 0.4589128792285919
epoch #1 batch #20, cost = 0.4507863521575928
epoch #1 batch #21, cost = 0.4582027792930603
epoch #1 batch #22, cost = 0.4

In [None]:
# Print the model's list of weight matrices (one tensor per layer) for inspection.
print(nn.w)

[tensor([[-0.0099, -0.0046,  0.0039,  ..., -0.0081,  0.0051, -0.0121],
        [-0.0001,  0.0218,  0.0209,  ...,  0.0085,  0.0059,  0.0050],
        [ 0.0004, -0.0093,  0.0047,  ..., -0.0025,  0.0161,  0.0131],
        ...,
        [-0.0004, -0.0013, -0.0139,  ...,  0.0151, -0.0133, -0.0147],
        [ 0.0065, -0.0167, -0.0023,  ..., -0.0119,  0.0166,  0.0146],
        [ 0.0009, -0.0063, -0.0116,  ...,  0.0017, -0.0079,  0.0015]]), tensor([[-0.3985, -0.3308, -0.2818,  ...,  0.3704, -0.7039,  0.2157],
        [ 0.1627,  0.3448,  1.6756,  ..., -0.2835,  0.0379, -0.2941],
        [-0.8201, -0.0795, -0.3197,  ...,  0.6361,  0.5645, -1.2706],
        ...,
        [ 0.0299,  0.1911, -0.1341,  ..., -0.8684, -0.1341, -0.1778],
        [ 0.4470, -0.3012, -0.6294,  ..., -0.5516,  0.5521,  0.0426],
        [-1.3052,  0.2207, -0.0053,  ...,  0.2560, -1.2561,  0.6519]])]


#### Evaluating Performance on the Test Dataset

In [None]:
def batch_accuracy(label_batch, predictions):
    """Return the fraction of samples whose arg-max prediction equals its label.

    Args:
        label_batch: Tensor of integer class labels, one per sample.
        predictions: Iterable of per-sample score tensors; the index of the
            largest score is taken as the predicted class.

    Returns:
        Accuracy in ``[0, 1]`` as a float, or ``0.0`` for an empty batch
        (instead of raising ``ZeroDivisionError``).
    """
    labels = label_batch.tolist()
    if not labels:
        return 0.0
    total_correct = sum(
        label == torch.argmax(prediction).item()
        for label, prediction in zip(labels, predictions)
    )
    return total_correct / len(labels)
            


In [None]:
# Evaluate on the held-out test set.
# Each batch's accuracy is weighted by the number of samples it contains, so a
# smaller final batch does not count as much as a full one; the result is the
# overall fraction of correctly classified test images and does not depend on
# the order in which the loader yields batches.
test_dataloader = DataLoader(mnist_test, batch_size=batch_size, shuffle=True)
weighted_correct = 0.0
samples_seen = 0
for image_batch, label_batch in test_dataloader:
    cost, predictions, _, _ = nn.forward_pass(image_batch, label_batch)
    samples_in_batch = len(label_batch)
    weighted_correct += batch_accuracy(label_batch, predictions) * samples_in_batch
    samples_seen += samples_in_batch
# Guard against an empty loader rather than dividing by zero.
accuracy = weighted_correct / samples_seen if samples_seen else 0.0
print(f"Accuracy on test set: {accuracy}")

0.9636743630573252
