In [1]:
# Reference: https://machinelearningmastery.com/implement-backpropagation-algorithm-scratch-python/

# Importing the libraries

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Defining the sigmoid and the softmax functions

def sigmoid(z):
    """Apply the logistic function 1 / (1 + e^(-z)) element-wise.

    Args:
        z: tensor of pre-activations.

    Returns:
        Tensor with the same shape as ``z``.
    """
    denominator = torch.exp(-z) + 1
    return 1 / denominator

def softmax(z):
    """Row-wise softmax of a 2-D tensor of logits.

    The row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged (softmax is shift-invariant) but keeps
    ``torch.exp`` from overflowing to ``inf`` for large logits, which would
    otherwise produce ``inf / inf = nan``.

    Args:
        z: tensor of logits; normalisation is performed along ``dim=1``.

    Returns:
        Tensor with the same shape as ``z`` whose rows sum to 1.
    """
    shifted = z - z.max(dim=1, keepdim=True).values
    exp_z = torch.exp(shifted)
    return exp_z / exp_z.sum(dim=1, keepdim=True)

In [3]:
# MNIST input pipeline: images are converted to tensors and then normalised
# with mean 0.5 and std 0.5 on their single channel.

batch_size = 64

transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# Both splits are stored under (and downloaded to, if missing) the 'data' directory.
train_dataset = datasets.MNIST(root='data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='data', train=False, download=True, transform=transform)

# Training batches are reshuffled on every pass; test batches keep dataset order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Hyperparameters

d = 784               # input size: each image is flattened before the first layer
d1 = 300              # width of the first hidden layer
d2 = 200              # width of the second hidden layer
k = 10                # number of output classes (size of the one-hot targets)
learning_rate = 0.01  # SGD step size
num_epochs = 10       # full passes over the training set

# Model parameters (weights and biases)

# Fix the seed so the initial weights are identical on every
# "Restart Kernel -> Run All".
torch.manual_seed(0)


def _glorot_normal(fan_in, fan_out):
    """Return a (fan_in, fan_out) leaf weight tensor with Glorot-normal scaling.

    Unscaled N(0, 1) weights make the pre-activations grow like sqrt(fan_in),
    which pushes the sigmoids into saturation where a * (1 - a) is ~0 and the
    hand-written backward pass yields vanishing gradients. Scaling the standard
    deviation to sqrt(2 / (fan_in + fan_out)) keeps the activations in the
    responsive range of the sigmoid.
    """
    std = (2.0 / (fan_in + fan_out)) ** 0.5
    # The scaled tensor has no autograd history, so requires_grad_() makes it
    # a leaf tensor exactly like torch.randn(..., requires_grad=True) would.
    return (torch.randn(fan_in, fan_out) * std).requires_grad_()


W1 = _glorot_normal(d, d1)
b1 = torch.zeros(1, d1, requires_grad=True)
W2 = _glorot_normal(d1, d2)
b2 = torch.zeros(1, d2, requires_grad=True)
W3 = _glorot_normal(d2, k)
b3 = torch.zeros(1, k, requires_grad=True)

# Training loop: mini-batch SGD with hand-written backpropagation.
#
# All gradients are derived manually, so each step runs under torch.no_grad():
# no autograd graph is recorded for the forward pass, and the in-place updates
# of the leaf parameters are permitted.
#
# NOTE: the gradients below are those of the *mean* batch loss, matching the
# `loss` value that is computed and reported. The previous version applied the
# gradient of the summed batch loss, i.e. an effective step of
# learning_rate * batch_size; increase learning_rate if convergence is too slow.

for epoch in range(num_epochs):
    total_loss = 0.0   # sum of per-sample losses seen during this epoch
    num_samples = 0    # number of samples seen during this epoch

    for data, target in train_loader:
        batch_n = data.size(0)
        data = data.view(batch_n, -1)  # flatten every image into one row

        with torch.no_grad():
            # ---- Forward pass ----
            z1 = data.mm(W1) + b1
            a1 = sigmoid(z1)
            z2 = a1.mm(W2) + b2
            a2 = sigmoid(z2)
            z3 = a2.mm(W3) + b3
            output = softmax(z3)

            target_one_hot = torch.zeros(batch_n, k)
            target_one_hot.scatter_(1, target.view(-1, 1), 1)

            # Mean cross-entropy. The log-probabilities are computed from the
            # logits with logsumexp instead of torch.log(output): if a
            # probability underflows to 0, log gives -inf and 0 * -inf would
            # turn the loss into NaN.
            log_probs = z3 - torch.logsumexp(z3, dim=1, keepdim=True)
            loss = -torch.sum(target_one_hot * log_probs) / batch_n

            total_loss += loss.item() * batch_n
            num_samples += batch_n

            # ---- Backward pass (manual chain rule) ----
            # d(mean cross-entropy)/d(z3) for a softmax output layer.
            grad_z3 = (output - target_one_hot) / batch_n
            grad_W3 = a2.t().mm(grad_z3)
            grad_b3 = grad_z3.sum(0, keepdim=True)

            grad_a2 = grad_z3.mm(W3.t())
            grad_z2 = grad_a2 * a2 * (1 - a2)  # sigmoid'(z) = a * (1 - a)
            grad_W2 = a1.t().mm(grad_z2)
            grad_b2 = grad_z2.sum(0, keepdim=True)

            grad_a1 = grad_z2.mm(W2.t())
            grad_z1 = grad_a1 * a1 * (1 - a1)
            grad_W1 = data.t().mm(grad_z1)
            grad_b1 = grad_z1.sum(0, keepdim=True)

            # ---- SGD update (in place on the leaf tensors) ----
            W1 -= learning_rate * grad_W1
            b1 -= learning_rate * grad_b1
            W2 -= learning_rate * grad_W2
            b2 -= learning_rate * grad_b2
            W3 -= learning_rate * grad_W3
            b3 -= learning_rate * grad_b3

    # Report the average per-sample loss so training progress is visible.
    print(f"Epoch {epoch + 1}/{num_epochs} - average loss: {total_loss / max(num_samples, 1):.4f}")

KeyboardInterrupt: ignored