# The objective of this exercise is to implement a multi-layer perceptron with one hidden layer from scratch and test it on the MNIST dataset.

In [1]:
# imports
import matplotlib.pyplot as plt
import numpy as np

import torch
import torchvision
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets
import os
from dlc_practical_prologue import load_data


# Load Data

# Activation Functions

In [2]:
def sigma(x):
    """Activation function: element-wise hyperbolic tangent of ``x``."""
    activation = torch.tanh(x)
    return activation


def dsigma(x):
    """Derivative of the activation, evaluated element-wise at ``x``.

    Uses the identity d/dx tanh(x) = 1 - tanh(x)^2, with tanh provided by
    ``sigma``.
    """
    activation = sigma(x)
    return 1 - activation.pow(2)

# Loss Function

In [3]:
def loss(x, t):
    """Squared Euclidean distance between prediction ``x`` and target ``t``.

    Returns a 0-dim tensor equal to ``||x - t||^2``.
    """
    residual = x - t
    return torch.norm(residual) ** 2
      
def dloss(x, t):
    """Gradient of the squared-error loss with respect to ``x``: ``2 * (x - t)``."""
    residual = x - t
    return residual * 2

# Load Data

In [None]:
# Fetch the train/test inputs and targets, requesting one-hot encoded labels
# and normalized inputs from the course helper.
(
    train_input,
    train_target,
    test_input,
    test_target,
) = load_data(one_hot_labels=True, normalize=True)

# Forward and Backward Pass

In [None]:
# x=train_input[0].unsqueeze(1)
# s1 = torch.mm(x.T,w1) + b1
# print(w1)
# x1 = sigma(s1)
# s2 = torch.mm(x1, w2) + b2
# x2 = sigma(s2)
# x2

In [None]:
def forward_pass(w1, b1, w2, b2, x):
    """Run a single sample through the one-hidden-layer network.

    Computes, in order::

        s1 = w1 @ x  + b1      x1 = sigma(s1)
        s2 = w2 @ x1 + b2      x2 = sigma(s2)

    Args:
        w1: weight matrix of the hidden layer.
        b1: bias of the hidden layer (flattened before being added).
        w2: weight matrix of the output layer.
        b2: bias of the output layer (flattened before being added).
        x: input sample; must be 1-D because it is fed to ``torch.mv``.

    Returns:
        The tuple ``(x, s1, x1, s2, x2)``: the input, both pre-activations
        and both activations, i.e. everything the backward pass needs.
    """
    s1 = torch.mv(w1, x) + b1.flatten()
    x1 = sigma(s1)
    # x1 is already a 1-D vector (it comes from torch.mv), so it is passed
    # as-is: `.T` on a non-2-D tensor is a deprecated no-op.
    s2 = torch.mv(w2, x1) + b2.flatten()
    x2 = sigma(s2)
    return x, s1, x1, s2, x2
    

In [None]:
def backward_pass(w1, b1, w2, b2, t, x, s1, x1, s2, x2, dl_dw1, dl_db1, dl_dw2, dl_db2):
    """Back-propagate the loss of one sample and return its gradients.

    The ``dl_dw1``, ``dl_db1``, ``dl_dw2`` and ``dl_db2`` arguments are neither
    read nor modified here (``w1``, ``b1`` and ``b2`` are unused as well); the
    caller is responsible for accumulating the returned per-sample gradients.

    Args:
        w2: output-layer weights, used to propagate the gradient to the hidden layer.
        t: target vector for this sample.
        x, s1, x1, s2, x2: values produced by ``forward_pass`` for this sample.

    Returns:
        ``(dl_db1, dl_db2, dl_dw1, dl_dw2)`` -- note the order: both bias
        gradients first, then both weight gradients.
    """
    # Output layer: chain rule through the loss and the activation.
    grad_x2 = dloss(x2, t)
    grad_s2 = grad_x2 * dsigma(s2)
    grad_b2 = grad_s2
    # Outer product (column vector times row vector) gives the weight gradient.
    grad_w2 = torch.mm(grad_s2.view(-1, 1), x1.view(1, -1))

    # Hidden layer: pull the gradient back through w2, then the activation.
    grad_x1 = torch.mv(w2.t(), grad_s2)
    grad_s1 = grad_x1 * dsigma(s1)
    grad_b1 = grad_s1
    grad_w1 = torch.mm(grad_s1.view(-1, 1), x.view(1, -1))

    return grad_b1, grad_b2, grad_w1, grad_w2
    
    
    
    

# Training the Network

In [None]:
# Shrink the targets by a factor 0.9 -- presumably so they stay strictly
# inside the (-1, 1) output range of the tanh activation (TODO confirm).
# NOTE: re-running this cell scales the targets again.
train_target = 0.9 * train_target
test_target = 0.9 * test_target

In [None]:
# Hyper-parameters.
n_hidden = 50
epsilon = 1e-6  # standard deviation used to initialise every parameter
n_train_samples, n_classes = train_input.size(0), train_target.size(1)
eta = 0.1 / n_train_samples  # step size, scaled by the number of training samples

# Parameters, drawn from a zero-mean normal distribution with std `epsilon`.
# Weight matrices are (outputs, inputs) so that `torch.mv(w, x)` applies a layer.
w1 = torch.empty(n_hidden, train_input.size(1)).normal_(mean=0, std=epsilon)
w2 = torch.empty(n_classes, n_hidden).normal_(mean=0, std=epsilon)
b1 = torch.empty(n_hidden).normal_(mean=0, std=epsilon)
b2 = torch.empty(n_classes).normal_(mean=0, std=epsilon)

# Gradient accumulators, one per parameter and of matching shape.
# They are left uninitialised here; the training loop zeroes them every epoch.
dl_dw1 = torch.empty(w1.shape)
dl_db1 = torch.empty(b1.shape)
dl_dw2 = torch.empty(w2.shape)
dl_db2 = torch.empty(b2.shape)

In [None]:
# Full-batch gradient descent: each epoch sums the per-sample gradients over
# the whole training set, takes one step of size `eta`, then measures the
# test error.
for k in range(1000):

    # Back-prop

    acc_loss = 0.0
    nb_train_errors = 0

    # Reset the accumulators so gradients do not leak from the previous epoch.
    dl_dw1.zero_()
    dl_db1.zero_()
    dl_dw2.zero_()
    dl_db2.zero_()

    for n in range(n_train_samples):
        x0, s1, x1, s2, x2 = forward_pass(w1, b1, w2, b2, train_input[n])

        # A prediction is wrong when the target entry at the predicted class
        # is the "off" value (below 0.5).
        pred = x2.argmax()
        if train_target[n, pred] < 0.5:
            nb_train_errors += 1
        acc_loss += loss(x2, train_target[n]).item()

        # backward_pass returns (bias1, bias2, weight1, weight2) gradients.
        g_b1, g_b2, g_w1, g_w2 = backward_pass(w1, b1, w2, b2,
                                               train_target[n],
                                               x0, s1, x1, s2, x2,
                                               dl_dw1, dl_db1, dl_dw2, dl_db2)

        # Accumulate INSIDE the sample loop: doing it after the loop would
        # keep only the last sample's gradient and discard all the others.
        dl_db1 += g_b1
        dl_db2 += g_b2
        dl_dw1 += g_w1
        dl_dw2 += g_w2

    # Gradient step

    w1 = w1 - eta * dl_dw1
    b1 = b1 - eta * dl_db1
    w2 = w2 - eta * dl_dw2
    b2 = b2 - eta * dl_db2

    # Test error

    nb_test_errors = 0

    for n in range(test_input.size(0)):
        _, _, _, _, x2 = forward_pass(w1, b1, w2, b2, test_input[n])

        pred = x2.argmax()
        if test_target[n, pred] < 0.5:
            nb_test_errors += 1

    # Report progress every 20 epochs.
    if k % 20 == 0:
        print('{:d} acc_train_loss {:.02f} acc_train_error {:.02f}% test_error {:.02f}%'
              .format(k,
                      acc_loss,
                      (100 * nb_train_errors) / train_input.size(0),
                      (100 * nb_test_errors) / test_input.size(0)))

In [None]:
# Sanity check: run the forward pass on the first training sample.
# The trailing semicolon is presumably there to suppress the notebook's echo
# of the returned tuple.
forward_pass(w1, b1, w2, b2, train_input[0]);

In [None]:
# Keep the intermediate values of the first training sample for manual inspection.
first_sample = train_input[0]
x, s1, x1, s2, x2 = forward_pass(w1, b1, w2, b2, first_sample)

In [None]:
# Target vector of the first training sample.
t = train_target[0]

In [None]:
#backward_pass(w1,b1,w2,b2,t,x,s1,x1,s2,x2,dl_dw1,dl_db1,dl_dw2,dl_db2)

In [None]:
# Loss gradient w.r.t. the network output for the first training sample.
# x2 was computed from train_input[0], so it must be compared against that
# sample's own target; passing the whole `train_target` matrix would silently
# broadcast x2 against every row.
dl_dx2 = dloss(x2, train_target[0])