<a href="https://colab.research.google.com/github/sheldor07/understainding-transformers/blob/main/multilayer-perceptron-from-scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np

# Load the Fashion-MNIST dataset through the Keras dataset helper.
# load_data() fetches the label/image archives on first use (see the download
# log printed below this cell) and returns a training split and a test split,
# each unpacked here into inputs (x_*) and labels (y_*).
# NOTE(review): nothing visible in this notebook reshapes these arrays into the
# (x, y) pairs that Network's training/evaluation methods iterate over --
# confirm the preprocessing step before feeding them to the network.
fashion_mnist = tf.keras.datasets.fashion_mnist
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [None]:
# Network class is used to represent a neural network
class Network(object):
    """Fully connected feed-forward network with sigmoid activations.

    The network is trained with mini-batch stochastic gradient descent, using
    backpropagation to obtain the gradient of the cost (whose derivative with
    respect to the output is ``output - y``, see ``cost_derivative``).

    Inputs and targets are expected to be column vectors of shape ``(n, 1)``
    so that they line up with the ``(y, 1)`` bias vectors created in
    ``__init__``.
    """

    def __init__(self, sizes):
        """Create a network with randomly initialised parameters.

        Args:
            sizes: number of neurons per layer, input layer first, e.g.
                ``[2, 3, 1]``. The input layer has no weights or biases.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # One (y, 1) bias column per non-input layer.
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        # One (y, x) weight matrix per pair of adjacent layers; row j holds
        # the weights feeding neuron j of the later layer.
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the output-layer activations for the input ``a``.

        Each layer's activation is ``sigmoid(w . a + b)`` applied to the
        activation of the previous layer.
        """
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def stochastic_gradient_descent(self, training_data, epochs,
                                    mini_batch_size, eta, test_data=None):
        """Train the network with mini-batch stochastic gradient descent.

        Args:
            training_data: mutable sequence of ``(x, y)`` pairs (input and
                desired output). It is shuffled IN PLACE at every epoch.
            epochs: number of full passes over ``training_data``.
            mini_batch_size: number of samples per gradient step.
            eta: learning rate.
            test_data: optional sequence of ``(x, label)`` pairs. When given
                and non-empty, the network is evaluated against it after each
                epoch and the score is printed.
        """
        # Explicit None/length check instead of bare truthiness so that
        # array-like test sets do not raise on ambiguous truth values.
        has_test_data = test_data is not None and len(test_data) > 0
        n_test = len(test_data) if has_test_data else 0

        n = len(training_data)

        for j in range(epochs):
            # An epoch is one pass over the whole dataset: reshuffle, then
            # slice the shuffled data into consecutive mini-batches.
            np.random.shuffle(training_data)
            mini_batches = [
                training_data[k:k + mini_batch_size]
                for k in range(0, n, mini_batch_size)
            ]
            # One gradient-descent step per mini-batch.
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if has_test_data:
                print(f"Epoch {j}: {self.evaluate(test_data)} / {n_test}")
            else:
                print(f"Epoch {j} complete")

    def update_mini_batch(self, mini_batch, eta):
        """Apply one gradient-descent step computed from ``mini_batch``.

        Args:
            mini_batch: sequence of ``(x, y)`` pairs.
            eta: learning rate.

        An empty mini-batch is a no-op (it would otherwise divide by zero).
        """
        if len(mini_batch) == 0:
            return

        # Running sums of the per-sample gradients, shaped like the
        # parameters they belong to.
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        for x, y in mini_batch:
            # backprop returns the gradient of the cost for this one sample.
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]

        # Move each parameter against its averaged gradient, scaled by eta.
        step = eta / len(mini_batch)
        self.weights = [w - step * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - step * nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return ``(nabla_b, nabla_w)``, the cost gradient for one sample.

        ``nabla_b`` and ``nabla_w`` are layer-by-layer lists of numpy arrays
        with the same shapes as ``self.biases`` and ``self.weights``.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # Forward pass: remember every weighted input z and every activation,
        # both are needed when propagating the error backwards.
        activation = x
        activations = [x]
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # Backward pass: error at the output layer first.
        delta = self.cost_derivative(activations[-1], y) * \
            sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        # Negative indices walk the layers from the back: l = 2 is the
        # second-to-last layer, l = 3 the one before it, and so on.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())

        return (nabla_b, nabla_w)

    def evaluate(self, test_data):
        """Return the number of test inputs classified correctly.

        The network's prediction is the index of the output neuron with the
        highest activation; it is compared against the label ``y`` of each
        ``(x, y)`` pair in ``test_data``.
        """
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_derivative(self, output_activations, y):
        r"""Return the vector of partial derivatives \partial C_x /
        \partial a for the output activations."""
        return (output_activations - y)
# sigmoid function for squishification
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), applied element-wise to arrays.

    Squashes any real input into the open interval (0, 1).
    """
    decay = np.exp(-x)
    denominator = 1.0 + decay
    return 1.0 / denominator

def sigmoid_prime(z):
    """Return the derivative of the sigmoid function evaluated at z.

    Uses the identity sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)).
    """
    activation = sigmoid(z)
    complement = 1 - activation
    return activation * complement

In [None]:
# Instantiate a small demo network: 2 input neurons, 3 hidden neurons, 1 output.
net = Network(
    [2, 3, 1],
)