In [1]:
import numpy as np
import math

"""

This is the engine for defining the micro architecture of the backprop algo [As written by ../karpathy], along with some of my own
custom additions.

"""

class Value:
    def __init__(self, data, child = () , op= '' ):
        self.data = data
        self.prev = set(child)
        self.op = op
        self._backward = lambda: None
        self.gradient = 0
    def __repr__(self):
        return (f"Value:(Data: {self.data})")


    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        new_val = Value(self.data + other.data, (other, self), '+')
        def _backward():
            self.gradient += new_val.gradient
            other.gradient += new_val.gradient
        new_val._backward = _backward
        return new_val



    def __neg__(self):
      return self * -1


    def __sub__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        return self + (-other)

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        new_val = Value(self.data * other.data, (other, self), '*')
        def _backward():
            self.gradient += other.data*new_val.gradient
            other.gradient += self.data*new_val.gradient
        new_val._backward = _backward
        return new_val

    def tanh(self):
        val = (math.exp(2*self.data)-1)/(math.exp(2*self.data)+1)
        new_val = Value(val, (self, ), 'tanh')
        def _backward():
            self.gradient += (1-val**2)*new_val.gradient
        new_val._backward = _backward
        return new_val

    def __lt__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        return self.data < other.data

    def __le__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        return self.data <= other.data

    def exp(self):
        val = np.exp(self.data)
        new_val = Value(val, (self,), 'exp')
        def _backward():
            self.gradient +=new_val.data* new_val.gradient
        new_val._backward = _backward
        return new_val

    def __truediv__(self, other):
        return self * other**-1

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float"
        out = Value(self.data**other, (self,), f'**{other}')
        def _backward():
            self.gradient += other*(self.data**(other-1))*out.gradient
        out._backward = _backward
        return out


    def backward(self, alpha):
        visited = set()
        nodes = []
        def build_top_graph(v):
            if(v not in visited):
                visited.add(v)
                for i in v.prev:
                    build_top_graph(i)
                nodes.append(v)
        build_top_graph(self)
        self.gradient = 1
        for i in reversed(nodes):
            i._backward()


    def __rmul__(self, other):
        return self*other

    def log(self):
        val = math.log(self.data)
        new_val = Value(val, (self,), 'log')
        def _backward():
            self.gradient += (1 / self.data) * new_val.gradient
        new_val._backward = _backward
        return new_val

    def relu(self):
      new_val = Value(self.data if self.data> 0 else 0, (self, ), 'relu')
      def _backward():
         if(new_val==0):
            self.gradient = 0
         else:
            self.gradient = 1*new_val.gradient

      new_val._backward = _backward
      return new_val




In [21]:
class denseLayer:
    #num_inputs is basically the length of each input batch.
    def __init__(self, num_inputs, neurons, activation):
        self.weights =  np.vectorize(Value)(np.random.uniform(-1, 1, (num_inputs,neurons)))
        self.biases = np.vectorize(Value)(np.zeros((1,neurons)))
        self.activation = activation
    def forward(self, inputs):
        if(len(inputs.shape)==1):
            self.inputs = np.array([Value(i) for i in inputs])
        else:
            self.inputs = np.reshape([Value(j) for i in np.array(inputs) for j in i], (inputs.shape[0],inputs.shape[1]))
        output = (np.dot(inputs, self.weights)+self.biases)
        match self.activation:
            case 'relu':
                self.output = self.relu_forward(output)
            case 'softmax':
                self.output = self.softmax_forward(output)
            case _:
                raise "Incorrect Activation Function"
        return self.output


    def relu_forward(self, inputx):
        if(len(inputx)==1):
          return np.array([i.relu() for i in inputx[0]])
        else:
          return np.array([[i.relu() for i in j] for j in inputx], dtype = 'object')

    def parameters_ret(self):
        params = np.insert(self.weights, len(self.weights), self.biases, axis = 0)
        return np.reshape(params, (1,(self.weights.shape[0]*self.weights.shape[1]+self.biases.shape[1])))


    def softmax_forward(self, inputs):
        #Subbing the max input so as to avoid an overflow of values for exponentiation.
        exp_vals = np.exp(inputs - np.max(inputs, axis = 1, keepdims= True))
        summ = np.sum(exp_vals, axis=1, keepdims = True)
        self.output = exp_vals / summ
        return self.output


class final_loss:
    def calculate(self, loss_array):
        sumy = Value(0)
        for i in loss_array:
          sumy+=i
        return sumy/Value(len(loss_array))


class categorical_cross_entropy(final_loss):
    def loss_function(self, ground_truth, predicted):
        predicted = np.clip(predicted, Value(1e-7), Value(1-1e-7))
        hello_loss = np.array([])
        confidences = np.array([])
        if (len(ground_truth.shape)==1):
            hello_loss  = predicted[range(len(predicted)), ground_truth]
            confidences = (-1)*np.log(hello_loss)
        else:
            hello_loss = [sum(-np.log(predicted[j])*ground_truth[j]) for j in range(len(predicted))]
            confidences = hello_loss
        self.loss = confidences
        return self.loss

In [23]:
#This is the multi layer perceptron class, basically piecing together all the layers:

class Multi_Layer_Perceptron:
  def __init__(self, num_inputs, num_layers, num_neurons, neurons_in_last_layer, activation_, activation_final, input_data):

       self.num_inputs = num_inputs
       self.activation_ = activation_
       self.activation_final = activation_final
       self.input_data = input_data
       self.layers = []
       self.num_neurons = num_neurons
       self.last_neurons = neurons_in_last_layer


       for i in range(num_layers):
            if(i==0):
              self.layers.append(denseLayer(num_inputs, self.num_neurons, activation_))
            elif(i==num_layers-1):
              self.layers.append(denseLayer(self.num_neurons, self.last_neurons, activation_final))
            else:
              self.layers.append(denseLayer(self.num_neurons, self.num_neurons, activation_))


  def forward(self):
    prev_output = self.layers[0].forward(self.input_data)

    for i in range(len(self.layers)):
      if(i!=0):
        prev_output = self.layers[i].forward(prev_output)
    return self.layers[-1].output

  def parameters_ret(self):
    return [p for layer in self.layers for p in layer.parameters_ret()]




The first partial derivative with respect to $ w^{L}$ which is the weight of the last layer:
$$\frac{\mathrm{d}C_0}{\mathrm{d}w_{jk}^{L}} = 2 \sum_{j=0}^{n_L-1} (a_j^{L}-y_j). \sigma^{'}(z_j^{L}). a_k^{L-1}.$$

The same thing is repeated for each param in all the layers that are present. The mathematical complexity of this doesnt appear in the code because we perform derivatives on the fundamental bit level mathematical operations [micrograd] which ends up saving a lot of time.

In [41]:
import pandas as pd


#This is some arbitrary data that we can essentially train our neural net on.


X = pd.read_csv(r"/content/backprop_data.csv")
ground_truth = X['is_canceled'][:500]


input_state = X.to_numpy()
input_data = input_state[:,1:]
input_data = input_data[:500]



In [99]:

input_data = np.array([[1,2,3,4,5], [2,3,4,5,6], [7,8,9,10,11]])
ground_truth = np.array([1,0,2])


In [100]:
#number of features per input example
num_inputs = len(input_data[0])

#each layer will have the same number of neurons [except the last one.]
num_layers = 2
num_neurons = 3

#Because of the classification element
neurons_in_last_layer = 3

mlp = Multi_Layer_Perceptron(num_inputs, num_layers, num_neurons, neurons_in_last_layer,'relu', 'softmax', input_data)

mlp_output = mlp.forward()

In [101]:
loss = categorical_cross_entropy()
final_cce_loss = loss.loss_function(ground_truth, mlp_output)

In [102]:
fl = final_loss()
mean_loss = fl.calculate(final_cce_loss)
mean_loss

Value:(Data: 2.5316969343822198)

In [103]:
class gradient_desc:
    def __init__(self, learning_rate, ground_truth, mlp, orig_inputs, loss_func):
        self.learning_rate = learning_rate
        self.ground_truth = ground_truth
        self.orig_inputs = orig_inputs
        self.loss_obj = loss_func
        self.mlp_obj = mlp


    def forward_pass(self):
        output1 = self.mlp_obj.forward()
        return output1


    def backward_pass(self, prev_loss, iter):
       for i in self.mlp_obj.parameters_ret():
            for j in i:
              j.gradient = 0

       prev_loss.backward(self.learning_rate)
       for i in self.mlp_obj.parameters_ret():
            for j in i:
              j.data -= self.learning_rate*j.gradient

       updated_results = self.forward_pass()
       changed_loss = self.loss_obj.loss_function(self.ground_truth, updated_results)
       fl = final_loss()
       mean_loss = fl.calculate(changed_loss)
       if(iter%10==0):
          print("Loss in epoch ", iter, " is: ", mean_loss)
          print()
       return mean_loss



    def forward(self, epochs, loss_val):
      for i in range(1, epochs):
          loss_val = self.backward_pass(loss_val, i)




In [104]:
learning_rate  = 0.01

grad_obj = gradient_desc(learning_rate, ground_truth, mlp, input_data, loss)
grad_obj.forward(1000, mean_loss)

#After adding the zero grad, the loss minimizes fairly well.


Loss in epoch  10  is:  Value:(Data: 1.152699385580366)

Loss in epoch  20  is:  Value:(Data: 1.004972784149535)

Loss in epoch  30  is:  Value:(Data: 0.9803529088708213)

Loss in epoch  40  is:  Value:(Data: 0.961662565935745)

Loss in epoch  50  is:  Value:(Data: 0.9437794046686911)

Loss in epoch  60  is:  Value:(Data: 0.926307025318503)

Loss in epoch  70  is:  Value:(Data: 0.9090494742323731)

Loss in epoch  80  is:  Value:(Data: 0.8918640564439482)

Loss in epoch  90  is:  Value:(Data: 0.8746505309183488)

Loss in epoch  100  is:  Value:(Data: 0.8573465494343965)

Loss in epoch  110  is:  Value:(Data: 0.8497390034981476)

Loss in epoch  120  is:  Value:(Data: 0.842949017843425)

Loss in epoch  130  is:  Value:(Data: 0.8445532204539032)

Loss in epoch  140  is:  Value:(Data: 0.8505174398044097)

Loss in epoch  150  is:  Value:(Data: 0.8573643652775611)

Loss in epoch  160  is:  Value:(Data: 0.8650376002544928)

Loss in epoch  170  is:  Value:(Data: 0.8735000754066473)

Loss in epo