In [8]:
import math 
import random
from sklearn import datasets
from typing import Union

In [9]:
class Value:
    """Scalar node of a dynamically built computation graph with reverse-mode autodiff.

    Every arithmetic operation returns a new ``Value`` that remembers its
    operands (``_prev``) and a closure (``_backward``) that pushes the result's
    gradient back onto those operands. Calling ``backward()`` on a result node
    fills ``grad`` on every node that contributed to it.

    Attributes:
        data: the wrapped number.
        grad: derivative of the node ``backward()`` was called on with respect
            to this node; starts at 0 and is accumulated with ``+=``.
    """

    def __init__(self, data: Union[int, float, 'Value'], _children=())->None:
        self.data = data
        self.grad = 0
        self._prev = set(_children)  # operand nodes this value was computed from
        self._backward = lambda : None  # leaves have nothing to propagate

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"

    def __add__(self, other: Union[int, float, 'Value'])->'Value':
        """Return ``self + other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other))
        def _backward():
            self.grad += 1 * out.grad #y=a+b, dy/da = 1
            other.grad += 1 * out.grad  #dy/db = 1
        out._backward = _backward
        return out

    def __mul__(self, other: Union[int, float, 'Value'])->'Value':
        """Return ``self * other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other))
        def _backward():
            self.grad += other.data * out.grad #y=a*b, dy/da = b
            other.grad += self.data * out.grad #dy/db = a
        out._backward = _backward
        return out

    def __neg__(self):
        return self * -1 #-a = a*-1

    def __pow__(self, other: Union[int, float])->'Value':
        """Return ``self ** other`` for a plain numeric exponent.

        Only the base is tracked in the graph; no gradient is computed with
        respect to the exponent.
        """
        out = Value(self.data ** other, (self,))
        def _backward():
            self.grad += other * self.data**(other-1) * out.grad #y = x^n, dy/dx=n*x**(n-1)
        out._backward = _backward
        return out

    def __truediv__(self, other: Union[int, float, 'Value'])->'Value':
        return self * other**-1 #a/b = a *(b**-1)

    def __radd__(self, other):
        return self + other #b+a = a+b

    def __rsub__(self, other):
        return -self + other #b-a = -a+b

    def __rmul__(self, other):
        return self * other #b*a = a*b

    def __rtruediv__(self, other):
        return self**-1 * other #b/a = (a**-1) * b

    def __sub__(self, other: Union[int, float, 'Value'])->'Value':
        return self + (-other) #a-b = a+(-b)

    def relu(self)->'Value':
        """Return ``max(self, 0)`` as a new node."""
        out = self.data if self.data>0 else 0
        out = Value(out, (self,))
        def _backward():
            # x>0: y=x, dy/dx=1 ; otherwise: y=0, dy/dx=0
            self.grad += (1 if self.data>0 else 0) * out.grad
        out._backward = _backward
        return out

    def exp(self)->'Value':
        """Return ``e ** self`` as a new node (computed with ``math.exp``)."""
        out = math.exp(self.data)
        out = Value(out, (self,))
        def _backward():
            self.grad += out.data * out.grad #y=e**x, #dy/dx=e**x
        out._backward = _backward
        return out

    def log(self)->'Value':
        """Return the natural logarithm of ``self`` as a new node (``math.log``)."""
        out = math.log(self.data)
        out = Value(out, (self,))
        def _backward():
            self.grad += (1/self.data)* out.grad #y=ln(x), dy/dx=1/x
        out._backward = _backward
        return out

    def backward(self):
        """Back-propagate from this node, accumulating ``grad`` on all ancestors.

        Sets ``self.grad`` to 1 and then runs every node's ``_backward`` closure
        in reverse topological order. Gradients are accumulated, not reset, so
        callers must zero them between independent backward passes.
        """
        # Post-order depth-first traversal with an explicit stack instead of
        # recursion, so graphs deeper than the interpreter's recursion limit
        # (e.g. a long chain of additions) do not raise RecursionError.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break  # descend into this child first; resume the iterator later
            else:
                # every child has been emitted, so the node itself can be
                stack.pop()
                topo.append(node)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            v._backward()

    def __gt__(self, other)->bool:
        """Compare by ``data``; ``other`` may be a ``Value`` or a plain number."""
        other_data = other.data if isinstance(other, Value) else other
        return self.data > other_data #checks if a>b

    def __lt__(self, other)->bool:
        """Compare by ``data``; ``other`` may be a ``Value`` or a plain number."""
        other_data = other.data if isinstance(other, Value) else other
        return self.data < other_data #checks if a<b

In [10]:
class Neuron:
    """A single unit computing ``w . x + b`` with an optional ReLU on top.

    Weights are drawn from ``random.uniform(-1, 1)`` and the bias starts at 0.
    """

    def __init__(self, n_in: int, nonlin=True) -> None:
        weights = []
        for _ in range(n_in):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights  # one weight per input
        self.b = Value(0.0)  # bias
        self.nonlin = nonlin  # whether __call__ applies relu()

    def __call__(self, x:list)->'Value':
        """Return the activation for input ``x`` (paired with weights via ``zip``)."""
        # start from the bias and add each product in order: b + w1*x1 + ... + wn*xn
        act = self.b
        for wi, xi in zip(self.w, x):
            act = act + wi * xi
        if self.nonlin:
            return act.relu()
        return act

    def parameters(self) -> list['Value']:
        """Return the weights followed by the bias as a new list."""
        return self.w + [self.b]

    def __repr__(self) -> str:
        kind = 'ReLU' if self.nonlin else 'Linear'
        return f"{kind}Neuron({len(self.w)})"


class Layer:
    """A group of ``n_out`` neurons that all receive the same ``n_in`` inputs."""

    def __init__(self, n_in:int, n_out:int, **kwargs) -> None:
        # extra keyword arguments (e.g. nonlin) are forwarded to every Neuron
        neurons = []
        for _ in range(n_out):
            neurons.append(Neuron(n_in, **kwargs))
        self.neurons = neurons

    def __call__(self, x:list) -> list['Value']:
        """Feed ``x`` to every neuron and return their outputs in order."""
        return [neuron(x) for neuron in self.neurons]

    def parameters(self) -> list['Value']:
        """Return the parameters of all neurons, concatenated neuron by neuron."""
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params

    def __repr__(self) -> str:
        return "Layer({})".format(self.neurons)


class MLP:
    """Multi-layer perceptron: a stack of ``Layer`` objects applied in sequence.

    ``MLP(4, [4, 3, 3])`` builds layers of shape 4->4, 4->3 and 3->3. Every
    layer uses ReLU except the last one, which is linear.
    """

    def __init__(self, n_in: int, n_outs: list) -> None:
        widths = [n_in] + n_outs
        last = len(n_outs) - 1
        layers = []
        for i in range(len(n_outs)):
            # only the final layer is built with nonlin=False
            layers.append(Layer(widths[i], widths[i + 1], nonlin=i != last))
        self.layers = layers

    def __call__(self, x: list)->list['Value']:
        """Run ``x`` through every layer and return the last layer's output."""
        out = x
        for layer in self.layers:
            out = layer(out)  # forwards the output of a layer to the next layer
        return out

    def parameters(self)->list['Value']:
        """Return the parameters of all layers, concatenated layer by layer."""
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params

    def zero_grad(self):
        """Reset ``grad`` to 0 on every parameter."""
        for param in self.parameters():
            param.grad = 0

    def __repr__(self) -> str:
        return "MLP({})".format(self.layers)

In [11]:
# Load the Iris dataset through sklearn.datasets
iris = datasets.load_iris()

X, y = iris.data, iris.target  # features and labels as returned by the loader

# .tolist() turns both into plain Python lists for the list-based model code
X, ys = X.tolist(), y.tolist()

In [12]:
def Softmax(logits:list)->list['Value']:
    """Return softmax probabilities ``e^yi / (e^y1 + ... + e^yn)`` for ``logits``.

    The largest logit (as a plain number, read from ``.data``) is subtracted
    from every logit before exponentiation. Softmax is invariant to such a
    constant shift, and it keeps every exponent <= 0 so ``exp`` cannot overflow
    for large logits. Each exponential is computed once and reused for both the
    numerator and the denominator.

    Args:
        logits: list of nodes exposing ``.data``, ``-`` with a number, ``.exp()``
            and ``/``.

    Returns:
        A list with one probability per logit, in the same order; an empty list
        for empty input.
    """
    if not logits:
        return []
    shift = max(logit.data for logit in logits)  # constant, not part of the graph
    exps = [(logit - shift).exp() for logit in logits]
    denominator = sum(exps)
    return [e / denominator for e in exps]

def NLLLoss(sm_out: list, y:int)->'Value':
    """Negative log-likelihood: ``-ln(sm_out[y])`` for the target index ``y``."""
    target_prob = sm_out[y]
    log_prob = target_prob.log()
    return -log_prob

def loss_calc(y:int, logits:list)->'Value':
    """Return the loss for one sample: ``NLLLoss`` applied to ``Softmax(logits)``.

    Note the argument order: the target index ``y`` comes first, the logits second.
    """
    return NLLLoss(Softmax(logits), y)

In [13]:
# Neuron weights come from random.uniform, so seed the RNG before building the
# model; otherwise every fresh kernel run starts from different weights.
SEED = 42
random.seed(SEED)

model = MLP(4, [4,3,3]) #initialize the model: 4 inputs, layers of 4, 3 and 3 neurons
epochs = 500 #no. of iterations
lr=0.02 #learning rate

In [16]:
# Full-batch gradient descent: every epoch uses all samples in X / ys.
for epoch in range(epochs):
    logits = list(map(model, X)) #forward pass
    loss = sum(map(loss_calc, ys, logits))/len(ys) #mean NLLLoss over the dataset
    # Clear gradients right before backward() rather than after the update:
    # backward() accumulates with +=, so anything left over from an interrupted
    # run or an earlier backward() call would otherwise leak into this step.
    model.zero_grad()
    loss.backward() #backward pass, gradient calculation
    for p in model.parameters():
        p.data -= p.grad * lr #update weights and biases
    print(epoch, loss)

0 Value(data=0.2716759461596869, grad=1)
1 Value(data=0.27075585526365864, grad=1)
2 Value(data=0.2698424353437808, grad=1)
3 Value(data=0.26893563880332333, grad=1)
4 Value(data=0.26803541871150527, grad=1)
5 Value(data=0.2671417286701526, grad=1)
6 Value(data=0.2662545227114834, grad=1)
7 Value(data=0.265373755219985, grad=1)
8 Value(data=0.2644993808727323, grad=1)
9 Value(data=0.2636313545938764, grad=1)
10 Value(data=0.26276963152004956, grad=1)
11 Value(data=0.26191416697418274, grad=1)
12 Value(data=0.2610649164458435, grad=1)
13 Value(data=0.2602218355766194, grad=1)
14 Value(data=0.2593850604802415, grad=1)
15 Value(data=0.25855489735799614, grad=1)
16 Value(data=0.25773082267145764, grad=1)
17 Value(data=0.2569127685823161, grad=1)
18 Value(data=0.2561006859176867, grad=1)
19 Value(data=0.25529452682413895, grad=1)
20 Value(data=0.2544942443499383, grad=1)
21 Value(data=0.25369979228169354, grad=1)
22 Value(data=0.25291112501866336, grad=1)
23 Value(data=0.25212819747553794, 

In [17]:
# Accuracy on the same data the loop iterated over. `logits` is whatever the
# last loop iteration computed in its forward pass, i.e. before that
# iteration's parameter update.
out = [row.index(max(row)) for row in logits]  # position of the largest logit
correct = sum(1 for a, b in zip(ys, out) if a == b)
correct/len(ys)

0.9466666666666667