In [589]:
class Value:
    """Scalar node of a computation graph with reverse-mode automatic differentiation.

    Every arithmetic operation on a Value returns a new Value that remembers
    its inputs and a closure able to push its gradient back onto them.

    Attributes:
        data:  the wrapped number.
        grad:  accumulated derivative of the node `backward()` was called on
               with respect to this node (starts at 0.0).
        label: optional human-readable name for the node.
        exp:   string form of the expression that produced this node.
        _prev: set of input nodes this node was computed from.
        _op:   symbol of the operation that produced this node ('' for leaves).
        _backward: closure that adds this node's local gradients to its inputs.
    """

    def __init__(self, data, _children=(), _op='', _exp='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None  # Leaves have nothing to propagate
        self._prev = set(_children)
        self._op = _op
        # NOTE: this instance attribute shadows the exp() method defined below, so
        # `v.exp` is the expression string and `v.exp()` is not callable on an
        # instance. The attribute is kept for backward compatibility; the
        # exponential is still reachable as `Value.exp(v)`.
        self.exp = _exp
        self.label = label

    def __repr__(self):
        """Return a short debugging representation showing only the data."""
        return f"Value(data={self.data})"

    def __add__(self, other):
        """Return self + other; plain numbers are wrapped in a Value first."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+', f"{self} + {other}")

        def _backward():
            # += (not =) so that a node used several times (e.g. b = a + a)
            # accumulates the contribution of every use.
            self.grad += out.grad
            other.grad += out.grad

        # Store the function itself, not its return value.
        out._backward = _backward
        return out

    def __radd__(self, other):
        """Return other + self for a non-Value left operand (e.g. 4 + Value(3.0))."""
        other = other if isinstance(other, Value) else Value(other)
        return other + self

    def __sub__(self, other):
        """Return self - other, expressed as self + (-other)."""
        return self + (-other)

    def __rsub__(self, other):
        """Return other - self for a non-Value left operand (e.g. 5 - Value(3.0))."""
        return (-self) + other

    def __mul__(self, other):
        """Return self * other; plain numbers are wrapped in a Value first."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*', f"{self} * {other}")

        def _backward():
            # Chain rule: local derivative times the gradient flowing into out.
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backward = _backward
        return out

    def __rmul__(self, other):
        """Return other * self for a non-Value left operand (e.g. 2 * Value(3.0))."""
        return self * other

    def __truediv__(self, other):
        """Return self / other, expressed as self * other**-1.

        The product node is returned directly so that `data` is a number and
        the backward closures of * and ** handle the gradients.
        """
        return self * other**-1

    def __rtruediv__(self, other):
        """Return other / self for a non-Value left operand (e.g. 1 / Value(4.0))."""
        return other * self**-1

    def __neg__(self):
        """Return -self."""
        out = Value(self.data * -1, (self,), '-', f"-{self}")

        def _backward():
            self.grad += -1 * out.grad

        out._backward = _backward
        return out

    def __pow__(self, other):
        """Return self ** other, where other is a number or a Value.

        The base always receives its gradient. An exponent that was passed in
        as a Value receives d(a**b)/db = a**b * ln(a) when the base is
        positive; for a non-positive base that derivative is not a real number
        and the exponent's grad is left untouched.
        """
        exponent_is_value = isinstance(other, Value)
        other = other if exponent_is_value else Value(other)
        out = Value(self.data ** other.data, (self, other), '**', f"{self} ^ {other}")

        def _backward():
            self.grad += other.data * self.data**(other.data - 1) * out.grad
            if exponent_is_value and self.data > 0:
                other.grad += out.data * math.log(self.data) * out.grad

        out._backward = _backward
        return out

    def tanh(self):  # Can be called as x = Value(3, label='x'); x.tanh()
        """Return tanh(self) as a new node with op 'tanh'."""
        # math.tanh saturates to +/-1.0 for large |x|; exp(2x) would overflow.
        t = math.tanh(self.data)
        out = Value(t, (self,), 'tanh')

        def _backward():
            self.grad += (1 - t**2) * out.grad

        out._backward = _backward
        return out

    def exp(self):
        """Return e**self as a new node with op 'exp'.

        Shadowed on instances by the `exp` attribute set in __init__; call it
        as `Value.exp(v)`.
        """
        x = self.data
        out = Value(math.exp(x), (self,), 'exp')

        def _backward():
            # d/dx e**x = e**x, which is out.data.
            self.grad += out.data * out.grad

        out._backward = _backward
        return out

    def backward(self):
        """Backpropagate from this node, filling `grad` on every node it depends on.

        Meant to be called on the output node: its own grad is set to 1.0
        (do/do = 1) and every node's _backward closure is then run in reverse
        topological order. Gradients are accumulated, so grads left over from
        an earlier call are added to rather than replaced.
        """
        # Post-order DFS with an explicit stack: same visiting order as the
        # recursive version, but not bounded by Python's recursion limit.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # Every input of `node` is already in topo.
                topo.append(node)
                stack.pop()

        self.grad = 1.0

        # topo runs from inputs to output, so walk it backwards.
        for node in reversed(topo):
            node._backward()

# Marker output confirming that this cell ran to completion.
print("--------DONE--------")

--------DONE--------


In [590]:
# Small end-to-end check of the autograd engine: e = 4 * (a + b) - 5,
# so backprop from e should leave de/da = de/db = 4.
a = Value(3, label='a')
b = Value(4, label='b')
d = a + b
d.label = 'd'
e = 4 * d - 5
e.backward()

In [591]:
# Gradient accumulated on `a` by e.backward(), shown next to the node itself.
a.grad, a

(4.0, Value(data=3))

In [592]:
# Output node of the expression e = 4 * d - 5.
e

Value(data=23)

In [593]:
# Raw number wrapped by the output node.
e.data

23

In [594]:
# Input nodes of d and the expression string recorded when d was created.
d._prev, d.exp

({Value(data=3), Value(data=4)}, 'Value(data=3) + Value(data=4)')

In [595]:
# !pip install graphviz

In [596]:
# from graphviz import Digraph

# def trace(root):
#     nodes, edges = set(), set()
#     def build(v):
#         if v not in nodes:
#             nodes.add(v)
#             for child in v._prev:
#                 edges.add((child, v))
#                 build(child)
#     build(root)
#     return nodes, edges

# def draw_dot(root, format='svg', rankdir='LR'):
#     """
#     format: png | svg | ...
#     rankdir: TB (top to bottom graph) | LR (left to right)
#     """
#     assert rankdir in ['LR', 'TB']
#     nodes, edges = trace(root)
#     dot = Digraph(format=format, graph_attr={'rankdir': rankdir}) #, node_attr={'rankdir': 'TB'})

#     for n in nodes:
#         dot.node(name=str(id(n)), label = "{ data %.4f | grad %.4f }" % (n.data, n.grad), shape='record')
#         if n._op:
#             dot.node(name=str(id(n)) + n._op, label=n._op)
#             dot.edge(str(id(n)) + n._op, str(id(n)))

#     for n1, n2 in edges:
#         dot.edge(str(id(n1)), str(id(n2)) + n2._op)

#     return dot

In [597]:
# draw_dot(d)

In [598]:
import random, math

class Neuron:
    """A single tanh unit holding `nin` weights and one bias, all Value objects."""

    def __init__(self, nin):
        # Weights are drawn first, then the bias, each uniformly from [-1, 1].
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))

    def parameters(self):
        """Return the weights followed by the bias as one flat list."""
        return [*self.w, self.b]

    def __call__(self, x):
        """Return tanh(w . x + b); invoked when the neuron is called as n(x)."""
        # Start from the bias and fold in one weighted input at a time.
        pre_activation = self.b
        for weight, feature in zip(self.w, x):
            pre_activation = pre_activation + weight * feature
        return pre_activation.tanh()

In [599]:
# Evaluate a freshly initialised 2-input neuron on one sample.
x = [2.0, 3.0]
n = Neuron(2)
n(x)

Value(data=-0.02178200113679778)

In [600]:
class Layer:
    """A group of `nout` neurons that all receive the same `nin` inputs."""

    def __init__(self, nin, nout):
        # Every input feature feeds every neuron, hence Neuron(nin) repeated nout times.
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def parameters(self):
        """Return the parameters of every neuron, flattened in neuron order."""
        collected = []
        for neuron in self.neurons:
            collected.extend(neuron.parameters())
        return collected

    def __call__(self, x):
        """Apply every neuron to x; a single-neuron layer returns its lone output unwrapped."""
        outs = [neuron(x) for neuron in self.neurons]
        if len(outs) == 1:
            return outs[0]
        return outs

In [601]:
# Evaluate a layer of 3 neurons on a 2-feature sample; `n` is rebound to the layer.
x = [2.0, 3.0]
n = Layer(2, 3)
n(x)

[Value(data=0.998231175563085),
 Value(data=0.2243891976026087),
 Value(data=0.9975405390438826)]

In [602]:
class MLP:
    """Multi-layer perceptron built from consecutive Layer objects.

    `nin` is the number of input features and `nout` is a list with the number
    of neurons in each layer.
    """

    def __init__(self, nin, nout):
        # For (4, [2, 3, 5]) the sizes are [4, 2, 3, 5]; each adjacent pair
        # (inputs, neurons) describes one layer: Layer(4, 2), Layer(2, 3), Layer(3, 5).
        sizes = [nin] + nout
        stacked = []
        for fan_in, fan_out in zip(sizes, sizes[1:]):
            stacked.append(Layer(fan_in, fan_out))
        self.layers = stacked

    def parameters(self):
        """Return the parameters of every layer, flattened in layer order."""
        collected = []
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected

    def __call__(self, x):
        """Feed x through the layers in order and return the last layer's output."""
        activations = x
        for layer in self.layers:
            activations = layer(activations)
        return activations

In [603]:
# Build a 3-input network with layers of 4, 4 and 1 neurons and evaluate it once.
# `n` is rebound to this MLP and is the model used by the cells that follow.
x = [2.0, 3.0, -1]
n = MLP(3, [4, 4, 1])
n(x)

Value(data=-0.9706925899357141)

In [604]:
# Flat list of every weight and bias in the network
# (41 values for MLP(3, [4, 4, 1]): 4*4 + 5*4 + 5*1).
n.parameters()

[Value(data=0.36971441689094875),
 Value(data=-0.615486158689885),
 Value(data=0.24207821520476536),
 Value(data=0.5503890246959218),
 Value(data=-0.7861422889104617),
 Value(data=0.027664895530653544),
 Value(data=0.6535919955148117),
 Value(data=-0.6780615587909062),
 Value(data=-0.06894622633896463),
 Value(data=-0.18413369869193197),
 Value(data=0.5262484127965836),
 Value(data=0.5543300831611966),
 Value(data=-0.12042395557921792),
 Value(data=0.6219461715576238),
 Value(data=-0.7533381374191139),
 Value(data=0.8825871823123843),
 Value(data=0.7985727001021687),
 Value(data=0.03529669330228602),
 Value(data=0.22857009721582844),
 Value(data=0.27753920418634936),
 Value(data=0.48295079841770017),
 Value(data=-0.7517448661300883),
 Value(data=0.5374470286633526),
 Value(data=0.7445669796182788),
 Value(data=0.9051400098903977),
 Value(data=0.796151529573855),
 Value(data=0.546136644383018),
 Value(data=-0.9476082684196419),
 Value(data=0.2746563897144576),
 Value(data=-0.18479802354

In [605]:
# Trying out on a tiny dataset
# xs: four samples with three features each, matching the 3 inputs of the MLP `n`
xs = [
    [2, 3, -1],
    [3, -1, 0.5],
    [0.5, 1, 1],
    [1, 1, -1],
]

# ys: one target per sample, in the same order as xs
ys = [1, -1, -1, 1]

# Predictions of the untrained network, one Value per sample
ypred = [n(x) for x in xs]
ypred

[Value(data=-0.9706925899357141),
 Value(data=0.1497306026815109),
 Value(data=-0.6495141371587563),
 Value(data=-0.9325617860026163)]

In [606]:
# Sum of squared errors between predictions and targets. sum() starts from 0,
# so the first addition is int + Value and goes through Value.__radd__.
loss = sum((pred - target)**2 for target, pred in zip(ys, ypred))
loss
# loss = sum([loss_val.data for loss_val in t_loss])
# loss

Value(data=9.063145139539117)

In [607]:
# Before backprop: first weight of the first neuron in the first layer
n.layers[0].neurons[0].w[0]

Value(data=0.36971441689094875)

In [608]:
# Backpropagate from the loss; gradients are accumulated (+=) into every
# node of the graph, including the network parameters.
loss.backward()

In [609]:
# After backprop: backward() only fills in .grad, so .data is unchanged
n.layers[0].neurons[0].w[0].data, n.layers[0].neurons[0].w[0].grad

(0.36971441689094875, -0.08566754412032315)

In [610]:
# Forward pass again. No parameter update has been applied since the previous
# forward pass, so the predictions are the same as before.
ypred = [n(x) for x in xs]
ypred

[Value(data=-0.9706925899357141),
 Value(data=0.1497306026815109),
 Value(data=-0.6495141371587563),
 Value(data=-0.9325617860026163)]

In [611]:
# # Gradient descent
# for param in n.parameters():
#   param.data += -0.001 * param.grad

In [638]:
# Forward pass over the dataset followed by the summed squared-error loss
# for the current parameters.
ypred = [n(sample) for sample in xs]
loss = sum((pred - target)**2 for target, pred in zip(ys, ypred))
print("Loss: ", loss)
print("Y pred: ", ypred)

Loss:  Value(data=0.10902120517123255)
Y pred:  [Value(data=0.8728778366877895), Value(data=-0.8206863164300601), Value(data=-0.8355052821396477), Value(data=0.8165626770619491)]


In [639]:
# Building a training loop
N_STEPS = 500           # Number of gradient-descent iterations
LEARNING_RATE = 0.0005  # Step size applied to every parameter

for i in range(N_STEPS):
    # Forward pass
    ypred = [n(x) for x in xs]
    loss = sum([(ypredi - ysi)**2 for ysi, ypredi in zip(ys, ypred)])

    # Zero the grads BEFORE backprop. Value accumulates gradients with +=, so
    # anything left over from an earlier backward() call (including one made in
    # a previous cell) would otherwise be added into this step's gradient.
    # This is the equivalent of zero_grad() in PyTorch.
    for param in n.parameters():
        param.grad = 0.0

    # Backprop
    loss.backward()

    # Gradient descent: move each parameter against its gradient
    for param in n.parameters():
        param.data += -LEARNING_RATE * param.grad

# Loss and predictions from the last forward pass (computed before the final update)
print("Loss: ", loss)
print("Y pred: ", ypred)

Loss:  Value(data=0.07552144150879558)
Y pred:  [Value(data=0.8952797402899925), Value(data=-0.8514181578084532), Value(data=-0.8609862590152525), Value(data=0.8478365198102515)]


In [618]:
# The model params as they stand after the gradient-descent updates so far.
# Nothing here guarantees they give the lowest possible loss.
# NOTE(review): this cell's execution count (618) is lower than that of the
# training cells above (638/639), so the output shown may predate the last
# training run - re-run after training to refresh it.
n.parameters()

[Value(data=0.5564672602973388),
 Value(data=-0.3625126043459481),
 Value(data=0.045056236760723574),
 Value(data=0.6647960416937546),
 Value(data=-0.7950141044123683),
 Value(data=0.00713535008996498),
 Value(data=0.6299120885773108),
 Value(data=-0.6990185060477334),
 Value(data=-0.3138665557146776),
 Value(data=-0.20475432017606496),
 Value(data=0.4799295104251299),
 Value(data=0.4365504641598593),
 Value(data=0.11054244942002281),
 Value(data=0.5435055136478706),
 Value(data=-0.7121042963327793),
 Value(data=0.95910279184566),
 Value(data=0.7708910088420023),
 Value(data=-0.0614893249131442),
 Value(data=0.13141464550914098),
 Value(data=0.4005424866315549),
 Value(data=0.571442087692058),
 Value(data=-0.6911442793460032),
 Value(data=0.5816365486870273),
 Value(data=0.8658927607152935),
 Value(data=0.7642151228827444),
 Value(data=0.7558344096794996),
 Value(data=0.5511902204552628),
 Value(data=-0.8473850689431112),
 Value(data=0.35885425874564997),
 Value(data=-0.288871200948980