# Recap
* Differentiating `y` w.r.t. `x` tells you how a small change `dx` in `x` (moving from `x` to `x + dx`) changes the value of `y`. This gives you control over changing `y`'s value.
* When working with multiple equations and dependencies, i.e. `y = f(x)`; `x = g(a, b)`; `a = h(c, d)`, and you care about `y`, you compute the gradient of `y` w.r.t. each of `x, a, b, c, d` by chaining the local derivatives along the dependency path. We use the chain rule of differentiation for this.
* If the objective is to increase the value of `y` then you simply change the values of `x, a, b, c, d` along the direction of their gradient by a pre-determined step.
* `f, g, h` could be any mathematical operation like `+`, `exp`, `*` etc.
* To increase `y`
   * Calculate `y`, with values of `x, a, b, c, d`.
   * Compute the gradient of `y` w.r.t. each value.
   * Update the values in the direction of the gradient.

# Neural Network
## Neuron
* A Neural Network typically takes a set of observations and tries to model a response.
* It does this by:
    * Randomly assigning a weight `w` for each input feature
    * Adding a bias `b`
    * Summing the weighted inputs and the bias (the activation)
    * Applying a non-linear transformation like `tanh` or `ReLU` - this is the output
## Layer
* You can have a series of neurons with an input and an output.

## Perceptron
* Input -> layer of neurons -> output

In [None]:
import math
import numpy as np
import matplotlib.pyplot as plt
import random
import torch
from graphviz import Digraph
%matplotlib inline

In [None]:
def trace(root):
    """Collect every node and edge reachable from ``root``.

    Walks the graph through each node's ``_prev`` collection. The walk is
    iterative (explicit stack) rather than recursive so that deep graphs,
    e.g. a long chain of additions, do not hit Python's recursion limit.

    Args:
        root: The node to start from; it and every node reachable through
            ``_prev`` must be hashable and expose a ``_prev`` iterable.

    Returns:
        A ``(nodes, edges)`` tuple where ``nodes`` is the set of visited nodes
        and ``edges`` is a set of ``(child, parent)`` pairs.
    """
    nodes, edges = set(), set()
    pending = [root]
    while pending:
        v = pending.pop()
        if v in nodes:
            # Already expanded via another parent; its edges are recorded.
            continue
        nodes.add(v)
        for child in v._prev:
            edges.add((child, v))
            pending.append(child)
    return nodes, edges

def draw_dot(root):
    """Build a Graphviz digraph of the computation graph ending at ``root``.

    Each value becomes a record-shaped node showing its label, data and grad.
    A value produced by an operation additionally gets a small op node that
    feeds into it; operands are wired to that op node.

    Returns:
        The populated ``Digraph`` (SVG format, laid out left to right).
    """
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right
    values, links = trace(root)

    for value in values:
        node_id = str(id(value))
        record = "{ %s | data %.4f | grad %.4f }" % (value.label, value.data, value.grad)
        # Rectangular ('record') node for the value itself.
        graph.node(name=node_id, label=record, shape='record')
        if not value._op:
            # Leaf value: no operation produced it, so no op node is needed.
            continue
        # Separate node for the operation, pointing at the value it produced.
        op_id = node_id + value._op
        graph.node(name=op_id, label=value._op)
        graph.edge(op_id, node_id)

    for src, dst in links:
        # Operands connect to the op node of the value they feed.
        graph.edge(str(id(src)), str(id(dst)) + dst._op)

    return graph

In [None]:
# Rewriting the Value class from scratch

class Value:
    """A scalar node in a computation graph with reverse-mode autodiff.

    Every arithmetic operation returns a new ``Value`` that remembers its
    operands (``_prev``), the operation that produced it (``_op``) and a
    ``_backward`` closure that pushes its own gradient to those operands.
    Calling ``backward()`` on the final node fills in ``grad`` on every node
    that contributed to it.
    """

    def __init__(self, data, _children=(), _op="", label=""):
        self.data = data
        # Operands that produced this value; used for backprop and plotting.
        self._prev = set(_children)
        self._op = _op
        self.label = label
        # d(output)/d(this value); filled in by backward().
        self.grad = 0
        # Local backward step; leaves have nothing to propagate.
        self._backward = lambda: None

    def __repr__(self):
        return f"Value=({self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), "+")

        def _backward():
            # Addition passes the gradient through unchanged to both operands.
            # += (not =) so that a value used in several places accumulates
            # the contributions from every use instead of overwriting them.
            self.grad += 1 * out.grad
            other.grad += 1 * out.grad
        out._backward = _backward

        return out

    def __radd__(self, other):
        # other + self, where other is a plain number.
        return self + other

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), "*")

        def _backward():
            # Product rule: each operand's local derivative is the other one.
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

    def __rmul__(self, other):
        # other * self, where other is a plain number.
        return self * other

    def __neg__(self):
        # Needed by __sub__ when the right-hand side is a Value.
        return self * -1

    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        # other - self, where other is a plain number.
        return other + (-self)

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only int/float exponents are supported"
        out = Value(self.data ** other, (self, ), f"**{other}")

        def _backward():
            # Power rule: d/dx x**n = n * x**(n - 1).
            self.grad += (other * (self.data ** (other - 1))) * out.grad
        out._backward = _backward

        return out

    def __truediv__(self, other):
        return self * (other ** -1)

    def __rtruediv__(self, other):
        # other / self, where other is a plain number.
        return other * (self ** -1)

    def exp(self):
        """Return e**self as a new Value."""
        out = Value(math.exp(self.data), (self,), "exp")

        def _backward():
            # d/dx exp(x) = exp(x), which is out.data.
            self.grad += out.data * out.grad
        out._backward = _backward

        return out

    def tanh(self):
        """Return tanh(self) as a new Value."""
        # math.tanh saturates to +/-1 for large inputs, whereas the explicit
        # (e^2x - 1) / (e^2x + 1) form raises OverflowError in math.exp.
        t = math.tanh(self.data)
        out = Value(t, (self,), "tanh")

        def _backward():
            # d/dx tanh(x) = 1 - tanh(x)**2.
            self.grad += (1 - (t ** 2)) * out.grad
        out._backward = _backward

        return out

    def backward(self):
        """Backpropagate from this node, filling ``grad`` on all ancestors.

        Builds a topological order of the graph (operands before results),
        seeds this node's gradient with 1.0 and then runs each node's
        ``_backward`` from this node back towards the leaves.
        """
        # Post-order DFS with an explicit stack instead of recursion, so long
        # chains (e.g. a sum over thousands of terms) do not exceed Python's
        # recursion limit.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # All operands handled: the node can now be emitted.
                topo.append(node)
                stack.pop()

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()

    # TODO: add a relu() activation.

In [None]:
# A single neuron with two inputs, assembled by hand from Value nodes.

# Forward pass: inputs, weights and bias
x1 = Value(2.0, label="x1")
x2 = Value(0.0, label="x2")
w1 = Value(-3.0, label="w1")
w2 = Value(1.0, label="w2")
b = Value(6.8813735870195432, label="b")

# Weighted inputs
x1w1 = x1 * w1
x1w1.label = "x1w1"
x2w2 = x2 * w2
x2w2.label = "x2w2"

# Sum of the weighted inputs, then add the bias
x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = "x1w1x2w2"
n = x1w1x2w2 + b
n.label = "n"

# Squash through tanh to get the neuron's output
o = n.tanh()
o.label = "o"

# Backward pass: populate .grad on every node feeding into o
o.backward()

# Plot the compute graph
draw_dot(o)

In [None]:
# Multi Layer perceptron

class Module:
    """Base class for anything that owns trainable parameters."""

    def parameters(self):
        """Return the list of parameters; the base class owns none."""
        return []

    def zero_grad(self):
        """Reset the gradient of every parameter to 0.0."""
        for param in self.parameters():
            param.grad = 0.0

class Neuron(Module):
    """A single tanh neuron with ``nin`` weights and one bias."""

    def __init__(self, nin):
        # One weight per input, then the bias, each drawn uniformly from [-1, 1].
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        """Return tanh(b + sum(w_i * x_i)) for the input sequence ``x``."""
        assert len(x) == len(self.w)
        # Start from the bias and fold in each weighted input in turn.
        activation = self.b
        for weight, value in zip(self.w, x):
            activation = activation + weight * value
        return activation.tanh()

    def parameters(self):
        """Return the weights followed by the bias."""
        return [*self.w, self.b]

class Layer(Module):
    """A group of ``nout`` neurons that all read the same ``nin`` inputs."""

    def __init__(self, nin, nout):
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        """Feed ``x`` to every neuron.

        Returns the single output directly when the layer has exactly one
        neuron, otherwise the list of outputs.
        """
        outs = [neuron(x) for neuron in self.neurons]
        if len(outs) == 1:
            return outs[0]
        return outs

    def parameters(self):
        """Return the parameters of all neurons, in neuron order."""
        collected = []
        for neuron in self.neurons:
            collected.extend(neuron.parameters())
        return collected

class MLP(Module):
    """A stack of layers applied one after another.

    ``nin`` is the number of inputs and ``nouts`` is a list giving the number
    of neurons in each successive layer.
    """

    def __init__(self, nin, nouts):
        sizes = [nin] + nouts
        # Consecutive size pairs give each layer's (inputs, outputs).
        self.layers = [Layer(n_in, n_out) for n_in, n_out in zip(sizes, sizes[1:])]

    def __call__(self, x):
        """Run ``x`` through every layer in order and return the final output."""
        out = x
        for layer in self.layers:
            out = layer(out)
        return out

    def parameters(self):
        """Return the parameters of all layers, in layer order."""
        collected = []
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected

In [None]:
# Toy dataset: four samples with three features each.
xs = [
  [2.0, 3.0, -1.0],
  [3.0, -1.0, 0.5],
  [0.5, 1.0, 1.0],
  [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0] # desired targets, one per row of xs
# Initialize the MLP: 3 inputs, then layers of 4, 4 and 1 neurons.
model = MLP(3, [4, 4, 1])
# Forward pass on the first sample with the freshly initialized weights.
model(xs[0])

In [None]:
epochs = 100
step = 0.1  # size of each gradient-descent update
for k in range(epochs):
    # Forward pass: predictions for every sample and the summed squared error.
    ypred = [model(x) for x in xs]
    loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))

    # Reset gradients before backprop; the _backward closures accumulate
    # with +=, so stale values would otherwise carry over between iterations.
    model.zero_grad()
    # Check each parameter individually: a plain sum could be zero even
    # when non-zero gradients happen to cancel out.
    assert all(p.grad == 0 for p in model.parameters())

    # Backward pass
    loss.backward()

    # Update: move each parameter against its gradient to reduce the loss.
    for p in model.parameters():
        p.data -= step * p.grad

    # Loss reported here was computed before this iteration's update.
    print(k, loss.data)

In [None]:
# Display the predictions from the last forward pass of the training loop
# (computed before that iteration's parameter update).
ypred

In [None]:
# Render the computation graph behind the most recently computed loss.
draw_dot(loss)