In [4]:
import math
import numpy as np
import trace_graph
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def f(x):
    return 3*x**2 - 4*x + 5

In [None]:
xs = np.arange(-5, 5, 0.25)
ys = f(xs)
plt.plot(xs, ys)

In [None]:
h = 0.00001
slope = (f(xs+h) - f(xs))/h
print(np.vstack((xs, slope)).T)

In [None]:
h = 0.0001

def simple_func(a,b,c):
    return a*b + c

# inputs
a = 2.0
b = -3.0
c = 10.0

# slope wrt a
d1 = simple_func(a,b,c)
d2 = simple_func(a+h,b,c)
print('slope wrt a: ', d1, d2, (d2-d1)/h)

# slope wrt b
d1 = simple_func(a,b,c)
d2 = simple_func(a,b+h,c)
print('slope wrt b: ', d1, d2, (d2-d1)/h)

# slope wrt c
d1 = simple_func(a,b,c)
d2 = simple_func(a,b,c+h)
print('slope wrt c: ', d1, d2, (d2-d1)/h)

In [None]:
from typing import Self

class Value:

    def __init__(self, data: float) -> None:
        self.data = data
    
    def __repr__(self) -> str:
        return f"Value(data={self.data})"
    
    def __add__(self, other: Self) -> Self:
        return Value(self.data + other.data)
    
    def __mul__(self, other: Self) -> Self:
        return Value(self.data * other.data)

a = Value(2.0)    
b = Value(-3.0)
c = Value(10.0)
print(a+b)
print(a*b+c)

In [None]:
from typing import Self

class Value:

    def __init__(self, data: float, _children=()) -> None:
        self.data = data
        self._prev = set(_children)
    
    def __repr__(self) -> str:
        return f"Value(data={self.data}, prev={self._prev})"
    
    def __add__(self, other: Self) -> Self:
        return Value(self.data + other.data, (self, other))
    
    def __mul__(self, other: Self) -> Self:
        return Value(self.data * other.data, (self, other))

a = Value(2.0)    
b = Value(-3.0)
c = Value(10.0)
print(a+b)
print(a*b+c)

In [None]:
from typing import Self

class Value:

    def __init__(self, data: float, _children=(), _op='') -> None:
        self.data = data
        self._prev = set(_children)
        self._op = _op
    
    def __repr__(self) -> str:
        return f"Value(data={self.data}, prev={self._prev}, op={self._op})"
    
    def __add__(self, other: Self) -> Self:
        return Value(self.data + other.data, (self, other), '+')
    
    def __mul__(self, other: Self) -> Self:
        return Value(self.data * other.data, (self, other), '*')

a = Value(2.0)    
b = Value(-3.0)
c = Value(10.0)
print(a+b)
print(a*b+c)

In [None]:
from typing import Self

class Value:

    def __init__(self, data: float, _children=(), _op='', label='') -> None:
        self.data = data
        self._prev = set(_children)
        self._op = _op
        self.label = label
    
    def __repr__(self) -> str:
        return f"Value(data={self.data}, prev={self._prev}, op={self._op})"
    
    def __add__(self, other: Self) -> Self:
        return Value(self.data + other.data, (self, other), '+')
    
    def __mul__(self, other: Self) -> Self:
        return Value(self.data * other.data, (self, other), '*')

a = Value(2.0, label='a')    
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
d = a*b; d.label = 'd'
e = d+c; e.label = 'e'
f = Value(-2.0, label='f')
L = e*f; L.label = 'L'

In [None]:
from importlib import reload
reload(trace_graph)
trace_graph.draw_dot(L)

In [None]:
from typing import Self

class Value:

    def __init__(self, data: float, _children=(), _op='', label='') -> None:
        self.data = data
        self.grad = 0
        self._prev = set(_children)
        self._op = _op
        self.label = label
    
    def __repr__(self) -> str:
        return f"Value(data={self.data}, prev={self._prev}, op={self._op})"
    
    def __add__(self, other: Self) -> Self:
        return Value(self.data + other.data, (self, other), '+')
    
    def __mul__(self, other: Self) -> Self:
        return Value(self.data * other.data, (self, other), '*')

a = Value(2.0, label='a')    
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
d = a*b; d.label = 'd'
e = d+c; e.label = 'e'
f = Value(-2.0, label='f')
L = e*f; L.label = 'L'

In [None]:
from importlib import reload
reload(trace_graph)
trace_graph.draw_dot(L)

### computing the gradient manually

In [None]:
L.grad = 1.0
e.grad = L.grad * f.data
f.grad = L.grad * e.data
d.grad = e.grad * 1.0
c.grad = e.grad * 1.0
b.grad = d.grad * a.data
a.grad = d.grad * b.data

In [None]:
from importlib import reload
reload(trace_graph)
trace_graph.draw_dot(L)

### moving in the direction of the gradient will increase the function value

In [None]:
a.data += 0.01 * a.grad
b.data += 0.01 * b.grad
c.data += 0.01 * c.grad
f.data += 0.01 * f.grad
d = a*b
e = d+c
L = e*f
print(L.data)

### small MLP

In [None]:
from typing import Self

class Value:

    def __init__(self, data: float, _children=(), _op='', label='') -> None:
        self.data = data
        self.grad = 0
        self._prev = set(_children)
        self._op = _op
        self.label = label
    
    def __repr__(self) -> str:
        return f"Value(data={self.data}, prev={self._prev}, op={self._op})"
    
    def __add__(self, other: Self) -> Self:
        return Value(self.data + other.data, (self, other), '+')
    
    def __mul__(self, other: Self) -> Self:
        return Value(self.data * other.data, (self, other), '*')
    
    def tanh(self) -> Self:
        x = self.data
        t = (np.exp(2*x) - 1) / (np.exp(2*x) + 1)
        return Value(t, (self,), 'tanh')

In [None]:
# inputs
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias
b = Value(6.881373587, label='b')

# x1*w1 + x2*w2 + b
x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
o = n.tanh(); o.label = 'o'

In [None]:
print(o)

In [None]:
trace_graph.draw_dot(o)

In [None]:
o.grad = 1.0
n.grad = 1 - o.data**2
b.grad = n.grad
x1w1x2w2.grad = n.grad
x2w2.grad = x1w1x2w2.grad
x1w1.grad = x1w1x2w2.grad
w2.grad = x2.data * x2w2.grad
x2.grad = w2.data * x2w2.grad
w1.grad = x1.data * x1w1.grad
x1.grad = w1.data * x1w1.grad

In [None]:
trace_graph.draw_dot(o)

### backward implementation

In [None]:
from typing import Self

class Value:

    def __init__(self, data: float, _children=(), _op='', label='') -> None:
        self.data = data
        self.grad = 0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label
    
    def __repr__(self) -> str:
        return f"Value(data={self.data}, prev={self._prev}, op={self._op})"
    
    def __add__(self, other: Self) -> Self:
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward

        return out
    
    def __mul__(self, other: Self) -> Self:
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out
    
    def tanh(self) -> Self:
        x = self.data
        t = (np.exp(2*x) - 1) / (np.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')

        def _backward():
            self.grad += (1 - t**2) * out.grad            
        out._backward = _backward
        return out

In [None]:
# inputs
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias
b = Value(6.881373587, label='b')

# x1*w1 + x2*w2 + b
x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
o = n.tanh(); o.label = 'o'

In [None]:
o.grad = 1.0

In [None]:
o._backward()
n._backward()
x1w1x2w2._backward()
x1w1._backward()
x2w2._backward()

In [None]:
trace_graph.draw_dot(o)

### implementing backward inside the Value class

In [None]:
from typing import Self

class Value:

    def __init__(self, data: float, _children=(), _op='', label='') -> None:
        self.data = data
        self.grad = 0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label
    
    def __repr__(self) -> str:
        return f"Value(data={self.data}, prev={self._prev}, op={self._op})"
    
    def __add__(self, other: Self) -> Self:
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward

        return out
    
    def __mul__(self, other: Self) -> Self:
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out
    
    def tanh(self) -> Self:
        x = self.data
        t = (np.exp(2*x) - 1) / (np.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')

        def _backward():
            self.grad += (1 - t**2) * out.grad            
        out._backward = _backward
        return out
    
    def backward(self):

        topo = []
        visited = set()
        def build_topo(v):
            if v in visited: return
            visited.add(v)
            for child in v._prev:
                build_topo(child)
            topo.append(v)
        build_topo(self)
        
        self.grad = 1.0
        for node in reversed(topo):
            node._backward()

In [None]:
# inputs
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias
b = Value(6.881373587, label='b')

# x1*w1 + x2*w2 + b
x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
o = n.tanh(); o.label = 'o'

In [None]:
o.backward()

In [None]:
trace_graph.draw_dot(o)

### generalizing the Value class to include exponentiation and division functions

In [7]:
from typing import Self

class Value:

    def __init__(self, data: float, _children=(), _op='', label='') -> None:
        self.data = data
        self.grad = 0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label
    
    def __repr__(self) -> str:
        return f"Value(data={self.data}, prev={self._prev}, op={self._op})"
    
    def __add__(self, other: Self) -> Self: # self + other
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward

        return out

    def __radd__(self, other: Self) -> Self: # other + self
        return self + other
        
    def __neg__(self) -> Self: # -self
        return self * -1
    
    def __sub__(self, other) -> Self: # self - other
        return self + (-other)
    
    def __rsub__(self, other: Self) -> Self: # other - self
        return self + (-other)
        
    def __mul__(self, other: Self) -> Self: # self * other
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out
    
    def __pow__(self, other) -> Self: # self ^ other
        assert isinstance(other, (int, float)), "only supports int or float powers"
        out = Value(self.data ** other, (self,), f'**{other}')

        def _backward():
            self.grad = (other * (self.data ** (other-1))) * out.grad
        out._backward = _backward

        return out    

    def __rmul__(self, other: Self) -> Self: # other * self
        return self * other
    
    def __truediv__(self, other) -> Self: # self / other
        return self * (other ** -1)
    
    def tanh(self) -> Self:
        x = self.data
        t = (np.exp(2*x) - 1) / (np.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')

        def _backward():
            self.grad += (1 - t**2) * out.grad            
        out._backward = _backward
        
        return out
    
    def exp(self) -> Self:
        x = self.data
        out = Value(math.exp(x), (self,), 'exp')

        def _backward():
            self.grad = out.data * out.grad
        out._backward = _backward
        
        return out
    
    def backward(self):

        topo = []
        visited = set()
        def build_topo(v):
            if v in visited: return
            visited.add(v)
            for child in v._prev:
                build_topo(child)
            topo.append(v)
        build_topo(self)
        
        self.grad = 1.0
        for node in reversed(topo):
            node._backward()

In [None]:
# inputs
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias
b = Value(6.881373587, label='b')

# x1*w1 + x2*w2 + b
x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
#----
#o = n.tanh(); o.label = 'o'
e = (2*n).exp(); e.label = 'e'
o = (e-1) / (e+1)
#----
o.label = 'o'
o.backward()
trace_graph.draw_dot(o)

### implementing the function using PyTorch

In [None]:
import torch

x1 = torch.Tensor([2.]).double(); x1.requires_grad = True
x2 = torch.Tensor([0.]).double(); x2.requires_grad = True
w1 = torch.Tensor([-3.]).double(); w1.requires_grad = True
w2 = torch.Tensor([1.]).double(); w2.requires_grad = True
b = torch.Tensor([6.8813735]).double(); b.requires_grad = True
n = x1*w1 + x2*w2 + b; n.retain_grad()
o = torch.tanh(n)

print(o.data.item())
o.backward()

print('x1', x1.grad.item())
print('x2', x2.grad.item())
print('w1', w1.grad.item())
print('w2', w2.grad.item())
print('n', n.grad.item())

### Higher level functions using the Value class
### Neuron

In [None]:
from typing import Self
import random

class Neuron:
    def __init__(self, n: int) -> None:
        self.w = [Value(random.uniform(-1,1)) for _ in range(n)]
        self.b = Value(random.uniform(-1,1))
    
    def __call__(self, x) -> Value:
        # w*x + b
        assert len(x) == len(self.w)
        act = sum((xi*wi for xi, wi in zip(x, self.w)), self.b)
        out = act.tanh()
        return out
    
class Layer:
    def __init__(self, nin: int, nout: int) -> None:
        self.neurons = [Neuron(nin) for _ in range(nout)]
    
    def __call__(self, x):
        out = [neuron(x) for neuron in self.neurons]
        return out[0] if len(out) == 1 else out

class MLP:
    def __init__(self, nin, nouts) -> None:
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [None]:
x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1])
n(x)

In [None]:
trace_graph.draw_dot(n(x))

### adding feature to read the parameters easily

In [8]:
from typing import Self
import random

class Neuron:
    def __init__(self, n: int) -> None:
        self.w = [Value(random.uniform(-1,1)) for _ in range(n)]
        self.b = Value(random.uniform(-1,1))
    
    def __call__(self, x) -> Value:
        # w*x + b
        assert len(x) == len(self.w)
        act = sum((xi*wi for xi, wi in zip(x, self.w)), self.b)
        out = act.tanh()
        return out
    
    def parameters(self):
        return self.w + [self.b]
    
class Layer:
    def __init__(self, nin: int, nout: int) -> None:
        self.neurons = [Neuron(nin) for _ in range(nout)]
    
    def __call__(self, x):
        out = [neuron(x) for neuron in self.neurons]
        return out[0] if len(out) == 1 else out
    
    def parameters(self):
        return [p for neuron in self.neurons for p in neuron.parameters()]


class MLP:
    def __init__(self, nin, nouts) -> None:
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
    
    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

In [19]:
x = [[random.uniform(-5,5) for _ in range(3)] for _ in range(4)]
y = [1.0, 1.0, -1.0, -1.0]
n = MLP(3, [4, 4, 1])
out = [n(xi) for xi in x]
print(x)
print(out)

[[-0.3652918488652892, 3.371476555476246, -3.2963086027033874], [3.2197945977742197, -1.8384885488503953, 0.13411374159008282], [0.2979477695477346, 0.6803100794205754, -2.0976534242219236], [2.895512725907137, -0.596285143691019, -1.1997135876884544]]
[Value(data=-0.9638652225889809, prev={Value(data=-1.997707048917433, prev={Value(data=-0.20150689312182357, prev={Value(data=0.4367130481488162, prev=set(), op=), Value(data=-0.4614171570462379, prev={Value(data=-0.4991102872282771, prev={Value(data=-0.5043134361025113, prev={Value(data=0.006172146547591771, prev={Value(data=-0.36942876542749303, prev=set(), op=), Value(data=-0.01670727113101095, prev={Value(data=-0.01670882590780831, prev={Value(data=2.177470101079239, prev={Value(data=0.6402249838111372, prev={Value(data=-0.062426235451489655, prev={Value(data=-0.3652918488652892, prev=set(), op=), Value(data=0.1708941375106099, prev=set(), op=)}, op=*), Value(data=0.7026512192626269, prev=set(), op=)}, op=+), Value(data=1.53724511726

In [20]:
for idx in range(100):
    # forward pass
    out = [n(xi) for xi in x]

    # loss - mean squared error
    loss = sum((oi - yi)**2 for oi, yi in zip(out, y))

    # backward - compute gradients
    for p in n.parameters():
        p.grad = 0.0
    loss.backward()

    # update weights
    lr = 0.01
    for p in n.parameters():
        p.data -= lr * p.grad
    
    print(idx, loss.data, [outi.data for outi in out])


0 4.516381130432431 [-0.9638652225889809, 0.46938458546165834, -0.9723714199011784, -0.38575374534801476]
1 4.466011857325028 [-0.9640984791863482, 0.4501126274426394, -0.9728515733764308, -0.44753655160116806]
2 4.435293140857434 [-0.9643293643662323, 0.4449917217974637, -0.9731794103512339, -0.48236136519427425]
3 4.411008284670365 [-0.9645496016238891, 0.4477475039171703, -0.9734243225865792, -0.504153187597132]
4 4.389418224484879 [-0.9647565722279112, 0.45464029422213237, -0.9736229380080507, -0.5193370429431265]
5 4.3694069675334894 [-0.9649489741629761, 0.4635727197231043, -0.9737941541261943, -0.5310207647601785]
6 4.3506004846869075 [-0.9651259880368985, 0.47340296278208305, -0.9739476573942013, -0.540764579254853]
7 4.332841161495872 [-0.9652870462152462, 0.4835207168549209, -0.974088376894735, -0.5493718546912326]
8 4.316037771160978 [-0.9654317403872787, 0.49360482330550265, -0.9742187881346572, -0.5572640751254604]
9 4.30012166712618 [-0.9655597744514904, 0.503489758417891