In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

## Rate of Change (Derivatives)

In [3]:
import math
import numpy as np

In [32]:
def f(x):
    return 3*x**2 - 4*x+5

In [33]:
f(3.0)

20.0

In [35]:
# (f(x + h)-f(x))/h -> definition of a derivative
h = 0.001
x = 3.0
f(x + h) # bump function, respons positively

20.014003000000002

In [36]:
f(x + h) - f(x) # Remove the function to get how much the function responded

0.01400300000000243

In [37]:
(f(x + h) - f(x))/h # slope at point

14.00300000000243

## Forward and Backwards

In [31]:
# Class to add linked list 'memory' to a mathematical expression
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0 # zeroing gradients, seen at the start of any training loop
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f'Value(data={self.data})'

    def __add__(self, other):
        return Value(self.data + other.data, (self, other), '+')

    def __mul__(self, other):
        return Value(self.data * other.data, (self, other), '*')

    def __tanh__(self):
        n = self.data
        t = (math.exp(2*n)-1)/(math.exp(2*n) + 1)
        out = Value(t, (self,), 'tanh')
        return out

In [38]:
# Forward pass (2 * -3 + 10) * -2
a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
e = a*b; e.label='e'
d = e+c; d.label='d'
f = Value(-2.0, label='f')
L = d * f; L.label='L'
L, L._prev, L._op, L.label

(Value(data=-8.0), {Value(data=-2.0), Value(data=4.0)}, '*', 'L')

dL/da = -2.0 * -3.0 = 6.0
dL/db = -2.0 * 2.0 = -4.0

de/da = -3.0 | b
de/db = 2.0 | a

e = a * b
(((a + h) * b) -(a * b))/h
(ab + hb - ab) / h
hb/h
b

Application of chain rule

dz/dx = dz/dy * dy/dx

dL/dc = (dL/dd) * (dd/dc)
dL/dc = -2.0 * 1
dL/dc = -2.0

takeaway - addition passes through value

dL / dc -2.0
dL / de -2.0

dd / dc 1.0
dd / de 1.0

d = c + e

(f(x + h)-f(x))/h
((c + h + e)-(e + c))/h
(c + h + e - e - c)/h
h/h
1

L = d * f

dL/dd = f 
dL/df = d

(f(x+h)-f(x))/h
x = d

dL = (((d + h) * f)-d * f)/h
(d*f + h*f - d * f)/h
h*f/h
f

Multiplacation takes other side and addition is a passthrough

In [57]:
a.grad = 6.0
b.grad = -4.0
c.grad = -2.0
e.grad = -2.0
d.grad = -2.0
f.grad = 4.0
L.grad = 1.0

In [58]:
# optimize leaf nodes
h = 0.01
a.data += h * a.grad
b.data += h * b.grad
c.data += h * c.grad
f.data += h * f.grad

e = a*b; e.label='e'
d = e+c; d.label='d'
f = Value(-2.0, label='f')
L = d * f; L.label='L'
L.data

# If gradients are correct we expect to nudge to 0, so -8 should go up

-7.4352

In [56]:
def check():
    h = 0.001
    
    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10.0, label='c')
    e = a*b; e.label='e'
    d = e+c; d.label='d'
    f = Value(-2.0, label='f')
    L = d * f; L.label='L'
    L1 = L.data

    a = Value(2.0, label='a')
    b = Value(-3.0 + h, label='b')
    c = Value(10.0, label='c')
    e = a*b; e.label='e'
    d = e+c; d.label='d'
    f = Value(-2.0, label='f')
    L = d * f; L.label='L'
    L2 = L.data

    print((L2 - L1)/h) # rise over run

check() # Derivative of L with respect to h

-3.9999999999995595


1. Theory: All problems can be represented by many linear functions fed through a non-linear function
2. Correct answer v guess can be graphed, by using calculus we can iteratively push our predicted answers to the correct answer -> "gradient descent"
3. Definition of a derivative: df(x) = (f(x + h) - f(x))/h
4. Create a class with a linked list structure, this linked list strucutre holds the memory of previous calulations
5. Manually by hand backpropagate back to the beginning
6. Manually optimize by updating each functions "gradient"
7. Define atomic functions for this class, each one should include a function to calculate the derivative and apply the chain rule (chain rule is used to pull the loss from the front of the expression (answer) back to the start)
8. Define a MLP with neurons, layers
9. Create functions to extract parameters up to the top of the model for easy updating
10. Calculate the loss (mean squared error)
11. Back propagate the loss back through
12. Update the parameters
13. Create a training loop (zero gradients before calling loss)