# micrograd exercises

1. watch the [micrograd video](https://www.youtube.com/watch?v=VMj-3S1tku0) on YouTube
2. come back and complete these exercises to level up :)

## section 1: derivatives

In [None]:
# here is a mathematical expression that takes 3 inputs and produces one output
from math import sin, cos

def f(a, b, c):
  return -a**3 + sin(3*b) - 1.0/c + b**2.5 - a**0.5

print(f(2, 3, 4))

In [9]:
# write the function df that returns the analytical gradient of f
# i.e. use your skills from calculus to take the derivative, then implement the formula
# if you do not calculus then feel free to ask wolframalpha, e.g.:
# https://www.wolframalpha.com/input?i=d%2Fda%28sin%283*a%29%29%29
from math import cos

def gradf(a, b, c):
  aGrad = -3*a**2 - 0.5*a**-0.5
  bGrad = 3*cos(3*b) + 2.5*b**1.5
  cGrad = c**-2
  return [aGrad, bGrad, cGrad] # todo, return [df/da, df/db, df/dc]

# expected answer is the list of
ans = [-12.353553390593273, 10.25699027111255, 0.0625]
yours = gradf(2, 3, 4)
for dim in range(3):
  ok = 'OK' if abs(yours[dim] - ans[dim]) < 1e-5 else 'WRONG!'
  print(f"{ok} for dim {dim}: expected {ans[dim]}, yours returns {yours[dim]}")


OK for dim 0: expected -12.353553390593273, yours returns -12.353553390593273
OK for dim 1: expected 10.25699027111255, yours returns 10.25699027111255
OK for dim 2: expected 0.0625, yours returns 0.0625


In [30]:
# now estimate the gradient numerically without any calculus, using
# the approximation we used in the video.
# you should not call the function df from the last cell

# -----------
from math import cos, sin

# a = 2
# b = 3
# c = 4

# agrad = 0
# bgrad = 0
# cgrad = 0

# x = -a**3
# y = sin(3*b)
# exp1 = x + y

# x_ = -1/c
# y_ = b**2.5
# exp2 = x_ + y_

# z = -a**0.5
# exp3 = exp2 + z

# f = exp1 + exp3

# fgrad = 1;
# exp1grad = 1;
# exp3grad = 1;

# zgrad = exp3grad;
# agrad += -0.5*a**-0.5 * zgrad

# # done with exp3grad
# exp2grad = 1;

# x_grad = exp2grad
# y_grad = exp2grad

# bgrad += 2.5*b**1.5 * y_grad
# cgrad += c**-2 * x_grad

# # done with y_grad, x_grad and c_grad

# xgrad = exp1grad
# ygrad = exp1grad

# agrad += -3*a**2 * xgrad
# bgrad += 3*cos(3*b) * ygrad


a = 2
b = 3
c = 4

x = -a**3
y = sin(3*b)
exp1 = x + y

x_ = -1/c
y_ = b**2.5
exp2 = x_ + y_

exp3 = -a**0.5

L = exp1 + exp2 + exp3

L1 = L

agrad = 0
bgrad = 0
cgrad = 0

fgrad = 1;
exp1grad = 1;
exp2grad = 1
exp3grad = 1;

agrad += -0.5*a**-0.5 * exp3grad


x_grad = exp2grad
y_grad = exp2grad

bgrad += 2.5*b**1.5 * y_grad
cgrad += c**-2 * x_grad

xgrad = exp1grad
ygrad = exp1grad

agrad += -3*a**2 * xgrad
bgrad += 3*cos(3*b) * ygrad


for iter in range(29):
  if (iter < 20):
    h = 0.001
  else:
    h = 0.0001
  a += -h*agrad
  b += -h*bgrad
  c += -h*cgrad


  x = -a**3
  y = sin(3*b)
  exp1 = x + y

  x_ = -1/c
  y_ = b**2.5
  exp2 = x_ + y_

  exp3 = -a**0.5

  L = exp1 + exp2 + exp3
  print(f"L is {L}")

  L1 = L

  agrad = 0
  bgrad = 0
  cgrad = 0

  fgrad = 1;
  exp1grad = 1;
  exp2grad = 1
  exp3grad = 1;

  agrad += -0.5*a**-0.5 * exp3grad


  x_grad = exp2grad
  y_grad = exp2grad

  bgrad += 2.5*b**1.5 * y_grad
  cgrad += c**-2 * x_grad

  xgrad = exp1grad
  ygrad = exp1grad

  agrad += -3*a**2 * xgrad
  bgrad += 3*cos(3*b) * ygrad


numerical_grad = [agrad, bgrad, cgrad] # TODO



# -----------

# for dim in range(3):
#   ok = 'OK' if abs(numerical_grad[dim] - ans[dim]) < 1e-5 else 'WRONG!'
#   print(f"{ok} for dim {dim}: expected {ans[dim]}, yours returns {numerical_grad[dim]}")


L is 3.675575029404377
L is 0.6054070319986138
L is -3.0672582195094593
L is -7.591282642833134
L is -13.3084753984031
L is -20.714156198142213
L is -30.566347646921276
L is -44.09295164100362
L is -63.39452919984738
L is -92.24938809116014
L is -137.80977170103625
L is -214.46764546269728
L is -353.55155260433963
L is -630.5635127622439
L is -1253.2451894608462
L is -2903.92842718701
L is -8452.695201155042
L is -35338.74283217072
L is -276143.8548064081
L is -7114957.889562729
L is -8418994.004575998
L is -10056404.364163691
L is -12139222.609149272
L is -14827224.134696364
L is -18353127.518555775
L is -23064019.65377393
L is -29491611.028331302
L is -38475676.04465765
L is -51389871.32173745


In [29]:
# there is an alternative formula that provides a much better numerical
# approximation to the derivative of a function.
# learn about it here: https://en.wikipedia.org/wiki/Symmetric_derivative
# implement it. confirm that for the same step size h this version gives a
# better approximation.

# -----------


a = 2
b = 3
c = 4

agrad = 0
bgrad = 0
cgrad = 0

for iter in range(29):
  if (iter == 0):
    h = 0
  if (iter < 20):
    h = 0.001
  else:
    h = 0.0001

  a += -h*agrad
  b += -h*bgrad
  c += -h*cgrad


  x = -a**3
  y = sin(3*b)
  exp1 = x + y

  x_ = -1/c
  y_ = b**2.5
  exp2 = x_ + y_

  exp3 = -a**0.5

  L = exp1 + exp2 + exp3
  print(f"L is {L}")

  L1 = L

  agrad = 0
  bgrad = 0
  cgrad = 0

  exp1grad = 1;
  exp2grad = 1
  exp3grad = 1;

  agrad += ((-a**0.5 - -(a-h)**0.5)/(2*h)) * exp3grad


  x_grad = exp2grad
  y_grad = exp2grad

  bgrad += ((b**2.5 - (b-h)**2.5)/(2*h)) * y_grad
  cgrad += ((c**-1 - (c-h)**-1)/(2*h)) * x_grad

  xgrad = exp1grad
  ygrad = exp1grad

  agrad +=((-a**3 - -(a-h)**3)/(2*h)) * xgrad
  bgrad += ((sin(3*b) - sin(3*(b-h)))/(2*h))* ygrad


numerical_grad2 = [0, 0, 0] # TODO
# -----------

for dim in range(3):
  ok = 'OK' if abs(numerical_grad2[dim] - ans[dim]) < 1e-5 else 'WRONG!'
  print(f"{ok} for dim {dim}: expected {ans[dim]}, yours returns {numerical_grad2[dim]}")


L is 6.336362190988558
L is 5.032016039707964
L is 3.638981212540368
L is 2.135238152773397
L is 0.4964317770808362
L is -1.3049962973395808
L is -3.3008161172840618
L is -5.5282382000172525
L is -8.031471587339682
L is -10.863764536977678
L is -14.090147199292693
L is -17.791190624874424
L is -22.068231380357847
L is -27.05070320525991
L is -32.90649692683813
L is -39.85669329351763
L is -48.196683277057744
L is -58.32679076060235
L is -70.79736756404473
L is -86.3765153401368
L is -86.56216798710071
L is -86.74872076457822
L is -86.93575726071532
L is -87.12327911227932
L is -87.31128796267379
L is -87.49978546197268
L is -87.68877326694965
L is -87.8782530411104
L is -88.0682264547232
WRONG! for dim 0: expected -12.353553390593273, yours returns 0
WRONG! for dim 1: expected 10.25699027111255, yours returns 0
WRONG! for dim 2: expected 0.0625, yours returns 0


## section 2: support for softmax

In [27]:
# Value class starter code, with many functions taken out
from math import exp, log

class Value:

  def __init__(self, data, _children=(), _op='', label=''):
    self.data = data
    self.grad = 0.0
    self._backward = lambda: None
    self._prev = set(_children)
    self._op = _op
    self.label = label

  def __repr__(self):
    return f"Value(data={self.data})"

  def __radd__(self, other):
    return self + other

  def __add__(self, other): # exactly as in the video
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data + other.data, (self, other), '+')

    def _backward():
      self.grad += 1.0 * out.grad
      other.grad += 1.0 * out.grad
    out._backward = _backward

    return out


  # ------
  # re-implement all the other functions needed for the exercises below
  # your code here
  # TODO
  # ------

  def exp(self):
    d = self.data
    out = Value(exp(d), (self, ), 'exp')

    def _backward():
      self.grad += out.data * out.grad
    out._backward = _backward
    return out

  def __rmul__(self, other):
    return self*other

  def __mul__(self, other):
    other = other if isinstance(other, Value) else Value(other)

    out = Value(self.data*other.data, (self, other), '*')

    def backward():
      self.grad += other.data  * out.grad
      other.grad += self.data  * out.grad

    out._backward = backward
    return out

  def __truediv__(self, other):
    other = other if isinstance(other, Value) else Value(other)
    return self * other**-1

  def __pow__(self, other):
    out = Value(self.data**other, (self, ), 'pow')

    def backward():
      self.grad += (other*(self.data**(other-1))) * out.grad

    out._backward = backward
    return out

  def log(self):
    out = Value(log(self.data), (self, ), 'log')

    def backward():
      self.grad += (1/self.data) * out.grad
    out._backward = backward
    return out

  def __neg__(self):
    return self * -1

  def backward(self): # exactly as in video
    topo = []
    visited = set()
    def build_topo(v):
      if v not in visited:
        visited.add(v)
        for child in v._prev:
          build_topo(child)
        topo.append(v)
    build_topo(self)

    self.grad = 1.0
    for node in reversed(topo):
      node._backward()

In [26]:
# without referencing our code/video __too__ much, make this cell work
# you'll have to implement (in some cases re-implemented) a number of functions
# of the Value object, similar to what we've seen in the video.
# instead of the squared error loss this implements the negative log likelihood
# loss, which is very often used in classification.

# this is the softmax function
# https://en.wikipedia.org/wiki/Softmax_function
def softmax(logits):
  counts = [logit.exp() for logit in logits]
  denominator = sum(counts)
  out = [c / denominator for c in counts]
  return out

# this is the negative log likelihood loss function, pervasive in classification
logits = [Value(0.0), Value(3.0), Value(-2.0), Value(1.0)]
probs = softmax(logits)
loss = -probs[3].log() # dim 3 acts as the label for this input example
loss.grad = 1.0
loss.backward()
print(loss.data)

ans = [0.041772570515350445, 0.8390245074625319, 0.005653302662216329, -0.8864503806400986]
for dim in range(4):
  ok = 'OK' if abs(logits[dim].grad - ans[dim]) < 1e-5 else 'WRONG!'
  print(f"{ok} for dim {dim}: expected {ans[dim]}, yours returns {logits[dim].grad}")


2.1755153626167147
OK for dim 0: expected 0.041772570515350445, yours returns 0.041772570515350445
OK for dim 1: expected 0.8390245074625319, yours returns 0.8390245074625319
OK for dim 2: expected 0.005653302662216329, yours returns 0.005653302662216329
OK for dim 3: expected -0.8864503806400986, yours returns -0.8864503806400986


In [34]:
# verify the gradient using the torch library
# torch should give you the exact same gradient
import torch


logit_tensor = torch.Tensor([0.0, 3.0, -2.0, 1.0]).double()
logit_tensor.requires_grad = True

probs_torch = torch.softmax(logit_tensor, dim = 0)
loss_torch = -probs_torch[3].log()
loss_torch.backward()

print(loss_torch.data)
print(logit_tensor.grad)


tensor(2.1755, dtype=torch.float64)
tensor([ 0.0418,  0.8390,  0.0057, -0.8865], dtype=torch.float64)
