# Adam Optimization

In [None]:
import numpy as np

### Loss and Gradient Functions
Define functions for the loss $L(W_0, W_1) = 2W_0^2 + W_1^2$ and its gradient $\nabla_WL=[4W_0,2W_1]$.

In [None]:
def loss(W):
    """Evaluate the quadratic loss 2*W[0]^2 + W[1]^2.

    W is any indexable object whose first two entries are the weights;
    only W[0] and W[1] are read.
    """
    first, second = W[0], W[1]
    first_term = 2 * first * first
    second_term = second * second
    return first_term + second_term

def gradient(W):
    """Return the gradient [4*W[0], 2*W[1]] of the loss as a NumPy array.

    Only W[0] and W[1] are read.
    """
    d_first = 4 * W[0]
    d_second = 2 * W[1]
    return np.array([d_first, d_second])

### Initialization

In [None]:
print("ADAM")
# Starting weights [W_0, W_1].
W0 = np.array([2,2])
# Step size that scales every update.
alpha = 0.1
# Decay rate of the running average of the gradient (m).
beta1 = 0.9
# Decay rate of the running average of the squared gradient (v).
beta2 = 0.999
# Small positive constant added to sqrt(v) in the update's denominator.
# It must not be 0: a zero gradient component would make the update 0/0 = NaN.
eps = 1e-8
print("W0: {}".format(W0))
print("Loss W0: {}".format(loss(W0)))
# Both running averages start at zero; the scalar 0 broadcasts against the
# gradient arrays in the first update.
v0 = 0
m0 = 0

### Update Rule
$Update_{epoch=i}=-\frac{\alpha}{\sqrt{v_{epoch=i}} + \epsilon}m_{epoch=i}$, applied as $W_{epoch=i} = W_{epoch=i-1} + Update_{epoch=i}$,

where

+ $m_{epoch=i} = \beta_1m_{epoch=i-1} + (1-\beta_1) \nabla_WL_{epoch=i-1}, \quad m_{epoch=0} = 0$

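+ $v_{epoch=i} = \beta_2v_{epoch=i-1} + (1-\beta_2) (\nabla_WL_{epoch=i-1})^2, \quad v_{epoch=0} = 0$ (the gradient is squared element-wise)

Note: unlike the standard Adam algorithm, this simplified rule applies no bias correction to $m$ and $v$.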

### Epoch 1

In [None]:
# Epoch 1: one update of the weights, starting from W0.

# Gradient of the loss at W0 and its element-wise square.
gradW0 = gradient(W0)
grad2W0 = np.square(gradW0)

# Blend the new gradient information into the running averages.
m1 = (1-beta1)*gradW0 + beta1*m0
v1 = (1-beta2)*grad2W0 + beta2*v0

# Scale the averaged gradient by the root of the averaged squared gradient
# and step against it.
step1 = alpha*m1 / (np.sqrt(v1)+eps)
W1 = W0 - step1

# Report the intermediate quantities and the loss at the new weights.
print("Gradient W0: {}".format(gradW0))
print("Gradient2 W0: {}".format(grad2W0))
print("m1: {}".format(m1))
print("v1: {}".format(v1))
print("W1: {}".format(W1))
print("Loss W1: {}".format(loss(W1)))

### Epoch 2

In [None]:
# Epoch 2: one update of the weights, starting from W1.

# Gradient of the loss at W1 and its element-wise square.
gradW1 = gradient(W1)
grad2W1 = np.square(gradW1)

# Blend the new gradient information into the running averages from epoch 1.
m2 = (1-beta1)*gradW1 + beta1*m1
v2 = (1-beta2)*grad2W1 + beta2*v1

# Scale the averaged gradient by the root of the averaged squared gradient
# and step against it.
step2 = alpha*m2 / (np.sqrt(v2)+eps)
W2 = W1 - step2

# Report the intermediate quantities and the loss at the new weights.
print("Gradient W1: {}".format(gradW1))
print("Gradient2 W1: {}".format(grad2W1))
print("m2: {}".format(m2))
print("v2: {}".format(v2))
print("W2: {}".format(W2))
print("Loss W2: {}".format(loss(W2)))