In [1]:
import numpy as np

# Generate data

In [2]:
# Inputs: 29 integer x-values
x = np.array([
    14, 86, 28, 51, 28, 29, 72, 62, 84, 15,
    42, 62, 47, 35,  9, 38, 44, 99, 13, 21,
    28, 20,  8, 64, 99, 70, 27, 17,  8,
])

# Ground-truth intercept (B) and slope (W) that the optimisers should recover
B, W = 30, 2

# Noise-free targets lying exactly on the line y = B + W*x
y = W*x + B

# Stochastic Gradient Descent

Loss function is square error:

$$loss=(b+wx-y)^2$$

Gradients by chain rule:

$$\frac{d}{dx} f(g(x)) = f'(g(x))\,g'(x)$$

$$\frac{dl}{db} = 2(b+wx-y)$$

$$\frac{dl}{dw} = 2(b+wx-y) \cdot x$$

In [3]:
# Hyperparameters: step size and number of passes over the data
lr = 1e-4
epochs = 50

# Starting guesses for the intercept (b) and the slope (w)
b, w = 1, 1

In [4]:
# Plain SGD: one parameter update per sample, repeated for `epochs` passes.
for _ in range(epochs):
    for x_i, y_i in zip(x, y):

        # Residual of the current prediction for this sample
        err = (b + w*x_i) - y_i

        # Gradients of the squared error with respect to b and w
        dl_db = 2*err
        dl_dw = dl_db*x_i

        # Step against the gradient, scaled by the learning rate
        b = b - lr*dl_db
        w = w - lr*dl_dw

# Report the fitted parameters next to the true ones
print("After %d epochs:" % epochs)
print("b = %.2f (actual %.2f)" % (b, B))
print("w = %.2f (actual %.2f)" % (w, W))

After 50 epochs:
b = 3.73 (actual 30.00)
w = 2.56 (actual 2.00)


# Momentum

- Parameter update is 90% the same direction as the previous step and 10% current gradient
- Note that previous step includes effect of all prior steps where the most recent ones are exponentially weighted (exponential average)
- Dampens oscillations by maintaining momentum from the previous direction so steps don't zig-zag as much, which should help reach the minimum faster

In [5]:
# Hyperparameters
beta = 0.9      # weight given to the previous step in the running average
lr = 1e-4
epochs = 50

# Starting guesses for the parameters
b, w = 1, 1

# Running-average steps, seeded at 1 for both parameters
b_step_prev, w_step_prev = 1, 1

In [6]:
# SGD with momentum: each update follows an exponentially weighted
# average of the gradients instead of the raw per-sample gradient.
for _ in range(epochs):
    for x_i, y_i in zip(x, y):

        # Gradients of the squared error for this sample
        err = (b + w*x_i) - y_i
        dl_db = 2*err
        dl_dw = dl_db*x_i

        # Blend the previous step (weight beta) with the new gradient
        # (weight 1-beta); the result is carried into the next iteration.
        b_step_prev = beta*b_step_prev + (1-beta)*dl_db
        w_step_prev = beta*w_step_prev + (1-beta)*dl_dw

        # Move each parameter along its averaged step
        b = b - lr*b_step_prev
        w = w - lr*w_step_prev


# Report the fitted parameters next to the true ones
print("After %d epochs:" % epochs)
print("b = %.2f (actual %.2f)" % (b, B))
print("w = %.2f (actual %.2f)" % (w, W))

After 50 epochs:
b = 3.46 (actual 30.00)
w = 2.31 (actual 2.00)


# RMSprop

- Root Mean Square Propagation
- Dampens oscillations by taking smaller steps when previous steps have been large, and larger steps when previous steps have been small
    - Reduce the step component perpendicular to the direction of the minimum and increase the step component directly towards it

In [7]:
# Hyperparameters
beta = 0.9      # decay rate of the running average of squared gradients
lr = 0.02
epochs = 50

# Starting guesses for the parameters
b, w = 1, 1

# Running averages of the squared gradients, seeded at 1
b_step_prev, w_step_prev = 1, 1

In [8]:
# RMSprop: divide each gradient by the root of a running average of its
# own squared values, so large recent gradients shrink the step.
for _ in range(epochs):
    for x_i, y_i in zip(x, y):

        # Gradients of the squared error for this sample
        err = (b + w*x_i) - y_i
        dl_db = 2*err
        dl_dw = dl_db*x_i

        # Exponentially weighted average of the squared gradients,
        # carried into the next iteration.
        b_step_prev = beta*b_step_prev + (1-beta)*dl_db**2
        w_step_prev = beta*w_step_prev + (1-beta)*dl_dw**2

        # Scale the raw gradient by lr / sqrt(running average).
        # Note there is no epsilon term guarding the division.
        b = b - lr/(b_step_prev**0.5) * dl_db
        w = w - lr/(w_step_prev**0.5) * dl_dw


# Report the fitted parameters next to the true ones
print("After %d epochs:" % epochs)
print("b = %.2f (actual %.2f)" % (b, B))
print("w = %.2f (actual %.2f)" % (w, W))

After 50 epochs:
b = 15.78 (actual 30.00)
w = 2.24 (actual 2.00)


# Adam

- Adaptive Moment Estimation
- Combination of both Momentum and RMSProp

In [13]:
# Hyperparameters
beta_m = 0.9    # decay rate of the momentum (gradient) average
beta_r = 0.9    # decay rate of the RMSProp (squared-gradient) average
lr = 1.0
epochs = 50

# Starting guesses for the parameters
b, w = 1, 1

# Running averages for both components, all seeded at 1
b_m_step_prev, w_m_step_prev = 1, 1
b_r_step_prev, w_r_step_prev = 1, 1

In [14]:
# Adam-style update: the momentum average supplies the direction and the
# RMSProp average supplies the per-parameter scaling.
# Note this simplified version applies no bias correction and no epsilon.
for _ in range(epochs):
    for x_i, y_i in zip(x, y):

        # Gradients of the squared error for this sample
        err = (b + w*x_i) - y_i
        dl_db = 2*err
        dl_dw = dl_db*x_i

        # Momentum component: running average of the gradients
        b_m_step_prev = beta_m*b_m_step_prev + (1-beta_m)*dl_db
        w_m_step_prev = beta_m*w_m_step_prev + (1-beta_m)*dl_dw

        # RMSProp component: running average of the squared gradients
        b_r_step_prev = beta_r*b_r_step_prev + (1-beta_r)*dl_db**2
        w_r_step_prev = beta_r*w_r_step_prev + (1-beta_r)*dl_dw**2

        # Averaged gradient divided by the root of the averaged square
        b = b - lr/(b_r_step_prev**0.5) * b_m_step_prev
        w = w - lr/(w_r_step_prev**0.5) * w_m_step_prev

# Report the fitted parameters next to the true ones
print("After %d epochs:" % epochs)
print("b = %.2f (actual %.2f)" % (b, B))
print("w = %.2f (actual %.2f)" % (w, W))

After 50 epochs:
b = 29.06 (actual 30.00)
w = 1.39 (actual 2.00)


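With these settings, `b` approaches its true value a lot faster than with either Momentum or RMSprop alone, although `w` is still some way off (1.39 vs. 2.00). Note that a much larger learning rate is used here, so the runs are not directly comparable.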

# Adam with Annealing

Although Adam does scale the step size, we can also apply learning rate annealing to help fine-tune the result as it gets closer to the minimum.

In [18]:
# Hyperparameters
beta_m = 0.9    # decay rate of the momentum (gradient) average
beta_r = 0.95   # decay rate of the RMSProp (squared-gradient) average
epochs = 50
lr = 1.0        # starting learning rate; the training loop may reduce it

# Starting guesses for the parameters
b, w = 1, 1

# Running averages for both components, all seeded at 1
b_m_step_prev, w_m_step_prev = 1, 1
b_r_step_prev, w_r_step_prev = 1, 1

# Baseline for the annealing check.
# Init as value of avg_grad after 1 epoch
avg_grad_prev = 5500

In [19]:
# Adam-style update with a simple learning-rate annealing rule applied
# once per epoch. Gradients are named dl_db / dl_dw, matching the
# dl/db and dl/dw notation of the derivation.
# Note this simplified version applies no bias correction and no epsilon.
for _ in range(epochs):
    for x_i, y_i in zip(x, y):

        # Gradients of the squared error for this sample
        err = (b + w*x_i) - y_i
        dl_db = 2*err
        dl_dw = dl_db*x_i

        # Momentum component: running average of the gradients
        b_m_step = beta_m*b_m_step_prev + (1-beta_m)*dl_db
        w_m_step = beta_m*w_m_step_prev + (1-beta_m)*dl_dw
        b_m_step_prev, w_m_step_prev = b_m_step, w_m_step

        # RMSProp component: running average of the squared gradients
        b_r_step = beta_r*b_r_step_prev + (1-beta_r)*dl_db**2
        w_r_step = beta_r*w_r_step_prev + (1-beta_r)*dl_dw**2
        b_r_step_prev, w_r_step_prev = b_r_step, w_r_step

        # Averaged gradient divided by the root of the averaged square
        b = b - lr/(b_r_step**0.5) * b_m_step
        w = w - lr/(w_r_step**0.5) * w_m_step

    # Gradient-magnitude indicator at the end of the epoch. Despite the
    # name, this is the SUM of the two RMS gradient estimates, not a mean.
    avg_grad = b_r_step**0.5 + w_r_step**0.5

    # Anneal: if the indicator has more than doubled relative to the
    # reference value, cut the learning rate to a quarter.
    if avg_grad/avg_grad_prev > 2:
        lr = lr/4

    # The reference only ever moves down, so avg_grad_prev holds the
    # smallest indicator seen so far rather than the previous epoch's.
    if avg_grad < avg_grad_prev:
        avg_grad_prev = avg_grad

# Report the fitted parameters and the final (annealed) learning rate
print("After %d epochs:" % epochs)
print("b = %.2f (actual %.2f)" % (b, B))
print("w = %.2f (actual %.2f)" % (w, W))
print("lr = %.8f" % lr)

After 50 epochs:
b = 30.00 (actual 30.00)
w = 2.00 (actual 2.00)
lr = 0.00024414


Smaller learning rates closer to the minimum allow convergence to the true values

Reference:
https://course.fast.ai/videos/?lesson=5