In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression

In [2]:
### Creating the data
X, y = make_regression(n_samples = 1000, n_features = 10)

In [3]:
def config():

  global w, b, n, learning_rate
  w = np.random.rand(10)
  b = np.random.rand(1)

  n = y.shape[0]
  learning_rate = 0.01


In [4]:
### Gradient Descent;

def gradient_descent(w, b, X, y):


  y_hat = X @ w + b

  loss = 1 / n * np.sum((y - y_hat) ** 2)

  dw = 2 / n *  np.sum(X.T @ ( y_hat - y ))
  db = 2 / n *  np.sum( y_hat - y )


  w = w - learning_rate * dw
  b = b - learning_rate * db


  return w, b, loss

In [6]:
epochs = 10
config()

for i in range(epochs):
  w, b, loss = gradient_descent(w, b, X, y)
  print(f'Epoch {i} - Loss = ', loss)


Epoch 0 - Loss =  34404.70324872005
Epoch 1 - Loss =  24887.1957086164
Epoch 2 - Loss =  18702.30586156976
Epoch 3 - Loss =  14683.06442813652
Epoch 4 - Loss =  12071.136279537686
Epoch 5 - Loss =  10373.73018804087
Epoch 6 - Loss =  9270.614081931437
Epoch 7 - Loss =  8553.690503366337
Epoch 8 - Loss =  8087.730755689358
Epoch 9 - Loss =  7784.858662240764


In [7]:
### RMS Prop;

def RmsProp(X, y, w, b, learning_rate=0.01, beta=.9, iterations = 10, epsilon=1e-7):

    v_tw = np.zeros_like(w)
    v_tb = np.zeros_like(b)

    # RMSprop optimization loop
    for i in range(iterations):
        # Compute the gradient of the loss function
        y_hat =  X @ w + b
        loss = 1 / n * np.sum((y - y_hat) ** 2)

        print(f'Epoch {i} - Loss = ', loss)

        dw = 2 / n *  np.sum(X.T @ ( y_hat - y ))
        db = 2 / n *  np.sum( y_hat - y )

        # Update the exponentially decaying average of squared gradients  ## learning rate correction
        v_tw = beta * v_tw + (1 - beta) * dw**2
        v_tb = beta * v_tb + (1 - beta) * db**2

        # Update the parameters using RMSprop
        w -= (learning_rate / (np.sqrt(v_tw) + epsilon)) * dw
        b -= (learning_rate / (np.sqrt(v_tb) + epsilon)) * db

    return w, b


In [8]:
config()

w, b = RmsProp(X, y, w, b, learning_rate, beta=.9, iterations = 10, epsilon=1e-7)

Epoch 0 - Loss =  34280.04033629526
Epoch 1 - Loss =  34246.49248776484
Epoch 2 - Loss =  34222.17607837403
Epoch 3 - Loss =  34201.828366860886
Epoch 4 - Loss =  34183.775088020215
Epoch 5 - Loss =  34167.23873325005
Epoch 6 - Loss =  34151.78588980398
Epoch 7 - Loss =  34137.14690818973
Epoch 8 - Loss =  34123.14118429671
Epoch 9 - Loss =  34109.64116988388


In [9]:
### Adam;

def Adam(X, y, w, b, learning_rate=0.01, beta=.9, iterations = 10, epsilon=1e-7):

    v_tw = np.zeros_like(w)
    v_tb = np.zeros_like(b)

    m_tw = np.zeros_like(w)
    m_tb = np.zeros_like(b)

    # RMSprop optimization loop
    for i in range(iterations):
        # Compute the gradient of the loss function
        y_hat =  X @ w + b
        loss = 1 / n * np.sum((y - y_hat) ** 2)

        print(f'Epoch {i} - Loss = ', loss)

        dw = 2 / n *  np.sum(X.T @ ( y_hat - y ))
        db = 2 / n *  np.sum( y_hat - y )

        # Update the exponentially decaying average of gradients  ## gradient correction
        m_tw = beta * m_tw + (1 - beta) * dw
        m_tb = beta * m_tb + (1 - beta) * db

        # Update the exponentially decaying average of squared gradients  ## learning rate correction
        v_tw = beta * v_tw + (1 - beta) * dw**2
        v_tb = beta * v_tb + (1 - beta) * db**2

        # Update the parameters using RMSprop
        w -= (learning_rate / (np.sqrt(v_tw) + epsilon)) * m_tw
        b -= (learning_rate / (np.sqrt(v_tb) + epsilon)) * m_tb

    return w, b


In [12]:
config()

w, b = Adam(X, y, w, b, learning_rate, beta=.9, iterations = 10, epsilon=1e-7)

Epoch 0 - Loss =  33933.79141709444
Epoch 1 - Loss =  33930.45492207886
Epoch 2 - Loss =  33925.85624904302
Epoch 3 - Loss =  33920.3646833473
Epoch 4 - Loss =  33914.17915972361
Epoch 5 - Loss =  33907.43021364352
Epoch 6 - Loss =  33900.2120849146
Epoch 7 - Loss =  33892.59674750191
Epoch 8 - Loss =  33884.641228567794
Epoch 9 - Loss =  33876.39188709095
