<a href="https://colab.research.google.com/github/rajiv1817/NNDL/blob/main/NNDL_Optimization_Techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Optimization Techniques

In [1]:
import numpy as np


In [2]:
def f(w,b,x): #sigmoid function
    return 1.0/(1.0 +np.exp(-(w*x+b)))

def error(w,b):
    err = 0.0
    for x,y in zip(X,Y):
        fx = f(w,b,x)
        err += 0.5 * (fx - y)**2
    return err

def grad_b(w,b,x,y):
    fx = f(w,b,x)
    return (fx - y) *fx * (1 - fx)

def grad_w(w,b,x,y):
    fx = f(w,b,x)
    return (fx - y) *fx * (1 - fx) * x

In [3]:
def do_gradient_descent():
    w, b, eta = init_w, init_b, 1.0
    params = []
    for i in range(1, max_iter+1):
        dw, db = 0, 0
        for x,y in zip(X,Y):
            dw += grad_w(w, b, x, y)
            db += grad_b(w, b, x, y)
        params.append((w, b, error(w,b)))
        w = w - eta * dw
        b = b - eta * db
    p = params[-1]
    print(f'w = {p[0]:f}, b = {p[1]:f}, e = {p[2]:f}')


def do_stochastic_gradient_descent():
    w, b, eta = init_w, init_b, 1.0
    params = []
    batch_size = 1
    for i in range(1, max_iter+1):
        indexes = np.random.randint(0, len(X), batch_size) # random sample
        Xs = np.take(X, indexes)
        Ys = np.take(Y, indexes)
        dw, db = 0, 0
        for x,y in zip(Xs,Ys):
            dw += grad_w(w, b, x, y)
            db += grad_b(w, b, x, y)
            params.append((w, b, error(w,b)))
            w = w - eta * dw
            b = b - eta * db
    p = params[-1]
    print(f'w = {p[0]:f}, b = {p[1]:f}, e = {p[2]:f}')


def do_mini_batch_gradient_descent():
    w, b, eta = init_w, init_b, 1.0
    params = []
    batch_size = 2
    for i in range(1, max_iter+1):
        indexes = np.random.randint(0, len(X), batch_size) # random sample
        Xs = np.take(X, indexes)
        Ys = np.take(Y, indexes)
        dw, db = 0, 0
        for x,y in zip(Xs,Ys):
            dw += grad_w(w, b, x, y)
            db += grad_b(w, b, x, y)
            params.append((w, b, error(w,b)))
            w = w - eta * dw
            b = b - eta * db
    p = params[-1]
    print(f'w = {p[0]:f}, b = {p[1]:f}, e = {p[2]:f}')


def do_momentum_gradient_descent():
    w, b, eta = init_w, init_b, 1.0
    v_w, v_b, prev_v_w, prev_v_b, gamma = 0, 0, 0, 0, 0.8
    params = []
    for i in range(1, max_iter+1):
        dw, db = 0, 0
        for x,y in zip(X,Y):
            dw += grad_w(w, b, x, y)
            db += grad_b(w, b, x, y)
        v_w = gamma * prev_v_w + eta*dw
        v_b = gamma * prev_v_b + eta*db
        w = w - gamma * prev_v_w + eta*dw
        b = b - gamma * prev_v_b + eta*db
        params.append((w, b, error(w,b)))
        prev_v_w = v_w
        prev_v_b = v_b
    p = params[-1]
    print(f'w = {p[0]:f}, b = {p[1]:f}, e = {p[2]:f}')


def do_nesterov_accelerated_gradient_descent():
    w, b, eta = init_w, init_b, 1.0
    prev_v_w, prev_v_b, gamma = 0, 0, 0.8
    params = []
    for i in range(1, max_iter+1):
        dw, db = 0, 0
        # do partial update
        v_w = gamma * prev_v_w
        v_b = gamma * prev_v_b
        for x,y in zip(X,Y):
          # calculate gradients after partial update
            dw += grad_w(w - v_w, b - v_b, x, y)
            db += grad_b(w - v_w, b - v_b, x, y)
        # now do the full update
        v_w = gamma * prev_v_w + eta*dw
        v_b = gamma * prev_v_b + eta*db
        w = w - v_w
        b = b - v_b
        prev_v_w = v_w
        prev_v_b = v_b
        er = error(w, b)
        params.append((w, b, er))
    p = params[-1]
    print(f'w = {p[0]:f}, b = {p[1]:f}, e = {p[2]:f}')


def do_adagrad():
    w, b, eta = init_w, init_b, 0.1
    v_w, v_b, eps = 0, 0, 1e-8
    params = []
    for i in range(1, max_iter+1):
        dw, db = 0, 0
        for x,y in zip(X,Y):
            dw += grad_w(w, b, x, y)
            db += grad_b(w, b, x, y)

        v_w = v_w + dw**2
        v_b = v_b + db**2
        w = w - (eta/np.sqrt(v_w + eps)) * dw
        b = b - (eta/np.sqrt(v_b + eps)) * db
        er = error(w, b)
        params.append((w, b, er))
    p = params[-1]
    print(f'w = {p[0]:f}, b = {p[1]:f}, e = {p[2]:f}')


def do_rmsprop():
    w, b, eta = init_w, init_b, 0.1
    v_w, v_b, beta, eps = 0, 0, 0.9, 1e-8
    params = []
    for i in range(1, max_iter+1):
        dw, db = 0, 0
        for x,y in zip(X,Y):
            dw += grad_w(w, b, x, y)
            db += grad_b(w, b, x, y)

        v_w = beta * v_w + (1 - beta) * dw**2
        v_b = beta * v_b + (1 - beta) * db**2

        w = w - (eta/np.sqrt(v_w + eps)) * dw
        b = b - (eta/np.sqrt(v_b + eps)) * db
        er = error(w, b)
        params.append((w, b, er))
    p = params[-1]
    print(f'w = {p[0]:f}, b = {p[1]:f}, e = {p[2]:f}')


def do_adam():
    w, b, eta, max_epochs = 1, 1, 0.01, 100,
    m_w, m_b, v_w, v_b, eps, beta1, beta2 = 0, 0, 0, 0, 1e-8, 0.9, 0.99
    params = []
    for i in range(1, max_iter+1):
        dw, db = 0, 0
        for x,y in zip(X,Y):
            dw += grad_w(w, b, x, y)
            db += grad_b(w, b, x, y)
            m_w = beta1 * m_w + (1-beta1) * dw
            m_b = beta1 * m_b + (1-beta1) * db
            v_w = beta2 * v_w + (1-beta2) * dw**2
            v_b = beta2 * v_b + (1-beta2) * db**2
            m_w = m_w/(1-beta1**(i+1))
            m_b = m_b/(1-beta1**(i+1))
            v_w = v_w/(1-beta2**(i+1))
            v_b = v_b/(1-beta2**(i+1))
            w = w - eta * m_w/np.sqrt(v_w + eps)
            b = b - eta * m_b/np.sqrt(v_b + eps)
            er = error(w, b)
            params.append((w, b, er))
    p = params[-1]
    print(f'w = {p[0]:f}, b = {p[1]:f}, e = {p[2]:f}')

In [4]:
X = np.array([2.5, 1.8])
Y = np.array([0.9, 0.5])

eta = 1
init_w = -2
init_b = 2
max_iter = 100

In [5]:
do_stochastic_gradient_descent()


w = -0.472226, b = 2.031714, e = 0.055019


In [6]:
do_mini_batch_gradient_descent()


w = 0.083726, b = 1.170387, e = 0.046972


In [7]:
do_momentum_gradient_descent()


w = 3.252622, b = 4.284385, e = 0.129980


In [8]:
do_nesterov_accelerated_gradient_descent()


w = 1.169024, b = -1.649317, e = 0.013296


In [9]:
do_adagrad()


w = -0.830849, b = 2.945572, e = 0.067177


In [10]:
do_rmsprop()


w = 1.387022, b = -2.247345, e = 0.010101


In [11]:
do_adam()


w = 0.971993, b = 0.972098, e = 0.098347
