<div style="height:200px; width:100%;">
    <img style="height:200px; width:100%;" src="http://csc.lsu.edu/~saikat/deepsat/images/sat_img.png"/>
</div>

In [None]:
import numpy as np
from tqdm import tqdm

import pandas as pd
import sys

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import matplotlib

<h1 id="dataset" style="color:blue; border: 1px dotted green;"> 
    <center>Dataset
        <a class="anchor-link" href="#dataset" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Load a 1-in-50 subsample of the SAT-4 test split so the pure-NumPy training
# loops below stay fast. The callable keeps only row indices divisible by 50;
# with pandas' default header inference the first kept row (index 0) is
# consumed as the column header in both files, so features and labels stay
# aligned row-for-row.
features = pd.read_csv('../input/deepsat-sat4/X_test_sat4.csv', skiprows=lambda i: i % 50 != 0).values
# Rescale by 255 (presumably 8-bit pixel intensities -> [0, 1]; TODO confirm).
features = features / 255.
labels = pd.read_csv('../input/deepsat-sat4/y_test_sat4.csv', skiprows=lambda i: i % 50 != 0).values
# Collapse each label row to a single scalar target for the 1-unit output layer.
# NOTE(review): if the label rows are one-hot encoded, the row-wise max is 1 for
# every sample, which would make the target constant -- confirm the intended
# binary target (e.g. a single class column) against the dataset description.
labels = np.max(labels, axis=1)

# Seed the shuffle explicitly: np.random.seed() is only called in a later cell,
# so an unseeded shuffle here would give a different sample order on every run.
features, labels = shuffle(features, labels, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.33, random_state=42)

<h1 id="activation" style="color:blue; border: 1px dotted green;"> 
    <center>Activation Functions
        <a class="anchor-link" href="#activation" target="_self">¶</a>
    </center>
</h1>

In [None]:
def log(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, applied element-wise.

    Despite the name this is not a logarithm; it is used as the output
    activation of the network in the training cells.
    """
    denominator = 1 + np.exp(-1 * x)
    return 1 / denominator

def d_log(x):
    """Derivative of the logistic sigmoid at ``x``: ``s * (1 - s)`` with ``s = log(x)``."""
    s = log(x)
    return s * (1 - s)

def tanh(x):
    """Hyperbolic-tangent activation; a thin wrapper around ``np.tanh``."""
    activated = np.tanh(x)
    return activated

def d_tanh(x):
    """Derivative of tanh at ``x``: ``1 - tanh(x) ** 2``."""
    t = np.tanh(x)
    return 1 - t ** 2

def ReLu(x):
    """Rectified linear unit: keeps strictly positive entries, zeroes the rest."""
    # The comparison yields booleans; multiplying by 1.0 turns them into a 0/1 float mask.
    return x * ((x > 0.0) * 1.0)

def d_ReLu(x):
    """Derivative of ReLU: 1.0 where ``x > 0``, 0.0 elsewhere (including at 0)."""
    return (x > 0.0) * 1.0

def elu(matrix, alpha=3.0):
    """Exponential linear unit, applied element-wise.

    Returns ``x`` where ``x > 0`` and ``alpha * (exp(x) - 1)`` where ``x <= 0``.

    Parameters
    ----------
    matrix : scalar or np.ndarray
        Pre-activation values.
    alpha : float, optional
        Scale of the negative branch. Defaults to 3.0, the constant this
        notebook has always used.

    Returns
    -------
    Same shape as ``matrix``.
    """
    non_positive = (matrix <= 0) * 1.0
    positive = (matrix > 0) * 1.0
    # Zero the positive entries before exp() so large activations cannot overflow.
    clipped = matrix * non_positive
    # The mask (not the input itself) selects the negative branch; multiplying by
    # the input a second time would yield x * alpha * (exp(x) - 1), which is not ELU.
    negative_branch = alpha * (np.exp(clipped) - 1) * non_positive
    return matrix * positive + negative_branch

def d_elu(matrix, alpha=3.0):
    """Derivative of the exponential linear unit, applied element-wise.

    Returns ``1`` where ``x > 0`` and ``alpha * exp(x)`` where ``x <= 0``
    (so the value at exactly 0 is ``alpha``).

    Parameters
    ----------
    matrix : scalar or np.ndarray
        Pre-activation values at which to evaluate the derivative.
    alpha : float, optional
        Scale of the negative branch. Defaults to 3.0, the constant this
        notebook has always used.

    Returns
    -------
    Same shape as ``matrix``.
    """
    positive = (matrix > 0) * 1.0
    non_positive = (matrix <= 0) * 1.0
    # Zero the positive entries before exp() so large activations cannot overflow.
    clipped = matrix * non_positive
    # The identity branch has slope 1, so the positive part is the mask itself,
    # not the input values.
    return positive + alpha * np.exp(clipped) * non_positive

<h1 id="weights" style="color:blue; border: 1px dotted green;"> 
    <center>Weights
        <a class="anchor-link" href="#weights" target="_self">¶</a>
    </center>
</h1>

In [None]:
# 1. Declare Weights
# Seed NumPy's global RNG so the initial weights are the same on every run.
np.random.seed(1234)

# Three dense layers (3136 -> 256 -> 128 -> 1), standard-normal draws scaled by 0.2.
w1 = 0.2 * np.random.randn(3136, 256)
w2 = 0.2 * np.random.randn(256, 128)
w3 = 0.2 * np.random.randn(128, 1)

# Every optimizer starts from the same initial weights. The per-optimizer names
# are bound to the very same array objects (no copies are made); this is safe
# because the training cells rebind their names (w = w - step) rather than
# updating the arrays in place.
w1_sgd = w1_m = w1_ng = w1_adagrad = w1_adadelta = w1_RSMprop = w1_adam = w1_nadam = \
    w1_sgd_noise = w1_noise = w1_noise_noise = w1_noise_adam = w1
w2_sgd = w2_m = w2_ng = w2_adagrad = w2_adadelta = w2_RSMprop = w2_adam = w2_nadam = \
    w2_sgd_noise = w2_noise = w2_noise_noise = w2_noise_adam = w2
w3_sgd = w3_m = w3_ng = w3_adagrad = w3_adadelta = w3_RSMprop = w3_adam = w3_nadam = \
    w3_sgd_noise = w3_noise = w3_noise_noise = w3_noise_adam = w3

<h1 id="sgd" style="color:blue; border: 1px dotted green;"> 
    <center>SGD
        <a class="anchor-link" href="#sgd" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Plain stochastic gradient descent, one sample per step.
# num_epoch, learn_rate and cost_array defined here are reused by later cells.
num_epoch = 10
total_cost = 0
learn_rate = 0.0003
cost_array = []        # one per-epoch cost curve per optimizer, in run order
cost_temp_array = []   # per-epoch cost curve of the optimizer in this cell

for e in range(num_epoch):
    for i in range(len(X_train)):

        # Single-sample batch: X is a (1, n_features) row, y a (1, 1) target.
        X = X_train[i][np.newaxis]
        y = np.array([[y_train[i]]])

        # Forward pass: ELU -> tanh -> sigmoid.
        l1 = np.dot(X, w1_sgd)
        l1A = elu(l1)
        l2 = np.dot(l1A, w2_sgd)
        l2A = tanh(l2)
        l3 = np.dot(l2A, w3_sgd)
        l3A = log(l3)

        # Half squared error for this sample, accumulated over the epoch.
        cost = 0.5 * np.square(l3A - y).sum()
        total_cost += cost

        # Backward pass, output layer first; each delta feeds the layer below.
        delta_3 = (l3A - y) * d_log(l3)
        grad_3 = np.dot(l2A.T, delta_3)

        back_2 = np.dot(delta_3, w3_sgd.T)
        delta_2 = back_2 * d_tanh(l2)
        grad_2 = np.dot(l1A.T, delta_2)

        back_1 = np.dot(delta_2, w2_sgd.T)
        grad_1 = np.dot(X.T, back_1 * d_elu(l1))

        # Rebind (not in-place) so the shared initial weight arrays stay untouched.
        w3_sgd = w3_sgd - learn_rate * grad_3
        w2_sgd = w2_sgd - learn_rate * grad_2
        w1_sgd = w1_sgd - learn_rate * grad_1

    if e % 1 == 0:
        print("e:{:2d}. SGD - Cost:{:1.3}".format(e + 1, total_cost))

    # Record this epoch's summed cost and reset the accumulator.
    cost_temp_array.append(total_cost)
    total_cost = 0

cost_array.append(cost_temp_array)

<h1 id="momentum" style="color:blue; border: 1px dotted green;"> 
    <center>Momentum
        <a class="anchor-link" href="#momentum" target="_self">¶</a>
    </center>
</h1>

In [None]:
# SGD with classical momentum: v <- alpha * v + learn_rate * grad ; w <- w - v.
v1, v2, v3 = 0, 0, 0   # per-layer velocity (turns into an array on the first step)
alpha = 0.001          # momentum coefficient
total_cost = 0
cost_temp_array = []

for e in range(num_epoch):
    for i in range(len(X_train)):

        # Single-sample batch: X is a (1, n_features) row, y a (1, 1) target.
        X = X_train[i][np.newaxis]
        y = np.array([[y_train[i]]])

        # Forward pass: ELU -> tanh -> sigmoid.
        l1 = np.dot(X, w1_m)
        l1A = elu(l1)
        l2 = np.dot(l1A, w2_m)
        l2A = tanh(l2)
        l3 = np.dot(l2A, w3_m)
        l3A = log(l3)

        # Half squared error for this sample, accumulated over the epoch.
        cost = 0.5 * np.square(l3A - y).sum()
        total_cost += cost

        # Backward pass, output layer first; each delta feeds the layer below.
        delta_3 = (l3A - y) * d_log(l3)
        grad_3 = np.dot(l2A.T, delta_3)

        back_2 = np.dot(delta_3, w3_m.T)
        delta_2 = back_2 * d_tanh(l2)
        grad_2 = np.dot(l1A.T, delta_2)

        back_1 = np.dot(delta_2, w2_m.T)
        grad_1 = np.dot(X.T, back_1 * d_elu(l1))

        # Blend the previous velocity with the new scaled gradient.
        v3 = alpha * v3 + learn_rate * grad_3
        v2 = alpha * v2 + learn_rate * grad_2
        v1 = alpha * v1 + learn_rate * grad_1

        # Rebind (not in-place) so the shared initial weight arrays stay untouched.
        w3_m = w3_m - v3
        w2_m = w2_m - v2
        w1_m = w1_m - v1

    if e % 1 == 0:
        print("e:{:2d}. Momentum - Cost:{:1.3}".format(e + 1, total_cost))

    # Record this epoch's summed cost and reset the accumulator.
    cost_temp_array.append(total_cost)
    total_cost = 0

cost_array.append(cost_temp_array)

<h1 id="nesterov" style="color:blue; border: 1px dotted green;"> 
    <center>Nesterov
        <a class="anchor-link" href="#nesterov" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Nesterov accelerated gradient: the gradient is evaluated at the look-ahead
# point w - alpha * v instead of at the current weights.
v1, v2, v3 = 0, 0, 0   # per-layer velocity (turns into an array on the first step)
alpha = 0.001          # momentum coefficient
total_cost = 0
cost_temp_array = []

for e in range(num_epoch):
    for i in range(len(X_train)):

        # Single-sample batch: X is a (1, n_features) row, y a (1, 1) target.
        X = np.expand_dims(X_train[i], axis=0)
        y = np.expand_dims(np.array([y_train[i]]), axis=1)

        # Forward pass at the current weights. It is only needed to report the
        # cost; no gradient is taken here because the update below uses the
        # look-ahead gradient exclusively.
        l1A = elu(X.dot(w1_ng))
        l2A = tanh(l1A.dot(w2_ng))
        l3A = log(l2A.dot(w3_ng))

        cost = np.square(l3A - y).sum() * 0.5
        total_cost = total_cost + cost

        # ------- LOOK-AHEAD ("fake") GRADIENT --------
        fake_w3_ng = w3_ng - alpha * v3
        fake_w2_ng = w2_ng - alpha * v2
        fake_w1_ng = w1_ng - alpha * v1

        l1 = X.dot(fake_w1_ng)
        l1A = elu(l1)

        l2 = l1A.dot(fake_w2_ng)
        l2A = tanh(l2)

        l3 = l2A.dot(fake_w3_ng)
        l3A = log(l3)

        # Backward pass through the look-ahead network, output layer first.
        delta_3 = (l3A - y) * d_log(l3)
        grad_3_fake = l2A.T.dot(delta_3)

        delta_2 = delta_3.dot(fake_w3_ng.T) * d_tanh(l2)
        grad_2_fake = l1A.T.dot(delta_2)

        delta_1 = delta_2.dot(fake_w2_ng.T) * d_elu(l1)
        grad_1_fake = X.T.dot(delta_1)
        # ------- LOOK-AHEAD ("fake") GRADIENT --------

        v3 = v3 * alpha + learn_rate * grad_3_fake
        v2 = v2 * alpha + learn_rate * grad_2_fake
        v1 = v1 * alpha + learn_rate * grad_1_fake

        # Rebind (not in-place) so the shared initial weight arrays stay untouched.
        w3_ng = w3_ng - v3
        w2_ng = w2_ng - v2
        w1_ng = w1_ng - v1

    print("e:{:2d}. Nesterov - Cost:{:1.3}".format(e + 1, total_cost))

    # Record this epoch's summed cost and reset the accumulator.
    cost_temp_array.append(total_cost)
    total_cost = 0

cost_array.append(cost_temp_array)

<h1 id="adagrad" style="color:blue; border: 1px dotted green;"> 
    <center>Adagrad
        <a class="anchor-link" href="#adagrad" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Adagrad: each weight's step is learn_rate divided by the root of the running
# sum of its squared gradients.
Adagrad_lr_1, Adagrad_lr_2, Adagrad_lr_3 = 0, 0, 0   # per-layer sums of squared gradients
Adagrad_e = 0.00000001                               # keeps the square root away from zero
total_cost = 0
cost_temp_array = []

for e in range(num_epoch):
    for i in range(len(X_train)):

        # Single-sample batch: X is a (1, n_features) row, y a (1, 1) target.
        X = X_train[i][np.newaxis]
        y = np.array([[y_train[i]]])

        # Forward pass: ELU -> tanh -> sigmoid.
        l1 = np.dot(X, w1_adagrad)
        l1A = elu(l1)
        l2 = np.dot(l1A, w2_adagrad)
        l2A = tanh(l2)
        l3 = np.dot(l2A, w3_adagrad)
        l3A = log(l3)

        # Half squared error for this sample, accumulated over the epoch.
        cost = 0.5 * np.square(l3A - y).sum()
        total_cost += cost

        # Backward pass, output layer first; each delta feeds the layer below.
        delta_3 = (l3A - y) * d_log(l3)
        grad_3 = np.dot(l2A.T, delta_3)

        back_2 = np.dot(delta_3, w3_adagrad.T)
        delta_2 = back_2 * d_tanh(l2)
        grad_2 = np.dot(l1A.T, delta_2)

        back_1 = np.dot(delta_2, w2_adagrad.T)
        grad_1 = np.dot(X.T, back_1 * d_elu(l1))

        # Accumulate squared gradients (never decayed).
        Adagrad_lr_3 = Adagrad_lr_3 + grad_3 ** 2
        Adagrad_lr_2 = Adagrad_lr_2 + grad_2 ** 2
        Adagrad_lr_1 = Adagrad_lr_1 + grad_1 ** 2

        # Per-weight effective learning rates.
        step_3 = learn_rate / np.sqrt(Adagrad_lr_3 + Adagrad_e)
        step_2 = learn_rate / np.sqrt(Adagrad_lr_2 + Adagrad_e)
        step_1 = learn_rate / np.sqrt(Adagrad_lr_1 + Adagrad_e)

        # Rebind (not in-place) so the shared initial weight arrays stay untouched.
        w3_adagrad = w3_adagrad - step_3 * grad_3
        w2_adagrad = w2_adagrad - step_2 * grad_2
        w1_adagrad = w1_adagrad - step_1 * grad_1

    if e % 1 == 0:
        print("e:{:2d}. Adagrad - Cost:{:1.3}".format(e + 1, total_cost))

    # Record this epoch's summed cost and reset the accumulator.
    cost_temp_array.append(total_cost)
    total_cost = 0

cost_array.append(cost_temp_array)

<h1 id="adadelta" style="color:blue; border: 1px dotted green;"> 
    <center>Adadelta
        <a class="anchor-link" href="#adadelta" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Adadelta: the step is the ratio of two running RMS values (past updates over
# past gradients) times the gradient, so no global learning rate is used.
AdaDelta_e, AdaDelta_v = 0.000001, 0.001            # epsilon, decay rate of the running averages
AdaDelta_1, AdaDelta_2, AdaDelta_3 = 0, 0, 0        # running averages of squared gradients
AdaDelta_1_v, AdaDelta_2_v, AdaDelta_3_v = 0, 0, 0  # running averages of squared updates
total_cost = 0
cost_temp_array = []

for e in range(num_epoch):
    for i in range(len(X_train)):

        # Single-sample batch: X is a (1, n_features) row, y a (1, 1) target.
        X = np.expand_dims(X_train[i], axis=0)
        y = np.expand_dims(np.array([y_train[i]]), axis=1)

        # Forward pass: ELU -> tanh -> sigmoid.
        l1 = X.dot(w1_adadelta)
        l1A = elu(l1)

        l2 = l1A.dot(w2_adadelta)
        l2A = tanh(l2)

        l3 = l2A.dot(w3_adadelta)
        l3A = log(l3)

        cost = np.square(l3A - y).sum() * 0.5
        total_cost = total_cost + cost

        # Backward pass, output layer first; each delta feeds the layer below.
        delta_3 = (l3A - y) * d_log(l3)
        grad_3 = l2A.T.dot(delta_3)

        delta_2 = delta_3.dot(w3_adadelta.T) * d_tanh(l2)
        grad_2 = l1A.T.dot(delta_2)

        delta_1 = delta_2.dot(w2_adadelta.T) * d_elu(l1)
        grad_1 = X.T.dot(delta_1)

        # Running averages of the squared gradients.
        AdaDelta_3 = AdaDelta_v * AdaDelta_3 + (1 - AdaDelta_v) * grad_3 ** 2
        AdaDelta_2 = AdaDelta_v * AdaDelta_2 + (1 - AdaDelta_v) * grad_2 ** 2
        AdaDelta_1 = AdaDelta_v * AdaDelta_1 + (1 - AdaDelta_v) * grad_1 ** 2

        # mid_grad_k is the signed update delta_w = -(RMS[delta_w] / RMS[g]) * g.
        # It already carries the minus sign, i.e. it points downhill.
        mid_grad_3 = - (np.sqrt(AdaDelta_3_v + AdaDelta_e) / np.sqrt(AdaDelta_3 + AdaDelta_e)) * grad_3
        mid_grad_2 = - (np.sqrt(AdaDelta_2_v + AdaDelta_e) / np.sqrt(AdaDelta_2 + AdaDelta_e)) * grad_2
        mid_grad_1 = - (np.sqrt(AdaDelta_1_v + AdaDelta_e) / np.sqrt(AdaDelta_1 + AdaDelta_e)) * grad_1

        # Running averages of the squared updates (used by the next step's numerator).
        AdaDelta_3_v = AdaDelta_v * AdaDelta_3_v + (1 - AdaDelta_v) * mid_grad_3 ** 2
        AdaDelta_2_v = AdaDelta_v * AdaDelta_2_v + (1 - AdaDelta_v) * mid_grad_2 ** 2
        AdaDelta_1_v = AdaDelta_v * AdaDelta_1_v + (1 - AdaDelta_v) * mid_grad_1 ** 2

        # ADD the signed update. Subtracting it would cancel the minus sign above
        # and climb the loss surface instead of descending it.
        w3_adadelta = w3_adadelta + mid_grad_3
        w2_adadelta = w2_adadelta + mid_grad_2
        w1_adadelta = w1_adadelta + mid_grad_1

    print("e:{:2d}. Adadelta - Cost:{:1.3}".format(e + 1, total_cost))

    # Record this epoch's summed cost and reset the accumulator.
    cost_temp_array.append(total_cost)
    total_cost = 0

# This curve is deliberately not appended to cost_array: the plotting cell's
# labels_z has no 'Adadelta' entry, so adding it would shift every later
# legend label by one.

<h1 id="rmsprop" style="color:blue; border: 1px dotted green;"> 
    <center>RMSprop
        <a class="anchor-link" href="#rmsprop" target="_self">¶</a>
    </center>
</h1>

In [None]:
# RMSprop: each weight's step is learn_rate divided by the root of an
# exponentially decayed average of its squared gradients.
RMSprop_1, RMSprop_2, RMSprop_3 = 0, 0, 0   # per-layer running averages of squared gradients
RMSprop_v, RMSprop_e = 0.9, 0.00000001      # decay rate, epsilon inside the square root
total_cost = 0
cost_temp_array = []

for e in range(num_epoch):
    for i in range(len(X_train)):

        # Single-sample batch: X is a (1, n_features) row, y a (1, 1) target.
        X = X_train[i][np.newaxis]
        y = np.array([[y_train[i]]])

        # Forward pass: ELU -> tanh -> sigmoid.
        l1 = np.dot(X, w1_RSMprop)
        l1A = elu(l1)
        l2 = np.dot(l1A, w2_RSMprop)
        l2A = tanh(l2)
        l3 = np.dot(l2A, w3_RSMprop)
        l3A = log(l3)

        # Half squared error for this sample, accumulated over the epoch.
        cost = 0.5 * np.square(l3A - y).sum()
        total_cost += cost

        # Backward pass, output layer first; each delta feeds the layer below.
        delta_3 = (l3A - y) * d_log(l3)
        grad_3 = np.dot(l2A.T, delta_3)

        back_2 = np.dot(delta_3, w3_RSMprop.T)
        delta_2 = back_2 * d_tanh(l2)
        grad_2 = np.dot(l1A.T, delta_2)

        back_1 = np.dot(delta_2, w2_RSMprop.T)
        grad_1 = np.dot(X.T, back_1 * d_elu(l1))

        # Decayed running averages of the squared gradients.
        RMSprop_3 = RMSprop_v * RMSprop_3 + (1 - RMSprop_v) * grad_3 ** 2
        RMSprop_2 = RMSprop_v * RMSprop_2 + (1 - RMSprop_v) * grad_2 ** 2
        RMSprop_1 = RMSprop_v * RMSprop_1 + (1 - RMSprop_v) * grad_1 ** 2

        # Per-weight effective learning rates.
        step_3 = learn_rate / np.sqrt(RMSprop_3 + RMSprop_e)
        step_2 = learn_rate / np.sqrt(RMSprop_2 + RMSprop_e)
        step_1 = learn_rate / np.sqrt(RMSprop_1 + RMSprop_e)

        # Rebind (not in-place) so the shared initial weight arrays stay untouched.
        w3_RSMprop = w3_RSMprop - step_3 * grad_3
        w2_RSMprop = w2_RSMprop - step_2 * grad_2
        w1_RSMprop = w1_RSMprop - step_1 * grad_1

    if e % 1 == 0:
        print("e:{:2d}. RMSprop - Cost:{:1.3}".format(e + 1, total_cost))

    # Record this epoch's summed cost and reset the accumulator.
    cost_temp_array.append(total_cost)
    total_cost = 0

cost_array.append(cost_temp_array)

<h1 id="adam" style="color:blue; border: 1px dotted green;"> 
    <center>Adam
        <a class="anchor-link" href="#adam" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Adam: decayed first/second moment estimates of the gradient, bias-corrected
# by 1 - beta**t where t counts the update steps taken so far.
Adam_m_1, Adam_m_2, Adam_m_3 = 0, 0, 0   # first-moment (mean) estimates per layer
Adam_v_1, Adam_v_2, Adam_v_3 = 0, 0, 0   # second-moment (uncentered variance) estimates per layer
Adam_Beta_1, Adam_Beta_2 = 0.9, 0.999    # decay rates of the two moment estimates
Adam_e = 0.00000001                      # epsilon added to the denominator
Adam_t = 0                               # number of update steps taken so far
total_cost = 0
cost_temp_array = []

for e in range(num_epoch):
    for i in range(len(X_train)):

        # Single-sample batch: X is a (1, n_features) row, y a (1, 1) target.
        X = np.expand_dims(X_train[i], axis=0)
        y = np.expand_dims(np.array([y_train[i]]), axis=1)

        # Forward pass: ELU -> tanh -> sigmoid.
        l1 = X.dot(w1_adam)
        l1A = elu(l1)

        l2 = l1A.dot(w2_adam)
        l2A = tanh(l2)

        l3 = l2A.dot(w3_adam)
        l3A = log(l3)

        cost = np.square(l3A - y).sum() * 0.5
        total_cost = total_cost + cost

        # Backward pass, output layer first; each delta feeds the layer below.
        delta_3 = (l3A - y) * d_log(l3)
        grad_3 = l2A.T.dot(delta_3)

        delta_2 = delta_3.dot(w3_adam.T) * d_tanh(l2)
        grad_2 = l1A.T.dot(delta_2)

        delta_1 = delta_2.dot(w2_adam.T) * d_elu(l1)
        grad_1 = X.T.dot(delta_1)

        Adam_m_3 = Adam_Beta_1 * Adam_m_3 + (1 - Adam_Beta_1) * grad_3
        Adam_m_2 = Adam_Beta_1 * Adam_m_2 + (1 - Adam_Beta_1) * grad_2
        Adam_m_1 = Adam_Beta_1 * Adam_m_1 + (1 - Adam_Beta_1) * grad_1

        Adam_v_3 = Adam_Beta_2 * Adam_v_3 + (1 - Adam_Beta_2) * grad_3 ** 2
        Adam_v_2 = Adam_Beta_2 * Adam_v_2 + (1 - Adam_Beta_2) * grad_2 ** 2
        Adam_v_1 = Adam_Beta_2 * Adam_v_1 + (1 - Adam_Beta_2) * grad_1 ** 2

        # Bias correction must use beta**t. The moments start at zero, so the
        # correction is large for the first steps and fades to 1 as t grows;
        # dividing by the constant (1 - beta) would instead rescale every step
        # by a fixed factor forever.
        Adam_t += 1
        m_correction = 1 - Adam_Beta_1 ** Adam_t
        v_correction = 1 - Adam_Beta_2 ** Adam_t

        Adam_m_3_hat = Adam_m_3 / m_correction
        Adam_m_2_hat = Adam_m_2 / m_correction
        Adam_m_1_hat = Adam_m_1 / m_correction

        Adam_v_3_hat = Adam_v_3 / v_correction
        Adam_v_2_hat = Adam_v_2 / v_correction
        Adam_v_1_hat = Adam_v_1 / v_correction

        # Rebind (not in-place) so the shared initial weight arrays stay untouched.
        w3_adam = w3_adam - (learn_rate / (np.sqrt(Adam_v_3_hat) + Adam_e)) * Adam_m_3_hat
        w2_adam = w2_adam - (learn_rate / (np.sqrt(Adam_v_2_hat) + Adam_e)) * Adam_m_2_hat
        w1_adam = w1_adam - (learn_rate / (np.sqrt(Adam_v_1_hat) + Adam_e)) * Adam_m_1_hat

    print("e:{:2d}. Adam - Cost:{:1.3}".format(e + 1, total_cost))

    # Record this epoch's summed cost and reset the accumulator.
    cost_temp_array.append(total_cost)
    total_cost = 0

cost_array.append(cost_temp_array)

<h1 id="nadam" style="color:blue; border: 1px dotted green;"> 
    <center>Nadam
        <a class="anchor-link" href="#nadam" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Nadam: Adam with a Nesterov-style look-ahead on the first moment. The step
# mixes the bias-corrected momentum with the bias-corrected current gradient.
Nadam_m_1, Nadam_m_2, Nadam_m_3 = 0, 0, 0   # first-moment estimates per layer
Nadam_v_1, Nadam_v_2, Nadam_v_3 = 0, 0, 0   # second-moment estimates per layer
Nadam_Beta_1, Nadam_Beta_2 = 0.9, 0.999     # decay rates of the two moment estimates
Nadam_e = 0.00000001                        # epsilon added to the denominator
Nadam_t = 0                                 # number of update steps taken so far
total_cost = 0
cost_temp_array = []

for e in range(num_epoch):
    for i in range(len(X_train)):

        # Single-sample batch: X is a (1, n_features) row, y a (1, 1) target.
        X = np.expand_dims(X_train[i], axis=0)
        y = np.expand_dims(np.array([y_train[i]]), axis=1)

        # Forward pass: ELU -> tanh -> sigmoid.
        l1 = X.dot(w1_nadam)
        l1A = elu(l1)

        l2 = l1A.dot(w2_nadam)
        l2A = tanh(l2)

        l3 = l2A.dot(w3_nadam)
        l3A = log(l3)

        cost = np.square(l3A - y).sum() * 0.5
        total_cost = total_cost + cost

        # Backward pass, output layer first; each delta feeds the layer below.
        delta_3 = (l3A - y) * d_log(l3)
        grad_3 = l2A.T.dot(delta_3)

        delta_2 = delta_3.dot(w3_nadam.T) * d_tanh(l2)
        grad_2 = l1A.T.dot(delta_2)

        delta_1 = delta_2.dot(w2_nadam.T) * d_elu(l1)
        grad_1 = X.T.dot(delta_1)

        Nadam_m_3 = Nadam_Beta_1 * Nadam_m_3 + (1 - Nadam_Beta_1) * grad_3
        Nadam_m_2 = Nadam_Beta_1 * Nadam_m_2 + (1 - Nadam_Beta_1) * grad_2
        Nadam_m_1 = Nadam_Beta_1 * Nadam_m_1 + (1 - Nadam_Beta_1) * grad_1

        Nadam_v_3 = Nadam_Beta_2 * Nadam_v_3 + (1 - Nadam_Beta_2) * grad_3 ** 2
        Nadam_v_2 = Nadam_Beta_2 * Nadam_v_2 + (1 - Nadam_Beta_2) * grad_2 ** 2
        Nadam_v_1 = Nadam_Beta_2 * Nadam_v_1 + (1 - Nadam_Beta_2) * grad_1 ** 2

        # Bias correction must use beta**t so it fades to 1 as training
        # progresses; dividing by the constant (1 - beta) would rescale every
        # step by a fixed factor forever.
        Nadam_t += 1
        m_correction = 1 - Nadam_Beta_1 ** Nadam_t
        v_correction = 1 - Nadam_Beta_2 ** Nadam_t

        Nadam_m_3_hat = Nadam_m_3 / m_correction
        Nadam_m_2_hat = Nadam_m_2 / m_correction
        Nadam_m_1_hat = Nadam_m_1 / m_correction

        Nadam_v_3_hat = Nadam_v_3 / v_correction
        Nadam_v_2_hat = Nadam_v_2 / v_correction
        Nadam_v_1_hat = Nadam_v_1 / v_correction

        # Weight of the current gradient in the Nesterov-style look-ahead term.
        grad_scale = (1 - Nadam_Beta_1) / m_correction

        # Rebind (not in-place) so the shared initial weight arrays stay untouched.
        w3_nadam = w3_nadam - (learn_rate / (np.sqrt(Nadam_v_3_hat) + Nadam_e)) * (Nadam_Beta_1 * Nadam_m_3_hat + grad_scale * grad_3)
        w2_nadam = w2_nadam - (learn_rate / (np.sqrt(Nadam_v_2_hat) + Nadam_e)) * (Nadam_Beta_1 * Nadam_m_2_hat + grad_scale * grad_2)
        w1_nadam = w1_nadam - (learn_rate / (np.sqrt(Nadam_v_1_hat) + Nadam_e)) * (Nadam_Beta_1 * Nadam_m_1_hat + grad_scale * grad_1)

    print("e:{:2d}. Nadam - Cost:{:1.3}".format(e + 1, total_cost))

    # Record this epoch's summed cost and reset the accumulator.
    cost_temp_array.append(total_cost)
    total_cost = 0

cost_array.append(cost_temp_array)

<h1 id="sgdnoise" style="color:blue; border: 1px dotted green;"> 
    <center>SGD with Gaussian Noise
        <a class="anchor-link" href="#sgdnoise" target="_self">¶</a>
    </center>
</h1>

In [None]:
# SGD with annealed additive Gaussian gradient noise.
total_cost = 0
n_value = 0.001   # base noise scale; also read by the "Noise + Gaussian Additive Noise" cell
cost_temp_array = []

for e in range(num_epoch):
    # ------ Calculate The Additive Noise -------
    # The standard deviation shrinks as epochs pass. It depends only on the
    # epoch index, so it is computed once per epoch rather than once per sample.
    ADDITIVE_NOISE_STD = n_value / (np.power((1 + e), 0.55))
    # ------ Calculate The Additive Noise -------

    for i in range(len(X_train)):

        # Single-sample batch: X is a (1, n_features) row, y a (1, 1) target.
        X = np.expand_dims(X_train[i], axis=0)
        y = np.expand_dims(np.array([y_train[i]]), axis=1)

        # Forward pass: ELU -> tanh -> sigmoid.
        l1 = X.dot(w1_sgd_noise)
        l1A = elu(l1)

        l2 = l1A.dot(w2_sgd_noise)
        l2A = tanh(l2)

        l3 = l2A.dot(w3_sgd_noise)
        l3A = log(l3)

        cost = np.square(l3A - y).sum() * 0.5
        total_cost = total_cost + cost

        # Backward pass, output layer first; each delta feeds the layer below.
        delta_3 = (l3A - y) * d_log(l3)
        grad_3 = l2A.T.dot(delta_3)

        delta_2 = delta_3.dot(w3_sgd_noise.T) * d_tanh(l2)
        grad_2 = l1A.T.dot(delta_2)

        delta_1 = delta_2.dot(w2_sgd_noise.T) * d_elu(l1)
        grad_1 = X.T.dot(delta_1)

        # Draw independent noise for every gradient entry. A single scalar
        # sample would be broadcast to all weights of all layers, shifting them
        # by the same amount instead of perturbing each gradient component.
        noise_3 = np.random.normal(loc=0, scale=ADDITIVE_NOISE_STD, size=grad_3.shape)
        noise_2 = np.random.normal(loc=0, scale=ADDITIVE_NOISE_STD, size=grad_2.shape)
        noise_1 = np.random.normal(loc=0, scale=ADDITIVE_NOISE_STD, size=grad_1.shape)

        # Rebind (not in-place) so the shared initial weight arrays stay untouched.
        w3_sgd_noise = w3_sgd_noise - learn_rate * (grad_3 + noise_3)
        w2_sgd_noise = w2_sgd_noise - learn_rate * (grad_2 + noise_2)
        w1_sgd_noise = w1_sgd_noise - learn_rate * (grad_1 + noise_1)

    print("e:{:2d}. SGD with Gaussian Noise - Cost:{:1.3}".format(e + 1, total_cost))

    # Record this epoch's summed cost and reset the accumulator.
    cost_temp_array.append(total_cost)
    total_cost = 0

cost_array.append(cost_temp_array)

<h1 id="noise" style="color:blue; border: 1px dotted green;"> 
    <center>Noise
        <a class="anchor-link" href="#noise" target="_self">¶</a>
    </center>
</h1>

In [None]:
# "Noise training": the weights are moved by random Gumbel draws instead of by
# back-propagated gradients; only the forward pass and the cost are computed.
total_cost = 0
n, p = 1, .5
cost_temp_array = []

for e in range(num_epoch):
    for i in range(len(X_train)):

        # Single-sample batch: X is a (1, n_features) row, y a (1, 1) target.
        X = X_train[i][np.newaxis]
        y = np.array([[y_train[i]]])

        # Forward pass: ELU -> tanh -> sigmoid.
        l1 = np.dot(X, w1_noise)
        l1A = elu(l1)
        l2 = np.dot(l1A, w2_noise)
        l2A = tanh(l2)
        l3 = np.dot(l2A, w3_noise)
        l3A = log(l3)

        # Half squared error for this sample, accumulated over the epoch.
        cost = 0.5 * np.square(l3A - y).sum()
        total_cost += cost

        # Stand-in "gradients": Gumbel samples shaped like each weight matrix,
        # drawn output layer first.
        gradient_weight_3 = np.random.gumbel(size=w3.shape)
        gradient_weight_2 = np.random.gumbel(size=w2.shape)
        gradient_weight_1 = np.random.gumbel(size=w1.shape)

        # Rebind (not in-place) so the shared initial weight arrays stay untouched.
        w3_noise = w3_noise - learn_rate * gradient_weight_3
        w2_noise = w2_noise - learn_rate * gradient_weight_2
        w1_noise = w1_noise - learn_rate * gradient_weight_1

    if e % 1 == 0:
        print("e:{:2d}. Noise - Cost:{:1.3}".format(e + 1, total_cost))

    # Record this epoch's summed cost and reset the accumulator.
    cost_temp_array.append(total_cost)
    total_cost = 0

cost_array.append(cost_temp_array)

<h1 id="noisenoise" style="color:blue; border: 1px dotted green;"> 
    <center>Noise + Gaussian Additive Noise
        <a class="anchor-link" href="#noisenoise" target="_self">¶</a>
    </center>
</h1>

In [None]:
# "Noise training" (random Gumbel draws in place of gradients) combined with
# annealed additive Gaussian noise. n_value is the base noise scale defined in
# the "SGD with Gaussian Noise" cell, which must have been run first.
total_cost = 0
cost_temp_array = []

for e in range(num_epoch):
    # ------ Calculate The Additive Noise -------
    # The standard deviation shrinks as epochs pass. It depends only on the
    # epoch index, so it is computed once per epoch rather than once per sample.
    ADDITIVE_NOISE_STD = n_value / (np.power((1 + e), 0.55))
    # ------ Calculate The Additive Noise -------

    for i in range(len(X_train)):

        # Single-sample batch: X is a (1, n_features) row, y a (1, 1) target.
        X = np.expand_dims(X_train[i], axis=0)
        y = np.expand_dims(np.array([y_train[i]]), axis=1)

        # Forward pass: ELU -> tanh -> sigmoid (used only to report the cost).
        l1 = X.dot(w1_noise_noise)
        l1A = elu(l1)

        l2 = l1A.dot(w2_noise_noise)
        l2A = tanh(l2)

        l3 = l2A.dot(w3_noise_noise)
        l3A = log(l3)

        cost = np.square(l3A - y).sum() * 0.5
        total_cost = total_cost + cost

        # Stand-in "gradients": Gumbel samples shaped like each weight matrix.
        gradient_weight_3 = np.random.gumbel(size=w3.shape)
        gradient_weight_2 = np.random.gumbel(size=w2.shape)
        gradient_weight_1 = np.random.gumbel(size=w1.shape)

        # Draw independent Gaussian noise for every entry. A single scalar
        # sample would be broadcast to all weights of all layers, shifting them
        # by the same amount instead of perturbing each component.
        noise_3 = np.random.normal(loc=0, scale=ADDITIVE_NOISE_STD, size=w3.shape)
        noise_2 = np.random.normal(loc=0, scale=ADDITIVE_NOISE_STD, size=w2.shape)
        noise_1 = np.random.normal(loc=0, scale=ADDITIVE_NOISE_STD, size=w1.shape)

        # Rebind (not in-place) so the shared initial weight arrays stay untouched.
        w3_noise_noise = w3_noise_noise - learn_rate * (gradient_weight_3 + noise_3)
        w2_noise_noise = w2_noise_noise - learn_rate * (gradient_weight_2 + noise_2)
        w1_noise_noise = w1_noise_noise - learn_rate * (gradient_weight_1 + noise_1)

    print("e:{:2d}. Noise + Gaussian Noise - Cost:{:1.3}".format(e + 1, total_cost))

    # Record this epoch's summed cost and reset the accumulator.
    cost_temp_array.append(total_cost)
    total_cost = 0

cost_array.append(cost_temp_array)

<h1 id="noiseadam" style="color:blue; border: 1px dotted green;"> 
    <center>Noise Adam
        <a class="anchor-link" href="#noiseadam" target="_self">¶</a>
    </center>
</h1>

In [None]:
# "Noise training" fed through Adam: random Gumbel draws take the place of the
# gradients, and Adam's bias-corrected moment estimates shape the step.
total_cost = 0
cost_temp_array = []
noise_adam_m1, noise_adam_m2, noise_adam_m3 = 0, 0, 0   # first-moment estimates per layer
noise_adam_v1, noise_adam_v2, noise_adam_v3 = 0, 0, 0   # second-moment estimates per layer
noise_Adam_Beta_1, noise_Adam_Beta_2 = 0.9, 0.999       # decay rates of the two moment estimates
noise_Adam_e = 0.00000001                               # epsilon added to the denominator
noise_Adam_t = 0                                        # number of update steps taken so far

for e in range(num_epoch):
    for i in range(len(X_train)):

        # Single-sample batch: X is a (1, n_features) row, y a (1, 1) target.
        X = np.expand_dims(X_train[i], axis=0)
        y = np.expand_dims(np.array([y_train[i]]), axis=1)

        # Forward pass: ELU -> tanh -> sigmoid (used only to report the cost).
        l1 = X.dot(w1_noise_adam)
        l1A = elu(l1)

        l2 = l1A.dot(w2_noise_adam)
        l2A = tanh(l2)

        l3 = l2A.dot(w3_noise_adam)
        l3A = log(l3)

        cost = np.square(l3A - y).sum() * 0.5
        total_cost = total_cost + cost

        # Stand-in "gradients": Gumbel samples shaped like each weight matrix.
        gradient_weight_3 = np.random.gumbel(size=w3.shape)
        gradient_weight_2 = np.random.gumbel(size=w2.shape)
        gradient_weight_1 = np.random.gumbel(size=w1.shape)

        noise_adam_m3 = noise_Adam_Beta_1 * noise_adam_m3 + (1 - noise_Adam_Beta_1) * gradient_weight_3
        noise_adam_m2 = noise_Adam_Beta_1 * noise_adam_m2 + (1 - noise_Adam_Beta_1) * gradient_weight_2
        noise_adam_m1 = noise_Adam_Beta_1 * noise_adam_m1 + (1 - noise_Adam_Beta_1) * gradient_weight_1

        noise_adam_v3 = noise_Adam_Beta_2 * noise_adam_v3 + (1 - noise_Adam_Beta_2) * gradient_weight_3 ** 2
        noise_adam_v2 = noise_Adam_Beta_2 * noise_adam_v2 + (1 - noise_Adam_Beta_2) * gradient_weight_2 ** 2
        noise_adam_v1 = noise_Adam_Beta_2 * noise_adam_v1 + (1 - noise_Adam_Beta_2) * gradient_weight_1 ** 2

        # Bias correction must use beta**t so it fades to 1 as training
        # progresses; dividing by the constant (1 - beta) would rescale every
        # step by a fixed factor forever.
        noise_Adam_t += 1
        m_correction = 1 - noise_Adam_Beta_1 ** noise_Adam_t
        v_correction = 1 - noise_Adam_Beta_2 ** noise_Adam_t

        noise_adam_m3_hat = noise_adam_m3 / m_correction
        noise_adam_m2_hat = noise_adam_m2 / m_correction
        noise_adam_m1_hat = noise_adam_m1 / m_correction

        noise_adam_v3_hat = noise_adam_v3 / v_correction
        noise_adam_v2_hat = noise_adam_v2 / v_correction
        noise_adam_v1_hat = noise_adam_v1 / v_correction

        # Rebind (not in-place) so the shared initial weight arrays stay untouched.
        w3_noise_adam = w3_noise_adam - (learn_rate / (np.sqrt(noise_adam_v3_hat) + noise_Adam_e)) * noise_adam_m3_hat
        w2_noise_adam = w2_noise_adam - (learn_rate / (np.sqrt(noise_adam_v2_hat) + noise_Adam_e)) * noise_adam_m2_hat
        w1_noise_adam = w1_noise_adam - (learn_rate / (np.sqrt(noise_adam_v1_hat) + noise_Adam_e)) * noise_adam_m1_hat

    print("e:{:2d}. Noise Adam - Cost:{:1.3}".format(e + 1, total_cost))

    # Record this epoch's summed cost and reset the accumulator.
    cost_temp_array.append(total_cost)
    total_cost = 0

cost_array.append(cost_temp_array)

<h1 id="analyze" style="color:blue; border: 1px dotted green;"> 
    <center>Analyze
        <a class="anchor-link" href="#analyze" target="_self">¶</a>
    </center>
</h1>

In [None]:
# One colour and one legend label per curve, in the order the optimizer cells
# append to cost_array. Adadelta is absent from labels_z because its cell does
# not append its curve.
bar_color = ['b', 'g', 'saddlebrown', 'steelblue',
            'orangered', 'y', 'paleturquoise', 'royalblue',
            'salmon','silver','skyblue','slateblue','peru','plum']
labels_z = ['SGD', 'Momentum', 'Nesterov', 'Adagrad', 'RMSprop',
            'Adam', 'Nadam', 'SGD with Gaussian Noise', 'Noise',
            'Noise + Gaussian Additive Noise', 'Noise Adam']

# More curves than labels means a training cell was run more than once (or an
# extra curve was appended); fail with an explanation instead of mislabeling.
if len(cost_array) > len(labels_z):
    raise ValueError(
        "cost_array holds {} curves but only {} labels are defined; "
        "restart the kernel and run all cells in order".format(len(cost_array), len(labels_z)))

# 1-based epoch numbers, matching the "e: N" lines printed during training.
epochs = np.arange(1, num_epoch + 1)

plt.figure(figsize=(12, 8))
for costs, color, label in zip(cost_array, bar_color, labels_z):
    plt.plot(epochs, costs, color=color, linewidth=3, label=label)
plt.title("Total Cost per Training")
plt.xlabel("Epoch")
plt.ylabel("Total cost (summed over training samples)")
plt.legend()
plt.show()

<h1 id="reference" style="color:blue; border: 1px dotted green;"> 
    <center>Reference
        <a class="anchor-link" href="#reference" target="_self">¶</a>
    </center>
</h1>

Full credit goes to Jae Duk Seo and his [Medium article](https://towardsdatascience.com/only-numpy-implementing-and-comparing-gradient-descent-optimization-algorithms-google-brains-8870b133102b).