# Update parameter over iterations

- Gradient descent is used to find parameter values that reach at least a local minimum of the cost function.

In [10]:
import import_ipynb  # Import modules from Jupyter notebook

In [11]:
from mlp_component import * #linear_forward ,.... linear_backward_model

# %% External modules
import numpy as np

In [12]:
# I need param , grad, lr
# Optimizer method (so generate v & s)
# Regularization (add weight decay then)

In [13]:
# Names exported by a star-import of this module.
# get_nesterov_momentum_v and get_nadam_v_s are not listed here.
__all__ = [
    'update_params',
    'initialize_v',
    'initialize_s',
    'bias_correction',
    'get_momentum_v',
    'get_rmsprop_s',
    'get_adagrad_s',
    'get_adam_v_s',
]

In [26]:
# TEST OPTIMIZER

def update_params(param,grads,lr,optimizer = None ,regularization=None, **kwargs):
    """Apply one plain gradient-descent step to every layer, in place.

    Arguments:
    1. param -- dict with keys "W1", "b1", ..., "WL", "bL"; the number of
       layers L is taken as len(param) // 2
    2. grads -- dict with the matching keys "dW1", "db1", ..., "dWL", "dbL"
    3. lr -- learning rate multiplied with every gradient
    4. optimizer -- accepted but not used yet; every call performs vanilla
       gradient descent
    5. regularization -- "L2" shrinks the weights after the gradient step;
       any other value leaves them untouched

    Keyword Argument
    ----------
    1. lambd -- regularization strength (required when regularization == "L2")
    2. m -- divisor applied to lambd (required when regularization == "L2");
       presumably the number of training examples -- confirm with the caller
    Other keyword arguments (e.g. beta1, beta2, epsilon) are accepted but
    not read by this function.

    Returns :
    param -- the same dict object that was passed in, updated in place

    Raises :
    ValueError -- regularization is "L2" but lambd or m was not supplied
    """
    # Validate up front (with a real exception, since `assert` is stripped
    # under `python -O`) so a bad call never leaves param half-updated.
    use_l2 = regularization == "L2"
    if use_l2:
        missing = {'lambd', 'm'} - set(kwargs)
        if missing:
            raise ValueError(
                "L2 regularization requires keyword argument(s): "
                + ", ".join(sorted(missing))
            )

    L = len(param) // 2

    # TODO: optimizer support -- substitute the momentum term v for grads and
    # lr / (np.sqrt(s) + epsilon) for lr once the optimizers are wired in.
    for l in range(1, L + 1):
        param["W" + str(l)] -= lr * grads["dW" + str(l)]
        param["b" + str(l)] -= lr * grads["db" + str(l)]

    # Add weight decay if regularization
    if use_l2:
        decay = kwargs["lambd"] / kwargs["m"]
        for l in range(1, L + 1):
            # NOTE(review): the decay is applied to the already-updated
            # weights and is not scaled by lr -- confirm this is intended.
            param["W" + str(l)] -= decay * param["W" + str(l)]

    return param

In [26]:
def update_params(param,grads,lr,optimizer = None ,regularization=None, **kwargs):
    """Apply one plain gradient-descent step to every layer, in place.

    Arguments:
    1. param -- dict with keys "W1", "b1", ..., "WL", "bL"; the number of
       layers L is taken as len(param) // 2
    2. grads -- dict with the matching keys "dW1", "db1", ..., "dWL", "dbL"
    3. lr -- learning rate multiplied with every gradient
    4. optimizer -- accepted but not used yet; every call performs vanilla
       gradient descent
    5. regularization -- "L2" shrinks the weights after the gradient step;
       any other value leaves them untouched

    Keyword Argument
    ----------
    1. lambd -- regularization strength (required when regularization == "L2")
    2. m -- divisor applied to lambd (required when regularization == "L2");
       presumably the number of training examples -- confirm with the caller
    Other keyword arguments (e.g. beta1, beta2, epsilon) are accepted but
    not read by this function.

    Returns :
    param -- the same dict object that was passed in, updated in place

    Raises :
    ValueError -- regularization is "L2" but lambd or m was not supplied
    """
    # Validate up front (with a real exception, since `assert` is stripped
    # under `python -O`) so a bad call never leaves param half-updated.
    use_l2 = regularization == "L2"
    if use_l2:
        missing = {'lambd', 'm'} - set(kwargs)
        if missing:
            raise ValueError(
                "L2 regularization requires keyword argument(s): "
                + ", ".join(sorted(missing))
            )

    L = len(param) // 2

    # TODO: optimizer support -- substitute the momentum term v for grads and
    # lr / (np.sqrt(s) + epsilon) for lr once the optimizers are wired in.
    for l in range(1, L + 1):
        param["W" + str(l)] -= lr * grads["dW" + str(l)]
        param["b" + str(l)] -= lr * grads["db" + str(l)]

    # Add weight decay if regularization
    if use_l2:
        decay = kwargs["lambd"] / kwargs["m"]
        for l in range(1, L + 1):
            # NOTE(review): the decay is applied to the already-updated
            # weights and is not scaled by lr -- confirm this is intended.
            param["W" + str(l)] -= decay * param["W" + str(l)]

    return param

In [15]:
def initialize_v(grads):
    """Build a zero-filled companion dict for a gradient dict.

    Arguments:
    1. grads : dict holding "dW<l>" and "db<l>" entries for l = 1..L, where
       L is taken as len(grads) // 2

    Returns :
    1. v : new dict with the same "dW<l>" / "db<l>" keys, each mapped to
       np.zeros_like of the corresponding gradient
    """
    n_layers = len(grads) // 2
    zeros = {}

    for idx in range(1, n_layers + 1):
        # Weight entry first, then bias, for each layer in turn.
        for prefix in ("dW", "db"):
            key = prefix + str(idx)
            zeros[key] = np.zeros_like(grads[key])

    return zeros

In [16]:
def initialize_s(grads):
    """Create the starting squared-gradient accumulator ``s``.

    Arguments:
    1. grads : grads dict, forwarded unchanged to initialize_v

    Returns :
    1. s : whatever initialize_v(grads) returns; s starts out exactly like v
    """
    s = initialize_v(grads)
    return s

In [17]:
def bias_correction(exp_avg,iteration,beta):
    """Divide every entry of an averaged-gradient dict by (1 - beta**iteration).

    Arguments:
    exp_avg --- dict with "dW<l>" / "db<l>" entries for l = 1..L, where L is
                taken as len(exp_avg) // 2 [v/s]
    iteration --- current iteration, used as the exponent of beta
    beta --- decay rate that produced exp_avg

    Returns:
    corrected --- new dict with the same keys holding the rescaled values;
                  exp_avg itself is not modified
    """
    n_layers = len(exp_avg) // 2
    corrected = {}

    for idx in range(1, n_layers + 1):
        for prefix in ("dW", "db"):
            key = prefix + str(idx)
            corrected[key] = exp_avg[key] / (1 - beta ** iteration)

    return corrected

## Optimizer with Momentum

<fieldset>
    
- Momentum ✔️
- Nesterov accelerated gradient (a.k.a. Nesterov momentum) ❌
    
</fieldset>

In [18]:
def get_momentum_v(v,grads,beta1,iteration):
    """Blend the previous average ``v`` with the current gradients.

    For each layer l = 1..L (L is taken as len(grads) // 2) and for both the
    "dW<l>" and "db<l>" entries, computes
    beta1 * v[key] + (1 - beta1) * grads[key].

    Arguments:
    1. v -- previous exponentially weighted average of the gradients
    2. grads -- current gradients
    3. beta1 -- decay rate of the average
    4. iteration -- current iteration; while it is <= 10 the result is
       passed through bias_correction before being returned

    Returns :
    1. v_ : new exponentially weighted average of gradient (a new dict; the
       inputs are left untouched)
    """
    n_layers = len(grads) // 2
    averaged = {}

    for idx in range(1, n_layers + 1):
        for prefix in ("dW", "db"):
            key = prefix + str(idx)
            averaged[key] = beta1 * v[key] + (1 - beta1) * grads[key]

    if iteration <= 10:
        ### Bias correction ###
        return bias_correction(averaged, iteration, beta1)

    return averaged

In [19]:
def get_nesterov_momentum_v(v,param):
    """Placeholder for the Nesterov look-ahead momentum update.

    Not implemented yet: both arguments are ignored and None is returned.
    """
    return None
    
    

## Optimizer with Adaptive learning method

<fieldset>
    
- Adagrad ✔️
- RMSProp ✔️ <br>
- AdaDelta ❌
    
</fieldset>

In [20]:
def get_adagrad_s(s,grads):
    """Accumulate squared gradients for Adagrad.

    For each layer l = 1..L (L is taken as len(grads) // 2) and for both the
    "dW<l>" and "db<l>" entries, computes s[key] + grads[key] ** 2.

    Arguments:
    1. s -- running sum of squared gradients from the previous iteration
       (e.g. the zero dict built by initialize_s on the first one)
    2. grads -- current gradients

    Returns :
    1. s_ : new running sum of (gradient)^2, as a new dict; s is not modified
    """
    L = len(grads) // 2 # number of layers in the neural networks
    s_ = {}

    for l in range(1, L + 1):
        for prefix in ("dW", "db"):
            key = prefix + str(l)
            # Start from the previous accumulator: s_ is empty, so an
            # in-place "+=" on it would raise KeyError. Adagrad sums the
            # *squared* gradients, not the raw ones.
            s_[key] = s[key] + grads[key] ** 2

    return s_

In [21]:
def get_rmsprop_s(s,grads,beta2,iteration):
    """Blend the previous average ``s`` with the current squared gradients.

    For each layer l = 1..L (L is taken as len(grads) // 2) and for both the
    "dW<l>" and "db<l>" entries, computes
    beta2 * s[key] + (1 - beta2) * grads[key] ** 2.

    Arguments:
    1. s -- previous exponentially weighted average of (gradient)^2
    2. grads -- current gradients
    3. beta2 -- decay rate of the average
    4. iteration --- current iteration; while it is <= 10 the result is
       passed through bias_correction before being returned

    Returns :
    1. s_ : new exponentially weighted average of (gradient)^2 (a new dict;
       the inputs are left untouched)
    """
    n_layers = len(grads) // 2
    averaged_sq = {}

    for idx in range(1, n_layers + 1):
        for prefix in ("dW", "db"):
            key = prefix + str(idx)
            averaged_sq[key] = beta2 * s[key] + (1 - beta2) * (grads[key]) ** 2

    if iteration <= 10:
        ### Bias correction ###
        return bias_correction(averaged_sq, iteration, beta2)

    return averaged_sq

## Optimizer with Momentum & Adaptive learning method

<fieldset>
    
- Adam ✔️
- Nadam ❌<br>
    
</fieldset>

In [22]:
def get_adam_v_s(v,beta1,s,beta2,iteration,grads=None):
    """Obtain v and s of Adam

    Combines the momentum average (get_momentum_v) with the squared-gradient
    average (get_rmsprop_s), both computed from the same gradients.

    Arguments:
    1. v -- previous exponentially weighted average of gradient
    2. beta1 -- decay rate forwarded to get_momentum_v
    3. s -- previous exponentially weighted average of (gradient)^2
    4. beta2 -- decay rate forwarded to get_rmsprop_s
    5. iteration --- current iteration for bias correction
    6. grads -- current gradients dict. It is last and defaults to None only
       so the earlier positional parameters keep their order; it must
       always be supplied.

    Returns :
    1. v_ : new exponentially weighted average of gradient
    2. s_ : new exponentially weighted average of (gradient)^2

    Raises :
    ValueError -- grads was not supplied
    """
    # The body previously read an undefined name `grads`, which raised
    # NameError on every call; the gradients are now an explicit argument.
    if grads is None:
        raise ValueError("get_adam_v_s requires the current gradients via 'grads'")

    v_ = get_momentum_v(v, grads, beta1, iteration)
    s_ = get_rmsprop_s(s, grads, beta2, iteration)
    return v_, s_

In [23]:
def get_nadam_v_s(v,beta1,s,beta2,iteration,grads=None):
    """Obtain v and s of Nadam -- not implemented yet.

    The intended composition is a Nesterov momentum term for v and the
    get_rmsprop_s average for s. It cannot run today: the body read an
    undefined name `grads`, and get_nesterov_momentum_v is a two-argument
    stub that computes nothing.

    Arguments:
    1. v -- previous exponentially weighted average of gradient
    2. beta1 -- decay rate for v
    3. s -- previous exponentially weighted average of (gradient)^2
    4. beta2 -- decay rate for s
    5. iteration -- current iteration for bias correction
    6. grads -- current gradients dict; accepted so callers can already pass
       what the eventual implementation will need

    Raises :
    NotImplementedError -- always, until Nesterov momentum is implemented
    """
    # TODO: once get_nesterov_momentum_v is implemented, return
    # (nesterov v, get_rmsprop_s(s, grads, beta2, iteration)).
    raise NotImplementedError(
        "Nadam is not available: Nesterov momentum is not implemented yet"
    )

In [25]:

dir()  

['In',
 'L_model_backward',
 'L_model_forward',
 'Out',
 '_',
 '_24',
 '_3',
 '_4',
 '__',
 '___',
 '__all__',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dh',
 '_i',
 '_i1',
 '_i10',
 '_i11',
 '_i12',
 '_i13',
 '_i14',
 '_i15',
 '_i16',
 '_i17',
 '_i18',
 '_i19',
 '_i2',
 '_i20',
 '_i21',
 '_i22',
 '_i23',
 '_i24',
 '_i25',
 '_i3',
 '_i4',
 '_i5',
 '_i6',
 '_i7',
 '_i8',
 '_i9',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 'bias_correction',
 'exit',
 'get_adagrad_s',
 'get_adam_v_s',
 'get_ipython',
 'get_momentum_v',
 'get_nadam_v_s',
 'get_nesterov_momentum_v',
 'get_rmsprop_s',
 'import_ipynb',
 'initialize_s',
 'initialize_v',
 'linear_activation_backward',
 'linear_activation_forward',
 'linear_backward',
 'linear_forward',
 'np',
 'quit',
 'statistics',
 'update_params']