In [31]:
import numpy as np
import keras
from keras.datasets import fashion_mnist
from matplotlib import pyplot as plt
import math

In [32]:
def sigmoid(a):
    """Element-wise logistic sigmoid, 1 / (1 + exp(-a)).

    The value is evaluated as exp(-log(1 + exp(-a))) via ``np.logaddexp`` so
    that large negative inputs do not overflow inside ``np.exp`` (the naive
    form emits overflow RuntimeWarnings for such inputs).

    Parameters
    ----------
    a : scalar or np.ndarray
        Pre-activation value(s).

    Returns
    -------
    Same shape as ``a``; values in [0, 1].
    """
    # log(1 + exp(-a)) computed without overflow, then mapped back with exp.
    s = np.exp(-np.logaddexp(0, -a))
    return s

def derivative_sigmoid(a):
    """Derivative of the logistic sigmoid at ``a``: sigmoid(a) * (1 - sigmoid(a))."""
    activation = sigmoid(a)
    return activation * (1 - activation)

def tanh(a):
    """Element-wise hyperbolic tangent.

    Delegates to ``np.tanh``. The explicit (e^a - e^-a) / (e^a + e^-a) form
    evaluates to inf / inf = NaN once |a| is large enough for ``np.exp`` to
    overflow, whereas ``np.tanh`` saturates cleanly at -1 / +1.

    Parameters
    ----------
    a : scalar or np.ndarray
        Pre-activation value(s).

    Returns
    -------
    Same shape as ``a``; values in [-1, 1].
    """
    t = np.tanh(a)
    return t

def derivative_tanh(a):
    """Derivative of tanh at ``a``: 1 - tanh(a)**2.

    Uses ``np.tanh`` so the result stays finite (0.0) for large |a| instead of
    turning into NaN when an exponential-based tanh overflows.

    Parameters
    ----------
    a : scalar or np.ndarray
        Pre-activation value(s).

    Returns
    -------
    Same shape as ``a``; values in [0, 1].
    """
    dt = 1 - np.tanh(a)**2
    return dt


def softmax(a):
    """Column-wise softmax (normalises along axis 0).

    The per-column maximum is subtracted before exponentiating. This leaves
    the result mathematically unchanged but prevents ``np.exp`` from
    overflowing, which would otherwise yield inf / inf = NaN for large logits.

    Parameters
    ----------
    a : np.ndarray
        Logits; entries along axis 0 are normalised together.

    Returns
    -------
    np.ndarray of the same shape whose entries along axis 0 sum to 1.
    """
    shifted = a - np.max(a, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)  # computed once and reused for the normaliser
    return exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

In [33]:
# Human-readable name for each class index 0-9.
class_names = [
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
]

In [34]:
# Rebinds the name `fashion_mnist` (also imported from keras.datasets in the
# first cell) to the keras dataset module, then loads its data.
fashion_mnist = keras.datasets.fashion_mnist
# load_data() is unpacked as two (x, y) pairs: a training pair and a test pair.
# Presumably x are images and y integer class labels — confirm against keras docs.
(x_train_orig, y_train_orig), (x_test_orig, y_test_orig) = fashion_mnist.load_data()

In [35]:
def initialize_parameters(layer_dimensions):
    """Build the weight/bias dictionary for a fully connected network.

    For every layer k >= 1, ``w<k>`` is drawn from a standard normal with shape
    (layer_dimensions[k], layer_dimensions[k-1]) and ``b<k>`` is a zero column
    of shape (layer_dimensions[k], 1). No random seed is set here.
    """
    parameters = {}
    layer_pairs = zip(layer_dimensions[:-1], layer_dimensions[1:])

    for k, (n_in, n_out) in enumerate(layer_pairs, start=1):
        parameters['w%d' % k] = np.random.randn(n_out, n_in)
        parameters['b%d' % k] = np.zeros((n_out, 1))

    return parameters

In [36]:
def initialize_update(layer_dimensions):
    """Create zero-filled ``update_w<k>`` / ``update_b<k>`` buffers.

    Each buffer has the same shape as the corresponding weight matrix
    (layer_dimensions[k], layer_dimensions[k-1]) or bias column
    (layer_dimensions[k], 1), for k = 1 .. len(layer_dimensions) - 1.
    """
    update = {}

    for k in range(1, len(layer_dimensions)):
        rows, cols = layer_dimensions[k], layer_dimensions[k-1]
        update['update_w%d' % k] = np.zeros((rows, cols))
        update['update_b%d' % k] = np.zeros((rows, 1))

    return update

In [37]:
def initialize_velocity(layer_dimensions):
    """Create zero-filled ``v_w<k>`` / ``v_b<k>`` accumulators, one pair per layer.

    Shapes mirror the weights (layer_dimensions[k], layer_dimensions[k-1]) and
    biases (layer_dimensions[k], 1).
    """
    velocity = {}
    previous_size = layer_dimensions[0] if len(layer_dimensions) > 0 else None

    for index, size in enumerate(layer_dimensions[1:], start=1):
        layer = str(index)
        velocity['v_w' + layer] = np.zeros((size, previous_size))
        velocity['v_b' + layer] = np.zeros((size, 1))
        previous_size = size

    return velocity

In [38]:
def initialize_moment(layer_dimensions):
    """Create zero-filled ``m_w<k>`` / ``m_b<k>`` accumulators, one pair per layer.

    ``m_w<k>`` has shape (layer_dimensions[k], layer_dimensions[k-1]) and
    ``m_b<k>`` has shape (layer_dimensions[k], 1).
    """
    moment = {}
    layer_count = len(layer_dimensions)
    k = 1

    while k < layer_count:
        out_size = layer_dimensions[k]
        in_size = layer_dimensions[k-1]
        moment['m_w' + str(k)] = np.zeros((out_size, in_size))
        moment['m_b' + str(k)] = np.zeros((out_size, 1))
        k += 1

    return moment

In [39]:
def agrregation_forward(h, w, b):
    """Affine step of one layer: returns ``w . h + b`` plus the inputs.

    The second return value is the tuple ``(h, w, b)``, kept so the backward
    pass can reuse the exact inputs of this layer.
    """
    return np.dot(w, h) + b, (h, w, b)

In [40]:
def activation_forward(h_prev, w, b, activation):
    """Forward step of one layer: affine transform followed by an activation.

    Parameters
    ----------
    h_prev : np.ndarray
        Output of the previous layer (or the network input).
    w, b : np.ndarray
        Weight matrix and bias column of this layer.
    activation : str
        One of "sigmoid", "tanh" or "softmax".

    Returns
    -------
    h : np.ndarray
        Activated output of the layer.
    temp : tuple
        ``((h_prev, w, b), a)`` where ``a`` is the pre-activation; stored for
        the backward pass.

    Raises
    ------
    ValueError
        If ``activation`` is not a supported name. (Previously this case fell
        through every branch and failed with an UnboundLocalError.)
    """
    # Validate first so a typo in the activation name fails with a clear message.
    if activation not in ("sigmoid", "tanh", "softmax"):
        raise ValueError(
            "Unsupported activation %r; expected 'sigmoid', 'tanh' or 'softmax'" % (activation,)
        )

    # The affine part is identical for every activation, so do it once.
    a, linear_temp = agrregation_forward(h_prev, w, b)

    if activation == "sigmoid":
        h = sigmoid(a)
    elif activation == "tanh":
        h = tanh(a)
    else:
        h = softmax(a)

    temp = (linear_temp, a)

    return h, temp

In [41]:
def forward_pass(x, parameters):
    """Run the network forward: sigmoid on every hidden layer, softmax on the last.

    ``parameters`` holds ``w<k>`` / ``b<k>`` for k = 1..L, so L is half the
    number of entries. Returns the output-layer activations together with a
    list of the per-layer caches produced by ``activation_forward`` (in layer
    order), which the backward pass consumes.
    """
    num_layers = len(parameters) // 2
    temps = []
    h = x

    # Hidden layers 1 .. L-1.
    for layer in range(1, num_layers):
        key = str(layer)
        h, layer_temp = activation_forward(h, parameters['w' + key], parameters['b' + key], activation="sigmoid")
        temps.append(layer_temp)

    # Output layer L.
    out_key = str(num_layers)
    hL, out_temp = activation_forward(h, parameters['w' + out_key], parameters['b' + out_key], activation="softmax")
    temps.append(out_temp)

    return hL, temps

In [42]:
def cost_function(yhat, y):
    """Mean cross-entropy between predictions and targets.

    Computes ``-(1/m) * sum(y * log(yhat))`` where m = y.shape[1] is the number
    of examples (one example per column).

    The logarithm is only evaluated where ``y`` is non-zero. Entries with
    y == 0 contribute exactly 0, so a prediction of exactly 0 for a non-target
    class no longer produces ``0 * log(0) = NaN``. If the prediction for a
    target class is exactly 0 the cost is still (correctly) infinite.

    Parameters
    ----------
    yhat : np.ndarray
        Predicted values, same shape as ``y``.
    y : np.ndarray
        Target values (e.g. one-hot columns).

    Returns
    -------
    float
        The averaged cost.
    """
    m = y.shape[1] # no. of examples

    yhat = np.asarray(yhat, dtype=float)
    log_yhat = np.zeros_like(yhat)
    # Positions where y == 0 keep the initial 0 and never touch log(yhat).
    np.log(yhat, out=log_yhat, where=(y != 0))

    product_sum = np.sum((y * log_yhat), axis = 0)
    cost = -1/m*np.sum(product_sum)

    return cost

In [43]:
def agrregation_backward(dL_da, temp):
    """Backward step through the affine part of one layer.

    ``temp`` is the ``(h_prev, w, b)`` tuple saved by the forward step and
    ``dL_da`` the gradient with respect to this layer's pre-activation.
    Returns the gradients with respect to ``h_prev``, ``w`` and ``b``; the
    weight and bias gradients are averaged over the h_prev.shape[1] examples.
    """
    h_prev, w, _ = temp
    inv_m = 1/h_prev.shape[1]

    grad_w = inv_m*np.dot(dL_da, h_prev.T)
    grad_b = inv_m*np.sum(dL_da, axis=1, keepdims=True)
    grad_h_prev = np.dot(w.T, dL_da)

    return grad_h_prev, grad_w, grad_b

In [44]:
def activation_backward(dL_dh, temp, activation):
    """Backward step through one hidden layer (activation, then affine part).

    Parameters
    ----------
    dL_dh : np.ndarray
        Gradient of the loss with respect to this layer's activated output.
    temp : tuple
        ``(linear_temp, a)`` as stored by ``activation_forward``: the affine
        inputs ``(h_prev, w, b)`` and the pre-activation ``a``.
    activation : str
        "sigmoid" or "tanh".

    Returns
    -------
    (dL_dh_prev, dL_dw, dL_db)
        Gradients with respect to the previous layer's output, the weights and
        the biases, as returned by ``agrregation_backward``.

    Raises
    ------
    ValueError
        If ``activation`` is not supported. (Previously this case fell through
        both branches and failed with an UnboundLocalError.)
    """
    if activation not in ("sigmoid", "tanh"):
        raise ValueError(
            "Unsupported activation %r; expected 'sigmoid' or 'tanh'" % (activation,)
        )

    linear_temp, a = temp

    # Chain rule: dL/da = dL/dh * h'(a).
    if activation == "sigmoid":
        local_grad = derivative_sigmoid(a)
    else:
        local_grad = derivative_tanh(a)
    dL_da = dL_dh * local_grad

    dL_dh_prev, dL_dw, dL_db = agrregation_backward(dL_da, linear_temp)

    return dL_dh_prev, dL_dw, dL_db

In [45]:
def backward_pass(yhat, y, temps):
    """Back-propagate through the whole network.

    The output layer is treated as softmax combined with cross-entropy, whose
    gradient with respect to the output pre-activation is ``yhat - y``. Hidden
    layers are back-propagated with ``activation_backward`` using "sigmoid",
    matching ``forward_pass``.

    Parameters
    ----------
    yhat : np.ndarray
        Network output, one example per column.
    y : np.ndarray
        Targets (one-hot columns), same shape as ``yhat``.
    temps : list
        Per-layer caches ``((h_prev, w, b), a)`` in layer order, as returned by
        ``forward_pass``.

    Returns
    -------
    dict
        ``dL_dw<k>`` and ``dL_db<k>`` for k = 1..L and ``dL_dh<k>`` for
        k = 0..L-1, where L = len(temps). Weight and bias gradients are
        averaged over the examples.
    """
    grads = {}
    L = len(temps) # the number of layers

    # Gradient w.r.t. the output pre-activation. The unused -(1/yhat)*y term of
    # the original was dropped: it divided by zero whenever yhat contained 0.
    dL_daL = yhat - y

    # Output layer is handled inline because it has no separate activation step.
    (hL_prev, wL, _), _ = temps[L-1]
    m = hL_prev.shape[1]

    grads["dL_dh" + str(L-1)] = np.dot(wL.T, dL_daL)
    grads["dL_dw" + str(L)] = 1/m*np.dot(dL_daL, hL_prev.T)
    grads["dL_db" + str(L)] = 1/m*np.sum(dL_daL, axis=1, keepdims=True)

    # Hidden layers, from index L-2 down to 0 (layer numbers L-1 .. 1).
    for l in reversed(range(L-1)):
        dL_dh_prev, dL_dw, dL_db = activation_backward(grads["dL_dh" + str(l+1)], temps[l], "sigmoid")
        grads["dL_dh" + str(l)] = dL_dh_prev
        grads["dL_dw" + str(l + 1)] = dL_dw
        grads["dL_db" + str(l + 1)] = dL_db

    return grads

In [46]:
def parameter_update_vanilla(parameters, grads,learning_rate):
    """Plain gradient-descent step: ``p <- p - learning_rate * dL/dp`` for every layer.

    The entries of ``parameters`` are rebound to new arrays and the same
    dictionary object is returned.
    """
    num_layers = len(parameters) // 2

    for layer in range(1, num_layers + 1):
        for kind in ("w", "b"):
            key = kind + str(layer)
            step = learning_rate*grads["dL_d" + key]
            parameters[key] = parameters[key]- step

    return parameters

In [47]:
def parameter_update_momentum(parameters, grads, update, learning_rate ,gamma):
    """Momentum step.

    For each layer the running update becomes
    ``gamma * update + learning_rate * grad`` and is then subtracted from the
    parameter. Both dictionaries are modified and returned as
    ``(parameters, update)``.
    """
    for layer in range(1, len(parameters) // 2 + 1):
        suffix = str(layer)
        w_key, b_key = "w" + suffix, "b" + suffix
        uw_key, ub_key = "update_" + w_key, "update_" + b_key

        # Refresh both running updates before touching the parameters.
        update[uw_key] = gamma*update[uw_key] + learning_rate*grads["dL_d" + w_key]
        update[ub_key] = gamma*update[ub_key] + learning_rate*grads["dL_d" + b_key]

        parameters[w_key] = parameters[w_key]-update[uw_key]
        parameters[b_key] = parameters[b_key]- update[ub_key]

    return parameters, update

In [48]:
def parameter_update_RMSProp(parameters, grads, velocity, learning_rate ,beta,eps):
    """RMSProp step.

    ``velocity`` keeps an exponential moving average of the squared gradients
    (decay ``beta``); each parameter moves by
    ``learning_rate / sqrt(velocity + eps) * grad``. Returns
    ``(parameters, velocity)``, both modified.
    """
    num_layers = len(parameters) // 2

    for layer in range(1, num_layers + 1):
        for kind in ("w", "b"):
            p_key = kind + str(layer)
            v_key = "v_" + p_key
            grad = grads["dL_d" + p_key]

            velocity[v_key] = beta*velocity[v_key] + (1-beta)*grad**2
            parameters[p_key] = parameters[p_key]- (learning_rate / np.sqrt(velocity[v_key]+eps))*grad

    return parameters, velocity

In [105]:
def parameter_update_adam(parameters, grads, velocity, moment,learning_rate ,beta1,beta2,eps,epoch):
    """Adam step with bias-corrected first and second moments.

    For every layer:
        m <- beta1 * m + (1 - beta1) * grad
        v <- beta2 * v + (1 - beta2) * grad**2
        m_hat = m / (1 - beta1**t),  v_hat = v / (1 - beta2**t),  t = epoch + 1
        p <- p - learning_rate / sqrt(v_hat + eps) * m_hat

    Fix: the update now divides by the bias-corrected ``v_hat``. The original
    computed ``v_hat`` but used the raw ``v``, which inflates early steps by
    roughly 1 / sqrt(1 - beta2**t).

    Parameters
    ----------
    parameters, grads : dict
        ``w<k>`` / ``b<k>`` and ``dL_dw<k>`` / ``dL_db<k>`` per layer.
    velocity, moment : dict
        Running second (``v_*``) and first (``m_*``) moment estimates.
    learning_rate, beta1, beta2, eps : float
        Optimiser hyper-parameters; ``eps`` is added inside the square root,
        matching the RMSProp update in this notebook.
    epoch : int
        Zero-based index used for bias correction (the exponent is epoch + 1).

    Returns
    -------
    (parameters, velocity, moment), all modified.
    """
    L = len(parameters) // 2 # number of layers in the neural network

    # Bias-correction denominators are the same for every layer.
    t = epoch + 1
    m_correction = 1 - beta1**t
    v_correction = 1 - beta2**t

    for l in range(1, L + 1):
        for kind in ("w", "b"):
            key = kind + str(l)
            grad = grads["dL_d" + key]

            moment["m_" + key] = beta1*moment["m_" + key] + (1-beta1)*grad
            velocity["v_" + key] = beta2*velocity["v_" + key] + (1-beta2)*grad**2

            m_hat = moment["m_" + key]/m_correction
            v_hat = velocity["v_" + key]/v_correction

            parameters[key] = parameters[key] - (learning_rate / np.sqrt(v_hat+eps))*m_hat

    return parameters, velocity, moment

In [106]:
def parameter_update_nadam(parameters, grads, velocity, moment,learning_rate ,beta1,beta2,eps,epoch):
    """Nadam step (Adam with a Nesterov-style look-ahead on the first moment).

    For every layer, with t = epoch + 1:
        m <- beta1 * m + (1 - beta1) * grad
        v <- beta2 * v + (1 - beta2) * grad**2
        m_hat = m / (1 - beta1**t),  v_hat = v / (1 - beta2**t)
        step  = beta1 * m_hat + (1 - beta1) * grad / (1 - beta1**t)
        p <- p - learning_rate / sqrt(v_hat + eps) * step

    Fix: the update now divides by the bias-corrected ``v_hat``. The original
    computed ``v_hat`` but used the raw ``v``.

    Parameters
    ----------
    parameters, grads : dict
        ``w<k>`` / ``b<k>`` and ``dL_dw<k>`` / ``dL_db<k>`` per layer.
    velocity, moment : dict
        Running second (``v_*``) and first (``m_*``) moment estimates.
    learning_rate, beta1, beta2, eps : float
        Optimiser hyper-parameters; ``eps`` is added inside the square root.
    epoch : int
        Zero-based index used for bias correction (the exponent is epoch + 1).

    Returns
    -------
    (parameters, velocity, moment), all modified.
    """
    L = len(parameters) // 2 # number of layers in the neural network

    # Bias-correction denominators are the same for every layer.
    t = epoch + 1
    m_correction = 1 - beta1**t
    v_correction = 1 - beta2**t

    for l in range(1, L + 1):
        for kind in ("w", "b"):
            key = kind + str(l)
            grad = grads["dL_d" + key]

            moment["m_" + key] = beta1*moment["m_" + key] + (1-beta1)*grad
            velocity["v_" + key] = beta2*velocity["v_" + key] + (1-beta2)*grad**2

            m_hat = moment["m_" + key]/m_correction
            v_hat = velocity["v_" + key]/v_correction

            # Look-ahead: blend the corrected momentum with the current gradient.
            nadam_update = (beta1*m_hat) + (((1-beta1)*grad)/ m_correction)

            parameters[key] = parameters[key] - (learning_rate / np.sqrt(v_hat+eps))*nadam_update

    return parameters, velocity, moment

In [83]:
# Scratch check: math.pow returns a float (3**2 -> 9.0).
math.pow(3,2)

9.0

In [84]:
def find_lookahead_parameters(parameters,update,gamma):
    """Return a new dict of look-ahead parameters ``p - gamma * update``.

    One ``w<k>`` / ``b<k>`` pair is produced per layer; the input dictionaries
    are left untouched.
    """
    num_layers = len(parameters) // 2
    lookahead_parameters = {}

    for layer in range(1, num_layers + 1):
        for kind in ("w", "b"):
            key = kind + str(layer)
            lookahead_parameters[key] = parameters[key]-gamma*update["update_" + key]

    return lookahead_parameters

In [85]:
def predict(x, y, parameters):
    """Print the classification accuracy of the network on ``(x, y)``.

    Parameters
    ----------
    x : np.ndarray
        Inputs with one example per row; transposed here because
        ``forward_pass`` is fed one example per column.
    y : np.ndarray
        One-hot targets with one example per row.
    parameters : dict
        Network weights and biases as used by ``forward_pass``.

    Returns
    -------
    None. The accuracy is only printed.
    """
    x =x.T
    y =y.T

    m = x.shape[1]  # number of examples

    # Forward propagation; the per-layer caches are not needed for prediction.
    prob, _ = forward_pass(x, parameters)

    # Class with the highest score per column vs. position of the 1 in the target.
    predicted_label = np.argmax(prob, axis=0)
    true_label = np.argmax(y, axis=0)

    Accuracy = np.sum(predicted_label == true_label)/m

    print("Accuracy: "  + str(Accuracy))

    return

In [86]:
# index = 15
# plt.imshow(x_train_orig[index])
# print ("y = " + str(y_train_orig[index])+ ". It's a " + class_names[y_train_orig[index]] +  " picture.")

In [87]:
# Toy data set for exercising the network: 3 examples (rows) x 4 features.
x_train = np.array([[1,2,3,4],[3,4,5,6],[5,6,7,8]])
# One-hot target row per example (3 examples x 3 classes).
y_train = np.array([[0,0,1],[1,0,0],[0,1,0]])
y_train.shape

(3, 3)

In [88]:
no_hidden_layers = 1 # no of hidden layers
no_neuron_hidden = 4 # no. of neurons in each hidden layers
no_neuron_output = 3 # no. of neurons in the output layer

In [89]:
# (number of examples, number of features) of the toy inputs.
x_train.shape

(3, 4)

In [145]:
# Layer sizes derived from the current x_train (rows = examples, columns = features).
no_of_training_examples = np.shape(x_train)[0]
#no_of_testing_examples = np.shape(x_test)[0]
# Sizes are kept as lists so they can be concatenated into layer_dimensions.
size_input_layer = [x_train.shape[1]]
no_hidden_layers = 2
size_hidden_layers = [4,3]
# NOTE(review): the toy y_train defined above has 3 columns, but the output layer
# is sized for 10 classes; training on the toy data with this setting would
# presumably fail on a shape mismatch — confirm which data set is intended.
size_output_layer = [10]


In [146]:
def one_hot_vector_form(labels,size_output_layer):
    """Convert integer class labels into a one-hot matrix (one row per example).

    Parameters
    ----------
    labels : np.ndarray of integers
        Class indices; the first axis indexes the examples. Shapes (n,) and
        (n, 1) are both accepted.
    size_output_layer : int
        Number of classes, i.e. number of columns in the result.

    Returns
    -------
    np.ndarray of shape (n, size_output_layer)
        Zeros everywhere except a 1 at ``[i, labels[i]]``. For n == 0 an empty
        (0, size_output_layer) array is returned (the original raised
        UnboundLocalError because its result was only bound inside the loop).
    """
    no_of_examples = labels.shape[0]
    one_hot_vector = np.zeros((no_of_examples , size_output_layer))

    # Row indices shaped to broadcast against `labels`, so every label of
    # example i marks an entry in row i regardless of trailing axes.
    rows = np.arange(no_of_examples).reshape((no_of_examples,) + (1,) * (labels.ndim - 1))
    one_hot_vector[rows, labels] = 1

    return one_hot_vector

In [147]:
# Full list of layer sizes: input, hidden layers, output (list concatenation).
layer_dimensions = size_input_layer + size_hidden_layers + size_output_layer
layer_dimensions

[4, 4, 3, 10]

In [148]:
# Sanity check: zero-initialised accumulators, one v_w/v_b pair per layer.
velo = initialize_velocity(layer_dimensions)
velo

{'v_w1': array([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]),
 'v_b1': array([[0.],
        [0.],
        [0.],
        [0.]]),
 'v_w2': array([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]),
 'v_b2': array([[0.],
        [0.],
        [0.]]),
 'v_w3': array([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]),
 'v_b3': array([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])}

In [94]:
# Random weights / zero biases for the current layer_dimensions.
# No seed is set, so the values differ between runs.
parameters = initialize_parameters(layer_dimensions)
parameters

{'w1': array([[ 0.605113  , -1.22244522,  0.57228786, -0.24796819],
        [-0.68379726,  0.04230595,  0.05292313,  0.32519867],
        [-2.40600739,  1.01802245, -0.75336393, -0.91916087],
        [-1.70283982, -0.14775631, -0.9135634 , -1.61842618]]),
 'b1': array([[0.],
        [0.],
        [0.],
        [0.]]),
 'w2': array([[-1.27372657, -0.09783771, -1.00431167,  2.03344773],
        [ 0.43222517, -0.88064236, -0.12510805,  1.43812515],
        [-0.71213434, -0.69159649, -1.72816224,  0.43877603]]),
 'b2': array([[0.],
        [0.],
        [0.]])}

In [95]:
# Scratch check of element-wise scalar arithmetic on a weight matrix (4 * w1).
2*(parameters["w1"]*2)

array([[ 2.42045201, -4.88978086,  2.28915144, -0.99187276],
       [-2.73518904,  0.16922379,  0.21169254,  1.30079467],
       [-9.62402954,  4.0720898 , -3.0134557 , -3.67664349],
       [-6.81135927, -0.59102525, -3.6542536 , -6.47370471]])

In [96]:
# Display the toy inputs.
x_train

array([[1, 2, 3, 4],
       [3, 4, 5, 6],
       [5, 6, 7, 8]])

In [97]:
# Scratch check: np.sqrt is applied element-wise.
np.sqrt(x_train+1)

array([[1.41421356, 1.73205081, 2.        , 2.23606798],
       [2.        , 2.23606798, 2.44948974, 2.64575131],
       [2.44948974, 2.64575131, 2.82842712, 3.        ]])

In [98]:
# Scratch check: element-wise 2 / sqrt(x_train + 1).
sd =2/np.sqrt(x_train+1)
sd

array([[1.41421356, 1.15470054, 1.        , 0.89442719],
       [1.        , 0.89442719, 0.81649658, 0.75592895],
       [0.81649658, 0.75592895, 0.70710678, 0.66666667]])

In [99]:
# Scratch check: element-wise product of two equally shaped arrays.
sd*x_train

array([[1.41421356, 2.30940108, 3.        , 3.57770876],
       [3.        , 3.57770876, 4.0824829 , 4.53557368],
       [4.0824829 , 4.53557368, 4.94974747, 5.33333333]])

In [100]:
# Forward pass on the toy inputs; transposed so each column is one example.
# Displays both the output activations and the full list of per-layer caches.
forward_pass(x_train.T, parameters)

(array([[0.37929966, 0.38516438, 0.38066278],
        [0.33400188, 0.31763605, 0.31278171],
        [0.28669846, 0.29719957, 0.30655551]]),
 [((array([[1, 3, 5],
           [2, 4, 6],
           [3, 5, 7],
           [4, 6, 8]]),
    array([[ 0.605113  , -1.22244522,  0.57228786, -0.24796819],
           [-0.68379726,  0.04230595,  0.05292313,  0.32519867],
           [-2.40600739,  1.01802245, -0.75336393, -0.91916087],
           [-1.70283982, -0.14775631, -0.9135634 , -1.61842618]]),
    array([[0.],
           [0.],
           [0.],
           [0.]])),
   array([[ -1.11478661,  -1.7008117 ,  -2.28683678],
          [  0.86037871,   0.33363968,  -0.19309934],
          [ -6.30669775, -12.42771723, -18.5487367 ],
          [-11.21274735, -19.97791877, -28.74309019]])),
  ((array([[2.46979594e-01, 1.54359283e-01, 9.22190162e-02],
           [7.02739771e-01, 5.82644703e-01, 4.51874610e-01],
           [1.82072567e-03, 4.00598433e-06, 8.79804143e-09],
           [1.35008103e-05, 2.10717

In [101]:
# Accuracy on the toy data with the freshly initialised parameters.
predict(x_train, y_train, parameters)

Accuracy: 0.3333333333333333


In [123]:
def L_layer_network(x_train, y_train,layer_dimensions,learning_rate,num_epochs,gamma = 0.9,  
                    batch_type ="Full_batch",batchsize = 2,grad_descent_type = "Vanilla", beta_rms = 0.9, 
                    beta1 = 0.9,beta2 = 0.9,eps = 1e-8, print_cost=False):
    """Train a fully connected network (sigmoid hidden layers, softmax output).

    Parameters
    ----------
    x_train : np.ndarray
        Inputs, one example per row.
    y_train : np.ndarray
        One-hot targets, one example per row.
    layer_dimensions : list of int
        Layer sizes from input to output.
    learning_rate : float
        Step size passed to the selected optimiser.
    num_epochs : int
        Number of passes over the training data.
    gamma : float
        Momentum coefficient ("Momentum" and "NAG").
    batch_type : str
        "SGD" forces a batch size of 1, "Full_batch" uses all examples; any
        other value (e.g. "Mini_batch") keeps the supplied ``batchsize``.
    batchsize : int
        Mini-batch size when ``batch_type`` does not override it.
    grad_descent_type : str
        "Vanilla", "Momentum", "NAG", "RMSProp", "Adam" or "Nadam".
    beta_rms : float
        Decay rate of the squared-gradient average for "RMSProp".
    beta1, beta2 : float
        Decay rates of the first / second moment for "Adam" and "Nadam".
    eps : float
        Stabiliser used by "RMSProp", "Adam" and "Nadam".
    print_cost : bool
        When True, print the cost of the last processed batch every 100th
        epoch (starting with epoch 0).

    Returns
    -------
    dict
        The trained ``w<k>`` / ``b<k>`` parameters.

    Raises
    ------
    ValueError
        If ``grad_descent_type`` is unknown (previously the loop ran without
        updating anything and returned the random initial parameters), or if
        the effective batch size is smaller than 1.

    Notes
    -----
    * The last, possibly smaller, batch of an epoch is now used as well;
      previously examples beyond a multiple of the batch size were never seen,
      and a batch size larger than the data set produced zero updates.
    * Adam/Nadam bias correction is driven by the number of parameter updates
      performed so far rather than by the epoch index, so it stays correct
      when an epoch contains more than one batch.
    """
    supported_optimizers = ("Vanilla", "Momentum", "NAG", "RMSProp", "Adam", "Nadam")
    if grad_descent_type not in supported_optimizers:
        raise ValueError(
            "Unknown grad_descent_type %r; expected one of %s"
            % (grad_descent_type, ", ".join(supported_optimizers))
        )

    print(learning_rate)

    parameters = initialize_parameters(layer_dimensions)
    update = initialize_update(layer_dimensions)
    velocity = initialize_velocity(layer_dimensions)
    moment = initialize_moment(layer_dimensions)

    total_examples = x_train.shape[0]
    if batch_type == "SGD":
        batchsize = 1
    elif batch_type == "Full_batch":
        batchsize = total_examples
    # "Mini_batch" (and any other value) keeps the caller-supplied batchsize.

    if batchsize < 1:
        raise ValueError("Effective batch size must be at least 1, got %r" % (batchsize,))

    # Ceiling division so a trailing partial batch is processed too.
    num_steps = -(-total_examples // batchsize)

    step = 0      # zero-based count of parameter updates, used as Adam/Nadam time step
    cost = None   # cost of the most recently processed batch

    for i in range(0, num_epochs):
        print("***********epoch = ",i)
        par_update = 0
        for j in range(num_steps):

            start = j*batchsize
            end = start+batchsize
            # The network works with one example per column.
            x = x_train[start:end].T
            y = y_train[start:end].T

            # NAG evaluates the gradient at the look-ahead point; every other
            # optimiser uses the current parameters.
            if grad_descent_type == "NAG":
                eval_parameters = find_lookahead_parameters(parameters,update,gamma)
            else:
                eval_parameters = parameters

            yhat, temps = forward_pass(x, eval_parameters)
            cost = cost_function(yhat, y)
            grads = backward_pass(yhat,y,temps)

            if grad_descent_type == "Vanilla":
                parameters = parameter_update_vanilla(parameters, grads,learning_rate)
            elif grad_descent_type in ("Momentum", "NAG"):
                parameters,update = parameter_update_momentum(parameters, grads, update, learning_rate ,gamma)
            elif grad_descent_type == "RMSProp":
                parameters,velocity= parameter_update_RMSProp(parameters, grads, velocity,learning_rate ,beta_rms,eps)
            elif grad_descent_type == "Adam":
                parameters,velocity, moment = parameter_update_adam(parameters, grads, velocity, moment,learning_rate ,beta1,beta2,eps,step)
            else:  # "Nadam"
                parameters,velocity, moment = parameter_update_nadam(parameters, grads, velocity, moment,learning_rate ,beta1,beta2,eps,step)

            step += 1
            par_update += 1

        print("par_updated ",par_update,"times")
        print("***********************************************************")

        if print_cost and i % 100 == 0 and cost is not None:
            print ("Cost after iteration %i: %f" %(i, cost))

    return parameters

In [130]:
# Train on the toy data: learning rate 0.1, 5 epochs, RMSProp optimiser.
# batch_type is left at its default ("Full_batch"). Rebinds `parameters`.
parameters = L_layer_network(x_train, y_train, layer_dimensions,0.1,5,grad_descent_type = "RMSProp",print_cost = True)

0.1
***********epoch =  0
par_updated  1 times
***********************************************************
***********epoch =  1
par_updated  1 times
***********************************************************
***********epoch =  2
par_updated  1 times
***********************************************************
***********epoch =  3
par_updated  1 times
***********************************************************
***********epoch =  4
par_updated  1 times
***********************************************************


In [112]:
# Display the current parameters.
parameters

{'w1': array([[  2.26736466,   5.73005981,   6.18858314,   7.82924814],
        [-73.18099915, -73.79981161, -74.05578235, -73.04065069],
        [ -0.75683644,  -0.40210793,  -1.06098473,  -2.44007136],
        [ -0.11135225,  -1.77045326,  -1.58715983,  -4.16049516]]),
 'b1': array([[ 1.98860739e+00],
        [-7.15296182e+01],
        [-3.19285017e-02],
        [-1.07653434e+00]]),
 'w2': array([[ 35.62980241,  -0.65697496,  -2.56583185,  74.61087684],
        [-36.52542684, -57.2859071 ,   1.66990971, -71.65807942],
        [-34.43690199,  59.56155287,  -3.80831888, -71.97653527]]),
 'b2': array([[ 37.7213183 ],
        [-38.62825971],
        [-36.79429088]])}

In [None]:
# Accuracy on the toy training data with the current `parameters`.
predict(x_train, y_train, parameters)

In [None]:
# Forward pass on the toy inputs (one example per column) with the current `parameters`.
forward_pass(x_train.T, parameters)

In [None]:
# NOTE(review): x_test and y_test are not assigned in any visible cell (only
# x_test_orig / y_test_orig are) — confirm they are defined before running this.
predict(x_test,y_test,parameters)