In [40]:
import numpy as np
import keras
from keras.datasets import fashion_mnist
from matplotlib import pyplot as plt

In [41]:
def sigmoid(a):
    s = 1/(1+np.exp(-a))
    return s

def derivative_sigmoid(a):
    ds = sigmoid(a) *(1-sigmoid (a))
    return ds

def tanh(a):
    t=(np.exp(a)-np.exp(-a))/(np.exp(a)+np.exp(-a))
    return t

def derivative_tanh(a):
    dt=1-tanh(a)**2
    return dt


def softmax(a):
    return np.exp(a) / np.sum(np.exp(a), axis=0) #expA (axis=0, keepdims=True)

In [42]:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

In [43]:
fashion_mnist = keras.datasets.fashion_mnist
(x_train_orig, y_train_orig), (x_test_orig, y_test_orig) = fashion_mnist.load_data()

In [44]:
def initialize_parameters(layer_dimensions):

    #np.random.seed(0)
    parameters = {}
    L = len(layer_dimensions)            # number of layers in the network

    for k in range(1, L):
        
        parameters['w' + str(k)] = np.random.randn(layer_dimensions[k], layer_dimensions[k-1]) 
        parameters['b' + str(k)] = np.zeros((layer_dimensions[k], 1))
        
    return parameters

In [45]:
def initialize_update(layer_dimensions):

    #np.random.seed(0)
    update = {}
    L = len(layer_dimensions)            # number of layers in the network

    for k in range(1, L):
        
        update['update_w' + str(k)] = np.zeros((layer_dimensions[k], layer_dimensions[k-1])) 
        update['update_b' + str(k)] = np.zeros((layer_dimensions[k], 1))
        
    return update

In [46]:
def agrregation_forward(h, w, b):
    
    a = np.dot(w, h) + b
    temp = (h,w,b)
    
    return a ,temp

In [47]:
def activation_forward(h_prev, w, b, activation):
        
    
    if activation == "sigmoid":

        a, linear_temp = agrregation_forward(h_prev, w, b)
        h = sigmoid(a)
    
    elif activation == "tanh":
        a, linear_temp = agrregation_forward(h_prev, w, b)
        h = tanh(a)
        
    elif activation == "softmax":
        a, linear_temp = agrregation_forward(h_prev, w, b)
        h = softmax(a)
    
    
    temp = (linear_temp, a)

    return h, temp

In [48]:
def forward_pass(x, parameters):

    temps = []
    h = x
    L = len(parameters) // 2                  # number of layers in the neural network
    
    for k in range(L-1):
        l = k+1
        h_prev = h 
        h,temp = activation_forward(h_prev, parameters['w'+str(l)], parameters['b'+str(l)], activation="sigmoid")
        temps.append(temp)
    
    
    hL,temp1 = activation_forward(h, parameters['w'+str(L)], parameters['b'+str(L)], activation="softmax")
    temps.append(temp1)
    
            
    return hL, temps

In [49]:
def cost_function(yhat, y):   
    m = y.shape[1] # no. of examples
  
    product_sum = np.sum((y *np.log(yhat)), axis = 0)
    cost = -1/m*np.sum(product_sum)
    
    return cost

In [50]:
def agrregation_backward(dL_da, temp):
    
    h_prev, w, b = temp 
    m = h_prev.shape[1]
    dL_dh_prev = np.dot(w.T, dL_da)
    
    dL_dw = 1/m*np.dot(dL_da, h_prev.T)
    dL_db = 1/m*np.sum(dL_da, axis=1, keepdims=True)
     

    return dL_dh_prev, dL_dw, dL_db

In [51]:
def activation_backward(dL_dh, temp, activation):

    linear_temp, a = temp
    
    if activation == "sigmoid":
        ds = derivative_sigmoid(a)
        dL_da = dL_dh * ds
       
        dL_dh_prev, dL_dw, dL_db = agrregation_backward(dL_da, linear_temp)    
        
    elif activation == "tanh":
        dt = derivative_tanh(a)
        dL_da = dL_dh * dt

        dL_dh_prev, dL_dw, dL_db = agrregation_backward(dL_da, linear_temp)    
    
    return dL_dh_prev, dL_dw, dL_db

In [52]:
def backward_pass(yhat, y, temps):
    
    grads = {}
    L = len(temps) # the number of layers
    m = y.shape[1]

# el = one hot vector
    el = y
    dL_dyhat = -(1/yhat)*el
    dL_daL  = -(el - yhat)
    current_temp = temps[L-1]
    linear_tempL,aL = current_temp
    
    hL_prev, wL, bL = linear_tempL
    m = hL_prev.shape[1]

    dL_dhL_prev = np.dot(wL.T, dL_daL)
    
    dL_dwL = 1/m*np.dot(dL_daL, hL_prev.T)
    dL_dbL = 1/m*np.sum(dL_daL, axis=1, keepdims=True)

    grads["dL_dh" + str(L-1)] = dL_dhL_prev
    grads["dL_dw" + str(L)]      = dL_dwL
    grads["dL_db" + str(L)] = dL_dbL
    
    # Loop from l=L-2 to l=0
    for l in reversed(range(L-1)):
        #print(l)
        current_temp = temps[l]
        dL_dh_prev, dL_dw, dL_db = activation_backward(grads["dL_dh" + str(l+1)], current_temp, "sigmoid")
        grads["dL_dh" + str(l)] = dL_dh_prev
        grads["dL_dw" + str(l + 1)] = dL_dw
        grads["dL_db" + str(l + 1)] = dL_db

    return grads

In [53]:
def parameter_update_vanilla(parameters, grads,learning_rate):
    
    L = len(parameters) // 2 # number of layers in the neural network

    for l in range(L):
        
        parameters["w" + str(l+1)] = parameters["w" + str(l+1)]- learning_rate*grads["dL_dw" + str(l + 1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)]- learning_rate*grads["dL_db" + str(l + 1)]

    return parameters

In [54]:
def parameter_update_momentum(parameters, grads, update, learning_rate ,gamma):
    
    L = len(parameters) // 2 # number of layers in the neural network

    for l in range(L):
        update["update_w" + str(l+1)] = gamma*update["update_w" + str(l+1)] + learning_rate*grads["dL_dw" + str(l + 1)]
        update["update_b" + str(l+1)] = gamma*update["update_b" + str(l+1)] + learning_rate*grads["dL_db" + str(l + 1)]
        
        parameters["w" + str(l+1)] = parameters["w" + str(l+1)]-update["update_w" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)]- update["update_b" + str(l+1)]

    return parameters, update

In [55]:
def find_lookahead_parameters(parameters,update,gamma):
    L = len(parameters) // 2 # number of layers in the neural network
    lookahead_parameters = {}

    for l in range(L):
        lookahead_parameters["w" + str(l+1)] = parameters["w" + str(l+1)]-gamma*update["update_w" + str(l+1)] 
        lookahead_parameters["b" + str(l+1)] = parameters["b" + str(l+1)]-gamma*update["update_b" + str(l+1)]
    return lookahead_parameters 

In [56]:
def predict(x, y, parameters):
    x =x.T
    y =y.T
    
    m = x.shape[1]
    n = len(parameters) // 2 # number of layers in the neural network
    p = np.zeros((1,m))
    
    # Forward propagation
    prob, temps = forward_pass(x, parameters)
    

    predicted_label = np.argmax(prob, axis=0)
    true_label = np.argmax(y, axis=0)
    
    Accuracy = np.sum(predicted_label == true_label)/m
        
#         if probas[0,i] > 0.5:
#             p[0,i] = 1
#         else:
#             p[0,i] = 0
    
    #print results
    #print ("predictions: " + str(p))
    #print ("true labels: " + str(y))
    print("Accuracy: "  + str(Accuracy))
        
    return

In [57]:
# index = 15
# plt.imshow(x_train_orig[index])
# print ("y = " + str(y_train_orig[index])+ ". It's a " + class_names[y_train_orig[index]] +  " picture.")

In [58]:
x_train = np.array([[1,2,3,4],[3,4,5,6],[5,6,7,8]])
y_train = np.array([[0,0,1],[1,0,0],[0,1,0]])
y_train.shape

(3, 3)

In [59]:
no_hidden_layers = 1 # no of hidden layers
no_neuron_hidden = 4 # no. of neurons in each hidden layers
no_neuron_output = 3 # # no. of neurons in each hidden layers

In [60]:
x_train.shape

(3, 4)

In [61]:
no_of_training_examples = np.shape(x_train)[0]
#no_of_testing_examples = np.shape(x_test)[0]
size_input_layer = np.shape(x_train)[1]
size_hidden_layer = no_neuron_hidden
size_output_layer = no_neuron_output

In [62]:
def one_hot_vector_form(labels,size_output_layer):
    no_of_examples = labels.shape[0]
    one_hot_vector = np.zeros((no_of_examples , size_output_layer))
    for i in range(no_of_examples):
        one_hot_vector[i, labels[i]] = 1    
        y = one_hot_vector#.T
    return y  

In [63]:
layer_dimensions = [size_input_layer]+ [size_hidden_layer]*no_hidden_layers+ [size_output_layer]
layer_dimensions

[4, 4, 3]

In [72]:
parameters = initialize_parameters(layer_dimensions)
parameters

{'w1': array([[-0.71586379, -0.60550807,  0.65897813, -1.42415746],
        [-0.57116409, -0.69220739, -1.10277752,  0.86515635],
        [-1.30960855, -0.998372  , -0.02451508, -1.85132059],
        [-0.62809248,  0.10892502, -0.62449993, -0.05636695]]),
 'b1': array([[0.],
        [0.],
        [0.],
        [0.]]),
 'w2': array([[-1.14414468, -1.57157571, -0.59874191,  0.32573063],
        [-0.71490638,  0.65037816, -1.09429188, -2.7186347 ],
        [ 0.84925068,  0.02444377,  0.24658797, -0.01504266]]),
 'b2': array([[0.],
        [0.],
        [0.]])}

In [73]:
forward_pass(x_train.T, parameters)

(array([[0.30112577, 0.33262299, 0.33341211],
        [0.32843672, 0.33121876, 0.33303481],
        [0.37043751, 0.33615825, 0.33355308]]),
 [((array([[1, 3, 5],
           [2, 4, 6],
           [3, 5, 7],
           [4, 6, 8]]),
    array([[-0.71586379, -0.60550807,  0.65897813, -1.42415746],
           [-0.57116409, -0.69220739, -1.10277752,  0.86515635],
           [-1.30960855, -0.998372  , -0.02451508, -1.85132059],
           [-0.62809248,  0.10892502, -0.62449993, -0.05636695]]),
    array([[0.],
           [0.],
           [0.],
           [0.]])),
   array([[ -5.64657539,  -9.81967777, -13.99278015],
          [ -1.80328605,  -4.80527136,  -7.80725667],
          [-10.78518015, -19.15281258, -27.52044501],
          [ -2.50921005,  -4.90927873,  -7.30934741]])),
  ((array([[3.51716941e-03, 5.43681452e-05, 8.37553252e-07],
           [1.41451527e-01, 8.12000500e-03, 4.06607034e-04],
           [2.07036436e-05, 4.80882732e-09, 1.11692135e-12],
           [7.52150384e-02, 7.32377

In [74]:
predict(x_train, y_train, parameters)

Accuracy: 0.3333333333333333


In [87]:
def L_layer_network(x_train, y_train,layer_dimensions,learning_rate,gamma, num_epochs,batch_type,batchsize,
                   grad_deescent_type,print_cost=False):
    

    #np.random.seed(1)
    costs = []          
    
    parameters = initialize_parameters(layer_dimensions)
    update = initialize_update(layer_dimensions)
    if batch_type == "SGD":
        batchsize =1
    elif batch_type ==  "Mini_batch":
        batchsize = batchsize
    elif batch_type == "Full_batch":
        batchsize = x_train.shape[0]
        
    total_examples = x_train.shape[0]
    num_steps = total_examples//batchsize
    #print(num_steps)
 
    for i in range(0, num_epochs):
        print("***********epoch = ",i)
        par_update = 0
        for j in range(num_steps):
            
            start = j*batchsize
            end = start+batchsize
            x = x_train[start:end].T
            y = y_train[start:end].T 
            
            #print("param",parameters)
            #print("in update",update)
               
            if grad_deescent_type == "Vanilla":
            
                yhat, temps = forward_pass(x, parameters)
                cost = cost_function(yhat, y)
                grads = backward_pass(yhat,y,temps)
                parameters = parameter_update_vanilla(parameters, grads,learning_rate)
                
            elif grad_deescent_type == "Momentum":
                
                yhat, temps = forward_pass(x, parameters)
                cost = cost_function(yhat, y)
                grads = backward_pass(yhat,y,temps)
                #print("in update",update)
                parameters,update = parameter_update_momentum(parameters, grads, update, learning_rate ,gamma)
                #print("out update",update)
                
            elif grad_deescent_type == "NAG":
                lookahead_parameters = find_lookahead_parameters(parameters,update,gamma)
                #print("lookahead_parameters",lookahead_parameters)
                yhat, temps = forward_pass(x, lookahead_parameters)
                cost = cost_function(yhat, y)
                grads = backward_pass(yhat,y,temps)
                parameters,update = parameter_update_momentum(parameters, grads, update, learning_rate ,gamma)

            #print("out update",update)
            par_update += 1
        
        print("par_updated ",par_update,"times")
        print("***********************************************************")
        
    #     #print("cost in iteration ",i," is =",cost)
                
    #         # Print the cost every 100 training example
    #     if print_cost and i % 100 == 0:
    #         print ("Cost after iteration %i: %f" %(i, cost))
    #     if print_cost and i % 100 == 0:
    #         costs.append(cost)
    # # plot the cost
    # plt.plot(np.squeeze(costs))
    # plt.ylabel('cost')
    # plt.xlabel('iterations (per hundreds)')
    # plt.title("Learning rate =" + str(learning_rate))
    # plt.show()
    
    return parameters

In [89]:
parameters = L_layer_network(x_train, y_train, layer_dimensions,1,0.9,2,"Full_batch",1,"Vanilla",print_cost = True)

***********epoch =  0
par_updated  1 times
***********************************************************
***********epoch =  1
par_updated  1 times
***********************************************************


In [80]:
parameters

{'w1': array([[-0.35019555,  1.30889103,  2.03061962, -1.86649547],
        [-0.67394888,  1.87991259,  0.54175514,  0.33039636],
        [-1.98065672,  0.22936762, -0.47406352,  0.12342723],
        [-0.3406421 ,  0.7907601 , -0.13481437,  1.38529023]]),
 'b1': array([[-0.04934173],
        [ 0.00039241],
        [-0.04825434],
        [ 0.00064664]]),
 'w2': array([[ 1.76852794, -0.22563902, -0.71350865, -0.6106242 ],
        [-0.11719976, -0.19370735,  0.60045324,  1.26370635],
        [-0.96149393,  0.3784562 ,  1.14029767,  1.74207237]]),
 'b2': array([[ 0.09514906],
        [ 0.07195659],
        [-0.16710565]])}

In [81]:
predict(x_train, y_train, parameters)

Accuracy: 0.6666666666666666


In [82]:
forward_pass(x_train.T, parameters)

(array([[0.18356911, 0.31510608, 0.33508608],
        [0.35547496, 0.34166959, 0.337239  ],
        [0.46095593, 0.34322432, 0.32767491]]),
 [((array([[1, 3, 5],
           [2, 4, 6],
           [3, 5, 7],
           [4, 6, 8]]),
    array([[-0.35019555,  1.30889103,  2.03061962, -1.86649547],
           [-0.67394888,  1.87991259,  0.54175514,  0.33039636],
           [-1.98065672,  0.22936762, -0.47406352,  0.12342723],
           [-0.3406421 ,  0.7907601 , -0.13481437,  1.38529023]]),
    array([[-0.04934173],
           [ 0.00039241],
           [-0.04825434],
           [ 0.00064664]])),
   array([[  0.84412176,   3.08976102,   5.33540028],
          [  6.03311956,  10.18934998,  14.34558039],
          [ -2.49865747,  -6.70250826, -10.90635905],
          [  6.37824256,   9.77943027,  13.18061799]])),
  ((array([[6.99332595e-01, 9.56468416e-01, 9.95205120e-01],
           [9.97607735e-01, 9.99962433e-01, 9.99999411e-01],
           [7.59523497e-02, 1.22632261e-03, 1.83408932e-05],

In [None]:
predict(x_test,y_test,parameters)