In [1]:
import numpy as np
import keras
from keras.datasets import fashion_mnist
from matplotlib import pyplot as plt
import math

Using TensorFlow backend.


In [2]:
import wandb

In [3]:
#wandb.login()

In [4]:
def sigmoid(a):
    s = 1/(1+np.exp(-a))
    return s

def derivative_sigmoid(a):
    ds = sigmoid(a) *(1-sigmoid (a))
    return ds

def tanh(a):
    t=(np.exp(a)-np.exp(-a))/(np.exp(a)+np.exp(-a))
    return t

def derivative_tanh(a):
    dt=1-tanh(a)**2
    return dt


def softmax(a):
    return np.exp(a) / np.sum(np.exp(a), axis=0) #expA (axis=0, keepdims=True)

In [5]:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

In [6]:
fashion_mnist = keras.datasets.fashion_mnist
(x_train_orig, y_train_orig), (x_test_orig, y_test_orig) = fashion_mnist.load_data()

In [7]:
def one_hot_vector_form(labels,no_neurons_output_layer):
    no_of_examples = labels.shape[0]
    one_hot_vector = np.zeros((no_of_examples , no_neurons_output_layer))
    for i in range(no_of_examples):
        one_hot_vector[i, labels[i]] = 1    
        y = one_hot_vector#.T
    return y  

In [8]:
def initialize_parameters(layer_dimensions):

    #np.random.seed(0)
    parameters = {}
    L = len(layer_dimensions)            # number of layers in the network

    for k in range(1, L):
        
        parameters['w' + str(k)] = np.random.randn(layer_dimensions[k], layer_dimensions[k-1]) 
        parameters['b' + str(k)] = np.zeros((layer_dimensions[k], 1))
        
    return parameters

In [9]:
def initialize_update(layer_dimensions):

    #np.random.seed(0)
    update = {}
    L = len(layer_dimensions)            # number of layers in the network

    for k in range(1, L):
        
        update['update_w' + str(k)] = np.zeros((layer_dimensions[k], layer_dimensions[k-1])) 
        update['update_b' + str(k)] = np.zeros((layer_dimensions[k], 1))
        
    return update

In [10]:
def initialize_velocity(layer_dimensions):

    #np.random.seed(0)
    velocity = {}
    L = len(layer_dimensions)            # number of layers in the network

    for k in range(1, L):
        
        velocity['v_w' + str(k)] = np.zeros((layer_dimensions[k], layer_dimensions[k-1])) 
        velocity['v_b' + str(k)] = np.zeros((layer_dimensions[k], 1))
        
    return velocity

In [11]:
def initialize_moment(layer_dimensions):

    #np.random.seed(0)
    moment = {}
    L = len(layer_dimensions)            # number of layers in the network

    for k in range(1, L):
        
        moment['m_w' + str(k)] = np.zeros((layer_dimensions[k], layer_dimensions[k-1])) 
        moment['m_b' + str(k)] = np.zeros((layer_dimensions[k], 1))
        
    return moment

In [12]:
def agrregation_forward(h, w, b):
    
    a = np.dot(w, h) + b
    temp = (h,w,b)
    
    return a ,temp

In [13]:
def activation_forward(h_prev, w, b, activation):
        
    
    if activation == "sigmoid":

        a, linear_temp = agrregation_forward(h_prev, w, b)
        h = sigmoid(a)
    
    elif activation == "tanh":
        a, linear_temp = agrregation_forward(h_prev, w, b)
        h = tanh(a)
        
    elif activation == "softmax":
        a, linear_temp = agrregation_forward(h_prev, w, b)
        h = softmax(a)
    
    
    temp = (linear_temp, a)

    return h, temp

In [14]:

def forward_pass(x, parameters):

    temps = []
    h = x
    L = len(parameters) // 2                  # number of layers in the neural network
    
    for k in range(L-1):
        l = k+1
        h_prev = h 
        h,temp = activation_forward(h_prev, parameters['w'+str(l)], parameters['b'+str(l)], activation="sigmoid")
        temps.append(temp)
    
    
    hL,temp1 = activation_forward(h, parameters['w'+str(L)], parameters['b'+str(L)], activation="softmax")
    temps.append(temp1)
    
            
    return hL, temps

In [15]:
def cost_function(yhat, y):   
    m = y.shape[1] # no. of examples
  
    product_sum = np.sum((y *np.log(yhat)), axis = 0)
    cost = -1/m*np.sum(product_sum)
    
    return cost

In [16]:
def agrregation_backward(dL_da, temp):
    
    h_prev, w, b = temp 
    m = h_prev.shape[1]
    dL_dh_prev = np.dot(w.T, dL_da)
    
    dL_dw = 1/m*np.dot(dL_da, h_prev.T)
    dL_db = 1/m*np.sum(dL_da, axis=1, keepdims=True)
     

    return dL_dh_prev, dL_dw, dL_db

In [17]:
def activation_backward(dL_dh, temp, activation):

    linear_temp, a = temp
    
    if activation == "sigmoid":
        ds = derivative_sigmoid(a)
        dL_da = dL_dh * ds
       
        dL_dh_prev, dL_dw, dL_db = agrregation_backward(dL_da, linear_temp)    
        
    elif activation == "tanh":
        dt = derivative_tanh(a)
        dL_da = dL_dh * dt

        dL_dh_prev, dL_dw, dL_db = agrregation_backward(dL_da, linear_temp)    
    
    return dL_dh_prev, dL_dw, dL_db

In [18]:
def backward_pass(yhat, y, temps):
    
    grads = {}
    L = len(temps) # the number of layers
    m = y.shape[1]

# el = one hot vector
    el = y
    dL_dyhat = -(1/yhat)*el
    dL_daL  = -(el - yhat)
    current_temp = temps[L-1]
    linear_tempL,aL = current_temp
    
    hL_prev, wL, bL = linear_tempL
    m = hL_prev.shape[1]

    dL_dhL_prev = np.dot(wL.T, dL_daL)
    
    dL_dwL = 1/m*np.dot(dL_daL, hL_prev.T)
    dL_dbL = 1/m*np.sum(dL_daL, axis=1, keepdims=True)

    grads["dL_dh" + str(L-1)] = dL_dhL_prev
    grads["dL_dw" + str(L)]      = dL_dwL
    grads["dL_db" + str(L)] = dL_dbL
    
    # Loop from l=L-2 to l=0
    for l in reversed(range(L-1)):
        #print(l)
        current_temp = temps[l]
        dL_dh_prev, dL_dw, dL_db = activation_backward(grads["dL_dh" + str(l+1)], current_temp, "sigmoid")
        grads["dL_dh" + str(l)] = dL_dh_prev
        grads["dL_dw" + str(l + 1)] = dL_dw
        grads["dL_db" + str(l + 1)] = dL_db

    return grads

In [19]:
def parameter_update_vanilla(parameters, grads,learning_rate):
    
    L = len(parameters) // 2 # number of layers in the neural network

    for l in range(L):
        
        parameters["w" + str(l+1)] = parameters["w" + str(l+1)]- learning_rate*grads["dL_dw" + str(l + 1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)]- learning_rate*grads["dL_db" + str(l + 1)]

    return parameters

In [20]:
def parameter_update_momentum(parameters, grads, update, learning_rate ,gamma):
    
    L = len(parameters) // 2 # number of layers in the neural network

    for l in range(L):
        update["update_w" + str(l+1)] = gamma*update["update_w" + str(l+1)] + learning_rate*grads["dL_dw" + str(l + 1)]
        update["update_b" + str(l+1)] = gamma*update["update_b" + str(l+1)] + learning_rate*grads["dL_db" + str(l + 1)]
        
        parameters["w" + str(l+1)] = parameters["w" + str(l+1)]-update["update_w" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)]- update["update_b" + str(l+1)]

    return parameters, update

In [21]:

def parameter_update_RMSProp(parameters, grads, velocity, learning_rate ,beta,eps):
    
    L = len(parameters) // 2 # number of layers in the neural network

    for l in range(L):
        velocity["v_w" + str(l+1)] = beta*velocity["v_w" + str(l+1)] + (1-beta)*grads["dL_dw" + str(l + 1)]**2
        velocity["v_b" + str(l+1)] = beta*velocity["v_b" + str(l+1)] + (1-beta)*grads["dL_db" + str(l + 1)]**2
        
        parameters["w" + str(l+1)] = parameters["w" + str(l+1)]- (learning_rate / np.sqrt(velocity["v_w" + str(l+1)]+eps))*grads["dL_dw" + str(l + 1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)]- (learning_rate / np.sqrt(velocity["v_b" + str(l+1)]+eps))*grads["dL_db" + str(l + 1)]

    return parameters, velocity

In [22]:
def parameter_update_adam(parameters, grads, velocity, moment,learning_rate ,beta1,beta2,eps,epoch):
    
    L = len(parameters) // 2 # number of layers in the neural network

    for l in range(L):
        
        
        moment["m_w" + str(l+1)] = beta1*moment["m_w" + str(l+1)] + (1-beta1)*grads["dL_dw" + str(l + 1)]
        moment["m_b" + str(l+1)] = beta1*moment["m_b" + str(l+1)] + (1-beta1)*grads["dL_db" + str(l + 1)]
           
        velocity["v_w" + str(l+1)] = beta2*velocity["v_w" + str(l+1)] + (1-beta2)*grads["dL_dw" + str(l + 1)]**2
        velocity["v_b" + str(l+1)] = beta2*velocity["v_b" + str(l+1)] + (1-beta2)*grads["dL_db" + str(l + 1)]**2
        
        m_w_hat = moment["m_w" + str(l+1)]/(1-beta1**(epoch+1))
        m_b_hat = moment["m_b" + str(l+1)]/(1-beta1**(epoch+1))
        
        v_w_hat = velocity["v_w" + str(l+1)]/(1-beta2**(epoch+1))
        v_b_hat = velocity["v_b" + str(l+1)]/(1-beta2**(epoch+1))

        
        parameters["w" + str(l+1)] = parameters["w" + str(l+1)]- ((learning_rate / np.sqrt(velocity["v_w" + str(l+1)]+eps))*m_w_hat)
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)]- ((learning_rate / np.sqrt(velocity["v_b" + str(l+1)]+eps))*m_b_hat)

    return parameters, velocity, moment

In [23]:
def parameter_update_nadam(parameters, grads, velocity, moment,learning_rate ,beta1,beta2,eps,epoch):
    
    L = len(parameters) // 2 # number of layers in the neural network

    for l in range(L):
        
        
        moment["m_w" + str(l+1)] = beta1*moment["m_w" + str(l+1)] + (1-beta1)*grads["dL_dw" + str(l + 1)]
        moment["m_b" + str(l+1)] = beta1*moment["m_b" + str(l+1)] + (1-beta1)*grads["dL_db" + str(l + 1)]
           
        velocity["v_w" + str(l+1)] = beta2*velocity["v_w" + str(l+1)] + (1-beta2)*grads["dL_dw" + str(l + 1)]**2
        velocity["v_b" + str(l+1)] = beta2*velocity["v_b" + str(l+1)] + (1-beta2)*grads["dL_db" + str(l + 1)]**2
        
        m_w_hat = moment["m_w" + str(l+1)]/(1-beta1**(epoch+1))
        m_b_hat = moment["m_b" + str(l+1)]/(1-beta1**(epoch+1))
        
        v_w_hat = velocity["v_w" + str(l+1)]/(1-beta2**(epoch+1))
        v_b_hat = velocity["v_b" + str(l+1)]/(1-beta2**(epoch+1))
        
        
        nadam_update_w =  (beta1*m_w_hat) + (((1-beta1)*grads["dL_dw" + str(l + 1)])/ (1-beta1**(epoch+1)))
        nadam_update_b =  (beta1*m_b_hat) + (((1-beta1)*grads["dL_db" + str(l + 1)])/ (1-beta1**(epoch+1)))

        
        parameters["w" + str(l+1)] = parameters["w" + str(l+1)]- ((learning_rate / np.sqrt(velocity["v_w" + str(l+1)]+eps))*nadam_update_w)                                       
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)]- ((learning_rate / np.sqrt(velocity["v_b" + str(l+1)]+eps))*nadam_update_b)

    return parameters, velocity, moment

In [24]:
def find_lookahead_parameters(parameters,update,gamma):
    L = len(parameters) // 2 # number of layers in the neural network
    lookahead_parameters = {}

    for l in range(L):
        lookahead_parameters["w" + str(l+1)] = parameters["w" + str(l+1)]-gamma*update["update_w" + str(l+1)] 
        lookahead_parameters["b" + str(l+1)] = parameters["b" + str(l+1)]-gamma*update["update_b" + str(l+1)]
    return lookahead_parameters 

In [25]:
def predict(x, y, parameters):
    x =x.T
    y =y.T
    
    m = x.shape[1]
    n = len(parameters) // 2 # number of layers in the neural network
    p = np.zeros((1,m))
    
    # Forward propagation
    prob, temps = forward_pass(x, parameters)
    

    predicted_label = np.argmax(prob, axis=0)
    true_label = np.argmax(y, axis=0)
    
    Accuracy = np.sum(predicted_label == true_label)/m

    #print("Accuracy: "  + str(Accuracy))
        
    return Accuracy

In [88]:
x_train_orig = x_train_orig[:5000]
x_test_orig = x_test_orig[:1000]
y_train_orig = y_train_orig[:5000]
y_test_orig = y_train_orig[:1000]
y_train_orig.shape

(5000,)

10

In [27]:
x_train_flatten = x_train_orig.reshape(x_train_orig.shape[0], -1)
x_test_flatten = x_test_orig.reshape(x_test_orig.shape[0], -1)
x_train = x_train_flatten/255
x_test = x_test_flatten/255

print ("x_train's shape: " + str(x_train.shape))
print ("x_test's shape: " + str(x_test.shape))


x_train's shape: (5000, 784)
x_test's shape: (1000, 784)


In [28]:
# index = 15
# plt.imshow(x_train_orig[index])
# print ("y = " + str(y_train_orig[index])+ ". It's a " + class_names[y_train_orig[index]] +  " picture.")

x_train = np.array([[1,2,3,4],[3,4,5,6],[5,6,7,8]])
y_train = np.array([[0,0,1],[1,0,0],[0,1,0]])
y_train.shape

In [29]:
x_train.shape

(5000, 784)

In [113]:
no_of_training_examples = np.shape(x_train)[0]; no_of_testing_examples = np.shape(x_test)[0]
size_input_layer = x_train.shape[1]
no_hidden_layers = 3
size_hidden_layer = 32 
size_output_layer = len(np.unique(y_train_orig)) # no of class labels

In [117]:
y_train = one_hot_vector_form(y_train_orig,size_output_layer)
y_test = one_hot_vector_form(y_test_orig,size_output_layer)
y_train.shape

(5000, 10)

In [118]:
def get_layer_dims(size_input_layer,no_hidden_layers,size_hidden_layer,size_output_layer):
    layer_dimensions = [size_input_layer] + [size_hidden_layer]*no_hidden_layers+ [size_output_layer]
    return layer_dimensions

In [120]:
layer_dimensions = get_layer_dims(size_input_layer,no_hidden_layers,size_hidden_layer,size_output_layer)
layer_dimensions

[784, 32, 32, 32, 10]

In [38]:
x_test.T.shape

(784, 1000)

In [39]:
def get_train_and_validation_loss(x_train,y_train,x_valid,y_valid, parameters):
    xt = x_train.T
    yt = y_train.T
    
    xv = x_valid.T
    yv = y_valid.T

    yhatt, tempst = forward_pass(xt, parameters)
    train_loss = cost_function(yhatt, yt)

    yhatv, tempsv = forward_pass(xv, parameters)
    valid_loss = cost_function(yhatv, yv)

    return train_loss,valid_loss
    


In [83]:
def sgd(x_train, y_train,x_valid,y_valid,layer_dimensions,learning_rate,num_epochs):
    
    #np.random.seed(1)
              
    parameters = initialize_parameters(layer_dimensions)
    batch_size = 1
    total_examples = x_train.shape[0]
    num_steps = total_examples//batch_size
    #print(num_steps)
    costs = []
    for i in range(0, num_epochs):
        #print("***********epoch = ",i)
        #par_update = 0
        for j in range(num_steps):
            
            start = j*batch_size
            end = start+batch_size
            x = x_train[start:end].T
            y = y_train[start:end].T 
            
            yhat, temps = forward_pass(x, parameters)
            #cost = cost_function(yhat, y)
            grads = backward_pass(yhat,y,temps)
            parameters = parameter_update_vanilla(parameters, grads,learning_rate)
                
            
        train_loss,valid_loss = get_train_and_validation_loss(x_train,y_train,x_valid,y_valid, parameters)
         
        train_acc= predict(x_train, y_train, parameters)

        valid_acc= predict(x_valid, y_valid, parameters)

        #wandb.log({"train_loss":train_loss,"val_loss":valid_loss,"train_accuracy":train_acc,"val_accuracy":valid_acc,"epochs":num_epochs})
        print("train_acc",train_acc)
        print("valid_acc",valid_acc)
    
        print("train_loss",train_loss)
        print("valid_loss",valid_loss) 

    return parameters,train_loss,valid_loss,train_acc,valid_acc

In [84]:
parameters,train_loss,valid_loss,train_acc,valid_acc = sgd(x_train, y_train,x_test,y_test,layer_dimensions,0.001,4)

train_acc 0.2102
valid_acc 0.095
train_loss 3.8028444269271757
valid_loss 4.53001423331665
train_acc 0.3334
valid_acc 0.092
train_loss 2.1194293617728714
valid_loss 3.188737463358315
train_acc 0.4476
valid_acc 0.085
train_loss 1.707313146759275
valid_loss 3.1182912747231675
train_acc 0.5108
valid_acc 0.091
train_loss 1.4944132804423285
valid_loss 3.2071194102896503


In [79]:
def momentum(x_train, y_train,x_valid,y_valid,layer_dimensions,learning_rate,num_epochs,gamma = 0.9,  
                    batch_type ="Min_batch",batch_size = 16):

    #np.random.seed(1)  
    
    parameters = initialize_parameters(layer_dimensions)
    update = initialize_update(layer_dimensions)

    if batch_type == "SGD":
        batch_size =1
    elif batch_type ==  "Mini_batch":
        batch_size = batch_size
    elif batch_type == "Full_batch":
        batch_size = x_train.shape[0]
        
    total_examples = x_train.shape[0]
    num_steps = total_examples//batch_size
    #print(num_steps)
    costs = []
    for i in range(0, num_epochs):
        #print("***********epoch = ",i)
        par_update = 0
        for j in range(num_steps):
            
            start = j*batch_size
            end = start+batch_size
            x = x_train[start:end].T
            y = y_train[start:end].T
                
            yhat, temps = forward_pass(x, parameters)
            #cost = cost_function(yhat, y)
            grads = backward_pass(yhat,y,temps)   
            parameters,update = parameter_update_momentum(parameters, grads, update, learning_rate ,gamma)
                
            
        train_loss,valid_loss = get_train_and_validation_loss(x_train,y_train,x_valid,y_valid, parameters)
         
        train_acc= predict(x_train, y_train, parameters)

        valid_acc= predict(x_valid, y_valid, parameters)

        #wandb.log({"train_loss":train_loss,"val_loss":valid_loss,"train_accuracy":train_acc,"val_accuracy":valid_acc,"epochs":num_epochs})

#         print("train_acc",train_acc)
#         print("valid_acc",valid_acc)
    
#         print("train_loss",train_loss)
#         print("valid_loss",valid_loss) 

     
    
    return parameters,train_loss,valid_loss,train_acc,valid_acc

In [80]:
parameters,train_loss,valid_loss,train_acc,valid_acc = momentum(x_train, y_train,x_test,y_test,layer_dimensions,0.001,4,batch_size = 16)

train_acc 0.1356
valid_acc 0.057
train_loss 6.5024940389836985
valid_loss 6.745959532936545
train_acc 0.2438
valid_acc 0.084
train_loss 4.547525776683534
valid_loss 5.124292319637778
train_acc 0.3574
valid_acc 0.092
train_loss 3.2805578131815913
valid_loss 4.1286997867509045
train_acc 0.396
valid_acc 0.088
train_loss 2.4467876601497345
valid_loss 3.5230442325893057


In [77]:
def nesterov(x_train, y_train,x_valid,y_valid,layer_dimensions,learning_rate,num_epochs,gamma = 0.9,  
                    batch_type ="Min_batch",batch_size = 16):
    

    #np.random.seed(1)         
    
    parameters = initialize_parameters(layer_dimensions)
    update = initialize_update(layer_dimensions)
    
    if batch_type == "SGD":
        batch_size =1
    elif batch_type ==  "Mini_batch":
        batch_size = batch_size
    elif batch_type == "Full_batch":
        batch_size = x_train.shape[0]
        
    total_examples = x_train.shape[0]
    num_steps = total_examples//batch_size
    #print(num_steps)
    costs = []
    for i in range(0, num_epochs):
        #print("***********epoch = ",i)
        par_update = 0
        for j in range(num_steps):
            
            start = j*batch_size
            end = start+batch_size
            x = x_train[start:end].T
            y = y_train[start:end].T 
            
            lookahead_parameters = find_lookahead_parameters(parameters,update,gamma)
            #print("lookahead_parameters",lookahead_parameters)
            yhat, temps = forward_pass(x, lookahead_parameters)
            #cost = cost_function(yhat, y)
            grads = backward_pass(yhat,y,temps)
            parameters,update = parameter_update_momentum(parameters, grads, update, learning_rate ,gamma)
                
            
        train_loss,valid_loss = get_train_and_validation_loss(x_train,y_train,x_valid,y_valid, parameters)
         

        train_acc= predict(x_train, y_train, parameters)

        valid_acc= predict(x_valid, y_valid, parameters)

        #wandb.log({"train_loss":train_loss,"val_loss":valid_loss,"train_accuracy":train_acc,"val_accuracy":valid_acc,"epochs":num_epochs})

#         print("train_acc",train_acc)
#         print("valid_acc",valid_acc)
    
#         print("train_loss",train_loss)
#         print("valid_loss",valid_loss) 

    return parameters,train_loss,valid_loss,train_acc,valid_acc

In [78]:
parameters,train_loss,valid_loss,train_acc,valid_acc = nesterov(x_train, y_train,x_test,y_test,layer_dimensions,0.001,4,batch_size = 32)

train_acc 0.0312
valid_acc 0.03
train_loss 7.908866500589115
valid_loss 8.039728219620974
train_acc 0.127
valid_acc 0.083
train_loss 5.710033953332255
valid_loss 5.959140004484729
train_acc 0.193
valid_acc 0.088
train_loss 4.310694586395852
valid_loss 4.658753508757357
train_acc 0.2508
valid_acc 0.099
train_loss 3.455993144650091
valid_loss 3.8888236813293564


In [73]:
def rmsprop(x_train, y_train,x_valid,y_valid,layer_dimensions,learning_rate,num_epochs,batch_type ="Mini_batch",
            batch_size = 16,beta_rms = 0.9,eps = 1e-8):
    

    #np.random.seed(1)             
    
    parameters = initialize_parameters(layer_dimensions)
    velocity = initialize_velocity(layer_dimensions)

    if batch_type == "SGD":
        batch_size =1
    elif batch_type ==  "Mini_batch":
        batch_size = batch_size
    elif batch_type == "Full_batch":
        batch_size = x_train.shape[0]
        
    total_examples = x_train.shape[0]
    num_steps = total_examples//batch_size
    #print(num_steps)
    costs = []
    for i in range(0, num_epochs):
        #print("***********epoch = ",i)
        #par_update = 0
        for j in range(num_steps):
            
            start = j*batch_size
            end = start+batch_size
            x = x_train[start:end].T
            y = y_train[start:end].T    
           
            yhat, temps = forward_pass(x, parameters)
            #cost = cost_function(yhat, y)
            grads = backward_pass(yhat,y,temps)
            parameters,velocity= parameter_update_RMSProp(parameters, grads, velocity,learning_rate ,beta_rms,eps)
            #print("velocity",velocity)
                
        train_loss,valid_loss = get_train_and_validation_loss(x_train,y_train,x_valid,y_valid, parameters)
         

        train_acc= predict(x_train, y_train, parameters)

        valid_acc= predict(x_valid, y_valid, parameters)

        #wandb.log({"train_loss":train_loss,"val_loss":valid_loss,"train_accuracy":train_acc,"val_accuracy":valid_acc,"epochs":num_epochs})

#         print("train_acc",train_acc)
#         print("valid_acc",valid_acc)
    
#         print("train_loss",train_loss)
#         print("valid_loss",valid_loss) 


    return parameters,train_loss,valid_loss,train_acc,valid_acc

In [74]:
parameters,train_loss,valid_loss,train_acc,valid_acc = rmsprop(x_train, y_train,x_test,y_test,layer_dimensions,0.001,4,batch_size = 16)

***********epoch =  0
train_acc 0.4552
valid_acc 0.092
train_loss 2.1806356855477973
valid_loss 4.551088210564466
***********epoch =  1
train_acc 0.6626
valid_acc 0.103
train_loss 0.9542825603737598
valid_loss 4.18164909509122
***********epoch =  2
train_acc 0.7282
valid_acc 0.107
train_loss 0.7868521448606678
valid_loss 4.5805496712712905
***********epoch =  3
train_acc 0.7532
valid_acc 0.104
train_loss 0.7069154371636389
valid_loss 4.893737027171547


In [69]:
def adam(x_train, y_train,x_valid,y_valid,layer_dimensions,learning_rate,num_epochs,batch_type ="Mini_batch",
         batch_size = 16,beta1 = 0.9,beta2 = 0.999,eps = 1e-8):

    #np.random.seed(1)
            
    parameters = initialize_parameters(layer_dimensions)
    velocity = initialize_velocity(layer_dimensions)
    moment = initialize_moment(layer_dimensions)

    if batch_type == "SGD":
        batch_size =1
    elif batch_type ==  "Mini_batch":
        batch_size = batch_size
    elif batch_type == "Full_batch":
        batch_size = x_train.shape[0]
        
    total_examples = x_train.shape[0]
    num_steps = total_examples//batch_size
    #print(num_steps)
    costs = []
    for i in range(0, num_epochs):
        #print("***********epoch = ",i)
        par_update = 0
        for j in range(num_steps):
            
            start = j*batch_size
            end = start+batch_size
            x = x_train[start:end].T
            y = y_train[start:end].T 
        
            yhat, temps = forward_pass(x, parameters)
            #cost = cost_function(yhat, y)
            grads = backward_pass(yhat,y,temps)
            parameters,velocity, moment = parameter_update_adam(parameters, grads, velocity, moment,
                                                                    learning_rate ,beta1,beta2,eps,i)
                
        train_loss,valid_loss = get_train_and_validation_loss(x_train,y_train,x_valid,y_valid, parameters)
        
        train_acc= predict(x_train, y_train, parameters)

        valid_acc= predict(x_valid, y_valid, parameters)

        #wandb.log({"train_loss":train_loss,"val_loss":valid_loss,"train_accuracy":train_acc,"val_accuracy":valid_acc,"epochs":num_epochs})

#         print("train_acc",train_acc)
#         print("valid_acc",valid_acc)
    
#         print("train_loss",train_loss)
#         print("valid_loss",valid_loss) 

    return parameters,train_loss,valid_loss,train_acc,valid_acc

In [70]:
parameters,train_loss,valid_loss,train_acc,valid_acc = adam(x_train, y_train,x_test,y_test,layer_dimensions,0.001,4,batch_size = 16)

train_acc 0.6394
valid_acc 0.112
train_loss 0.9117365078935252
valid_loss 4.880382279524359
train_acc 0.742
valid_acc 0.106
train_loss 0.6868949246897417
valid_loss 5.415662902110358
train_acc 0.7602
valid_acc 0.108
train_loss 0.628837826059322
valid_loss 5.634467366096267
train_acc 0.7738
valid_acc 0.101
train_loss 0.5884356956556227
valid_loss 5.813887564313782


In [59]:
def nadam(x_train, y_train,x_valid,y_valid,layer_dimensions,learning_rate,num_epochs,batch_type ="Mini_batch",
          batch_size = 16,beta1 = 0.9,beta2 = 0.999,eps = 1e-8):

    #np.random.seed(1)
   
    parameters = initialize_parameters(layer_dimensions)
    velocity = initialize_velocity(layer_dimensions)
    moment = initialize_moment(layer_dimensions)

    if batch_type == "SGD":
        batch_size =1
    elif batch_type ==  "Mini_batch":
        batch_size = batch_size
    elif batch_type == "Full_batch":
        batch_size = x_train.shape[0]
        
    total_examples = x_train.shape[0]
    num_steps = total_examples//batch_size
    #print(num_steps)
    costs = []
    for i in range(0, num_epochs):
        #print("***********epoch = ",i)
        par_update = 0
        for j in range(num_steps):
            
            start = j*batch_size
            end = start+batch_size
            x = x_train[start:end].T
            y = y_train[start:end].T
            

            yhat, temps = forward_pass(x, parameters)
            #cost = cost_function(yhat, y)
            grads = backward_pass(yhat,y,temps)
            parameters,velocity, moment = parameter_update_nadam(parameters, grads, velocity, moment,learning_rate ,beta1,beta2,eps,i)
              
            
        train_loss,valid_loss = get_train_and_validation_loss(x_train,y_train,x_valid,y_valid, parameters)
         
        train_acc= predict(x_train, y_train, parameters)

        valid_acc= predict(x_valid, y_valid, parameters)

        #wandb.log({"train_loss":train_loss,"val_loss":valid_loss,"train_accuracy":train_acc,"val_accuracy":valid_acc,"epochs":num_epochs})

#         print("train_acc",train_acc)
#         print("valid_acc",valid_acc)
    
#         print("train_loss",train_loss)
#         print("valid_loss",valid_loss) 

    return parameters,train_loss,valid_loss,train_acc,valid_acc

In [60]:
parameters,train_loss,valid_loss,train_acc,valid_acc = nadam(x_train, y_train,x_test,y_test,layer_dimensions,0.001,4,batch_size = 16)

***********epoch =  0
train_acc 0.7182
valid_acc 0.103
train_loss 0.7588116583598381
valid_loss 5.532759249684952
***********epoch =  1
train_acc 0.7532
valid_acc 0.109
train_loss 0.6719356324351495
valid_loss 5.705713936570341
***********epoch =  2
train_acc 0.7708
valid_acc 0.109
train_loss 0.622007553933051
valid_loss 5.8413296907360035
***********epoch =  3
train_acc 0.7756
valid_acc 0.109
train_loss 0.6013220210516357
valid_loss 5.885931881999552


In [236]:
#parameters = L_layer_network(x_train, y_train, layer_dimensions,0.001,5,grad_descent_type = "Nadam",print_cost = True)

In [237]:
def train():
    import numpy as np
    import wandb
    config_defaults = {
        'epochs': 5
    }
    wandb.init(config=config_defaults, magic=True)


    parameters = L_layer_network(x_train, y_train,x_test,y_test, layer_dimensions,0.001,num_epochs = wandb.config.epochs,grad_descent_type = "Nadam",print_cost = True)

    # accuracy = predict(x_test, y_test, parameters)
    # train_accu = accuracy = predict(x_train, y_train, parameters)
    # print("train_accu",train_accu)
    # print("test_accu",accuracy)
    # wandb.log({"metric": accuracy})



In [238]:
import wandb
wandb.agent(sweep_id, function=train)

[34m[1mwandb[0m: Agent Starting Run: emcuevl1 with config:
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.01


lr 0.001
train_acc 0.0658
valid_acc 0.07
train_loss 8.014049489115243
valid_loss 8.065007537372505
train_acc 0.0084
valid_acc 0.017
train_loss 6.790457981108266
valid_loss 6.960796105340185
train_acc 0.0082
valid_acc 0.014
train_loss 6.087886177677672
valid_loss 6.326531817923793
train_acc 0.0448
valid_acc 0.034
train_loss 5.602337086030147
valid_loss 5.899195748521462
train_acc 0.1094
valid_acc 0.088
train_loss 5.222731535389936
valid_loss 5.56774708841198


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,5.22273
val_loss,5.56775
train_accuracy,0.1094
val_accuracy,0.088
epochs,5.0
_runtime,9.0
_timestamp,1615446093.0
_step,4.0


0,1
train_loss,█▅▃▂▁
val_loss,█▅▃▂▁
train_accuracy,▅▁▁▄█
val_accuracy,▆▁▁▃█
epochs,▁▁▁▁▁
_runtime,▁▂▄▅█
_timestamp,▁▂▄▅█
_step,▁▃▅▆█


[34m[1mwandb[0m: Agent Starting Run: 2sojj7gf with config:
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001


lr 0.001
train_acc 0.0
valid_acc 0.0
train_loss 10.543932199319897
valid_loss 10.718200727035844
train_acc 0.0
valid_acc 0.0
train_loss 8.979882154622578
valid_loss 9.206671679886757
train_acc 0.0
valid_acc 0.0
train_loss 8.029986859851807
valid_loss 8.302931711282351
train_acc 0.0762
valid_acc 0.052
train_loss 7.372025831835371
valid_loss 7.693614822722848
train_acc 0.0954
valid_acc 0.063
train_loss 6.844478835602165
valid_loss 7.215963219127262


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,6.84448
val_loss,7.21596
train_accuracy,0.0954
val_accuracy,0.063
epochs,5.0
_runtime,9.0
_timestamp,1615446110.0
_step,4.0


0,1
train_loss,█▅▃▂▁
val_loss,█▅▃▂▁
train_accuracy,▁▁▁▇█
val_accuracy,▁▁▁▇█
epochs,▁▁▁▁▁
_runtime,▁▂▅▇█
_timestamp,▁▂▅▇█
_step,▁▃▅▆█


[34m[1mwandb[0m: Agent Starting Run: nxf2cl6d with config:
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.01


lr 0.001
train_acc 0.0894
valid_acc 0.095
train_loss 8.875163595839194
valid_loss 9.193899925557663
train_acc 0.0898
valid_acc 0.09
train_loss 7.40693132992241
valid_loss 7.896193394366308
train_acc 0.0872
valid_acc 0.079
train_loss 6.446696294606317
valid_loss 7.100006849038933
train_acc 0.08
valid_acc 0.071
train_loss 5.734272432886347
valid_loss 6.518898022836032
train_acc 0.1024
valid_acc 0.077
train_loss 5.16391608310028
valid_loss 6.04829603858233
train_acc 0.1202
valid_acc 0.086
train_loss 4.693306614709044
valid_loss 5.662984776714891
train_acc 0.1818
valid_acc 0.092
train_loss 4.307344214925584
valid_loss 5.362395729114272
train_acc 0.2762
valid_acc 0.092
train_loss 3.999501000547501
valid_loss 5.133093261510167
train_acc 0.3246
valid_acc 0.092
train_loss 3.7514335294381054
valid_loss 4.966589593467037
train_acc 0.344
valid_acc 0.094
train_loss 3.5472035930260093
valid_loss 4.838023014165415


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,3.5472
val_loss,4.83802
train_accuracy,0.344
val_accuracy,0.094
epochs,10.0
_runtime,15.0
_timestamp,1615446132.0
_step,9.0


0,1
train_loss,█▆▅▄▃▃▂▂▁▁
val_loss,█▆▅▄▃▂▂▁▁▁
train_accuracy,▁▁▁▁▂▂▄▆▇█
val_accuracy,█▇▃▁▃▅▇▇▇█
epochs,▁▁▁▁▁▁▁▁▁▁
_runtime,▁▂▃▄▄▅▆▇▇█
_timestamp,▁▂▃▄▄▅▆▇▇█
_step,▁▂▃▃▄▅▆▆▇█


[34m[1mwandb[0m: Agent Starting Run: 0nnx7i16 with config:
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.001


lr 0.001
train_acc 0.047
valid_acc 0.045
train_loss 6.637914699777414
valid_loss 6.765216451687727
train_acc 0.092
valid_acc 0.096
train_loss 5.669588925765528
valid_loss 5.888498836137309
train_acc 0.1512
valid_acc 0.099
train_loss 5.028438125224394
valid_loss 5.3737688583333885
train_acc 0.1758
valid_acc 0.1
train_loss 4.578445187604828
valid_loss 5.008838112444187
train_acc 0.1916
valid_acc 0.093
train_loss 4.226851066539303
valid_loss 4.71733004182667
train_acc 0.2264
valid_acc 0.096
train_loss 3.932345330673675
valid_loss 4.470378560647238
train_acc 0.2378
valid_acc 0.095
train_loss 3.677123660417081
valid_loss 4.254626635529774
train_acc 0.2416
valid_acc 0.093
train_loss 3.4513850621181303
valid_loss 4.062834099710531
train_acc 0.2466
valid_acc 0.091
train_loss 3.249565999863409
valid_loss 3.8931527235169088
train_acc 0.2732
valid_acc 0.098
train_loss 3.0687593217052274
valid_loss 3.7445019711729515


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,3.06876
val_loss,3.7445
train_accuracy,0.2732
val_accuracy,0.098
epochs,10.0
_runtime,16.0
_timestamp,1615446155.0
_step,9.0


0,1
train_loss,█▆▅▄▃▃▂▂▁▁
val_loss,█▆▅▄▃▃▂▂▁▁
train_accuracy,▁▂▄▅▅▇▇▇▇█
val_accuracy,▁▇██▇▇▇▇▇█
epochs,▁▁▁▁▁▁▁▁▁▁
_runtime,▁▂▃▃▄▅▆▆▇█
_timestamp,▁▂▃▃▄▅▆▆▇█
_step,▁▂▃▃▄▅▆▆▇█


[34m[1mwandb[0m: Agent Starting Run: ncx9mhy4 with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 0.01


lr 0.001
train_acc 0.0
valid_acc 0.001
train_loss 7.549228449670281
valid_loss 7.983953743680567
train_acc 0.0118
valid_acc 0.009
train_loss 6.2263216179785354
valid_loss 6.875233607655555
train_acc 0.0306
valid_acc 0.012
train_loss 5.618536010498709
valid_loss 6.243175216838216
train_acc 0.058
valid_acc 0.033
train_loss 5.2615863438342
valid_loss 5.916012374049484
train_acc 0.1204
valid_acc 0.06
train_loss 4.996964253285584
valid_loss 5.686989017639533
train_acc 0.165
valid_acc 0.068
train_loss 4.775864485659197
valid_loss 5.504493220085337
train_acc 0.1916
valid_acc 0.075
train_loss 4.579756310184947
valid_loss 5.352636532161066
train_acc 0.21
valid_acc 0.079
train_loss 4.401100101121431
valid_loss 5.222650398916747
train_acc 0.224
valid_acc 0.086
train_loss 4.236637114362814
valid_loss 5.108068952073165
train_acc 0.2354
valid_acc 0.089
train_loss 4.084437049938498
valid_loss 5.005160891565611
train_acc 0.2488
valid_acc 0.095
train_loss 3.942345505095787
valid_loss 4.91373411705423
t

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,3.44262
val_loss,4.665
train_accuracy,0.2668
val_accuracy,0.098
epochs,15.0
_runtime,22.0
_timestamp,1615446183.0
_step,14.0


0,1
train_loss,█▆▅▄▄▃▃▃▂▂▂▂▁▁▁
val_loss,█▆▄▄▃▃▂▂▂▂▂▁▁▁▁
train_accuracy,▁▁▂▃▄▅▆▇▇▇█████
val_accuracy,▁▂▂▃▅▆▆▆▇▇█████
epochs,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▂▂▃▃▄▄▅▆▆▆▇██
_timestamp,▁▁▂▂▃▃▄▄▅▆▆▆▇██
_step,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█


[34m[1mwandb[0m: Agent Starting Run: tw7v2k7l with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 0.001


lr 0.001
train_acc 0.0
valid_acc 0.0
train_loss 6.658841307944736
valid_loss 6.976958060995641
train_acc 0.0414
valid_acc 0.036
train_loss 5.355819783020832
valid_loss 5.815707364287723
train_acc 0.1008
valid_acc 0.067
train_loss 4.669548556739527
valid_loss 5.24575272363556
train_acc 0.1498
valid_acc 0.07
train_loss 4.191380406402284
valid_loss 4.879539365557536
train_acc 0.209
valid_acc 0.082
train_loss 3.8146215779136665
valid_loss 4.614709420983794
train_acc 0.2316
valid_acc 0.093
train_loss 3.506356402196527
valid_loss 4.407972880328785
train_acc 0.2742
valid_acc 0.097
train_loss 3.2483889781674833
valid_loss 4.238258967369941
train_acc 0.3172
valid_acc 0.098
train_loss 3.028057048505622
valid_loss 4.095419244477963
train_acc 0.3458
valid_acc 0.1
train_loss 2.837478151391276
valid_loss 3.9741860727300966
train_acc 0.366
valid_acc 0.106
train_loss 2.6717142221354795
valid_loss 3.8710649455318538
train_acc 0.392
valid_acc 0.102
train_loss 2.526875456044062
valid_loss 3.7831571791877

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,2.09322
val_loss,3.54296
train_accuracy,0.459
val_accuracy,0.099
epochs,15.0
_runtime,21.0
_timestamp,1615446211.0
_step,14.0


0,1
train_loss,█▆▅▄▄▃▃▂▂▂▂▁▁▁▁
val_loss,█▆▄▄▃▃▂▂▂▂▁▁▁▁▁
train_accuracy,▁▂▃▃▄▅▅▆▆▇▇▇███
val_accuracy,▁▃▅▆▆▇▇▇████▇██
epochs,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▂▃▃▃▄▅▅▆▆▇▇██
_timestamp,▁▁▂▃▃▃▄▅▅▆▆▇▇██
_step,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [239]:
predict(x_train, y_train, parameters)

TypeError: ignored

In [None]:
predict(x_test,y_test,parameters)

In [121]:
p = np.array([[1,2],[3,4]])
p

array([[1, 2],
       [3, 4]])

In [124]:
np.linalg.norm(p,ord = 1)

6.0