# Question 3:

Implement the backpropagation algorithm with support for the following optimisation functions

*   sgd
*   momentum based gradient descent
*   nesterov accelerated gradient descent
*   rmsprop
*   adam
*   nadam

(12 marks for the backpropagation framework and 2 marks for each of the optimisation algorithms above)

We will check the code for implementation and ease of use (e.g., how easy it is to add a new optimisation algorithm such as Eve). Note that the code should be flexible enough to work with different batch sizes.

##**Importing the FASHION-MNIST dataset**

**Furthermore, the data is segreggated into training and test data**

In [1]:
from keras.datasets import fashion_mnist
import matplotlib.pyplot as plt
import numpy as np
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
X,Y = train_images.reshape(train_images.shape[0], -1).T/225,train_labels
X_test,Y_test = test_images.reshape(test_images.shape[0], -1).T/225,test_labels

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


**Setting a part of it for validation:**

In [2]:
validation_ratio = 0.1 #percentage of data for validation
num_validation_samples = int(validation_ratio * X.shape[1])
indices = np.random.permutation(X.shape[1]) #shuffling the indices
validation_indices = indices[:num_validation_samples]
training_indices = indices[num_validation_samples:]
X_train = X[:, training_indices]
Y_train = Y[training_indices]
X_val = X[:, validation_indices]
Y_val = Y[validation_indices]

**Importing a few functions that we have defined, for example, activation functions, their derrivatives, intializers etc.**

In [4]:
from support_functions import *
from activation_functions import *
from initializers import *

##**Forward proagartion function:**

In [5]:
from forward_propagation import forward_prop #Question 2

##**Loss function:**

In [6]:
def compute_loss(Y_pred, Y, epsilon, params, lambd):
    m = Y.shape[0]
    one_hot_Y = one_hot(Y)
    cross_entropy_loss = -np.mean(one_hot_Y * np.log(Y_pred + epsilon))

    # Computing L2 regularization term
    l2_regularization = 0
    num_layers = len(params) // 2
    for i in range(1, num_layers + 1):
        l2_regularization += np.sum(np.square(params[f'W{i}']))

    l2_regularization *= (lambd / (2 * m))

    total_loss = cross_entropy_loss + l2_regularization
    return total_loss

##**Question 3 (part-A): Backward Propagation**

In [7]:
def backward_prop(params, activations, X, Y, act, lambd):
    m = Y.size
    num_layers = len(params) // 2
    one_hot_Y = one_hot(Y)

    if act == 'sigmoid':
        grad_activation = grad_sigmoid
    elif act == 'relu':
        grad_activation = grad_ReLU
    elif act == 'tanh':
        grad_activation = grad_tanh

    gradients = {}

    # Backpropagate through output layer
    da_last = - (one_hot_Y - activations[f'h{num_layers}'])
    dw_last = 1 / m * np.dot(da_last, activations[f'h{num_layers-1}'].T) + (lambd / m) * params[f'W{num_layers}']
    db_last = 1 / m * np.sum(da_last, axis=1, keepdims=True)
    gradients[f'dW{num_layers}'] = dw_last
    gradients[f'db{num_layers}'] = db_last

    # Backpropagate through hidden layers
    da = da_last
    for i in reversed(range(1, num_layers)):
        da = np.dot(params[f'W{i+1}'].T, da) * grad_activation(activations[f'a{i}'])

        dw = 1 / m * np.dot(da, activations[f'h{i-1}'].T) + (lambd / m) * params[f'W{i}']
        db = 1 / m * np.sum(da, axis=1, keepdims=True)
        gradients[f'dW{i}'] = dw
        gradients[f'db{i}'] = db

    return gradients

## **Question 3 (part-B):Gradient-descent algorithms**

In [14]:
def stochastic_gradient_descent(weight, activation, weight_decay, X, Y, X_val, Y_val, iterations, alpha, layer_sizes, batch_size):
    epsilon = 0.
    if weight=='random':
      params = init_params(layer_sizes)
    elif weight=='glorot':
      params = xavier_init_params(layer_sizes)
    else:
      print("Please choose glorot or random initialization")

    num_batches = len(X.T) // batch_size

    for j in range(iterations):

      for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        X_batch = X[:,start_idx:end_idx]
        Y_batch = Y[start_idx:end_idx]
        activations = forward_prop(params, X_batch, activation)
        train_loss = compute_loss(activations[f'h{len(layer_sizes)-1}'], Y_batch, epsilon, params, weight_decay)

        activations_val = forward_prop(params, X_val, activation)
        val_loss = compute_loss(activations_val[f'h{len(layer_sizes)-1}'], Y_val, epsilon, params, weight_decay)

        gradients = backward_prop(params, activations, X_batch, Y_batch, activation, weight_decay)
        params = update_params(params, gradients, alpha)

      print("Epoch: " + str(j+1) + "/" + str(iterations) + "; Batch:  " + str(i+1) + "/" + str(num_batches) + "; Train Loss: " + str(train_loss) + "; Val Loss: " + str(val_loss))

    return params

#momentum-gradient-descent
def momentum_gradient_descent(weight, activation, weight_decay, X, Y, X_val, Y_val, iterations, alpha, beta, layer_sizes, batch_size):
    epsilon = 0.
    if weight=='random':
      params = init_params(layer_sizes)
    elif weight=='glorot':
      params = xavier_init_params(layer_sizes)
    else:
      print("Please choose glorot or random initialization")

    history_prev = initialize_history(params)
    num_batches = len(X.T) // batch_size

    for j in range(iterations):
      for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        X_batch = X[:,start_idx:end_idx]
        Y_batch = Y[start_idx:end_idx]
        activations_val = forward_prop(params, X_val, activation)
        val_loss = compute_loss(activations_val[f'h{len(layer_sizes)-1}'], Y_val, epsilon, params, weight_decay)

        activations = forward_prop(params, X_batch, activation)
        train_loss = compute_loss(activations[f'h{len(layer_sizes)-1}'], Y_batch, epsilon, params, weight_decay)
        gradients = backward_prop(params, activations, X_batch, Y_batch, activation, weight_decay)

        history = add_dicts(gradients, history_prev, scalar2=beta)
        params = update_momentum_params(params, history, alpha)
        history_prev=history

      print("Epoch: " + str(j+1) + "/" + str(iterations) + "; Batch:  " + str(i+1) + "/" + str(num_batches) + "; Train Loss: " + str(train_loss) + "; Val Loss: " + str(val_loss))

    return params


#nesterov-gradient-descent
def nesterov_gradient_descent(weight, activation, weight_decay, X, Y, X_val, Y_val,iterations, alpha, beta, layer_sizes, batch_size):
    epsilon = 0.
    if weight=='random':
      params = init_params(layer_sizes)
    elif weight=='glorot':
      params = xavier_init_params(layer_sizes)
    else:
      print("Please choose glorot or random initialization")

    history_prev = initialize_history(params)
    num_batches = len(X.T) // batch_size

    for j in range(iterations):
      for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        X_batch = X[:,start_idx:end_idx]
        Y_batch = Y[start_idx:end_idx]
        activations_val = forward_prop(params, X_val, activation)
        val_loss = compute_loss(activations_val[f'h{len(layer_sizes)-1}'], Y_val, epsilon, params, weight_decay)

        activations = forward_prop(params, X_batch, activation)
        train_loss = compute_loss(activations[f'h{len(layer_sizes)-1}'], Y_batch, epsilon, params, weight_decay)

        nesterov_params = nesterov_params_look(params, history_prev, beta*alpha)
        nesterov_activations = forward_prop(nesterov_params, X_batch, activation)
        nesterov_gradients = backward_prop(nesterov_params, nesterov_activations, X_batch, Y_batch, activation, weight_decay)
        history = add_dicts(nesterov_gradients, history_prev, scalar2=beta)
        params = update_momentum_params(params, history, alpha)

        history_prev=history

      print("Epoch: " + str(j+1) + "/" + str(iterations) + "; Batch:  " + str(i+1) + "/" + str(num_batches) + "; Train Loss: " + str(train_loss) + "; Val Loss: " + str(val_loss))


    return params


#sto-gradient-descent-with-adagrad
def stochastic_gradient_descent_with_adagrad(weight, activation, weight_decay, X, Y, X_val, Y_val,iterations, alpha, layer_sizes, batch_size, epsilon_v):
    epsilon = 0.
    if weight=='random':
      params = init_params(layer_sizes)
    elif weight=='glorot':
      params = xavier_init_params(layer_sizes)
    else:
      print("Please choose glorot or random initialization")

    num_batches = len(X.T) // batch_size
    v_old = initialize_history(params)

    for j in range(iterations):

      for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        X_batch = X[:,start_idx:end_idx]
        Y_batch = Y[start_idx:end_idx]
        activations = forward_prop(params, X_batch, activation)
        train_loss = compute_loss(activations[f'h{len(layer_sizes)-1}'], Y_batch, epsilon, params, weight_decay)

        activations_val = forward_prop(params, X_val, activation)
        val_loss = compute_loss(activations_val[f'h{len(layer_sizes)-1}'], Y_val, epsilon, params, weight_decay)

        gradients = backward_prop(params, activations, X_batch, Y_batch, activation, weight_decay)
        gradients_squared = {key: np.square(value) for key, value in gradients.items()}
        v = add_dicts(gradients_squared,v_old)
        params = update_params_adagrad(params, gradients, alpha, v, epsilon_v)
        v_old = v
      print("Epoch: " + str(j+1) + "/" + str(iterations) + "; Batch:  " + str(i+1) + "/" + str(num_batches) + "; Train Loss: " + str(train_loss) + "; Val Loss: " + str(val_loss))


    return params


#sto-gradient-descent-with-rmsprop
def stochastic_gradient_descent_with_RMSProp(weight, activation, weight_decay, X, Y, X_val, Y_val, iterations, alpha, layer_sizes, batch_size, epsilon_v, beta):
    epsilon = 0.
    if weight=='random':
      params = init_params(layer_sizes)
    elif weight=='glorot':
      params = xavier_init_params(layer_sizes)
    else:
      print("Please choose glorot or random initialization")

    num_batches = len(X.T) // batch_size
    v_old = initialize_history(params)

    for j in range(iterations):

      for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        X_batch = X[:,start_idx:end_idx]
        Y_batch = Y[start_idx:end_idx]
        activations = forward_prop(params, X_batch, activation)
        train_loss = compute_loss(activations[f'h{len(layer_sizes)-1}'], Y_batch, epsilon, params, weight_decay)

        activations_val = forward_prop(params, X_val, activation)
        val_loss = compute_loss(activations_val[f'h{len(layer_sizes)-1}'], Y_val, epsilon, params, weight_decay)

        gradients = backward_prop(params, activations, X_batch, Y_batch, activation, weight_decay)
        gradients_squared = {key: np.square(value) for key, value in gradients.items()}

        v = add_dicts(gradients_squared,v_old,scalar1=(1-beta),scalar2=beta)
        params = update_params_adagrad(params, gradients, alpha, v, epsilon_v)
        v_old = v
      print("Epoch: " + str(j+1) + "/" + str(iterations) + "; Batch:  " + str(i+1) + "/" + str(num_batches) + "; Train Loss: " + str(train_loss) + "; Val Loss: " + str(val_loss))

    return params


#sto-gradient-descent-with-adadelta
def stochastic_gradient_descent_with_AdaDelta(weight, activation, weight_decay, X, Y, X_val, Y_val, iterations, alpha, layer_sizes, batch_size, epsilon_v, beta):
    epsilon = 0.
    if weight=='random':
      params = init_params(layer_sizes)
    elif weight=='glorot':
      params = xavier_init_params(layer_sizes)
    else:
      print("Please choose glorot or random initialization")

    num_batches = len(X.T) // batch_size
    v_old = initialize_history(params)
    u_old = initialize_history(params)

    for j in range(iterations):

      for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        X_batch = X[:,start_idx:end_idx]
        Y_batch = Y[start_idx:end_idx]
        activations = forward_prop(params, X_batch, activation)
        train_loss = compute_loss(activations[f'h{len(layer_sizes)-1}'], Y_batch, epsilon, params, weight_decay)

        activations_val = forward_prop(params, X_val, activation)
        val_loss = compute_loss(activations_val[f'h{len(layer_sizes)-1}'], Y_val, epsilon, params, weight_decay)

        gradients = backward_prop(params, activations, X_batch, Y_batch, activation, weight_decay)
        gradients_squared = {key: np.square(value) for key, value in gradients.items()}
        v = add_dicts(gradients_squared,v_old,scalar1=(1-beta),scalar2=beta)
        delw = del_w(u_old,v,gradients,epsilon_v)
        params = update_params_adadelta(params, delw)
        u = add_dicts(gradients_squared,u_old,scalar1=(1-beta),scalar2=beta)

        v_old = v
        u_old = u
      print("Epoch: " + str(j+1) + "/" + str(iterations) + "; Batch:  " + str(i+1) + "/" + str(num_batches) + "; Train Loss: " + str(train_loss) + "; Val Loss: " + str(val_loss))


    return params



#sto-gradient-descent-with-adam
def stochastic_gradient_descent_with_adam(weight, activation, weight_decay, X, Y, X_val, Y_val,iterations, alpha, layer_sizes, batch_size, epsilon_v, beta1, beta2):
    epsilon = 0.
    if weight=='random':
      params = init_params(layer_sizes)
    elif weight=='glorot':
      params = xavier_init_params(layer_sizes)
    else:
      print("Please choose glorot or random initialization")

    num_batches = len(X.T) // batch_size
    m_old = initialize_history(params)
    v_old = initialize_history(params)

    for j in range(iterations):

      for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        X_batch = X[:,start_idx:end_idx]
        Y_batch = Y[start_idx:end_idx]
        activations = forward_prop(params, X_batch, activation)
        train_loss = compute_loss(activations[f'h{len(layer_sizes)-1}'], Y_batch, epsilon,params, weight_decay)
        activations_val = forward_prop(params, X_val, activation)
        val_loss = compute_loss(activations_val[f'h{len(layer_sizes)-1}'], Y_val, epsilon, params, weight_decay)
        gradients = backward_prop(params, activations, X_batch, Y_batch, activation, weight_decay)
        gradients_squared = {key: np.square(value) for key, value in gradients.items()}
        m = add_dicts(gradients,m_old,scalar1=(1-beta1),scalar2=beta1)
        m_bar = dict_div(m,(1-beta1))
        v = add_dicts(gradients_squared,v_old,scalar1=(1-beta2),scalar2=beta2)
        v_bar = dict_div(v,(1-beta2))
        params = update_params_adam(params, m_bar, v_bar, epsilon_v, alpha)
        v_old = v
        m_old = m
      print("Epoch: " + str(j+1) + "/" + str(iterations) + "; Batch:  " + str(i+1) + "/" + str(num_batches) + "; Train Loss: " + str(train_loss) + "; Val Loss: " + str(val_loss))


    return params



#sto-gradient-descent-with-nadam
def stochastic_gradient_descent_with_nadam(weight, activation, weight_decay, X, Y, X_val, Y_val, iterations, alpha, layer_sizes, batch_size, epsilon_v, beta1, beta2):
    epsilon = 0.
    if weight=='random':
      params = init_params(layer_sizes)
    elif weight=='glorot':
      params = xavier_init_params(layer_sizes)
    else:
      print("Please choose glorot or random initialization")

    num_batches = len(X.T) // batch_size
    m_old = initialize_history(params)
    v_old = initialize_history(params)

    for j in range(iterations):

      for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        X_batch = X[:,start_idx:end_idx]
        Y_batch = Y[start_idx:end_idx]
        activations = forward_prop(params, X_batch, activation)
        train_loss = compute_loss(activations[f'h{len(layer_sizes)-1}'], Y_batch, epsilon,params,weight_decay)
        activations_val = forward_prop(params, X_val, activation)
        val_loss = compute_loss(activations_val[f'h{len(layer_sizes)-1}'], Y_val, epsilon, params, weight_decay)
        gradients = backward_prop(params, activations, X_batch, Y_batch, activation, weight_decay)
        gradients_squared = {key: np.square(value) for key, value in gradients.items()}
        m = add_dicts(gradients,m_old,scalar1=(1-beta1),scalar2=beta1)
        m_bar = dict_div(m,(1-beta1))
        v = add_dicts(gradients_squared,v_old,scalar1=(1-beta2),scalar2=beta2)
        v_bar = dict_div(v,(1-beta2))
        bracterm = add_dicts(m,gradients,scalar1=beta1,scalar2=1.)
        params = update_params_adam(params, bracterm, v_bar, epsilon_v, alpha)
        v_old = v
        m_old = m
      print("Epoch: " + str(j+1) + "/" + str(iterations) + "; Batch:  " + str(i+1) + "/" + str(num_batches) + "; Train Loss: " + str(train_loss) + "; Val Loss: " + str(val_loss))

    return params

## **Question 3 (part-C):Updating parameters, part of gradient-descent**

In [9]:
#vanill_update_parameters
def update_params(params, gradients, alpha):
    updated_params = {}
    num_layers = len(params) // 2
    for i in range(1, num_layers + 1):
        updated_params[f'W{i}'] = params[f'W{i}'] - alpha * gradients[f'dW{i}']
        updated_params[f'b{i}'] = params[f'b{i}'] - alpha * gradients[f'db{i}']
    return updated_params

#adagrad_update_parameters
def update_params_adagrad(params, gradients, alpha, v, epsilon):
    updated_params = {}
    num_layers = len(params) // 2
    for i in range(1, num_layers + 1):
        updated_params[f'W{i}'] = params[f'W{i}'] - alpha/np.sqrt(v[f'dW{i}']+epsilon) * gradients[f'dW{i}']
        updated_params[f'b{i}'] = params[f'b{i}'] - alpha/np.sqrt(v[f'db{i}']+epsilon) * gradients[f'db{i}']
    return updated_params

#nesterov_update_parameters
def update_nesterov_params(params, history, alpha):
    updated_params = {}
    num_layers = len(params) // 2
    for i in range(1, num_layers + 1):
        updated_params[f'W{i}'] = params[f'W{i}'] - alpha * history[f'W{i}']
        updated_params[f'b{i}'] = params[f'b{i}'] - alpha * history[f'b{i}']
    return updated_params

#adadelta_update_parameters
def update_params_adadelta(params, delw):
    updated_params = {}
    num_layers = len(params) // 2
    for i in range(1, num_layers + 1):
        updated_params[f'W{i}'] = params[f'W{i}'] + delw[f'dW{i}']
        updated_params[f'b{i}'] = params[f'b{i}'] + delw[f'db{i}']
    return updated_params

#momentum_update_parameters
def update_momentum_params(params, history, alpha):
    updated_params = {}
    num_layers = len(params) // 2
    for i in range(1, num_layers + 1):
        updated_params[f'W{i}'] = params[f'W{i}'] - alpha * history[f'dW{i}']
        updated_params[f'b{i}'] = params[f'b{i}'] - alpha * history[f'db{i}']
    return updated_params

#adam_update_parameters
def update_params_adam(params, m_bar, v_bar, epsilon, alpha):
    updated_params = {}
    num_layers = len(params) // 2
    for i in range(1, num_layers + 1):
        updated_params[f'W{i}'] = params[f'W{i}'] - alpha/(np.sqrt(v_bar[f'dW{i}'])+epsilon) *   m_bar[f'dW{i}']
        updated_params[f'b{i}'] = params[f'b{i}'] - alpha/(np.sqrt(v_bar[f'db{i}'])+epsilon) *   m_bar[f'db{i}']
    return updated_params

#nadam_update_parameters
def update_params_nadam(params, bracterm, v_bar, epsilon, alpha):
    updated_params = {}
    num_layers = len(params) // 2
    for i in range(1, num_layers + 1):
        updated_params[f'W{i}'] = params[f'W{i}'] - alpha/(np.sqrt(v_bar[f'dW{i}'])+epsilon) *   bracterm[f'dW{i}']
        updated_params[f'b{i}'] = params[f'b{i}'] - alpha/(np.sqrt(v_bar[f'db{i}'])+epsilon) *   bracterm[f'db{i}']
    return updated_params

#nesterov_look_ahead
def nesterov_params_look(params, u, beta):
    updated_params = {}
    num_layers = len(params) // 2
    for i in range(1, num_layers + 1):
        updated_params[f'W{i}'] = params[f'W{i}'] - beta * u[f'dW{i}']
        updated_params[f'b{i}'] = params[f'b{i}'] - beta * u[f'db{i}']
    return updated_params

#nesterov_look_ahead
def del_w(u,v,gradients,epsilon):
    updated_delw = {}
    num_layers = len(params) // 2
    for i in range(1, num_layers + 1):
            updated_delw[f'dW{i}'] = - (np.sqrt(u[f'dW{i}']+epsilon)/np.sqrt(v[f'dW{i}']+epsilon)) * gradients[f'dW{i}']
            updated_delw[f'db{i}'] = - (np.sqrt(u[f'db{i}']+epsilon)/np.sqrt(v[f'db{i}']+epsilon)) * gradients[f'db{i}']
    return updated_delw

##**Traning the model**

In [22]:
max_epochs=5
no_hidden_layers=3
size_of_hidden_layer=32
weight_decay = 0.
alpha=1e-3
batch_size=32
weight='glorot' #glorot,random
activation='relu' #sigmoid,tanh,relu

input_size,output_size = 784,10
layer_sizes = [input_size] + [size_of_hidden_layer] * no_hidden_layers + [output_size]
beta=0.9
epsilon=1e-6
beta1=0.9
beta2=0.999

def train_model(opt):
  if opt == 'sgd':
    params = stochastic_gradient_descent(weight, activation, weight_decay, X_train, Y_train, X_val, Y_val, max_epochs, alpha, layer_sizes, batch_size)
  elif opt == 'momentum':
    params = momentum_gradient_descent(weight, activation, weight_decay, X_train, Y_train, X_val, Y_val, max_epochs, alpha, beta, layer_sizes, batch_size)
  elif opt == 'nesterov':
    params = nesterov_gradient_descent(weight, activation, weight_decay, X_train, Y_train, X_val, Y_val, max_epochs, alpha, beta, layer_sizes, batch_size)
  elif opt == 'rmsprop':
    params = stochastic_gradient_descent_with_RMSProp(weight, activation, weight_decay, X_train, Y_train, X_val, Y_val,max_epochs, alpha, layer_sizes, batch_size, epsilon, beta)
  elif opt == 'adam':
    params = stochastic_gradient_descent_with_adam(weight, activation, weight_decay, X_train, Y_train, X_val, Y_val, max_epochs, alpha, layer_sizes, batch_size, epsilon, beta1, beta2)
  elif opt == 'nadam':
    params = stochastic_gradient_descent_with_nadam(weight, activation, weight_decay, X_train, Y_train, X_val, Y_val, max_epochs, alpha, layer_sizes, batch_size, epsilon, beta1, beta2)
  else:
    raise ValueError(f"Invalid optimizer option: {opt}")

  # Accuracy checker:
  activations_test = forward_prop(params, X_test, activation)
  test = activations_test[f'h{len(layer_sizes)-1}']
  reverse_onehat = np.argmax(test, axis=0)
  test_accuracy = (np.count_nonzero((reverse_onehat-Y_test) == 0)/len(Y_test))*100
  print("Accuracy on test data= " + str(test_accuracy) + " %")

In [18]:
train_model('momentum')

Epoch: 1/5; Batch:  1687/1687; Train Loss: 0.06705850927382326; Val Loss: 0.053947216686166144
Epoch: 2/5; Batch:  1687/1687; Train Loss: 0.06280388096658601; Val Loss: 0.04691587561810313
Epoch: 3/5; Batch:  1687/1687; Train Loss: 0.06165828695339315; Val Loss: 0.043509580832396075
Epoch: 4/5; Batch:  1687/1687; Train Loss: 0.06132715088420857; Val Loss: 0.04146964674878003
Epoch: 5/5; Batch:  1687/1687; Train Loss: 0.05873975823696202; Val Loss: 0.039896586536219365
Accuracy on test data= 84.46000000000001 %


In [23]:
train_model('adam')

Epoch: 1/5; Batch:  1687/1687; Train Loss: 0.05998754623068191; Val Loss: 0.04308040804055756
Epoch: 2/5; Batch:  1687/1687; Train Loss: 0.054587721229956275; Val Loss: 0.03935447301678587
Epoch: 3/5; Batch:  1687/1687; Train Loss: 0.05119604347717123; Val Loss: 0.037454337666288556
Epoch: 4/5; Batch:  1687/1687; Train Loss: 0.04914348318689472; Val Loss: 0.03626974243782838
Epoch: 5/5; Batch:  1687/1687; Train Loss: 0.04627860513528143; Val Loss: 0.035558012213474814
Accuracy on test data= 85.88 %
