In [None]:
!pip install wandb
!pip install tensorflow
!pip install keras
import pprint

import keras
import numpy as np
import tensorflow as tf
import wandb
from keras.datasets import fashion_mnist
from keras.datasets import mnist
from tqdm.auto import tqdm

# Question 1

In [None]:
def load_fashion_mnist(return_images=False, test=False):
  '''
  Loads Fashion-MNIST through keras, shuffles it, scales pixels to [0, 1] and splits it.

  Arguments :
    return_images - only used when test is False : if True the images keep the shape
                    returned by fashion_mnist.load_data(), otherwise every image is
                    flattened to a row of 784 values
    test - False : train = first 54000 shuffled training examples,
                   val = the remaining training examples, test = the test split
           True  : train = all training examples, val = the test split
                   (images are always flattened in this mode)

  Returns :
    dict keyed by split name ('train', 'val' and, when test is False, 'test'),
    each value being {'X': float32 inputs, 'Y': labels}
  '''

  (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

  # np.random.shuffle works in place and returns None, so indexing with its
  # result never reordered anything; permutation returns the shuffled indices
  train_order = np.random.permutation(len(x_train))
  x_train, y_train = x_train[train_order], y_train[train_order]

  test_order = np.random.permutation(len(x_test))
  x_test, y_test = x_test[test_order], y_test[test_order]

  x_train = (x_train / 255).astype('float32')
  x_test = (x_test / 255).astype('float32')

  def flatten(images):
    # one row of 784 pixel values per image
    return images.reshape([-1, 784])

  if test==False:
    x_train, x_val = x_train[:54000], x_train[54000:]
    y_train, y_val = y_train[:54000], y_train[54000:]

    if (return_images==False):
      return {
          'train': {
              'X': flatten(x_train),
              'Y': y_train.reshape(-1)
          },
          'val': {
              'X': flatten(x_val),
              'Y': y_val.reshape(-1)
          },
          'test': {
              'X': flatten(x_test),
              'Y': y_test.reshape(-1)
          }
      }

    return {
        'train': {
            'X': x_train,
            'Y': y_train
        },
        'val': {
            'X': x_val,
            'Y': y_val
        },
        'test': {
            'X': x_test,
            'Y': y_test
        }
    }

  print('train = Old Train + Old Val', 'val = Old Test', sep='\n')
  return {
      'train': {
          'X': flatten(x_train),
          'Y': y_train.reshape(-1)
      },
      'val': {
          'X': flatten(x_test),
          'Y': y_test.reshape(-1)
      }
  }

# load the dataset once with the default arguments (return_images=False, test=False);
# the resulting dict is the module-level `data` that sweep_module.train hands to the optimizers
data = load_fashion_mnist()

# Question 2

In [None]:
class neural_network:
  '''
  Fully connected feed-forward classifier with a softmax output layer.

  Attributes :
    dict_layers  - layer description; 'num_hidden_layers' is read by forward_prop / back_prop
    weights_list - one weight matrix per layer, input side first, each of shape (out, in)
    biases_list  - one bias column per layer, input side first, each of shape (out, 1)

  Inside forward_prop / back_prop every example is a column : X is (features, batch)
  and Y_hat is (classes, batch). grad_wandb and predict accept (batch, features) and transpose.
  '''

  # constructor function - initializes weights
  def __init__(self, dict_layers, initializer):
    '''
    Arguments : dict_layers - layer description forwarded to wandb_initializer,
                initializer - initialization scheme name forwarded to wandb_initializer
    '''
    self.dict_layers = dict_layers
    self.weights_list, self.biases_list = wandb_initializer(dict_layers, [], [], initializer)

  # function to compute forward propagation
  def forward_prop(self, W, b, X, Y, activation_func):
    '''
    This function implements forward propagation

      forward propagation :
      A_i = W_i*H_(i-1) + b_i
      H_i = activation_func(A_i)

      Output = softmax(A_L)

    Arguments : W, b - weights and biases to use, X - inputs as columns,
                Y - unused, activation_func - name of a method of `activation`

    Returns :
    dict with 'Y_hat' (output), 'A' (pre-activations, L+1 entries) and 'H' (activations, L entries)
    '''
    A = []
    H = []

    H_pre = X
    L = self.dict_layers['num_hidden_layers']
    act = getattr(activation, activation_func)

    for i in range(L) :
      A.append(W[i] @ H_pre + b[i])
      H_pre = act(A[i])
      H.append(H_pre)

    A.append(W[L] @ H_pre + b[L])

    return {
        'A' : A,
        'H' : H,
        'Y_hat' : activation.softmax(A[L])
    }

  # helper function to perform forward propagation with the network's own parameters
  def self_forward_prop(self, X, Y, activation_func) :
    return self.forward_prop(self.weights_list, self.biases_list, X, Y, activation_func)

  # function to perform backward propagation
  def back_prop(self, W, b, A, H, Y_hat, X, Y,activation_func):
    '''
    This function implements backpropagation :

      Backprop :
      L = cross-entropy loss

      δL/δW_i = (δL/δA_i)(H_(i-1))^T
      δL/δb = δL/δA_i
      δL/δH_(i-1) = (W_i)^T(δL/δA)
      δL/δA_(i-1) = δL/δH_(i-1) ⊙ g'(H_(i-1))

    Inputs : Y_hat - Y predicted (classes, batch), Y - true class indices,
             X - inputs as columns, H - activations from forward propagation
             (A and b are accepted for symmetry but not read)

    Returns : dict with 'dw' and 'db', both averaged over the batch and ordered
              output layer first; every 'db' entry is 1-D
    '''

    batch_size = len(Y)
    L = self.dict_layers['num_hidden_layers']
    act_der = getattr(activation, activation_func+'_der')

    # one-hot targets, one column per example
    E = np.zeros(Y_hat.shape)
    E[Y,np.arange(batch_size)] = 1

    # gradient of softmax + cross-entropy w.r.t. the output pre-activation
    grad_A = Y_hat - E

    del_w = []
    del_b = []

    for i in range(L,-1,-1) :
      # input to layer i : the data for the first layer, else the previous activation
      H_prev = X if i==0 else H[i-1]

      # H_prev is (units, batch); it has to be transposed, not reshaped, so that
      # each example's gradient is paired with that same example's activations
      del_w.append(grad_A @ H_prev.T / batch_size)
      del_b.append(np.sum(grad_A, axis=1) / batch_size)

      if i!=0 :
        grad_H = W[i].T @ grad_A
        # the *_der functions take the activation output H, not the pre-activation A
        grad_A = grad_H * act_der(H[i-1])

    return {
        'dw' : del_w,
        'db' : del_b
    }

  # helper function to perform backward propagation with the network's own parameters
  def self_back_prop(self, A, H, Y_hat, X, Y,activation_func) :
    return self.back_prop(self.weights_list, self.biases_list, A, H, Y_hat, X, Y, activation_func)

  #  function to compute gradient
  def grad_wandb(self, W, b, X, Y,activation_func):
    '''
    Function to find gradient of Cross entropy loss with respect to given W and b

    Arguments : W, b and data (X with one example per row, Y class indices)
    Returns : dict with 'dw' (δL/δW) and 'db' (δL/δb), ordered output layer first
    '''

    # one example per column; the feature count is taken from the first weight matrix
    X = X.T.reshape((W[0].shape[1],-1))

    fwd = self.forward_prop(W, b, X, Y, activation_func)
    grads = self.back_prop(W, b, fwd['A'], fwd['H'], fwd['Y_hat'], X, Y, activation_func)

    return {
        'dw' : grads['dw'],
        'db' : grads['db']
    }

  # helper function to compute gradient with the network's own parameters
  def self_grad_wandb(self, X, Y, activation_func) :
    return self.grad_wandb(self.weights_list, self.biases_list, X, Y, activation_func)

  # function to compute predictions
  def predict(self, X, activation_func):
    '''
    Function to take X (one example per row) and give the prediction of the network on this X

    Returns : dict with 'Y' (predicted class per example) and 'Y_hat' (class probabilities, one column per example)
    '''
    X = X.T.reshape((self.weights_list[0].shape[1],-1))
    fwd = self.forward_prop(self.weights_list, self.biases_list, X, 0, activation_func)
    return {
      'Y' : np.argmax(fwd['Y_hat'],axis=0),
      'Y_hat' : fwd['Y_hat']
    }

  # function to update weights and biases
  def update_vals(self, dw, db, wd) :
    '''
    Function to update parameters of the network
    Arguments : change in weights (dw) , change in biases (db) , both ordered output layer first
                Regularization parameter : wd (applied to weights only)
    '''
    L = len(self.weights_list)
    for i in range(L) :
      # updates are stored output layer first, so layer i reads entry L-i-1
      self.weights_list[i] = self.weights_list[i] - dw[L-i-1].reshape(self.weights_list[i].shape) - wd * self.weights_list[i]
      self.biases_list[i] = self.biases_list[i] - db[L-i-1].reshape(self.biases_list[i].shape)
##################################################################################
class activation:
  '''
  Activation functions and their derivatives, looked up by name with getattr.

  The *_der functions take the activation OUTPUT (H), not the pre-activation :
  e.g. sigmoid_der(h) = h * (1 - h) where h = sigmoid(a).
  '''

  @staticmethod
  def sigmoid(z):
    # for very negative z np.exp(-z) overflows to inf and the result saturates to 0
    return 1 / (1 + np.exp(-z))

  @staticmethod
  def relu(z):
    # np.maximum also maps -inf to 0, where (z>0)*z would give nan
    return np.maximum(z, 0)

  @staticmethod
  def tanh(z):
    return np.tanh(z)

  @staticmethod
  def sigmoid_der(z) :
    return z * (1-z)

  @staticmethod
  def relu_der(z) :
    return (z>0)

  @staticmethod
  def tanh_der(z):
    return 1 - z*z

  @staticmethod
  def softmax(x):
    '''
    Softmax along axis 0 (one distribution per column for 2-D input).

    The maximum is subtracted per column : subtracting one global maximum lets a
    column that lies far below it underflow to all zeros and return 0/0 = nan.
    '''
    e_x = np.exp(x - np.max(x, axis=0, keepdims=True))
    return e_x / np.sum(e_x,axis=0)


##################################################################################
def set_nn_shape(verbose=True, num_hidden_layers=-1, hidden_layer_size=-1):
  '''
  Describes the network architecture as a dictionary.

  Arguments :
    verbose - when True the two hidden-layer settings are read from standard input
              and replace whatever was passed in
    num_hidden_layers - number of hidden layers (used when verbose is False)
    hidden_layer_size - units in each hidden layer (used when verbose is False)

  Returns :
    dict with 'input_layer_size' (784), 'hidden_layer_size',
    'output_layer_size' (10) and 'num_hidden_layers'
  '''
  if verbose:
    # interactive mode : prompt for the depth first, then for the width
    print("\nNumber Of Hidden Layers:")
    num_hidden_layers = int(input())

    print("\nSize Of Each Hidden Layer:")
    hidden_layer_size = int(input())

    print(f"\nThe Neural Network Has {num_hidden_layers+2} Layers In Total!")

  shape = {}
  shape["input_layer_size"] = 784
  shape["hidden_layer_size"] = hidden_layer_size
  shape["output_layer_size"] = 10
  shape["num_hidden_layers"] = num_hidden_layers
  return shape


In [None]:
def wandb_initializer(nn_shape, weights_list, biases_list, type='random', mu = 0, sigma = 1):
  '''
  Appends freshly initialized weights and biases ("w and b") for every layer to the given lists.

  Arguments :
    nn_shape - dict with 'input_layer_size', 'hidden_layer_size', 'output_layer_size'
               and 'num_hidden_layers'
    weights_list, biases_list - lists that are appended to in place (input side first)
    type - 'random' : truncated normal with mean mu and standard deviation sigma
           'xavier' : Glorot normal (mu and sigma are ignored)
    mu, sigma - parameters of the 'random' scheme

  Returns : (weights_list, biases_list) - the same list objects that were passed in.
            Weights have shape (out, in), biases have shape (out, 1).

  Raises : ValueError if type is not one of the supported schemes.
  '''

  # reject unknown schemes up front instead of failing later on an unbound variable
  if type not in ('random', 'xavier'):
    raise ValueError(f"Unknown initializer type {type!r}; expected 'random' or 'xavier'")

  # random initialization
  if (type=='random'):
    initializer = tf.keras.initializers.TruncatedNormal(mean=mu, stddev=sigma)

  # xavier initialization
  else:
    initializer = tf.keras.initializers.GlorotNormal()

  n_in, n_hidden, n_out = nn_shape['input_layer_size'], nn_shape['hidden_layer_size'], nn_shape['output_layer_size']

  # (out, in) per layer : input -> hidden, hidden -> hidden (num_hidden_layers - 1 times), hidden -> output
  layer_shapes = [(n_hidden, n_in)]
  layer_shapes += [(n_hidden, n_hidden)] * (nn_shape['num_hidden_layers'] - 1)
  layer_shapes.append((n_out, n_hidden))

  for rows, cols in layer_shapes:
    weights_list.append(initializer(shape=(rows, cols)).numpy())
    biases_list.append(initializer(shape=(rows, 1)).numpy())

  return weights_list, biases_list

In [None]:
class optimizer:
  '''
  Mini-batch training loops, one static method per update rule.

  Every public method takes (network, data, config), trains `network` in place on
  data['train'] for config['num_epochs'] epochs and, after every epoch, logs the
  metrics computed by run_callback to wandb.

  config keys read : 'num_epochs', 'batch_size', 'lr', 'weight_decay', 'activation'

  Gradients, updates and optimizer state are lists ordered output layer first with
  1-D bias entries, which is the layout produced by network.grad_wandb and consumed
  by network.update_vals.
  '''

  @staticmethod
  def _zero_state(network):
    '''Returns zero-filled (weights, biases) accumulators laid out like the network's gradients.'''
    zeros_w = [np.zeros(w.shape) for w in reversed(network.weights_list)]
    zeros_b = [np.zeros(b.size) for b in reversed(network.biases_list)]
    return zeros_w, zeros_b

  @staticmethod
  def _batches(data, batch_size):
    '''Yields consecutive (X, Y) mini-batches of the training split behind a progress bar.'''
    X_train, Y_train = data['train']['X'], data['train']['Y']
    for k in tqdm(range(0, len(X_train), batch_size)) :
      yield X_train[k: k+batch_size], Y_train[k: k+batch_size]

  @staticmethod
  def _log_epoch(network, data, config):
    '''Evaluates the network with run_callback and logs the epoch metrics to wandb.'''
    report = run_callback(network, data, config)

    wandb.log({
          'batch_size': config['batch_size'],
          'val_loss' : report['loss']['val'],
          'train_loss': report['loss']['train'],
          'train_acc': report['accuracy']['train'],
          'val_acc': report['accuracy']['val']
    })

  @staticmethod
  def _adam_step(g, m, v, t, eta, beta1, beta2, epsilon, nesterov):
    '''
    One Adam / Nadam step for a single parameter array.
    Returns (update to subtract, new first moment, new second moment).
    '''
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g * g

    # bias correction for the zero-initialised moments (t counts from 1)
    m_corr = 1 - beta1**t
    v_corr = 1 - beta2**t

    direction = m / m_corr
    if nesterov:
      # Nadam : look ahead by blending the corrected momentum with the current gradient
      direction = beta1 * direction + (1 - beta1) * g / m_corr

    return eta * direction / (epsilon + np.sqrt(v / v_corr)), m, v

  @staticmethod
  def _adam_family(network, data, config, beta1, beta2, epsilon, nesterov):
    '''Training loop shared by adam (nesterov=False) and nadam (nesterov=True).'''
    num_epochs, batch_size = config['num_epochs'], config['batch_size']
    eta, lambda_ = config['lr'], config['weight_decay']
    activation_func = config['activation']

    m_w, m_b = optimizer._zero_state(network)
    v_w, v_b = optimizer._zero_state(network)

    t = 0
    for _ in range(num_epochs):
      for X, Y in optimizer._batches(data, batch_size):
        t += 1
        grads = network.self_grad_wandb(X, Y, activation_func)

        dw, db = [], []
        for j in range(len(m_w)):
          step, m_w[j], v_w[j] = optimizer._adam_step(grads['dw'][j], m_w[j], v_w[j], t, eta, beta1, beta2, epsilon, nesterov)
          dw.append(step)
          step, m_b[j], v_b[j] = optimizer._adam_step(grads['db'][j], m_b[j], v_b[j], t, eta, beta1, beta2, epsilon, nesterov)
          db.append(step)

        network.update_vals(dw, db, lambda_)

      optimizer._log_epoch(network, data, config)

  @staticmethod
  def sgd(network, data, config):
    '''
    This function implements gradient descent
      Gradient descent :
      param = param - eta*δL/δparam

    '''
    num_epochs, batch_size = config['num_epochs'], config['batch_size']
    eta, lambda_ = config['lr'], config['weight_decay']
    activation_func = config['activation']

    for _ in range(num_epochs):
      for X, Y in optimizer._batches(data, batch_size):
        grads = network.self_grad_wandb(X, Y, activation_func)
        dw = [eta * g for g in grads['dw']]
        db = [eta * g for g in grads['db']]
        network.update_vals(dw, db, lambda_)

      optimizer._log_epoch(network, data, config)

  @staticmethod
  def momentum(network, data, config,gamma = 0.9) :
    '''
    This function implements momentum based Gradient descent
     Momentum :
     update_t = gamma*update_(t-1) + eta*δL/δparam
     param = param - update_t

    '''
    num_epochs, batch_size = config['num_epochs'], config['batch_size']
    eta, lambda_ = config['lr'], config['weight_decay']
    activation_func = config['activation']

    u_w, u_b = optimizer._zero_state(network)

    for _ in range(num_epochs) :
      for X, Y in optimizer._batches(data, batch_size):
        grads = network.self_grad_wandb(X, Y, activation_func)
        for j in range(len(u_w)) :
          u_w[j] = gamma * u_w[j] + eta * grads['dw'][j]
          u_b[j] = gamma * u_b[j] + eta * grads['db'][j]

        network.update_vals(u_w, u_b, lambda_)

      optimizer._log_epoch(network, data, config)

  @staticmethod
  def NAG(network, data, config,gamma = 0.9) :
    '''
    This function implements Nesterov Accelerated Gradient descent
     NAG :
     param_lookahead = param - gamma*update_(t-1)
     update_t = gamma*update_(t-1) + eta*δL/δparam_lookahead
     param = param - update_t

    '''
    num_epochs, batch_size = config['num_epochs'], config['batch_size']
    eta, lambda_ = config['lr'], config['weight_decay']
    activation_func = config['activation']

    v_w, v_b = optimizer._zero_state(network)

    for _ in range(num_epochs) :
      for X, Y in optimizer._batches(data, batch_size):
        # gamma * update_(t-1)
        v_w = [gamma * v for v in v_w]
        v_b = [gamma * v for v in v_b]

        # The look-ahead parameters must be NEW arrays : subtracting in place from
        # a shallow copy of the lists would move the network's real parameters and
        # the momentum term would then be applied a second time by update_vals.
        L = len(network.weights_list)
        W = [network.weights_list[j] - v_w[L-j-1] for j in range(L)]
        B = [network.biases_list[j] - v_b[L-j-1].reshape(network.biases_list[j].shape) for j in range(L)]

        grads = network.grad_wandb(W, B, X, Y, activation_func)

        for j in range(L) :
          v_w[j] = v_w[j] + eta * grads['dw'][j]
          v_b[j] = v_b[j] + eta * grads['db'][j]

        network.update_vals(v_w, v_b, lambda_)

      optimizer._log_epoch(network, data, config)

  @staticmethod
  def RMSprop(network, data, config,beta = 0.9,epsilon = 1e-8) :
    '''
    This function implements RMS prop
     RMSprop :
     v_t = beta*v_(t-1) + (1-beta)*(δL/δparam)**2
     param = param - (eta/sqrt(epsilon + v_t))*δL/δparam

    '''
    num_epochs, batch_size = config['num_epochs'], config['batch_size']
    eta, lambda_ = config['lr'], config['weight_decay']
    activation_func = config['activation']

    v_w, v_b = optimizer._zero_state(network)

    for _ in range(num_epochs) :
      for X, Y in optimizer._batches(data, batch_size):
        grads = network.self_grad_wandb(X, Y, activation_func)

        dw, db = [], []
        for j in range(len(v_w)) :
          g_w, g_b = grads['dw'][j], grads['db'][j]
          v_w[j] = beta * v_w[j] + (1 - beta) * g_w**2
          v_b[j] = beta * v_b[j] + (1 - beta) * g_b**2
          dw.append(eta * g_w / np.sqrt(v_w[j] + epsilon))
          db.append(eta * g_b / np.sqrt(v_b[j] + epsilon))

        network.update_vals(dw, db, lambda_)

      optimizer._log_epoch(network, data, config)

  @staticmethod
  def adam(network, data, config, beta1=0.9, beta2=0.999, epsilon=1e-8):
    '''
    This function implements Adam
    Adam is similar to RMS prop, but with momentum
     Adam :
     m_t = beta1*m_(t-1) + (1-beta1)*δL/δparam
     v_t = beta2*v_(t-1) + (1-beta2)*(δL/δparam)**2
     m_hat = m_t/(1-beta1**t) , v_hat = v_t/(1-beta2**t)
     param = param - eta*m_hat/(epsilon + sqrt(v_hat))

    '''
    optimizer._adam_family(network, data, config, beta1, beta2, epsilon, nesterov=False)

  @staticmethod
  def nadam(network, data, config, beta1=0.9, beta2=0.999, epsilon=1e-8):
    '''
    This function implements Nadam.
    Just like Adam is RMS prop with momentum,
    Nadam is RMS prop with Nesterov
     Nadam : m_t, v_t, m_hat, v_hat as in Adam, then
     param = param - eta*(beta1*m_hat + (1-beta1)*δL/δparam/(1-beta1**t))/(epsilon + sqrt(v_hat))

    '''
    optimizer._adam_family(network, data, config, beta1, beta2, epsilon, nesterov=True)


In [None]:
def _split_metrics(network, X, Y, activation_func):
    '''
    Returns (mean cross-entropy loss, accuracy) of the network on one data split.
    '''
    num_examples = len(X)
    prediction = network.predict(X, activation_func)

    num_correct = np.sum(prediction['Y'].reshape(Y.shape) == Y)

    # Y_hat has one column per example; transpose so that rows are examples
    probs = np.array(prediction['Y_hat'].T)
    true_class_probs = probs[np.arange(num_examples), Y]

    # a probability that underflowed to exactly 0 would make the loss infinite
    total_loss = np.sum(-np.log(np.clip(true_class_probs, 1e-12, None)))

    return total_loss / num_examples, num_correct / num_examples


def run_callback(network,data,config) :
    '''
    This function is used to calculate accuracy and loss

    Arguments : network - object exposing predict(X, activation_func),
                data - dict with 'train' and 'val' splits, each {'X': ..., 'Y': ...},
                config - mapping providing 'activation'

    Returns : {'loss': {'train', 'val'}, 'accuracy': {'train', 'val'}} where the loss is
              the mean cross-entropy and the accuracy is the fraction of correct predictions
    '''
    activation_func = config['activation']

    train_loss, train_acc = _split_metrics(network, data['train']['X'], data['train']['Y'], activation_func)
    val_loss, val_acc = _split_metrics(network, data['val']['X'], data['val']['Y'], activation_func)

    return  {
        'loss': {
            'train' : train_loss,
            'val' : val_loss
        },
        'accuracy': {
            'train': train_acc,
            'val': val_acc
        }
    }

    


In [None]:
# Hyper-parameter sweep definition handed to wandb.sweep.
# Every key under 'parameters' is read from the run config by the training code :
#   - 'optimizer' values are names of static methods of `optimizer`
#     (sweep_module.train dispatches on them with getattr); sgd and NAG are
#     implemented as well but are not part of this sweep
#   - 'activation' values are names of methods of `activation`
#   - 'weights_initializer' values are the `type` accepted by wandb_initializer
# NOTE(review): with method 'grid' wandb presumably runs every combination
# (3 lr x 4 optimizers x 2 batch sizes = 24 runs) — confirm against the wandb sweep docs.
sweep_config = {
    'method' : 'grid',

    'parameters': {
        'num_epochs': {
            'values': [10]
        },
        'num_hidden_layers': {
            'values': [5]
        },
        'hidden_layer_size': {
            'values': [256]
        },
        'weight_decay': {
            'values': [0]
        },
        'lr': {
            'values': [1e-4, 5e-4, 1e-3]
        },
        'optimizer': {
            'values': ['momentum', 'RMSprop', 'adam', 'nadam']
        },
        'batch_size': {
            'values': [32, 16]
        },
        'weights_initializer': {
            'values': ['xavier']
        },
        'activation': {
            'values': ['relu']
        }        
    }
}


In [None]:
# register the sweep under the wandb project 'mnist'; the returned id is what wandb.agent is started with
sweep_id = wandb.sweep(sweep_config, project='mnist')

In [None]:
# print the sweep definition so the searched values are visible in the notebook output
pprint.pprint(sweep_config)

In [None]:
class sweep_module:
  '''Holds the training entry point that is handed to wandb.agent.'''

  @staticmethod
  def train(config=None):
    '''
    Runs one training trial : starts a wandb run, builds a network from the run's
    configuration and trains it on the module-level `data` with the configured optimizer.

    Arguments : config - optional default configuration for the run; the values
                actually used are always read back from wandb.config
    '''

    # config must be passed by keyword : the first positional parameter of wandb.init is not `config`
    with wandb.init(config=config):

      config = wandb.config
      wandb.run.name = 'ac:'+config['activation'][:3]+'_opt:'+config['optimizer'][:4]+'_hl:'+str(config['num_hidden_layers'])+':'+str(config['hidden_layer_size'])

      nn_shape = set_nn_shape(False, config['num_hidden_layers'] , config['hidden_layer_size'])

      network = neural_network(nn_shape, config['weights_initializer'])

      # config['optimizer'] names a static method of `optimizer`
      train_fn = getattr(optimizer, config['optimizer'])
      train_fn(network, data, config)

In [None]:
# performing the sweep
# the agent is given the sweep id and sweep_module.train as the function to run;
# presumably it calls train once per configuration of the sweep — confirm against the wandb.agent docs
wandb.agent(sweep_id, sweep_module.train)