In [1]:
!pip install wandb
!pip install tensorflow
!pip install keras
import keras
import numpy as np
from keras.datasets import fashion_mnist
from tqdm.auto import tqdm
import tensorflow as tf
import wandb
import pprint



# Question 1

In [2]:
def load_fashion_mnist(return_images=False):

  (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

  train_shuffler = np.random.shuffle(np.arange(50000))
  x_train, y_train = x_train[train_shuffler][0], y_train[train_shuffler][0]

  test_shuffler = np.random.shuffle(np.arange(10000))
  x_test, y_test = x_test[test_shuffler][0], y_test[test_shuffler][0]

  x_train = np.array(x_train/255).astype('float32')
  x_test = np.array(x_test/255).astype('float32')

  x_train, x_val = x_train[:50000], x_train[50000:]
  y_train, y_val = y_train[:50000], y_train[50000:]


  if (return_images==False):
    return {
        'train': {
            'X': x_train.reshape([50000, 784]),
            'Y': y_train.reshape([50000])
        },
        'val': {
            'X': x_val.reshape([10000, 784]),
            'Y': y_val.reshape([10000])
        },
        'test': {
            'X': x_test.reshape([10000, 784]),
            'Y': y_test.reshape([10000])
        }
  }

  else :
    return {
      'train': {
          	'X': x_train,
          	'Y': y_train
      },
      'val': {
            'X': x_val,
            'Y': y_val
      },
      'test': {
            'X': x_test,
            'Y': y_test
      }
    }


data = load_fashion_mnist()

# Question 2

In [3]:
class neural_network:

  # constructor function - initializes weights
  def __init__(self, dict_layers, initializer):

    self.weights_list = []
    self.biases_list = []
    self.dict_layers= dict_layers

    self.weights_list, self.biases_list = wandb_initializer(dict_layers, self.weights_list, self.biases_list, initializer)

  # function to compute forward propogation
  def forward_prop(self, W, b, X, Y, activation_func):

    A = []
    H = []
    
    H_pre = X
    
    L = self.dict_layers['num_hidden_layers']

    for i in range(L) :
      A.append(W[i] @ H_pre + b[i])
      H_pre = getattr(activation, activation_func)(A[i])
      H.append(H_pre)
    
    A.append(W[L] @ H_pre + b[L])
    
    Y_hat = activation.softmax(A[L])
    
    return {
        'A' : A,
        'H' : H,
        'Y_hat' : Y_hat
    }

  # helper function to perform forward propogation 
  def self_forward_prop(self, X, Y, activation_func) :

    temp = self.forward_prop(self.weights_list,self.biases_list, X, Y, activation_func)
    return temp

  # function to perform backward propogration
  def back_prop(self, W, b, A, H, Y_hat, X, Y,activation_func):

    batch_size = len(Y)
    
    del_w = []
    del_b = []
    L = self.dict_layers['num_hidden_layers']
    
    E = np.zeros(Y_hat.shape)
    E[Y,np.arange(batch_size)] = 1
    
    grad_A = -(E - Y_hat)

    for i in range(L,-1,-1) :

      temp1 = grad_A.reshape(-1,batch_size)
      
      if i==0 :
        temp2 = X.T
      else :
        temp2 = H[i-1].reshape((batch_size ,-1))

      del_w.append(temp1 @ temp2)
      del_b.append(grad_A)

      if(i!=0) :
        grad_H = W[i].T @ grad_A      
        grad_A = grad_H * getattr(activation,activation_func+'_der')(H[i-1])

    for j in range(len(del_b)) :
       del_b[j] = np.sum(del_b[j],axis=1)

    return {
        'dw' : del_w,
        'db' : del_b
    }

  # helper function to perform backward propogation
  def self_back_prop(self, A, H, Y_hat, X, Y,activation_func) :
    temp = self.back_prop(self.weights_list,self.biases_list, A, H, Y_hat, X, Y, activation_func)
    return temp

  #  function to compute gradient
  def grad_wandb(self, W, b, X, Y,activation_func):

    X = X.T.reshape((784,-1))
    
    temp = self.forward_prop(W, b, X, Y, activation_func)
    temp2 = self.back_prop(W, b, temp['A'], temp['H'], temp['Y_hat'], X, Y, activation_func)

    return {
        'dw' : temp2['dw'],
        'db' : temp2['db']
    }

  # helper function to compute gradient
  def self_grad_wandb(self, X, Y, activation_func) :
    temp = self.grad_wandb(self.weights_list, self.biases_list, X, Y,activation_func)
    return temp

  # function to compute predictions
  def predict(self, X, activation_func):
    X = X.T.reshape((784,-1))
    temp = self.forward_prop(self.weights_list,self.biases_list, X, 0, activation_func)
    return {
      'Y' : np.argmax(temp['Y_hat'],axis=0),
      'Y_hat' : temp['Y_hat']
    }

  # function to update weights and biases
  def update_vals(self, dw, db, wd) :
    L = len(self.weights_list)
    for i in range(L) :
      # print('dw['+str(L-i-1)+']',dw[L-i-1])
      self.weights_list[i] =self.weights_list[i] - dw[L-i-1].reshape(self.weights_list[i].shape) - wd * self.weights_list[i]

    for i in range(len(self.biases_list)) :
      # print('db['+str(L-i-1)+']',db[L-i-1])
      self.biases_list[i] =self.biases_list[i] - db[L-i-1].reshape(self.biases_list[i].shape)  
##################################################################################
class activation:
  
  @staticmethod
  def sigmoid(z):
    return 1 / (1 + np.exp(-z))
  
  @staticmethod
  def relu(z):
    return (z>0) * z

  @staticmethod
  def tanh(z):
    return np.tanh(z)

  @staticmethod
  def sigmoid_der(z) :
    return z * (1-z)
  
  @staticmethod
  def relu_der(z) :
    return (z>0)

  @staticmethod
  def tanh_der(z):
    return 1 - z*z

  @staticmethod
  def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / np.sum(e_x,axis=0)
    #return np.array(tf.nn.softmax(np.array(x)))
##################################################################################
def set_nn_shape(verbose=True, num_hidden_layers=-1, hidden_layer_size=-1):

  input_layer_size = 784
  hidden_layer_size = hidden_layer_size
  num_hidden_layers = num_hidden_layers
  output_layer_size = 10
  
  # input_layer_size = 3
  # hidden_layer_size = hidden_layer_size
  # num_hidden_layers = num_hidden_layers
  # output_layer_size = 2
  if (verbose):
    print("\nNumber Of Hidden Layers:")
    num_hidden_layers = int(input())

    print("\nSize Of Each Hidden Layer:")
    hidden_layer_size = int(input())

    print(f"\nThe Neural Network Has {num_hidden_layers+2} Layers In Total!")
  
  return {"input_layer_size": input_layer_size, "hidden_layer_size": hidden_layer_size, "output_layer_size": output_layer_size, "num_hidden_layers": num_hidden_layers}


In [4]:
def wandb_initializer(nn_shape, weights_list, biases_list, type='random', mu = 0, sigma = 1):
  
  # random initialization
  if (type=='random'):
    print('Random')
    initializer = tf.keras.initializers.TruncatedNormal(mean=mu, stddev=sigma)
  # xavier initialization
  elif (type=='xavier'):
    print('Xavier')
    initializer = tf.keras.initializers.GlorotNormal()

  weights_list.append(initializer(shape=(nn_shape['hidden_layer_size'], nn_shape['input_layer_size'])).numpy())
  biases_list.append(initializer(shape=(nn_shape['hidden_layer_size'], 1)).numpy())
  for i in range(nn_shape['num_hidden_layers'] - 1):
    weights_list.append(initializer(shape=(nn_shape['hidden_layer_size'], nn_shape['hidden_layer_size'])).numpy())
    biases_list.append(initializer(shape=(nn_shape['hidden_layer_size'], 1)).numpy())

  weights_list.append(initializer(shape=(nn_shape['output_layer_size'], nn_shape['hidden_layer_size'])).numpy())
  biases_list.append(initializer(shape=(nn_shape['output_layer_size'], 1)).numpy())

  return weights_list, biases_list

In [5]:
class optimizer:

  @staticmethod
  def sgd(network, data, config):

    # num_hidden_layers, hidden_layers_size = config['num_hidden_layers'], config['hidden_layer_size']
    num_epochs, batch_size = config['num_epochs'], config['batch_size']
    eta, lambda_ = config['lr'], config['weight_decay']
    initializer, activation_func = config['weights_initializer'], config['activation']
 
    ### google the getattr function - eg: getattr(activation, 'relu')(junk) is same as activation.relu(junk)
    X_train, Y_train = data['train']['X'], data['train']['Y']
    num_examples = len(X_train)

    for i in range(num_epochs):
      for k in tqdm(range(0, len(X_train), batch_size)) :
        X = X_train[k: k+batch_size]
        Y = Y_train[k: k+batch_size]
        temp = network.self_grad_wandb(X, Y, activation_func)         
        dw = temp['dw']
        db = temp['db']
        for dd in dw :
          dd*= eta
        for dd in db :
          dd*=eta

        network.update_vals(dw, db, lambda_)
    
      report = run_callback(network, data, config) 
        
      wandb.log({
            'batch_size': config.batch_size, 
            'val_loss' : report['loss']['val'], 
            'train_loss': report['loss']['train'],
            'train_acc': report['accuracy']['train'],
            'val_acc': report['accuracy']['val']  
      }) 


  @staticmethod
  def momentum(network, data, config,gamma = 0.9) :
    num_epochs, batch_size = config['num_epochs'], config['batch_size']
    eta, lambda_ = config['lr'], config['weight_decay']
    initializer, activation_func = config['weights_initializer'], config['activation']
    
    ### google the getattr function - eg: getattr(activation, 'relu')(junk) is same as activation.relu(junk)
    X_train, Y_train = data['train']['X'], data['train']['Y']
    num_examples = len(X_train)

    nn_shape = set_nn_shape(False, config['num_hidden_layers'], config['hidden_layer_size'])
    dw, db = wandb_initializer(nn_shape, [], [], 'random', 0, 0)

    dw.reverse()
    db.reverse()

    for j in range(len(db)) :
      db[j] = db[j].flatten()

    for i in range(num_epochs) :
      for k in tqdm(range(0, len(X_train), batch_size)) :
        X = X_train[k:k+batch_size]
        Y = Y_train[k:k+batch_size]
        temp = network.self_grad_wandb(X,Y,activation_func)       
        for j in range(len(dw)) :
          dw[j] += eta*temp['dw'][j]
          db[j] += eta*temp['db'][j]

        network.update_vals(dw,db, lambda_)
        for dd in db :
          dd*=gamma
        for dd in dw :
          dd*=gamma

      report = run_callback(network, data, config) 
        
      wandb.log({
            'batch_size': config.batch_size, 
            'val_loss' : report['loss']['val'], 
            'train_loss': report['loss']['train'],
            'train_acc': report['accuracy']['train'],
            'val_acc': report['accuracy']['val']  
      }) 
        
  @staticmethod
  def NAG(network, data, config,gamma = 0.9) :
    num_epochs, batch_size = config['num_epochs'], config['batch_size']
    eta, lambda_ = config['lr'], config['weight_decay']
    initializer, activation_func = config['weights_initializer'], config['activation']
    ### google the getattr function - eg: getattr(activation, 'relu')(junk) is same as activation.relu(junk)
    X_train, Y_train = data['train']['X'], data['train']['Y']
    
    nn_shape = set_nn_shape(False, config['num_hidden_layers'], config['hidden_layer_size'])
    v_dw, v_db = wandb_initializer(nn_shape, [], [], 'random', 0, 0)

    v_dw.reverse()
    v_db.reverse()

    for j in range(len(v_db)) :
      v_db[j] = v_db[j].flatten()

    for i in range(num_epochs) :
      for k in tqdm(range(0, len(X_train), batch_size)) :
        for j in range(len(v_dw)) :
          v_dw[j] = gamma*v_dw[j]
          v_db[j] = gamma*v_db[j]

        X = X_train[k:k+batch_size]
        Y = Y_train[k:k+batch_size]
        W = network.get_weights()
        B = network.get_biases()
        L = len(W)
        for j in range(L) :
          W[j] -= v_dw[L-j-1]
          B[j] -= v_db[L-j-1].reshape(B[j].shape)

        temp = network.grad_wandb(W,B,X,Y,activation_func)  

        for j in range(len(v_dw)) :
          v_dw[j] += eta*temp['dw'][j]
          v_db[j] += eta*temp['db'][j]


        network.update_vals(v_dw,v_db,lambda_)

      report = run_callback(network, data, config) 
        
      wandb.log({
            'batch_size': config.batch_size, 
            'val_loss' : report['loss']['val'], 
            'train_loss': report['loss']['train'],
            'train_acc': report['accuracy']['train'],
            'val_acc': report['accuracy']['val']  
      }) 


  @staticmethod
  def RMSprop(network, data, config,beta = 0.9,epsilon = 1e-4) :
    
    num_epochs, batch_size = config['num_epochs'], config['batch_size']
    eta, lambda_ = config['lr'], config['weight_decay']
    initializer, activation_func = config['weights_initializer'], config['activation']
    
    ### google the getattr function - eg: getattr(activation, 'relu')(junk) is same as activation.relu(junk)
    X_train, Y_train = data['train']['X'], data['train']['Y']
    num_examples = len(X_train)

    nn_shape = set_nn_shape(False, config['num_hidden_layers'], config['hidden_layer_size'])
    v_dw, v_db = wandb_initializer(nn_shape, [], [], 'random', 0, 0)

    v_dw.reverse()
    v_db.reverse()

    for j in range(len(v_db)) :
      v_db[j] = v_db[j].flatten()

    for i in range(num_epochs) :
      dw = []
      db = []
      for k in tqdm(range(0, len(X_train), batch_size)) :
        X = X_train[k:k+batch_size]
        Y = Y_train[k:k+batch_size]
        temp = network.self_grad_wandb(X,Y,activation_func)  

        dw = temp['dw']
        db = temp['db']

        for j in range(len(dw)) :
          v_dw[j] *= beta
          v_dw[j] += (1-beta)*(dw[j]**2) 
          dw[j] *= eta/np.sqrt(v_dw[j]+epsilon)
          v_db[j] *= beta
          v_db[j] += (1-beta)*(db[j]**2) 
          db[j] *= eta/np.sqrt(v_db[j]+epsilon)

        network.update_vals(dw,db, lambda_)

      report = run_callback(network, data, config) 
        
      wandb.log({
            'batch_size': config.batch_size, 
            'val_loss' : report['loss']['val'], 
            'train_loss': report['loss']['train'],
            'train_acc': report['accuracy']['train'],
            'val_acc': report['accuracy']['val']  
      }) 


  @staticmethod
  def adam(network, data, config, beta1=0.9, beta2=0.999, epsilon=1e-4):

    # num_hidden_layers, hidden_layers_size = config['num_hidden_layers'], config['hidden_layer_size']
    num_epochs, batch_size = config['num_epochs'], config['batch_size']
    eta, lambda_ = config['lr'], config['weight_decay']
    initializer, activation_func = config['weights_initializer'], config['activation']
 
    ### google the getattr function - eg: getattr(activation, 'relu')(junk) is same as activation.relu(junk)
    X_train, Y_train = data['train']['X'], data['train']['Y']
    num_examples = len(X_train)
    nn_shape = set_nn_shape(False, config['num_hidden_layers'], config['hidden_layer_size'])

    m_w, m_b = wandb_initializer(nn_shape, [], [], 'random', 0, 0)
    v_w, v_b = wandb_initializer(nn_shape, [], [], 'random', 0, 0)

    m_w.reverse()
    m_b.reverse()
    v_w.reverse()
    v_b.reverse()
    for j in range(len(m_b)):
      m_b[j], v_b[j] = m_b[j].flatten(), v_b[j].flatten() 
    # print('Checkpoint 1')
    t = 0
    for i in range(num_epochs):
      for k in tqdm(range(0, len(X_train), batch_size)) :
        
        t += 1
        
        X = X_train[k: k+batch_size]
        Y = Y_train[k: k+batch_size]
        
        temp = network.self_grad_wandb(X, Y, activation_func)
        # print('Checkpoint 2')  
        dw = temp['dw']
        db = temp['db']
        
        for j in range(len(dw)):
          # print(db[j].shape, m_b[j].shape, v_b[j].shape)
          m_w[j] = beta1 * m_w[j] + (1 - beta1) * dw[j]
          # print('Checkpoint 2.5')
          
          m_b[j] = beta1 * m_b[j] + (1 - beta1) * db[j]
          # print('Checkpoint 3')
          v_w[j] = beta2 * v_w[j] + (1 - beta2) * dw[j] * dw[j]
          v_b[j] = beta2 * v_b[j] + (1 - beta2) * db[j] * db[j]
          # print('Checkpoint 4')
                 
          m_w[j] = m_w[j] *((1-beta1**int(t))/ (1-beta1**int(t+1)))
          m_b[j] = m_b[j] *((1-beta1**int(t))/ (1-beta1**int(t+1)))
          # # print('Checkpoint 5')
          v_w[j] = v_w[j]*((1-beta2**int(t))/ (1-beta2**int(t+1)))
          v_b[j] = v_b[j]*((1-beta2**int(t))/ (1-beta2**int(t+1)))
          # print('Checkpoint 6')
          
          dw[j] = eta * m_w[j] / (epsilon + np.sqrt( v_w[j]))
          db[j] = eta * m_b[j] / ( epsilon + np.sqrt(v_b[j]))

          # print('Checkpoint 7')
        
        network.update_vals(dw, db, lambda_)
        
      report = run_callback(network, data, config) 
        
      wandb.log({
            'batch_size': config.batch_size, 
            'val_loss' : report['loss']['val'], 
            'train_loss': report['loss']['train'],
            'train_acc': report['accuracy']['train'],
            'val_acc': report['accuracy']['val']  
      })

    
  @staticmethod
  def nadam(network, data, config, beta1=0.9, beta2=0.999, epsilon=1e-4):

    # num_hidden_layers, hidden_layers_size = config['num_hidden_layers'], config['hidden_layer_size']
    num_epochs, batch_size = config['num_epochs'], config['batch_size']
    eta, lambda_ = config['lr'], config['weight_decay']
    initializer, activation_func = config['weights_initializer'], config['activation']
 
    ### google the getattr function - eg: getattr(activation, 'relu')(junk) is same as activation.relu(junk)
    X_train, Y_train = data['train']['X'], data['train']['Y']
    num_examples = len(X_train)
    # print(num_hidden_layers, hidden_layers_size)
    nn_shape = set_nn_shape(False, config['num_hidden_layers'], config['hidden_layer_size'])

    m_w, m_b = wandb_initializer(nn_shape, [], [], 'random', 0, 0)
    v_w, v_b = wandb_initializer(nn_shape, [], [], 'random', 0, 0)

    m_w.reverse()
    m_b.reverse()
    v_w.reverse()
    v_b.reverse()
    for j in range(len(m_b)):
      m_b[j], v_b[j] = m_b[j].flatten(), v_b[j].flatten() 
    # print('Checkpoint 1')
    t = 0
    for i in range(num_epochs):
      for k in tqdm(range(0, len(X_train), batch_size)) :
        
        t += 1
        
        X = X_train[k: k+batch_size]
        Y = Y_train[k: k+batch_size]
        
        temp = network.self_grad_wandb(X, Y, activation_func)
        # print('Checkpoint 2')  
        dw = temp['dw']
        db = temp['db']
        
        for j in range(len(dw)):
          # print(db[j].shape, m_b[j].shape, v_b[j].shape)
          m_w[j] = beta1 * m_w[j] + (1 - beta1) * dw[j]
          # print('Checkpoint 2.5')
          
          m_b[j] = beta1 * m_b[j] + (1 - beta1) * db[j]
          # print('Checkpoint 3')
          v_w[j] = beta2 * v_w[j] + (1 - beta2) * dw[j] * dw[j]
          v_b[j] = beta2 * v_b[j] + (1 - beta2) * db[j] * db[j]
          # print('Checkpoint 4')
                 
          m_w[j] = m_w[j] *((1-beta1**int(t))/ (1-beta1**int(t+1)))
          m_b[j] = m_b[j] *((1-beta1**int(t))/ (1-beta1**int(t+1)))
          # # print('Checkpoint 5')
          v_w[j] = v_w[j]*((1-beta2**int(t))/ (1-beta2**int(t+1)))
          v_b[j] = v_b[j]*((1-beta2**int(t))/ (1-beta2**int(t+1)))
          # print('Checkpoint 6')
          
          dw[j] = eta * (beta1*m_w[j] + (1 - beta1)*dw[j]) / (epsilon + np.sqrt( v_w[j]))
          db[j] = eta * (beta1*m_b[j] + (1 - beta1)*db[j]) / ( epsilon + np.sqrt(v_b[j]))

          # print('Checkpoint 7')
        
        network.update_vals(dw, db, lambda_)
        
      report = run_callback(network, data, config) 
        
      wandb.log({
            'batch_size': config.batch_size, 
            'val_loss' : report['loss']['val'], 
            'train_loss': report['loss']['train'],
            'train_acc': report['accuracy']['train'],
            'val_acc': report['accuracy']['val']  
      })

In [6]:
'''
# X = np.array([[1,1,2],[-1,2,3],[10,-67,43],[-5,45,-67]])
# Y = np.array([1,0,1,0])
# temp = solver.sgd(X,Y,100,1e-3)
X=data['train']['X']
# print(len(X))
# print(X[0])
Y = data['train']['Y']
temp = solver.RMSprop(X,Y,100,0.001,0.9,32,1e-2)
print(temp)
'''

"\n# X = np.array([[1,1,2],[-1,2,3],[10,-67,43],[-5,45,-67]])\n# Y = np.array([1,0,1,0])\n# temp = solver.sgd(X,Y,100,1e-3)\nX=data['train']['X']\n# print(len(X))\n# print(X[0])\nY = data['train']['Y']\ntemp = solver.RMSprop(X,Y,100,0.001,0.9,32,1e-2)\nprint(temp)\n"

In [7]:
def run_callback(network,data,config) :
    
    activation_func = config['activation']
    
    X_train = data['train']['X']
    Y_train = data['train']['Y']

    X_val = data['val']['X']
    Y_val = data['val']['Y']

    train_loss = 0
    train_count = 0
    train_sq_error = 0
    
    temp = network.predict(X_train,activation_func)
    train_count = np.sum(temp['Y'].reshape(Y_train.shape)==Y_train)
    
    Y_pred = np.array(temp['Y_hat'].T)
    train_loss = np.sum(-np.log(Y_pred[np.arange(len(X_train)),Y_train]))
    E = np.zeros(Y_pred.shape)
    E[np.arange(len(X_train)),Y_train] = 1
    train_sq_error = np.sum((E-Y_pred)**2)

    val_loss = 0
    val_count = 0
    val_sq_error = 0
    
    temp = network.predict(X_val, activation_func)
    val_count = np.sum(temp['Y'].reshape(Y_val.shape)==Y_val)
    
    Y_pred = np.array(temp['Y_hat'].T)
    val_loss = np.sum(-np.log(Y_pred[np.arange(len(X_val)),Y_val]))
    E = np.zeros(Y_pred.shape)
    E[np.arange(len(X_val)),Y_val] = 1
    val_sq_error = np.sum((E-Y_pred)**2)

    
    '''
    test_loss = 0
    test_count = 0
    test_sq_error = 0
    for x,y in zip(X_test,Y_test) :
        temp = network.predict(x,activation_func)
        if temp['y'] == y :
            test_count += 1
        test_loss -= np.log(temp['y_hat'][y]) 
        temp['y_hat'][y] = 1 - temp['y_hat'][y]
        test_sq_error += np.sum(np.dot(temp['y_hat'],temp['y_hat'])) 
    '''
    
    return  {
        'loss': {
            'train' : train_loss / len(X_train),
            'val' : val_loss / len(X_val)
        },
        'accuracy': {
            'train': train_count / len(X_train),
            'val': val_count / len(X_val)
        }
        #'test' : np.array([test_sq_error,test_loss,test_count])/len(X_test)
    }

    


In [8]:
'''
nn = set_nn_shape()
network = neural_network(nn, 'random')
#data1 = {'train' : {'X': np.array([[1,1,2],[-1,2,3],[10,-67,43],[-5,45,-67],[5,6,7]]), 'Y' : np.array([1,0,1,0,1]) },'val' : {'X': np.array([[1,1,1],[1,1,1],[1,1,1],[1,1,1],[1,1,1]]), 'Y' : np.array([0,1,3,2,1]) }}
# need to change this bit later to accomodate other optimization functions

config1 = {'num_epochs' : 5,'lr' : 1e-2,'optimizer': 'sgd', 'batch_size' : 32 , 'weights_initializer' : 'random' , 'weight_decay' : 0.001, 'activation' : 'sigmoid' }
optimizer.sgd(network, data,config1 )

# generating reports for the run
report = run_callback(network, data, config1) 

print(report)
'''

"\nnn = set_nn_shape()\nnetwork = neural_network(nn, 'random')\n#data1 = {'train' : {'X': np.array([[1,1,2],[-1,2,3],[10,-67,43],[-5,45,-67],[5,6,7]]), 'Y' : np.array([1,0,1,0,1]) },'val' : {'X': np.array([[1,1,1],[1,1,1],[1,1,1],[1,1,1],[1,1,1]]), 'Y' : np.array([0,1,3,2,1]) }}\n# need to change this bit later to accomodate other optimization functions\n\nconfig1 = {'num_epochs' : 5,'lr' : 1e-2,'optimizer': 'sgd', 'batch_size' : 32 , 'weights_initializer' : 'random' , 'weight_decay' : 0.001, 'activation' : 'sigmoid' }\noptimizer.sgd(network, data,config1 )\n\n# generating reports for the run\nreport = run_callback(network, data, config1) \n\nprint(report)\n"

In [9]:
'''
# generating reports for the run
report = run_callback(network, data, config1) 
print(report)
'''

'\n# generating reports for the run\nreport = run_callback(network, data, config1) \nprint(report)\n'

In [10]:
activation.softmax(np.array([[0.7,0.5],[3,7]]))

array([[0.09112296, 0.00150118],
       [0.90887704, 0.99849882]])

In [11]:
sweep_config = {
    'method': 'random',

    'parameters': {
        'num_epochs': {
            'values': [5, 10]
        },
        'num_hidden_layers': {
            'values': [3, 4, 5]
        },
        'hidden_layer_size': {
            'values': [32, 64, 128]
        },
        'weight_decay': {
            'values': [0, 0.0005, 0.05]
        },
        'lr': {
            'values': [1e-3, 1e-4]
        },
        'optimizer': {
            'values': ['sgd', 'momentum', 'NAG', 'RMSprop', 'adam', 'nadam']
        },
        'batch_size': {
            'values': [16, 32, 64]
        },
        'weights_initializer': {
            'values': ['random', 'xavier']
        },
        'activation': {
            'values': ['sigmoid', 'tanh', 'relu']
        }        
    }
}

In [12]:
sweep_id = wandb.sweep(sweep_config, project='test5')

Create sweep with ID: b15n4ri6
Sweep URL: https://wandb.ai/ramkamal/test5/sweeps/b15n4ri6


In [13]:
sweep_id = '8f6t9uc6'

In [14]:
pprint.pprint(sweep_config)

{'method': 'random',
 'parameters': {'activation': {'values': ['sigmoid', 'tanh', 'relu']},
                'batch_size': {'values': [16, 32, 64]},
                'hidden_layer_size': {'values': [32, 64, 128]},
                'lr': {'values': [0.001, 0.0001]},
                'num_epochs': {'values': [5, 10]},
                'num_hidden_layers': {'values': [3, 4, 5]},
                'optimizer': {'values': ['sgd',
                                         'momentum',
                                         'NAG',
                                         'RMSprop',
                                         'adam',
                                         'nadam']},
                'weight_decay': {'values': [0, 0.0005, 0.05]},
                'weights_initializer': {'values': ['random', 'xavier']}}}


In [15]:
class sweep_module:
  @staticmethod
  def train(config=None):

    with wandb.init(config):
      # print('Checkpoint 1')
      config = wandb.config
      # print('Checkpoint 2')
      
      nn_shape = set_nn_shape(False, config['num_hidden_layers'] , config['hidden_layer_size'])
      
      # print('Checkpoint 3')
      
      # print(nn_shape)
      network = neural_network(nn_shape, config['weights_initializer'])
      # print('Checkpoint 4')
      
      # need to change this bit later to accomodate other optimization functions
      optimizer.nadam(network, data, config)

In [16]:
sweep_id

'8f6t9uc6'

In [None]:
# for logging the best model
network_best = None
val_acc_best = -1

# performing the sweep
wandb.agent(sweep_id, sweep_module.train)

[34m[1mwandb[0m: Agent Starting Run: vdo9tn2d with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	hidden_layer_size: 64
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	num_epochs: 5
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.05
[34m[1mwandb[0m: 	weights_initializer: xavier
[34m[1mwandb[0m: Currently logged in as: [33mramkamal[0m (use `wandb login --relogin` to force relogin)


Xavier
Random
Random


HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))




VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
batch_size,16.0
val_loss,2.30364
train_loss,2.30436
train_acc,0.10024
val_acc,0.0988
_runtime,73.0
_timestamp,1615708620.0
_step,4.0


0,1
batch_size,▁▁▁▁▁
val_loss,█▄▂▁▁
train_loss,█▄▂▁▁
train_acc,▁▁▁▁▁
val_acc,▁▁▁▁▁
_runtime,▁▃▅▆█
_timestamp,▁▃▅▆█
_step,▁▃▅▆█


[34m[1mwandb[0m: Agent Starting Run: bitxl9n7 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	hidden_layer_size: 64
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	num_epochs: 5
[34m[1mwandb[0m: 	num_hidden_layers: 4
[34m[1mwandb[0m: 	optimizer: RMSprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weights_initializer: xavier


Xavier
Random
Random


HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))




VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
batch_size,16.0
val_loss,0.5018
train_loss,0.48544
train_acc,0.85454
val_acc,0.8422
_runtime,68.0
_timestamp,1615708693.0
_step,4.0


0,1
batch_size,▁▁▁▁▁
val_loss,█▅▃▂▁
train_loss,█▅▃▂▁
train_acc,▁▄▅▇█
val_acc,▁▄▆▇█
_runtime,▁▃▄▆█
_timestamp,▁▃▄▆█
_step,▁▃▅▆█


[34m[1mwandb[0m: Agent Starting Run: 81uuizch with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	hidden_layer_size: 64
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	num_epochs: 5
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weights_initializer: xavier


Xavier
Random
Random


HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
batch_size,32.0
val_loss,1.08117
train_loss,1.08511
train_acc,0.67804
val_acc,0.6813
_runtime,33.0
_timestamp,1615708735.0
_step,4.0


0,1
batch_size,▁▁▁▁▁
val_loss,▁█▅▄▃
train_loss,▁█▅▄▃
train_acc,█▇▁▅▅
val_acc,█▆▁▄▅
_runtime,▁▃▄▆█
_timestamp,▁▃▄▆█
_step,▁▃▅▆█


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
502 response executing GraphQL.

<html><head>
<meta http-equiv="content-type" content="text/html;charset=utf-8">
<title>502 Server Error</title>
</head>
<body text=#000000 bgcolor=#ffffff>
<h1>Error: Server Error</h1>
<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>
<h2></h2>
</body></html>

[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 047x0fbo with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	hidden_layer_size: 64
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_hidden_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weights_initializer: xavier


Xavier
Random
Random


HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
batch_size,32.0
val_loss,1.29951
train_loss,1.29544
train_acc,0.5877
val_acc,0.5855
_runtime,71.0
_timestamp,1615708852.0
_step,9.0


0,1
batch_size,▁▁▁▁▁▁▁▁▁▁
val_loss,▁█▇▅▅▄▂▂▃▄
train_loss,▁█▇▅▅▄▂▂▃▄
train_acc,█▃▂▁▃▃▅▆▅▄
val_acc,█▃▂▁▃▃▆▆▅▄
_runtime,▁▂▂▃▄▅▆▇▇█
_timestamp,▁▂▂▃▄▅▆▇▇█
_step,▁▂▃▃▄▅▆▆▇█


[34m[1mwandb[0m: Agent Starting Run: c4joy7fo with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hidden_layer_size: 32
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_hidden_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weights_initializer: random


Random
Random
Random


HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
batch_size,64.0
val_loss,0.69955
train_loss,0.68017
train_acc,0.84058
val_acc,0.833
_runtime,38.0
_timestamp,1615708898.0
_step,9.0


0,1
batch_size,▁▁▁▁▁▁▁▁▁▁
val_loss,█▆▅▄▄▃▂▂▁▁
train_loss,█▆▅▄▄▃▂▂▁▁
train_acc,▁▃▅▅▆▇▇▇██
val_acc,▁▃▅▅▆▇▇▇██
_runtime,▁▂▃▃▄▅▆▆▇█
_timestamp,▁▂▃▃▄▅▆▆▇█
_step,▁▂▃▃▄▅▆▆▇█


[34m[1mwandb[0m: Agent Starting Run: 2o9bwwtn with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	hidden_layer_size: 128
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	num_epochs: 5
[34m[1mwandb[0m: 	num_hidden_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.05
[34m[1mwandb[0m: 	weights_initializer: random


Random
Random
Random


HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




In [None]:
nn_shape = {'input_layer_size': 784, 'hidden_layer_size': 64, 'output_layer_size': 10, 'num_hidden_layers': 5}

In [None]:
init = 'random'

In [None]:
neural_network(nn_shape, init)

In [None]:
nn_shape = {'input_layer_size': 784, 'hidden_layer_size': 64, 'output_layer_size': 10, 'num_hidden_layers': 3}

In [None]:
a, b = wandb_initializer(nn_shape,
                  [],
                  [],
                  'random',
                  0,
                  0)

In [None]:
a.reverse()

In [None]:
help(wandb.agent)