In [None]:
!pip install wandb
import wandb
from keras.datasets import fashion_mnist
from keras.datasets import mnist
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math

In [2]:
def relu(x):
  """Rectified linear unit: element-wise maximum of ``x`` and zero."""
  return np.maximum(x, 0)

def sigmoid(x):
  """Logistic function ``1 / (1 + exp(-x))`` applied element-wise.

  The argument is limited to [-500, 500] first so that ``np.exp`` cannot
  overflow for very large negative inputs.
  """
  bounded = np.clip(x, -500, 500)
  return 1 / (np.exp(-bounded) + 1)

def _tanh(x):
  """Hyperbolic tangent applied element-wise.

  The input is clipped to [-500, 500] beforehand, mirroring ``sigmoid``.
  """
  return np.tanh(np.clip(x, -500, 500))

In [3]:
## writing the code in the classwise fashion
class NeuralNetwork:
  def __init__(self, inputSize, hiddenLayers, outputSize, sizeOfHiddenLayers, batchSize, learningRate, initialisationType, optimiser, epochs, activationFunc, weightDecay, isWandb = False, lossFunc = "cross_entropy", dataset = "fashion_mnist"):
    # initialising model parameters
    nodes_in_layers = []
    for i in range(hiddenLayers):
      nodes_in_layers.append(sizeOfHiddenLayers)
    nodes_in_layers.append(outputSize)
    if dataset == "fashion_mnist":
      (X_train, Y_train), (X_test, Y_test) = fashion_mnist.load_data()
      X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)
    elif dataset == "mnist":
      (X_train, Y_train), (X_test, Y_test) = mnist.load_data()
      X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)



    # normalsing and resisizing all the images
    X_train = X_train/255.0
    X_test  = X_test/255.0
    X_val   = X_val/255.0

    X_train = X_train.reshape(X_train.shape[0], 784).T
    X_test = X_test.reshape(X_test.shape[0], 784).T
    X_val = X_val.reshape(X_val.shape[0], 784).T

    self.X_train = X_train
    self.Y_train = Y_train
    self.X_val   = X_val
    self.Y_val   = Y_val
    self.X_test  = X_test
    self.Y_test  = Y_test

    self.inputSize = inputSize
    self.outputSize= outputSize
    self.batchSize = batchSize
    self.layers = hiddenLayers + 1
    self.nodes  = nodes_in_layers
    self.initialisationType = initialisationType
    self.betha1 = 0.9
    self.betha2 = 0.999
    self.betha  = 0.9
    self.epsilon= 1e-8
    self.Weights= {}
    self.Baises = {}
    self.optimiser = optimiser
    self.epochs = epochs
    self.learningRate = learningRate
    self.activationFunc = activationFunc
    self.isWandb = isWandb
    self.weightDecay = weightDecay
    self.lossFunc = lossFunc # "cross_entropy" or "MSE"
    self.dataset = dataset


  def Initialise(self):
    # initialising weights and biases as a key value pair
    W = {}
    B = {}

    PreActivation = {}
    Activation = {}

    # adding input layer
    LayerWise = self.nodes
    LayerWise.insert(0, self.inputSize)

    # initialisation of weights and baises
    for i in range(self.layers):
      if self.initialisationType == "random":
        W[i+1] = 0.01*np.random.randn(LayerWise[i+1], LayerWise[i])
        B[i+1] = 0.01*np.random.randn(LayerWise[i+1], 1)
      if self.initialisationType == "Xavier":
        W[i+1] = np.random.randn(LayerWise[i+1], LayerWise[i]) * np.sqrt(2. / (LayerWise[i] + LayerWise[i+1]))
        B[i+1] = np.zeros((LayerWise[i+1], 1))

      # preactivation and activation will have same size
      PreActivation[i+1] = np.zeros((LayerWise[i+1], 1))
      Activation[i+1]    = np.zeros((LayerWise[i+1], 1))

    del LayerWise[0]

    self.Weights = W
    self.Baises  = B
    self.PreActivation = PreActivation
    self.Activation = Activation

    return W,B,PreActivation,Activation

  def InitialiseEmptyWeightsAndBiases(self):
    W = {}
    B = {}
    LayerWise = self.nodes

    LayerWise.insert(0, self.inputSize)
    for i in range(self.layers):
      W[i+1] = np.zeros((LayerWise[i+1], LayerWise[i]))
      B[i+1] = np.zeros((LayerWise[i+1], 1))
    del LayerWise[0]

    return W,B

  def FeedForward(self, x, W, B, preActivation, activation):
    # no of layers
    n = len(W)
    y = x
    for i in range(1, n+1):
      preActivation[i] = np.dot(W[i], y) + B[i]
      if self.activationFunc == "sigmoid":
        activation[i] = sigmoid(preActivation[i])
      elif self.activationFunc == "tanh":
        activation[i] = _tanh(preActivation[i])
      elif self.activationFunc == "relu":
        activation[i] = relu(preActivation[i])

      y = activation[i]

    # last layer we don't need activation
    y = preActivation[n]
    # doing softmax doing the each column wise
    exp_y = np.exp(y - np.max(y, axis=0, keepdims=True))  # Improve numerical stability
    y = exp_y / np.sum(exp_y, axis=0, keepdims=True)
    return y

  def BackWardPropogation(self, X, y_corr, W, preActivation, activation, y_hat):
    # y_hat is the prediction and y_corr is the correct class
    dw = {}
    db = {}

    # these many points are there in the batch
    batch_Size = y_corr.shape[0]
    y = np.zeros([10, batch_Size])
    # this y is encoded in 10*batchsize with each col being for one point that will be one

    for ind in range(batch_Size):
      y[y_corr[ind]][ind] = 1

    if self.lossFunc == "cross_entropy":
      da = y_hat - y
    elif self.lossFunc == "MSE":
      da = (y_hat - y)*(y_hat)*(1 - y_hat)
      da = da / self.batchSize

    activation[0] = X
    layer = len(W)
    dh = da #used for finding next layer
    while layer >= 1:
      dw[layer] = np.dot(da, activation[layer-1].T)
      db[layer] = da.sum(axis=1, keepdims=True)
      if layer > 1:
        dh = np.dot(W[layer].T, da)
        if self.activationFunc == "sigmoid":
          dg = activation[layer-1] * (1 - activation[layer-1])
        elif self.activationFunc == "tanh":
          dg = (1 + activation[layer-1]) * (1 - activation[layer-1])
        elif self.activationFunc == "relu":
          dg = np.where(preActivation[layer-1] > 0, 1, 0)

        # dg = activation[layer-1] * (1 - activation[layer-1])
        da = dh * dg
        # hedamant product
      layer -= 1

    # L2 regularisation
    for i in range(1, self.layers+1):
      dw[i] = dw[i] + self.weightDecay*W[i]

    return dw, db

  def FindAccuracyAndLoss(self, W, B, data, labels):
    n = data.shape[1]
    correct = 0
    labels_one_hot = np.eye(10)[labels]
    #running the data on the weights and baises
    y = data
    for i in range(1, self.layers+1):
      preActivation = np.dot(W[i], y) + B[i]

      if self.activationFunc == "sigmoid":
        activation = sigmoid(preActivation)
      elif self.activationFunc == "tanh":
        activation = _tanh(preActivation)
      elif self.activationFunc == "relu":
        activation = relu(preActivation)
      y = activation

    # last layer we don't need activation
    y = preActivation
    # doing softmax doing the each column wise
    exp_y = np.exp(y - np.max(y, axis=0, keepdims=True))  # Improve numerical stability
    y = exp_y / np.sum(exp_y, axis=0, keepdims=True)
    loss = 0

    for i in range(1, 1+self.layers):
      loss += self.weightDecay * np.linalg.norm(W[i])

    for i in range(n):
      y_pred = np.argmax(y[:,i])
      if labels[i] == y_pred:
        correct += 1

      if self.lossFunc == "cross_entropy":
        loss += -1*np.log(y[:,i][labels[i]] + 1e-9)
      else:
        loss += np.sum((y[:, i] - labels_one_hot[i]) ** 2)

    return (correct*100/ n), (loss/n)

  def predict(self, data):
    n = data.shape[1]
    y = data

    for i in range(1, self.layers+1):
      preActivation = np.dot(self.Weights[i], y) + self.Baises[i]

      if self.activationFunc == "sigmoid":
        activation = sigmoid(preActivation)
      elif self.activationFunc == "tanh":
        activation = _tanh(preActivation)
      elif self.activationFunc == "relu":
        activation = relu(preActivation)
      y = activation

    # last layer we don't need activation
    y = preActivation
    # doing softmax doing the each column wise
    exp_y = np.exp(y - np.max(y, axis=0, keepdims=True))  # Improve numerical stability
    y = exp_y / np.sum(exp_y, axis=0, keepdims=True)
    predictions = []
    for i in range(n):
      y_pred = np.argmax(y[:,i])
      predictions.append(y_pred)

    return predictions

  def SGD(self):
    W, B, preActivation, activation  = self.Initialise()
    iteration = 0
    layers = self.layers
    empty_W, empty_B = self.InitialiseEmptyWeightsAndBiases()

    while(iteration < self.epochs):
      i = 0
      while i < self.X_train.shape[1]:
        y = self.FeedForward(self.X_train[:, i:i+self.batchSize], W, B, preActivation, activation)
        # these are the partial derivates for one point
        dw, db = self.BackWardPropogation(self.X_train[:, i:i+self.batchSize], self.Y_train[i:i+self.batchSize], W, preActivation, activation, y)

        # we will update the weights now
        for k in range(1, layers+1):
            W[k] = W[k] - self.learningRate*dw[k]
            B[k] = B[k] - self.learningRate*db[k]

        i += self.batchSize
      acuu, loss = self.FindAccuracyAndLoss(W, B, self.X_train, self.Y_train)
      v_acc, v_loss = self.FindAccuracyAndLoss(W, B, self.X_val, self.Y_val)
      if self.isWandb == True:
        wandb.log({'accuracy': acuu})
        wandb.log({'loss': loss})
        wandb.log({'v_accuracy': v_acc})
        wandb.log({'v_loss': v_loss})
        wandb.log({'epoch': iteration})
      print(acuu, loss, v_acc, v_loss)
      iteration += 1

    self.Weights = W
    self.Baises  = B

  def MomentBasedGradientDecent(self):
    W, B, preActivation, activation  = self.Initialise()
    iteration = 0
    u_W, u_B = self.InitialiseEmptyWeightsAndBiases()
    # inititialising u to be zero

    while(iteration < self.epochs):
      i = 0
      while i < self.X_train.shape[1]:
        # batch wise forward and backward passes
        y = self.FeedForward(self.X_train[:, i:i+self.batchSize], W, B, preActivation, activation)
        dw, db = self.BackWardPropogation(self.X_train[:, i:i+self.batchSize], self.Y_train[i:i+self.batchSize], W, preActivation, activation, y)

        # update the momentum with the gradient
        for k in range(1, self.layers+1):
          u_W[k] = u_W[k]*self.betha + dw[k]
          u_B[k] = u_B[k]*self.betha + db[k]

        # we will update the weights now with the momentum
        for k in range(1, self.layers+1):
            W[k] = W[k] - self.learningRate*u_W[k]
            B[k] = B[k] - self.learningRate*u_B[k]

        # next batch
        i += self.batchSize
      acuu, loss = self.FindAccuracyAndLoss(W, B, self.X_train, self.Y_train)
      v_acc, v_loss = self.FindAccuracyAndLoss(W, B, self.X_val, self.Y_val)
      if self.isWandb == True:
        wandb.log({'accuracy': acuu})
        wandb.log({'loss': loss})
        wandb.log({'v_accuracy': v_acc})
        wandb.log({'v_loss': v_loss})
        wandb.log({'epoch': iteration})
      print(acuu, loss, v_acc, v_loss)
      iteration += 1

    self.Weights = W
    self.Baises  = B

  def NestrovBasedGradientDescent(self):
    iteration = 0
    W, B, preActivation, activation = self.Initialise()
    u_W, u_B = self.InitialiseEmptyWeightsAndBiases()
    # initializing u to be zero

    while(iteration < self.epochs):
      i = 0
      while i < self.X_train.shape[1]:

        y = self.FeedForward(self.X_train[:, i:i+self.batchSize], W, B, preActivation, activation)
        dw, db = self.BackWardPropogation(self.X_train[:, i:i+self.batchSize], self.Y_train[i:i+self.batchSize], W, preActivation, activation, y)

        for k in range(1, self.layers+1):
            u_W[k] = u_W[k]*self.betha + dw[k]
            u_B[k] = u_B[k]*self.betha + db[k]

        for k in range(1, self.layers+1):
            W[k] = W[k] - self.learningRate*(self.betha* u_W[k]+ dw[k])
            B[k] = B[k] - self.learningRate*(self.betha* u_B[k]+ db[k])

        i += self.batchSize
      acuu, loss = self.FindAccuracyAndLoss(W, B, self.X_train, self.Y_train)
      v_acc, v_loss = self.FindAccuracyAndLoss(W, B, self.X_val, self.Y_val)
      if self.isWandb == True:
        wandb.log({'accuracy': acuu})
        wandb.log({'loss': loss})
        wandb.log({'v_accuracy': v_acc})
        wandb.log({'v_loss': v_loss})
        wandb.log({'epoch': iteration})
      print(acuu, loss, v_acc, v_loss)
      iteration += 1

    self.Weights = W
    self.Baises  = B

  def RMSPROP(self):
    iteration = 0
    epochs = self.epochs
    layers = self.layers
    batchSize = self.batchSize
    betha = self.betha
    W, B, preActivation, activation  = self.Initialise()
    v_W, v_B = self.InitialiseEmptyWeightsAndBiases()
    # inititialising u to be zero

    while(iteration < epochs):
      i = 0
      while i < self.X_train.shape[1]:
        y = self.FeedForward(self.X_train[:, i:i+batchSize], W, B, preActivation, activation)
        dw, db = self.BackWardPropogation(self.X_train[:, i:i+batchSize], self.Y_train[i:i+batchSize], W, preActivation, activation, y)

        # update the v values with the gradient
        for k in range(1, layers+1):
          v_W[k] = v_W[k]*betha + (1 - betha) * (dw[k] ** 2)
          v_B[k] = v_B[k]*betha + (1 - betha) * (db[k] ** 2)

        # we will update the weights now with the momentum
        for k in range(1, layers+1):
          W[k] = W[k] - (self.learningRate/np.sqrt(v_W[k] + self.epsilon))*dw[k]
          B[k] = B[k] - (self.learningRate/np.sqrt(v_B[k] + self.epsilon))*db[k]

        i += batchSize
      acuu, loss = self.FindAccuracyAndLoss(W, B, self.X_train, self.Y_train)
      v_acc, v_loss = self.FindAccuracyAndLoss(W, B, self.X_val, self.Y_val)
      if self.isWandb == True:
        wandb.log({'accuracy': acuu})
        wandb.log({'loss': loss})
        wandb.log({'v_accuracy': v_acc})
        wandb.log({'v_loss': v_loss})
        wandb.log({'epoch': iteration})
      print(acuu, loss, v_acc, v_loss)
      iteration += 1

    self.Weights = W
    self.Baises  = B

  def ADAM(self):
    iteration = 0
    epochs = self.epochs
    layers = self.layers
    batchSize = self.batchSize

    W, B, preActivation, activation  = self.Initialise()
    v_W, v_B = self.InitialiseEmptyWeightsAndBiases()
    m_W, m_B = self.InitialiseEmptyWeightsAndBiases()
    mhat_W, mhat_B = self.InitialiseEmptyWeightsAndBiases()
    vhat_W, vhat_B = self.InitialiseEmptyWeightsAndBiases()
    # inititialising u to be zero
    t = 1

    while(iteration < epochs):
      # this is used to compute the gradients
      i = 0
      while i < self.X_train.shape[1]:
        y = self.FeedForward(self.X_train[:, i:i+batchSize], W, B, preActivation, activation)
        dw, db = self.BackWardPropogation(self.X_train[:, i:i+batchSize], self.Y_train[i:i+batchSize], W, preActivation, activation, y)

        # updating the momentum
        for k in range(1, layers+1):
          m_W[k] = self.betha1*m_W[k] + (1 - self.betha1)*dw[k]
          m_B[k] = self.betha1*m_B[k] + (1 - self.betha1)*db[k]

          # finding m hat of W and B
          mhat_W[k] = m_W[k]/(1 - self.betha1 ** t)
          mhat_B[k] = m_B[k]/(1 - self.betha1 ** t)

        # update the v values with the gradient
        for k in range(1, layers+1):
          v_W[k] = v_W[k]*self.betha2 + (1 - self.betha2) * (dw[k] ** 2)
          v_B[k] = v_B[k]*self.betha2 + (1 - self.betha2) * (db[k] ** 2)

          # finding v hat of W and B
          vhat_W[k] = v_W[k]/(1 - self.betha2 ** t)
          vhat_B[k] = v_B[k]/(1 - self.betha2 ** t)

        # we will update the weights now with the momentum
        for k in range(1, layers+1):
          l2_norm_w = np.linalg.norm(vhat_W[k])
          l2_norm_b = np.linalg.norm(vhat_B[k])
          W[k] = W[k] - (self.learningRate/np.sqrt(l2_norm_w) + self.epsilon)*mhat_W[k]
          B[k] = B[k] - (self.learningRate/np.sqrt(l2_norm_b) + self.epsilon)*mhat_B[k]

        t += 1
        i += self.batchSize

      acuu, loss = self.FindAccuracyAndLoss(W, B, self.X_train, self.Y_train)
      v_acc, v_loss = self.FindAccuracyAndLoss(W, B, self.X_val, self.Y_val)
      if self.isWandb == True:
        wandb.log({'accuracy': acuu})
        wandb.log({'loss': loss})
        wandb.log({'v_accuracy': v_acc})
        wandb.log({'v_loss': v_loss})
        wandb.log({'epoch': iteration})
      print(acuu, loss, v_acc, v_loss)
      iteration += 1

    self.Weights = W
    self.Baises  = B

  def NADAM(self):
    iteration = 0
    epochs = self.epochs
    layers = self.layers
    W, B, preActivation, activation  = self.Initialise()
    v_W, v_B = self.InitialiseEmptyWeightsAndBiases()
    m_W, m_B = self.InitialiseEmptyWeightsAndBiases()
    mhat_W, mhat_B = self.InitialiseEmptyWeightsAndBiases()
    vhat_W, vhat_B = self.InitialiseEmptyWeightsAndBiases()
    # inititialising u to be zero
    t = 1

    while(iteration < self.epochs):
      # this is used to compute the gradients
      i = 0
      while i < self.X_train.shape[1]:
        y = self.FeedForward(self.X_train[:, i:i+self.batchSize], W, B, preActivation, activation)
        dw, db = self.BackWardPropogation(self.X_train[:, i:i+self.batchSize], self.Y_train[i:i+self.batchSize], W, preActivation, activation, y)

        # updating the momentum
        for k in range(1, layers+1):
          m_W[k] = self.betha1*m_W[k] + (1 - self.betha1)*dw[k]
          m_B[k] = self.betha1*m_B[k] + (1 - self.betha1)*db[k]

          # finding m hat of W and B
          mhat_W[k] = m_W[k]/(1 - self.betha1 ** t)
          mhat_B[k] = m_B[k]/(1 - self.betha1 ** t)

        # update the v values with the gradient
        for k in range(1, layers+1):
          v_W[k] = v_W[k]*self.betha2 + (1 - self.betha2) * (dw[k] ** 2)
          v_B[k] = v_B[k]*self.betha2 + (1 - self.betha2) * (db[k] ** 2)

          # finding v hat of W and B
          vhat_W[k] = v_W[k]/(1 - self.betha2 ** t)
          vhat_B[k] = v_B[k]/(1 - self.betha2 ** t)

        # we will update the weights now with the momentum
        for k in range(1, layers+1):
          l2_norm_w = np.linalg.norm(vhat_W[k])
          l2_norm_b = np.linalg.norm(vhat_B[k])
          W[k] = W[k] - (self.learningRate/np.sqrt(l2_norm_w) + self.epsilon)*(mhat_W[k]*self.betha1 + (1 - self.betha1)*dw[k]/(1 - self.betha1 ** t))
          B[k] = B[k] - (self.learningRate/np.sqrt(l2_norm_b) + self.epsilon)*(mhat_B[k]*self.betha1 + (1 - self.betha1)*db[k]/(1 - self.betha1 ** t))

        t += 1
        i += self.batchSize

      acuu, loss = self.FindAccuracyAndLoss(W, B, self.X_train, self.Y_train)
      v_acc, v_loss = self.FindAccuracyAndLoss(W, B, self.X_val, self.Y_val)
      if self.isWandb == True:
        wandb.log({'accuracy': acuu})
        wandb.log({'loss': loss})
        wandb.log({'v_accuracy': v_acc})
        wandb.log({'v_loss': v_loss})
        wandb.log({'epoch': iteration})
      print(acuu, loss, v_acc, v_loss)
      iteration += 1

    self.Weights = W
    self.Baises  = B

  def fit(self):
    if self.optimiser == "sgd":
      self.SGD()
    if self.optimiser == "momentum":
      self.MomentBasedGradientDecent()
    if self.optimiser == "nestrov":
      self.NestrovBasedGradientDescent()
    if self.optimiser == "rmsprop":
      self.RMSPROP()
    if self.optimiser == "adam":
      self.ADAM()
    if self.optimiser == "nadam":
      self.NADAM()

  def confusionMatrix(self):
    """Draw the confusion matrix of the trained model on the test split.

    With ``self.isWandb`` set, the heat-map is saved to
    ``confusion_matrix_heatmap.png`` and logged to W&B as an image; otherwise
    it is displayed with ``plt.show()``.
    """
    predictions = self.predict(self.X_test)

    # Tick labels: garment names for Fashion-MNIST, digits for anything else.
    if self.dataset == "fashion_mnist":
      class_names = ["T-shirt/Top", "Trouser", "Pullover", "Dress", "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle Boot"]
    else:
      class_names = [str(digit) for digit in range(10)]

    # The figure is built the same way for both outputs.
    conf_matrix = confusion_matrix(self.Y_test, predictions)
    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

    if self.isWandb == True:
      # Write the plot to disk and close it so it is not shown inline.
      heatmap_image_filename = "confusion_matrix_heatmap.png"
      plt.savefig(heatmap_image_filename)
      plt.close()

      wandb.log({"confusion_matrix_custom": wandb.Image(heatmap_image_filename)})
    else:
      plt.show()


In [None]:
# Log in to Weights & Biases through its CLI before any run is logged.
!wandb login

In [None]:
# MNIST, 4 hidden layers of 64 ReLU units, NADAM optimiser, MSE loss (no W&B logging).
model = NeuralNetwork(
    inputSize=784,
    hiddenLayers=4,
    outputSize=10,
    sizeOfHiddenLayers=64,
    batchSize=32,
    learningRate=0.001,
    initialisationType="Xavier",
    optimiser="nadam",
    activationFunc="relu",
    weightDecay=0.0005,
    lossFunc="MSE",
    epochs=10,
    dataset="mnist",
)
model.fit()

In [None]:
# MNIST, 4 hidden layers of 128 ReLU units, RMSProp optimiser, cross-entropy loss.
model = NeuralNetwork(
    inputSize=784,
    hiddenLayers=4,
    outputSize=10,
    sizeOfHiddenLayers=128,
    batchSize=64,
    learningRate=0.001,
    initialisationType="Xavier",
    optimiser="rmsprop",
    activationFunc="relu",
    weightDecay=0.0005,
    lossFunc="cross_entropy",
    epochs=10,
    dataset="mnist",
)
model.fit()

In [None]:
# Plot the test-set confusion matrix for whichever `model` was trained last.
model.confusionMatrix()

In [None]:
# Sweep comparing the cross-entropy and MSE loss functions
import wandb

# Grid sweep with a single swept parameter: the loss function.
sweep_config = dict(
    method='grid',
    name='cross entropy and MSE',
    metric=dict(name='accuracy', goal='maximize'),
    parameters=dict(
        lossFunction=dict(values=["cross_entropy", "MSE"]),
    ),
)
def main():
    """Run one sweep trial: train the momentum/tanh network with the swept loss.

    The loss function comes from the sweep parameter ``lossFunction``; every
    other hyper-parameter is fixed.
    """
    # The context manager finishes the run even when training raises, so a
    # failed trial no longer leaves a dangling W&B run behind.
    with wandb.init() as run:
        config = run.config

        # Renaming is persisted automatically; the argument-less
        # `wandb.run.save()` that used to follow is deprecated.
        run.name = f"{config.lossFunction}_momentum_tanh_Xavier_4_128"

        model = NeuralNetwork(inputSize=784, hiddenLayers=4, outputSize=10,
                              sizeOfHiddenLayers=128, batchSize=32,
                              learningRate=0.0001, initialisationType="Xavier",
                              optimiser="momentum", activationFunc="tanh", weightDecay=0.0005,
                              epochs=10, isWandb=True, lossFunc = config.lossFunction)
        model.fit()

# Register the sweep under the project, then run its two trials from this kernel.
sweep_id = wandb.sweep(sweep=sweep_config, project='Assignment 1')
wandb.agent(sweep_id, function=main, count=2)


In [None]:
# for printing the confusion matrix
import wandb

# Single-configuration sweep used only to log a confusion matrix from a run.
sweep_config = dict(
    method='bayes',
    name='confusion matrix',
    metric=dict(name='accuracy', goal='maximize'),
    parameters=dict(
        batchSize=dict(values=[32]),
    ),
)
def main():
    """Run one sweep trial: train the NADAM/ReLU network and log its confusion matrix.

    Only the batch size is read from the sweep configuration.
    """
    # The context manager finishes the run even when training or plotting
    # raises, so a failed trial no longer leaves a dangling W&B run behind.
    with wandb.init() as run:
        config = run.config

        model = NeuralNetwork(inputSize=784, hiddenLayers=4, outputSize=10,
                              sizeOfHiddenLayers=64, batchSize=config.batchSize,
                              learningRate=0.0001, initialisationType="Xavier",
                              optimiser="nadam", activationFunc="relu", weightDecay=0.0005,
                              epochs=2, isWandb=True)
        model.fit()
        model.confusionMatrix()

# Register the sweep under the project, then run its single trial from this kernel.
sweep_id = wandb.sweep(sweep=sweep_config, project='Assignment 1')
wandb.agent(sweep_id, function=main, count=1)


In [None]:
def main():
    """Run one hyper-parameter sweep trial with every setting taken from the sweep config.

    The run is named ``<optimiser>_<activation>_<hidden_layers>_<hidden_layer_size>_<batch_size>``.
    """
    # The context manager finishes the run even when training raises, so a
    # failed trial no longer leaves a dangling W&B run behind.
    with wandb.init(project="Assignment 1") as run:
        config = run.config

        # Renaming is persisted automatically; the argument-less
        # `wandb.run.save()` that used to follow is deprecated.
        run.name = f"{config.optimiser}_{config.activation}_{config.hidden_layers}_{config.hidden_layer_size}_{config.batch_size}"

        model = NeuralNetwork(
            inputSize=784,
            hiddenLayers=config.hidden_layers,
            outputSize=10,
            sizeOfHiddenLayers=config.hidden_layer_size,
            batchSize=config.batch_size,
            learningRate=config.learning_rate,
            initialisationType=config.weights_initialisation,
            optimiser=config.optimiser,
            activationFunc=config.activation,
            epochs=config.epochs,
            weightDecay=config.weight_decay,
            isWandb=True,
        )
        model.fit()

# Bayesian search over the full hyper-parameter grid, maximising validation accuracy.
sweep_config = {
    'method': 'bayes',
    'name': 'sweep cross entropy',
    'metric': {'name': 'v_accuracy', 'goal': 'maximize'},
    'parameters': {
        hyperparameter: {'values': choices}
        for hyperparameter, choices in (
            ('epochs', [5, 10]),
            ('hidden_layers', [3, 4, 5]),
            ('hidden_layer_size', [32, 64, 128]),
            ('weight_decay', [0, 0.0005, 0.5]),
            ('batch_size', [16, 32, 64]),
            ('activation', ['sigmoid', 'relu', 'tanh']),
            ('optimiser', ['sgd', 'momentum', 'nestrov', 'rmsprop', 'adam', 'nadam']),
            ('weights_initialisation', ['random', 'Xavier']),
            ('learning_rate', [1e-2, 1e-3, 1e-4]),
        )
    },
}


# Register the sweep, then run up to 400 trials of it from this kernel.
sweep_id = wandb.sweep(sweep=sweep_config,project='Assignment 1')
# Run the sweep that was just created. The agent used to be pointed at the
# hard-coded id "3ncb86iq", which ignored `sweep_id` and only worked while
# that earlier sweep still existed.
wandb.agent(sweep_id , function = main , count = 400)

wandb.finish()