<a href="https://colab.research.google.com/github/subikkshas/DA6401/blob/main/DLass1Q4-6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import wandb
import numpy as np
from keras.datasets import fashion_mnist  # Only for dataset loading

# 🔹 WandB Login
wandb.login()

# 🔹 Activation Functions & Their Derivatives
def sigmoid(x): return 1 / (1 + np.exp(-x))
def tanh(x): return np.tanh(x)
def relu(x): return np.maximum(0, x)

def sigmoid_derivative(x): return x * (1 - x)
def tanh_derivative(x): return 1 - np.tanh(x) ** 2
def relu_derivative(x): return np.where(x > 0, 1, 0)

activations = {"sigmoid": (sigmoid, sigmoid_derivative),
               "tanh": (tanh, tanh_derivative),
               "ReLU": (relu, relu_derivative)}

# 🔹 Weight Initialization
def initialize_weights(layers, method="random"):
    weights, biases = [], []
    for i in range(len(layers) - 1):
        if method == "random":
            weights.append(np.random.randn(layers[i], layers[i+1]) * 0.01)
        elif method == "Xavier":
            weights.append(np.random.randn(layers[i], layers[i+1]) * np.sqrt(1 / layers[i]))
        biases.append(np.zeros((1, layers[i+1])))
    return weights, biases

# 🔹 Forward Pass
def forward_pass(X, weights, biases, activation):
    activ, deriv = activations[activation]
    activations_list = [X]
    pre_activations = []

    for W, b in zip(weights, biases):
        Z = np.dot(activations_list[-1], W) + b
        pre_activations.append(Z)
        A = activ(Z)
        activations_list.append(A)

    return activations_list, pre_activations

# 🔹 Backpropagation
def backward_pass(activations_list, pre_activations, y, weights, biases, activation):
    m = y.shape[0]
    activ, deriv = activations[activation]

    # One-hot encode y
    y_one_hot = np.zeros((m, 10))
    y_one_hot[np.arange(m), y] = 1

    dZ = activations_list[-1] - y_one_hot  # Softmax gradient
    dW, db = [], []

    for i in range(len(weights) - 1, -1, -1):
        dW_i = np.dot(activations_list[i].T, dZ) / m
        db_i = np.sum(dZ, axis=0, keepdims=True) / m
        dW.insert(0, dW_i)
        db.insert(0, db_i)

        if i > 0:
            dZ = np.dot(dZ, weights[i].T) * deriv(pre_activations[i-1])

    return dW, db

# 🔹 Optimizer Class
class Optimizer:
    def __init__(self, method="sgd", lr=0.01, momentum=0.9, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.method = method
        self.lr = lr
        self.momentum = momentum
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.v_dW, self.s_dW = {}, {}
        self.v_db, self.s_db = {}, {}
        self.t = 0

    def update(self, weights, biases, dW, db):
        updated_weights, updated_biases = [], []
        for i, (W, b, dW_i, db_i) in enumerate(zip(weights, biases, dW, db)):
            if self.method == "sgd":
                W -= self.lr * dW_i
                b -= self.lr * db_i
            updated_weights.append(W)
            updated_biases.append(b)
        return updated_weights, updated_biases

# 🔹 Load Dataset
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
X_train, X_test = X_train / 255.0, X_test / 255.0
val_size = int(0.1 * len(X_train))
X_val, y_val = X_train[:val_size], y_train[:val_size]
X_train, y_train = X_train[val_size:], y_train[val_size:]

# 🔹 Train Function with Meaningful Sweep Names & Auto-Generated Plots
def train():
    run = wandb.init(project="DA6401-Assignment-1", group="Q4", config=wandb.config)

    # Constructing meaningful name for this sweep
    config = wandb.config
    sweep_name = f"hl_{config.hidden_layers}_bs_{config.batch_size}_ac_{config.activation}"
    run.name = sweep_name  # Assign custom name to this run

    # Initialize Model Weights & Biases
    layers = [784] + [config.hidden_size] * config.hidden_layers + [10]
    weights, biases = initialize_weights(layers, method=config.weight_init)
    optimizer = Optimizer(config.optimizer, config.learning_rate)

    history = {"loss": [], "val_loss": [], "accuracy": [], "val_accuracy": []}

    for epoch in range(config.epochs):
      activations_list, pre_activations = forward_pass(X_train.reshape(-1, 784), weights, biases, config.activation)
      dW, db = backward_pass(activations_list, pre_activations, y_train, weights, biases, config.activation)
      weights, biases = optimizer.update(weights, biases, dW, db)

    # Compute Training Loss & Accuracy
      train_preds = np.argmax(activations_list[-1], axis=1)
      train_accuracy = np.mean(train_preds == y_train)
      train_loss = -np.sum(np.log(activations_list[-1][np.arange(len(y_train)), y_train])) / len(y_train)

    # Compute Validation Loss & Accuracy
      val_activations, _ = forward_pass(X_val.reshape(-1, 784), weights, biases, config.activation)
      val_preds = np.argmax(val_activations[-1], axis=1)
      val_accuracy = np.mean(val_preds == y_val)
      val_loss = -np.sum(np.log(val_activations[-1][np.arange(len(y_val)), y_val])) / len(y_val)

    # 🔹 Correct way to log: Log all values per epoch & explicitly set step=epoch
    wandb.log({
        "epoch": epoch,                # Ensures tracking over time
        "loss": train_loss,            # Scalar
        "accuracy": train_accuracy,    # Scalar
        "val_loss": val_loss,          # Scalar
        "val_accuracy": val_accuracy   # Scalar
    }, step=epoch)  # Ensures all plots use "epoch" on the x-axis


    run.finish()

# 🔹 WandB Sweep Setup with Hyperparameters
sweep_config = {
    "method": "random",
    "metric": {"name": "val_accuracy", "goal": "maximize"},
    "parameters": {
        "epochs": {"values": [5, 10]},
        "hidden_layers": {"values": [3, 4, 5]},
        "hidden_size": {"values": [32, 64, 128]},
        "batch_size": {"values": [16, 32, 64]},
        "weight_init": {"values": ["random", "Xavier"]},
        "activation": {"values": ["sigmoid", "tanh", "ReLU"]},
        "optimizer": {"values": ["sgd", "adam"]},
        "learning_rate": {"values": [1e-3, 1e-4]}
    }
}

# 🔹 Run WandB Sweep
sweep_id = wandb.sweep(sweep_config, project="DA6401-Assignment-1")
wandb.agent(sweep_id, function=train, count=10)  # Run 10 different configurations


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msubikksha[0m ([33msubikksha-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Create sweep with ID: hyi2rpim
Sweep URL: https://wandb.ai/subikksha-indian-institute-of-technology-madras/DA6401-Assignment-1/sweeps/hyi2rpim


[34m[1mwandb[0m: Agent Starting Run: 9fq403pg with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
accuracy,▁
epoch,▁
loss,▁
val_accuracy,▁
val_loss,▁

0,1
accuracy,0.09985
epoch,9.0
loss,0.78125
val_accuracy,0.10133
val_loss,0.77967


[34m[1mwandb[0m: Agent Starting Run: esillo3n with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: Xavier


  train_loss = -np.sum(np.log(activations_list[-1][np.arange(len(y_train)), y_train])) / len(y_train)
  val_loss = -np.sum(np.log(val_activations[-1][np.arange(len(y_val)), y_val])) / len(y_val)


0,1
accuracy,▁
epoch,▁
val_accuracy,▁

0,1
accuracy,0.08541
epoch,4.0
loss,inf
val_accuracy,0.08517
val_loss,inf


[34m[1mwandb[0m: Agent Starting Run: doyikqx1 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
accuracy,▁
epoch,▁
val_accuracy,▁

0,1
accuracy,0.1677
epoch,4.0
loss,inf
val_accuracy,0.16617
val_loss,inf


[34m[1mwandb[0m: Agent Starting Run: q1o4hyrf with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_init: Xavier


  train_loss = -np.sum(np.log(activations_list[-1][np.arange(len(y_train)), y_train])) / len(y_train)
  val_loss = -np.sum(np.log(val_activations[-1][np.arange(len(y_val)), y_val])) / len(y_val)


0,1
accuracy,▁
epoch,▁
val_accuracy,▁

0,1
accuracy,0.14719
epoch,9.0
loss,
val_accuracy,0.15117
val_loss,


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 0sz0oi0x with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁
epoch,▁
loss,▁
val_accuracy,▁
val_loss,▁

0,1
accuracy,0.08576
epoch,9.0
loss,9.31579
val_accuracy,0.08283
val_loss,9.21054


[34m[1mwandb[0m: Agent Starting Run: fqkysfr8 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁
epoch,▁
val_accuracy,▁

0,1
accuracy,0.09872
epoch,9.0
loss,
val_accuracy,0.09733
val_loss,


[34m[1mwandb[0m: Agent Starting Run: tiflyxb9 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁
epoch,▁
loss,▁
val_accuracy,▁
val_loss,▁

0,1
accuracy,0.09978
epoch,4.0
loss,0.69787
val_accuracy,0.102
val_loss,0.69944


[34m[1mwandb[0m: Agent Starting Run: bg1z4fq4 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
accuracy,▁
epoch,▁
val_accuracy,▁

0,1
accuracy,0.14228
epoch,4.0
loss,
val_accuracy,0.1385
val_loss,


[34m[1mwandb[0m: Agent Starting Run: xy8p918m with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁
epoch,▁
loss,▁
val_accuracy,▁
val_loss,▁

0,1
accuracy,0.06063
epoch,4.0
loss,10.12885
val_accuracy,0.05267
val_loss,9.90528


[34m[1mwandb[0m: Agent Starting Run: d1e358z1 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁
epoch,▁
val_accuracy,▁

0,1
accuracy,0.06976
epoch,4.0
loss,
val_accuracy,0.07
val_loss,


In [2]:
import wandb
import numpy as np
from keras.datasets import fashion_mnist  # Allowed for dataset loading

# 🔹 WandB Login
wandb.login()

# 🔹 Activation Functions & Their Derivatives
def sigmoid(x): return 1 / (1 + np.exp(-x))
def tanh(x): return np.tanh(x)
def relu(x): return np.maximum(0, x)

def sigmoid_derivative(x): return x * (1 - x)
def tanh_derivative(x): return 1 - np.tanh(x) ** 2
def relu_derivative(x): return np.where(x > 0, 1, 0)

activations = {"sigmoid": (sigmoid, sigmoid_derivative),
               "tanh": (tanh, tanh_derivative),
               "ReLU": (relu, relu_derivative)}

# 🔹 Weight Initialization
def initialize_weights(layers, method="Xavier"):
    weights, biases = [], []
    for i in range(len(layers) - 1):
        if method == "random":
            weights.append(np.random.randn(layers[i], layers[i+1]) * 0.01)
        elif method == "Xavier":
            weights.append(np.random.randn(layers[i], layers[i+1]) * np.sqrt(1 / layers[i]))
        elif method == "He":  # He initialization for ReLU
            weights.append(np.random.randn(layers[i], layers[i+1]) * np.sqrt(2 / layers[i]))
        biases.append(np.zeros((1, layers[i+1])))
    return weights, biases


# 🔹 Forward Pass
def forward_pass(X, weights, biases, activation):
    activ, deriv = activations[activation]
    activations_list = [X]
    pre_activations = []

    for W, b in zip(weights, biases):
        Z = np.dot(activations_list[-1], W) + b
        pre_activations.append(Z)
        A = activ(Z)
        activations_list.append(A)

    return activations_list, pre_activations

# 🔹 Backpropagation
def backward_pass(activations_list, pre_activations, y, weights, biases, activation):
    m = y.shape[0]
    activ, deriv = activations[activation]

    # One-hot encode y
    y_one_hot = np.zeros((m, 10))
    y_one_hot[np.arange(m), y] = 1

    dZ = activations_list[-1] - y_one_hot  # Softmax gradient
    dW, db = [], []

    for i in range(len(weights) - 1, -1, -1):
        dW_i = np.dot(activations_list[i].T, dZ) / m
        db_i = np.sum(dZ, axis=0, keepdims=True) / m
        dW.insert(0, dW_i)
        db.insert(0, db_i)

        if i > 0:
            dZ = np.dot(dZ, weights[i].T) * deriv(pre_activations[i-1])

    return dW, db

# 🔹 Optimizer Class
class Optimizer:
    def __init__(self, method="sgd", lr=0.01):
        self.method = method
        self.lr = lr

    def update(self, weights, biases, dW, db):
        updated_weights, updated_biases = [], []
        for i, (W, b, dW_i, db_i) in enumerate(zip(weights, biases, dW, db)):
            W -= self.lr * dW_i
            b -= self.lr * db_i
            updated_weights.append(W)
            updated_biases.append(b)
        return updated_weights, updated_biases

# 🔹 Load Dataset
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
X_train, X_test = X_train / 255.0, X_test / 255.0
val_size = int(0.1 * len(X_train))
X_val, y_val = X_train[:val_size], y_train[:val_size]
X_train, y_train = X_train[val_size:], y_train[val_size:]

# 🔹 Train Function with Correct Logging
def train():
    run = wandb.init(project="DA6401-Assignment-1", group="Q4", config=wandb.config)

    # Constructing meaningful name for this sweep
    config = wandb.config
    sweep_name = f"hl_{config.hidden_layers}_bs_{config.batch_size}_ac_{config.activation}"
    run.name = sweep_name

    layers = [784] + [config.hidden_size] * config.hidden_layers + [10]
    weights, biases = initialize_weights(layers, method=config.weight_init)
    optimizer = Optimizer(config.optimizer, config.learning_rate)

    # 🔹 Ensure logging per epoch
    for epoch in range(config.epochs):
        activations_list, pre_activations = forward_pass(X_train.reshape(-1, 784), weights, biases, config.activation)
        dW, db = backward_pass(activations_list, pre_activations, y_train, weights, biases, config.activation)
        weights, biases = optimizer.update(weights, biases, dW, db)

        # Compute Training Metrics
        train_preds = np.argmax(activations_list[-1], axis=1)
        train_accuracy = np.mean(train_preds == y_train)
        train_loss = -np.mean(np.log(activations_list[-1][np.arange(len(y_train)), y_train] + 1e-9))  # Avoid log(0)

        # Compute Validation Metrics
        val_activations, _ = forward_pass(X_val.reshape(-1, 784), weights, biases, config.activation)
        val_preds = np.argmax(val_activations[-1], axis=1)
        val_accuracy = np.mean(val_preds == y_val)
        val_loss = -np.mean(np.log(val_activations[-1][np.arange(len(y_val)), y_val] + 1e-9))  # Avoid log(0)

        # 🔹 Corrected WandB Logging for Line Plots
        wandb.log({
            "epoch": epoch,                # Ensures tracking over time
            "loss": train_loss,            # Scalar
            "accuracy": train_accuracy,    # Scalar
            "val_loss": val_loss,          # Scalar
            "val_accuracy": val_accuracy   # Scalar
        }, step=epoch)  # Ensures proper epoch-based tracking

    run.finish()

# 🔹 WandB Sweep Setup
sweep_config = {
    "method": "random",
    "metric": {"name": "val_accuracy", "goal": "maximize"},
    "parameters": {
        "epochs": {"values": [5, 10]},
        "hidden_layers": {"values": [3, 4, 5]},
        "hidden_size": {"values": [32, 64, 128]},
        "batch_size": {"values": [16, 32, 64]},
        "weight_init": {"values": ["random", "Xavier"]},
        "activation": {"values": ["sigmoid", "tanh", "ReLU"]},
        "optimizer": {"values": ["sgd", "adam"]},
        "learning_rate": {"values": [1e-3, 1e-4]}
    }
}

# 🔹 Run WandB Sweep
sweep_id = wandb.sweep(sweep_config, project="DA6401-Assignment-1")
wandb.agent(sweep_id, function=train, count=10)  # Run 10 different configurations


Create sweep with ID: rjydpx8s
Sweep URL: https://wandb.ai/subikksha-indian-institute-of-technology-madras/DA6401-Assignment-1/sweeps/rjydpx8s


[34m[1mwandb[0m: Agent Starting Run: ffhiss6b with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▂▃▃▄▅▆▆▇█
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▂▃▃▄▅▆▆▇█

0,1
accuracy,0.09996
epoch,9.0
loss,0.71532
val_accuracy,0.10033
val_loss,0.71715


[34m[1mwandb[0m: Agent Starting Run: hyslnsa9 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▂▃▃▄▅▆▆▇█
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▂▃▃▄▅▆▆▇█

0,1
accuracy,0.09996
epoch,9.0
loss,0.73966
val_accuracy,0.10033
val_loss,0.74605


[34m[1mwandb[0m: Agent Starting Run: p7zn23l5 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
accuracy,█▆▅▃▁
epoch,▁▃▅▆█
loss,█▆▄▂▁
val_accuracy,█▇▆▄▁
val_loss,█▆▄▃▁

0,1
accuracy,0.0987
epoch,4.0
loss,13.76204
val_accuracy,0.09533
val_loss,13.86203


[34m[1mwandb[0m: Agent Starting Run: 7u2ixd0r with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▂▃▃▄▅▆▆▇█
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▂▃▃▄▅▆▆▇█

0,1
accuracy,0.1003
epoch,9.0
loss,0.57643
val_accuracy,0.09733
val_loss,0.57805


[34m[1mwandb[0m: Agent Starting Run: fk1v3y6c with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▂▃▃▄▅▆▆▇█
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▂▃▃▄▅▆▆▇█

0,1
accuracy,0.10019
epoch,9.0
loss,0.71904
val_accuracy,0.09833
val_loss,0.72238


[34m[1mwandb[0m: Agent Starting Run: 5at46t8t with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▃▅▆█
val_accuracy,▁▁▁▁▁
val_loss,▁▃▅▆█

0,1
accuracy,0.09978
epoch,4.0
loss,0.71192
val_accuracy,0.102
val_loss,0.71833


[34m[1mwandb[0m: Agent Starting Run: 2hk6uaeq with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_init: Xavier


  train_loss = -np.mean(np.log(activations_list[-1][np.arange(len(y_train)), y_train] + 1e-9))  # Avoid log(0)
  val_loss = -np.mean(np.log(val_activations[-1][np.arange(len(y_val)), y_val] + 1e-9))  # Avoid log(0)


0,1
accuracy,▁▃▅▆█
epoch,▁▃▅▆█
val_accuracy,▁▄▆▇█

0,1
accuracy,0.17954
epoch,4.0
loss,
val_accuracy,0.1755
val_loss,


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: d11jteye with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: Xavier


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▃▅▆█
val_accuracy,▁▁▁▁▁
val_loss,▁▃▅▆█

0,1
accuracy,0.09969
epoch,4.0
loss,0.78482
val_accuracy,0.10283
val_loss,0.78324


[34m[1mwandb[0m: Agent Starting Run: e5bgiekb with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▃▄▆▇███▇▆
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▆▄▄▃▂▂▁▁
val_accuracy,▁▂▅▆▇▇██▇▄
val_loss,█▆▅▄▃▃▂▂▁▁

0,1
accuracy,0.11513
epoch,9.0
loss,9.31536
val_accuracy,0.10617
val_loss,9.21009


[34m[1mwandb[0m: Agent Starting Run: apawsqg3 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▂▃▃▄▅▆▆▇█
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▂▃▃▄▅▆▆▇█

0,1
accuracy,0.1003
epoch,9.0
loss,0.70546
val_accuracy,0.09733
val_loss,0.70726
