<a href="https://colab.research.google.com/github/subikkshas/DA6401/blob/main/DLass1Q10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install wandb
import wandb
from keras.datasets import mnist
import numpy as np
from  matplotlib import pyplot as plt
import time
import math
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

# Load the dataset
dataset = mnist.load_data()
(X_train_and_validation, y_train_and_validation), (X_test, y_test) = dataset

# Split the dataset into training, validation, and test sets
X_train, X_validation, y_train, y_validation = train_test_split(X_train_and_validation, y_train_and_validation, test_size=0.1, random_state=42)

# Normalize the data
X_train = (X_train / 255.0).astype(np.float32)
X_validation = (X_validation / 255.0).astype(np.float32)
X_test = (X_test / 255.0).astype(np.float32)

# Print dataset shapes
print("Train Dataset Shape: ", X_train.shape)
print("Train Target Vector Shape: ", y_train.shape)
print("Test Dataset Shape:", X_test.shape)
print("Test Target Vector Shape", y_test.shape)
print("Validation Dataset Shape:", X_validation.shape)
print("Validation Target Vector Shape", y_validation.shape)

# Reshape the data
X_train = np.array(X_train.reshape(X_train.shape[0], 784, 1))
X_test = np.array(X_test.reshape(X_test.shape[0], 784, 1))
X_validation = np.array(X_validation.reshape(X_validation.shape[0], 784, 1))

# Initialize parameters
def param(layers, init_type):
    W = []
    B = []
    for i in range(len(layers) - 1):
        W = layer_init(W, layers[i + 1], layers[i], init_type)
        B = layer_init(B, layers[i + 1], 1, init_type)
    return W, B

# Initialize layers
def layer_init(arr, n1, n2, init_type):
    np.random.seed(10)
    if init_type == "random":
        arr.append(np.random.randn(n1, n2) * 0.1)
    elif init_type == "xavier":
        arr.append(np.random.randn(n1, n2) * np.sqrt(2 / (n1 + n2)))
    return arr

# Initialize parameters
def param(layers, init_type):
    W = []
    B = []
    for i in range(len(layers) - 1):
        W = layer_init(W, layers[i + 1], layers[i], init_type)
        B = layer_init(B, layers[i + 1], 1, init_type)
    return W, B

# One-hot encoding
def one_hot(y, num_output_nodes):
    v = np.zeros((num_output_nodes, len(y)))
    for i, j in enumerate(y):
        v[j, i] = 1
    return v

# Loss function
def loss(y, y_hat, l_type, W=None, reg=0, n_class=10):
    if l_type == 'cross_entropy':
        err = -1 * np.sum(np.multiply(one_hot(y, n_class), np.log(y_hat))) / one_hot(y, n_class).shape[1]
    elif l_type == 'squared_error':
        err = np.sum((one_hot(y, n_class) - y_hat) ** 2) / (2 * one_hot(y, n_class).shape[1])

    if W:
        r = 0
        for i in range(len(W)):
            r += np.sum((np.array(W, dtype=object) ** 2)[i])
        err = err + reg * r
    return err

# Evaluate accuracy
def eval_acc(y_hat, y_true):
    return np.mean(np.argmax(y_hat, axis=0) == y_true) * 100

# Activation functions
def activation(activation_function):
    if activation_function == 'sigmoid':
        return sigmoid
    elif activation_function == 'tanh':
        return tanh
    elif activation_function == 'ReLU':
        return relu

def sigmoid(x, derivative=False):
    if derivative:
        return sigmoid(x) * (1 - sigmoid(x))
    return 1 / (1 + np.exp(-x))

def tanh(x, derivative=False):
    if derivative:
        return 1 - tanh(x) ** 2
    return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

def relu(x, derivative=False):
    if derivative:
        return (x > 0) * 1
    return x * (x > 0)

def softmax(x, derivative=False):
    if derivative:
        return softmax(x) * (1 - softmax(x))
    return np.exp(x) / np.sum(np.exp(x), axis=0)

# Forward propagation
def forward(x, W, B, activation_type):
    h = []
    a = []
    sigma = activation(activation_type)  # Activation function
    h.append(x)  # h0 = x
    a.append(np.dot(W[0], h[0]) + B[0])
    for i in range(len(W) - 1):
        h.append(sigma(a[-1]))
        a.append(np.dot(W[i + 1], h[-1]) + B[i + 1])
    y_hat = softmax(a[-1])
    return y_hat, h, a

# Backpropagation
def back_prop(x, y, y_hat, a, h, W, B, batch_size, l_type, act_type):
    grad_h, grad_a, grad_W, grad_B = [0] * len(h), [0] * len(a), [0] * len(W), [0] * len(B)
    sigma = activation(act_type)

    if l_type == "cross_entropy":
        grad_h[-1] = -1 * (y / y_hat)
        grad_a[-1] = -1 * (y - y_hat)
    elif l_type == "squared_error":
        grad_h[-1] = y_hat - y
        grad_a[-1] = (y_hat - y) * softmax(a[-1]) * (1 - softmax(a[-1]))

    for i in range(len(W) - 1, -1, -1):
        grad_W[i] = np.dot(grad_a[i], h[i].T)
        grad_B[i] = np.dot(grad_a[i], np.ones((grad_a[i].shape[1], 1)))
        if i > 0:
            grad_h[i - 1] = np.dot(W[i].T, grad_a[i])
            grad_a[i - 1] = np.multiply(grad_h[i - 1], sigma(a[i - 1], derivative=True))

    return grad_W, grad_B, grad_h, grad_a

# Evaluate the model on the test set
def evaluate_test_set(W, B, X_test, y_test, activation_type):
    y_test_hat, _, _ = forward(np.squeeze(X_test).T, W, B, activation_type)
    test_acc = eval_acc(y_test_hat, y_test)
    test_loss = loss(y_test, y_test_hat, loss_type, W=None, reg=reg_lamda, n_class=10)
    print(f"Test Accuracy: {test_acc:.4f}, Test Loss: {test_loss:.4f}")
    return test_acc, test_loss

# SGD step
def sgd_step(W, B, grad_W, grad_B, lr, reg):
    W = np.array(W, dtype=object)
    B = np.array(B, dtype=object)
    W -= lr * reg * W + lr * np.array(grad_W, dtype=object)
    B -= lr * reg * B + lr * np.array(grad_B, dtype=object)
    return W.tolist(), B.tolist()

# Momentum step
def momentum_step(w, b, gW, gB, lr=0.001, gamma=0.9, reg=0):
    params = {'w': w, 'b': b}
    Wmoments = [np.zeros_like(p) for p in params['w']]
    Bmoments = [np.zeros_like(p) for p in params['b']]

    Wmoments = gamma * np.array(Wmoments, dtype=object) + lr * np.array(gW, dtype=object)
    W = (1 - lr * reg) * np.array(params['w'], dtype=object) - Wmoments
    Wmoments = Wmoments.tolist()

    Bmoments = gamma * np.array(Bmoments, dtype=object) + lr * np.array(gB, dtype=object)
    B = (1 - lr * reg) * np.array(params['b'], dtype=object) - Bmoments
    Bmoments = Bmoments.tolist()

    return W.tolist(), B.tolist()

# RMSprop step
def RMSprop_step(w, b, gW, gB, lr=0.01, beta=0.99):
    params = {'w': w, 'b': b}
    vW = [np.zeros_like(p) for p in params['w']]
    vB = [np.zeros_like(p) for p in params['b']]

    vW = beta * np.array(vW, dtype=object) + (1 - beta) * (np.array(gW, dtype=object) ** 2)
    W = np.array(params['w'], dtype=object) - (lr / ((vW + 1e-7) ** 0.5)) * np.array(gW, dtype=object)

    vB = beta * np.array(vB, dtype=object) + (1 - beta) * (np.array(gB, dtype=object) ** 2)
    B = np.array(params['b'], dtype=object) - (lr / ((vB + 1e-7) ** 0.5)) * np.array(gB, dtype=object)

    return W.tolist(), B.tolist()

def train(X_train, y_train, x_val, y_val, num_inputs_nodes, hidden_layers, out_num, init_type, epochs,
          batch_size, l_type, act_type, op_name, lr_rate, reg):
    # Ensure WandB starts fresh
    wandb.finish()

    run = wandb.init(project="DA6401-Assignment-1", name="training-run", reinit=True)

    in_node = [num_inputs_nodes]
    layers = []
    layers.extend(in_node)
    layers.extend(hidden_layers)
    layers.extend([out_num])

    W, B = param(layers, init_type)
    N = X_train.shape[0]
    n_batches = int(np.floor(N / batch_size))

    print(f"🔹 Training with Loss Type: {l_type}")
    for epoch in range(epochs):
        print(f"Epoch {epoch}: Using Loss Type: {l_type}")
        train_loss, train_accuracy, val_loss, val_accuracy = [], [], [], []
        l, acc, ds, steps = 0, 0, 0, 1

        while ds < N:
            mini_batch_size = min((N - ds), batch_size)
            x = np.squeeze(X_train[ds:ds + mini_batch_size]).T
            y = one_hot(y_train[ds:ds + mini_batch_size], out_num)

            y_hat, h, a = forward(x, W, B, act_type)
            grad_W, grad_B, grad_h, grad_a = back_prop(x, y, y_hat, a, h, W, B, batch_size, l_type, act_type)

            if op_name == 'sgd':
                W, B = sgd_step(W, B, grad_W, grad_B, lr_rate, reg)
            elif op_name == 'momentum':
                W, B = momentum_step(W, B, grad_W, grad_B, lr_rate, reg)
            elif op_name == 'rmsprop':
                W, B = RMSprop_step(W, B, grad_W, grad_B, lr_rate, reg)

            l += loss(y_train[ds:ds + mini_batch_size], y_hat, l_type, W, reg, out_num)
            acc += eval_acc(y_hat, y_train[ds:ds + mini_batch_size])

            steps += 1
            ds += batch_size

        l /= (n_batches + mini_batch_size)
        acc /= steps

        train_loss.append(l)
        train_accuracy.append(acc)

        y_val_hat, _, _ = forward(np.squeeze(x_val).T, W, B, act_type)
        val_acc = eval_acc(y_val_hat, y_val)
        val_l = loss(y_val, y_val_hat, l_type, W=None, reg=reg, n_class=out_num)

        val_accuracy.append(val_acc)
        val_loss.append(val_l)

        wandb.log({"epoch": epoch, "Train_loss": l, "Train_acc": acc, "val_loss": val_l, "val_Accuracy": val_acc})

        print(f"Epoch {epoch}: Train Loss = {l:.4f}, Train Accuracy = {acc:.4f}, Val Loss = {val_l:.4f}, Val Accuracy = {val_acc:.4f}")

    wandb.finish()  # Explicitly finish the WandB run

    return W, B, train_loss, train_accuracy, val_loss, val_accuracy

# Configuration 1: SGD optimizer, learning rate = 0.01, batch size = 128, hidden layers = [128, 64]
wandb.init(project="DA6401-Assignment-1", name="config1-sgd")
epochs = 10
hidden_layers = [128, 64]
learning_rate = 0.01
weight_decay = 0.0005
optimizer_name = 'sgd'
batch_size = 128
init_type = 'xavier'
activation_type = 'ReLU'
loss_type = 'cross_entropy'
reg_lamda = 0.0005

# Train the model
W, B, train_loss, train_accuracy, val_loss, val_accuracy = train(
    X_train, y_train, X_validation, y_validation, 784, hidden_layers, 10, init_type, epochs, batch_size, loss_type, activation_type, optimizer_name, learning_rate, reg_lamda
)

# Evaluate on test set
test_acc, test_loss = evaluate_test_set(W, B, X_test, y_test, activation_type)

# Log test metrics within the same WandB run
wandb.init(project="DA6401-Assignment-1", name="config1-sgd")
wandb.log({"Test Accuracy": test_acc, "Test Loss": test_loss})


# Configuration 2: Adam optimizer, learning rate = 0.001, batch size = 64, hidden layers = [256, 128]
wandb.init(project="DA6401-Assignment-1", name="config2-adam")  # Initialize a new wandb run for this config
hidden_layers = [256, 128]
optimizer_name = 'adam'

# Train the model
W, B, train_loss, train_accuracy, val_loss, val_accuracy = train(
    X_train, y_train, X_validation, y_validation, 784, hidden_layers, 10, init_type, epochs, batch_size, loss_type, activation_type, optimizer_name, learning_rate, reg_lamda
)

# Evaluate on test set
test_acc, test_loss = evaluate_test_set(W, B, X_test, y_test, activation_type)

# Log test metrics within the same WandB run
wandb.init(project="DA6401-Assignment-1", name="config1-sgd")
wandb.log({"Test Accuracy": test_acc, "Test Loss": test_loss})


# Configuration 3: RMSprop optimizer, learning rate = 0.0005, batch size = 32, hidden layers = [128, 64, 32]
wandb.init(project="DA6401-Assignment-1", name="config3-rmsprop")  # Initialize a new wandb run for this config
hidden_layers = [128, 64, 32]
learning_rate = 0.0005
batch_size = 32
optimizer_name = 'rmsprop'

# Train the model
W, B, train_loss, train_accuracy, val_loss, val_accuracy = train(
    X_train, y_train, X_validation, y_validation, 784, hidden_layers, 10, init_type, epochs, batch_size, loss_type, activation_type, optimizer_name, learning_rate, reg_lamda
)

# Evaluate on test set
test_acc, test_loss = evaluate_test_set(W, B, X_test, y_test, activation_type)

# Log test metrics within the same WandB run
wandb.init(project="DA6401-Assignment-1", name="config1-sgd")
wandb.log({"Test Accuracy": test_acc, "Test Loss": test_loss})
wandb.finish()  # Finish the run after logging the test metrics

Train Dataset Shape:  (54000, 28, 28)
Train Target Vector Shape:  (54000,)
Test Dataset Shape: (10000, 28, 28)
Test Target Vector Shape (10000,)
Validation Dataset Shape: (6000, 28, 28)
Validation Target Vector Shape (6000,)


🔹 Training with Loss Type: cross_entropy
Epoch 0: Using Loss Type: cross_entropy
Epoch 0: Train Loss = 2.0727, Train Accuracy = 42.5234, Val Loss = 0.8532, Val Accuracy = 69.2000
Epoch 1: Using Loss Type: cross_entropy
Epoch 1: Train Loss = 1.3448, Train Accuracy = 84.4502, Val Loss = 0.3003, Val Accuracy = 91.6667
Epoch 2: Using Loss Type: cross_entropy
Epoch 2: Train Loss = 1.2013, Train Accuracy = 91.9511, Val Loss = 0.2232, Val Accuracy = 93.8333
Epoch 3: Using Loss Type: cross_entropy
Epoch 3: Train Loss = 1.1861, Train Accuracy = 93.3756, Val Loss = 0.2076, Val Accuracy = 94.0667
Epoch 4: Using Loss Type: cross_entropy
Epoch 4: Train Loss = 1.1914, Train Accuracy = 94.0869, Val Loss = 0.1970, Val Accuracy = 94.3500
Epoch 5: Using Loss Type: cross_entropy
Epoch 5: Train Loss = 1.2016, Train Accuracy = 94.4951, Val Loss = 0.1970, Val Accuracy = 94.3833
Epoch 6: Using Loss Type: cross_entropy
Epoch 6: Train Loss = 1.2109, Train Accuracy = 94.8793, Val Loss = 0.1929, Val Accuracy = 9

0,1
Train_acc,▁▇████████
Train_loss,█▂▁▁▁▁▁▁▁▂
epoch,▁▂▃▃▄▅▆▆▇█
val_Accuracy,▁▇████████
val_loss,█▂▂▁▁▁▁▁▁▁

0,1
Train_acc,95.5592
Train_loss,1.25964
epoch,9.0
val_Accuracy,95.05
val_loss,0.17436


Test Accuracy: 94.8800, Test Loss: 0.1958


0,1
Test Accuracy,▁
Test Loss,▁

0,1
Test Accuracy,94.88
Test Loss,0.19583


🔹 Training with Loss Type: cross_entropy
Epoch 0: Using Loss Type: cross_entropy
Epoch 0: Train Loss = 2.3010, Train Accuracy = 9.7892, Val Loss = 2.6077, Val Accuracy = 10.4000
Epoch 1: Using Loss Type: cross_entropy
Epoch 1: Train Loss = 2.3010, Train Accuracy = 9.7892, Val Loss = 2.6077, Val Accuracy = 10.4000
Epoch 2: Using Loss Type: cross_entropy
Epoch 2: Train Loss = 2.3010, Train Accuracy = 9.7892, Val Loss = 2.6077, Val Accuracy = 10.4000
Epoch 3: Using Loss Type: cross_entropy
Epoch 3: Train Loss = 2.3010, Train Accuracy = 9.7892, Val Loss = 2.6077, Val Accuracy = 10.4000
Epoch 4: Using Loss Type: cross_entropy
Epoch 4: Train Loss = 2.3010, Train Accuracy = 9.7892, Val Loss = 2.6077, Val Accuracy = 10.4000
Epoch 5: Using Loss Type: cross_entropy
Epoch 5: Train Loss = 2.3010, Train Accuracy = 9.7892, Val Loss = 2.6077, Val Accuracy = 10.4000
Epoch 6: Using Loss Type: cross_entropy
Epoch 6: Train Loss = 2.3010, Train Accuracy = 9.7892, Val Loss = 2.6077, Val Accuracy = 10.4000


0,1
Train_acc,▁▁▁▁▁▁▁▁▁▁
Train_loss,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
val_Accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
Train_acc,9.78924
Train_loss,2.30097
epoch,9.0
val_Accuracy,10.4
val_loss,2.60766


Test Accuracy: 9.8000, Test Loss: 2.6186


0,1
Test Accuracy,▁
Test Loss,▁

0,1
Test Accuracy,9.8
Test Loss,2.61862


🔹 Training with Loss Type: cross_entropy
Epoch 0: Using Loss Type: cross_entropy
Epoch 0: Train Loss = 0.6640, Train Accuracy = 87.1725, Val Loss = 0.2614, Val Accuracy = 93.2500
Epoch 1: Using Loss Type: cross_entropy
Epoch 1: Train Loss = 0.5850, Train Accuracy = 93.4706, Val Loss = 0.2587, Val Accuracy = 94.3000
Epoch 2: Using Loss Type: cross_entropy
Epoch 2: Train Loss = 0.6854, Train Accuracy = 94.2958, Val Loss = 0.2774, Val Accuracy = 94.9000
Epoch 3: Using Loss Type: cross_entropy
Epoch 3: Train Loss = 0.7638, Train Accuracy = 94.9915, Val Loss = 0.2887, Val Accuracy = 95.1833
Epoch 4: Using Loss Type: cross_entropy
Epoch 4: Train Loss = 0.8337, Train Accuracy = 95.5169, Val Loss = 0.2902, Val Accuracy = 95.5667
Epoch 5: Using Loss Type: cross_entropy


  grad_h[-1] = -1 * (y / y_hat)
  err = -1 * np.sum(np.multiply(one_hot(y, n_class), np.log(y_hat))) / one_hot(y, n_class).shape[1]
  err = -1 * np.sum(np.multiply(one_hot(y, n_class), np.log(y_hat))) / one_hot(y, n_class).shape[1]
  return np.exp(x) / np.sum(np.exp(x), axis=0)


Epoch 5: Train Loss = nan, Train Accuracy = 67.7231, Val Loss = nan, Val Accuracy = 10.4000
Epoch 6: Using Loss Type: cross_entropy
Epoch 6: Train Loss = nan, Train Accuracy = 9.8079, Val Loss = nan, Val Accuracy = 10.4000
Epoch 7: Using Loss Type: cross_entropy
Epoch 7: Train Loss = nan, Train Accuracy = 9.8079, Val Loss = nan, Val Accuracy = 10.4000
Epoch 8: Using Loss Type: cross_entropy
Epoch 8: Train Loss = nan, Train Accuracy = 9.8079, Val Loss = nan, Val Accuracy = 10.4000
Epoch 9: Using Loss Type: cross_entropy
Epoch 9: Train Loss = nan, Train Accuracy = 9.8079, Val Loss = nan, Val Accuracy = 10.4000


0,1
Train_acc,▇████▆▁▁▁▁
Train_loss,▃▁▄▆█
epoch,▁▂▃▃▄▅▆▆▇█
val_Accuracy,█████▁▁▁▁▁
val_loss,▂▁▅██

0,1
Train_acc,9.80795
Train_loss,
epoch,9.0
val_Accuracy,10.4
val_loss,


Test Accuracy: 9.8000, Test Loss: nan


0,1
Test Accuracy,▁

0,1
Test Accuracy,9.8
Test Loss,
