<a href="https://colab.research.google.com/github/subikkshas/DA6401/blob/main/DLass1Q8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:


from keras.datasets import fashion_mnist
import numpy as np
from matplotlib import pyplot as plt
import time
import math
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

import wandb
!wandb login

# Load the dataset
dataset = fashion_mnist.load_data()
(X_train_and_validation, y_train_and_validation), (X_test, y_test) = dataset

# Split the dataset into training, validation, and test sets
X_train, X_validation, y_train, y_validation = train_test_split(X_train_and_validation, y_train_and_validation, test_size=0.1, random_state=42)

# Normalize the data
X_train = (X_train / 255.0).astype(np.float32)
X_validation = (X_validation / 255.0).astype(np.float32)
X_test = (X_test / 255.0).astype(np.float32)

# Print dataset shapes
print("Train Dataset Shape: ", X_train.shape)
print("Train Target Vector Shape: ", y_train.shape)
print("Test Dataset Shape:", X_test.shape)
print("Test Target Vector Shape", y_test.shape)
print("Validation Dataset Shape:", X_validation.shape)
print("Validation Target Vector Shape", y_validation.shape)

# Reshape the data
X_train = np.array(X_train.reshape(X_train.shape[0], 784, 1))
X_test = np.array(X_test.reshape(X_test.shape[0], 784, 1))
X_validation = np.array(X_validation.reshape(X_validation.shape[0], 784, 1))

# Initialize layers
def layer_init(arr, n1, n2, init_type):
    np.random.seed(10)
    if init_type == "random":
        arr.append(np.random.randn(n1, n2) * 0.1)
    elif init_type == "xavier":
        arr.append(np.random.randn(n1, n2) * np.sqrt(2 / (n1 + n2)))
    return arr

# Initialize parameters
def param(layers, init_type):
    W = []
    B = []
    for i in range(len(layers) - 1):
        W = layer_init(W, layers[i + 1], layers[i], init_type)
        B = layer_init(B, layers[i + 1], 1, init_type)
    return W, B

# Activation functions
def activation(activation_function):
    if activation_function == 'sigmoid':
        return sigmoid
    elif activation_function == 'tanh':
        return tanh
    elif activation_function == 'ReLU':
        return relu

def sigmoid(x, derivative=False):
    if derivative:
        return sigmoid(x) * (1 - sigmoid(x))
    return 1 / (1 + np.exp(-x))

def tanh(x, derivative=False):
    if derivative:
        return 1 - tanh(x) ** 2
    return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

def relu(x, derivative=False):
    if derivative:
        return (x > 0) * 1
    return x * (x > 0)

def softmax(x, derivative=False):
    if derivative:
        return softmax(x) * (1 - softmax(x))
    return np.exp(x) / np.sum(np.exp(x), axis=0)

# One-hot encoding
def one_hot(y, num_output_nodes):
    v = np.zeros((num_output_nodes, len(y)))
    for i, j in enumerate(y):
        v[j, i] = 1
    return v

# Forward propagation
def forward(x, W, B, activation_type):
    h = []
    a = []
    sigma = activation(activation_type)  # Activation function
    h.append(x)  # h0 = x
    a.append(np.dot(W[0], h[0]) + B[0])
    for i in range(len(W) - 1):
        h.append(sigma(a[-1]))
        a.append(np.dot(W[i + 1], h[-1]) + B[i + 1])
    y_hat = softmax(a[-1])
    return y_hat, h, a

# Loss function
def loss(y, y_hat, l_type, W=None, reg=0, n_class=10):
    if l_type == 'cross_entropy':
        err = -1 * np.sum(np.multiply(one_hot(y, n_class), np.log(y_hat))) / one_hot(y, n_class).shape[1]
    elif l_type == 'squared_error':
        err = np.sum((one_hot(y, n_class) - y_hat) ** 2) / (2 * one_hot(y, n_class).shape[1])

    if W:
        r = 0
        for i in range(len(W)):
            r += np.sum((np.array(W, dtype=object) ** 2)[i])
        err = err + reg * r
    return err

# Evaluate accuracy
def eval_acc(y_hat, y_true):
    return np.mean(np.argmax(y_hat, axis=0) == y_true) * 100

# Backpropagation
def back_prop(x, y, y_hat, a, h, W, B, batch_size, l_type, act_type):
    grad_h, grad_a, grad_W, grad_B = [0] * len(h), [0] * len(a), [0] * len(W), [0] * len(B)
    sigma = activation(act_type)

    if l_type == "cross_entropy":
        grad_h[-1] = -1 * (y / y_hat)
        grad_a[-1] = -1 * (y - y_hat)
    elif l_type == "squared_error":
        grad_h[-1] = y_hat - y
        grad_a[-1] = (y_hat - y) * softmax(a[-1]) * (1 - softmax(a[-1]))

    for i in range(len(W) - 1, -1, -1):
        grad_W[i] = np.dot(grad_a[i], h[i].T)
        grad_B[i] = np.dot(grad_a[i], np.ones((grad_a[i].shape[1], 1)))
        if i > 0:
            grad_h[i - 1] = np.dot(W[i].T, grad_a[i])
            grad_a[i - 1] = np.multiply(grad_h[i - 1], sigma(a[i - 1], derivative=True))

    return grad_W, grad_B, grad_h, grad_a

# SGD step
def sgd_step(W, B, grad_W, grad_B, lr, reg):
    W = np.array(W, dtype=object)
    B = np.array(B, dtype=object)
    W -= lr * reg * W + lr * np.array(grad_W, dtype=object)
    B -= lr * reg * B + lr * np.array(grad_B, dtype=object)
    return W.tolist(), B.tolist()

# Momentum step
def momentum_step(w, b, gW, gB, lr=0.001, gamma=0.9, reg=0):
    params = {'w': w, 'b': b}
    Wmoments = [np.zeros_like(p) for p in params['w']]
    Bmoments = [np.zeros_like(p) for p in params['b']]

    Wmoments = gamma * np.array(Wmoments, dtype=object) + lr * np.array(gW, dtype=object)
    W = (1 - lr * reg) * np.array(params['w'], dtype=object) - Wmoments
    Wmoments = Wmoments.tolist()

    Bmoments = gamma * np.array(Bmoments, dtype=object) + lr * np.array(gB, dtype=object)
    B = (1 - lr * reg) * np.array(params['b'], dtype=object) - Bmoments
    Bmoments = Bmoments.tolist()

    return W.tolist(), B.tolist()

# RMSprop step
def RMSprop_step(w, b, gW, gB, lr=0.01, beta=0.99):
    params = {'w': w, 'b': b}
    vW = [np.zeros_like(p) for p in params['w']]
    vB = [np.zeros_like(p) for p in params['b']]

    vW = beta * np.array(vW, dtype=object) + (1 - beta) * (np.array(gW, dtype=object) ** 2)
    W = np.array(params['w'], dtype=object) - (lr / ((vW + 1e-7) ** 0.5)) * np.array(gW, dtype=object)

    vB = beta * np.array(vB, dtype=object) + (1 - beta) * (np.array(gB, dtype=object) ** 2)
    B = np.array(params['b'], dtype=object) - (lr / ((vB + 1e-7) ** 0.5)) * np.array(gB, dtype=object)

    return W.tolist(), B.tolist()

# Training function
def train(X_train, y_train, x_val, y_val, num_inputs_nodes, hidden_layers, out_num, init_type, epochs,
          batch_size, l_type, act_type, op_name, lr_rate, reg):
    # Ensure WandB starts fresh
    wandb.finish()

    run = wandb.init(project="DA6401-Assignment-1", name="training-run", reinit=True)

    in_node = [num_inputs_nodes]
    layers = []
    layers.extend(in_node)
    layers.extend(hidden_layers)
    layers.extend([out_num])

    W, B = param(layers, init_type)
    N = X_train.shape[0]
    n_batches = int(np.floor(N / batch_size))

    print(f"🔹 Training with Loss Type: {l_type}")
    for epoch in range(epochs):
        print(f"Epoch {epoch}: Using Loss Type: {l_type}")
        train_loss, train_accuracy, val_loss, val_accuracy = [], [], [], []
        l, acc, ds, steps = 0, 0, 0, 1

        while ds < N:
            mini_batch_size = min((N - ds), batch_size)
            x = np.squeeze(X_train[ds:ds + mini_batch_size]).T
            y = one_hot(y_train[ds:ds + mini_batch_size], out_num)

            y_hat, h, a = forward(x, W, B, act_type)
            grad_W, grad_B, grad_h, grad_a = back_prop(x, y, y_hat, a, h, W, B, batch_size, l_type, act_type)

            if op_name == 'sgd':
                W, B = sgd_step(W, B, grad_W, grad_B, lr_rate, reg)
            elif op_name == 'momentum':
                W, B = momentum_step(W, B, grad_W, grad_B, lr_rate, reg)
            elif op_name == 'rmsprop':
                W, B = RMSprop_step(W, B, grad_W, grad_B, lr_rate, reg)

            l += loss(y_train[ds:ds + mini_batch_size], y_hat, l_type, W, reg, out_num)
            acc += eval_acc(y_hat, y_train[ds:ds + mini_batch_size])

            steps += 1
            ds += batch_size

        l /= (n_batches + mini_batch_size)
        acc /= steps

        train_loss.append(l)
        train_accuracy.append(acc)

        y_val_hat, _, _ = forward(np.squeeze(x_val).T, W, B, act_type)
        val_acc = eval_acc(y_val_hat, y_val)
        val_l = loss(y_val, y_val_hat, l_type, W=None, reg=reg, n_class=out_num)

        val_accuracy.append(val_acc)
        val_loss.append(val_l)

        wandb.log({"epoch": epoch, "Train_loss": l, "Train_acc": acc, "val_loss": val_l, "val_Accuracy": val_acc})

        print(f"Epoch {epoch}: Train Loss = {l:.4f}, Train Accuracy = {acc:.4f}, Val Loss = {val_l:.4f}, Val Accuracy = {val_acc:.4f}")

    wandb.finish()  # Explicitly finish the WandB run

    return W, B, train_loss, train_accuracy, val_loss, val_accuracy

# Sweep configuration
sweep_configuration = {
    'method': "grid",  # Use grid search to ensure both loss types are used
    'metric': {'name': 'val_Accuracy', 'goal': 'maximize'},
    'parameters': {
        'epochs': {'values': [5]},
        'hidden_layers': {'values': [[64, 32]]},
        'learning_rate': {'values': [1e-3]},
        'weight_decay': {'values': [0.0005]},
        'optimizer_name': {'values': ['sgd']},
        'batch_size': {'values': [16]},
        'init_type': {'values': ['xavier']},
        'activation_type': {'values': ['sigmoid']},
        'loss_type': {'values': ['cross_entropy', 'squared_error']}  # Both loss types
    }
}

# Sweep training function
def sweep_train():
    wandb.init(project="DA6401-Assignment-1", entity="MSE")
    config = wandb.config

    print(f"Running Sweep with Loss Function: {config.loss_type}")
    epochs = config.epochs
    hidden_layers = config.hidden_layers
    learning_rate = config.learning_rate
    weight_decay = config.weight_decay
    optimizer_name = config.optimizer_name
    batch_size = config.batch_size
    init_type = config.init_type
    activation_type = config.activation_type
    loss_type = config.loss_type
    reg_lamda = config.weight_decay

    wandb.run.name = "e_{}_hl_{}_lr_{}_wd_{}_o_{}_bs_{}_winit_{}_ac_{}_los_{}_r_{}".format(
        epochs, hidden_layers, learning_rate, weight_decay, optimizer_name, batch_size, init_type, activation_type, loss_type, reg_lamda
    )

    _, _, train_loss, train_accuracy, val_loss, val_accuracy = train(
        X_train, y_train, X_validation, y_validation, 784, hidden_layers, 10, init_type, epochs, batch_size, loss_type, activation_type, optimizer_name, learning_rate, reg_lamda
    )

# Run the sweep
sweep_id = wandb.sweep(sweep_configuration, project='DA6401-Assignment-1')
wandb.agent(sweep_id, function=sweep_train, project='DA6401-Assignment-1', count=2)

[34m[1mwandb[0m: Currently logged in as: [33msubikksha[0m ([33msubikksha-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
Train Dataset Shape:  (54000, 28, 28)
Train Target Vector Shape:  (54000,)
Test Dataset Shape: (10000, 28, 28)
Test Target Vector Shape (10000,)
Validation Dataset Shape: (6000, 28, 28)
Validation Target Vector Shape (6000,)
Create sweep with ID: w6e17oba
Sweep URL: https://wandb.ai/subikksha-indian-institute-of-technology-madras/DA6401-Assignment-1/sweeps/w6e17oba


[34m[1mwandb[0m: Agent Starting Run: dhwovvxp with config:
[34m[1mwandb[0m: 	activation_type: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: [64, 32]
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_type: cross_entropy
[34m[1mwandb[0m: 	optimizer_name: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005


Running Sweep with Loss Function: cross_entropy


🔹 Training with Loss Type: cross_entropy
Epoch 0: Using Loss Type: cross_entropy
Epoch 0: Train Loss = 1.7502, Train Accuracy = 48.0228, Val Loss = 1.1351, Val Accuracy = 65.5500
Epoch 1: Using Loss Type: cross_entropy
Epoch 1: Train Loss = 1.0679, Train Accuracy = 68.1206, Val Loss = 0.7920, Val Accuracy = 72.3833
Epoch 2: Using Loss Type: cross_entropy
Epoch 2: Train Loss = 0.8891, Train Accuracy = 74.0484, Val Loss = 0.6559, Val Accuracy = 76.5333
Epoch 3: Using Loss Type: cross_entropy
Epoch 3: Train Loss = 0.8149, Train Accuracy = 77.6103, Val Loss = 0.5825, Val Accuracy = 79.1500
Epoch 4: Using Loss Type: cross_entropy
Epoch 4: Train Loss = 0.7761, Train Accuracy = 80.1355, Val Loss = 0.5352, Val Accuracy = 80.8500


0,1
Train_acc,▁▅▇▇█
Train_loss,█▃▂▁▁
epoch,▁▃▅▆█
val_Accuracy,▁▄▆▇█
val_loss,█▄▂▂▁

0,1
Train_acc,80.13552
Train_loss,0.77607
epoch,4.0
val_Accuracy,80.85
val_loss,0.53522


[34m[1mwandb[0m: Agent Starting Run: w6cwclue with config:
[34m[1mwandb[0m: 	activation_type: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: [64, 32]
[34m[1mwandb[0m: 	init_type: xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_type: squared_error
[34m[1mwandb[0m: 	optimizer_name: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005


Running Sweep with Loss Function: squared_error


🔹 Training with Loss Type: squared_error
Epoch 0: Using Loss Type: squared_error
Epoch 0: Train Loss = 0.5309, Train Accuracy = 22.0546, Val Loss = 0.4417, Val Accuracy = 26.7500
Epoch 1: Using Loss Type: squared_error
Epoch 1: Train Loss = 0.5209, Train Accuracy = 29.5635, Val Loss = 0.4292, Val Accuracy = 32.4833
Epoch 2: Using Loss Type: squared_error
Epoch 2: Train Loss = 0.5072, Train Accuracy = 31.7276, Val Loss = 0.4084, Val Accuracy = 28.5833
Epoch 3: Using Loss Type: squared_error
Epoch 3: Train Loss = 0.4921, Train Accuracy = 32.7921, Val Loss = 0.3857, Val Accuracy = 38.1500
Epoch 4: Using Loss Type: squared_error
Epoch 4: Train Loss = 0.4851, Train Accuracy = 42.0690, Val Loss = 0.3663, Val Accuracy = 45.8000


0,1
Train_acc,▁▄▄▅█
Train_loss,█▆▄▂▁
epoch,▁▃▅▆█
val_Accuracy,▁▃▂▅█
val_loss,█▇▅▃▁

0,1
Train_acc,42.06902
Train_loss,0.48514
epoch,4.0
val_Accuracy,45.8
val_loss,0.36625
