In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
from torchvision import datasets

# ---- Hyperparameters ----
seed = 1131
lr = 0.01
# Network shape: 784 input pixels -> two hidden layers of 156 units -> 10 clothing classes
layers = [784, 156, 156, 10]
batch_size = 128
lam = 0.0018738
early_stopping = 10
max_epochs = 100

# Create the data folder up front if it is missing
# (added after hitting an error about the location not existing).
import os
os.makedirs('./data', exist_ok=True)

# Seed both NumPy and torch for reproducibility
np.random.seed(seed)
torch.manual_seed(seed)

# Number of target classes
num_classes = 10

# Load the Fashion-MNIST dataset (downloaded into ./data when not present)
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True)
test_dataset = datasets.FashionMNIST(root='./data', train=False, download=True)

print(train_dataset.data.shape, test_dataset.data.shape)

# Flatten each 28x28 image to a 784-vector and scale pixel values by 1/255
X_train = train_dataset.data.numpy().reshape(-1, 28 * 28).astype('float32') / 255.0
X_test = test_dataset.data.numpy().reshape(-1, 28 * 28).astype('float32') / 255.0

# Integer class labels
Y_train = train_dataset.targets.numpy()
Y_test = test_dataset.targets.numpy()

# Hold out the first 20% of the training samples for validation; keep the remaining 80% for training
validation_size = int(0.2 * X_train.shape[0])
X_validation = X_train[:validation_size]
Y_validation = Y_train[:validation_size]
X_train = X_train[validation_size:]
Y_train = Y_train[validation_size:]

# Keep references to the integer labels before they are replaced by one-hot versions
Y_train_orig, Y_validation_orig, Y_test_orig = Y_train, Y_validation, Y_test

# One-hot encoding of integer labels for multi-class classification
def one_hot_encode(labels, num_classes):
    """
    Map integer class labels to one-hot rows.

    labels      : integer label(s) used to index rows of the identity matrix
    num_classes : size of the identity matrix (number of classes)

    Returns the rows of the (num_classes x num_classes) identity matrix
    selected by `labels`.
    """
    identity = np.eye(num_classes)
    return identity[labels]

# Replace the integer labels of all three subsets with their one-hot encodings
Y_train, Y_validation, Y_test = (
    one_hot_encode(y, num_classes) for y in (Y_train, Y_validation, Y_test)
)

# ---- Standardizing the data ----

# Per-feature statistics are computed on the training subset only
X_train_mean = np.mean(X_train, axis=0)
X_train_std = np.std(X_train, axis=0)
X_train_std[X_train_std == 0] = 1  # constant features: avoid division by zero

# Apply the training statistics to the train, validation and test subsets
X_train, X_validation, X_test = (
    (X - X_train_mean) / X_train_std for X in (X_train, X_validation, X_test)
)

def init_uniform_1_over_sqrt_m(rng, m, n, dtype=np.float32):
    """
    Draw an (m, n) weight matrix from Unif(-1/sqrt(m), 1/sqrt(m)).

    rng   : generator providing `uniform(low, high, size=...)`
    m, n  : number of rows and columns of the weight matrix
    dtype : dtype of the returned arrays

    Returns (W, b) where b is a length-n vector of zeros.
    """
    limit = 1.0 / np.sqrt(m)
    weights = rng.uniform(low=-limit, high=limit, size=(m, n))
    biases = np.zeros(n, dtype=dtype)  # biases start at zero
    return weights.astype(dtype), biases

def init_uniform_sqrt6_over_m_plus_n(rng, m, n, dtype=np.float32):
    """
    Draw an (m, n) weight matrix from Unif(-sqrt(6/(m+n)), sqrt(6/(m+n))).

    rng   : generator providing `uniform(low, high, size=...)`
    m, n  : number of rows and columns of the weight matrix
    dtype : dtype of the returned arrays

    Returns (W, b) where b is a length-n vector of zeros.
    """
    limit = np.sqrt(6.0 / (m + n))
    weights = rng.uniform(low=-limit, high=limit, size=(m, n))
    biases = np.zeros(n, dtype=dtype)  # biases start at zero
    return weights.astype(dtype), biases

def relu(x):
    """Hidden-layer activation: element-wise maximum of `x` and zero."""
    floor = 0.0
    return np.maximum(x, floor)

def softmax(z):
    """
    Row-wise softmax of a 2-D array of scores.

    The row maximum is subtracted before exponentiating for numerical
    stability; each returned row sums to one.
    """
    shifted = z - np.max(z, axis=1, keepdims=True)
    exps = np.exp(shifted)
    row_totals = np.sum(exps, axis=1, keepdims=True)
    return exps / row_totals

def cross_entropy_onehot(p, t):
    """
    Mean cross-entropy between predicted probabilities and one-hot targets.

    p : (N, K) predicted probabilities
    t : (N, K) one-hot targets

    A small constant (1e-12) is added inside the log so that a zero
    probability does not produce -inf.
    """
    log_probs = np.log(p + 1e-12)
    per_sample = np.sum(t * log_probs, axis=1)
    return -np.mean(per_sample)

def build_params(layers, init_name, seed):
    """
    Create initial weights and biases for every pair of consecutive layers.

    layers    : sequence of layer sizes, e.g. [input, hidden..., output]
    init_name : "uniform_1_sqrt_m" or "uniform_sqrt6_m_plus_n"
    seed      : seed for the NumPy generator used to draw the weights

    Returns a dict {"W": [W1, ...], "b": [b1, ...]} with one entry per
    consecutive layer pair.

    Raises ValueError when a layer has to be initialized and `init_name`
    is not one of the two known strategies.
    """
    rng = np.random.default_rng(seed)
    params = {"W": [], "b": []}

    # consecutive (fan-in, fan-out) pairs
    for fan_in, fan_out in zip(layers[:-1], layers[1:]):
        if init_name == "uniform_sqrt6_m_plus_n":
            initializer = init_uniform_sqrt6_over_m_plus_n
        elif init_name == "uniform_1_sqrt_m":
            initializer = init_uniform_1_over_sqrt_m
        else:
            raise ValueError("Unknown init name")

        weights, biases = initializer(rng, fan_in, fan_out)
        params["W"].append(weights)
        params["b"].append(biases)

    return params
def forward_pass(X, W_list, b_list):
    """
    Run the network forward: ReLU on every hidden layer, softmax on the output.

    X      : (N, D) input batch
    W_list : [W1, W2, ..., WL]
    b_list : [b1, b2, ..., bL]

    Returns:
        P      : (N, K) softmax probabilities
        caches : list with one (A_prev, Z) pair per layer, where A_prev is
                 the layer's input and Z its pre-activation
    """
    caches = []
    activation = X

    # every layer except the last is a ReLU hidden layer
    for layer, W in enumerate(W_list[:-1]):
        pre_act = activation @ W + b_list[layer]
        caches.append((activation, pre_act))
        activation = relu(pre_act)

    # output layer: affine map followed by softmax
    logits = activation @ W_list[-1] + b_list[-1]
    caches.append((activation, logits))

    return softmax(logits), caches

def backward_pass(P, T, W_list, b_list, caches, lam=0.0):
    """
    Backpropagate through a ReLU network with a softmax/cross-entropy output.

    P      : (N, K) probs from forward_pass
    T      : (N, K) one-hot labels
    W_list : [W1, W2, ..., WL]
    b_list : [b1, b2, ..., bL] (not read here; kept for a uniform call signature)
    caches : list of (A_prev, Z) per layer, as returned by forward_pass
    lam    : L2 weight decay

    Returns:
        loss    : scalar CE + lam * sum ||W||^2
        grads_W : list of dW for each W (includes the 2*lam*W decay term)
        grads_b : list of db for each b (biases are not regularized)
    """
    L = len(W_list)
    N = T.shape[0]

    # loss = CE + lam * sum ||W||^2
    ce = cross_entropy_onehot(P, T)
    l2 = sum((W**2).sum() for W in W_list)
    loss = ce + lam * l2

    grads_W = [None] * L
    grads_b = [None] * L

    # output layer delta: (P - T)/N
    dZ = (P - T) / N
    A_prev, _ = caches[-1]           # input to the output layer
    grads_W[-1] = A_prev.T @ dZ + 2*lam*W_list[-1]
    grads_b[-1] = dZ.sum(axis=0)

    dA = dZ @ W_list[-1].T

    # hidden layers: L-2 ... 0
    for i in range(L-2, -1, -1):
        A_prev, Z = caches[i]
        # ReLU derivative is 1 where the pre-activation is positive, else 0.
        # Computed inline: the previously called `relu_grad` helper is not
        # defined in this notebook.
        dZ = dA * (Z > 0)
        grads_W[i] = A_prev.T @ dZ + 2*lam*W_list[i]
        grads_b[i] = dZ.sum(axis=0)
        dA = dZ @ W_list[i].T

    return loss, grads_W, grads_b

def evaluate_loss(X, T, W_list, b_list, lam):
    """
    Full objective on a dataset: cross-entropy plus lam * sum of squared weights.

    X, T           : inputs and one-hot targets
    W_list, b_list : network parameters
    lam            : L2 weight-decay coefficient
    """
    probs, _ = forward_pass(X, W_list, b_list)
    data_term = cross_entropy_onehot(probs, T)
    penalty = sum(np.sum(np.square(W)) for W in W_list)
    return data_term + lam * penalty

def predict_classes(X, W_list, b_list):
    """
    Hard class predictions for a dataset: for each row of X, the index of
    the largest softmax probability.
    """
    probs = forward_pass(X, W_list, b_list)[0]
    return probs.argmax(axis=1)

def train_one_model(X_train, Y_train,
                    X_val,   Y_val,
                    init_name, seed_init):
    """
    Train ONE network with a given initialization strategy.
    Uses:
      - mini-batch gradient descent
      - L2 weight decay (lam)
      - early stopping based on validation loss

    Hyperparameters are read from the module-level names `layers`, `lr`,
    `lam`, `batch_size`, `max_epochs` and `early_stopping`.

    Parameters:
      X_train, Y_train : training inputs and one-hot targets
      X_val, Y_val     : validation inputs and one-hot targets
      init_name        : initialization strategy name passed to build_params
      seed_init        : seed for both the weight init and the batch shuffling

    Returns:
      W_list, b_list      : parameters from the epoch with the lowest
                            validation loss, whether training stopped early
                            or ran for all max_epochs. If no epoch ever
                            improved on the initial best (e.g. the validation
                            loss was NaN throughout), the final-epoch
                            parameters are returned instead.
      train_losses        : list of train losses per epoch
      val_losses          : list of val losses per epoch
    """
    # initialize parameters
    params = build_params(layers, init_name, seed_init)
    W_list, b_list = params["W"], params["b"]

    rng = np.random.default_rng(seed_init)
    N = X_train.shape[0]

    best_val = np.inf
    best_W, best_b = None, None
    wait = 0

    train_losses = []
    val_losses = []

    for _ in range(max_epochs):
        # ---- mini-batch loop over a fresh shuffle of the training set ----
        idx = np.arange(N)
        rng.shuffle(idx)

        for start in range(0, N, batch_size):
            batch_idx = idx[start:start + batch_size]
            xb = X_train[batch_idx]
            tb = Y_train[batch_idx]

            # forward pass
            P, caches = forward_pass(xb, W_list, b_list)

            # backward pass (the per-batch loss is not needed here)
            _, grads_W, grads_b = backward_pass(
                P, tb, W_list, b_list, caches, lam=lam
            )

            # gradient descent update (in place)
            for i in range(len(W_list)):
                W_list[i] -= lr * grads_W[i]
                b_list[i] -= lr * grads_b[i]

        # ---- end of epoch: compute train/val loss ----
        train_loss = evaluate_loss(X_train, Y_train, W_list, b_list, lam)
        val_loss   = evaluate_loss(X_val,   Y_val,   W_list, b_list, lam)
        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # ---- early stopping ----
        # an epoch counts as an improvement only if it beats the best by > 1e-6
        if val_loss + 1e-6 < best_val:
            best_val = val_loss
            # snapshot copies: the live arrays keep being updated in place
            best_W = [W.copy() for W in W_list]
            best_b = [b.copy() for b in b_list]
            wait = 0
        else:
            wait += 1
            if wait >= early_stopping:
                break

    # Restore the best snapshot on every exit path, not only on early stop,
    # so the returned parameters match the documented "best on val" contract.
    # When no snapshot exists, keep the current parameters rather than None.
    if best_W is not None:
        W_list, b_list = best_W, best_b

    return W_list, b_list, train_losses, val_losses

def main():
    """
    Train and compare two networks that differ only in weight initialization.

    Model A uses Uniform(-1/sqrt(m), 1/sqrt(m)) with the module-level `seed`;
    Model B uses Uniform(-sqrt(6/(m+n)), sqrt(6/(m+n))) with `seed + 1`.
    Reads the module-level datasets `X_train`, `Y_train`, `X_validation`,
    `Y_validation`, `X_test`, and the integer labels `Y_train_orig` /
    `Y_test_orig`.

    For each model it trains via train_one_model, computes train/test
    misclassification error (in percent), prints the errors, and plots
    train/validation loss per epoch (one subplot per model).

    Returns:
        results : dict keyed by "A" and "B"; each value holds "W", "b",
                  "train_losses", "val_losses", "train_err", "test_err".
    """
    # (result key, init strategy, seed, human-readable description)
    model_specs = [
        ("A", "uniform_1_sqrt_m", seed, "Uniform(-1/sqrt(m), 1/sqrt(m))"),
        ("B", "uniform_sqrt6_m_plus_n", seed + 1, "Uniform(-sqrt(6/(m+n)), sqrt(6/(m+n)))"),
    ]

    results = {}

    # ---------- train and evaluate each model ----------
    for key, init_name, seed_init, _ in model_specs:
        W, b, train_losses, val_losses = train_one_model(
            X_train, Y_train,
            X_validation, Y_validation,
            init_name=init_name,
            seed_init=seed_init,
        )

        y_train_pred = predict_classes(X_train, W, b)
        y_test_pred = predict_classes(X_test, W, b)

        results[key] = {
            "W": W,
            "b": b,
            "train_losses": train_losses,
            "val_losses": val_losses,
            # misclassification rate in percent, against the integer labels
            "train_err": 100 * (1 - np.mean(y_train_pred == Y_train_orig)),
            "test_err": 100 * (1 - np.mean(y_test_pred == Y_test_orig)),
        }

    # ---------- print misclassification errors ----------
    print("=== Misclassification Errors ===")
    for key, _, _, description in model_specs:
        train_err = results[key]["train_err"]
        test_err = results[key]["test_err"]
        print(f"Model {key}: {description}")
        print(f"  Train error: {train_err:.2f}%")
        print(f"  Test  error: {test_err:.2f}%\n")

    # ---------- plot loss vs epochs, one subplot per model ----------
    fig, axes = plt.subplots(2, 1, figsize=(8, 6))

    for ax, (key, _, _, description) in zip(axes, model_specs):
        train_losses = results[key]["train_losses"]
        val_losses = results[key]["val_losses"]
        epochs = range(1, len(train_losses) + 1)

        ax.plot(epochs, train_losses, label=f"Train loss (Model {key})")
        ax.plot(epochs, val_losses, label=f"Validation loss (Model {key})")
        ax.set_xlabel("Epoch")
        ax.set_ylabel("Loss (CE + L2)")
        ax.set_title(f"Model {key}: {description}")
        ax.grid(True)
        ax.legend()

    fig.tight_layout()
    plt.show()
    return results

torch.Size([60000, 28, 28]) torch.Size([10000, 28, 28])


In [None]:
# Run the full experiment (train both models, print errors, plot losses)
# and keep the returned per-model results dict for later inspection.
results = main()