In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
from torchvision import datasets

In [2]:
# --- Reproducibility ---
seed = 1131

# --- Architecture ---
# 784 input pixels (28 x 28), two hidden layers of 156 units each,
# 10 outputs (one per clothing class).
layers = [784, 156, 156, 10]

# --- Optimisation ---
lr = 0.01              # gradient-descent step size
batch_size = 256       # samples per mini-batch
lam = 0.0018738        # L2 weight-decay coefficient
max_epochs = 30        # upper bound on training epochs
early_stopping = 10    # patience: epochs without validation improvement

In [3]:
# Workaround for an earlier error about a non-existent file/folder:
# make sure the ./data directory is present before it is used as the dataset root.
import os
# Create the data folder if it doesn't exist (no error if it already does)
os.makedirs('./data', exist_ok=True)

In [4]:
# Seed the global NumPy and PyTorch random number generators.
np.random.seed(seed)
torch.manual_seed(seed)

# Number of target classes.
num_classes = 10

# Load the Fashion-MNIST train and test splits (downloaded into ./data if needed).
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True)
test_dataset = datasets.FashionMNIST(root='./data', train=False, download=True)

print(train_dataset.data.shape, test_dataset.data.shape)

# Features: flatten each 28x28 image to a 784-vector and scale pixels by 1/255.
X_train = train_dataset.data.numpy().reshape(-1, 28 * 28).astype('float32') / 255.0
X_test = test_dataset.data.numpy().reshape(-1, 28 * 28).astype('float32') / 255.0

# Labels: integer class ids as NumPy arrays.
Y_train = train_dataset.targets.numpy()
Y_test = test_dataset.targets.numpy()

# Hold out the first 20% of the training samples (in stored order, no shuffle)
# as the validation set; the remaining 80% stays as the training set.
validation_size = int(0.2 * X_train.shape[0])
X_validation = X_train[:validation_size]
Y_validation = Y_train[:validation_size]
X_train = X_train[validation_size:]
Y_train = Y_train[validation_size:]

# Keep references to the integer labels; the Y_* names are re-bound to
# one-hot matrices further down.
Y_train_orig, Y_validation_orig, Y_test_orig = Y_train, Y_validation, Y_test

# One-hot encoding of integer labels for multi-class classification
def one_hot_encode(labels, num_classes):
    """Return one row of the num_classes x num_classes identity matrix per label.

    labels      : integer array of class ids used to index the identity rows
    num_classes : size of the identity matrix (number of columns in the result)
    """
    identity = np.eye(num_classes)
    return identity[labels]

# Replace the integer label vectors with their one-hot matrices.
Y_train, Y_validation, Y_test = [
    one_hot_encode(label_vec, num_classes)
    for label_vec in (Y_train, Y_validation, Y_test)
]

# ----- Standardization -----

# Per-feature statistics are computed on the training features only.
X_train_mean = X_train.mean(axis=0)
X_train_std = X_train.std(axis=0)
# Features with zero spread get std 1 so the division below is well-defined.
X_train_std[X_train_std == 0] = 1

# Apply the same training-set shift and scale to all three subsets.
X_train, X_validation, X_test = [
    (features - X_train_mean) / X_train_std
    for features in (X_train, X_validation, X_test)
]

torch.Size([60000, 28, 28]) torch.Size([10000, 28, 28])


In [5]:
def init_uniform_1_over_sqrt_m(rng, m, n, dtype=np.float32):
    """Draw an (m, n) weight matrix from Unif(-1/sqrt(m), 1/sqrt(m)) with zero biases.

    rng   : object providing ``uniform(low, high, size=...)`` (e.g. a NumPy Generator)
    m, n  : number of rows and columns of the weight matrix
    dtype : dtype of both returned arrays

    Returns (W, b) with W of shape (m, n) and b of shape (n,).
    """
    limit = 1.0 / np.sqrt(m)
    weights = rng.uniform(low=-limit, high=limit, size=(m, n))
    biases = np.zeros(n, dtype=dtype)
    return weights.astype(dtype), biases

In [6]:
def init_uniform_sqrt6_over_m_plus_n(rng, m, n, dtype=np.float32):
    """Draw an (m, n) weight matrix from Unif(-sqrt(6/(m+n)), sqrt(6/(m+n))) with zero biases.

    rng   : object providing ``uniform(low, high, size=...)`` (e.g. a NumPy Generator)
    m, n  : number of rows and columns of the weight matrix
    dtype : dtype of both returned arrays

    Returns (W, b) with W of shape (m, n) and b of shape (n,).
    """
    limit = np.sqrt(6.0 / (m + n))
    weights = rng.uniform(low=-limit, high=limit, size=(m, n))
    biases = np.zeros(n, dtype=dtype)
    return weights.astype(dtype), biases

In [7]:
def relu(x):
    """Hidden-layer activation: element-wise max(x, 0)."""
    rectified = np.maximum(x, 0.0)
    return rectified

In [8]:
def relu_grad(z):
    """Element-wise ReLU derivative: 1.0 where z > 0, otherwise 0.0 (as Python-float dtype)."""
    is_positive = z > 0
    return is_positive.astype(float)

In [9]:
def softmax(z):
    """Row-wise softmax of a 2-D array of scores.

    Each row has its maximum subtracted before exponentiation (numerical
    stability); the exponentials are then normalised to sum to 1 along axis 1.
    """
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    row_totals = np.sum(exp_scores, axis=1, keepdims=True)
    return exp_scores / row_totals

In [10]:
def cross_entropy_onehot(p, t):
    """Mean cross-entropy between predicted probabilities p and targets t.

    p : (N, K) probabilities
    t : (N, K) targets (one-hot in this notebook)

    A constant 1e-12 is added to p before the log so that log(0) is never taken.
    """
    log_probs = np.log(p + 1e-12)
    per_sample = np.sum(t * log_probs, axis=1)
    return -np.mean(per_sample)

In [11]:
def l2_penalty(W_list):
    """
    Squared Frobenius norm summed over every weight matrix:
    L2 = sum_l ||W^(l)||_F^2

    W_list : iterable of weight arrays. Returns 0.0 when it is empty.
    """
    # Accumulate from a float zero, one matrix at a time, in list order.
    return sum((np.sum(np.square(W)) for W in W_list), 0.0)


In [12]:
def build_params(layers, init_name, seed):
    """Create the weight matrices and bias vectors for a fully connected net.

    layers    : sequence of layer widths, e.g. [784, 156, 156, 10]; one
                (W, b) pair is created per consecutive pair of widths
    init_name : "uniform_1_sqrt_m" or "uniform_sqrt6_m_plus_n"
    seed      : seed for the NumPy Generator that draws the weights

    Returns:
      (W_list, b_list) -- two Python lists of equal length len(layers) - 1

    Raises:
      ValueError if init_name is not one of the two supported schemes.
      (Previously an unknown name fell through the if/elif chain and failed
      later with an unbound-variable error.)
    """
    supported = ("uniform_1_sqrt_m", "uniform_sqrt6_m_plus_n")
    if init_name not in supported:
        raise ValueError(
            f"Unknown init_name {init_name!r}; expected one of {supported}"
        )

    rng = np.random.default_rng(seed)
    W_list = []
    b_list = []

    # m = fan-in, n = fan-out of each layer
    for m, n in zip(layers[:-1], layers[1:]):
        if init_name == "uniform_1_sqrt_m":
            W, b = init_uniform_1_over_sqrt_m(rng, m, n)
        else:  # "uniform_sqrt6_m_plus_n"
            W, b = init_uniform_sqrt6_over_m_plus_n(rng, m, n)

        W_list.append(W)
        b_list.append(b)

    return W_list, b_list


In [13]:
#W, B = build_params(layers, "uniform_1_sqrt_m", seed)
#print(W)
#print(len(W))

In [14]:
def forward_pass(X, W_list, b_list):
    """
    Run a mini-batch through the MLP: ReLU on every layer except the last,
    softmax on the last.

    X      : (N, d_in) mini-batch
    W_list : [W1,...,WL] weight matrices (d_in x h1, h1 x h2, ..., h_{L-1} x K)
    b_list : [b1,...,bL] bias vectors   (h1,), ..., (K,)

    Returns:
      P      : (N, K) class probabilities
      caches : one (layer input, pre-activation) tuple per layer, in layer
               order; consumed by backward_pass
    """
    activation = X          # A^0 = X
    caches = []

    # Hidden layers: affine map followed by ReLU.
    num_hidden = len(W_list[:-1])
    for W, b in zip(W_list[:num_hidden], b_list[:-1]):
        pre_activation = activation @ W + b
        caches.append((activation, pre_activation))
        activation = relu(pre_activation)

    # Output layer: affine map followed by softmax.
    logits = activation @ W_list[-1] + b_list[-1]
    probabilities = softmax(logits)
    caches.append((activation, logits))

    return probabilities, caches


In [15]:
def backward_pass(P, T, W_list, b_list, caches, lam=0.0):
    """
    Back-propagation for the MLP evaluated by forward_pass
    (ReLU hidden layers, softmax output, cross-entropy + L2 loss).

    P      : (N, K) probabilities from forward_pass
    T      : (N, K) one-hot targets
    W_list : [W1,...,WL] weight matrices
    b_list : [b1,...,bL] bias vectors (accepted for symmetry; not read here)
    caches : list of (A_prev, Z) per layer, as returned by forward_pass
    lam    : L2 weight-decay coefficient

    Returns:
      loss    : cross-entropy + lam * sum of squared weights
      grads_W : list of dJ/dW, one per entry of W_list
      grads_b : list of dJ/db, one per layer

    The gradient w.r.t. a layer's input is only formed when a lower layer
    actually consumes it, so no (N, d_in) product is computed for the first
    layer (the previous version computed and discarded it on every call).
    """
    L = len(W_list)             # number of layers
    N = T.shape[0]              # batch size

    # ----- loss = CE + lam * ||W||^2 -----
    loss = cross_entropy_onehot(P, T) + lam * l2_penalty(W_list)

    # per-layer gradients (Python lists, not numpy arrays)
    grads_W = [None] * L
    grads_b = [None] * L

    # Output layer: dJ/dZ_L = (P - T) / N for softmax + mean cross-entropy.
    dZ = (P - T) / N            # (N, K)

    # Walk from the output layer (index L-1) down to the first layer (index 0).
    for i in range(L - 1, -1, -1):
        A_prev, Z = caches[i]   # input to layer i and its pre-activation

        if i != L - 1:
            # Pull the error back through layer i+1's weights, then through
            # this hidden layer's ReLU.
            dA = dZ @ W_list[i + 1].T
            dZ = dA * relu_grad(Z)

        # The L2 term lam * ||W||^2 contributes 2 * lam * W to the gradient.
        grads_W[i] = A_prev.T @ dZ + 2 * lam * W_list[i]
        grads_b[i] = dZ.sum(axis=0)

    return loss, grads_W, grads_b


In [16]:

def evaluate_loss(X, T, W_list, b_list, lam, eval_batch_size=2048):
    """
    Full-dataset loss (mean cross-entropy + lam * sum of squared weights),
    evaluated chunk by chunk.

    X, T            : features and one-hot targets, sliced along axis 0
    W_list, b_list  : network parameters passed to forward_pass
    lam             : L2 coefficient
    eval_batch_size : number of rows pushed through the network at a time
    """
    num_rows = X.shape[0]
    weighted_ce = 0.0   # sum over chunks of (chunk mean CE * chunk size)
    rows_seen = 0

    for lo in range(0, num_rows, eval_batch_size):
        hi = lo + eval_batch_size
        x_chunk, t_chunk = X[lo:hi], T[lo:hi]
        probs, _ = forward_pass(x_chunk, W_list, b_list)
        chunk_rows = x_chunk.shape[0]
        weighted_ce += cross_entropy_onehot(probs, t_chunk) * chunk_rows
        rows_seen += chunk_rows

    mean_ce = weighted_ce / rows_seen
    weight_sq_sum = sum((W**2).sum() for W in W_list)
    return mean_ce + lam * weight_sq_sum


In [17]:
def predict_classes(X, W_list, b_list, eval_batch_size=2048):
    """Predict a class index (arg-max probability) for every row of X.

    X is pushed through forward_pass in chunks of eval_batch_size rows and
    the per-chunk predictions are concatenated in order.
    """
    chunk_starts = range(0, X.shape[0], eval_batch_size)
    per_chunk = [
        np.argmax(forward_pass(X[lo:lo + eval_batch_size], W_list, b_list)[0], axis=1)
        for lo in chunk_starts
    ]
    return np.concatenate(per_chunk, axis=0)

In [None]:
# Smoke test of build_params / forward_pass / backward_pass on random data.
np.random.seed(0)

# Tiny network: 8 -> 4 -> 4 -> 3
layers_test = [8, 4, 4, 3]

# Tiny fake dataset: 5 samples, 8 features, 3 classes
# (features are drawn first, then the labels, from the global NumPy RNG)
X_fake = np.random.rand(5, 8).astype(np.float32)
T_fake = np.eye(3)[np.random.randint(0, 3, size=5)]

W_list, b_list = build_params(layers_test, "uniform_1_sqrt_m", seed=123)

# One forward pass followed by one backward pass
P, caches = forward_pass(X_fake, W_list, b_list)
loss, gW, gB = backward_pass(P, T_fake, W_list, b_list, caches, lam=0.01)

# Gradient shapes should line up with the parameter shapes.
print("Sanity loss:", loss)
print("W shapes:", [weights.shape for weights in W_list])
print("grad W shapes:", [grad.shape for grad in gW])
print("grad b shapes:", [grad.shape for grad in gB])


In [16]:
def train_one_model(X_train, Y_train,
                    X_val,   Y_val,
                    init_name, seed_init):
    """
    Train ONE network with:
      - mini-batch gradient descent
      - L2 weight decay
      - early stopping on validation loss

    X_train, Y_train : training features and one-hot targets
    X_val,   Y_val   : validation features and one-hot targets
    init_name        : initialisation scheme name forwarded to build_params
    seed_init        : seed for both the weight initialisation and the
                       per-epoch shuffling generator

    Reads the module-level hyper-parameters `layers`, `lr`, `lam`,
    `batch_size`, `max_epochs` and `early_stopping`.

    Returns:
      W_list, b_list, train_losses, val_losses
      where W_list / b_list are the parameters from the epoch with the
      lowest validation loss, whether training stopped early or ran for all
      max_epochs epochs.
    """

    # initialise parameters; build_params returns a (W_list, b_list) tuple
    W_list, b_list = build_params(layers, init_name, seed_init)

    rng = np.random.default_rng(seed_init)
    N = X_train.shape[0]

    best_val = np.inf
    best_W, best_b = None, None
    wait = 0    # epochs since the last validation improvement

    train_losses = []
    val_losses = []

    for epoch in range(1, max_epochs + 1):
        # --- shuffle indices ---
        idx = np.arange(N)
        rng.shuffle(idx)

        epoch_loss_sum = 0.0
        epoch_count = 0

        # --- mini-batch loop ---
        for start in range(0, N, batch_size):
            batch_idx = idx[start:start + batch_size]
            xb = X_train[batch_idx]
            tb = Y_train[batch_idx]

            # forward
            P, caches = forward_pass(xb, W_list, b_list)

            # backward
            loss_batch, grads_W, grads_b = backward_pass(
                P, tb, W_list, b_list, caches, lam=lam
            )

            # accumulate batch loss (for train loss curve), weighted by the
            # batch size because the last batch may be smaller
            bs = xb.shape[0]
            epoch_loss_sum += loss_batch * bs
            epoch_count += bs

            # update parameters (in place)
            for i in range(len(W_list)):
                W_list[i] -= lr * grads_W[i]
                b_list[i] -= lr * grads_b[i]

        # --- end of epoch: compute average train loss ---
        train_loss = epoch_loss_sum / epoch_count

        # and validation loss (in chunks, smaller set)
        val_loss = evaluate_loss(X_val, Y_val, W_list, b_list, lam)

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        print(f"Epoch {epoch:3d} | train loss: {train_loss:.4f} | val loss: {val_loss:.4f}")

        # --- early stopping ---
        # An improvement must beat the best loss by more than 1e-6.
        if val_loss + 1e-6 < best_val:
            best_val = val_loss
            # Copy: the live arrays keep being updated in place.
            best_W = [W.copy() for W in W_list]
            best_b = [b.copy() for b in b_list]
            wait = 0
        else:
            wait += 1
            if wait >= early_stopping:
                print("Early stopping triggered.")
                break

    # Return the best checkpoint on every exit path, not only on early stop.
    # best_W stays None only if no epoch ever improved on +inf (e.g. NaN loss
    # or max_epochs < 1); in that case the current parameters are returned.
    if best_W is not None:
        W_list, b_list = best_W, best_b

    return W_list, b_list, train_losses, val_losses

In [17]:
def _train_and_score(init_name, seed_init):
    """Train one model on the notebook's data splits and score it.

    Returns a dict with the trained parameters ("W", "b"), the per-epoch
    loss curves ("train_losses", "val_losses") and the misclassification
    rates in percent on the training and test sets ("train_err", "test_err").
    """
    W, b, train_losses, val_losses = train_one_model(
        X_train, Y_train,
        X_validation, Y_validation,
        init_name=init_name,
        seed_init=seed_init,
    )

    y_train_pred = predict_classes(X_train, W, b)
    y_test_pred = predict_classes(X_test, W, b)

    return {
        "W": W, "b": b,
        "train_losses": train_losses,
        "val_losses": val_losses,
        # error % = 100 * (1 - accuracy) against the integer labels
        "train_err": 100 * (1 - np.mean(y_train_pred == Y_train_orig)),
        "test_err": 100 * (1 - np.mean(y_test_pred == Y_test_orig)),
    }


def _plot_loss_curves(ax, model_result, tag, title):
    """Draw the train and validation loss curves of one model on `ax`."""
    train_losses = model_result["train_losses"]
    val_losses = model_result["val_losses"]
    epochs = range(1, len(train_losses) + 1)

    ax.plot(epochs, train_losses, label=f"Train loss (Model {tag})")
    ax.plot(epochs, val_losses, label=f"Validation loss (Model {tag})")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.set_title(title)
    ax.grid(True)
    ax.legend()


def main():
    """Train both initialisation variants, report their errors and plot losses.

    Model A uses the "uniform_1_sqrt_m" initialisation with `seed`;
    Model B uses "uniform_sqrt6_m_plus_n" with `seed + 1`.

    Returns:
      results : {"A": {...}, "B": {...}} where each entry holds "W", "b",
                "train_losses", "val_losses", "train_err" and "test_err".

    Requires matplotlib.pyplot to be imported as `plt` at notebook level.
    """
    desc_A = "Uniform(-1/sqrt(m), 1/sqrt(m))"
    desc_B = "Uniform(-sqrt(6/(m+n)), sqrt(6/(m+n)))"

    results = {}

    # --------- Model A ---------
    print("Training Model A (Uniform(-1/sqrt(m), 1/sqrt(m)))...")
    results["A"] = _train_and_score("uniform_1_sqrt_m", seed)

    # --------- Model B ---------
    print("\nTraining Model B (Uniform(-sqrt(6/(m+n)), sqrt(6/(m+n)))...)")
    results["B"] = _train_and_score("uniform_sqrt6_m_plus_n", seed + 1)

    # --------- print misclassification errors ---------
    print("\n=== Misclassification Errors ===")
    for tag, desc in (("A", desc_A), ("B", desc_B)):
        print(f"Model {tag}: {desc}")
        print(f"  Train error: {results[tag]['train_err']:.2f}%")
        print(f"  Test  error: {results[tag]['test_err']:.2f}%\n")

    # --------- plots: one panel per model, stacked vertically ---------
    fig, axes = plt.subplots(2, 1, figsize=(8, 6))
    _plot_loss_curves(axes[0], results["A"], "A", f"Model A: {desc_A}")
    _plot_loss_curves(axes[1], results["B"], "B", f"Model B: {desc_B}")

    fig.tight_layout()
    plt.show()

    return results


In [None]:
# Stand-alone NumPy smoke test: Fashion-MNIST is not touched here.

np.random.seed(0)

# Tiny 2-hidden-layer net with full-width input: 784 -> 4 -> 3 -> 10
layers_test = [784, 4, 3, 10]

# Tiny fake dataset: 5 samples, 784 features, 10 classes
X_fake = np.random.randn(5, 784).astype(np.float32)
T_fake = np.eye(10)[np.random.randint(0, 10, size=5)]   # random one-hot labels

# Plain scaled-Gaussian weights and zero biases (only meant for this test)
W_list = []
b_list = []
for m, n in zip(layers_test, layers_test[1:]):
    W_list.append(0.1 * np.random.randn(m, n).astype(np.float32))
    b_list.append(np.zeros(n, dtype=np.float32))

# One forward pass followed by one backward pass
P, caches = forward_pass(X_fake, W_list, b_list)
loss, gW, gB = backward_pass(P, T_fake, W_list, b_list, caches, lam=0.01)

print("Sanity loss:", loss)
print("grad W shapes:", [grad.shape for grad in gW])
print("grad b shapes:", [grad.shape for grad in gB])
