In [11]:
import numpy as np
import torch
from torchvision import datasets
import matplotlib.pyplot as plt

In [12]:
# Hyperparameters shared by the cells below.
seed = 1131  # base seed: passed to the NumPy / torch seeding calls and to the training runs
lr= 0.01  # step size of the gradient-descent parameter update
# Layer widths: 784 input pixels (28 * 28), two hidden layers of 156 units each,
# and 10 outputs (one per clothing category).
layers= [784, 156, 156, 10]
batch_size= 256 # number of samples per training mini-batch
lam = 0.0018738  # L2 coefficient: multiplies the sum of squared weights in the loss
early_stopping = 10 # patience: epochs without a better validation loss before training stops
max_epochs= 30  # upper bound on the number of training epochs

In [13]:
# Seed NumPy's legacy global RNG and torch so repeated runs start from the same state
np.random.seed(seed)
torch.manual_seed(seed)

# Number of target classes
num_classes=10

# Load the Fashion-MNIST dataset (download=True fetches it into ./data when needed)
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True)
test_dataset = datasets.FashionMNIST(root='./data', train=False, download=True)

print(train_dataset.data.shape, test_dataset.data.shape)

# Prepare the data as numpy arrays: flatten each 28x28 image into a 784-vector
# and divide by 255 to scale the pixel values
X_train = train_dataset.data.numpy().reshape(-1, 28 * 28).astype('float32') / 255.0
Y_train = train_dataset.targets.numpy()

X_test = test_dataset.data.numpy().reshape(-1, 28 * 28).astype('float32') / 255.0
Y_test = test_dataset.targets.numpy()

# Split the training set into train and validation sets (80% / 20%).
# The split is positional: the first 20% of rows become the validation set,
# the remaining rows stay in the training set (no shuffling happens here).
validation_size = int(0.2 * X_train.shape[0])
X_validation, Y_validation = X_train[:validation_size], Y_train[:validation_size]
X_train, Y_train = X_train[validation_size:], Y_train[validation_size:]

# Keep references to the integer labels before Y_* are rebound to one-hot matrices;
# the integer form is what predicted class indices are compared against later
Y_train_orig = Y_train
Y_validation_orig = Y_validation
Y_test_orig = Y_test
# Convert labels to one-hot encoding for multi-class classification
def one_hot_encode(labels, num_classes):
    """Turn integer class labels into one-hot rows.

    Row ``k`` of a ``num_classes`` x ``num_classes`` identity matrix is the
    one-hot vector for class ``k``, so indexing that matrix with ``labels``
    yields one row per label.
    """
    identity = np.eye(num_classes)
    return identity[labels]

# Rebind the label arrays to their one-hot form (the integer labels live on in Y_*_orig)
Y_train = one_hot_encode(Y_train, num_classes)
Y_validation = one_hot_encode(Y_validation, num_classes)
Y_test = one_hot_encode(Y_test, num_classes)

# Standardizing the data

# Per-feature mean and standard deviation, computed on the training rows only
X_train_mean = X_train.mean(axis=0)
X_train_std = X_train.std(axis=0)
X_train_std[X_train_std == 0] = 1  # constant features: divide by 1 instead of 0

# Standardize all three subsets with the *training* statistics
X_train = (X_train - X_train_mean) / X_train_std
X_validation = (X_validation - X_train_mean) / X_train_std
X_test = (X_test - X_train_mean) / X_train_std

torch.Size([60000, 28, 28]) torch.Size([10000, 28, 28])


In [14]:
# Initializations for W and b

def init_uniform_1_over_sqrt_m(rng, m, n, dtype=np.float32):
    """Sample a weight matrix from Unif(-1/sqrt(m), 1/sqrt(m)) with a zero bias.

    Parameters
    ----------
    rng : random generator exposing ``uniform`` (e.g. ``np.random.default_rng``).
    m : fan-in, i.e. the number of rows of the weight matrix.
    n : fan-out, i.e. the number of columns of the weight matrix.
    dtype : dtype of both returned arrays.

    Returns
    -------
    (W, b) : ``W`` has shape (m, n); ``b`` has shape (n,) and is all zeros.
    """
    limit = 1.0 / np.sqrt(m)  # half-width of the sampling interval
    weights = rng.uniform(low=-limit, high=limit, size=(m, n))
    biases = np.zeros(n, dtype=dtype)
    return weights.astype(dtype), biases


def init_uniform_sqrt6_over_m_plus_n(rng, m, n, dtype=np.float32):
    """Sample a weight matrix from Unif(-sqrt(6/(m+n)), sqrt(6/(m+n))) with a zero bias.

    Parameters
    ----------
    rng : random generator exposing ``uniform`` (e.g. ``np.random.default_rng``).
    m : fan-in, i.e. the number of rows of the weight matrix.
    n : fan-out, i.e. the number of columns of the weight matrix.
    dtype : dtype of both returned arrays.

    Returns
    -------
    (W, b) : ``W`` has shape (m, n); ``b`` has shape (n,) and is all zeros.
    """
    limit = np.sqrt(6.0 / (m + n))  # half-width of the sampling interval
    weights = rng.uniform(low=-limit, high=limit, size=(m, n))
    biases = np.zeros(n, dtype=dtype)
    return weights.astype(dtype), biases


def build_params(layers, init_name, seed):
    """Create the initial weights and biases for every layer of the network.

    Parameters
    ----------
    layers : sequence of layer widths ``[n_in, n_h1, ..., n_out]``.
    init_name : ``"uniform_1_sqrt_m"`` or ``"uniform_sqrt6_m_plus_n"``; selects
        which initialiser draws the weights.
    seed : seed for the NumPy generator all layers are drawn from.

    Returns
    -------
    (W_list, b_list) : ``W_list[i]`` has shape ``(layers[i], layers[i + 1])`` and
        ``b_list[i]`` has shape ``(layers[i + 1],)``.

    Raises
    ------
    ValueError : if ``init_name`` is not one of the two supported names.
    """
    # Reject unknown names up front. Falling through the if/elif below would
    # otherwise leave W and b unbound (or silently return empty lists).
    known_inits = ("uniform_1_sqrt_m", "uniform_sqrt6_m_plus_n")
    if init_name not in known_inits:
        raise ValueError(
            f"unknown init_name {init_name!r}; expected one of {known_inits}"
        )

    rng = np.random.default_rng(seed)
    W_list = []
    b_list = []

    # Consecutive pairs (fan-in, fan-out) describe one weight layer each.
    for m, n in zip(layers[:-1], layers[1:]):
        if init_name == "uniform_1_sqrt_m":
            W, b = init_uniform_1_over_sqrt_m(rng, m, n)
        else:
            W, b = init_uniform_sqrt6_over_m_plus_n(rng, m, n)

        W_list.append(W)
        b_list.append(b)

    return W_list, b_list


In [1]:
#Activation functions 
def relu(z):
    """Rectified linear unit: element-wise max(z, 0)."""
    floor = 0.0
    return np.maximum(z, floor)

def relu_grad(z):
    """Derivative of ReLU: 1.0 where z > 0, 0.0 elsewhere (including z == 0)."""
    is_positive = z > 0  # boolean mask of the active units
    return is_positive.astype(float)

def softmax(z):
    """Row-wise softmax, exp(z) / sum(exp(z)), for a 2-D array of scores.

    The row maximum is subtracted before exponentiating; this leaves the
    result unchanged mathematically but keeps exp() from overflowing.
    """
    shifted = z - z.max(axis=1, keepdims=True)
    exps = np.exp(shifted)
    row_totals = exps.sum(axis=1, keepdims=True)
    return exps / row_totals



In [None]:
#loss functions 
def cross_entropy_onehot(p, t):
    """Mean cross-entropy between probabilities ``p`` and one-hot targets ``t``.

    Both arguments are (N, K) arrays. A 1e-12 offset is added inside the log
    so that a probability of exactly zero does not produce -inf.
    """
    log_probs = np.log(p + 1e-12)
    per_sample = np.sum(t * log_probs, axis=1)  # one value per row
    return -np.mean(per_sample)

def l2_penalty(W_list):
    """Sum of squared entries over every weight matrix in ``W_list``.

    This is the sum over layers of the squared Frobenius norm ||W||_F^2.
    Returns 0.0 for an empty list.
    """
    squared_norms = (np.sum(W * W) for W in W_list)
    return sum(squared_norms, 0.0)

In [16]:
# Forward / Backward propagation

def forward_pass(X, W_list, b_list):
    """Run the network forward: ReLU on every hidden layer, softmax on the output.

    Parameters
    ----------
    X : input batch, one sample per row.
    W_list, b_list : per-layer weights and biases; the last entry of each is
        the output layer.

    Returns
    -------
    (P, caches) : ``P`` holds the softmax probabilities; ``caches[l]`` is the
        pair (input activations of layer l, pre-activations of layer l).
    """
    caches = []
    activation = X

    # Every layer except the last one is a hidden ReLU layer.
    hidden_layers = zip(W_list[:-1], b_list[:-1])
    for weights, bias in hidden_layers:
        pre_activation = np.dot(activation, weights) + bias
        caches.append((activation, pre_activation))
        activation = relu(pre_activation)

    # Output layer: affine map followed by softmax.
    logits = np.dot(activation, W_list[-1]) + b_list[-1]
    caches.append((activation, logits))

    return softmax(logits), caches


def backward_pass(P, T, W_list, b_list, caches, lam=0.0):
    """Compute the regularised loss and its gradients by backpropagation.

    Parameters
    ----------
    P : predicted probabilities from ``forward_pass``, one row per sample.
    T : one-hot targets with the same shape as ``P``.
    W_list : per-layer weight matrices.
    b_list : per-layer biases (not read here; kept for a symmetric signature).
    caches : per-layer (input activations, pre-activations) pairs as produced
        by ``forward_pass``.
    lam : L2 coefficient.

    Returns
    -------
    (loss, grads_W, grads_b) : ``loss`` is cross-entropy + lam * sum ||W||^2;
        ``grads_W[l]`` / ``grads_b[l]`` are lists whose entries have the same
        shapes as ``W_list[l]`` / ``b_list[l]``.
    """
    num_layers = len(W_list)
    batch = T.shape[0]

    loss = cross_entropy_onehot(P, T) + lam * l2_penalty(W_list)

    # One slot per layer. These must be Python lists: each entry is a whole
    # array, which cannot be stored in an element of a 1-D float ndarray.
    grads_W = [None] * num_layers
    grads_b = [None] * num_layers

    # Output layer: for softmax + cross-entropy, dJ/dZ = (P - T) / N.
    dZ = (P - T) / batch
    A_prev, _ = caches[-1]
    grads_W[-1] = np.dot(A_prev.T, dZ) + 2 * lam * W_list[-1]
    grads_b[-1] = dZ.sum(axis=0)
    dA = np.dot(dZ, W_list[-1].T)  # gradient w.r.t. the last hidden activations

    # Hidden layers, walked from the last one back to the first.
    for i in range(num_layers - 2, -1, -1):
        A_prev, Z = caches[i]
        dZ = dA * relu_grad(Z)
        grads_W[i] = np.dot(A_prev.T, dZ) + 2 * lam * W_list[i]
        grads_b[i] = dZ.sum(axis=0)
        if i > 0:
            # The gradient w.r.t. the network input is never used, so the
            # matmul for the first layer is skipped.
            dA = np.dot(dZ, W_list[i].T)

    return loss, grads_W, grads_b


In [17]:
# Loss evaluation and class prediction
# Evaluation uses a larger batch size (2048) than training: no gradients are computed here, so bigger batches are simply faster

def evaluate_loss(X, T, W_list, b_list, lam, eval_batch_size=2048):
    """Compute cross-entropy + L2 over a whole dataset, batch by batch.

    Parameters
    ----------
    X : inputs, one sample per row.
    T : one-hot targets aligned with ``X``.
    W_list, b_list : network parameters.
    lam : L2 coefficient.
    eval_batch_size : number of rows pushed through the network at once.

    Returns
    -------
    Mean cross-entropy over all rows plus ``lam`` times the sum of squared weights.

    Raises
    ------
    ValueError : if ``X`` has no rows (the mean would be undefined).
    """
    N = X.shape[0]
    if N == 0:
        raise ValueError("evaluate_loss needs at least one sample")

    total_ce = 0.0  # sum over batches of (batch mean CE * batch size)

    for start in range(0, N, eval_batch_size):
        stop = start + eval_batch_size
        xb = X[start:stop]
        tb = T[start:stop]
        # The caches are not needed here. Binding them to T (as before) would
        # overwrite the targets and break every batch after the first.
        P, _ = forward_pass(xb, W_list, b_list)
        total_ce += cross_entropy_onehot(P, tb) * xb.shape[0]

    ce_mean = total_ce / N  # the batch sizes add up to N
    return ce_mean + lam * l2_penalty(W_list)


def predict_classes(X, W_list, b_list, eval_batch_size=2048):
    """Return the predicted class index for every row of ``X``.

    Parameters
    ----------
    X : inputs, one sample per row.
    W_list, b_list : network parameters.
    eval_batch_size : number of rows pushed through the network at once.

    Returns
    -------
    1-D integer array of shape (N,), where N is the number of rows of ``X``;
    entry i is the index of the largest predicted probability for row i.
    An empty ``X`` gives an empty array.
    """
    N = X.shape[0]
    if N == 0:
        # np.concatenate refuses an empty list, so answer directly.
        return np.empty(0, dtype=np.intp)

    preds = []  # per-batch prediction arrays
    for start in range(0, N, eval_batch_size):
        xb = X[start:start + eval_batch_size]
        P, _ = forward_pass(xb, W_list, b_list)  # caches are not needed
        preds.append(np.argmax(P, axis=1))  # most probable class per row

    return np.concatenate(preds, axis=0)


In [18]:
# training one model 
def train_one_model(X_train, Y_train,X_val,Y_val,layers, init_name,seed=None, lr=0.01, lam=0.0018738, batch_size=256, max_epochs=30, early_stopping=10, seed_init=None):
    """Train one MLP with mini-batch gradient descent, L2 and early stopping.

    Parameters
    ----------
    X_train, Y_train : training inputs and one-hot targets.
    X_val, Y_val : validation inputs and one-hot targets.
    layers : layer widths ``[n_in, n_h1, ..., n_out]``.
    init_name : initialiser name understood by ``build_params``.
    seed : seed handed to ``build_params`` for the weight initialisation.
    lr : learning rate.
    lam : L2 coefficient.
    batch_size : training mini-batch size.
    max_epochs : maximum number of epochs.
    early_stopping : patience, i.e. the number of consecutive epochs without a
        validation improvement (of more than 1e-6) before training stops.
    seed_init : seed for the generator that shuffles the training indices.
        ``seed`` and ``seed_init`` fall back to each other, so passing either
        one is enough; at least one must be given.

    Returns
    -------
    (W_list, b_list, train_losses, val_losses) : the parameters with the best
        validation loss seen during training, and the per-epoch loss curves.

    Raises
    ------
    TypeError : if neither ``seed`` nor ``seed_init`` is supplied.
    """
    if seed is None and seed_init is None:
        raise TypeError("train_one_model() requires 'seed' (or its alias 'seed_init')")
    if seed is None:
        seed = seed_init
    if seed_init is None:
        seed_init = seed

    rng = np.random.default_rng(seed_init)  # drives the per-epoch shuffling
    N = X_train.shape[0]

    # init parameters
    W_list, b_list = build_params(layers, init_name, seed)

    best_val = np.inf
    best_W = None  # snapshot of the best parameters; None until one exists
    best_b = None
    wait = 0  # epochs since the last validation improvement

    train_losses = []
    val_losses = []

    for epoch in range(1, max_epochs + 1):

        # Visit the training rows in a fresh random order every epoch.
        idx = np.arange(N)
        rng.shuffle(idx)

        epoch_loss_sum = 0.0
        epoch_count = 0

        for start in range(0, N, batch_size):
            batch_idx = idx[start:start + batch_size]
            xb = X_train[batch_idx]
            tb = Y_train[batch_idx]

            P, caches = forward_pass(xb, W_list, b_list)
            loss_batch, grads_W, grads_b = backward_pass(P, tb, W_list, b_list, caches, lam=lam)

            # Weight each batch loss by its size (the last batch may be smaller).
            bs = xb.shape[0]
            epoch_loss_sum += loss_batch * bs
            epoch_count += bs

            # Gradient-descent step, applied in place.
            for i in range(len(W_list)):
                W_list[i] -= lr * grads_W[i]
                b_list[i] -= lr * grads_b[i]

        # ---- average train loss ----
        train_loss = epoch_loss_sum / epoch_count
        train_losses.append(train_loss)

        # ---- validation loss ----
        val_loss = evaluate_loss(X_val, Y_val, W_list, b_list, lam)
        val_losses.append(val_loss)

        print(f"Epoch {epoch:3d} | train: {train_loss:.4f} | val: {val_loss:.4f}")

        # ---- early stopping ----
        if val_loss + 1e-6 < best_val:
            best_val = val_loss
            # Copy: the live arrays keep being updated in place.
            best_W = [W.copy() for W in W_list]
            best_b = [b.copy() for b in b_list]
            wait = 0
        else:
            wait += 1
            if wait >= early_stopping:
                print("Early stopping triggered.")
                break

    # Hand back the best snapshot whether training stopped early or ran out of
    # epochs. If no epoch ever improved on the initial best (e.g. NaN losses),
    # the live parameters are returned unchanged.
    if best_W is not None:
        W_list, b_list = best_W, best_b

    return W_list, b_list, train_losses, val_losses


In [None]:
def main():
    """Train Model A and Model B, report their error rates and plot their losses.

    The two models differ in the weight initialiser and in the seed
    (``seed`` for A, ``seed + 1`` for B); everything else comes from the
    module-level hyperparameters and data arrays.

    Returns a dict keyed by "A" / "B"; each entry holds the trained "W" and
    "b", the "train_losses" / "val_losses" curves, and the "train_err" /
    "test_err" misclassification percentages.
    """

    def fit_and_score(banner, init_name, seed_init):
        # Train one model, then measure its misclassification rate (in %)
        # on the training and test sets.
        print(banner)
        W, b, train_hist, val_hist = train_one_model(
            X_train, Y_train,
            X_validation, Y_validation,
            layers,
            init_name=init_name,
            seed_init=seed_init,
            lr=lr,
            lam=lam,
            batch_size=batch_size,
            max_epochs=max_epochs,
            early_stopping=early_stopping,
        )

        train_pred = predict_classes(X_train, W, b)
        test_pred = predict_classes(X_test, W, b)

        train_err = 100 * (1.0 - np.mean(train_pred == Y_train_orig))
        test_err = 100 * (1.0 - np.mean(test_pred == Y_test_orig))

        return {
            "W": W, "b": b,
            "train_losses": train_hist,
            "val_losses": val_hist,
            "train_err": train_err,
            "test_err": test_err,
        }

    results = {}
    results["A"] = fit_and_score(
        "Training Model A (Unif(-1/sqrt(m), 1/sqrt(m)))...",
        "uniform_1_sqrt_m",
        seed,
    )
    results["B"] = fit_and_score(
        "\nTraining Model B (Unif(-sqrt(6/(m+n)), sqrt(6/(m+n))))...",
        "uniform_sqrt6_m_plus_n",
        seed + 1,
    )

    model_tags = ("A", "B")

    # Misclassification summary, one line per model.
    print("\nMisclassification errors:")
    for tag in model_tags:
        entry = results[tag]
        print(f"Model {tag} – train: {entry['train_err']:6.2f}% | test: {entry['test_err']:6.2f}%")

    # Loss curves (CE + L2) of both models on one figure; validation is dashed.
    plt.figure(figsize=(8, 6))
    for tag in model_tags:
        entry = results[tag]
        epochs = range(1, len(entry["train_losses"]) + 1)
        plt.plot(epochs, entry["train_losses"], label=f"Train loss (Model {tag})")
        plt.plot(epochs, entry["val_losses"], label=f"Val loss (Model {tag})", linestyle="--")

    plt.xlabel("Epoch")
    plt.ylabel("Cross-entropy + L2")
    plt.title("Training / validation loss for Model A & B")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return results

# Run both experiments; `results` maps "A" / "B" to each model's weights,
# loss curves and misclassification errors as returned by main()
results = main()


Training Model A (Unif(-1/sqrt(m), 1/sqrt(m)))...
