In [19]:
from tensorflow.keras.datasets import mnist
import numpy as np

In [3]:
# Load MNIST through the Keras dataset helper.
# NOTE: despite its name, `df` is not a DataFrame -- it is the nested tuple that
# is later unpacked as (X_train, y_train), (X_test, y_test).
df = mnist.load_data()

In [5]:
# Split the raw tuple into the train / test image and label arrays.
(X_train, y_train), (X_test, y_test) = df

# Report the shape of every split, one line per array.
for split_name, split_array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split_array.shape}")

X_train shape: (60000, 28, 28)
y_train shape: (60000,)
X_test shape: (10000, 28, 28)
y_test shape: (10000,)


In [6]:
# Normalize pixels by dividing by 255.
# The scaled arrays are derived from the untouched raw tuple `df` instead of
# from X_train / X_test themselves, so this cell is idempotent: re-running it
# cannot divide the images by 255 a second time.
X_train = df[0][0] / 255.0
X_test = df[1][0] / 255.0

# One-Hot Encode Labels

In [8]:
def one_hot(y, num_classes=10):
    """Convert a 1-D sequence of integer class labels to a one-hot matrix.

    Parameters
    ----------
    y : array-like of int
        Class labels; each must lie in ``[0, num_classes)``.
    num_classes : int, default 10
        Number of columns in the encoded matrix.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(y), num_classes)`` holding a single 1 per
        row, in the column given by that row's label.

    Raises
    ------
    IndexError
        If any label is negative or ``>= num_classes``. Negative labels are
        rejected explicitly because NumPy indexing would otherwise wrap them
        around silently and mark the wrong class.
    """
    labels = np.asarray(y)

    # An empty label list has nothing to encode (and np.asarray([]) is float,
    # which cannot be used as an index), so return the empty matrix directly.
    if labels.size == 0:
        return np.zeros((0, num_classes))

    if labels.min() < 0 or labels.max() >= num_classes:
        raise IndexError(
            f"labels must be in [0, {num_classes}); "
            f"got min={labels.min()}, max={labels.max()}"
        )

    out = np.zeros((len(labels), num_classes))
    # Fancy indexing: for row r set column labels[r] to 1.
    out[np.arange(len(labels)), labels] = 1
    return out

# One-hot encode the label vectors of both splits.
y_train_oh, y_test_oh = (one_hot(split_labels) for split_labels in (y_train, y_test))

# Initialize Network Parameters

In [9]:
# Layer sizes: flattened input, hidden layer, output layer.
input_size = 784
hidden_size = 128
output_size = 10

# Seed a dedicated generator so the initial weights -- and therefore every
# number printed further down -- are reproducible across kernel restarts.
SEED = 42
rng = np.random.default_rng(SEED)

# Weights are small Gaussian draws (scaled by 0.01); biases start at zero.
W1 = rng.standard_normal((input_size, hidden_size)) * 0.01
b1 = np.zeros((1, hidden_size))

W2 = rng.standard_normal((hidden_size, output_size)) * 0.01
b2 = np.zeros((1, output_size))

# Activation Functions

In [10]:
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

def relu_derivative(x):
    """Element-wise derivative of ReLU: 1.0 where ``x > 0``, else 0.0.

    Parameters
    ----------
    x : array-like or scalar
        Pre-activation values. Coerced with ``np.asarray`` so that lists and
        plain Python numbers are accepted, matching what ``relu`` accepts.

    Returns
    -------
    numpy.ndarray or numpy.float64
        Float mask with the same shape as ``x``.
    """
    return (np.asarray(x) > 0).astype(float)

def softmax(x):
    """Row-wise softmax of a 2-D array.

    Each row has its maximum subtracted before exponentiation so that large
    inputs do not overflow; the result is then normalized along axis 1.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    unnormalized = np.exp(x - row_max)
    row_total = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / row_total

# Forward Propagation

In [11]:
def forward(X):
    """Run one forward pass through the two-layer network.

    Reads the module-level parameters ``W1``, ``b1``, ``W2`` and ``b2``.

    Parameters
    ----------
    X : numpy.ndarray
        Input batch; it is matrix-multiplied with ``W1``, so its second
        dimension must match ``W1``'s first.

    Returns
    -------
    tuple
        ``(Z1, A1, Z2, A2)``: hidden pre-activation, hidden ReLU activation,
        output pre-activation, and softmax output, in that order.
    """
    hidden_pre = X @ W1 + b1
    hidden_act = relu(hidden_pre)

    output_pre = hidden_act @ W2 + b2
    output_probs = softmax(output_pre)

    return hidden_pre, hidden_act, output_pre, output_probs

# Loss Function (Cross-Entropy)

In [12]:
def cross_entropy(y_true, y_pred, eps=1e-9):
    """Mean cross-entropy between target rows and predicted rows.

    Parameters
    ----------
    y_true : numpy.ndarray
        Target matrix (one row per sample), e.g. one-hot labels.
    y_pred : numpy.ndarray
        Predicted values with the same shape as ``y_true``.
    eps : float, default 1e-9
        Small constant added to ``y_pred`` before the logarithm so that a
        predicted value of exactly 0 does not produce ``log(0)``. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    float
        ``-sum(y_true * log(y_pred + eps))`` divided by the number of rows in
        ``y_true``.
    """
    m = y_true.shape[0]
    return -np.sum(y_true * np.log(y_pred + eps)) / m

# Backpropagation

In [13]:
def backward(X, y_true, Z1, A1, A2, lr=0.1):
    """Compute gradients for one batch and apply a gradient-descent step.

    Updates the module-level parameters ``W1``, ``b1``, ``W2`` and ``b2``
    in place; nothing is returned.

    Parameters
    ----------
    X : numpy.ndarray
        Input batch that was fed to ``forward``.
    y_true : numpy.ndarray
        Target rows for the batch, same shape as ``A2``.
    Z1, A1, A2 : numpy.ndarray
        Hidden pre-activation, hidden activation and network output for the
        batch, as returned by ``forward``.
    lr : float, default 0.1
        Learning rate (step size) of the update.
    """
    global W1, b1, W2, b2

    n_samples = X.shape[0]

    # Output layer: the error signal is prediction minus target.
    grad_out = A2 - y_true
    grad_W2 = (A1.T @ grad_out) / n_samples
    grad_b2 = np.sum(grad_out, axis=0, keepdims=True) / n_samples

    # Hidden layer: push the error back through W2 (its value BEFORE this
    # step's update) and gate it with the ReLU derivative.
    grad_hidden = (grad_out @ W2.T) * relu_derivative(Z1)
    grad_W1 = (X.T @ grad_hidden) / n_samples
    grad_b1 = np.sum(grad_hidden, axis=0, keepdims=True) / n_samples

    # All gradients are final at this point, so the update order is irrelevant.
    W1 -= lr * grad_W1
    b1 -= lr * grad_b1
    W2 -= lr * grad_W2
    b2 -= lr * grad_b2

# Training Loop

In [16]:
epochs = 10
batch_size = 64
# Number of leading training examples used for the per-epoch loss report.
loss_sample_size = 1000

# Flatten every image from (28, 28) to (784,) once, up front, instead of
# reshaping each mini-batch (and the loss subset) again on every iteration.
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_train_flat_for_loss = X_train_flat[:loss_sample_size]
y_train_oh_for_loss = y_train_oh[:loss_sample_size]

for epoch in range(epochs):
    # One pass over the training set in consecutive mini-batches; the final
    # batch may be smaller than batch_size.
    for i in range(0, len(X_train_flat), batch_size):
        X_batch_flat = X_train_flat[i:i+batch_size]
        y_batch = y_train_oh[i:i+batch_size]

        Z1, A1, Z2, A2 = forward(X_batch_flat)
        backward(X_batch_flat, y_batch, Z1, A1, A2)

    # Monitor progress with the loss on a fixed subset of the training data.
    _, _, _, train_pred = forward(X_train_flat_for_loss)
    loss = cross_entropy(y_train_oh_for_loss, train_pred)

    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

Epoch 1, Loss: 0.3136
Epoch 2, Loss: 0.2234
Epoch 3, Loss: 0.1722
Epoch 4, Loss: 0.1416
Epoch 5, Loss: 0.1213
Epoch 6, Loss: 0.1081
Epoch 7, Loss: 0.0974
Epoch 8, Loss: 0.0885
Epoch 9, Loss: 0.0818
Epoch 10, Loss: 0.0770


# Accuracy Check

In [18]:
# Flatten the test images so each one is a single row, then take the network
# output (the last element returned by forward) for the whole test set.
X_test_flat = X_test.reshape(len(X_test), -1)
test_pred = forward(X_test_flat)[-1]

# Predicted class = column with the highest output; accuracy = fraction of
# predictions that equal the integer test labels.
pred_labels = test_pred.argmax(axis=1)
accuracy = (pred_labels == y_test).mean()

print("Test Accuracy:", accuracy)

Test Accuracy: 0.9748
