In [1]:
from tensorflow.keras.datasets import mnist
import numpy as np
# Load MNIST as (images, labels) pairs for the training split and the test split.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [2]:
# Scale pixel intensities from the raw integer range into [0, 1].
# The division is guarded on integer dtype so the cell is idempotent: after the
# first run the arrays are floating point, and re-running the cell no longer
# divides by 255 a second time (which would silently shrink every pixel).
if np.issubdtype(X_train.dtype, np.integer):
    X_train = X_train / 255.0
if np.issubdtype(X_test.dtype, np.integer):
    X_test = X_test / 255.0

# Flatten each 28x28 image into one 784-element row. Reshaping an array that is
# already (n, 784) is a no-op, so this part is safe to re-run as well.
X_train = X_train.reshape(-1, 28*28)
X_test = X_test.reshape(-1, 28*28)

In [3]:
def one_hot_encode(y, num_classes=10):
    """Convert integer class labels into one-hot rows.

    Args:
        y: Integer label(s) used to index rows of a ``num_classes`` x
            ``num_classes`` identity matrix.
        num_classes: Number of distinct classes (width of each one-hot row).

    Returns:
        Float array with one trailing axis of length ``num_classes`` appended
        to the shape of ``y``; each row has a single 1.0 at the label's index.
    """
    # Row k of the identity matrix is exactly the one-hot vector for class k,
    # so selecting rows by label performs the encoding.
    class_basis = np.eye(num_classes)
    encoded = class_basis[y]
    return encoded

# Encode the integer labels as one-hot rows.
# Guarded on dimensionality so the cell can be re-run safely: once the labels
# are 2-D one-hot matrices, encoding them again would try to index with a
# float array and raise an IndexError.
if y_train.ndim == 1:
    y_train = one_hot_encode(y_train)
if y_test.ndim == 1:
    y_test = one_hot_encode(y_test)

In [4]:
def initialize_parameters(input_size, hidden_size, output_size):
    """Create the weights and biases of a network with one hidden layer.

    Weights are drawn from a standard normal distribution and scaled by
    ``sqrt(2 / fan_in)`` (He initialisation); biases start at zero. The draws
    come from NumPy's global random state, so ``np.random.seed`` controls them.

    Args:
        input_size: Number of input features.
        hidden_size: Number of hidden units.
        output_size: Number of output units.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` with shapes ``(input_size, hidden_size)``,
        ``(1, hidden_size)``, ``(hidden_size, output_size)`` and
        ``(1, output_size)``.
    """
    # W1 is drawn before W2; keeping that order keeps seeded runs identical.
    W1 = np.random.standard_normal((input_size, hidden_size)) * np.sqrt(2.0 / input_size)
    W2 = np.random.standard_normal((hidden_size, output_size)) * np.sqrt(2.0 / hidden_size)

    b1 = np.zeros((1, hidden_size))
    b2 = np.zeros((1, output_size))

    return W1, b1, W2, b2

In [5]:
def relu(Z):
    """Apply the rectified linear unit element-wise.

    Args:
        Z: Array-like of pre-activation values.

    Returns:
        Array of the same shape in which every negative entry is replaced by 0
        and every other entry is passed through.
    """
    rectified = np.maximum(0, Z)
    return rectified

In [6]:
def softmax(Z):
    """Compute a row-wise softmax.

    Args:
        Z: 2-D array-like of scores; the softmax is taken along axis 1.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtracting each row's maximum leaves the result unchanged mathematically
    # but keeps np.exp from overflowing on large scores.
    row_peak = np.max(Z, axis=1, keepdims=True)
    exponentials = np.exp(Z - row_peak)
    row_totals = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / row_totals

In [7]:
def forward_propagation(X, W1, b1, W2, b2):
    """Run one forward pass through the two-layer network.

    Args:
        X: Batch of inputs, one sample per row.
        W1, b1: Hidden-layer weights and bias.
        W2, b2: Output-layer weights and bias.

    Returns:
        Tuple ``(Z1, A1, Z2, A2)``: hidden pre-activation, hidden ReLU
        activation, output pre-activation (logits) and softmax output.
    """
    # Hidden layer: affine map followed by ReLU.
    hidden_pre = np.dot(X, W1) + b1
    hidden_act = relu(hidden_pre)

    # Output layer: affine map followed by a row-wise softmax.
    logits = np.dot(hidden_act, W2) + b2
    probabilities = softmax(logits)

    return hidden_pre, hidden_act, logits, probabilities

In [8]:
def backward_propagation(X, y, Z1, A1, Z2, A2, W1, W2, lambda_reg=0.01):
    """Compute batch-averaged parameter gradients for the two-layer network.

    The output error is taken as ``A2 - y``, which is the logit gradient when
    ``A2`` is a softmax output trained with cross-entropy against one-hot ``y``
    (assumed here; the loss itself is not computed in this function).

    Args:
        X: Input batch, one sample per row.
        y: Targets with the same shape as ``A2``.
        Z1: Hidden pre-activations (used for the ReLU derivative).
        A1: Hidden activations.
        Z2: Output pre-activations; accepted for symmetry but not used.
        A2: Network outputs.
        W1, W2: Current weights, used for backpropagation and the L2 term.
        lambda_reg: L2 regularisation strength; ``lambda_reg * W`` is added to
            each weight gradient. Biases are not regularised.

    Returns:
        Tuple ``(dW1, db1, dW2, db2)``.
    """
    batch_count = X.shape[0]

    # Output layer.
    output_error = A2 - y
    grad_b2 = np.sum(output_error, axis=0, keepdims=True) / batch_count
    grad_W2 = np.dot(A1.T, output_error) / batch_count + lambda_reg * W2

    # Hidden layer: push the error back through W2, then gate it by the ReLU
    # derivative (1 where the unit was active, 0 elsewhere).
    relu_active = Z1 > 0
    hidden_error = np.dot(output_error, W2.T) * relu_active
    grad_b1 = np.sum(hidden_error, axis=0, keepdims=True) / batch_count
    grad_W1 = np.dot(X.T, hidden_error) / batch_count + lambda_reg * W1

    return grad_W1, grad_b1, grad_W2, grad_b2

In [9]:
def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    """Take one gradient-descent step on all four parameters.

    Each parameter is updated with an augmented assignment, so NumPy arrays
    passed in are modified in place and the same objects are returned.

    Args:
        W1, b1, W2, b2: Current parameters.
        dW1, db1, dW2, db2: Matching gradients.
        learning_rate: Step size multiplying every gradient.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` after the step.
    """
    # Scale every gradient by the step size first ...
    step_W1 = learning_rate * dW1
    step_b1 = learning_rate * db1
    step_W2 = learning_rate * dW2
    step_b2 = learning_rate * db2

    # ... then move each parameter against its gradient.
    W1 -= step_W1
    b1 -= step_b1
    W2 -= step_W2
    b2 -= step_b2

    return W1, b1, W2, b2

In [10]:
# Network shape: flattened 28x28 inputs, one hidden layer, ten output classes.
input_size, hidden_size, output_size = 28 * 28, 256, 10

# Optimisation settings for the gradient-descent training loop.
learning_rate, epochs = 0.06, 150

In [12]:
def create_mini_batches(X, y, batch_size=128, shuffle=True):
    """Yield aligned ``(X_batch, y_batch)`` pairs covering the whole dataset.

    Because this is a generator, argument validation runs when iteration
    starts, not when the function is called.

    Args:
        X: Array of samples, indexed along axis 0.
        y: Array of targets with the same length as ``X`` along axis 0.
        batch_size: Maximum number of samples per batch; the final batch may
            be smaller.
        shuffle: When True (default) samples are visited in a random order
            drawn from NumPy's global random state. When False they are
            visited in their stored order and the random state is untouched.

    Yields:
        Tuples ``(X_batch, y_batch)`` whose rows correspond to each other.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``X`` and ``y`` have
            different lengths.
    """
    n_samples = X.shape[0]
    if batch_size < 1:
        # A negative step would make range() empty and silently yield nothing.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, got {n_samples} and {y.shape[0]}"
        )

    if not shuffle:
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            yield X[start:stop], y[start:stop]
        return

    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    for start in range(0, n_samples, batch_size):
        # Index one batch at a time instead of materialising a shuffled copy
        # of the full dataset; the batches produced are the same.
        batch_indices = indices[start:start + batch_size]
        yield X[batch_indices], y[batch_indices]

In [14]:
def compute_accuracy(X, y, W1, b1, W2, b2, batch_size=128):
    """Return the fraction of samples whose predicted class matches the label.

    The data is processed in sequential slices of ``batch_size`` rows.
    Accuracy does not depend on sample order, so no shuffling is done: this
    avoids copying the whole dataset and leaves NumPy's global random state
    untouched, so evaluating never perturbs a seeded training run.

    Args:
        X: Input samples, one per row.
        y: One-hot targets, one row per sample (the class is the argmax).
        W1, b1, W2, b2: Network parameters passed to ``forward_propagation``.
        batch_size: Number of rows evaluated per forward pass.

    Returns:
        Number of correct predictions divided by the number of samples.

    Raises:
        ValueError: If ``X`` and ``y`` have different numbers of rows.
        ZeroDivisionError: If no samples are evaluated (e.g. ``X`` is empty).
    """
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"X and y must have the same number of samples, got {X.shape[0]} and {y.shape[0]}"
        )

    correct = 0
    total = 0
    for start in range(0, X.shape[0], batch_size):
        X_batch = X[start:start + batch_size]
        y_batch = y[start:start + batch_size]
        _, _, _, A2 = forward_propagation(X_batch, W1, b1, W2, b2)
        predictions = np.argmax(A2, axis=1)
        labels = np.argmax(y_batch, axis=1)
        correct += np.sum(predictions == labels)
        total += y_batch.shape[0]
    return correct / total

In [16]:
# Seed NumPy's global random state before anything stochastic happens. Both the
# weight initialisation and the per-epoch mini-batch shuffling draw from it, so
# a fixed seed makes the whole training run repeatable on Restart & Run All.
seed = 42
np.random.seed(seed)

W1, b1, W2, b2 = initialize_parameters(input_size, hidden_size, output_size)
batch_size = 128
for epoch in range(epochs):
    # One epoch: a full pass over the training set in shuffled mini-batches,
    # taking one gradient-descent step per batch.
    for X_batch, y_batch in create_mini_batches(X_train, y_train, batch_size):
        Z1, A1, Z2, A2 = forward_propagation(X_batch, W1, b1, W2, b2)
        dW1, db1, dW2, db2 = backward_propagation(X_batch, y_batch, Z1, A1, Z2, A2, W1, W2)
        W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)

    # Report training accuracy every tenth epoch to keep the log short.
    if epoch % 10 == 0:
        train_accuracy = compute_accuracy(X_train, y_train, W1, b1, W2, b2, batch_size)
        print(f"Epoch {epoch}, Train Accuracy: {train_accuracy:.4f}")

Epoch 0, Train Accuracy: 0.9024
Epoch 10, Train Accuracy: 0.9339
Epoch 20, Train Accuracy: 0.9375
Epoch 30, Train Accuracy: 0.9372
Epoch 40, Train Accuracy: 0.9373
Epoch 50, Train Accuracy: 0.9400
Epoch 60, Train Accuracy: 0.9402
Epoch 70, Train Accuracy: 0.9408
Epoch 80, Train Accuracy: 0.9401
Epoch 90, Train Accuracy: 0.9412
Epoch 100, Train Accuracy: 0.9408
Epoch 110, Train Accuracy: 0.9404
Epoch 120, Train Accuracy: 0.9415
Epoch 130, Train Accuracy: 0.9404
Epoch 140, Train Accuracy: 0.9408


In [17]:
# Score the trained parameters on the test split, which was not used for training.
test_accuracy = compute_accuracy(X_test, y_test, W1, b1, W2, b2, batch_size)
print(f"\nFinal Test Accuracy: {test_accuracy:.4f}")


Final Test Accuracy: 0.9418
