In [9]:
import numpy as np
from tensorflow.keras.datasets import mnist

# Fetch the MNIST training and test splits (images plus integer labels).
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every image to a 784-long row (28 * 28) and divide by 255 so the
# features are normalized to [0, 1].
X_train, X_test = (
    images.reshape(-1, 784) / 255.0 for images in (X_train, X_test)
)

# One-hot encode the labels: row k of the 10x10 identity matrix is the
# one-hot vector for class k, so indexing it with the labels encodes them.
y_train_one_hot, y_test_one_hot = (
    np.eye(10)[labels] for labels in (y_train, y_test)
)

In [6]:
def initialize_parameters(input_size, hidden1_size, hidden2_size, output_size, seed=0):
    """Create the weights and biases of a three-layer fully connected network.

    Weights are drawn from a standard normal distribution and scaled by 0.01;
    biases start at zero.

    The random draws come from a private ``np.random.RandomState(seed)``
    instead of re-seeding NumPy's global generator, so calling this function
    no longer resets the random state used by the rest of the notebook. With
    the default ``seed=0`` the generated values are the same as those obtained
    by calling ``np.random.seed(0)`` followed by ``np.random.randn``.

    Args:
        input_size: Number of input features (rows of ``W1``).
        hidden1_size: Width of the first hidden layer.
        hidden2_size: Width of the second hidden layer.
        output_size: Number of output units (columns of ``W3``).
        seed: Seed for the private generator. ``None`` lets NumPy pick a
            non-deterministic seed.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)`` where each ``W`` has shape
        ``(fan_in, fan_out)`` and each ``b`` has shape ``(1, fan_out)``.
    """
    # Local legacy generator: same stream as the seeded global one, but no
    # side effect on np.random's module-level state.
    rng = np.random.RandomState(seed)

    W1 = rng.randn(input_size, hidden1_size) * 0.01
    b1 = np.zeros((1, hidden1_size))
    W2 = rng.randn(hidden1_size, hidden2_size) * 0.01
    b2 = np.zeros((1, hidden2_size))
    W3 = rng.randn(hidden2_size, output_size) * 0.01
    b3 = np.zeros((1, output_size))
    return W1, b1, W2, b2, W3, b3

In [3]:
def relu(Z):
    """Apply the rectified linear unit element-wise.

    Args:
        Z: Array of pre-activation values.

    Returns:
        Array of the same shape as ``Z`` holding ``max(0, z)`` per element.
    """
    rectified = np.maximum(0, Z)
    return rectified

def softmax(Z):
    """Compute a softmax over axis 1 (one distribution per row).

    The per-row maximum is subtracted before exponentiating so that large
    inputs do not overflow ``np.exp``; the shift cancels in the ratio.

    Args:
        Z: 2-D array of scores, one row per sample.

    Returns:
        Array of the same shape as ``Z`` whose rows sum to 1.
    """
    row_max = np.max(Z, axis=1, keepdims=True)
    exponentials = np.exp(Z - row_max)
    normalizer = exponentials.sum(axis=1, keepdims=True)
    return exponentials / normalizer

def forward_propagation(X, W1, b1, W2, b2, W3, b3):
    """Run one forward pass through the three-layer network.

    The two hidden layers use ReLU and the output layer uses softmax.

    Args:
        X: Input batch, one sample per row.
        W1, b1: Weights and bias of the first hidden layer.
        W2, b2: Weights and bias of the second hidden layer.
        W3, b3: Weights and bias of the output layer.

    Returns:
        Tuple ``(Z1, A1, Z2, A2, Z3, A3)``: the pre-activation and activation
        of each layer in order; ``A3`` is the softmax output.
    """
    # First hidden layer: affine transform followed by ReLU.
    pre1 = np.dot(X, W1) + b1
    act1 = relu(pre1)

    # Second hidden layer: affine transform followed by ReLU.
    pre2 = np.dot(act1, W2) + b2
    act2 = relu(pre2)

    # Output layer: affine transform followed by softmax.
    logits = np.dot(act2, W3) + b3
    probabilities = softmax(logits)

    return pre1, act1, pre2, act2, logits, probabilities

In [5]:
def compute_loss(Y, A3, eps=1e-12):
    """Mean cross-entropy between one-hot targets and predicted probabilities.

    For every sample the probability assigned to its true class (the argmax
    of the one-hot row) is selected and its negative log is averaged over the
    batch. The selected probabilities are floored at ``eps`` before the log so
    that a probability that underflowed to exactly 0 produces a large finite
    loss instead of ``inf``.

    Args:
        Y: One-hot target matrix, one row per sample.
        A3: Predicted class probabilities, same shape as ``Y``.
        eps: Lower bound applied to the selected probabilities before taking
            the logarithm.

    Returns:
        The average negative log-likelihood over the batch.
    """
    m = Y.shape[0]
    true_class = Y.argmax(axis=1)
    picked = A3[np.arange(m), true_class]
    # Floor at eps: log(0) would otherwise yield inf and poison the loss.
    log_likelihood = -np.log(np.maximum(picked, eps))
    loss = np.sum(log_likelihood) / m
    return loss

In [6]:
def relu_derivative(Z):
    """Return the element-wise ReLU gradient mask for ``Z``.

    Args:
        Z: Array of pre-activation values.

    Returns:
        Boolean array of the same shape, ``True`` where ``Z`` is strictly
        positive and ``False`` elsewhere (including at exactly 0).
    """
    is_active = Z > 0
    return is_active

def backpropagation(X, Y, Z1, A1, Z2, A2, Z3, A3, W1, W2, W3):
    """Compute batch-averaged gradients for all weights and biases.

    The output-layer error is ``A3 - Y``; it is propagated backwards through
    ``W3`` and ``W2``, masking with the ReLU derivative of ``Z2`` and ``Z1``.
    ``Z3`` and ``W1`` are accepted for a uniform signature but are not used.

    Args:
        X: Input batch, one sample per row.
        Y: One-hot targets, one row per sample.
        Z1, A1, Z2, A2, Z3, A3: Values returned by ``forward_propagation``.
        W1, W2, W3: Current weight matrices.

    Returns:
        Tuple ``(dW1, db1, dW2, db2, dW3, db3)`` of gradients, each divided by
        the number of rows in ``Y``.
    """
    m = Y.shape[0]

    def layer_gradients(layer_input, delta):
        # Weight and bias gradients of one layer, averaged over the batch.
        weight_grad = np.dot(layer_input.T, delta) / m
        bias_grad = np.sum(delta, axis=0, keepdims=True) / m
        return weight_grad, bias_grad

    # Output layer.
    delta3 = A3 - Y
    dW3, db3 = layer_gradients(A2, delta3)

    # Second hidden layer.
    delta2 = np.dot(delta3, W3.T) * relu_derivative(Z2)
    dW2, db2 = layer_gradients(A1, delta2)

    # First hidden layer.
    delta1 = np.dot(delta2, W2.T) * relu_derivative(Z1)
    dW1, db1 = layer_gradients(X, delta1)

    return dW1, db1, dW2, db2, dW3, db3

def update_parameters(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, learning_rate):
    """Apply one gradient-descent step to every parameter.

    Each parameter is decreased by ``learning_rate`` times its gradient using
    augmented assignment, so array arguments are modified in place and the
    returned objects are the same arrays that were passed in.

    Args:
        W1, b1, W2, b2, W3, b3: Current parameters.
        dW1, db1, dW2, db2, dW3, db3: Matching gradients.
        learning_rate: Step size.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)`` after the update.
    """
    parameters = (W1, b1, W2, b2, W3, b3)
    gradients = (dW1, db1, dW2, db2, dW3, db3)

    stepped = []
    for value, gradient in zip(parameters, gradients):
        value -= learning_rate * gradient
        stepped.append(value)
    return tuple(stepped)

In [13]:
def train(X_train, y_train, X_test, y_test, hidden1_size=60, hidden2_size=30, epochs=100, learning_rate=0.01):
    """Train the three-layer network with full-batch gradient descent.

    Every epoch runs a forward pass over all of ``X_train``, computes the
    loss, backpropagates and updates the parameters. The loss is printed on
    every tenth epoch (starting with epoch 0) and the test accuracy is printed
    once training has finished. Nothing is returned.

    Args:
        X_train: Training inputs, one sample per row.
        y_train: One-hot training targets.
        X_test: Test inputs, one sample per row.
        y_test: One-hot test targets (compared through their row-wise argmax).
        hidden1_size: Width of the first hidden layer.
        hidden2_size: Width of the second hidden layer.
        epochs: Number of passes over the training set.
        learning_rate: Gradient-descent step size.
    """
    n_features = X_train.shape[1]
    n_classes = 10  # MNIST has ten digit classes

    # Parameters are kept as one tuple ordered (W1, b1, W2, b2, W3, b3).
    params = initialize_parameters(n_features, hidden1_size, hidden2_size, n_classes)

    for epoch in range(epochs):
        # Forward pass; the last cached entry is the softmax output.
        cache = forward_propagation(X_train, *params)
        loss = compute_loss(y_train, cache[-1])

        # Every other entry of params, starting at 0, is a weight matrix.
        grads = backpropagation(X_train, y_train, *cache, *params[0::2])
        params = update_parameters(*params, *grads, learning_rate)

        if epoch % 10 == 0:
            # Report the loss measured before this epoch's update.
            print(f"Epoch {epoch}, Loss: {loss:.4f}")

    # Evaluate on the held-out set using the final parameters.
    A3_test = forward_propagation(X_test, *params)[-1]
    predictions = np.argmax(A3_test, axis=1)
    accuracy = np.mean(predictions == np.argmax(y_test, axis=1))
    print(f"Test Accuracy: {accuracy:.4f}")

In [15]:
# Train for 200 epochs with a step size of 0.5, then report test accuracy.
train(
    X_train,
    y_train_one_hot,
    X_test,
    y_test_one_hot,
    epochs=200,
    learning_rate=0.5,
)

Epoch 0, Loss: 2.3026
Epoch 10, Loss: 2.3016
Epoch 20, Loss: 2.3012
Epoch 30, Loss: 2.3010
Epoch 40, Loss: 2.3008
Epoch 50, Loss: 2.3006
Epoch 60, Loss: 2.3001
Epoch 70, Loss: 2.2991
Epoch 80, Loss: 2.2963
Epoch 90, Loss: 2.2834
Epoch 100, Loss: 2.1496
Epoch 110, Loss: 1.8163
Epoch 120, Loss: 2.0517
Epoch 130, Loss: 1.7551
Epoch 140, Loss: 1.6423
Epoch 150, Loss: 1.6740
Epoch 160, Loss: 1.2708
Epoch 170, Loss: 1.0153
Epoch 180, Loss: 1.0895
Epoch 190, Loss: 0.7927
Test Accuracy: 0.7712
