In [734]:
import numpy as np
from torchvision.datasets import MNIST

def download_mnist(is_train: bool):
    dataset = MNIST(root='./data',
                    transform=lambda x: np.array(x).flatten(),
                    download=True,
                    train=is_train)
    mnist_data = []
    mnist_labels = []
    for image, label in dataset:
        mnist_data.append(image)
        mnist_labels.append(label)
    return mnist_data, mnist_labels
train_X, train_Y = download_mnist(True)
test_X, test_Y = download_mnist(False)

In [735]:
# We have numbers and we have them labeled
# We normalize the data by applying a Min-Max Normalization

normalized_data_train = (train_X - np.mean(train_X)) / np.std(train_X)
normalized_data_test = (test_X - np.mean(test_X)) / np.std(test_X)

In [736]:
# One-Hot Encoding
# there are 10 labels "0 -> 9"

numberOfClasses = 10

one_hot_encoded_train = np.zeros((len(train_Y), numberOfClasses))
one_hot_encoded_train[np.arange(len(train_Y)), train_Y] = 1

one_hot_encoded_test = np.zeros((len(test_Y), numberOfClasses))
one_hot_encoded_test[np.arange(len(test_Y)), test_Y] = 1

# 5
print(one_hot_encoded_test[0])

[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]


In [737]:
def split_into_batches(data, batch_size):
    num_batches = data.shape[0] // batch_size 
    batches = np.array_split(data, num_batches)
    
    return batches

batchSize = 100

print(normalized_data_train[0].shape)
print(one_hot_encoded_train[0].shape)

normalized_data_train = split_into_batches(normalized_data_train, batchSize)
one_hot_encoded_train = split_into_batches(one_hot_encoded_train, batchSize)

print(normalized_data_train[0].shape)
print(one_hot_encoded_train[0].shape)

(784,)
(10,)
(100, 784)
(100, 10)


In [738]:
# Initialize weights with He initialization
W1 = np.random.randn(784, 100) * np.sqrt(2 / 784)
W2 = np.random.randn(100, 10) * np.sqrt(2 / 100)

# Initialize biases
b1 = np.zeros((100,))
b2 = np.zeros((10,))

In [739]:
# for hidden layer
def relu(x, alpha=0.01):
    return np.where(x > 0, x, alpha * x)

def relu_derivative(z, alpha=0.01):
    return np.where(z > 0, 1, alpha)

# for output layer
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

In [740]:
# Forward propagation
def forwardPropagation(data, weightsL1, biasL1, weightsL2, biasL2):
    hidden = np.dot(data, weightsL1) + biasL1
    reluz_hidden = relu(hidden)
    
    output = np.dot(reluz_hidden, weightsL2) + biasL2
    return softmax(output), reluz_hidden

In [741]:
def cross_entropy_loss(one_hot_labels, predictions):
    predictions = np.clip(predictions, 1e-10, 1.0)
    return -np.mean(np.sum(one_hot_labels * np.log(predictions), axis=1))

In [742]:
def l2_regularization(weights, lambda_val):
    return lambda_val * np.sum(weights ** 2)

In [743]:
num_epochs = 200
learning_rate = 0.01
lambda_val = 0.0001 

for epoch in range(num_epochs):
    for normalized_data_batch, one_hot_encoded_batch in zip(normalized_data_train, one_hot_encoded_train):
        activation_L2, activation_L1 = forwardPropagation(normalized_data_batch, W1, b1, W2, b2)
        
        loss = cross_entropy_loss(one_hot_encoded_batch, activation_L2)

        m = activation_L2.shape[0]
        
        # Output layer
        error_output_layer = one_hot_encoded_batch - activation_L2
        d_W2 = np.dot(activation_L1.T, error_output_layer) / m
        d_b2 = np.sum(error_output_layer, axis=0) / m
        
        # Hiddem layer
        error_hidden_layer = np.dot(error_output_layer, W2.T) * relu_derivative(activation_L1)
        d_W1 = np.dot(normalized_data_batch.T, error_hidden_layer) / m
        d_b1 = np.sum(error_hidden_layer, axis=0) / m
        
        # L2 regularization
        d_W1 += lambda_val * W1 / m
        d_W2 += lambda_val * W2 / m
        
        W1 += learning_rate * d_W1
        b1 += learning_rate * d_b1
        W2 += learning_rate * d_W2
        b2 += learning_rate * d_b2
        
    _, test_predictions = forwardPropagation(normalized_data_test, W1, b1, W2, b2)
    test_accuracy = calculate_accuracy(test_predictions, one_hot_encoded_test)
    
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss}, Test Accuracy: {test_accuracy:.2f}%")

    
print(np.isnan(W1).any(), np.isnan(W2).any())
print(np.isnan(activation_L1).any(), np.isnan(activation_L2).any())

Epoch 1/200, Loss: 0.414423152412823, Test Accuracy: 0.28%
Epoch 2/200, Loss: 0.3419255565631778, Test Accuracy: 0.20%
Epoch 3/200, Loss: 0.3066146235111716, Test Accuracy: 0.16%
Epoch 4/200, Loss: 0.28532693438050644, Test Accuracy: 0.16%
Epoch 5/200, Loss: 0.2702665585076447, Test Accuracy: 0.16%
Epoch 6/200, Loss: 0.2581731137958874, Test Accuracy: 0.14%
Epoch 7/200, Loss: 0.24842842246209831, Test Accuracy: 0.20%
Epoch 8/200, Loss: 0.2411512370585035, Test Accuracy: 0.21%
Epoch 9/200, Loss: 0.23502715467180263, Test Accuracy: 0.25%
Epoch 10/200, Loss: 0.22951616154441393, Test Accuracy: 0.24%
Epoch 11/200, Loss: 0.22455402923974124, Test Accuracy: 0.23%
Epoch 12/200, Loss: 0.2198177457069336, Test Accuracy: 0.23%
Epoch 13/200, Loss: 0.2162459123110598, Test Accuracy: 0.23%
Epoch 14/200, Loss: 0.21297554922662582, Test Accuracy: 0.24%
Epoch 15/200, Loss: 0.2102353163643673, Test Accuracy: 0.24%


KeyboardInterrupt: 

In [None]:
def calculate_accuracy(predictions, one_hot_labels):
    predicted_classes = np.argmax(predictions, axis=1)
    actual_classes = np.argmax(one_hot_labels, axis=1) 

    return np.mean(predicted_classes == actual_classes) * 100

_, test_predictions  = forwardPropagation(normalized_data_test, W1, b1, W2, b2)
accuracy = calculate_accuracy(test_predictions, one_hot_encoded_test)

print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 0.01%
