## MNIST Digit Classification Neural Network 
## A Pure NumPy Implementation

### First implementation

In [1]:
import numpy as np

In [6]:
# A feedforward neural network implementation for digit classification.
# It uses ReLU activation for the hidden layers and softmax activation for the output layer.
class NeuralNetwork:
    """Feedforward classifier with ReLU hidden layers and a softmax output layer.

    Parameters
    ----------
    input_neurons : int
        Number of features in each input sample.
    hidden_layers : sequence of int, optional
        Width of every hidden layer, in order. ``None`` (the default) means
        two hidden layers of 512 units each. An empty sequence builds a
        network whose only layer maps the input straight to the output.
    output_size : int
        Number of output classes.

    Attributes
    ----------
    weights : list of ndarray
        One ``(fan_in, fan_out)`` matrix per layer, drawn from a standard
        normal distribution and scaled by 0.01.
    biases : list of ndarray
        One ``(1, fan_out)`` row vector per layer, initialised to zero.
    """

    def __init__(self, input_neurons=784, hidden_layers=None, output_size=10):
        # A list literal as the default would be shared by every instance,
        # so the default is resolved here and the sequence is always copied.
        if hidden_layers is None:
            hidden_layers = [512, 512]

        self.input_neurons = input_neurons
        self.hidden_layers = list(hidden_layers)
        self.output_size = output_size

        # Lists to store the weights and biases for each layer
        self.weights = []
        self.biases = []

        # Consecutive pairs of sizes describe every layer:
        # input -> hidden[0] -> ... -> hidden[-1] -> output
        layer_sizes = [input_neurons, *self.hidden_layers, output_size]
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            self.weights.append(0.01 * np.random.randn(fan_in, fan_out))
            self.biases.append(np.zeros((1, fan_out)))

    def forward(self, x_input):
        """Run a forward pass and return the softmax class probabilities.

        Parameters
        ----------
        x_input : ndarray
            2-D batch of samples; its second dimension must match the number
            of rows of the first weight matrix.

        Returns
        -------
        ndarray
            One row of probabilities per input sample; each row sums to 1.
        """
        activation = x_input
        output_index = len(self.weights) - 1

        for i, (weight, bias) in enumerate(zip(self.weights, self.biases)):
            # Linear transformation: Z = A_prev @ W + b
            z = np.dot(activation, weight) + bias

            if i == output_index:
                # Output layer: softmax. Subtracting the row-wise maximum
                # before exponentiating avoids overflow without changing
                # the resulting probabilities.
                exp_shifted = np.exp(z - np.max(z, axis=1, keepdims=True))
                activation = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
            else:
                # Hidden layers: ReLU, max(0, z)
                activation = np.maximum(0, z)

        # Return the final layer output (softmax probabilities)
        return activation

In [7]:
# LossCategoricalCrossEntropy implementation
def LossCategoricalCrossEntropy(yPred, yTrue):
    """Return the mean categorical cross-entropy over a batch.

    yPred holds the predicted probabilities and yTrue the matching target
    weights (e.g. one-hot rows); both are indexed as (sample, class).
    """
    # Keep every probability strictly inside (0, 1) so log() never sees 0,
    # which would produce an infinite loss.
    safe_probs = np.clip(yPred, 1e-10, 1 - 1e-10)

    # Per-sample loss: negative sum over classes of target * log(probability).
    per_sample = -np.sum(yTrue * np.log(safe_probs), axis=1)

    # Average over the samples so the result does not grow with batch size.
    return np.mean(per_sample)

In [8]:
# Build a network using the constructor's default layer sizes
myNeuralNet = NeuralNetwork()

In [9]:
# Smoke test: push a single random sample with 784 features through the network
result = myNeuralNet.forward(np.random.rand(1, 784))
# Bare expression so the notebook displays the returned probabilities
result

array([[0.1001581 , 0.09964019, 0.10055926, 0.09988822, 0.09950752,
        0.10033094, 0.09959257, 0.1002767 , 0.09990941, 0.1001371 ]])

### Backpropagation