In [2]:
import numpy as np

In [3]:
def sigmoid(z):
    """ Sigmoid activation function."""
    return 1 / (1 + np.exp(-z))

def sigmoid_prime(z):
        """Derivative of the sigmoid function."""
        return sigmoid(z) * (1 - sigmoid(z))

def mse(y_pred, y):
    """Mean Squared Error (MSE) cost function."""
    return np.mean((y_pred - y) ** 2)

def mse_prime(y_pred, y):
    """Derivative of the Mean Squared Error (MSE) cost function."""
    return 2 * (y_pred - y) / y.shape[1]
    
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=0, keepdims=True))
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)

def softmax_prime(z):
    s = softmax(z)
    return s * (1 - s)

In [159]:
class NeuralNetwork:
    def __init__(self, layer_sizes):
        """
        Initialize the neural network with random weights and biases.
        
        Args:
            layer_sizes (list): List of integers specifying the number of neurons in each layer.
                               Example: [input_size, hidden_size, ..., output_size]
        """
        self.layer_sizes = layer_sizes
        self.num_layers = len(layer_sizes)
        
        # Initialize weights and biases
        self.weights = [np.random.randn(y, x) for x, y in zip(layer_sizes[:-1], layer_sizes[1:])]
        self.biases = [np.random.randn(y, 1) for y in layer_sizes[1:]]
        

    def forward(self, X):
        # Reshape input to a column vector
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        
        self.a = [X] # List to store activations for each layer
        self.z = [] # List to store weighted inputs for each layer

        for l, (w, b) in enumerate(zip(self.weights, self.biases)):
            z = w @ self.a[-1] + b # Weighted input
            self.z.append(z) 

            # Use softmax for last layer sigmoid for hidden layers
            if l == len(self.biases) - 1: # Last layer (output)
                a = softmax(z) # Softmax for output
            else:
                a = sigmoid(z) # Sigmoid for hidden
            
            self.a.append(a) # Activation

        return self.a[-1] # Output of the network
    

    def backprop(self, y):
        """
        Compute gradients using backpropagation.
        
        Args:
            y: Target outputs of shape (output_size, batch_size).
        
        Returns:
            Tuple (grad_w, grad_b) containing gradients for each layer.
        """
        # Initialize gradient lists
        grad_w = [np.zeros_like(w) for w in self.weights]
        grad_b = [np.zeros_like(b) for b in self.biases]
        
        # Output layer error (delta_L)
        delta = mse_prime(self.a[-1], y) * softmax_prime(self.z[-1])
        # Store output layer gradients
        grad_w[-1] = delta @ self.a[-2].T
        grad_b[-1] = delta
        
        # Propagate error backward through hidden layers
        for l in reversed(range(len(self.weights) - 1)):
            # Calculate delta for current layer
            delta = (self.weights[l + 1].T @ delta) * sigmoid_prime(self.z[l])
            # Compute gradients
            grad_w[l] = delta @ self.a[l].T
            grad_b[l] = delta
        
        return grad_w, grad_b
        

    def update_parameters(self, grad_w, grad_b, eta, batch_size):
        """Update weights and biases using averaged gradients."""
        for l in range(len(self.weights)):
            self.weights[l] -= (eta / batch_size) * grad_w[l]
            self.biases[l] -= (eta / batch_size) * grad_b[l]
            

    def train(self, x_train, y_train, epochs, batch_size, eta, results=False):
        """
        Train the network using mini-batch SGD.
        
        Args:
            x_train: Input data of shape (num_samples, input_size).
            y_train: Target outputs of shape (num_samples, output_size).
            epochs: Number of training epochs.
            batch_size: Size of mini-batches.
            eta: Learning rate.
        """
        num_samples = len(x_train)
        input_size = self.layer_sizes[0]
        output_size = self.layer_sizes[-1]
        
        for epoch in range(epochs):
            # Shuffle data
            indices = np.random.permutation(num_samples)
            X_shuffled = [x_train[i] for i in indices]
            y_shuffled = [y_train[i] for i in indices]

            if results:
                # Initialize lists to store outputs and true labels
                epoch_outputs = []
                epoch_labels = []
            
            # Process mini-batches
            for i in range(0, num_samples, batch_size):
                # Get mini-batch
                X_batch = X_shuffled[i:i+batch_size]
                y_batch = y_shuffled[i:i+batch_size]
                
                # Initialize gradient arrays
                grad_w = [np.zeros_like(w) for w in self.weights]
                grad_b = [np.zeros_like(b) for b in self.biases]
                
                # Accumulate gradients over mini-batch
                for x, y_true in zip(X_batch, y_batch):
                    # Convert to column vectors
                    x = x.reshape(-1, 1)
                    y_true = y_true.reshape(-1, 1)

                    
                    # Forward + Backprop
                    output = self.forward(x)
                    batch_grad_w, batch_grad_b = self.backprop(y_true)

                    if results:
                        epoch_outputs.append(output.flatten())  # Save output
                        epoch_labels.append(y_true.flatten())  # Save true label
                    
                    # Add to total gradients
                    for l in range(len(self.weights)):
                        grad_w[l] += batch_grad_w[l]
                        grad_b[l] += batch_grad_b[l]
                
                # Update parameters (gradients are averaged here)
                self.update_parameters(grad_w, grad_b, eta, batch_size)

            
            # Optional: Print progress
            if results && (epoch % 10 == 0): 
                # Convert lists to numpy arrays
                epoch_outputs = np.array(epoch_outputs)
                epoch_labels = np.array(epoch_labels)
    
                # Calculate loss (MSE)
                loss = mse(epoch_outputs, epoch_labels)
    
                # Calculate accuracy
                predictions = np.argmax(epoch_outputs, axis=1)
                true_labels = np.argmax(epoch_labels, axis=1)
                accuracy = np.mean(predictions == true_labels)
    
                # Print progress
                print(f"Epoch: {epoch + 1}/{epochs}, Loss: {loss:.4f}, Accuracy: {accuracy * 100:.2f}%")


In [143]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist

# Load MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Normalize the data to the range [0, 1]
x_train = x_train / 255.0
x_test = x_test / 255.0

# Flatten the images (28x28 -> 784)
x_train = x_train.reshape(-1, 28*28)
x_test = x_test.reshape(-1, 28*28)

# Convert labels to one-hot encoding
y_train = tf.keras.utils.to_categorical(y_train, 10)
y_test = tf.keras.utils.to_categorical(y_test, 10)

In [161]:
layer_sizes = [784, 16, 16, 10]
nn = NeuralNetwork(layer_sizes)

epochs = 20
batch_size = 32
eta = 0.01

nn.train(x_train, y_train, epochs, batch_size, eta, results=True)

ValueError: operands could not be broadcast together with shapes (10,1) (60000,10) 

In [None]:
"""
epoch loop
    shuffle(training_data)
    break data into minibatches according to batch size
    loop over minibatches
        for a single minibatch loop over the training data inside it
            forward pass with training example as input
            calculate gradient using backprop
        average over the components inside the gradients respectfully to compute an estimate of the total gradient of the cost function 
        update weights and biases according to the learning rate and the gradient (estimated over the gradients of each training example in a minibatch)
"""