In [20]:
from pathlib import Path

# Directory two levels above the current working directory; requirements.txt is looked up there.
ROOT_DIR = Path('..') / '..'
# IPython shell escape: quietly (-q) install everything listed in ROOT_DIR/requirements.txt.
!pip install -q -r {ROOT_DIR/ 'requirements.txt'}

import torch

from torchvision.datasets import MNIST, VisionDataset
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader

from activations import relu
from devices import Device
from networks import FeedForwardNetwork
from utils import DATA_PATH

### Training a Neural Network with Backpropagation

Backpropagation is like a teacher for neural networks. It helps the network learn from mistakes by making small changes to its internal settings. Here's how it works:

1. **Learning from Examples:** We show the network many examples and tell it the correct answers.
2. **Making Guesses:** The network tries to guess the answer for each example.
3. **Checking Mistakes:** After guessing, we check how far off its guess was from the correct answer.
4. **Learning from Mistakes:** Using the mistakes it made, the network fine-tunes its internal settings to guess better next time.
5. **Repeat:** We keep showing examples until the network gets good at guessing right.

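The magic of backpropagation is in step 4, where it figures out which settings to tweak and by how much: it computes the gradient of the error with respect to every setting. The "tweaking" itself is then done using an optimization method called gradient descent, which nudges each setting a small step in the direction that reduces the error.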

#### Checking if Gradients are Correct: Gradient Checking

Imagine you've got a math formula, and you've made some changes to it. You'd want to double-check if your changes were right. That's what gradient checking does for neural networks.

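In simple terms, gradient checking compares two methods of finding gradients (slopes). One method uses the standard backpropagation technique. The other uses a quick-and-dirty method called "finite difference approximation": nudge a parameter $\theta$ down and up by a tiny amount $\epsilon$ and measure how the loss $L$ changes, $\frac{L(\theta + \epsilon) - L(\theta - \epsilon)}{2\epsilon}$. If both methods give similar answers, we can be pretty sure our backpropagation is set up correctly.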

Here's a simple way to do gradient checking:

In [None]:
from loss import cross_entropy


def check_gradients(epsilon: float = 1e-6):
    """
    Compare backpropagated gradients with central finite differences.

    A small network and a random batch with random one-hot labels are built.
    The gradients are computed once with ``network.backward``; afterwards each
    parameter tensor is shifted as a whole by ``-epsilon`` and ``+epsilon`` to
    estimate the slope of the loss numerically. The absolute gap between the
    numerical estimate and the backpropagated value is printed once per
    parameter tensor.

    NOTE(review): if the parameters are float32, a very small ``epsilon`` is
    close to rounding precision and makes the estimate noisy — confirm the
    dtype or try a larger value.

    :param epsilon: amount added to / subtracted from every entry of a parameter tensor.
    """
    # Disable autograd tracking so the in-place parameter shifts below are allowed
    with torch.no_grad():
        # Set some basics and random data
        samples = 100
        input_size = 300
        output_classes = 10
        # Set up a basic neural network
        # NOTE(review): `sigmoid` is not imported in the visible cells (only `relu` is) —
        # confirm it is in scope before running this cell.
        network = FeedForwardNetwork(input_size, [100, 200], [sigmoid, relu], output_classes)
        parameters = list(network.parameters())
        # Random input
        input_data = torch.randn(samples, input_size)
        # Make random one-hot target labels
        labels = torch.zeros(samples, output_classes)
        targets = torch.randint(0, output_classes, (samples,))
        labels[torch.arange(samples), targets] = 1

        # Backpropagate once at the unperturbed parameters: the result does not
        # depend on which parameter is being checked, so it is hoisted out of the loop.
        pred = network(input_data)
        network.backward(input_data, labels, pred)

        for param in parameters:
            # Check the loss when we reduce the parameter a tiny bit
            param -= epsilon
            loss_minus = cross_entropy(network(input_data), labels)
            # Check the loss when we increase the parameter a tiny bit
            param += 2 * epsilon
            loss_plus = cross_entropy(network(input_data), labels)
            # Quick-and-dirty gradient calculation (central difference)
            estimated_gradient = (loss_plus - loss_minus) / (2 * epsilon)
            # Bring parameter back to original
            param -= epsilon
            # Every entry was shifted by the same epsilon, so the estimate is the
            # derivative along the all-ones direction, i.e. the SUM of the
            # gradient entries (not their mean).
            difference = torch.abs(estimated_gradient - torch.sum(param.grad))
            print(f"Difference between estimated and real gradient: {difference}")


# Run our gradient check with the default epsilon; it prints one difference per parameter tensor
check_gradients()

#### Training the Model

Now that we've verified that our gradients are correct, we can train our model. We'll start with synthetic datasets (`RandomUniformDataset`, `BernoulliDataset` and `RandomNormalDataset`), which generate random data and labels on the fly. They're useful for testing and debugging.

In [None]:
def plot_loss_and_accuracy(losses: list[float], accuracies: list[float]):
    """Draw the recorded loss and accuracy curves, each in its own figure, then show them."""
    # One row per figure: (figure number, title, y-axis label, values, line colour)
    panels = (
        (1, "Loss", "loss", losses, "r"),
        (2, "Accuracy", "acc", accuracies, "b"),
    )
    for figure_number, title, y_label, values, colour in panels:
        axes = pyplot.figure(figure_number).add_subplot(111)
        axes.set_title(title)
        axes.set_xlabel("epochs")
        axes.set_ylabel(y_label)
        axes.plot(values, c=colour)
    pyplot.show()

In [None]:
from datasets import SizedDataset


def convert_to_one_hot(tensor: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """
    Build a one-hot tensor shaped like ``tensor`` from index ``labels``.

    The result has the same shape and dtype as ``tensor``, is zero everywhere
    and holds 1.0 at position ``[i, labels[i]]`` of every row ``i``.
    ``tensor`` itself is not modified.
    """
    row_indices = torch.arange(tensor.size(0))
    encoded = torch.zeros_like(tensor)
    encoded[row_indices, labels] = 1.0
    return encoded


def train_one_batch(network: FeedForwardNetwork, optimizer: StochasticGradientDescent, x: torch.Tensor,
                    y: torch.Tensor) -> None:
    """
    Perform one optimisation step on a single batch.

    The batch ``x`` is passed through the network, the index labels ``y`` are
    turned into a one-hot tensor shaped like the network output, the gradients
    are computed with ``network.backward`` and the optimizer applies its step.
    """
    outputs = network(x)
    network.backward(x, convert_to_one_hot(outputs, y), outputs)
    optimizer.step()


def report_progress(epoch: int, accuracy: float, loss: float, avg_time: float):
    """
    Print a one-line training status.

    The line starts with a carriage return and has no trailing newline, so
    successive calls overwrite each other on the same console line.
    """
    status_line = f"\rEpoch:{epoch:03d} Accuracy:{accuracy:.2f}% Loss:{loss:.4f} Time/epoch:{avg_time:.3f}s"
    print(status_line, end='')


def calculate_average_time(previous_avg: float, current_time: float, epoch: int) -> float:
    """
    Update the running mean of the time spent per epoch.

    ``previous_avg`` is taken as the mean over the first ``epoch - 1`` epochs
    and ``current_time`` as the duration of epoch number ``epoch``.
    """
    # Total time of all earlier epochs, recovered from their mean
    elapsed_before = previous_avg * (epoch - 1)
    return (elapsed_before + current_time) / epoch


def _load_evaluation_set(dataset, device) -> tuple[torch.Tensor, torch.Tensor]:
    """Fetch every sample through the dataset's own ``__getitem__`` as one flattened batch on ``device``."""
    x_all, y_all = next(iter(DataLoader(dataset, batch_size=len(dataset), shuffle=False)))
    return x_all.view(x_all.size(0), -1).float().to(device), y_all.to(device)


def train_feed_forward_network(
        network: FeedForwardNetwork,
        dataset: SizedDataset | VisionDataset,
        optimizer: StochasticGradientDescent,
        epochs: int = 1,
        batch_size: int = 1,
        reports_every: int = 1,
        device=Device.CPU
) -> tuple[list[float], list[float]]:
    """
    Train ``network`` on ``dataset`` and record loss and accuracy over time.

    Every epoch iterates once over the shuffled dataset in batches of
    ``batch_size``; each batch is flattened to two dimensions, cast to float
    and passed to ``train_one_batch``. Every ``reports_every`` epochs the
    network is evaluated on the whole dataset, and the loss and accuracy are
    stored and printed together with the average training time per epoch.

    Evaluation samples are loaded through the dataset itself (not through
    ``dataset.data``), so they receive the same transforms as the training
    batches. torchvision datasets keep raw, untransformed arrays in ``.data``
    (for MNIST: uint8 values 0-255), which would be on a different scale than
    the ``ToTensor``-transformed batches the network is trained on.

    :param network: network to train; it is moved to ``device`` first.
    :param dataset: dataset yielding ``(input, label)`` pairs.
    :param optimizer: optimizer whose ``step`` is called after each batch.
    :param epochs: number of passes over the dataset.
    :param batch_size: number of samples per training batch.
    :param reports_every: evaluate and report every this many epochs.
    :param device: device on which the network and the data are placed.
    :return: ``(losses, accuracies)`` with one float per report; accuracies are percentages.
    """
    # Imported locally so the function does not rely on a `timer` name set up in another cell.
    from timeit import default_timer as timer

    network.to(device)
    data_loader = DataLoader(dataset, batch_size, shuffle=True)
    dataset_size = len(dataset)
    average_time_per_epoch = 0.0
    losses, accuracies = [], []
    # Filled on the first report and reused afterwards, so the dataset is materialised at most once.
    evaluation_set = None
    for epoch in range(1, epochs + 1):
        epoch_start_time = timer()
        for x, y in data_loader:
            x, y = x.view(x.size(0), -1).float().to(device), y.to(device)
            train_one_batch(network, optimizer, x, y)
        # Only the training batches are timed; the evaluation below is excluded.
        average_time_per_epoch = calculate_average_time(average_time_per_epoch, timer() - epoch_start_time, epoch)
        if epoch % reports_every == 0:
            if evaluation_set is None:
                evaluation_set = _load_evaluation_set(dataset, device)
            x_all, true_labels = evaluation_set
            predicted_output = network(x_all)
            onehot_labels = convert_to_one_hot(predicted_output, true_labels)
            # Store a plain float so the history matches the annotated return type
            # and can be plotted regardless of the device.
            loss = float(cross_entropy(predicted_output, onehot_labels))
            losses.append(loss)
            predicted_labels = torch.argmax(predicted_output, dim=1)
            accuracy = 100 * (predicted_labels == true_labels).sum().item() / dataset_size
            accuracies.append(accuracy)
            report_progress(epoch, accuracy, loss, average_time_per_epoch)
    return losses, accuracies


In [None]:
def train_network_on_dataset(dataset_class):
    """
    Train a network with two hidden layers on a generated dataset and plot the result.

    :param dataset_class: callable invoked as ``dataset_class(n_samples, n_features, n_classes)``
        to build the training data.
    """
    # Problem size
    n_samples, n_features, n_classes = 2000, 300, 10

    # Architecture: hidden layer widths, one activation per hidden layer and
    # the optional parameter handed to each activation
    hidden_layer_sizes = [300, 400]
    activation_functions = [celu, relu]
    activation_function_parameters = [float(n_classes), None]

    # Optimisation settings
    learning_rate, epochs, batch_size = 1e-3, 100, 32

    network = FeedForwardNetwork(
        n_features,
        hidden_layer_sizes,
        activation_functions,
        n_classes,
        activation_function_parameters,
    )
    dataset = dataset_class(n_samples, n_features, n_classes)
    optimizer = StochasticGradientDescent(network.parameters(), learning_rate=learning_rate)

    # Autograd tracking is switched off for the whole training run
    with torch.no_grad():
        history = train_feed_forward_network(
            network, dataset, optimizer, epochs=epochs, batch_size=batch_size
        )

    # history is the (losses, accuracies) pair returned by the training loop
    plot_loss_and_accuracy(*history)


# RandomUniformDataset is not imported anywhere in the visible cells; import it
# from the same module the other dataset cells use.
from datasets import RandomUniformDataset

train_network_on_dataset(RandomUniformDataset)

In [None]:
from datasets import BernoulliDataset

# Run the training routine with BernoulliDataset as the data source.
train_network_on_dataset(BernoulliDataset)

In [None]:
from datasets import RandomNormalDataset

# Run the training routine with RandomNormalDataset as the data source.
train_network_on_dataset(RandomNormalDataset)

#### Training the Model on MNIST

Having introduced the MNIST dataset and our chosen training methodology, let's delve into the specifics of putting them into action.

In [None]:
def train_network_on_mnist_dataset():
    """Train a network with three ReLU hidden layers on MNIST and plot loss and accuracy."""
    # Problem size: 784 input features per image, 10 classes
    n_features, n_classes = 784, 10

    # Architecture: every hidden layer uses relu
    hidden_layer_sizes = [512, 1024, 128]
    activation_functions = [relu] * len(hidden_layer_sizes)

    # Optimisation settings
    learning_rate, epochs, batch_size = 1e-5, 30, 32

    network = FeedForwardNetwork(n_features, hidden_layer_sizes, activation_functions, n_classes)

    # Load MNIST from DATA_PATH/mnist, downloading it when it is not there yet.
    # NOTE(review): train=False selects the test split in torchvision — confirm
    # that training on it is intended.
    mnist_root = str(DATA_PATH / "mnist")
    dataset = MNIST(mnist_root, train=False, transform=ToTensor(), download=True)

    optimizer = StochasticGradientDescent(network.parameters(), learning_rate=learning_rate)

    # Autograd tracking is switched off for the whole training run
    with torch.no_grad():
        history = train_feed_forward_network(
            network, dataset, optimizer, epochs=epochs, batch_size=batch_size
        )

    # history is the (losses, accuracies) pair returned by the training loop
    plot_loss_and_accuracy(*history)


# Run the MNIST training; the dataset is downloaded if needed (download=True) and the plots are shown at the end.
train_network_on_mnist_dataset()