In [1]:
from src.activation import ReLU
from src.nn import LinearLayer, SGDWithMomentum
from src.loss import CrossEntropyLoss
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import numpy as np

In [2]:
transform = transforms.Compose([
    transforms.ToTensor(),
])

# Load the training and test datasets
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset  = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Define data loaders
batch_size = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


In [3]:
input_layer = LinearLayer(784, 256)
relu1 = ReLU()
hidden_layer = LinearLayer(256, 128)
relu2 = ReLU()
output_layer = LinearLayer(128, 10)
criterion = CrossEntropyLoss()

# Collect parameters and gradients for the optimizer
parameters = [input_layer.weights, input_layer.bias,
            hidden_layer.weights, hidden_layer.bias,
            output_layer.weights, output_layer.bias]

gradients = [input_layer.weights_grad, input_layer.bias_grad,
            hidden_layer.weights_grad, hidden_layer.bias_grad,
            output_layer.weights_grad, output_layer.bias_grad]

# Initialize optimizer
optimizer = SGDWithMomentum(parameters, learning_rate=0.1, momentum=0.9)

# Training loop
num_epochs = 60
patience = 5
trigger_times = 0
best_loss = float('inf')

for epoch in range(num_epochs):
    epoch_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        # Prepare input and labels
        data = data.view(-1, 28*28).numpy()
        data = (data - data.mean()) / (data.std() + 1e-8)
        target = target.numpy()
        
        # Forward pass
        out = input_layer.forward(data)
        out = relu1.forward(out)
        out = hidden_layer.forward(out)
        out = relu2.forward(out)
        out = output_layer.forward(out)
        loss = criterion.forward(out, target)
        
        # Backward pass
        grad_loss = criterion.backward()
        grad_output = output_layer.backward(grad_loss)
        grad_relu2 = relu2.backward(grad_output)
        grad_hidden = hidden_layer.backward(grad_relu2)
        grad_relu1 = relu1.backward(grad_hidden)
        grad_input = input_layer.backward(grad_relu1)
        
        # Print gradients to check if they are non-zero and sensible
        print("Gradients for input layer weights:", np.mean(input_layer.weights_grad))
        print("Gradients for hidden layer weights:", np.mean(hidden_layer.weights_grad))
        print("Gradients for output layer weights:", np.mean(output_layer.weights_grad))

        optimizer.step(parameters, gradients)
        
        input_layer.zero_grad()
        hidden_layer.zero_grad()
        output_layer.zero_grad()
        
        epoch_loss += loss

    # Print average loss for the epoch
    avg_loss = epoch_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')
    
    # Early stopping
    if avg_loss < best_loss:
        best_loss = avg_loss
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print('Early stopping on epoch:', epoch)
            break

Gradients for input layer weights: -9.225513329476314e-05
Gradients for hidden layer weights: 0.00023099092524554314
Gradients for output layer weights: -4.98732999343332e-19
Gradients for input layer weights: -2.839363233550367e-06
Gradients for hidden layer weights: 0.0012072990994847041
Gradients for output layer weights: -3.2526065174565133e-19
Gradients for input layer weights: -1.1658616210003862e-07
Gradients for hidden layer weights: 0.0019772256096315757
Gradients for output layer weights: -1.0842021724855044e-19
Gradients for input layer weights: -7.811919267120919e-05
Gradients for hidden layer weights: -0.0003006361539481362
Gradients for output layer weights: -1.3444106938820255e-18
Gradients for input layer weights: -1.0939919490216022e-05
Gradients for hidden layer weights: 0.0028755555200341695
Gradients for output layer weights: 2.168404344971009e-19
Gradients for input layer weights: 7.27603152734289e-05
Gradients for hidden layer weights: -3.5801303264304976e-05
Grad

KeyboardInterrupt: 

In [None]:
from src.activation import Softmax
softmax = Softmax()
correct = 0
total = 0
for data, target in test_loader:
    data = data.view(-1, 28*28).numpy()
    target = target.numpy()
    
    # Forward pass
    out = input_layer.forward(data)
    out = relu1.forward(out)
    out = hidden_layer.forward(out)
    out = relu2.forward(out)
    out = output_layer.forward(out)
    logits = softmax.forward(out)
    
    predicted = np.argmax(logits, axis=1)
    total += target.size
    correct += (predicted == target).sum()
accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')

Test Accuracy: 9.13%
