# In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import time

# Pin the NumPy and PyTorch global RNGs to the same fixed seed so that
# repeated runs start from an identical random state.
np.random.seed(42)
torch.manual_seed(42)

class Config:
    """Hyperparameters shared by the data loaders, the two models and the trainer."""

    # --- Task / data ---
    NUM_CLASSES: int = 10
    INPUT_SIZE: int = 28 * 28  # flattened MNIST image (28 x 28 pixels)

    # --- Optimisation ---
    EPOCHS: int = 10
    BATCH_SIZE: int = 128
    LEARNING_RATE: float = 1e-3

    # --- Fully connected layers ---
    HIDDEN_SIZE_1: int = 512
    HIDDEN_SIZE_2: int = 256
    DROPOUT_RATE: float = 0.5

    # --- Convolutional layers ---
    CNN_CHANNELS_1: int = 32
    CNN_CHANNELS_2: int = 64
    CNN_KERNEL_SIZE: int = 3
    POOL_SIZE: int = 2

def get_device():
    """Select the compute device: CUDA when PyTorch reports one, otherwise the CPU.

    Prints which device was chosen. For CUDA it also prints the name and the
    total memory (in GB) of device index 0.

    Returns:
        torch.device: ``cuda`` if available, else ``cpu``.
    """
    if not torch.cuda.is_available():
        print("Using CPU")
        return torch.device('cpu')

    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU Memory: {total_bytes / 1e9:.1f} GB")
    return torch.device('cuda')

def load_mnist_data(data_dir='./data', batch_size=None, num_workers=2):
    """
    Load and preprocess the MNIST dataset.

    Both splits are fetched with ``download=True`` into ``data_dir`` and share
    one preprocessing pipeline (tensor conversion followed by normalization).
    Dataset sizes and the batch size are printed.

    Args:
        data_dir: Directory the dataset is stored in / downloaded to.
        batch_size: Samples per batch for both loaders. ``None`` (default)
            means ``Config.BATCH_SIZE``, looked up when the function is called.
        num_workers: Number of worker processes used by each DataLoader.

    Returns:
        Tuple ``(train_loader, test_loader)``; only the training loader shuffles.
    """
    if batch_size is None:
        batch_size = Config.BATCH_SIZE

    # Data preprocessing pipeline
    transform = transforms.Compose([
        transforms.ToTensor(),  # Convert PIL Image to a float tensor scaled to [0, 1]
        transforms.Normalize((0.1307,), (0.3081,)),  # Single-channel mean / std used for MNIST
    ])

    # Download (if needed) and load the training split
    train_dataset = torchvision.datasets.MNIST(
        root=data_dir,
        train=True,
        download=True,
        transform=transform,
    )

    # Download (if needed) and load the test split
    test_dataset = torchvision.datasets.MNIST(
        root=data_dir,
        train=False,
        download=True,
        transform=transform,
    )

    # Settings common to both loaders. Pinned host memory is only requested
    # when a CUDA device is present, where it speeds up host-to-GPU copies.
    loader_kwargs = {
        'batch_size': batch_size,
        'num_workers': num_workers,
        'pin_memory': torch.cuda.is_available(),
    }

    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

    print(f"Training samples: {len(train_dataset)}")
    print(f"Test samples: {len(test_dataset)}")
    print(f"Batch size: {batch_size}")

    return train_loader, test_loader

class ANN(nn.Module):
    """
    Artificial Neural Network (Fully Connected Network)
    Architecture: Input -> FC1 -> ReLU -> Dropout -> FC2 -> ReLU -> Dropout -> FC3 -> Output

    Layer widths and the dropout rate are read from ``Config``. The output of
    ``forward`` is the raw FC3 activation (no softmax is applied here).
    """
    def __init__(self):
        super().__init__()

        # Define layers
        self.fc1 = nn.Linear(Config.INPUT_SIZE, Config.HIDDEN_SIZE_1)
        self.fc2 = nn.Linear(Config.HIDDEN_SIZE_1, Config.HIDDEN_SIZE_2)
        self.fc3 = nn.Linear(Config.HIDDEN_SIZE_2, Config.NUM_CLASSES)

        # One dropout module, applied after each hidden layer in forward()
        self.dropout = nn.Dropout(Config.DROPOUT_RATE)

        # Initialize weights
        self._initialize_weights()

    def _initialize_weights(self):
        """Initialize network weights using Xavier/Glorot initialization and zero biases"""
        for layer in [self.fc1, self.fc2, self.fc3]:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Forward propagation.

        Args:
            x: Batch of inputs; every dimension after the first is flattened
               into a single feature vector per sample.

        Returns:
            The output of the last linear layer, one row per sample.
        """
        # flatten() falls back to a copy when the input is not contiguous,
        # whereas view() would raise on such tensors.
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

class CNN(nn.Module):
    """
    Convolutional Neural Network
    Architecture: Conv1 -> ReLU -> MaxPool -> Conv2 -> ReLU -> MaxPool -> Flatten -> FC1 -> ReLU -> Dropout -> FC2

    Channel counts, kernel size, pool size, hidden width and dropout rate are
    read from ``Config``. The output of ``forward`` is the raw FC2 activation.
    """
    def __init__(self):
        super().__init__()

        # Convolutions take a single input channel and use padding=1.
        self.conv1 = nn.Conv2d(1, Config.CNN_CHANNELS_1, Config.CNN_KERNEL_SIZE, padding=1)
        self.conv2 = nn.Conv2d(Config.CNN_CHANNELS_1, Config.CNN_CHANNELS_2, Config.CNN_KERNEL_SIZE, padding=1)
        # The same pooling module is reused after both convolutions.
        self.pool = nn.MaxPool2d(Config.POOL_SIZE, Config.POOL_SIZE)

        # The classifier expects a 7x7 map per channel after the second pool,
        # i.e. a 28x28 input with the kernel/padding/pool values currently in
        # Config (28 -> 14 -> 7). Other settings need this size adjusted.
        self.fc_input_size = Config.CNN_CHANNELS_2 * 7 * 7
        self.fc1 = nn.Linear(self.fc_input_size, Config.HIDDEN_SIZE_2)
        self.fc2 = nn.Linear(Config.HIDDEN_SIZE_2, Config.NUM_CLASSES)
        self.dropout = nn.Dropout(Config.DROPOUT_RATE)
        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-normal init for conv weights, Xavier-uniform for linear weights, zero biases"""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        """Forward propagation.

        Args:
            x: Batch of single-channel images shaped (batch, 1, height, width).

        Returns:
            The output of the last linear layer, one row per sample.
        """
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        # flatten() copies when the pooled feature maps are not contiguous
        # (e.g. under the channels_last memory format); view() would raise.
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

class NetworkTrainer:
    """Training and evaluation handler for one classification model.

    Holds the model, its data loaders, a cross-entropy criterion and an Adam
    optimizer, and records per-epoch loss / accuracy for both loaders.
    """

    def __init__(self, model, device, train_loader, test_loader):
        """Move ``model`` to ``device`` and set up loss, optimizer and history lists.

        Args:
            model: The ``nn.Module`` to train.
            device: Device the model and every batch are moved to.
            train_loader: Iterable of ``(data, targets)`` batches used for training.
            test_loader: Iterable of ``(data, targets)`` batches used for evaluation.
        """
        self.model = model.to(device)
        self.device = device
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=Config.LEARNING_RATE)
        self.train_losses, self.train_accuracies = [], []
        self.test_losses, self.test_accuracies = [], []

    def train_epoch(self):
        """Run one optimization pass over ``train_loader``.

        Returns:
            Tuple ``(mean_loss, accuracy_percent)`` over all samples seen.

        Raises:
            ZeroDivisionError: If the loader yields no samples.
        """
        self.model.train()
        loss_sum, correct, total = 0.0, 0, 0
        for data, targets in tqdm(self.train_loader, desc='Training'):
            data, targets = data.to(self.device), targets.to(self.device)
            self.optimizer.zero_grad()
            outputs = self.model(data)
            loss = self.criterion(outputs, targets)
            loss.backward()
            self.optimizer.step()

            batch_size = targets.size(0)
            # The criterion returns the batch mean; weighting it by the batch
            # size keeps a shorter final batch from skewing the epoch average.
            loss_sum += loss.item() * batch_size
            total += batch_size
            predicted = outputs.detach().argmax(dim=1)
            correct += (predicted == targets).sum().item()
        return loss_sum / total, 100. * correct / total

    def evaluate(self):
        """Score the model on ``test_loader`` without updating any weights.

        Returns:
            Tuple ``(mean_loss, accuracy_percent)`` over all samples seen.

        Raises:
            ZeroDivisionError: If the loader yields no samples.
        """
        self.model.eval()
        loss_sum, correct, total = 0.0, 0, 0
        with torch.no_grad():
            for data, targets in self.test_loader:
                data, targets = data.to(self.device), targets.to(self.device)
                outputs = self.model(data)

                batch_size = targets.size(0)
                # Same per-sample weighting as in train_epoch().
                loss_sum += self.criterion(outputs, targets).item() * batch_size
                total += batch_size
                correct += (outputs.argmax(dim=1) == targets).sum().item()
        return loss_sum / total, 100. * correct / total

    def train(self, epochs):
        """Train for ``epochs`` epochs, evaluating after each one.

        Every epoch appends to the four history lists and prints a progress line.

        Returns:
            Tuple ``(train_losses, train_accuracies, test_losses, test_accuracies)``;
            these are the trainer's own history lists, not copies.
        """
        for epoch in range(epochs):
            train_loss, train_acc = self.train_epoch()
            test_loss, test_acc = self.evaluate()
            self.train_losses.append(train_loss)
            self.train_accuracies.append(train_acc)
            self.test_losses.append(test_loss)
            self.test_accuracies.append(test_acc)
            print(f"Epoch {epoch+1}/{epochs}, Train Acc: {train_acc:.2f}%, Test Acc: {test_acc:.2f}%")
        return self.train_losses, self.train_accuracies, self.test_losses, self.test_accuracies

def main():
    """Train the ANN, then the CNN, on MNIST and print each model's final test accuracy."""
    device = get_device()
    loaders = load_mnist_data()

    # Both networks are built before any trainer exists, ANN first.
    ann_model = ANN()
    cnn_model = CNN()
    ann_trainer = NetworkTrainer(ann_model, device, *loaders)
    cnn_trainer = NetworkTrainer(cnn_model, device, *loaders)

    # train() returns four history lists; the last one holds test accuracies.
    *_, ann_test_acc = ann_trainer.train(Config.EPOCHS)
    *_, cnn_test_acc = cnn_trainer.train(Config.EPOCHS)

    print("ANN Final Accuracy:", ann_test_acc[-1])
    print("CNN Final Accuracy:", cnn_test_acc[-1])

# Run the ANN/CNN training comparison only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()