In [2]:
# Install sklearn if not present (usually is in Colab)
!pip install scikit-learn



In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign


import time
import os
import copy
from sklearn.metrics import f1_score, accuracy_score
import numpy as np
import matplotlib.pyplot as plt

# Silence warnings if any (optional)
import warnings
warnings.filterwarnings('ignore')

In [4]:
print("--- Setup ---")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

--- Setup ---
Using device: cuda


In [5]:
dataset_root_dir = '/content/data'
# Example if using the specific path from the prompt (less standard for torchvision):
# You might need to manually structure it like: /content/drive/MyDrive/dataset/At2/MNIST/raw/
# mnist_raw_path_train = '/content/drive/MyDrive/dataset/At2/train-images.idx3-ubyte'
# mnist_raw_path_test = '/content/drive/MyDrive/dataset/At2/t10k-images.idx3-ubyte'
# For simplicity, we let torchvision manage the download/structure in dataset_root_dir

# --- Hyperparameters ---
batch_size = 128 # Increased batch size for potentially faster GPU training
learning_rate_cnn = 0.001
learning_rate_finetune = 0.0005 # Often use smaller LR for fine-tuning
num_epochs_cnn = 10         # Standard for CNN on MNIST
num_epochs_finetune = 8     # Fine-tuning might converge faster
num_classes = 10
weight_decay = 1e-4 # Regularization

In [6]:
print("\n--- Data Loading & Preprocessing ---")

# --- Transforms ---
# For standard CNN and Faster R-CNN (adapted input)
transform_mnist = transforms.Compose([
    transforms.ToTensor(), # Convert PIL image or numpy array to FloatTensor (C x H x W) and scale [0,1]
    transforms.Normalize((0.1307,), (0.3081,)) # MNIST specific mean/std
])

# For Faster R-CNN and Pre-trained models (expect 3 channels and often larger size)
transform_rgb_resized = transforms.Compose([
    transforms.Grayscale(num_output_channels=3), # Convert MNIST to 3 channels
    transforms.Resize((224, 224)), # Resize to match VGG/AlexNet/Faster R-CNN input
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # ImageNet mean/std
])


# --- Datasets ---
# Use dataset_root_dir. 'download=True' will download if not found in the specified structure.
train_dataset_mnist = datasets.MNIST(root=dataset_root_dir, train=True, download=True, transform=transform_mnist)
test_dataset_mnist = datasets.MNIST(root=dataset_root_dir, train=False, download=True, transform=transform_mnist)

train_dataset_rgb = datasets.MNIST(root=dataset_root_dir, train=True, download=True, transform=transform_rgb_resized)
test_dataset_rgb = datasets.MNIST(root=dataset_root_dir, train=False, download=True, transform=transform_rgb_resized)


--- Data Loading & Preprocessing ---


100%|██████████| 9.91M/9.91M [00:02<00:00, 4.56MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 133kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 1.27MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 8.77MB/s]


In [7]:
train_loader_mnist = DataLoader(dataset=train_dataset_mnist, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader_mnist = DataLoader(dataset=test_dataset_mnist, batch_size=batch_size, shuffle=False, num_workers=2)

train_loader_rgb = DataLoader(dataset=train_dataset_rgb, batch_size=batch_size // 4 , shuffle=True, num_workers=2) # Reduce batch size for larger images
test_loader_rgb = DataLoader(dataset=test_dataset_rgb, batch_size=batch_size // 4, shuffle=False, num_workers=2)

print(f"MNIST dataset: Train={len(train_dataset_mnist)}, Test={len(test_dataset_mnist)}")
print(f"RGB Resized dataset: Train={len(train_dataset_rgb)}, Test={len(test_dataset_rgb)}")
print(f"MNIST Batch size: {batch_size}, RGB Batch size: {batch_size // 4}")

MNIST dataset: Train=60000, Test=10000
RGB Resized dataset: Train=60000, Test=10000
MNIST Batch size: 128, RGB Batch size: 32


In [8]:
print("\n--- Model Definitions ---")

# --- Part 1: Simple CNN ---
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=10):
        super(SimpleCNN, self).__init__()
        # Input: 1x28x28
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=5, stride=1, padding=2)
        # Shape: 32x28x28
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Shape: 32x14x14
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2)
        # Shape: 64x14x14
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Shape: 64x7x7
        self.fc1 = nn.Linear(64 * 7 * 7, 1000) # Flattened size
        self.dropout = nn.Dropout(0.5) # Regularization
        self.fc2 = nn.Linear(1000, num_classes)

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = x.view(-1, 64 * 7 * 7) # Flatten the tensor
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

print("SimpleCNN defined.")

# --- Part 2: Faster R-CNN (Adaptation Attempt) ---
# NOTE: Faster R-CNN is fundamentally an OBJECT DETECTOR.
# Using it for pure classification is highly inefficient and non-standard.
# We will load a pre-trained model and demonstrate INFERENCE,
# but proper training as a classifier is complex and not recommended for this task.
# We will extract class predictions from its output for comparison purposes.

def get_faster_rcnn_model(num_classes_det=num_classes + 1): # Add 1 for background class
    # Load a pre-trained Faster R-CNN model (ResNet50 backbone)
    model = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    # Get the number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features

    # Replace the pre-trained head with a new one (num_classes + background)
    model.roi_heads.box_predictor = models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes_det)

    # --- Adaptation for Grayscale Input (Optional, handled by transforms instead) ---
    # If we didn't use Grayscale(3) in transforms, we could modify the first layer:
    # model.backbone.body.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)

    return model

print("Faster R-CNN loading function defined (for inference adaptation).")


# --- Part 4: Fine-tuning Pre-trained Models ---
def get_finetuned_model(model_name, num_classes, use_pretrained=True, freeze_features=True):
    model_ft = None
    input_size = 224 # VGG/AlexNet expect 224x224

    if model_name == "vgg16":
        model_ft = models.vgg16(pretrained=use_pretrained)
        # Freeze feature parameters
        if freeze_features:
            for param in model_ft.features.parameters():
                param.requires_grad = False
        # Replace the classifier
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)

    elif model_name == "alexnet":
        model_ft = models.alexnet(pretrained=use_pretrained)
        # Freeze feature parameters
        if freeze_features:
            for param in model_ft.features.parameters():
                param.requires_grad = False
        # Replace the classifier
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)

    else:
        print("Invalid model name, exiting...")
        exit()

    # --- Adaptation for Grayscale Input (Optional, handled by transforms instead) ---
    # If we didn't use Grayscale(3) in transforms, we could modify the first layer:
    # if model_name in ["vgg16", "alexnet"]:
    #     # Get original weights
    #     original_conv1 = model_ft.features[0]
    #     original_weights = original_conv1.weight.data
    #     # Create new conv layer for 1 channel input
    #     new_conv1 = nn.Conv2d(1, original_conv1.out_channels, kernel_size=original_conv1.kernel_size,
    #                           stride=original_conv1.stride, padding=original_conv1.padding, bias=(original_conv1.bias is not None))
    #     # Average weights across the input channels (simple approach)
    #     new_conv1.weight.data = torch.mean(original_weights, dim=1, keepdim=True)
    #     if original_conv1.bias is not None:
    #         new_conv1.bias.data = original_conv1.bias.data
    #     model_ft.features[0] = new_conv1
    # else: # ResNet etc. have different structure
    #     pass # Handle other models if needed

    return model_ft, input_size

print("Fine-tuning functions defined for VGG16, AlexNet.")


--- Model Definitions ---
SimpleCNN defined.
Faster R-CNN loading function defined (for inference adaptation).
Fine-tuning functions defined for VGG16, AlexNet.


In [10]:
print("\n--- Training & Evaluation Functions ---")

# --- Generic Training Function (for Classifiers: CNN, Fine-tuned) ---
def train_classifier(model, device, train_loader, optimizer, criterion, epoch, log_interval=100):
    model.train()
    train_loss = 0
    correct = 0
    start_time = time.time()

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
        correct += pred.eq(target.view_as(pred)).sum().item()

        if batch_idx % log_interval == 0:
             print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '
                   f'({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')

    end_time = time.time()
    train_loss /= len(train_loader) # Avg loss per batch
    accuracy = 100. * correct / len(train_loader.dataset)
    training_time = end_time - start_time
    print(f'\nTraining Set: Average loss: {train_loss:.4f}, Accuracy: {correct}/{len(train_loader.dataset)} ({accuracy:.2f}%), Time: {training_time:.2f}s')
    return train_loss, accuracy, training_time


# --- Generic Evaluation Function (for Classifiers: CNN, Fine-tuned) ---
def evaluate_classifier(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()  # sum up batch loss (reduction='sum' default)
            pred = output.argmax(dim=1)
            all_preds.extend(pred.cpu().numpy())
            all_targets.extend(target.cpu().numpy())

    test_loss /= len(test_loader) # Avg loss per batch
    accuracy = accuracy_score(all_targets, all_preds) * 100
    f1 = f1_score(all_targets, all_preds, average='weighted') # Use weighted for multiclass

    print(f'Test set: Average loss: {test_loss:.4f}, Accuracy: {accuracy:.2f}%, F1 Score: {f1:.4f}\n')
    return test_loss, accuracy, f1


--- Training & Evaluation Functions ---


In [15]:
# --- Evaluation Function ADAPTED for Faster R-CNN Inference ---
# This function runs inference and tries to interpret the output for classification
def evaluate_faster_rcnn_adapted(model, device, test_loader):
    model.eval()
    all_preds = []
    all_targets = []
    inference_time = 0

    with torch.no_grad():
        # The loop correctly gets images and the original MNIST labels (targets)
        for images, targets in test_loader:
            # Prepare images for the model (list of tensors)
            images = list(img.to(device) for img in images)

            # !!! REMOVE THIS LINE - Not needed for inference !!!
            # targets_list = [{k: v.to(device) for k, v in t.items()} for t in [{'labels': target.unsqueeze(0)}] for target in targets]

            start_time = time.time()
            # Pass only images during evaluation
            outputs = model(images)
            inference_time += (time.time() - start_time)

            # --- Interpretation for Classification ---
            # For each image's output, find the detection with the highest score
            # and use its label as the classification prediction.
            # This is a heuristic for this task.
            for i, output in enumerate(outputs):
                if len(output['labels']) > 0:
                    # Get score and label of highest confidence detection
                    # FRCNN output labels include background (0), so map back if needed
                    # Our model head predicts num_classes + 1 (background)
                    # We only care about the actual digit classes (1 to 10 conceptually)
                    best_score_idx = torch.argmax(output['scores'])
                    predicted_label = output['labels'][best_score_idx].item()

                    # Map background (class 0) or adjust if necessary based on predictor output
                    # Assuming the predictor outputs 0 for bg, 1 for digit '0', ..., 10 for digit '9'
                    # We need to map class 1 -> target 0, class 2 -> target 1, ..., class 10 -> target 9
                    if predicted_label > 0 : # Ignore background predictions if any show up
                        mapped_pred = predicted_label - 1 # Map back to 0-9 range
                        all_preds.append(mapped_pred)
                    else:
                        # Handle cases where only background is detected or no boxes found
                        # Assign a default prediction (e.g., -1 or a random class) or the most frequent class?
                        # For simplicity, let's append a placeholder or skip if only background detected.
                        # We'll append -1 and filter later if necessary, or just take the most likely non-bg class
                        non_bg_indices = (output['labels'] > 0).nonzero(as_tuple=True)[0]
                        if len(non_bg_indices) > 0:
                            best_non_bg_score_idx = non_bg_indices[torch.argmax(output['scores'][non_bg_indices])]
                            mapped_pred = output['labels'][best_non_bg_score_idx].item() - 1
                            all_preds.append(mapped_pred)
                        else:
                             all_preds.append(-1) # Indicate no digit detected / only background

                else:
                    # No detections found for this image
                    all_preds.append(-1) # Use -1 to indicate no prediction / failure

                # Append the ground truth label from the dataloader's 'targets'
                all_targets.append(targets[i].item()) # This uses the correct 'targets' (plural)

    # Filter out failed predictions (-1) for metric calculation
    valid_indices = [i for i, p in enumerate(all_preds) if p != -1]
    filtered_preds = [all_preds[i] for i in valid_indices]
    filtered_targets = [all_targets[i] for i in valid_indices]

    if len(filtered_preds) == 0:
        print("Warning: Faster R-CNN adaptation failed to produce valid predictions.")
        return 0.0, 0.0, 0.0, inference_time / len(test_loader.dataset) if len(test_loader.dataset) > 0 else 0

    # Ensure lengths match after filtering before calculating metrics
    if len(filtered_preds) != len(filtered_targets):
         print(f"Warning: Mismatch between filtered predictions ({len(filtered_preds)}) and targets ({len(filtered_targets)}). Skipping metrics.")
         return 0.0, 0.0, 0.0, inference_time / len(test_loader.dataset) if len(test_loader.dataset) > 0 else 0


    accuracy = accuracy_score(filtered_targets, filtered_preds) * 100
    f1 = f1_score(filtered_targets, filtered_preds, average='weighted')
    avg_inference_time_per_image = inference_time / len(test_loader.dataset) if len(test_loader.dataset) > 0 else 0


    print(f'Faster R-CNN (Adapted Eval): Accuracy: {accuracy:.2f}% (on {len(filtered_preds)}/{len(all_targets)} images), F1 Score: {f1:.4f}, Avg Inference Time: {avg_inference_time_per_image*1000:.2f} ms/image\n')
    # Note: Loss calculation is not straightforward here as we didn't train it for classification loss.
    return 0.0, accuracy, f1, avg_inference_time_per_image # Return 0 for loss

In [12]:
results = {} # Dictionary to store metrics for comparison

# --- Part 1: Train and Evaluate Simple CNN ---
print("\n--- Part 1: Simple CNN ---")
cnn_model = SimpleCNN(num_classes=num_classes).to(device)
cnn_optimizer = optim.Adam(cnn_model.parameters(), lr=learning_rate_cnn, weight_decay=weight_decay)
cnn_criterion = nn.CrossEntropyLoss()

cnn_total_train_time = 0
print("Training CNN...")
for epoch in range(1, num_epochs_cnn + 1):
    train_loss, train_acc, train_time = train_classifier(cnn_model, device, train_loader_mnist, cnn_optimizer, cnn_criterion, epoch)
    cnn_total_train_time += train_time
    if epoch == num_epochs_cnn: # Evaluate only on the last epoch for final metrics
         print("\nEvaluating CNN...")
         test_loss, test_acc, test_f1 = evaluate_classifier(cnn_model, device, test_loader_mnist, cnn_criterion)
         results['SimpleCNN'] = {'Loss': test_loss, 'Accuracy': test_acc, 'F1 Score': test_f1, 'Training Time (s)': cnn_total_train_time}


--- Part 1: Simple CNN ---
Training CNN...

Training Set: Average loss: 0.1421, Accuracy: 57362/60000 (95.60%), Time: 15.59s

Training Set: Average loss: 0.0485, Accuracy: 59054/60000 (98.42%), Time: 15.64s

Training Set: Average loss: 0.0356, Accuracy: 59338/60000 (98.90%), Time: 14.03s

Training Set: Average loss: 0.0290, Accuracy: 59445/60000 (99.08%), Time: 15.23s

Training Set: Average loss: 0.0249, Accuracy: 59520/60000 (99.20%), Time: 13.57s

Training Set: Average loss: 0.0230, Accuracy: 59543/60000 (99.24%), Time: 13.47s

Training Set: Average loss: 0.0196, Accuracy: 59593/60000 (99.32%), Time: 14.10s

Training Set: Average loss: 0.0192, Accuracy: 59617/60000 (99.36%), Time: 14.71s

Training Set: Average loss: 0.0180, Accuracy: 59639/60000 (99.40%), Time: 13.84s

Training Set: Average loss: 0.0164, Accuracy: 59682/60000 (99.47%), Time: 13.68s

Evaluating CNN...
Test set: Average loss: 0.0224, Accuracy: 99.27%, F1 Score: 0.9927



In [16]:
print("\n--- Part 2: Faster R-CNN (Adapted Evaluation) ---")
print("NOTE: Loading pre-trained Faster R-CNN and adapting output for classification.")
print("This is NOT training Faster R-CNN for classification, only evaluating its potential.")

try:
    # Remember: num_classes for detection = actual classes + background
    frcnn_model = get_faster_rcnn_model(num_classes_det=num_classes + 1).to(device)
    # No training loop here as it's non-standard and computationally expensive for this task.
    # We just evaluate using the adapted function.
    print("Evaluating Faster R-CNN (Adapted)...")
    frcnn_loss, frcnn_acc, frcnn_f1, frcnn_inf_time = evaluate_faster_rcnn_adapted(frcnn_model, device, test_loader_rgb)
    # Training time is N/A as we are not training it here. Inference time is reported per image.
    results['FasterRCNN_Adapted'] = {'Loss': frcnn_loss, 'Accuracy': frcnn_acc, 'F1 Score': frcnn_f1, 'Training Time (s)': 'N/A (Inference Only)'}
except Exception as e:
    print(f"Could not run Faster R-CNN evaluation: {e}")
    print("Skipping Faster R-CNN part.")
    results['FasterRCNN_Adapted'] = {'Loss': float('nan'), 'Accuracy': float('nan'), 'F1 Score': float('nan'), 'Training Time (s)': 'N/A (Error)'}


--- Part 2: Faster R-CNN (Adapted Evaluation) ---
NOTE: Loading pre-trained Faster R-CNN and adapting output for classification.
This is NOT training Faster R-CNN for classification, only evaluating its potential.
Evaluating Faster R-CNN (Adapted)...
Faster R-CNN (Adapted Eval): Accuracy: 11.35% (on 10000/10000 images), F1 Score: 0.0231, Avg Inference Time: 80.91 ms/image



In [17]:
print("\n--- Part 4: Fine-tuning Pre-trained Models ---")

for model_name in ["vgg16", "alexnet"]:
    print(f"\n--- Fine-tuning {model_name} ---")
    ft_model, _ = get_finetuned_model(model_name, num_classes, use_pretrained=True, freeze_features=True)
    ft_model = ft_model.to(device)

    # Observe that only parameters of final layer are being optimized as
    # opposed to before.
    params_to_update = []
    print("Params to learn:")
    for name, param in ft_model.named_parameters():
        if param.requires_grad:
            params_to_update.append(param)
            # print("\t", name) # Uncomment to see layers being trained

    ft_optimizer = optim.Adam(params_to_update, lr=learning_rate_finetune, weight_decay=weight_decay)
    ft_criterion = nn.CrossEntropyLoss()

    ft_total_train_time = 0
    print(f"Training {model_name} (Fine-tuning)...")
    for epoch in range(1, num_epochs_finetune + 1):
        # Use the RGB loader for fine-tuning
        train_loss, train_acc, train_time = train_classifier(ft_model, device, train_loader_rgb, ft_optimizer, ft_criterion, epoch)
        ft_total_train_time += train_time
        if epoch == num_epochs_finetune:
            print(f"\nEvaluating {model_name} (Fine-tuned)...")
            # Use the RGB loader for evaluation as well
            test_loss, test_acc, test_f1 = evaluate_classifier(ft_model, device, test_loader_rgb, ft_criterion)
            results[f'{model_name}_FineTuned'] = {'Loss': test_loss, 'Accuracy': test_acc, 'F1 Score': test_f1, 'Training Time (s)': ft_total_train_time}


--- Part 4: Fine-tuning Pre-trained Models ---

--- Fine-tuning vgg16 ---


Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:10<00:00, 52.1MB/s]


Params to learn:
Training vgg16 (Fine-tuning)...

Training Set: Average loss: 0.1638, Accuracy: 57483/60000 (95.81%), Time: 418.52s

Training Set: Average loss: 0.1148, Accuracy: 58524/60000 (97.54%), Time: 418.96s

Training Set: Average loss: 0.1049, Accuracy: 58674/60000 (97.79%), Time: 418.22s

Training Set: Average loss: 0.1013, Accuracy: 58786/60000 (97.98%), Time: 418.29s

Training Set: Average loss: 0.0984, Accuracy: 58877/60000 (98.13%), Time: 418.56s

Training Set: Average loss: 0.0965, Accuracy: 58927/60000 (98.21%), Time: 418.26s

Training Set: Average loss: 0.0936, Accuracy: 58970/60000 (98.28%), Time: 418.03s

Training Set: Average loss: 0.0954, Accuracy: 58965/60000 (98.28%), Time: 417.92s

Evaluating vgg16 (Fine-tuned)...
Test set: Average loss: 0.0542, Accuracy: 99.00%, F1 Score: 0.9900


--- Fine-tuning alexnet ---


Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth
100%|██████████| 233M/233M [00:02<00:00, 102MB/s]


Params to learn:
Training alexnet (Fine-tuning)...

Training Set: Average loss: 0.1474, Accuracy: 57408/60000 (95.68%), Time: 130.50s

Training Set: Average loss: 0.0927, Accuracy: 58491/60000 (97.48%), Time: 130.03s

Training Set: Average loss: 0.0886, Accuracy: 58589/60000 (97.65%), Time: 128.25s

Training Set: Average loss: 0.0801, Accuracy: 58703/60000 (97.84%), Time: 132.33s

Training Set: Average loss: 0.0787, Accuracy: 58691/60000 (97.82%), Time: 129.40s

Training Set: Average loss: 0.0776, Accuracy: 58738/60000 (97.90%), Time: 130.13s

Training Set: Average loss: 0.0708, Accuracy: 58823/60000 (98.04%), Time: 130.92s

Training Set: Average loss: 0.0721, Accuracy: 58778/60000 (97.96%), Time: 131.14s

Evaluating alexnet (Fine-tuned)...
Test set: Average loss: 0.0293, Accuracy: 99.14%, F1 Score: 0.9914



In [18]:
print("\n\n--- Comparison of Models ---")

print(f"{'Model':<25} | {'Test Loss':<12} | {'Accuracy (%)':<15} | {'F1 Score':<12} | {'Training Time (s)':<18}")
print("-" * 85)
for name, metrics in results.items():
    loss = f"{metrics.get('Loss', 'N/A'):.4f}" if isinstance(metrics.get('Loss'), (int, float)) and not np.isnan(metrics.get('Loss')) else metrics.get('Loss', 'N/A')
    acc = f"{metrics.get('Accuracy', 'N/A'):.2f}" if isinstance(metrics.get('Accuracy'), (int, float)) and not np.isnan(metrics.get('Accuracy')) else metrics.get('Accuracy', 'N/A')
    f1 = f"{metrics.get('F1 Score', 'N/A'):.4f}" if isinstance(metrics.get('F1 Score'), (int, float)) and not np.isnan(metrics.get('F1 Score')) else metrics.get('F1 Score', 'N/A')
    time_val = metrics.get('Training Time (s)', 'N/A')
    train_time = f"{time_val:.2f}" if isinstance(time_val, (int, float)) else time_val

    print(f"{name:<25} | {loss:<12} | {acc:<15} | {f1:<12} | {train_time:<18}")

print("\n--- Conclusion ---")
print("1.  **Simple CNN:** Typically performs very well on MNIST (>98-99% accuracy) with relatively low computational cost and training time. It's well-suited for this image classification task.")
print("2.  **Faster R-CNN (Adapted):** As expected, using an object detector for simple classification is inappropriate. The adaptation process (interpreting detection outputs) is a heuristic. Accuracy is likely much lower, and inference time per image is significantly higher due to the complex architecture. Training this model from scratch or even fine-tuning its classification head specifically for this task (while ignoring bounding boxes) would be computationally very expensive and unlikely to outperform the simple CNN.")
print("3.  **Fine-tuned VGG16/AlexNet:** These models, pre-trained on ImageNet, can achieve high accuracy on MNIST after fine-tuning. They benefit from learned low-level features. However, they require larger input images (224x224) and 3 channels, increasing data loading and processing time. Training time (even just the classifier layer) might be longer than the simple CNN due to the larger backbone, although fewer epochs might be needed. For a simple dataset like MNIST, the complexity and computational cost of these large models might be overkill compared to a well-designed simple CNN.")
print("\n**Overall:** For MNIST classification, a custom-designed CNN (like SimpleCNN) offers the best balance of high accuracy, efficiency, and training speed. Fine-tuning large pre-trained models can also yield good results but comes with higher computational overhead. Faster R-CNN is not suitable for this specific task.")




--- Comparison of Models ---
Model                     | Test Loss    | Accuracy (%)    | F1 Score     | Training Time (s) 
-------------------------------------------------------------------------------------
SimpleCNN                 | 0.0224       | 99.27           | 0.9927       | 143.84            
FasterRCNN_Adapted        | 0.0000       | 11.35           | 0.0231       | N/A (Inference Only)
vgg16_FineTuned           | 0.0542       | 99.00           | 0.9900       | 3346.77           
alexnet_FineTuned         | 0.0293       | 99.14           | 0.9914       | 1042.70           

--- Conclusion ---
1.  **Simple CNN:** Typically performs very well on MNIST (>98-99% accuracy) with relatively low computational cost and training time. It's well-suited for this image classification task.
2.  **Faster R-CNN (Adapted):** As expected, using an object detector for simple classification is inappropriate. The adaptation process (interpreting detection outputs) is a heuristic. Accuracy is 