## CONVNEXT test comparison vs Cspdarknet53, Convnext, Resnet18
----------------------------


## loading dataset


In [1]:
import torchvision.datasets as datasets
from torchvision.datasets import Food101
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# Use the ImageNet normalization required by all three models
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
BATCH_SIZE = 64

# Define the transforms
transform = transforms.Compose([
    transforms.Resize(224), # Resize to 224x224 
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
])

# Load and automatically download the dataset
train_dataset = datasets.CIFAR10(
    root='./data', 
    train=True, 
    download=True, 
    transform=transform
)
test_dataset = datasets.CIFAR10(
    root='./data', 
    train=False, 
    download=True, 
    transform=transform
)

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)


Files already downloaded and verified
Files already downloaded and verified


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
from tqdm import tqdm 

NUM_CLASSES = 10
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

def train_on_epoch(model, dataloader, criterion, optimizer, device):
    model.train() 
    running_loss = 0.0    
    for inputs, labels in tqdm(dataloader, desc="Training"):        
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(dataloader.dataset)
    return epoch_loss

cuda


In [3]:
!nvidia-smi

Fri Nov 14 01:52:12 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.65.06              Driver Version: 580.65.06      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A40                     On  |   00000000:17:00.0 Off |                    0 |
|  0%   63C    P0            140W /  300W |   14257MiB /  46068MiB |     87%   E. Process |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A40                     On  |   00

In [4]:
def test_accuracy(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0
    
    # Disable gradient tracking for efficiency during inference
    with torch.no_grad(): 
        for inputs, labels in tqdm(dataloader, desc="Testing"):
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(inputs)
            
            # Get the predicted class index (max score)
            _, predicted = torch.max(outputs.data, 1)
            
            # Update counts
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    return accuracy

In [5]:
def save_model_weights(model, model_name):
    """Saves the model's state dictionary to a file."""
    filename = f"{model_name.lower().replace('_', '-')}.pth"
    torch.save(model.state_dict(), filename)
    print(f"--- Weights saved to {filename} ---")

In [6]:
from torchvision.models import convnext_tiny

model = convnext_tiny(weights=None) 
LEARNING_RATE = 4e-3            
WEIGHT_DECAY = 5e-2
EPOCHS = 10

num_ftrs = model.classifier[-1].in_features
model.classifier[-1] = nn.Linear(num_ftrs, NUM_CLASSES) 
model.to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
criterion = nn.CrossEntropyLoss()
history = {'train_loss': [], 'test_acc': []}
best_acc = 0.0

for epoch in range(1, EPOCHS + 1):
    train_loss = train_on_epoch(model, train_loader, criterion, optimizer, DEVICE)
    test_acc = test_accuracy(model, test_loader, DEVICE)
    history['train_loss'].append(train_loss)
    history['test_acc'].append(test_acc)

    print(f"Epoch {epoch}/{EPOCHS} | Loss: {train_loss:.4f} | Acc: {test_acc:.2f}%")

print(f"\nTraining complete. Best Accuracy for {best_acc:.2f}%")
saved_model_name = "ConvNeXt_Tiny_CIFAR100"
save_model_weights(model, saved_model_name)

Training: 100%|██████████| 782/782 [05:24<00:00,  2.41it/s]
Testing: 100%|██████████| 157/157 [00:11<00:00, 13.67it/s]


Epoch 1/10 | Loss: 1.7999 | Acc: 42.43%


Training: 100%|██████████| 782/782 [05:21<00:00,  2.43it/s]
Testing: 100%|██████████| 157/157 [00:11<00:00, 13.71it/s]


Epoch 2/10 | Loss: 1.3968 | Acc: 54.60%


Training: 100%|██████████| 782/782 [05:20<00:00,  2.44it/s]
Testing: 100%|██████████| 157/157 [00:11<00:00, 13.69it/s]


Epoch 3/10 | Loss: 1.1570 | Acc: 62.41%


Training: 100%|██████████| 782/782 [05:21<00:00,  2.44it/s]
Testing: 100%|██████████| 157/157 [00:11<00:00, 13.71it/s]


Epoch 4/10 | Loss: 0.9815 | Acc: 69.16%


Training: 100%|██████████| 782/782 [05:20<00:00,  2.44it/s]
Testing: 100%|██████████| 157/157 [00:11<00:00, 13.70it/s]


Epoch 5/10 | Loss: 0.8522 | Acc: 72.16%


Training: 100%|██████████| 782/782 [05:21<00:00,  2.44it/s]
Testing: 100%|██████████| 157/157 [00:11<00:00, 13.62it/s]


Epoch 6/10 | Loss: 0.7482 | Acc: 73.92%


Training: 100%|██████████| 782/782 [05:21<00:00,  2.44it/s]
Testing: 100%|██████████| 157/157 [00:11<00:00, 13.63it/s]


Epoch 7/10 | Loss: 0.6673 | Acc: 74.25%


Training: 100%|██████████| 782/782 [05:21<00:00,  2.44it/s]
Testing: 100%|██████████| 157/157 [00:11<00:00, 13.68it/s]


Epoch 8/10 | Loss: 0.5965 | Acc: 79.42%


Training: 100%|██████████| 782/782 [05:20<00:00,  2.44it/s]
Testing: 100%|██████████| 157/157 [00:11<00:00, 13.65it/s]


Epoch 9/10 | Loss: 0.5326 | Acc: 81.18%


Training: 100%|██████████| 782/782 [05:21<00:00,  2.44it/s]
Testing: 100%|██████████| 157/157 [00:11<00:00, 13.66it/s]


Epoch 10/10 | Loss: 0.4821 | Acc: 80.14%

Training complete. Best Accuracy for 0.00%
--- Weights saved to convnext-tiny-cifar100.pth ---


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- 1. Basic Building Blocks ---

class ConvBlock(nn.Module):
    """
    A standard Convolution -> BatchNorm -> Activation (LeakyReLU) block.
    """
    def __init__(self, in_c, out_c, k=3, s=1, p=None):
        super(ConvBlock, self).__init__()
        # Calculate padding automatically if not specified
        if p is None:
            p = k // 2
            
        self.conv = nn.Conv2d(in_c, out_c, kernel_size=k, stride=s, padding=p, bias=False)
        self.bn = nn.BatchNorm2d(out_c)
        self.act = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class ResUnit(nn.Module):
    """
    The small Residual Unit used inside the CSP block.
    """
    def __init__(self, c, n=1):
        super(ResUnit, self).__init__()
        # The first convolution reduces channels by half
        self.conv1 = ConvBlock(c, c // 2, k=1)
        # The second convolution restores the original channels
        self.conv2 = ConvBlock(c // 2, c, k=3)
        self.add = n == 1 # Only add the residual if n=1 (standard darknet residual)

    def forward(self, x):
        return x + self.conv2(self.conv1(x))

# --- 2. The Core CSP Block ---

class CSPDarknetBlock(nn.Module):
    """
    The core Cross Stage Partial (CSP) block. 
    It splits the feature map into two paths and merges them at the end.
    """
    def __init__(self, in_c, out_c, num_res=1):
        super(CSPDarknetBlock, self).__init__()
        
        # 1. Transition Block (Downsampling)
        self.downsample = ConvBlock(in_c, out_c, k=3, s=2)
        
        # 2. Split the channels of the downsampled feature map
        c_half = out_c // 2
        
        # 3. Path 1: Partial transition (simple 1x1 convolution)
        self.conv_trans_p1 = ConvBlock(out_c, c_half, k=1)
        
        # 4. Path 2: Main residual stack
        self.conv_trans_p2 = ConvBlock(out_c, c_half, k=1)
        
        # 5. The stack of Residual Units (ResUnit)
        self.res_stack = nn.Sequential(*[
            ResUnit(c_half, n=1) for _ in range(num_res)
        ])
        
        # 6. Path 2 final 1x1 convolution before merging
        self.conv_stack_p2 = ConvBlock(c_half, c_half, k=1)
        
        # 7. Final Merge and Transition 
        # Merges the two paths (c_half + c_half = out_c)
        self.conv_merge = ConvBlock(out_c, out_c, k=1)

    def forward(self, x):
        # 1. Downsample (if needed, although usually done in the main stage)
        x = self.downsample(x)
        
        # 2. Split the downsampled feature map
        x1 = self.conv_trans_p1(x) # Path 1: Partial
        x2 = self.conv_trans_p2(x) # Path 2: Main
        
        # 3. Process Path 2 (Residual Stack)
        x2 = self.res_stack(x2)
        x2 = self.conv_stack_p2(x2)
        
        # 4. Concatenate and Merge (Cross Stage Partial)
        x_merged = torch.cat([x1, x2], dim=1)
        
        # 5. Final output
        return self.conv_merge(x_merged)


# --- 3. CSPDarknet53 Full Architecture ---

class CSPDarknet53(nn.Module):
    """
    The CSPDarknet53 backbone for image classification (simplified).
    This structure is typically used as a feature extractor.
    """
    def __init__(self, in_channels=3, num_classes=1000, stages=(1, 2, 8, 8, 4)):
        super(CSPDarknet53, self).__init__()
        
        # Initial Conv Layer (Stem) - No downsampling yet, but deepens channels
        self.stem = ConvBlock(in_channels, 32) 
        
        # Darknet uses five main stages (0, 1, 2, 3, 4)
        c = [32, 64, 128, 256, 512, 1024]
        
        # Stage 0: Simple Downsampling
        self.stage0 = ConvBlock(c[0], c[1], k=3, s=2) # 32 -> 64
        
        # Stages 1-4: CSP Blocks
        self.stage1 = self._make_csp_layer(c[1], c[2], stages[1]) # 64 -> 128 (Res: 2)
        self.stage2 = self._make_csp_layer(c[2], c[3], stages[2]) # 128 -> 256 (Res: 8)
        self.stage3 = self._make_csp_layer(c[3], c[4], stages[3]) # 256 -> 512 (Res: 8)
        self.stage4 = self._make_csp_layer(c[4], c[5], stages[4]) # 512 -> 1024 (Res: 4)
        
        # Classification Head
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(c[5], num_classes) # 1024 features to NUM_CLASSES

    def _make_csp_layer(self, in_c, out_c, num_res):
        """Helper to create a standard CSP Darknet Stage."""
        # The timm version often combines the downsampling conv with the CSP block
        return nn.Sequential(
            ConvBlock(in_c, out_c, k=3, s=2), # Downsampling
            CSPDarknetBlock(out_c, out_c, num_res)
        )

    def forward(self, x):
        x = self.stem(x)
        x = self.stage0(x)
        x = self.stage1(x)
        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)
        
        # Classification path
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

In [8]:
model = CSPDarknet53(num_classes=10)
model.to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
criterion = nn.CrossEntropyLoss()
history = {'train_loss': [], 'test_acc': []}
best_acc = 0.

for epoch in range(1, EPOCHS + 1):
    train_loss = train_on_epoch(model, train_loader, criterion, optimizer, DEVICE)
    test_acc = test_accuracy(model, test_loader, DEVICE)
    history['train_loss'].append(train_loss)
    history['test_acc'].append(test_acc)

    print(f"Epoch {epoch}/{EPOCHS} | Loss: {train_loss:.4f} | Acc: {test_acc:.2f}%")

print(f"\nTraining complete. Best Accuracy for {best_acc:.2f}%")
saved_model_name = "CSPDarkNet53_CIFAR100"
save_model_weights(model, saved_model_name)


Training: 100%|██████████| 782/782 [00:55<00:00, 14.00it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 41.33it/s]


Epoch 1/10 | Loss: 2.1105 | Acc: 34.21%


Training: 100%|██████████| 782/782 [00:55<00:00, 13.99it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 41.57it/s]


Epoch 2/10 | Loss: 1.6559 | Acc: 44.54%


Training: 100%|██████████| 782/782 [00:55<00:00, 13.98it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 40.84it/s]


Epoch 3/10 | Loss: 1.4117 | Acc: 51.98%


Training: 100%|██████████| 782/782 [00:56<00:00, 13.89it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 41.32it/s]


Epoch 4/10 | Loss: 1.2129 | Acc: 58.78%


Training: 100%|██████████| 782/782 [00:55<00:00, 13.97it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 41.22it/s]


Epoch 5/10 | Loss: 1.0406 | Acc: 63.29%


Training: 100%|██████████| 782/782 [00:56<00:00, 13.87it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 40.11it/s]


Epoch 6/10 | Loss: 0.9156 | Acc: 69.41%


Training: 100%|██████████| 782/782 [00:55<00:00, 13.98it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 41.24it/s]


Epoch 7/10 | Loss: 0.8179 | Acc: 70.50%


Training: 100%|██████████| 782/782 [00:55<00:00, 14.01it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 41.34it/s]


Epoch 8/10 | Loss: 0.7463 | Acc: 72.22%


Training: 100%|██████████| 782/782 [00:55<00:00, 13.98it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 41.35it/s]


Epoch 9/10 | Loss: 0.6791 | Acc: 72.58%


Training: 100%|██████████| 782/782 [00:55<00:00, 14.01it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 41.09it/s]


Epoch 10/10 | Loss: 0.6346 | Acc: 72.15%

Training complete. Best Accuracy for 0.00%
--- Weights saved to cspdarknet53-cifar100.pth ---


In [9]:
from torchvision.models import resnet18
model = resnet18(weights=None)

num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, NUM_CLASSES)
model.to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
criterion = nn.CrossEntropyLoss()
history = {'train_loss': [], 'test_acc': []}
best_acc = 0.0

for epoch in range(1, EPOCHS + 1):
    train_loss = train_on_epoch(model, train_loader, criterion, optimizer, DEVICE)
    test_acc = test_accuracy(model, test_loader, DEVICE)
    history['train_loss'].append(train_loss)
    history['test_acc'].append(test_acc)

    print(f"Epoch {epoch}/{EPOCHS} | Loss: {train_loss:.4f} | Acc: {test_acc:.2f}%")

print(f"\nTraining complete. Best Accuracy for {best_acc:.2f}%")
saved_model_name = "ResNet18_CIFAR100"
save_model_weights(model, saved_model_name)

Training: 100%|██████████| 782/782 [00:42<00:00, 18.48it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 44.62it/s]


Epoch 1/10 | Loss: 1.6378 | Acc: 52.10%


Training: 100%|██████████| 782/782 [00:42<00:00, 18.51it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 44.08it/s]


Epoch 2/10 | Loss: 0.9712 | Acc: 68.85%


Training: 100%|██████████| 782/782 [00:42<00:00, 18.56it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 44.40it/s]


Epoch 3/10 | Loss: 0.6843 | Acc: 74.98%


Training: 100%|██████████| 782/782 [00:42<00:00, 18.57it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 43.77it/s]


Epoch 4/10 | Loss: 0.5356 | Acc: 78.46%


Training: 100%|██████████| 782/782 [00:42<00:00, 18.57it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 44.46it/s]


Epoch 5/10 | Loss: 0.4324 | Acc: 80.83%


Training: 100%|██████████| 782/782 [00:42<00:00, 18.58it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 43.94it/s]


Epoch 6/10 | Loss: 0.3471 | Acc: 80.89%


Training: 100%|██████████| 782/782 [00:42<00:00, 18.55it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 44.07it/s]


Epoch 7/10 | Loss: 0.2863 | Acc: 80.92%


Training: 100%|██████████| 782/782 [00:42<00:00, 18.55it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 44.77it/s]


Epoch 8/10 | Loss: 0.2323 | Acc: 81.98%


Training: 100%|██████████| 782/782 [00:42<00:00, 18.57it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 44.32it/s]


Epoch 9/10 | Loss: 0.2012 | Acc: 81.02%


Training: 100%|██████████| 782/782 [00:42<00:00, 18.57it/s]
Testing: 100%|██████████| 157/157 [00:03<00:00, 44.46it/s]

Epoch 10/10 | Loss: 0.1855 | Acc: 80.73%

Training complete. Best Accuracy for 0.00%
--- Weights saved to resnet18-cifar100.pth ---





In [None]:
import numpy as np
import matplotlib.pyplot as plt

def draw_comparison_graphs(all_results):

    if not all_results:
        print("Error: The 'all_results' dictionary is empty. Please ensure all models finished training.")
        return

    names = list(all_results.keys())
    accuracies = [data['best_accuracy'] for data in all_results.values()]
    parameters = [data['parameters_m'] for data in all_results.values()]

    
    x = np.arange(len(names))
    width = 0.35

    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Accuracy Bar Chart (Primary Axis)
    rects1 = ax1.bar(x - width/2, accuracies, width, label='Best Test Accuracy (%)', color='mediumblue')
    ax1.set_ylabel('Accuracy (%)', color='mediumblue')
    ax1.set_ylim(min(accuracies)*0.9, max(accuracies)*1.05) 
    ax1.tick_params(axis='y', labelcolor='mediumblue')

    # Parameters Bar Chart (Secondary Axis)
    ax2 = ax1.twinx()
    rects2 = ax2.bar(x + width/2, parameters, width, label='Parameters (M)', color='darkorange')
    ax2.set_ylabel('Parameters (M)', color='darkorange')
    ax2.tick_params(axis='y', labelcolor='darkorange')
    
    # Labeling and Titles
    ax1.set_xticks(x)
    ax1.set_xticklabels(names)
    ax1.set_title(f'Model Comparison: Accuracy vs. Parameters')
    
    # Combine Legends
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines + lines2, labels + labels2, loc='upper left')

    plt.grid(axis='y', linestyle='--')
    plt.show()
    
    plt.figure(figsize=(10, 6))

    for name in names:
        history = all_results[name]['history']
        plt.plot(history['test_acc'], label=f'{name} (Best: {all_results[name]["best_accuracy"]:.2f}%)')

    plt.title('Test Accuracy Progression Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Test Accuracy (%)')
    plt.legend()
    plt.grid(True)
    plt.show()


