<a href="https://colab.research.google.com/github/sesmael/Real-Time-ML-/blob/main/Homework6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torchinfo

import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchinfo import summary
import time
import pandas as pd

# --- Vision Transformer components ---
class PatchEmbedding(nn.Module):
    """Turn an image batch into a sequence of patch embeddings.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects every non-overlapping patch to an ``embed_dim`` vector.
    """

    def __init__(self, img_size, patch_size, in_channels, embed_dim):
        super().__init__()
        patches_per_side = img_size // patch_size
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )
        # Patch count for a square img_size x img_size input.
        self.n_patches = patches_per_side * patches_per_side

    def forward(self, x):
        """Return the flattened patch grid with the embedding axis last."""
        feature_map = self.proj(x)  # [B, embed_dim, H', W']
        tokens = torch.flatten(feature_map, start_dim=2)  # [B, embed_dim, H'*W']
        return tokens.permute(0, 2, 1)  # [B, H'*W', embed_dim]

class TransformerEncoder(nn.Module):
    """Pre-norm Transformer encoder block: self-attention followed by an MLP.

    Each sub-layer is preceded by its own LayerNorm and wrapped in a residual
    connection. Inputs are batch-first (``batch_first=True``).

    Args:
        embed_dim: Width of the token embeddings.
        heads: Number of attention heads passed to ``nn.MultiheadAttention``.
        hidden_dim: Width of the hidden layer inside the MLP.
        dropout: Dropout probability used in attention and in the MLP.
    """

    def __init__(self, embed_dim, heads, hidden_dim, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, heads, dropout=dropout, batch_first=True)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        # Normalise once and reuse the result as query, key and value. Calling
        # ln1 three times tripled the LayerNorm work and produced three
        # distinct tensors, so PyTorch could not recognise the call as
        # self-attention (it checks that query, key and value are the same
        # object).
        normed = self.ln1(x)
        # The averaged attention weights are never used; skip computing them.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.ln2(x))
        return x

class ViT(nn.Module):
    """Vision Transformer classifier.

    Splits the image into patches, prepends a learnable class token, adds
    learned position embeddings, runs ``depth`` encoder blocks and classifies
    from the normalised class-token output.

    Args:
        img_size: Side length of the (square) input image.
        patch_size: Side length of each square patch.
        in_channels: Number of image channels.
        n_classes: Number of output classes.
        embed_dim: Token embedding width.
        depth: Number of encoder blocks.
        heads: Attention heads per block.
        mlp_ratio: MLP hidden width as a multiple of ``embed_dim``.
    """

    def __init__(self, img_size, patch_size, in_channels, n_classes, embed_dim, depth, heads, mlp_ratio):
        super().__init__()
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        # Reuse the patch count computed by the embedding layer; +1 for the class token.
        n_tokens = self.patch_embed.n_patches + 1

        # Plain randn (std 1) swamps the freshly initialised patch embeddings
        # and makes training unstable; reference ViT implementations use a
        # small std (0.02) for these parameters.
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim) * 0.02)
        self.pos_embed = nn.Parameter(torch.randn(1, n_tokens, embed_dim) * 0.02)

        # nn.Linear needs an integer width, so fractional ratios are truncated.
        hidden_dim = int(embed_dim * mlp_ratio)
        self.encoder = nn.Sequential(*[
            TransformerEncoder(embed_dim, heads, hidden_dim) for _ in range(depth)
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, n_classes)

    def forward(self, x):
        """Return class logits of shape ``[batch, n_classes]`` for image batch ``x``."""
        batch_size = x.size(0)
        tokens = self.patch_embed(x)
        # Broadcast the single class token across the batch without copying it.
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        tokens = torch.cat((cls_tokens, tokens), dim=1)
        tokens = tokens + self.pos_embed
        tokens = self.encoder(tokens)
        # Only the class token (index 0) feeds the classifier head.
        return self.head(self.norm(tokens[:, 0]))

# --- CIFAR-100 Loader ---
# Convert images to tensors, then normalise with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# CIFAR-100 train and test splits stored under ./data (download=True).
train_dataset, test_dataset = (
    datasets.CIFAR100(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Only the training split is shuffled.
train_loader, test_loader = (
    DataLoader(split, batch_size=64, shuffle=is_train, num_workers=2)
    for split, is_train in ((train_dataset, True), (test_dataset, False))
)

# --- Training & Analysis ---
def train_model(config, epochs=20, lr=0.001):
    """Train a ViT on CIFAR-100 and report accuracy, time and model size.

    Relies on the module-level ``device``, ``train_loader`` and
    ``test_loader``.

    Args:
        config: Dict with the keys ``'patch'``, ``'embed'``, ``'depth'``,
            ``'heads'`` and ``'mlp_ratio'``.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate; defaults to the previously hard-coded 0.001.

    Returns:
        Tuple ``(accuracy, train_time_seconds, total_params, total_mult_adds)``.
        The last two values are taken from ``torchinfo.summary``; note that
        ``total_mult_adds`` is torchinfo's multiply-add count.
    """
    print(f"\n⚙️ Training config: {config}")
    model = ViT(
        img_size=32,
        patch_size=config['patch'],
        in_channels=3,
        n_classes=100,
        embed_dim=config['embed'],
        depth=config['depth'],
        heads=config['heads'],
        mlp_ratio=config['mlp_ratio']
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Guard the per-epoch average against an empty loader.
    n_batches = max(len(train_loader), 1)

    model.train()
    start_time = time.time()
    for epoch in range(epochs):
        total_loss = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss / n_batches:.4f}")
    train_time = time.time() - start_time

    # Accuracy on the test split (dropout disabled, no gradients tracked).
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)
            preds = model(x).argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)
    acc = correct / total if total else 0.0

    # Summary (FLOPs, params) for a single 3x32x32 input.
    info = summary(
        model,
        input_size=(1, 3, 32, 32),
        verbose=0,
        col_names=["input_size", "output_size", "num_params", "mult_adds"]
    )

    return acc, train_time, info.total_params, info.total_mult_adds

# --- Run First 4 Configs ---
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Configs 1-4 in order: (depth, heads) = (4, 2), (4, 4), (8, 2), (8, 4).
configs = [
    {'patch': 4, 'embed': 256, 'depth': depth, 'heads': heads, 'mlp_ratio': 2}
    for depth in (4, 8)
    for heads in (2, 4)
]

# Train each config and keep one table row per run.
results = []
for cfg in configs:
    acc, t, params, flops = train_model(cfg, epochs=20)
    results.append(
        dict(cfg, accuracy=acc, time_sec=round(t, 2), params=params, flops=flops)
    )

# Show result table
from IPython.display import display
df = pd.DataFrame(results)
display(df)



⚙️ Training config: {'patch': 4, 'embed': 256, 'depth': 4, 'heads': 2, 'mlp_ratio': 2}
Epoch 1/20 | Loss: 4.0860
Epoch 2/20 | Loss: 3.8475
Epoch 3/20 | Loss: 3.8191
Epoch 4/20 | Loss: 3.9076
Epoch 5/20 | Loss: 4.0502
Epoch 6/20 | Loss: 3.9547
Epoch 7/20 | Loss: 4.0595
Epoch 8/20 | Loss: 3.9915
Epoch 9/20 | Loss: 3.9898
Epoch 10/20 | Loss: 3.9527
Epoch 11/20 | Loss: 3.9029
Epoch 12/20 | Loss: 3.8643
Epoch 13/20 | Loss: 3.8635
Epoch 14/20 | Loss: 3.8936
Epoch 15/20 | Loss: 3.8882
Epoch 16/20 | Loss: 3.9861
Epoch 17/20 | Loss: 3.9817
Epoch 18/20 | Loss: 3.8814
Epoch 19/20 | Loss: 3.8594
Epoch 20/20 | Loss: 3.8640

⚙️ Training config: {'patch': 4, 'embed': 256, 'depth': 4, 'heads': 4, 'mlp_ratio': 2}
Epoch 1/20 | Loss: 3.9396
Epoch 2/20 | Loss: 3.6627
Epoch 3/20 | Loss: 3.5821
Epoch 4/20 | Loss: 3.5726
Epoch 5/20 | Loss: 3.5576
Epoch 6/20 | Loss: 3.4981
Epoch 7/20 | Loss: 3.4872
Epoch 8/20 | Loss: 3.4538
Epoch 9/20 | Loss: 3.6119
Epoch 10/20 | Loss: 3.6527
Epoch 11/20 | Loss: 3.6115
Epoch

Unnamed: 0,patch,embed,depth,heads,mlp_ratio,accuracy,time_sec,params,flops
0,4,256,4,2,2,0.0917,242.99,2164068,1888868
1,4,256,4,4,2,0.1376,243.95,2164068,1888868
2,4,256,8,2,2,0.048,423.84,4272484,2948708
3,4,256,8,4,2,0.0569,421.22,4272484,2948708


In [None]:
!pip install torchinfo

import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchinfo import summary
import time
import pandas as pd

# --- Vision Transformer components ---
class PatchEmbedding(nn.Module):
    """Turn an image batch into a sequence of patch embeddings.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects every non-overlapping patch to an ``embed_dim`` vector.
    """

    def __init__(self, img_size, patch_size, in_channels, embed_dim):
        super().__init__()
        patches_per_side = img_size // patch_size
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )
        # Patch count for a square img_size x img_size input.
        self.n_patches = patches_per_side * patches_per_side

    def forward(self, x):
        """Return the flattened patch grid with the embedding axis last."""
        feature_map = self.proj(x)  # [B, embed_dim, H', W']
        tokens = torch.flatten(feature_map, start_dim=2)  # [B, embed_dim, H'*W']
        return tokens.permute(0, 2, 1)  # [B, H'*W', embed_dim]

class TransformerEncoder(nn.Module):
    """Pre-norm Transformer encoder block: self-attention followed by an MLP.

    Each sub-layer is preceded by its own LayerNorm and wrapped in a residual
    connection. Inputs are batch-first (``batch_first=True``).

    Args:
        embed_dim: Width of the token embeddings.
        heads: Number of attention heads passed to ``nn.MultiheadAttention``.
        hidden_dim: Width of the hidden layer inside the MLP.
        dropout: Dropout probability used in attention and in the MLP.
    """

    def __init__(self, embed_dim, heads, hidden_dim, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, heads, dropout=dropout, batch_first=True)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        # Normalise once and reuse the result as query, key and value. Calling
        # ln1 three times tripled the LayerNorm work and produced three
        # distinct tensors, so PyTorch could not recognise the call as
        # self-attention (it checks that query, key and value are the same
        # object).
        normed = self.ln1(x)
        # The averaged attention weights are never used; skip computing them.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.ln2(x))
        return x

class ViT(nn.Module):
    """Vision Transformer classifier.

    Splits the image into patches, prepends a learnable class token, adds
    learned position embeddings, runs ``depth`` encoder blocks and classifies
    from the normalised class-token output.

    Args:
        img_size: Side length of the (square) input image.
        patch_size: Side length of each square patch.
        in_channels: Number of image channels.
        n_classes: Number of output classes.
        embed_dim: Token embedding width.
        depth: Number of encoder blocks.
        heads: Attention heads per block.
        mlp_ratio: MLP hidden width as a multiple of ``embed_dim``.
    """

    def __init__(self, img_size, patch_size, in_channels, n_classes, embed_dim, depth, heads, mlp_ratio):
        super().__init__()
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        # Reuse the patch count computed by the embedding layer; +1 for the class token.
        n_tokens = self.patch_embed.n_patches + 1

        # Plain randn (std 1) swamps the freshly initialised patch embeddings
        # and makes training unstable; reference ViT implementations use a
        # small std (0.02) for these parameters.
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim) * 0.02)
        self.pos_embed = nn.Parameter(torch.randn(1, n_tokens, embed_dim) * 0.02)

        # nn.Linear needs an integer width, so fractional ratios are truncated.
        hidden_dim = int(embed_dim * mlp_ratio)
        self.encoder = nn.Sequential(*[
            TransformerEncoder(embed_dim, heads, hidden_dim) for _ in range(depth)
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, n_classes)

    def forward(self, x):
        """Return class logits of shape ``[batch, n_classes]`` for image batch ``x``."""
        batch_size = x.size(0)
        tokens = self.patch_embed(x)
        # Broadcast the single class token across the batch without copying it.
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        tokens = torch.cat((cls_tokens, tokens), dim=1)
        tokens = tokens + self.pos_embed
        tokens = self.encoder(tokens)
        # Only the class token (index 0) feeds the classifier head.
        return self.head(self.norm(tokens[:, 0]))

# --- CIFAR-100 Loader ---
# Convert images to tensors, then normalise with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# CIFAR-100 train and test splits stored under ./data (download=True).
train_dataset, test_dataset = (
    datasets.CIFAR100(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Only the training split is shuffled.
train_loader, test_loader = (
    DataLoader(split, batch_size=64, shuffle=is_train, num_workers=2)
    for split, is_train in ((train_dataset, True), (test_dataset, False))
)

# --- Training & Analysis ---
def train_model(config, epochs=20, lr=0.001):
    """Train a ViT on CIFAR-100 and report accuracy, time and model size.

    Relies on the module-level ``device``, ``train_loader`` and
    ``test_loader``.

    Args:
        config: Dict with the keys ``'patch'``, ``'embed'``, ``'depth'``,
            ``'heads'`` and ``'mlp_ratio'``.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate; defaults to the previously hard-coded 0.001.

    Returns:
        Tuple ``(accuracy, train_time_seconds, total_params, total_mult_adds)``.
        The last two values are taken from ``torchinfo.summary``; note that
        ``total_mult_adds`` is torchinfo's multiply-add count.
    """
    print(f"\n⚙️ Training config: {config}")
    model = ViT(
        img_size=32,
        patch_size=config['patch'],
        in_channels=3,
        n_classes=100,
        embed_dim=config['embed'],
        depth=config['depth'],
        heads=config['heads'],
        mlp_ratio=config['mlp_ratio']
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Guard the per-epoch average against an empty loader.
    n_batches = max(len(train_loader), 1)

    model.train()
    start_time = time.time()
    for epoch in range(epochs):
        total_loss = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss / n_batches:.4f}")
    train_time = time.time() - start_time

    # Accuracy on the test split (dropout disabled, no gradients tracked).
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)
            preds = model(x).argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)
    acc = correct / total if total else 0.0

    # Summary (FLOPs, params) for a single 3x32x32 input.
    info = summary(
        model,
        input_size=(1, 3, 32, 32),
        verbose=0,
        col_names=["input_size", "output_size", "num_params", "mult_adds"]
    )

    return acc, train_time, info.total_params, info.total_mult_adds

# --- Run Configs 5 to 11 ---
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

configs = [
    {'patch': 4, 'embed': 512, 'depth': 4, 'heads': 2, 'mlp_ratio': 2},  # Config 5
    {'patch': 4, 'embed': 512, 'depth': 4, 'heads': 4, 'mlp_ratio': 2},  # Config 6
    {'patch': 4, 'embed': 512, 'depth': 8, 'heads': 2, 'mlp_ratio': 2},  # Config 7
    {'patch': 4, 'embed': 512, 'depth': 8, 'heads': 4, 'mlp_ratio': 2},  # Config 8
    {'patch': 8, 'embed': 256, 'depth': 4, 'heads': 2, 'mlp_ratio': 2},  # Config 9
    {'patch': 8, 'embed': 256, 'depth': 4, 'heads': 4, 'mlp_ratio': 2},  # Config 10
    {'patch': 8, 'embed': 256, 'depth': 8, 'heads': 2, 'mlp_ratio': 2},  # Config 11
]

results = []
for i, cfg in enumerate(configs, start=5):
    acc, t, params, flops = train_model(cfg, epochs=20)

    # Convert raw counts and seconds to report-friendly units.
    params_M = params / 1e6
    flops_G = flops / 1e9
    time_min = t / 60

    results.append({
        'Config': i,
        **cfg,
        'accuracy': round(acc, 4),
        'params_M': round(params_M, 2),
        # These models cost only a few million mult-adds, so two decimals of
        # a GFLOP collapsed every row to 0.00 or 0.01; keep four decimals.
        'flops_GFLOPs': round(flops_G, 4),
        'time_min': round(time_min, 2)
    })

    print(f"\n Finished Config {i}")
    print(f"Patch: {cfg['patch']} | Embed: {cfg['embed']} | Layers: {cfg['depth']} | Heads: {cfg['heads']}")
    print(f"Accuracy: {round(acc * 100, 2)}% | Params: {params_M:.2f}M | FLOPs: {flops_G:.4f} GFLOPs | Time: {time_min:.2f} min")

#  Final summary, with the columns in a fixed order.
df = pd.DataFrame(results)
df = df[['Config', 'patch', 'embed', 'depth', 'heads', 'mlp_ratio', 'accuracy', 'params_M', 'flops_GFLOPs', 'time_min']]

from IPython.display import display
print("\nFinal Summary of ViT Configs 5 to 11:")
display(df)




100%|██████████| 169M/169M [00:03<00:00, 42.3MB/s]



⚙️ Training config: {'patch': 4, 'embed': 512, 'depth': 4, 'heads': 2, 'mlp_ratio': 2}
Epoch 1/20 | Loss: 4.3110
Epoch 2/20 | Loss: 4.2456
Epoch 3/20 | Loss: 4.2863
Epoch 4/20 | Loss: 4.3006
Epoch 5/20 | Loss: 4.2814
Epoch 6/20 | Loss: 4.2551
Epoch 7/20 | Loss: 4.2587
Epoch 8/20 | Loss: 4.2548
Epoch 9/20 | Loss: 4.2138
Epoch 10/20 | Loss: 4.2085
Epoch 11/20 | Loss: 4.2232
Epoch 12/20 | Loss: 4.3000
Epoch 13/20 | Loss: 4.2977
Epoch 14/20 | Loss: 4.2858
Epoch 15/20 | Loss: 4.2608
Epoch 16/20 | Loss: 4.2537
Epoch 17/20 | Loss: 4.2863
Epoch 18/20 | Loss: 4.2509
Epoch 19/20 | Loss: 4.2406
Epoch 20/20 | Loss: 4.2710

 Finished Config 5
Patch: 4 | Embed: 512 | Layers: 4 | Heads: 2
Accuracy: 4.9% | Params: 8.52M | FLOPs: 0.01 GFLOPs | Time: 5.66 min

⚙️ Training config: {'patch': 4, 'embed': 512, 'depth': 4, 'heads': 4, 'mlp_ratio': 2}
Epoch 1/20 | Loss: 4.2052
Epoch 2/20 | Loss: 4.0700
Epoch 3/20 | Loss: 4.0843
Epoch 4/20 | Loss: 4.0602
Epoch 5/20 | Loss: 4.0309
Epoch 6/20 | Loss: 4.0996
Epo

Unnamed: 0,Config,patch,embed,depth,heads,mlp_ratio,accuracy,params_M,flops_GFLOPs,time_min
0,5,4,512,4,2,2,0.049,8.52,0.01,5.66
1,6,4,512,4,4,2,0.0661,8.52,0.01,5.61
2,7,4,512,8,2,2,0.0261,16.93,0.01,10.68
3,8,4,512,8,4,2,0.0245,16.93,0.01,10.67
4,9,8,256,4,2,2,0.0846,2.19,0.0,3.95
5,10,8,256,4,4,2,0.153,2.19,0.0,3.99
6,11,8,256,8,2,2,0.042,4.3,0.0,7.06


In [None]:
# Install torchinfo for model summary
!pip install torchinfo
# --- Imports ---
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchinfo import summary
import time
import pandas as pd

# --- Vision Transformer components ---
class PatchEmbedding(nn.Module):
    """Turn an image batch into a sequence of patch embeddings.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects every non-overlapping patch to an ``embed_dim`` vector.
    """

    def __init__(self, img_size, patch_size, in_channels, embed_dim):
        super().__init__()
        patches_per_side = img_size // patch_size
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )
        # Patch count for a square img_size x img_size input.
        self.n_patches = patches_per_side * patches_per_side

    def forward(self, x):
        """Return the flattened patch grid with the embedding axis last."""
        feature_map = self.proj(x)  # [B, embed_dim, H', W']
        tokens = torch.flatten(feature_map, start_dim=2)  # [B, embed_dim, H'*W']
        return tokens.permute(0, 2, 1)  # [B, H'*W', embed_dim]

class TransformerEncoder(nn.Module):
    """Pre-norm Transformer encoder block: self-attention followed by an MLP.

    Each sub-layer is preceded by its own LayerNorm and wrapped in a residual
    connection. Inputs are batch-first (``batch_first=True``).

    Args:
        embed_dim: Width of the token embeddings.
        heads: Number of attention heads passed to ``nn.MultiheadAttention``.
        hidden_dim: Width of the hidden layer inside the MLP.
        dropout: Dropout probability used in attention and in the MLP.
    """

    def __init__(self, embed_dim, heads, hidden_dim, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, heads, dropout=dropout, batch_first=True)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        # Normalise once and reuse the result as query, key and value. Calling
        # ln1 three times tripled the LayerNorm work and produced three
        # distinct tensors, so PyTorch could not recognise the call as
        # self-attention (it checks that query, key and value are the same
        # object).
        normed = self.ln1(x)
        # The averaged attention weights are never used; skip computing them.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.ln2(x))
        return x

class ViT(nn.Module):
    """Vision Transformer classifier.

    Splits the image into patches, prepends a learnable class token, adds
    learned position embeddings, runs ``depth`` encoder blocks and classifies
    from the normalised class-token output.

    Args:
        img_size: Side length of the (square) input image.
        patch_size: Side length of each square patch.
        in_channels: Number of image channels.
        n_classes: Number of output classes.
        embed_dim: Token embedding width.
        depth: Number of encoder blocks.
        heads: Attention heads per block.
        mlp_ratio: MLP hidden width as a multiple of ``embed_dim``.
    """

    def __init__(self, img_size, patch_size, in_channels, n_classes, embed_dim, depth, heads, mlp_ratio):
        super().__init__()
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        # Reuse the patch count computed by the embedding layer; +1 for the class token.
        n_tokens = self.patch_embed.n_patches + 1

        # Plain randn (std 1) swamps the freshly initialised patch embeddings
        # and makes training unstable; reference ViT implementations use a
        # small std (0.02) for these parameters.
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim) * 0.02)
        self.pos_embed = nn.Parameter(torch.randn(1, n_tokens, embed_dim) * 0.02)

        # nn.Linear needs an integer width, so fractional ratios are truncated.
        hidden_dim = int(embed_dim * mlp_ratio)
        self.encoder = nn.Sequential(*[
            TransformerEncoder(embed_dim, heads, hidden_dim) for _ in range(depth)
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, n_classes)

    def forward(self, x):
        """Return class logits of shape ``[batch, n_classes]`` for image batch ``x``."""
        batch_size = x.size(0)
        tokens = self.patch_embed(x)
        # Broadcast the single class token across the batch without copying it.
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        tokens = torch.cat((cls_tokens, tokens), dim=1)
        tokens = tokens + self.pos_embed
        tokens = self.encoder(tokens)
        # Only the class token (index 0) feeds the classifier head.
        return self.head(self.norm(tokens[:, 0]))

# --- CIFAR-100 Loader ---
# Convert images to tensors, then normalise with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# CIFAR-100 train and test splits stored under ./data (download=True).
train_dataset, test_dataset = (
    datasets.CIFAR100(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Only the training split is shuffled.
train_loader, test_loader = (
    DataLoader(split, batch_size=64, shuffle=is_train, num_workers=2)
    for split, is_train in ((train_dataset, True), (test_dataset, False))
)

# --- Training & Analysis ---
def train_model(config, epochs=20, lr=0.001):
    """Train a ViT on CIFAR-100 and report accuracy, time and model size.

    Relies on the module-level ``device``, ``train_loader`` and
    ``test_loader``.

    Args:
        config: Dict with the keys ``'patch'``, ``'embed'``, ``'depth'``,
            ``'heads'`` and ``'mlp_ratio'``.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate; defaults to the previously hard-coded 0.001.

    Returns:
        Tuple ``(accuracy, train_time_seconds, total_params, total_mult_adds)``.
        The last two values are taken from ``torchinfo.summary``; note that
        ``total_mult_adds`` is torchinfo's multiply-add count.
    """
    print(f"\nTraining config: {config}")
    model = ViT(
        img_size=32,
        patch_size=config['patch'],
        in_channels=3,
        n_classes=100,
        embed_dim=config['embed'],
        depth=config['depth'],
        heads=config['heads'],
        mlp_ratio=config['mlp_ratio']
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Guard the per-epoch average against an empty loader.
    n_batches = max(len(train_loader), 1)

    model.train()
    start_time = time.time()
    for epoch in range(epochs):
        total_loss = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss / n_batches:.4f}")
    train_time = time.time() - start_time

    # Accuracy on the test split (dropout disabled, no gradients tracked).
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)
            preds = model(x).argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)
    acc = correct / total if total else 0.0

    # Summary (FLOPs, params) for a single 3x32x32 input.
    info = summary(
        model,
        input_size=(1, 3, 32, 32),
        verbose=0,
        col_names=["input_size", "output_size", "num_params", "mult_adds"]
    )

    return acc, train_time, info.total_params, info.total_mult_adds

# --- Run Configs 12 to 16 ---
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Config 12 (embed 256, depth 8, heads 4) followed by configs 13-16:
# embed 512 with (depth, heads) = (4, 2), (4, 4), (8, 2), (8, 4).
configs = [{'patch': 8, 'embed': 256, 'depth': 8, 'heads': 4, 'mlp_ratio': 2}] + [
    {'patch': 8, 'embed': 512, 'depth': depth, 'heads': heads, 'mlp_ratio': 2}
    for depth in (4, 8)
    for heads in (2, 4)
]




In [None]:
# Train configs 12-16 (taken from the module-level `configs`) and collect one
# table row per run.
results_12_16 = []
for i, cfg in enumerate(configs, start=12):
    acc, t, params, flops = train_model(cfg, epochs=20)

    # Convert raw counts and seconds to report-friendly units.
    params_M = params / 1e6
    time_min = t / 60

    results_12_16.append({
        'Config': i,
        **cfg,
        'accuracy': round(acc, 4),
        'params_M': round(params_M, 2),
        'FLOPs': round(flops, 2),
        'time_min': round(time_min, 2)
    })

    print(f"\n Finished Config {i}")
    print(f"Patch: {cfg['patch']} | Embed: {cfg['embed']} | Layers: {cfg['depth']} | Heads: {cfg['heads']}")
    print(f"Accuracy: {round(acc * 100, 2)}% | Params: {params_M:.2f}M | FLOPs: {flops:.2f} FLOPs | Time: {time_min:.2f} min")

# Final summary, with the columns in a fixed order.
df_12_16 = pd.DataFrame(results_12_16)
df_12_16 = df_12_16[['Config', 'patch', 'embed', 'depth', 'heads', 'mlp_ratio', 'accuracy', 'params_M', 'FLOPs', 'time_min']]

from IPython.display import display
# "\F" is not a valid escape sequence (it printed a literal backslash and
# triggers a warning on newer Pythons); a leading newline was intended.
print("\nFinal Summary of ViT Configs 12 to 16:")
display(df_12_16)


Training config: {'patch': 8, 'embed': 256, 'depth': 8, 'heads': 4, 'mlp_ratio': 2}
Epoch 1/20 | Loss: 3.9626
Epoch 2/20 | Loss: 3.6698
Epoch 3/20 | Loss: 3.6474
Epoch 4/20 | Loss: 3.5957
Epoch 5/20 | Loss: 3.5750
Epoch 6/20 | Loss: 3.6107
Epoch 7/20 | Loss: 3.7060
Epoch 8/20 | Loss: 3.7035
Epoch 9/20 | Loss: 3.7332
Epoch 10/20 | Loss: 3.7207
Epoch 11/20 | Loss: 3.7124
Epoch 12/20 | Loss: 3.7479
Epoch 13/20 | Loss: 3.8437
Epoch 14/20 | Loss: 3.9424
Epoch 15/20 | Loss: 3.9304
Epoch 16/20 | Loss: 3.9707
Epoch 17/20 | Loss: 4.0459
Epoch 18/20 | Loss: 3.9946
Epoch 19/20 | Loss: 3.9809
Epoch 20/20 | Loss: 3.9550

 Finished Config 12
Patch: 8 | Embed: 256 | Layers: 8 | Heads: 4
Accuracy: 9.84% | Params: 4.30M | FLOPs: 2936420.00 FLOPs | Time: 7.24 min

Training config: {'patch': 8, 'embed': 512, 'depth': 4, 'heads': 2, 'mlp_ratio': 2}
Epoch 1/20 | Loss: 4.3445
Epoch 2/20 | Loss: 4.2746
Epoch 3/20 | Loss: 4.2956
Epoch 4/20 | Loss: 4.3212
Epoch 5/20 | Loss: 4.3218
Epoch 6/20 | Loss: 4.2877
Ep

Unnamed: 0,Config,patch,embed,depth,heads,mlp_ratio,accuracy,params_M,FLOPs,time_min
0,12,8,256,8,4,2,0.0984,4.3,2936420,7.24
1,13,8,512,4,2,2,0.0586,8.57,5850212,4.15
2,14,8,512,4,4,2,0.0586,8.57,5850212,4.22
3,15,8,512,8,2,2,0.0332,16.98,10067044,7.36
4,16,8,512,8,4,2,0.0414,16.98,10067044,7.34


In [None]:
# Install required packages
!pip install torchinfo torchvision --quiet

import torch
import torch.nn as nn
import time
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from torchinfo import summary

# ----------------------------
# Configuration
# ----------------------------
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyper-parameters shared by the loaders and the training function.
batch_size, learning_rate, num_epochs = 64, 0.001, 10

# ----------------------------
# Data Loaders
# ----------------------------
# Convert images to tensors, then normalise with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# CIFAR-100 train and test splits stored under ./data (download=True).
train_dataset, test_dataset = (
    datasets.CIFAR100(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Only the training split is shuffled.
train_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=is_train, num_workers=2)
    for split, is_train in ((train_dataset, True), (test_dataset, False))
)

# ----------------------------
# Training & Evaluation Function
# ----------------------------
def train_resnet18(epochs=10):
    """Train a randomly initialised ResNet-18 on CIFAR-100 and evaluate it.

    Relies on the module-level ``device``, ``learning_rate``,
    ``train_loader`` and ``test_loader``.

    Args:
        epochs: Number of passes over ``train_loader``.

    Returns:
        Tuple ``(accuracy, seconds_per_epoch, total_params, total_mult_adds)``;
        the last two values are taken from ``torchinfo.summary``.
    """
    print("Training ResNet-18 on CIFAR-100")

    # `pretrained=` is deprecated in torchvision; `weights=None` is the
    # current way to request random initialisation.
    model = models.resnet18(weights=None)
    model.fc = nn.Linear(model.fc.in_features, 100)  # CIFAR-100 has 100 classes
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Guard the per-epoch average against an empty loader.
    n_batches = max(len(train_loader), 1)

    # Training loop
    model.train()
    start_time = time.time()
    for epoch in range(1, epochs + 1):
        running_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        avg_loss = running_loss / n_batches
        print(f"Epoch {epoch}/{epochs} — Loss: {avg_loss:.4f}")

    total_time = time.time() - start_time
    # epochs == 0 would otherwise raise ZeroDivisionError.
    time_per_epoch = total_time / epochs if epochs else 0.0

    # Evaluation on the test split (no gradients tracked).
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total if total else 0.0

    # Model summary for params & FLOPs, measured on a single 3x32x32 input.
    info = summary(
        model,
        input_size=(1, 3, 32, 32),
        col_names=["num_params", "mult_adds"],
        verbose=0
    )
    params = info.total_params
    flops  = info.total_mult_adds

    return accuracy, time_per_epoch, params, flops

# ----------------------------
# Run & Report
# ----------------------------
if __name__ == "__main__":
    # Train once, then print the four metrics as a fixed-width report.
    acc, t_ep, params, flops = train_resnet18(epochs=num_epochs)
    print(
        "\n=== ResNet-18 CIFAR-100 Results ===",
        f"Test Accuracy:      {acc*100:.2f}%",
        f"Time per Epoch:     {t_ep:.2f} sec",
        f"Total Parameters:   {params:,}",
        f"Total FLOPs:        {flops:,}",
        sep="\n",
    )


Training ResNet-18 on CIFAR-100




Epoch 1/10 — Loss: 3.5097
Epoch 2/10 — Loss: 2.7671
Epoch 3/10 — Loss: 2.3713
Epoch 4/10 — Loss: 2.0826
Epoch 5/10 — Loss: 1.8169
Epoch 6/10 — Loss: 1.5707
Epoch 7/10 — Loss: 1.3219
Epoch 8/10 — Loss: 1.0629
Epoch 9/10 — Loss: 0.8280
Epoch 10/10 — Loss: 0.6338

=== ResNet-18 CIFAR-100 Results ===
Test Accuracy:      43.79%
Time per Epoch:     9.90 sec
Total Parameters:   11,227,812
Total FLOPs:        37,072,356


In [None]:
!pip install torchinfo transformers timm

import torch
import torch.nn as nn
import time
import pandas as pd
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchinfo import summary
from transformers import AutoFeatureExtractor, SwinForImageClassification, AutoConfig

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Preparation
# Reuse the mean/std shipped with the Swin-Tiny preprocessor config.
feat = AutoFeatureExtractor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
normalize = transforms.Normalize(mean=feat.image_mean, std=feat.image_std)

# Resize CIFAR images to 224x224 before converting and normalising them.
transform = transforms.Compose(
    [transforms.Resize((224, 224)), transforms.ToTensor(), normalize]
)

# CIFAR-100 train and test splits stored under ./data (download=True).
train_ds, test_ds = (
    datasets.CIFAR100("./data", train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Only the training split is shuffled.
train_loader, test_loader = (
    DataLoader(split, batch_size=32, shuffle=is_train, num_workers=2)
    for split, is_train in ((train_ds, True), (test_ds, False))
)

# Utility: train & eval loop
def train_and_eval(model, optimizer, criterion, n_epochs):
    """Train ``model`` for ``n_epochs``, then measure test accuracy and size.

    Uses the module-level ``device``, ``train_loader`` and ``test_loader``.
    The model's forward output must expose a ``.logits`` attribute.

    Returns:
        Tuple ``(accuracy, seconds_per_epoch, total_params, total_mult_adds)``;
        the last two values are taken from ``torchinfo.summary``.
    """
    n_batches = len(train_loader)

    # --- training ---
    model.train()
    started = time.time()
    for epoch_no in range(1, n_epochs + 1):
        epoch_loss = 0
        for images, targets in train_loader:
            images = images.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(model(images).logits, targets)
            batch_loss.backward()
            optimizer.step()
            epoch_loss += batch_loss.item()
        print(f"Epoch {epoch_no}/{n_epochs} loss {epoch_loss/n_batches:.4f}")
    elapsed = time.time() - started

    # --- evaluation (no gradients tracked) ---
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for images, targets in test_loader:
            images = images.to(device)
            targets = targets.to(device)
            predicted = model(images).logits.argmax(dim=-1)
            hits += (predicted == targets).sum().item()
            seen += targets.size(0)
    acc = hits / seen

    # Parameter and mult-add counts for a single 3x224x224 input.
    info = summary(
        model,
        input_size=(1,3,224,224),
        col_names=["num_params","mult_adds"],
        verbose=0
    )
    return acc, elapsed/n_epochs, info.total_params, info.total_mult_adds

def _result_row(model_name, pretrained, epochs, acc, t_ep, p_cnt, fl):
    """Build one row of the comparison table from a finished run."""
    return {
        "Model": model_name,
        "Pretrained": pretrained,
        "Epochs": epochs,
        "Accuracy (%)": round(acc*100,2),
        "Time/Epoch (s)": round(t_ep,2),
        "Params (M)": round(p_cnt/1e6,2),
        "FLOPs": fl             # raw mult_adds
    }


results = []

# Fine‑tune pretrained Swin‑Tiny & Swin‑Small
# The backbone is frozen below, so only the new 100-way classifier is trained.
for name in ["microsoft/swin-tiny-patch4-window7-224",
             "microsoft/swin-small-patch4-window7-224"]:
    print(f"\n-- Fine‑tuning {name} --")
    model = SwinForImageClassification.from_pretrained(name)
    model.classifier = nn.Linear(model.classifier.in_features, 100)
    model.to(device)
    for p in model.swin.parameters():
        p.requires_grad = False

    # Hand the optimizer only the parameters that still receive gradients
    # instead of the whole (mostly frozen) model.
    trainable = [q for q in model.parameters() if q.requires_grad]
    optim = torch.optim.Adam(trainable, lr=2e-5)
    crit  = nn.CrossEntropyLoss()
    acc, t_ep, p_cnt, fl = train_and_eval(model, optim, crit, n_epochs=3)
    results.append(_result_row(name.split("/")[-1], "Yes", 3, acc, t_ep, p_cnt, fl))

# Train Swin‑Tiny from scratch
# Same architecture config as the pretrained Tiny model, but with randomly
# initialised weights and a 100-class head.
print("\n-- Training Swin‑Tiny from scratch --")
cfg = AutoConfig.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
cfg.num_labels = 100
scratch = SwinForImageClassification(cfg).to(device)
optim = torch.optim.Adam(scratch.parameters(), lr=2e-5)
crit  = nn.CrossEntropyLoss()
acc, t_ep, p_cnt, fl = train_and_eval(scratch, optim, crit, n_epochs=5)
results.append(_result_row("swin-tiny-scratch", "No", 5, acc, t_ep, p_cnt, fl))

# Display final table
df = pd.DataFrame(results)
from IPython.display import display
print("\nFine‑tune vs Scratch Results (Raw FLOPs)")
display(df)


Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->timm)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->timm)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->timm)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x8

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

100%|██████████| 169M/169M [00:01<00:00, 105MB/s]



-- Fine‑tuning microsoft/swin-tiny-patch4-window7-224 --


config.json:   0%|          | 0.00/71.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/113M [00:00<?, ?B/s]

Epoch 1/3 loss 4.0725
Epoch 2/3 loss 3.0717
Epoch 3/3 loss 2.3887

-- Fine‑tuning microsoft/swin-small-patch4-window7-224 --


config.json:   0%|          | 0.00/71.8k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/199M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/199M [00:00<?, ?B/s]

Epoch 1/3 loss 3.9731
Epoch 2/3 loss 2.8850
Epoch 3/3 loss 2.1676

-- Training Swin‑Tiny from scratch --
Epoch 1/5 loss 3.9773
Epoch 2/5 loss 3.3656
Epoch 3/5 loss 2.9978
Epoch 4/5 loss 2.7057
Epoch 5/5 loss 2.4477

Fine‑tune vs Scratch Results (Raw FLOPs)


Unnamed: 0,Model,Pretrained,Epochs,Accuracy (%),Time/Epoch (s),Params (M),FLOPs
0,swin-tiny-patch4-window7-224,Yes,3,62.22,67.24,27.6,62104420
1,swin-small-patch4-window7-224,Yes,3,66.45,104.46,48.91,104686948
2,swin-tiny-scratch,No,5,36.85,177.04,27.6,62104420
