In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
   ---------------------------------------- 0.0/10.4 MB ? eta -:--:--
    --------------------------------------- 0.2/10.4 MB 3.5 MB/s eta 0:00:03
   --- ------------------------------------ 0.8/10.4 MB 8.6 MB/s eta 0:00:02
   ------------ --------------------------- 3.4/10.4 MB 24.0 MB/s eta 0:00:01
   -------------------- ------------------- 5.4/10.4 MB 29.1 MB/s eta 0:00:01
   --------------------------------- ------ 8.7/10.4 MB 37.2 MB/s eta 0:00:01
   -----


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from transformers import SwinForImageClassification, AutoConfig
from tqdm import tqdm

# 1) Settings
SEED        = 42
DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE  = 32
LR          = 2e-5
EPOCHS      = 5            # adjust between 2–5 epochs as desired
NUM_CLASSES = 100

# Seed torch's global RNG before any model or loader is created so that every
# "Restart & Run All" starts from the same random state (head initialisation,
# shuffle order, random flips). Some GPU kernels may still be non-deterministic.
torch.manual_seed(SEED)

# 2) Data transforms (resize CIFAR-100 to 224x224 for Swin)
# NOTE(review): these look like the usual ImageNet channel statistics, presumably
# chosen to match the pretrained checkpoints' preprocessing -- confirm.
mean = [0.485, 0.456, 0.406]
std  = [0.229, 0.224, 0.225]

# Steps shared by both splits; the training split additionally gets a random
# horizontal flip between the resize and the tensor conversion.
_resize = transforms.Resize((224, 224))
_to_normalized_tensor = [transforms.ToTensor(), transforms.Normalize(mean, std)]

train_tf = transforms.Compose([_resize, transforms.RandomHorizontalFlip(), *_to_normalized_tensor])
test_tf  = transforms.Compose([_resize, *_to_normalized_tensor])

train_ds = datasets.CIFAR100(root="./data", train=True,  download=True, transform=train_tf)
test_ds  = datasets.CIFAR100(root="./data", train=False, download=True, transform=test_tf)

# Loader options common to both splits; only the training loader shuffles.
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=4, pin_memory=True)
train_loader = DataLoader(train_ds, shuffle=True,  **_loader_kwargs)
test_loader  = DataLoader(test_ds,  shuffle=False, **_loader_kwargs)

criterion = nn.CrossEntropyLoss()

# 3) Utility: train & eval
def train_model(model, optimizer, epochs=None):
    """Train ``model`` on ``train_loader`` and return the mean time per epoch.

    Args:
        model: Module whose forward pass returns an object exposing ``.logits``.
            It is switched to training mode and left in that mode.
        optimizer: Optimizer that updates the parameters to be trained.
        epochs: Number of passes over ``train_loader``. When omitted, the
            module-level ``EPOCHS`` value is used.

    Returns:
        Average wall-clock seconds per epoch, or ``0.0`` if no epochs were
        requested (instead of dividing by zero).
    """
    def _drain_device_queue():
        # CUDA work is queued asynchronously; wait for it to finish so the
        # clock readings bracket the computation that was actually executed.
        if DEVICE.type == "cuda":
            torch.cuda.synchronize()

    n_epochs = EPOCHS if epochs is None else epochs
    model.train()
    if n_epochs <= 0:
        return 0.0

    total_time = 0.0
    for epoch in range(n_epochs):
        _drain_device_queue()
        epoch_start = time.time()
        # leave=False removes the finished bar so only the per-epoch line remains.
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{n_epochs}", leave=False)
        for imgs, labels in loop:
            imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(imgs).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        _drain_device_queue()
        epoch_time = time.time() - epoch_start
        total_time += epoch_time
        print(f"Epoch {epoch+1} time: {epoch_time:.1f}s")
    return total_time / n_epochs    # avg time per epoch

@torch.no_grad()
def evaluate(model):
    """Score ``model`` on ``test_loader`` with gradient tracking disabled.

    The model is put into eval mode (and left there). Each batch is moved to
    ``DEVICE``, the arg-max over the last logits dimension is taken as the
    prediction, and matches against the labels are counted.

    Returns:
        Number of correct predictions divided by ``len(test_ds)``.
    """
    model.eval()
    hits = sum(
        (model(batch.to(DEVICE)).logits.argmax(dim=-1) == target.to(DEVICE)).sum().item()
        for batch, target in test_loader
    )
    return hits / len(test_ds)

results = []

# 4) Fine-tune the pretrained Tiny and Small checkpoints with a frozen backbone
for variant in (
        "microsoft/swin-tiny-patch4-window7-224",
        "microsoft/swin-small-patch4-window7-224",
    ):
    short_name = variant.split('/')[-1]
    print(f"\n>>> Fine‑tuning {short_name}")

    # The checkpoint ships a 1000-way head; asking for NUM_CLASSES outputs with
    # ignore_mismatched_sizes=True makes the loader re-initialise the head
    # instead of failing on the shape mismatch.
    model = SwinForImageClassification.from_pretrained(
        variant, num_labels=NUM_CLASSES, ignore_mismatched_sizes=True
    )
    model = model.to(DEVICE)

    # Only parameters whose name contains "classifier" stay trainable.
    for name, param in model.named_parameters():
        if "classifier" in name:
            continue
        param.requires_grad = False

    # Hand Adam just the parameters that still require gradients.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(trainable, lr=LR)

    avg_time = train_model(model, optimizer)
    acc      = evaluate(model)
    entry = {
        "model": f"FT {short_name}",
        "time/epoch": avg_time,
        "test_acc":  acc,
    }
    results.append(entry)

# 5) Train Swin-Tiny from scratch: same architecture config, no pretrained weights
print("\n>>> Training Swin‑Tiny from scratch")
cfg = AutoConfig.from_pretrained("microsoft/swin-tiny-patch4-window7-224", num_labels=NUM_CLASSES)
scratch = SwinForImageClassification(cfg)
scratch = scratch.to(DEVICE)
# Every parameter is trained here, so the optimizer receives all of them.
opt_scratch = optim.Adam(scratch.parameters(), lr=LR)

avg_time = train_model(scratch, opt_scratch)
acc      = evaluate(scratch)
entry = {
    "model": "Scratch swin-tiny",
    "time/epoch": avg_time,
    "test_acc":  acc,
}
results.append(entry)

# 6) Summary
# The model column is sized to the longest label (never narrower than the
# original 25 characters). A fixed width of 25 is overflowed by labels such as
# "FT swin-small-patch4-window7-224", which pushes the other columns out of line.
name_width = max([25] + [len(r["model"]) for r in results])
header = f"{'Model':{name_width}} | {'Time/Epoch (s)':>14} | {'Test Acc (%)':>12}"

print("\n=== Summary ===")
print(header)
print("-" * len(header))  # rule always spans the full header
for r in results:
    # test_acc is stored as a fraction; show it as a percentage.
    print(f"{r['model']:{name_width}} | {r['time/epoch']:14.1f} | {r['test_acc']*100:12.2f}")


Files already downloaded and verified
Files already downloaded and verified


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([100]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([100, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



>>> Fine‑tuning swin-tiny-patch4-window7-224


                                                                                                                       

Epoch 1 time: 45.3s


                                                                                                                       

Epoch 2 time: 45.4s


                                                                                                                       

Epoch 3 time: 45.5s


                                                                                                                       

Epoch 4 time: 45.5s


                                                                                                                       

Epoch 5 time: 45.6s

>>> Fine‑tuning swin-small-patch4-window7-224


config.json:   0%|          | 0.00/71.8k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/199M [00:00<?, ?B/s]

Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-small-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([100, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([100]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5:   0%|                                                                              | 0/1563 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/199M [00:00<?, ?B/s]

                                                                                                                       

Epoch 1 time: 69.8s


                                                                                                                       

Epoch 2 time: 69.9s


                                                                                                                       

Epoch 3 time: 69.4s


                                                                                                                       

Epoch 4 time: 69.8s


                                                                                                                       

Epoch 5 time: 69.7s

>>> Training Swin‑Tiny from scratch


                                                                                                                       

Epoch 1 time: 109.8s


                                                                                                                       

Epoch 2 time: 109.8s


                                                                                                                       

Epoch 3 time: 110.2s


                                                                                                                       

Epoch 4 time: 109.6s


                                                                                                                       

Epoch 5 time: 110.1s

=== Summary ===
Model                     | Time/Epoch (s) | Test Acc (%)
--------------------------------------------------------
FT swin-tiny-patch4-window7-224 |           45.5 |        66.41
FT swin-small-patch4-window7-224 |           69.7 |        70.52
Scratch swin-tiny         |          109.9 |        37.43
