# Assignment: Vision Transformers on CIFAR10

In [1]:
#imports
from __future__ import print_function
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils


In [2]:
#loading the dataset
dataset = dset.CIFAR10(root="./data", download=True,
                           transform=transforms.Compose([
                               transforms.Resize(64),
                               transforms.ToTensor(),
                               transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
                           ]))
nc=3

dataloader = torch.utils.data.DataLoader(dataset, batch_size=128,
                                         shuffle=True, num_workers=2)


100%|██████████| 170M/170M [00:03<00:00, 48.6MB/s]


In [3]:
#checking the availability of cuda devices
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Tasks:
* try to get the best test Accuracy on Cifar10 using a transformer model
* pre-trained models allowed


In [4]:
#install
!pip install transformers



In [5]:
#imports
from transformers import (
    ViTConfig,
    ViTFeatureExtractor,
    ViTForImageClassification,
    get_linear_schedule_with_warmup,
)
from torch.optim import AdamW

In [6]:
#seeden f. reproduzierbarkeit
seed = 42
random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [7]:
#bilder vom pretrained model korrekt skalieren/transformioeren
feature_extractor = ViTFeatureExtractor.from_pretrained(
    "google/vit-base-patch16-224-in21k"
)

vit_transform = transforms.Compose(
    [
        transforms.Resize(64),
        transforms.CenterCrop(64),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=feature_extractor.image_mean, std=feature_extractor.image_std
        ),
    ]
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [8]:
#train/test Daten laden
train_dataset = dset.CIFAR10(
    root="./data", train=True, download=False, transform=vit_transform
)
test_dataset = dset.CIFAR10(
    root="./data", train=False, download=False, transform=vit_transform
)

#Dataloader
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=128, shuffle=True, num_workers=2
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=128, shuffle=False, num_workers=2
)


In [9]:

torch.backends.cudnn.benchmark = True  # schneller auf GPU

config = ViTConfig.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    image_size=64, #img size
    num_labels=10, #klassen
    id2label={i: c for i, c in enumerate(train_dataset.classes)}, #bidirektionales mapping
    label2id={c: i for i, c in enumerate(train_dataset.classes)},
)

model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    config=config,
    ignore_mismatched_sizes=True,  #positionembeddings werden interpoliert
).to(device)

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized because the shapes did not match:
- vit.embeddings.position_embeddings: found shape torch.Size([1, 197, 768]) in the checkpoint and torch.Size([1, 17, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
#Optimizer-adamw, scheduler m. warmup
epochs = 20
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.05)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps,
)

In [11]:
#train u valid.
for ep in range(epochs):
    model.train()
    ep_loss = 0.0
    ep_correct = 0
    ep_samples = 0

    for imgs, lbls in train_loader:
        imgs, lbls = imgs.to(device), lbls.to(device)

        out = model(pixel_values=imgs, labels=lbls)
        loss = out.loss

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        scheduler.step()

        ep_loss += loss.item() * imgs.size(0)
        ep_correct += (out.logits.argmax(dim=-1) == lbls).sum().item()
        ep_samples += lbls.size(0)

    train_acc = ep_correct / ep_samples
    train_loss = ep_loss / ep_samples

    # ─ Validation nach jeder Epoche
    model.eval()
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for imgs, lbls in test_loader:
            imgs, lbls = imgs.to(device), lbls.to(device)
            preds = model(pixel_values=imgs).logits.argmax(dim=-1)
            val_correct += (preds == lbls).sum().item()
            val_total += lbls.size(0)

    val_acc = val_correct / val_total
    print(
        f"[{ep+1:02}/{epochs}]  "
        f"loss={train_loss:.4f}  "
        f"train_acc={train_acc:.3f}  "
        f"val_acc={val_acc:.3f}"
    )



[01/20]  loss=2.0276  train_acc=0.320  val_acc=0.623
[02/20]  loss=0.8892  train_acc=0.750  val_acc=0.824
[03/20]  loss=0.4418  train_acc=0.873  val_acc=0.860
[04/20]  loss=0.2362  train_acc=0.938  val_acc=0.871
[05/20]  loss=0.1276  train_acc=0.971  val_acc=0.869
[06/20]  loss=0.0647  train_acc=0.989  val_acc=0.875
[07/20]  loss=0.0398  train_acc=0.993  val_acc=0.869
[08/20]  loss=0.0275  train_acc=0.996  val_acc=0.879
[09/20]  loss=0.0175  train_acc=0.998  val_acc=0.879
[10/20]  loss=0.0118  train_acc=0.999  val_acc=0.875
[11/20]  loss=0.0137  train_acc=0.998  val_acc=0.877
[12/20]  loss=0.0102  train_acc=0.998  val_acc=0.880
[13/20]  loss=0.0079  train_acc=0.999  val_acc=0.879
[14/20]  loss=0.0053  train_acc=1.000  val_acc=0.884
[15/20]  loss=0.0033  train_acc=1.000  val_acc=0.884
[16/20]  loss=0.0028  train_acc=1.000  val_acc=0.884
[17/20]  loss=0.0024  train_acc=1.000  val_acc=0.884
[18/20]  loss=0.0022  train_acc=1.000  val_acc=0.884
[19/20]  loss=0.0020  train_acc=1.000  val_acc

In [11]:
#ERgebnis scheint valide zu sein, mit einer val_acc von 88% kann man zufrieden sein. vermutlich würde auch hier data augmentation den wert weiter steigern..