In [None]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
import random
import matplotlib.pyplot as plt

In [None]:
# Display the installed torchvision version so the environment is recorded with the run.
torchvision.__version__

'0.23.0+cu126'

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
# ("gpu" is not a valid torch device string, so the fallback must be "cpu".)
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Show which device string was selected.
device

'cuda'

In [None]:
# Seed every random number generator the notebook imports so runs are repeatable.
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # covers every visible GPU; ignored when CUDA is unavailable
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Hyperparameters shared by the data loaders, the model and the training loop.
BATCH_SIZE = 256  # samples per batch for both DataLoaders
EPOCHES = 10  # number of train/evaluate rounds in the training loop
LEARNING_RATE = 3e-4  # learning rate handed to Adam
PATCH_SIZE = 4  # kernel size and stride of the patch-projection convolution
NUM_CLASSES = 10  # output width of the classification head
IMAGE_SIZE = 32  # image side length used to compute the number of patches
CHANNELS = 3  # input channels of the patch-projection convolution
EMBED_DIM = 256  # width of every token vector
NUM_HEADS = 8  # attention heads per encoder layer
DEPTH = 6  # number of stacked encoder layers
MLP_DIM = 512  # hidden width of the feed-forward block in each encoder layer
DROP_RATE = 0.1  # dropout probability used in attention and in the feed-forward block



In [None]:
# Convert images to tensors, then normalise each channel with mean 0.5 and std 0.5.
# `(0.5)` is just the float 0.5 (no trailing comma), which only works through
# broadcasting; spell out one value per RGB channel instead.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])


In [None]:
# CIFAR-10 training split, downloaded into the `data` directory if it is not already there.
train_dataset = datasets.CIFAR10(
    root='data',
    train=True,
    download=True,
    transform=transform,
)

In [None]:
# CIFAR-10 test split, using the same root directory and transform as the training split.
test_dataset = datasets.CIFAR10(
    root='data',
    train=False,
    download=True,
    transform=transform,
)



In [None]:
# Display the dataset summary (size, root, split, transform).
test_dataset

Dataset CIFAR10
    Number of datapoints: 10000
    Root location: data
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=0.5, std=0.5)
           )

In [None]:
# Training batches are drawn in a shuffled order.
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True)
# Accuracy does not depend on sample order, so the test set is read in a
# fixed order; this also avoids consuming global RNG state during evaluation.
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=BATCH_SIZE,
                         shuffle=False)

In [None]:
# Number of batches per training epoch.
len(train_loader)

196

In [None]:
class PatchEmbedding(nn.Module):
  """Turn a batch of images into a sequence of patch tokens.

  A convolution whose kernel size and stride both equal ``patch_size``
  projects every non-overlapping patch to ``embed_dim`` channels. A learnable
  class token is prepended to the patch sequence and a learnable positional
  embedding is added to every token.

  Args:
    img_size: Image side length; the patch count is ``(img_size // patch_size) ** 2``.
    patch_size: Side length of one square patch.
    in_channels: Number of channels in the input images.
    embed_dim: Size of each token vector.
  """

  def __init__(self,
               img_size,
               patch_size,
               in_channels,
               embed_dim):
    super().__init__()
    self.patch_size = patch_size
    self.proj = nn.Conv2d(in_channels, embed_dim,
                          kernel_size=patch_size, stride=patch_size)
    patches_per_side = img_size // patch_size
    seq_len = patches_per_side * patches_per_side + 1  # +1 for the class token
    self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
    self.pos_embed = nn.Parameter(torch.randn(1, seq_len, embed_dim))

  def forward(self, x: torch.Tensor):
    """Return tokens of shape (batch, 1 + num_patches, embed_dim).

    Assumes ``x`` is an image batch laid out as (batch, channels, height, width).
    """
    batch = x.shape[0]
    feature_map = self.proj(x)
    # Collapse the two spatial axes into one and move it ahead of the channel axis.
    patch_tokens = feature_map.flatten(start_dim=2).permute(0, 2, 1)
    cls = self.cls_token.expand(batch, -1, -1)
    return torch.cat([cls, patch_tokens], dim=1) + self.pos_embed

In [None]:
class MLP(nn.Module):
  """Feed-forward block: Linear -> GELU -> Dropout -> Linear -> Dropout.

  Args:
    in_features: Width of the input and of the output.
    hidden_features: Width of the hidden layer.
    drop_rate: Dropout probability applied after each linear stage.
  """

  def __init__(self, in_features,
               hidden_features,
               drop_rate):
    super().__init__()
    self.fc1 = nn.Linear(in_features, hidden_features)
    self.fc2 = nn.Linear(hidden_features, in_features)
    self.dropout = nn.Dropout(p=drop_rate)

  def forward(self, x):
    """Apply the block along the last dimension of ``x``."""
    hidden = self.fc1(x)
    hidden = F.gelu(hidden)
    hidden = self.dropout(hidden)
    projected = self.fc2(hidden)
    return self.dropout(projected)


In [None]:
class TransformerEncoderLayer(nn.Module):
  """Pre-norm encoder block: self-attention and an MLP, each with a residual connection.

  Args:
    embed_dim: Width of every token vector.
    num_heads: Number of attention heads.
    mlp_dim: Hidden width of the feed-forward block.
    drop_rate: Dropout probability used inside attention and the feed-forward block.
  """

  def __init__(self, embed_dim, num_heads, mlp_dim, drop_rate):
    super().__init__()
    self.norm1 = nn.LayerNorm(embed_dim)
    self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout = drop_rate, batch_first = True)
    self.norm2 = nn.LayerNorm(embed_dim)
    self.mlp = MLP(embed_dim, mlp_dim,drop_rate)

  def forward(self, x):
    """Return ``x`` after the attention and feed-forward residual updates."""
    # Normalise once and pass the same tensor as query, key and value: this
    # avoids two redundant LayerNorm passes and lets MultiheadAttention
    # recognise the call as self-attention.
    normed = self.norm1(x)
    # The attention weights are never used, so do not ask for them.
    attn_out, _ = self.attn(normed, normed, normed, need_weights = False)
    x = x + attn_out
    x = x + self.mlp(self.norm2(x))
    return x


In [None]:
class VisionTransformer(nn.Module):
  """Image classifier: patch embedding, a stack of encoder layers, and a linear head.

  Args:
    img_size: Image side length, forwarded to the patch embedding.
    patch_size: Side length of one square patch.
    in_channels: Number of channels in the input images.
    num_classes: Number of output logits.
    embed_dim: Width of every token vector.
    depth: Number of encoder layers.
    num_heads: Attention heads per encoder layer.
    mlp_dim: Hidden width of each layer's feed-forward block.
    drop_rate: Dropout probability forwarded to every encoder layer.
  """

  def __init__(self, img_size, patch_size, in_channels, num_classes, embed_dim, depth, num_heads, mlp_dim, drop_rate):
    super().__init__()
    self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
    layers = []
    for _ in range(depth):
      layers.append(TransformerEncoderLayer(embed_dim, num_heads, mlp_dim, drop_rate))
    self.encoder = nn.Sequential(*layers)
    self.norm = nn.LayerNorm(embed_dim)
    self.head = nn.Linear(embed_dim, num_classes)

  def forward(self, x):
    """Return class logits computed from the first (class) token."""
    tokens = self.encoder(self.patch_embed(x))
    # The final LayerNorm runs over all tokens; only index 0 feeds the head.
    cls_repr = self.norm(tokens)[:, 0]
    return self.head(cls_repr)

In [None]:
# Build the model from the notebook's hyperparameters and move it to the selected device.
model = VisionTransformer(
    img_size=IMAGE_SIZE,
    patch_size=PATCH_SIZE,
    in_channels=CHANNELS,
    num_classes=NUM_CLASSES,
    embed_dim=EMBED_DIM,
    depth=DEPTH,
    num_heads=NUM_HEADS,
    mlp_dim=MLP_DIM,
    drop_rate=DROP_RATE,
).to(device)

In [None]:
# Display the module tree of the model.
model

VisionTransformer(
  (patch_embed): PatchEmbedding(
    (proj): Conv2d(3, 256, kernel_size=(4, 4), stride=(4, 4))
  )
  (encoder): Sequential(
    (0): TransformerEncoderLayer(
      (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
      )
      (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (fc1): Linear(in_features=256, out_features=512, bias=True)
        (fc2): Linear(in_features=512, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): TransformerEncoderLayer(
      (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
      )
      (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP

In [None]:
# Cross-entropy on the class logits, optimised with Adam at the configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Display the optimizer's settings.
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    decoupled_weight_decay: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0003
    maximize: False
    weight_decay: 0
)

In [None]:
def train(model, loader, optimizer, criterion):
  """Run one training epoch and return ``(mean_loss, accuracy)``.

  Args:
    model: Module to train; switched to training mode.
    loader: Iterable of ``(inputs, targets)`` batches that exposes ``.dataset``.
    optimizer: Optimizer stepped once per batch.
    criterion: Loss called as ``criterion(outputs, targets)``.

  Returns:
    Tuple of the summed per-batch loss (each weighted by its batch size)
    divided by ``len(loader.dataset)``, and the fraction of samples whose
    argmax prediction matched the target.
  """
  model.train()

  # Send batches wherever the model lives instead of relying on a
  # notebook-level `device` variable being defined and in sync.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  total_loss, correct = 0.0, 0
  for x, y in loader:
    x, y = x.to(target_device), y.to(target_device)
    optimizer.zero_grad()
    out = model(x)
    loss = criterion(out, y)
    loss.backward()
    optimizer.step()

    # Weight by batch size so a smaller final batch does not skew the epoch
    # average (assumes the criterion returns a per-batch mean).
    total_loss += loss.item() * x.size(0)
    correct += (out.argmax(1) == y).sum().item()

  num_samples = len(loader.dataset)
  return total_loss / num_samples, correct / num_samples




In [None]:
def evaluate(model, loader):
  """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

  Args:
    model: Module to evaluate; switched to evaluation mode.
    loader: Iterable of ``(inputs, targets)`` batches that exposes ``.dataset``.
  """
  model.eval()

  # Send batches wherever the model lives instead of relying on a
  # notebook-level `device` variable being defined and in sync.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  correct = 0
  with torch.inference_mode():
    for x, y in loader:
      x, y = x.to(target_device), y.to(target_device)
      out = model(x)
      correct += (out.argmax(dim = 1) == y).sum().item()
  return correct / len(loader.dataset)

In [None]:
from tqdm.auto import tqdm

In [None]:
# Per-epoch accuracy history, stored as fractions in [0, 1].
train_accuracies = []
test_accuracies = []
for epoch in tqdm(range(EPOCHES)):
  train_loss , train_acc = train(model, train_loader, optimizer, criterion)
  test_acc = evaluate(model ,test_loader)
  train_accuracies.append(train_acc)
  test_accuracies.append(test_acc)
  # The accuracies are fractions, so use percent formatting; printing the raw
  # fraction with a "%" suffix reported e.g. 73.41% as "0.7341%".
  print(f"Epoch:{epoch+1}/{EPOCHES}, Train loss: {train_loss:.4f}, Train_acc: {train_acc:.2%}, Test_acc: {test_acc:.2%}")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch:1/10, Train loss: 1.8188, Train_acc: 0.3417%, Test_acc: 0.4489%
Epoch:2/10, Train loss: 1.4494, Train_acc: 0.4816%, Test_acc: 0.5088%
Epoch:3/10, Train loss: 1.2948, Train_acc: 0.5371%, Test_acc: 0.5448%
Epoch:4/10, Train loss: 1.1867, Train_acc: 0.5747%, Test_acc: 0.5695%
Epoch:5/10, Train loss: 1.0922, Train_acc: 0.6128%, Test_acc: 0.5942%
Epoch:6/10, Train loss: 1.0199, Train_acc: 0.6371%, Test_acc: 0.6050%
Epoch:7/10, Train loss: 0.9422, Train_acc: 0.6646%, Test_acc: 0.6165%
Epoch:8/10, Train loss: 0.8711, Train_acc: 0.6886%, Test_acc: 0.6211%
Epoch:9/10, Train loss: 0.8088, Train_acc: 0.7102%, Test_acc: 0.6241%
Epoch:10/10, Train loss: 0.7453, Train_acc: 0.7341%, Test_acc: 0.6317%
