# VISION TRANSFORMER

![](../../images/vit_2.png)

In [4]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision.datasets.mnist import MNIST
from torchvision.transforms import ToTensor

np.random.seed(2910)
torch.manual_seed(2910)

# setup device

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# hyper parameters
batch_size = 4
learning_rate = 0.001
num_epochs = 10

# Preparing data:

train_set = MNIST(root="../../data",train=True, download=False, transform=ToTensor())
test_set = MNIST(root="../../data",train=False, download=False, transform=ToTensor())

train_loader = DataLoader(dataset=train_set, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(dataset=test_set, shuffle=False, batch_size=batch_size)



# func
def get_positional_embeddings(sequence_length, d):
    result = torch.ones(sequence_length, d)
    for i in range(sequence_length):
        for j in range(d):
            result[i][j] = np.sin()

# Model
class ViT(nn.Module):
    def __init__(self, input_shape, n_patches = 7, hidden_d = 8):
        super(ViT, self).__init__()
        
        self.input_shape = input_shape
        self.n_patches = n_patches
        self.patch_size = (input_shape[1] / n_patches, input_shape[2] / n_patches)
        self.input_d = int(input_shape[0] * self.patch_size[0] * self.patch_size[1])
        self.hidden_d = hidden_d
        # 1) Linear maper
        self.linear_mapper = nn.Linear(self.input_d, self.hidden_d)
        
        # 2) Classification token
        self.class_token = nn.Parameter(torch.rand(1, self.hidden_d))
        
    def forward(self):
        # Dividing into patches
        n, c, w, h= images.shape
        patches = images.reshape(n, self.n_patches ** 2, self.input_d)
        
        # Running linear layer for tokeniztion
        tokens = self.linear_mapper(patches)
        
        # Adding classification token to the tokens
        tokens = torch.stack([torch.vstack((self.class_token, tokens[i])) for i in range(len(tokens))])
        
        return tokens


# Training loop
model = ViT(input_shape=(1,28,28), n_patches=7)

criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    train_loss = 0.0
    for batch in train_loader:
        images, labels = batch
        images = images.to(device)
        labels = labels.to(device)
        
        outputs = model(images)
        outputs = outputs.to(device)
        loss = criterion(outputs, labels) / len(images)
        
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f"Epoch {epoch + 1}/{num_epochs} loss:{train_loss:.2f}")
    
    

# Predict and accuracy
correct, total = 0, 0
test_loss = 0.0
with torch.no_grad():
    for i, (images, labels) in enumerate(test_loader):
        images = images.to(device)
        labels = labels.to(device)
        
        outputs = model(images).to(device)
        loss = criterion(outputs, labels)
        test_loss += loss/len(images)
        
        correct += torch.sum(torch.argmax(outputs, dim = 1) == labels).item()
        total += len(images)
    print(f"Test loss: {test_loss:.2f}")
    print(f"Test accuracy: {correct / total * 100:.2f}%")
    
    

ValueError: optimizer got an empty parameter list