In [1]:
import torch
import torchvision
from torch import nn
from torchvision import transforms
import matplotlib.pyplot as plt
!pip install torchinfo
from torchinfo import summary
!git clone 'https://github.com/mrdbourke/pytorch-deep-learning'
!mv pytorch-deep-learning/going_modular .
!mv pytorch-deep-learning/helper_functions.py .
from going_modular.going_modular import data_setup,engine
from helper_functions import download_data,plot_loss_curves

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0
Cloning into 'pytorch-deep-learning'...
remote: Enumerating objects: 4393, done.[K
remote: Total 4393 (delta 0), reused 0 (delta 0), pack-reused 4393 (from 1)[K
Receiving objects: 100% (4393/4393), 764.14 MiB | 15.62 MiB/s, done.
Resolving deltas: 100% (2656/2656), done.
Updating files: 100% (248/248), done.


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [7]:
from pathlib import Path
data_path = Path('data/')

In [8]:
image_path = download_data(source='https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi_20_percent.zip',destination=image_path)

[INFO] Did not find data/data/pizza-steak-sushi directory, creating one...
[INFO] Downloading pizza_steak_sushi_20_percent.zip from https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi_20_percent.zip...
[INFO] Unzipping pizza_steak_sushi_20_percent.zip data...


In [9]:
image_path

PosixPath('data/data/pizza-steak-sushi')

In [10]:
train_dir = image_path / 'train'
test_dir = image_path / 'test'

In [12]:
IMG_SIZE=224
BATCH_SIZE=32
manual_tranforms = transforms.Compose([transforms.Resize((IMG_SIZE,IMG_SIZE)),transforms.ToTensor()])

train_dataloader,test_dataloader,class_names = data_setup.create_dataloaders(train_dir=train_dir,test_dir=test_dir,transform=manual_tranforms,batch_size=BATCH_SIZE)

In [13]:
class MSABlock(nn.Module):
  def __init__(self,embedding_dim=768,num_heads=12,attn_dropout=0):
    super().__init__()

    self.layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)
    self.msa = nn.MultiheadAttention(embed_dim=embedding_dim,num_heads=num_heads,dropout=attn_dropout,batch_first=True)

  def forward(self,x):
    x = self.layer_norm(x)
    x,_ = self.msa(key=x,query=x,value=x,need_weights=False)
    return x

In [23]:
class PatchEmbedding(nn.Module):
  def __init__(self,in_channels,patch_size,embedding_dim):
    super().__init__()
    self.patcher = nn.Conv2d(in_channels=in_channels,out_channels=embedding_dim,kernel_size=patch_size,stride=patch_size)
    self.flatten = nn.Flatten(start_dim=2,end_dim=3)

  def forward(self,x):
    x= self.patcher(x)
    x = self.flatten(x)
    return x.permute(0,2,1)

In [17]:
class MLPBlock(nn.Module):
  def __init__(self,embedding_dim,mlp_dim,dropout):
    super().__init__()
    self.mlp = nn.Sequential(
        nn.Linear(in_features=embedding_dim,out_features=mlp_dim),
        nn.GELU(),
        nn.Dropout(p=dropout),
        nn.Linear(in_features=mlp_dim,out_features=embedding_dim),
        nn.Dropout(p=dropout)
    )

    self.layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)
  def forward(self,x):
    x = self.layer_norm(x)
    x = self.mlp(x)
    return x

In [18]:
class TransformerEncoder(nn.Module):
  def __init__(self,embedding_dim,mlp_dim,attn_dropout,dropout,num_heads):
    super().__init__()
    self.msa = MSABlock(embedding_dim=embedding_dim,num_heads=num_heads,attn_dropout=attn_dropout)
    self.mlp = MLPBlock(embedding_dim=embedding_dim,mlp_dim=mlp_dim,dropout=dropout)

  def forward(self,x):
    x = self.msa(x) + x
    x = self.mlp(x) + x
    return x

In [20]:
num_classes = len(class_names)

In [37]:
class Vit(nn.Module):
  def __init__(self,in_channels,patch_size,embedding_dim,mlp_dim,attn_dropout,dropout,num_heads,num_layers):
    super().__init__()
    self.embedding = PatchEmbedding(in_channels,patch_size,embedding_dim)
    self.transformer = nn.Sequential(*[TransformerEncoder(embedding_dim,mlp_dim,attn_dropout,dropout,num_heads) for _ in range(num_layers)])
    self.classifier = nn.Sequential(nn.LayerNorm(normalized_shape=embedding_dim),nn.Linear(embedding_dim,num_classes))
    self.num_patches = (IMG_SIZE//patch_size) ** 2 # Correct calculation of num_patches
    self.class_token = nn.Parameter(torch.rand(1,1,embedding_dim),requires_grad=True)
    self.pos_embedding = nn.Parameter(torch.rand(1,self.num_patches+1,embedding_dim),requires_grad=True)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self,x):
    batch_size = x.shape[0]
    class_token = self.class_token.expand(batch_size,-1,-1)
    embeddings = self.embedding(x)
    embeddings = torch.cat([class_token,embeddings],dim=1)
    embeddings = embeddings + self.pos_embedding
    embeddings = self.dropout(embeddings)
    x = self.transformer(embeddings)
    x = self.classifier(x[:,0,:])
    return x

In [38]:
model = Vit(in_channels=3,patch_size=16,embedding_dim=768,mlp_dim=3072,attn_dropout=0.1,dropout=0.1,num_heads=12,num_layers=12).to(device)

In [39]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

results = engine.train(model=model,train_dataloader=train_dataloader,test_dataloader=test_dataloader,optimizer=optimizer,loss_fn=loss_fn,epochs=5,device=device)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 4.0131 | train_acc: 0.3021 | test_loss: 1.4017 | test_acc: 0.3500
Epoch: 2 | train_loss: 1.3103 | train_acc: 0.3167 | test_loss: 1.1811 | test_acc: 0.3500
Epoch: 3 | train_loss: 1.2114 | train_acc: 0.3271 | test_loss: 1.1555 | test_acc: 0.2875
Epoch: 4 | train_loss: 1.1316 | train_acc: 0.3104 | test_loss: 1.1249 | test_acc: 0.3625
Epoch: 5 | train_loss: 1.1149 | train_acc: 0.3063 | test_loss: 1.0939 | test_acc: 0.3500
