# Vision Transformer (ViT) for Image Classification [5 points]
Use a Vision Transformer to solve the Cats and Dogs Dataset. You can use pre-defined ViT model or implement from scratch.
Deploy the model and record a short video (~5 mins) on how it works.

In [15]:
import os
import zipfile
import torch
import timm
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import ImageFolder
from torch import nn, optim
import PIL
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt

In [None]:
zip_file_path = "/content/sample_data/kagglecatsanddogs_5340.zip"
extract_path = "kagglecatsanddogs_5340"

if not os.path.exists(extract_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

print("Dataset extracted successfully!")
dataset_path = "kagglecatsanddogs_5340/PetImages"
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

#dataset = ImageFolder(root=dataset_path, transform=transform)
class CustomImageFolder(ImageFolder):
    def __init__(self, root, transform=None):
        super().__init__(root, transform)

    def __getitem__(self, index):
        try:
            return super().__getitem__(index)
        except (PIL.UnidentifiedImageError, OSError, ValueError) as e:
            print(f"Skipping corrupted image: {self.imgs[index][0]}")
            return self.__getitem__((index + 1) % len(self.imgs))

dataset = CustomImageFolder(root=dataset_path, transform=transform)

Dataset extracted successfully!


In [None]:
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Load ViT model
model = timm.create_model("vit_base_patch16_224", pretrained=True, num_classes=2)
model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [None]:
# Training
def train_model(model, train_loader, val_loader, epochs=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct, total = 0, 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
        print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {100 * correct/total:.2f}%")
    print("Training complete!")

train_model(model, train_loader, val_loader, epochs=5)



Skipping corrupted image: kagglecatsanddogs_5340/PetImages/Dog/11702.jpg
Skipping corrupted image: kagglecatsanddogs_5340/PetImages/Cat/666.jpg
Epoch 1, Loss: 0.0827, Accuracy: 96.88%
Skipping corrupted image: kagglecatsanddogs_5340/PetImages/Dog/11702.jpg
Skipping corrupted image: kagglecatsanddogs_5340/PetImages/Cat/666.jpg
Epoch 2, Loss: 0.0588, Accuracy: 97.78%
Skipping corrupted image: kagglecatsanddogs_5340/PetImages/Dog/11702.jpg
Skipping corrupted image: kagglecatsanddogs_5340/PetImages/Cat/666.jpg
Epoch 3, Loss: 0.0521, Accuracy: 98.08%
Skipping corrupted image: kagglecatsanddogs_5340/PetImages/Cat/666.jpg
Skipping corrupted image: kagglecatsanddogs_5340/PetImages/Dog/11702.jpg
Epoch 4, Loss: 0.0474, Accuracy: 98.19%
Skipping corrupted image: kagglecatsanddogs_5340/PetImages/Dog/11702.jpg
Skipping corrupted image: kagglecatsanddogs_5340/PetImages/Cat/666.jpg
Epoch 5, Loss: 0.0386, Accuracy: 98.55%
Training complete!


4. Deploy your trained ViT model. This could be a simple script or application that takes an image as input and predicts whether it's a cat or a dog.

The model is deployed here - https://huggingface.co/spaces/ruthvikvkumar/cat_dog

In [None]:
torch.save(model.state_dict(), "vit_cats_dogs.pth")

def predict(image_path, model):
    model.eval()
    image = Image.open(image_path).convert("RGB")
    image = transform(image).unsqueeze(0).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    with torch.no_grad():
        output = model(image)
        _, predicted = torch.max(output, 1)
    return "Dog" if predicted.item() == 1 else "Cat"

# Test prediction
sample_image = "/content/sample_data/cat.jpg"
print("Prediction - 1:", predict(sample_image, model))

sample_image1 = "/content/sample_data/dog.jpg"
print("Prediction - 2:", predict(sample_image1, model))

Prediction - 1: Cat
Prediction - 2: Dog


5. Record a short video (~5 mins) demonstrating how your deployed ViT model works. The video should showcase the model taking image inputs and providing predictions. Explain the key aspects of your implementation and deployment process in the video.
   a. Upload the video to UBbox and create a shared link
   b. Add the link at the end of your ipynb file.

**Shared UBbox Video Link:**
https://buffalo.box.com/s/3argmh5tl2skykkita4skkve0bju5cul

# References:
1. https://www.microsoft.com/en-us/download/details.aspx?id=54765
2. https://huggingface.co/timm/vit_base_patch16_224.augreg2_in21k_ft_in1k