In [2]:
import os
import torch
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, random_split, Subset
from PIL import Image

# Set paths and parameters
dataset_root = "/kaggle/input/cats-dogs/PetImages"  # This should contain 'Cat/' and 'Dog/' folders
image_size = 224  # Resize all images to 224x224 (image_size x image_size)
batch_size = 32   # Batch size for DataLoaders
num_workers = 2   # Adjust based on system

# Preprocessing pipeline applied to every image as it is loaded
transform = transforms.Compose([
    # Force a square image_size x image_size input
    transforms.Resize(size=(image_size, image_size)),
    # Convert the PIL image to a tensor
    transforms.ToTensor(),
    # Normalize each RGB channel with mean 0.5 / std 0.5 (to [-1, 1])
    transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
])

# Function to filter out corrupt images (the dataset has a few)
def is_valid_image(path):
    """Return True if PIL can open and verify the file at ``path``.

    Args:
        path: Filesystem path of the image file to check.

    Returns:
        bool: True when ``Image.open`` and ``verify()`` both succeed,
        False if either of them raises.
    """
    try:
        # The context manager releases the file handle as soon as the check
        # finishes (including when verify() raises) instead of leaving it
        # open until garbage collection; this runs once per dataset file.
        with Image.open(path) as img:
            img.verify()
        return True
    except Exception:
        # Best-effort filter: any failure means "not a usable image".
        return False

# Build the labelled dataset from the class folders (0 = Cat, 1 = Dog)
full_dataset = ImageFolder(root=dataset_root, transform=transform)

# Keep only the indices of samples whose file passes the integrity check
valid_indices = [
    idx
    for idx, sample in enumerate(full_dataset.samples)
    if is_valid_image(sample[0])  # sample is a (path, label) pair
]
clean_dataset = Subset(full_dataset, valid_indices)

# Compute split sizes: 70% train, 15% validation, remainder (about 15%) test
total_size = len(clean_dataset)
train_size = int(0.7 * total_size)
val_size = int(0.15 * total_size)
test_size = total_size - train_size - val_size

# Random split into train, validation, and test sets.
# A seeded generator makes the partition identical on every run, so the
# validation/test sets stay held out across kernel restarts and the reported
# metrics can be reproduced.
split_seed = 42
train_dataset, val_dataset, test_dataset = random_split(
    clean_dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# Wrap each split in a DataLoader; only the training split is shuffled
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
    num_workers=num_workers,
)
val_loader = DataLoader(
    val_dataset,
    shuffle=False,
    batch_size=batch_size,
    num_workers=num_workers,
)
test_loader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=batch_size,
    num_workers=num_workers,
)

# Report how many images ended up in each partition
print(
    f"Total valid images: {total_size}",
    f"Training images: {len(train_dataset)}",
    f"Validation images: {len(val_dataset)}",
    f"Test images: {len(test_dataset)}",
    sep="\n",
)


Total valid images: 24998
Training images: 17498
Validation images: 3749
Test images: 3751


In [3]:
import timm
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm

# Prefer CUDA when a GPU is visible, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Instantiate the ViT base model from timm with its pretrained weights
model = timm.create_model("vit_base_patch16_224", pretrained=True)

# Swap in a fresh 2-way linear head (cat/dog) on top of the existing feature width
model.head = nn.Linear(in_features=model.head.in_features, out_features=2)
model.to(device)


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity(

In [4]:
# Loss function, optimizer and number of passes over the training data
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=3e-5)
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    train_loss, correct = 0.0, 0

    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for images, labels in progress:
        images = images.to(device)
        labels = labels.to(device)

        # Clear stale gradients, then forward pass, backward pass and update
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss and the count of correct predictions
        train_loss += loss.item()
        correct += (outputs.argmax(dim=1) == labels).sum().item()

    # Accuracy is per sample; the reported loss is the mean of per-batch losses
    train_acc = correct / len(train_loader.dataset)
    avg_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}, Train Accuracy: {train_acc:.4f}")


Epoch 1/5: 100%|██████████| 547/547 [10:13<00:00,  1.12s/it]


Epoch 1, Loss: 0.0344, Train Accuracy: 0.9883


Epoch 2/5: 100%|██████████| 547/547 [10:23<00:00,  1.14s/it]


Epoch 2, Loss: 0.0133, Train Accuracy: 0.9952


Epoch 3/5: 100%|██████████| 547/547 [10:23<00:00,  1.14s/it]


Epoch 3, Loss: 0.0128, Train Accuracy: 0.9958


Epoch 4/5: 100%|██████████| 547/547 [10:24<00:00,  1.14s/it]


Epoch 4, Loss: 0.0138, Train Accuracy: 0.9953


Epoch 5/5: 100%|██████████| 547/547 [10:23<00:00,  1.14s/it]

Epoch 5, Loss: 0.0087, Train Accuracy: 0.9968





In [5]:
def evaluate(model, dataloader, device=None):
    """Compute the classification accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode (and left in eval mode afterwards) and
    the forward passes run under ``torch.no_grad()``.

    Args:
        model: Module mapping a batch of inputs to per-class scores; the
            predicted class is the argmax over dim 1.
        dataloader: Iterable yielding ``(images, labels)`` tensor pairs.
        device: Device each batch is moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters), so
            the function does not rely on a notebook-global ``device``.

    Returns:
        float: Fraction of correctly classified samples, or 0.0 when the
        dataloader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    # An empty dataloader would otherwise raise ZeroDivisionError
    if total == 0:
        return 0.0
    return correct / total

# Score the trained model on the validation split, then on the test split
val_acc, test_acc = evaluate(model, val_loader), evaluate(model, test_loader)
print(
    f"Validation Accuracy: {val_acc:.4f}",
    f"Test Accuracy: {test_acc:.4f}",
    sep="\n",
)


Validation Accuracy: 0.9885
Test Accuracy: 0.9877


In [6]:
# Persist only the model's parameters (state dict) to the working directory
torch.save(obj=model.state_dict(), f="vit_model.pth")


In [10]:
import timm
import torch
import torch.nn as nn

# Rebuild the same architecture without fetching pretrained weights
model = timm.create_model("vit_base_patch16_224", pretrained=False)
model.head = nn.Linear(in_features=model.head.in_features, out_features=2)

# Restore the saved weights onto the CPU, then switch to inference mode
model.load_state_dict(
    torch.load("/kaggle/working/vit_model.pth", map_location="cpu")
)
model.eval()


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity(

In [11]:
# Move the restored model to the GPU when one is available, else keep it on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity(

In [12]:
from torchvision import transforms
from PIL import Image

# Define preprocessing transforms (must match training).
# Training normalized every channel with mean 0.5 / std 0.5, so inference uses
# the same statistics; ImageNet mean/std here would feed the model inputs on a
# different scale than the one it was fine-tuned on.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5],
                         [0.5, 0.5, 0.5])
])

# Predict function
def predict_image(image_path):
    """Classify a single image file as "Cat" or "Dog".

    Args:
        image_path: Path of the image file to open.

    Returns:
        str: "Cat" when the model's highest-scoring output index is 0,
        "Dog" when it is 1.
    """
    class_names = ("Cat", "Dog")

    # Load as RGB, preprocess, and add the leading batch dimension
    rgb_image = Image.open(image_path).convert("RGB")
    batch = transform(rgb_image).unsqueeze(0).to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        logits = model(batch)

    return class_names[logits.argmax(dim=1).item()]


In [14]:
# Run the classifier on one sample image and show the predicted label
image_path = "/kaggle/input/cat-image/360_F_236992283_sNOxCVQeFLd5pdqaKGh8DRGMZy7P4XKm.jpg"
print(predict_image(image_path=image_path))


Cat


## Deploying the model (Video)

https://buffalo.box.com/s/98e77uwrwmmhasnnorqpf4kvpdno3qzh

## Website (Deployed)

https://vit-model-ncfdvpkcxqm9zwqrrdjjz4.streamlit.app