In [None]:
# Install necessary libraries
!pip install torch torchvision timm



In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import timm  # Library for Vision Transformers

In [None]:
# ✅ Step 1: Build the teacher (ViT-Base) and the smaller student (ViT-Tiny)

# Teacher: pretrained weights, 10-way classification head.
# NOTE(review): presumably timm re-initialises the head when `num_classes`
# differs from the checkpoint, and nothing in this notebook fine-tunes the
# teacher — confirm its logits are meaningful before distilling from them.
teacher_model = timm.create_model(
    'vit_base_patch16_224',
    num_classes=10,
    pretrained=True,
)

# Student: randomly initialised, configured for 128x128 inputs.
student_model = timm.create_model(
    'vit_tiny_patch16_224',
    img_size=128,
    num_classes=10,
    pretrained=False,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# ✅ Step 2: CIFAR-10 at reduced (128x128) resolution

# One preprocessing pipeline shared by both splits:
# resize -> tensor in [0, 1] -> per-channel (x - 0.5) / 0.5.
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# Training split: reshuffled every epoch.
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Held-out split: fixed order.
test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    transform=transform,
    download=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

Files already downloaded and verified




Files already downloaded and verified


In [None]:
# ✅ Step 3: Define Distillation Loss
def distillation_loss(student_logits, teacher_logits, labels, temperature=2.0, alpha=0.5):
    """Blend a soft-target KL term with the hard-label cross-entropy.

    Args:
        student_logits: raw student scores; softmax is taken over dim 1.
        teacher_logits: raw teacher scores, laid out like ``student_logits``.
        labels: class targets handed to ``cross_entropy``.
        temperature: divisor applied to both sets of logits before softmax.
        alpha: weight of the distillation term; the hard-label term gets
            ``1 - alpha``.

    Returns:
        Scalar tensor ``alpha * KL * temperature**2 + (1 - alpha) * CE``.
    """
    F = nn.functional

    # Temperature-softened distributions: probabilities for the teacher
    # (the KL target) and log-probabilities for the student (the KL input).
    teacher_targets = F.softmax(teacher_logits / temperature, dim=1)
    student_log_probs = F.log_softmax(student_logits / temperature, dim=1)

    # The temperature**2 factor rescales the soft term after the logits were
    # divided by `temperature`.
    soft_term = F.kl_div(student_log_probs, teacher_targets, reduction='batchmean')
    soft_term = soft_term * (temperature ** 2)

    # Ordinary supervised loss on the un-softened student logits.
    hard_term = F.cross_entropy(student_logits, labels)

    return alpha * soft_term + (1 - alpha) * hard_term

In [None]:
# ✅ Step 4: Enable Mixed Precision Training
scaler = torch.cuda.amp.GradScaler()

  scaler = torch.cuda.amp.GradScaler()


In [None]:
# ✅ Step 5: Training Loop
def train_student(teacher_model, student_model, train_loader, optimizer, temperature=2.0, alpha=0.5, num_epochs=5):
    """Train the student by distilling from a frozen teacher.

    For every batch the teacher is run without gradients on a 224x224 bicubic
    upsampling of the inputs, the student is run on the inputs as given, and
    the two sets of logits are combined by ``distillation_loss``. Gradient
    scaling uses the module-level ``scaler``.

    Args:
        teacher_model: network providing the soft targets; put in eval mode.
        student_model: network being optimised; put in train mode.
        train_loader: iterable of ``(inputs, labels)`` batches.
        optimizer: optimizer holding the student's parameters.
        temperature: forwarded to ``distillation_loss``.
        alpha: forwarded to ``distillation_loss``.
        num_epochs: number of full passes over ``train_loader``.

    Returns:
        list[float]: mean per-sample loss of each epoch, in order.
    """
    teacher_model.eval()
    student_model.train()

    # Use the device the student's weights live on instead of a notebook-level
    # global, so the function does not depend on cell execution order.
    first_param = next(student_model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Mixed precision is only enabled on CUDA, matching the CUDA-only autocast
    # context this loop used before.
    use_amp = device.type == 'cuda'
    autocast_device = 'cuda' if use_amp else 'cpu'

    epoch_losses = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        samples_seen = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            batch_size = inputs.size(0)

            optimizer.zero_grad()
            with torch.autocast(device_type=autocast_device, enabled=use_amp):
                with torch.no_grad():
                    # The teacher is always fed 224x224 images, whatever the
                    # resolution of the batch.
                    teacher_input = nn.functional.interpolate(
                        inputs, size=(224, 224), mode='bicubic', align_corners=False
                    )
                    teacher_outputs = teacher_model(teacher_input)
                student_outputs = student_model(inputs)
                loss = distillation_loss(student_outputs, teacher_outputs, labels, temperature, alpha)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item() * batch_size
            samples_seen += batch_size

        # Average over the samples actually processed; this stays correct for
        # loaders that drop the last batch or expose no `.dataset`.
        epoch_loss = running_loss / max(samples_seen, 1)
        epoch_losses.append(epoch_loss)
        print(f'Epoch [{epoch + 1}/{num_epochs}] loss: {epoch_loss:.4f}')

    return epoch_losses

In [None]:
# ✅ Step 6: Evaluate Student Model
def evaluate_model(model, test_loader):
    """Measure, print and return the top-1 accuracy of ``model``.

    Args:
        model: network to evaluate; it is switched to eval mode and left there.
        test_loader: iterable of ``(inputs, labels)`` batches.

    Returns:
        float: percentage (0-100) of samples whose arg-max prediction equals
        the label.

    Raises:
        ValueError: if ``test_loader`` yields no samples.
    """
    model.eval()

    # Run on whatever device the model's weights are on (CPU for a
    # parameter-free model) rather than relying on a notebook-level global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            predicted = model(inputs).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError('test_loader yielded no samples; accuracy is undefined')

    accuracy = 100 * correct / total
    print(f'Accuracy of the model on the test images: {accuracy:.2f}%')
    return accuracy

In [None]:
# ✅ Step 7: Train and Evaluate
# Use the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move both networks onto that device.
student_model = student_model.to(device)
teacher_model = teacher_model.to(device)

# Only the student's parameters are handed to the optimizer.
optimizer = optim.Adam(student_model.parameters(), lr=1e-3)

In [None]:
# Train: distil the teacher into the student.
# The trailing ';' stops the cell from echoing any return value.
train_student(
    teacher_model,
    student_model,
    train_loader,
    optimizer,
    temperature=2.0,
    alpha=0.5,
    num_epochs=5,
);

  with torch.cuda.amp.autocast():  # Enable mixed precision


In [None]:
# Evaluate: report the student's accuracy on the test loader.
# The trailing ';' stops the cell from echoing any return value.
evaluate_model(model=student_model, test_loader=test_loader);

Accuracy of the model on the test images: 48.69%
