<h1>Tiny ImageNet</h1>

<h2>BEiT</h2>

In [1]:
import torch
import time
import torch.nn as nn
from torchvision.models import resnet18
import os
from torchvision.datasets import ImageFolder
from torchvision.transforms import Compose, Resize, ToTensor, Normalize, RandomHorizontalFlip
from torch.utils.data import DataLoader
import torch.nn.functional as F
        
def save_checkpoint(student_model, optimizer, epoch, loss, checkpoint_dir, prefix):
    if checkpoint_dir and prefix:
        checkpoint_path = os.path.join(checkpoint_dir, f"{prefix}{epoch + 1}.pt")
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': student_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss
        }, checkpoint_path)
        print(f"Checkpoint saved at {checkpoint_path}")

def load_checkpoint(checkpoint_dir, prefix, student_model, optimizer):
    start_epoch = 0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    student_model.to(device)

    if checkpoint_dir and os.path.exists(checkpoint_dir):
        checkpoint_files = [
            f for f in os.listdir(checkpoint_dir) if f.startswith(prefix) and f.endswith(".pt")
        ]
        if checkpoint_files:
            latest_checkpoint = max(
                checkpoint_files,
                key=lambda x: int(x[len(prefix):-3])
            )
            checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)
            print(f"Loading checkpoint from {checkpoint_path}...")
            checkpoint = torch.load(checkpoint_path)
            student_model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            start_epoch = checkpoint['epoch']
            print(f"Resuming training from epoch {start_epoch}.")
    return start_epoch

def compute_model_size(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad) * 4 / (1024 ** 2)

def init_tiny_imagenet_data(data_dir, batch_size=32):
    transform = Compose([
        Resize((224, 224)),
        RandomHorizontalFlip(),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    train_dir = os.path.join(data_dir, "train")
    val_dir = os.path.join(data_dir, "val/images")

    train_dataset = ImageFolder(root=train_dir, transform=transform)
    val_dataset = ImageFolder(root=val_dir, transform=transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    return train_loader, val_loader

# Initialize data loaders
print("Initializing Tiny ImageNet data loaders...")
tiny_imagenet_dir = "./data/tiny-imagenet-200"
train_loader, val_loader = init_tiny_imagenet_data(tiny_imagenet_dir)

Initializing Tiny ImageNet data loaders...


In [2]:
def train_student_model(teacher_model, student_model, train_loader, optimizer, num_epochs=5, checkpoint_dir=None, prefix=None):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    student_model.to(device)
    teacher_model.to(device)
    teacher_model.eval()
    student_model.train()

    # Load checkpoint if available
    start_epoch = 0
    if checkpoint_dir and prefix:
        start_epoch = load_checkpoint(checkpoint_dir, prefix, student_model, optimizer)

    for epoch in range(start_epoch, num_epochs):
        print(f"Training epoch {epoch + 1}/{num_epochs}...")
        epoch_loss = 0.0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            # Teacher model outputs
            with torch.no_grad():
                teacher_outputs = teacher_model(images)
                # Use pooler_output or global average pooling for classification
                teacher_logits = teacher_outputs.pooler_output

            # Student model outputs
            student_logits = student_model(images)

            # Compute loss
            loss = F.cross_entropy(student_logits, labels)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f"Epoch {epoch + 1}/{num_epochs} Loss: {epoch_loss / len(train_loader):.4f}")

        # Save checkpoint
        if checkpoint_dir and prefix:
            save_checkpoint(student_model, optimizer, epoch, epoch_loss / len(train_loader), checkpoint_dir, prefix)

    print("Training complete.")

    print(f"Student Model Size: {compute_model_size(student_model):.2f} MB")

In [3]:
def evaluate_student_model(student_model, teacher_model, val_loader, checkpoint_dir=None, prefix=None):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    student_model.to(device)
    teacher_model.to(device)
    student_model.eval()
    teacher_model.eval()

    # Load from checkpoint if specified
    if checkpoint_dir and prefix:
        print(f"Searching for checkpoints in {checkpoint_dir}...")
        checkpoint_files = [
            f for f in os.listdir(checkpoint_dir) if f.startswith(prefix) and f.endswith(".pt")
        ]
        if checkpoint_files:
            latest_checkpoint = max(
                checkpoint_files,
                key=lambda x: int(x[len(prefix):-3])
            )
            checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)
            print(f"Loading checkpoint from {checkpoint_path}...")
            checkpoint = torch.load(checkpoint_path)
            student_model.load_state_dict(checkpoint['model_state_dict'])

    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)

            # Student model outputs
            student_logits = student_model(images)
            predictions = torch.argmax(student_logits, dim=-1)

            all_predictions.extend(predictions.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_predictions)
    print(f"Evaluation Accuracy: {accuracy:.4f}")


<h4>5 epochs</h4>

In [5]:
from torchvision.models import resnet18
from transformers import BeitModel

# Initialize BEiT Teacher Model
print("Initializing BEiT Teacher Model...")
beit_teacher_model = BeitModel.from_pretrained("microsoft/beit-base-patch16-224")

# Initialize Student Model
print("Initializing ResNet Student Model...")
num_classes = 200
student_model = resnet18(pretrained=True)
student_model.fc = torch.nn.Linear(student_model.fc.in_features, num_classes)

prefix = "resnet_beit_epoch_"
checkpoint_dir = "./checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Optimizer
optimizer = torch.optim.AdamW(student_model.parameters(), lr=5e-5)

# Training
print("Training ResNet Student Model...")
train_student_model(beit_teacher_model, student_model, train_loader, optimizer, 5, checkpoint_dir, prefix)

# Evaluation
print("Evaluating ResNet Student Model...")
evaluate_student_model(student_model, beit_teacher_model, val_loader, checkpoint_dir, prefix)


Initializing BEiT Teacher Model...
Initializing ResNet Student Model...




Training ResNet Student Model...
Loading checkpoint from ./checkpoints/resnet_beit_epoch_4.pt...


  checkpoint = torch.load(checkpoint_path)


Resuming training from epoch 4.
Training epoch 5/5...
Epoch 5/5 Loss: 0.5559
Checkpoint saved at ./checkpoints/resnet_beit_epoch_5.pt
Training complete.
Student Model Size: 43.03 MB
Evaluating ResNet Student Model...
Searching for checkpoints in ./checkpoints...
Loading checkpoint from ./checkpoints/resnet_beit_epoch_5.pt...


  checkpoint = torch.load(checkpoint_path)


Evaluation Accuracy: 0.7150


<h4>10 epochs</h4>

In [4]:
from torchvision.models import resnet18
from transformers import BeitModel

# Initialize BEiT Teacher Model
print("Initializing BEiT Teacher Model...")
beit_teacher_model = BeitModel.from_pretrained("microsoft/beit-base-patch16-224")

# Initialize Student Model
print("Initializing ResNet Student Model...")
num_classes = 200
student_model = resnet18(pretrained=True)
student_model.fc = torch.nn.Linear(student_model.fc.in_features, num_classes)

prefix = "resnet_beit_epoch_"
checkpoint_dir = "./checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Optimizer
optimizer = torch.optim.AdamW(student_model.parameters(), lr=5e-5)

# Training
print("Training ResNet Student Model...")
train_student_model(beit_teacher_model, student_model, train_loader, optimizer, 10, checkpoint_dir, prefix)

# Evaluation
print("Evaluating ResNet Student Model...")
evaluate_student_model(student_model, beit_teacher_model, val_loader, checkpoint_dir, prefix)

Initializing BEiT Teacher Model...
Initializing ResNet Student Model...




Training ResNet Student Model...
Loading checkpoint from ./checkpoints/resnet_beit_epoch_8.pt...


  checkpoint = torch.load(checkpoint_path)


Resuming training from epoch 8.
Training epoch 9/10...
Epoch 9/10 Loss: 0.1931
Checkpoint saved at ./checkpoints/resnet_beit_epoch_9.pt
Training epoch 10/10...
Epoch 10/10 Loss: 0.1573
Checkpoint saved at ./checkpoints/resnet_beit_epoch_10.pt
Training complete.
Student Model Size: 43.03 MB
Evaluating ResNet Student Model...
Searching for checkpoints in ./checkpoints...
Loading checkpoint from ./checkpoints/resnet_beit_epoch_10.pt...


  checkpoint = torch.load(checkpoint_path)


NameError: name 'accuracy_score' is not defined

In [5]:
from sklearn.metrics import accuracy_score
# Evaluation
print("Evaluating ResNet Student Model...")
evaluate_student_model(student_model, beit_teacher_model, val_loader, checkpoint_dir, prefix)

Evaluating ResNet Student Model...
Searching for checkpoints in ./checkpoints...
Loading checkpoint from ./checkpoints/resnet_beit_epoch_10.pt...


  checkpoint = torch.load(checkpoint_path)


Evaluation Accuracy: 0.7011


<h2>DINO</h2>

<h4>5 epochs</h4>

In [4]:
import torch
import time
import torch.nn as nn
from torchvision.models import resnet18, vit_b_16, ViT_B_16_Weights
from torchvision.datasets import ImageFolder
from torchvision.transforms import Compose, Resize, ToTensor, Normalize, RandomHorizontalFlip
from torch.utils.data import DataLoader
import os
import torch.nn.functional as F
from sklearn.metrics import accuracy_score


# Utility Functions
def save_checkpoint(student_model, optimizer, epoch, loss, checkpoint_dir, prefix):
    if checkpoint_dir and prefix:
        checkpoint_path = os.path.join(checkpoint_dir, f"{prefix}{epoch + 1}.pt")
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': student_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss
        }, checkpoint_path)
        print(f"Checkpoint saved at {checkpoint_path}")


def load_checkpoint(checkpoint_dir, prefix, student_model, optimizer):
    start_epoch = 0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    student_model.to(device)

    if checkpoint_dir and os.path.exists(checkpoint_dir):
        checkpoint_files = [
            f for f in os.listdir(checkpoint_dir) if f.startswith(prefix) and f.endswith(".pt")
        ]
        if checkpoint_files:
            latest_checkpoint = max(
                checkpoint_files,
                key=lambda x: int(x[len(prefix):-3])
            )
            checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)
            print(f"Loading checkpoint from {checkpoint_path}...")
            checkpoint = torch.load(checkpoint_path)
            student_model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            start_epoch = checkpoint['epoch']
            print(f"Resuming training from epoch {start_epoch}.")
    return start_epoch


def compute_model_size(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad) * 4 / (1024 ** 2)


def init_tiny_imagenet_data(data_dir, batch_size=32):
    transform = Compose([
        Resize((384, 384)),  # Match DINO's input size
        RandomHorizontalFlip(),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    train_dir = os.path.join(data_dir, "train")
    val_dir = os.path.join(data_dir, "val/images")

    train_dataset = ImageFolder(root=train_dir, transform=transform)
    val_dataset = ImageFolder(root=val_dir, transform=transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    return train_loader, val_loader

def train_student_model(teacher_model, student_model, train_loader, optimizer, num_epochs=5, checkpoint_dir=None, prefix=None):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    student_model.to(device)
    teacher_model.to(device)
    teacher_model.eval()
    student_model.train()

    start_epoch = 0
    if checkpoint_dir and prefix:
        start_epoch = load_checkpoint(checkpoint_dir, prefix, student_model, optimizer)

    for epoch in range(start_epoch, num_epochs):
        print(f"Training epoch {epoch + 1}/{num_epochs}...")
        epoch_loss = 0.0
        start_time = time.time()

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            # Teacher outputs
            with torch.no_grad():
                teacher_features = teacher_model(images)  # DINO ViT outputs [batch_size, hidden_dim]
                teacher_logits = teacher_features.mean(dim=1)  # Global average pooling for class representation

            # Student outputs
            student_logits = student_model(images)

            # Classification loss
            loss = nn.CrossEntropyLoss()(student_logits, labels)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        end_time = time.time()
        print(f"Epoch {epoch + 1}/{num_epochs} Loss: {epoch_loss / len(train_loader):.4f}")
        print(f"Epoch {epoch + 1} completed in {end_time - start_time:.2f} seconds.")

        if checkpoint_dir and prefix:
            save_checkpoint(student_model, optimizer, epoch, epoch_loss / len(train_loader), checkpoint_dir, prefix)

    print("Training complete.")
    print(f"Student Model Size: {compute_model_size(student_model):.2f} MB")

# Evaluation Function
def evaluate_student_model(student_model, val_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    student_model.to(device)
    student_model.eval()

    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            logits = student_model(images)
            predictions = torch.argmax(logits, dim=-1)

            all_predictions.extend(predictions.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    accuracy = accuracy_score(all_labels, all_predictions)
    print(f"Evaluation Accuracy: {accuracy:.4f}")


# Main Code
print("Initializing Tiny ImageNet data loaders...")
tiny_imagenet_dir = "./data/tiny-imagenet-200"
train_loader, val_loader = init_tiny_imagenet_data(tiny_imagenet_dir, batch_size=32)

# Initialize Teacher and Student Models
print("Initializing DINO Teacher Model...")
dino_weights = ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1
dino_teacher_model = vit_b_16(weights=dino_weights).eval()
dino_teacher_model = torch.hub.load('facebookresearch/dino:main', 'dino_vitb16')

print("Initializing ResNet Student Model...")
student_model = resnet18(pretrained=True)
student_model.fc = nn.Linear(student_model.fc.in_features, 200)

# Optimizer
optimizer = torch.optim.AdamW(student_model.parameters(), lr=5e-5)

# Training
checkpoint_dir = "./checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)
prefix = "resnet_dino_epoch_"

print("Training Student Model...")
train_student_model(dino_teacher_model, student_model, train_loader, optimizer, num_epochs=5, checkpoint_dir=checkpoint_dir, prefix=prefix)

# Evaluation
print("Evaluating Student Model...")
evaluate_student_model(student_model, val_loader)

Initializing Tiny ImageNet data loaders...
The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
Initializing DINO Teacher Model...


Using cache found in /home/yx3493/.cache/torch/hub/facebookresearch_dino_main


Initializing ResNet Student Model...
Training Student Model...


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


<h4>10 epochs</h4>

In [None]:
print("Training Student Model...")
train_student_model(dino_teacher_model, student_model, train_loader, optimizer, num_epochs=10, checkpoint_dir=checkpoint_dir, prefix=prefix)

# Evaluation
print("Evaluating Student Model...")
evaluate_student_model(student_model, val_loader)