# Fine-tuning of the Swin model and validation testing

In [2]:
# Import necessary libraries
import modal
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from PIL import Image
import io
from transformers import AutoImageProcessor
from transformers import AutoModelForImageClassification
from torchvision import transforms
from tqdm import tqdm

# Modal setup: the App object that the @stub.function-decorated training
# function further down is registered on.
stub = modal.App("swinv2-fairface-training")

# Define the image for Modal: a Debian-slim base with the pip packages listed
# below installed into it.
image = modal.Image.debian_slim().pip_install(
    "torch",
    "torchvision",
    "transformers",
    "pandas",
    "pillow",
    "tqdm",
    "pyarrow",
    "accelerate"
)

# Create volumes: `volume` is mounted at /root/data (parquet files are read
# from there) and `model_volume` at /root/models (checkpoints are written
# there) by the training function. create_if_missing=True presumably creates
# the named volume when it does not exist yet.
volume = modal.Volume.from_name("fairface-data", create_if_missing=True)
model_volume = modal.Volume.from_name("swinv2-models", create_if_missing=True)

In [3]:
from transformers import AutoImageProcessor, AutoModelForImageClassification
from accelerate import Accelerator
def create_swinv2_model(num_classes):
    """Load the pretrained SwinV2-base checkpoint with a `num_classes`-way head.

    Args:
        num_classes: Number of labels requested from the classification head.

    Returns:
        Whatever `AutoModelForImageClassification.from_pretrained` returns for
        the 'microsoft/swinv2-base-patch4-window16-256' checkpoint.
    """
    checkpoint = 'microsoft/swinv2-base-patch4-window16-256'
    # ignore_mismatched_sizes is passed so that a head whose size differs from
    # the checkpoint's does not abort the load.
    load_options = dict(num_labels=num_classes, ignore_mismatched_sizes=True)
    return AutoModelForImageClassification.from_pretrained(checkpoint, **load_options)

# Get processor and normalization values: `mean` and `std` are module-level
# names that FairFaceDataset's default transform passes to
# transforms.Normalize, so this cell must run before a dataset is built.
processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-base-patch4-window16-256")
mean = processor.image_mean
std = processor.image_std

preprocessor_config.json:   0%|          | 0.00/240 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
# Dataset class for FairFace
class FairFaceDataset(Dataset):
    """Map-style dataset over a FairFace parquet file.

    Each row is read as an encoded image under ``row['image']['bytes']`` and a
    label under ``row['race']``.
    """

    def __init__(self, parquet_file, transform=None):
        """Read the parquet file and choose the image transform.

        Args:
            parquet_file: Path handed to ``pd.read_parquet``.
            transform: Callable applied to each decoded PIL image. When it is
                omitted (or falsy), a default pipeline is used: resize to
                256x256, convert to tensor, normalize with the module-level
                ``mean`` and ``std``.
        """
        self.data = pd.read_parquet(parquet_file)
        if transform:
            self.transform = transform
        else:
            # Use processor's normalization
            default_steps = [
                transforms.Resize((256, 256)),
                transforms.ToTensor(),
                transforms.Normalize(mean=mean, std=std),
            ]
            self.transform = transforms.Compose(default_steps)

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        record = self.data.iloc[idx]
        encoded = io.BytesIO(record['image']['bytes'])
        picture = Image.open(encoded).convert('RGB')
        target = record['race']  # Adjust if your label column is different

        if self.transform:
            picture = self.transform(picture)

        return picture, target

def get_dataloaders(batch_size=32):
    """Build the FairFace training and validation loaders.

    Args:
        batch_size: Batch size used by both loaders.

    Returns:
        ``(train_loader, val_loader)``. The training loader shuffles, the
        validation loader does not; both use 4 worker processes.
    """
    train_set, val_set = (
        FairFaceDataset(parquet_path)
        for parquet_path in ("/root/data/train.parquet", "/root/data/validation.parquet")
    )

    shared_options = dict(batch_size=batch_size, num_workers=4)
    return (
        DataLoader(train_set, shuffle=True, **shared_options),
        DataLoader(val_set, shuffle=False, **shared_options),
    )

In [5]:

def train_epoch(model, train_loader, criterion, optimizer, accelerator, print_every=10):
    """Run one training pass over `train_loader`.

    Args:
        model: Callable whose result exposes a `.logits` attribute.
        train_loader: Sized iterable of `(images, labels)` batches.
        criterion: Loss callable taking `(logits, labels)`.
        optimizer: Optimizer stepped once per batch.
        accelerator: Object providing `backward(loss)` and `print(msg)`.
        print_every: Progress is reported every this many batches and on the
            last batch.

    Returns:
        `(mean_batch_loss, accuracy_percent)` for the epoch.
    """
    model.train()
    running_loss, num_correct, num_seen = 0, 0, 0

    for step, (images, labels) in enumerate(train_loader, start=1):
        logits = model(images).logits
        batch_loss = criterion(logits, labels)

        # Backward goes through the accelerator; gradients are cleared right
        # after the step so the next batch starts clean.
        accelerator.backward(batch_loss)
        optimizer.step()
        optimizer.zero_grad()

        running_loss += batch_loss.item()
        predicted = logits.max(1).indices
        num_seen += labels.size(0)
        num_correct += (predicted == labels).sum().item()

        if step % print_every == 0 or step == len(train_loader):
            progress = (
                f"Batch {step}/{len(train_loader)} - "
                f"Train Loss: {running_loss/step:.4f}, "
                f"Train Acc: {100.*num_correct/num_seen:.2f}%"
            )
            accelerator.print(progress)

    return running_loss / len(train_loader), 100. * num_correct / num_seen

def validate(model, val_loader, criterion, accelerator, print_every=10):
    """Evaluate `model` on `val_loader` with gradients disabled.

    Args:
        model: Callable whose result exposes a `.logits` attribute.
        val_loader: Sized iterable of `(images, labels)` batches.
        criterion: Loss callable taking `(logits, labels)`.
        accelerator: Object providing `print(msg)`.
        print_every: Progress is reported every this many batches and on the
            last batch.

    Returns:
        `(mean_batch_loss, accuracy_percent)` over the loader.
    """
    model.eval()
    loss_sum, hits, seen = 0, 0, 0

    with torch.no_grad():
        for step, (images, labels) in enumerate(val_loader, start=1):
            logits = model(images).logits
            batch_loss = criterion(logits, labels)

            loss_sum += batch_loss.item()
            predicted = logits.max(1).indices
            seen += labels.size(0)
            hits += (predicted == labels).sum().item()

            if step % print_every == 0 or step == len(val_loader):
                progress = (
                    f"Batch {step}/{len(val_loader)} - "
                    f"Val Loss: {loss_sum/step:.4f}, "
                    f"Val Acc: {100.*hits/seen:.2f}%"
                )
                accelerator.print(progress)

    return loss_sum / len(val_loader), 100. * hits / seen

In [7]:
@stub.function(
    image=image,
    gpu="A100",  # Use 1 A100 GPU
    volumes={"/root/data": volume, "/root/models": model_volume},
    timeout=14400
)
def train_swinv2_model(num_epochs=10, batch_size=16, learning_rate=2e-5):
    """Fine-tune SwinV2 on FairFace and checkpoint the best and final weights.

    Args:
        num_epochs: Number of passes over the training loader.
        batch_size: Batch size handed to `get_dataloaders`.
        learning_rate: Learning rate for the AdamW optimizer.

    Returns:
        The highest validation accuracy (in percent) seen across epochs.

    Side effects:
        Writes `/root/models/swinv2_fairface_best.pth` whenever validation
        accuracy improves, and `/root/models/swinv2_fairface_final.pth` after
        the last epoch.
    """
    accelerator = Accelerator()
    accelerator.print(f"Accelerator process {accelerator.process_index} of {accelerator.num_processes} on device {accelerator.device}")

    train_loader, val_loader = get_dataloaders(batch_size)
    num_classes = 7  # Adjust if your dataset has a different number of classes
    model = create_swinv2_model(num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

    # Prepare for distributed training (even if using 1 GPU, this is fine)
    model, optimizer, train_loader, val_loader = accelerator.prepare(
        model, optimizer, train_loader, val_loader
    )

    def save_checkpoint(path):
        # Save the *unwrapped* model's weights: if prepare() wrapped the model
        # (e.g. for distributed training), the wrapper's state_dict keys carry
        # an extra prefix that a plain `load_state_dict` on a fresh model
        # would reject.
        accelerator.wait_for_everyone()
        accelerator.save(accelerator.unwrap_model(model).state_dict(), path)

    best_val_acc = 0
    for epoch in range(num_epochs):
        accelerator.print(f"\nEpoch {epoch+1}/{num_epochs}")

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, accelerator)
        accelerator.print(f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%")

        val_loss, val_acc = validate(model, val_loader, criterion, accelerator)
        accelerator.print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.2f}%")

        # Keep only the checkpoint with the best validation accuracy so far.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            save_checkpoint("/root/models/swinv2_fairface_best.pth")
            accelerator.print(f"Saved new best model with validation accuracy: {val_acc:.2f}%")

    save_checkpoint("/root/models/swinv2_fairface_final.pth")
    accelerator.print("\nTraining completed!")
    return best_val_acc

In [9]:
# Run the training: the app is active for the duration of the `with` block,
# and `.remote(...)` invokes train_swinv2_model as a Modal function call. Its
# return value (the best validation accuracy, in percent) is printed locally.
with stub.run():
    best_acc = train_swinv2_model.remote(num_epochs=10, batch_size=16)
    print(f"Best validation accuracy: {best_acc:.2f}%")

To carry out validation separately for the base Swin model:

In [2]:
import modal
import torch
import pandas as pd
import numpy as np
from PIL import Image
import io
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForImageClassification, AutoImageProcessor
from tqdm import tqdm

# Separate Modal app for the standalone validation run below.
stub = modal.App("swin-pth-validation")

# Container image for the validation function: Debian-slim base with the pip
# packages listed below installed into it.
image = modal.Image.debian_slim().pip_install(
    "torch",
    "torchvision",
    "transformers",
    "pandas",
    "pillow",
    "pyarrow",
    "tqdm"
)

# Named volumes mounted by validate_swin_pth: the dataset at /data, the
# checkpoints at /models, and the prediction CSV destination at /output.
# Only the output volume is requested with create_if_missing=True.
data_volume = modal.Volume.from_name("fairface-data")
model_volume = modal.Volume.from_name("swinv2-models")
output_volume = modal.Volume.from_name("swin-validation-output", create_if_missing=True)

class FairFaceDataset(Dataset):
    """Map-style dataset over a FairFace parquet file.

    Each row is read as an encoded image under ``row['image']['bytes']`` and a
    label under ``row['race']``.
    """

    def __init__(self, parquet_file, transform=None):
        """Read the parquet file and choose the image transform.

        Args:
            parquet_file: Path handed to ``pd.read_parquet``.
            transform: Callable applied to each decoded PIL image. When it is
                omitted (or falsy), a default pipeline is built: resize to
                256x256, convert to tensor, normalize with the SwinV2 image
                processor's mean and std.
        """
        self.data = pd.read_parquet(parquet_file)
        if not transform:
            # Fetch the processor config only when the default pipeline is
            # actually needed; a caller-supplied transform never uses it.
            processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-base-patch4-window16-256")
            transform = transforms.Compose([
                transforms.Resize((256, 256)),
                transforms.ToTensor(),
                transforms.Normalize(mean=processor.image_mean, std=processor.image_std)
            ])
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        row = self.data.iloc[idx]
        image = Image.open(io.BytesIO(row['image']['bytes'])).convert('RGB')
        label = row['race']
        if self.transform:
            image = self.transform(image)
        return image, label

@stub.function(
    image=image,
    gpu="T4",
    timeout=1800,
    volumes={
        "/data": data_volume,
        "/models": model_volume,
        "/output": output_volume
    }
)
def validate_swin_pth(
    pth_model_path="/models/swinv2_fairface_best.pth",
    parquet_path="/data/validation.parquet",
    output_csv="/output/swin_val_predictions_base.csv",
    batch_size=32,
    device=None
):
    """Score a saved SwinV2 checkpoint on a FairFace parquet split.

    Args:
        pth_model_path: Path of the state dict to load into the model.
        parquet_path: Parquet file wrapped by `FairFaceDataset`.
        output_csv: Destination of the per-sample prediction table.
        batch_size: Batch size of the evaluation loader.
        device: Torch device string. When None (the default) it is resolved
            at call time: "cuda" if available, otherwise "cpu".

    Returns:
        Top-1 accuracy in percent (0.0 when the dataset is empty).

    Side effects:
        Writes one CSV row per sample to `output_csv` and prints a summary.
    """
    # Resolve the device when the function runs, not when it is defined: a
    # default expression in the signature is evaluated once at `def` time,
    # which may be on a machine without the GPU this function runs on.
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # NOTE(review): integer labels are used as indices into this list, so its
    # order must match the dataset's label encoding — confirm against the
    # parquet's label schema.
    fairface_classes = [
        "White", "Black", "Latino_Hispanic", "East Asian",
        "Southeast Asian", "Indian", "Middle Eastern"
    ]
    num_classes = len(fairface_classes)
    model = AutoModelForImageClassification.from_pretrained(
        'microsoft/swinv2-base-patch4-window16-256',
        num_labels=num_classes,
        ignore_mismatched_sizes=True
    )
    model.load_state_dict(torch.load(pth_model_path, map_location=device))
    model.eval()
    model = model.to(device)

    val_dataset = FairFaceDataset(parquet_path)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    top_k = min(5, num_classes)
    correct = 0
    total = 0
    all_predictions = []

    with torch.no_grad():
        for images, labels in tqdm(val_loader):
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images).logits
            probs = torch.softmax(outputs, dim=1)
            top5_probs, top5_preds = probs.topk(top_k, dim=1)
            confidence, predicted = probs.max(1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Move each result tensor to the host once per batch instead of
            # issuing a separate `.item()` device sync for every sample.
            label_ids = labels.cpu().tolist()
            predicted_ids = predicted.cpu().tolist()
            confidences = confidence.cpu().tolist()
            top5_ids = top5_preds.cpu().tolist()
            top5_values = top5_probs.cpu().tolist()

            for true_id, pred_id, conf, ids_row, probs_row in zip(
                label_ids, predicted_ids, confidences, top5_ids, top5_values
            ):
                all_predictions.append({
                    'true_label': fairface_classes[true_id],
                    'predicted_label': fairface_classes[pred_id],
                    'confidence': conf,
                    'top1_pred': fairface_classes[ids_row[0]],
                    'top1_prob': probs_row[0],
                    'top5_preds': [fairface_classes[idx] for idx in ids_row],
                    'top5_probs': probs_row
                })

    # Guard the empty-dataset case rather than dividing by zero.
    accuracy = 100. * correct / total if total else 0.0
    results_df = pd.DataFrame(all_predictions)
    results_df.to_csv(output_csv, index=False)
    print(f"Validation Accuracy: {accuracy:.2f}%")
    print(f"Predictions saved to {output_csv}")
    return accuracy

# Run validation: the app is active for the duration of the `with` block, and
# `.remote()` invokes validate_swin_pth with its default arguments. The
# returned accuracy (in percent) is printed locally.
with stub.run():
    acc = validate_swin_pth.remote()
    print(f"Final Validation Accuracy: {acc:.2f}%")

Final Validation Accuracy: 71.94%
