# Efficient Adaptation and Analysis of Vision Transformers using LoRA

In [3]:
import torch
import sys

print("=== PyTorch Environment Test ===")
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

    # Test GPU tensor
    x = torch.randn(3, 3).cuda()
    print(f"\nGPU tensor created: {x.device}")
    print(f"Tensor shape: {x.shape}")
else:
    print("CUDA not available - using CPU")
    x = torch.randn(3, 3)
    print(f"CPU tensor created: {x.device}")

print("\n✅ PyTorch test completed!")

=== PyTorch Environment Test ===
Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
PyTorch version: 2.8.0+cu126
CUDA available: True
CUDA version: 12.6
Number of GPUs: 1
GPU name: Tesla T4

GPU tensor created: cuda:0
Tensor shape: torch.Size([3, 3])

✅ PyTorch test completed!


#### Downloading the data

In [4]:
import torchvision
import torchvision.transforms as transforms

# Define a transform to normalize the data
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Load the CIFAR-100 training dataset
trainset = torchvision.datasets.CIFAR100(root='./data', train=True,
                                        download=True, transform=transform)

# Load the CIFAR-100 test dataset
testset = torchvision.datasets.CIFAR100(root='./data', train=False,
                                       download=True, transform=transform)

print("CIFAR-100 dataset imported successfully.")
print(f"Training set size: {len(trainset)}")
print(f"Test set size: {len(testset)}")

CIFAR-100 dataset imported successfully.
Training set size: 50000
Test set size: 10000


#### Resizing the data for ViT model


In [5]:
import torchvision.transforms as transforms

# Define transforms for training and validation/testing
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Upsample to ViT resolution
    transforms.RandomHorizontalFlip(), # Example data augmentation
    transforms.RandomCrop(224, padding=4), # Example data augmentation
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

test_transform = transforms.Compose([
    transforms.Resize((224, 224)), # Upsample to ViT resolution
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Apply the transforms to the datasets
trainset.transform = train_transform
testset.transform = test_transform

print("Data preparation complete. Transforms applied to datasets.")

Data preparation complete. Transforms applied to datasets.


#### Loading the VIT and freezing the parameters

In [6]:
from transformers import ViTForImageClassification

# Load a pre-trained Vision Transformer model
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', num_labels=100, ignore_mismatched_sizes=True)

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False # This freezes the parameters

print("Pre-trained ViT model loaded and parameters frozen.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([100]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([100, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pre-trained ViT model loaded and parameters frozen.


In [7]:
# %pip install peft transformers datasets

In [8]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
config = LoraConfig(
    r=16, # Rank of the update matrices.
    lora_alpha=16, # Scaling factor for the LoRA update.
    target_modules=["query", "value"], # Modules to apply LoRA to.
    lora_dropout=0.1, # Dropout probability for LoRA layers.
    bias="none", # Bias type.
)

# Get the LoRA-infused model
model = get_peft_model(model, config)

# Print trainable parameters
model.print_trainable_parameters()

print("LoRA adapters integrated into the model.")

trainable params: 589,824 || all params: 86,465,380 || trainable%: 0.6822
LoRA adapters integrated into the model.


In [9]:
# You need to reinstall DeepSpeed and force it to compile this special CPU Adam extension.

# # # Uninstall the old version first
# !pip uninstall deepspeed -y

# # Re-install with the build flag for CPUAdam
# !DS_BUILD_CPU_ADAM=1 pip install deepspeed


Next, we need to create a DeepSpeed configuration file. This is typically a JSON file that specifies the various optimization settings for DeepSpeed. Here's an example configuration for mixed precision training and ZeRO Stage 2 optimization, which is often used for memory efficiency.

You can save this configuration to a file named `deepspeed_config.json`.

In [15]:
%%writefile deepspeed_config.json
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        }
    },
    "zero_force_ds_cpu_optimizer": false,
    "train_batch_size": 16,
    "train_micro_batch_size_per_gpu": 16,
    "gradient_accumulation_steps": 1,
    "gradient_clipping": 1.0,
    "steps_per_print": 200
}

Overwriting deepspeed_config.json


## Step 1: Create the DataLoaders

In [11]:
from torch.utils.data import DataLoader

# Create DataLoaders
train_loader = DataLoader(trainset, batch_size=16, shuffle=True)
test_loader = DataLoader(testset, batch_size=16, shuffle=False)

print("DataLoaders created.")

DataLoaders created.


## Step 2: Enable Gradient Checkpointing

In [12]:
# Enable gradient checkpointing
model.gradient_checkpointing_enable()
print("Gradient checkpointing enabled.")

Gradient checkpointing enabled.


## Step 3: Initialize DeepSpeed

In [13]:
%pip install mpi4py



In [16]:
import torch.optim as optim
import deepspeed

# 1. Manually create the standard PyTorch optimizer
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# 2. Initialize DeepSpeed, passing the optimizer you just created
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    optimizer=optimizer,  # Pass the optimizer here
    config_params='deepspeed_config.json'
)

print("DeepSpeed engine initialized with PyTorch AdamW (forced).")

DeepSpeed engine initialized with PyTorch AdamW (forced).


## Step 1: Write the Training Script

In [23]:
%%writefile train.py
import torch
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from transformers import ViTForImageClassification
from peft import LoraConfig, get_peft_model
import deepspeed
from sklearn.metrics import confusion_matrix
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

print("--- Initializing Training Script ---")

# --- 1. Data Prep ---
print("Setting up data transformations...")
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(224, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

print("Loading CIFAR-100 dataset...")
trainset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=train_transform)
testset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=test_transform)

train_loader = DataLoader(trainset, batch_size=16, shuffle=True, num_workers=2)
test_loader = DataLoader(testset, batch_size=16, shuffle=False, num_workers=2)
print("DataLoaders created.")

# --- 2. Model Setup ---
print("Loading pre-trained ViT model...")
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', num_labels=100, ignore_mismatched_sizes=True)

# Freeze all parameters first
for param in model.parameters():
    param.requires_grad = False

# --- 3. LoRA Setup ---
print("Applying LoRA adapters...")
config = LoraConfig(
    r=16, lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.1, bias="none",
)
model = get_peft_model(model, config)
print("LoRA adapters applied.")

# --- !! CORRECTED ORDER: Unfreeze classifier AFTER LoRA !! ---
print("Unfreezing classification head...")
for param in model.classifier.parameters():
    param.requires_grad = True
# --- End of Fix ---

print("New trainable parameters:")
model.print_trainable_parameters()

# --- 4. Gradient Checkpointing (Task 8) ---
print("Enabling gradient checkpointing...")
model.gradient_checkpointing_enable()

# --- 5. DeepSpeed Initialization (Task 8) ---
print("Initializing DeepSpeed...")
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    optimizer=optimizer,
    config_params='deepspeed_config.json'
)
print("DeepSpeed engine initialized successfully.")

# --- 6. Training Loop (Task 9) ---
device = model_engine.device
num_epochs = 3 # Start with 3-5 epochs to test

print(f"--- Starting training for {num_epochs} epochs ---")
for epoch in range(num_epochs):
    model_engine.train()
    total_loss = 0
    for i, batch in enumerate(train_loader):
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model_engine(inputs, labels=labels)
        loss = outputs.loss

        model_engine.backward(loss)
        model_engine.step()

        total_loss += loss.item()
        if i % 100 == 0:
            print(f"  Epoch {epoch+1}, Step {i}: Loss = {loss.item():.4f}")

    print(f"**Epoch {epoch+1}/{num_epochs} - Avg. Training Loss: {total_loss / len(train_loader):.4f}**")

print("--- Training complete ---")

# --- 7. Evaluation (Task 11) ---
print("--- Starting evaluation ---")
model_engine.eval()
correct = 0
total = 0
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model_engine(inputs)

        _, predicted = torch.max(outputs.logits.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = 100 * correct / total
print(f"**Final Test Accuracy: {accuracy:.2f}%**")

# --- 8. Confusion Matrix (Task 11) ---
print("Generating confusion matrix...")
cm = confusion_matrix(all_labels, all_preds)
print("Confusion Matrix (first 10x10):")
print(cm[:10, :10])

Overwriting train.py


In [24]:
# Install libraries for the script, just in case
!pip install deepspeed scikit-learn



In [25]:
# Launch the training script with DeepSpeed
# !deepspeed --num_gpus=1 train.py
# We're just adding --master_port 29501 to pick a new, free port
!deepspeed --num_gpus=1 --master_port 29501 train.py

2025-10-19 21:49:20.411098: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760910560.431753   30106 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760910560.437907   30106 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1760910560.453609   30106 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760910560.453637   30106 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760910560.453640   30106 computation_placer.cc:177] computation placer alr