### Loading AI2D dataset from Hugging Face

https://huggingface.co/datasets/lmms-lab/ai2d

In [1]:
from datasets import load_dataset

ds = load_dataset("lmms-lab/ai2d")

In [2]:
print(ds)
print(ds['test'][8])  

DatasetDict({
    test: Dataset({
        features: ['question', 'options', 'answer', 'image'],
        num_rows: 3088
    })
})
{'question': 'What event will probably cause the elephant seal population to increase?', 'options': ['an increase in whales', 'a decrease in penguins', 'a decrease in birds', 'an increase in fish'], 'answer': '3', 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=576x396 at 0x7E8230E65A20>}


In [3]:
ds['test'][8]['options'][int(ds['test'][8]['answer'])-1]

'a decrease in birds'

### Load CLIP model and processor

In [4]:
from transformers import CLIPProcessor, CLIPModel
from datasets import Dataset

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")



### Preprocessing the dataset

In [5]:
def preprocess_example(example):
    image = example['image']
    
    question = example['question']
    options = " ".join(example['options'])  
    
    inputs = processor(
        images=image, 
        text=question + " " + options, 
        return_tensors="pt", 
        padding=True,
        truncation=True,  
        max_length=77  
    )
    
    label = int(example['answer']) 
    
    return {
        'pixel_values': inputs['pixel_values'][0], 
        'input_ids': inputs['input_ids'][0],        
        'attention_mask': inputs['attention_mask'][0], 
        'labels': label  
    }

In [6]:
preprocessed_dataset = ds['test'].map(preprocess_example, batched=False)

### Finetuning the CLIP model with AI2D dataset

In [7]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Reduce batch size to lower memory usage
    evaluation_strategy="epoch",
    fp16=True,  # Enable mixed precision training if supported
    logging_dir="./logs",
    save_total_limit=2,  # Limit the number of checkpoints to save
)



### Padding

In [8]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=processor.tokenizer, model=model)

### We are creating a custom model for the dataset 

In [9]:
import torch
import torch.nn.functional as F

# Define contrastive loss function
def contrastive_loss(logits_per_image, logits_per_text):
    # We use the cosine similarity to compute the loss
    labels = torch.arange(logits_per_image.size(0), device=logits_per_image.device)
    loss_img = F.cross_entropy(logits_per_image, labels)
    loss_txt = F.cross_entropy(logits_per_text, labels)
    return (loss_img + loss_txt) / 2


In [10]:
from transformers import Trainer

class CustomTrainer(Trainer):
    def training_step(self, model, inputs):
        # Get the image and text inputs
        pixel_values = inputs['pixel_values']
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        
        # Extract logits for the image and text
        logits_per_image = outputs.logits_per_image
        logits_per_text = outputs.logits_per_text
        
        # Calculate loss
        loss = contrastive_loss(logits_per_image, logits_per_text)
        
        return loss


In [11]:
import torch
import gc

def clear_memory():
    gc.collect()
    torch.cuda.empty_cache()

clear_memory()

per_device_train_batch_size=training_args.per_device_train_batch_size // 2  # or set to a small value, e.g., 2
gradient_accumulation_steps=4  # Adjust based on memory limits

In [None]:
try:
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=preprocessed_dataset,
        data_collator=data_collator
    )
    trainer.train()
except RuntimeError as e:
    if "CUDA out of memory" in str(e):
        print("Caught CUDA out of memory error. Reducing batch size or using CPU.")
        clear_memory()  # Clear cache if out of memory error occurs

        # Optionally, set no_cuda=True to switch to CPU training
        training_args.no_cuda = True
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=preprocessed_dataset,
            data_collator=data_collator
        )
        trainer.train()
    else:
        raise e

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmshruthi03[0m ([33mshruthimohan03[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113379899999625, max=1.0…

Epoch,Training Loss,Validation Loss
