In [None]:
# DL Assignment 2 Submission (Part A)

In [120]:
!pip install evaluate bert_score rouge_score num2words

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [55]:
# random.seed(42)
# np.random.seed(42)
# torch.manual_seed(42)
# torch.cuda.manual_seed(42)

In [121]:
import json
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image
import evaluate
import os
import time

import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.metrics import precision_recall_fscore_support

from transformers import CLIPModel, CLIPProcessor, GPT2LMHeadModel, GPT2Config, GPT2Tokenizer, AutoProcessor, AutoModelForImageTextToText 
from transformers.image_utils import load_image

import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from bert_score import score
from rouge_score import rouge_scorer


In [166]:
BATCH_SIZE = 16
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
PATH_TO_ROOTDIR = "/kaggle/input/dataset/custom_captions_dataset"
LEARNING_RATE=5e-4
EPOCHS = 5

In [167]:
class CustomCaptionsDataset(Dataset):

    def __init__(self, root_dir, transform=None, split='train', maxlen=50, tokenizer=None):

        self.split = split
        self.root_dir = root_dir
        self.transform = transform
        self.caption_file = os.path.join(root_dir, f'{split}.csv')
        self.image_dir = os.path.join(root_dir, f'{split}')
        self.image_paths = []
        self.captions = []
        self.maxlen = maxlen
        self.tokenizer = tokenizer

        df = pd.read_csv(self.caption_file)
        # df = df.head(100)

        for index, row in tqdm(df.iterrows(), total=len(df)):
            image_path = os.path.join(self.image_dir, row['filename'])
            caption = row['caption']

            self.image_paths.append(image_path)
            self.captions.append(caption)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        caption = self.captions[idx]
        
        captionenc = self.tokenizer(
            caption, 
            padding="max_length",
            max_length=self.maxlen,
            truncation=True, 
            return_tensors="pt"
        )

        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        return image, captionenc.input_ids.squeeze(0)

## ImageCaptionModel (class)
A neural network model for image captioning that combines CLIP's vision encoder with GPT-2 decoder and contrastive learning.
- **clip_model (str)**: Name of CLIP model to use (default: "openai/clip-vit-base-patch32").
- **gpt2_model (str)**: Name of GPT-2 model to use (default: "gpt2").
- **freeze_clip (bool)**: Whether to freeze CLIP parameters (default: True).
- **freeze_gpt2_partial (bool)**: Whether to freeze lower GPT-2 layers (default: True).
- **projection_dim (int)**: Dimension for contrastive learning projections (default: 256).
- **contrastive_weight (float)**: Weight for contrastive loss component (default: 1).

The model extracts visual features from images using CLIP's vision encoder, projects them to match GPT-2's dimension, and generates captions using GPT-2 with cross-attention. During training, it uses both language modeling and contrastive learning losses to align image and text representations.

In [149]:

class ImageCaptionModel(nn.Module):
    
    def __init__(self, clip_model="openai/clip-vit-base-patch32", gpt2_model="gpt2", projection_dim=256, contrastive_weight=1):
        
        super(ImageCaptionModel, self).__init__()

        self.contrastive_weight = contrastive_weight
        clip = CLIPModel.from_pretrained(clip_model)
        
        # Extracting only the vision encoder from CLIP
        self.encoder = clip.vision_model
        
        for param in self.encoder.parameters():
                param.requires_grad = False
                
        # setup the decoder
        gpt2_config = GPT2Config.from_pretrained(gpt2_model)
        gpt2_config.add_cross_attention = True 
        self.decoder = GPT2LMHeadModel.from_pretrained(gpt2_model, config=gpt2_config)

        # encoder dimension from CLIP's vision model
        self.encoder_dim = self.encoder.config.hidden_size
        
        # decoder dimension from GPT-2
        self.decoder_dim = self.decoder.config.hidden_size
        
        for i, block in enumerate(self.decoder.transformer.h):
            if i < len(self.decoder.transformer.h) - 2:
                for param in block.parameters():
                    param.requires_grad = False

        # projection layer to get image encoder output dimension to the same size as decoder dimension
        self.connect = nn.Sequential(
            nn.Linear(self.encoder_dim, self.decoder_dim * 2),
            nn.GELU(),
            nn.Linear(self.decoder_dim * 2, self.decoder_dim)
        )
        
        # projection heads for contrastive learning
        self.img_projection = nn.Sequential(
            nn.Linear(self.encoder_dim, projection_dim),
            nn.ReLU(),
            nn.Linear(projection_dim, projection_dim)
        )
        
        self.txt_projection = nn.Sequential(
            nn.Linear(self.decoder_dim, projection_dim),
            nn.ReLU(),
            nn.Linear(projection_dim, projection_dim)
        )
        
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model)
        
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        
        # Adding special tokens for image and caption
        special_tokens = {'additional_special_tokens': ['<|img|>', '<|caption|>']}
        num_added = self.tokenizer.add_special_tokens(special_tokens)
        self.decoder.resize_token_embeddings(len(self.tokenizer))
        
        # Get token IDs for special tokens
        self.img_token_id = self.tokenizer.convert_tokens_to_ids("<|img|>")
        self.caption_token_id = self.tokenizer.convert_tokens_to_ids("<|caption|>")
        
        if self.img_token_id == self.tokenizer.unk_token_id:
            self.img_token_id = self.tokenizer.eos_token_id
        if self.caption_token_id == self.tokenizer.unk_token_id:
            self.caption_token_id = self.tokenizer.eos_token_id
        
    def get_image_embeddings(self, images):
        
        encoder_outputs = self.encoder(pixel_values=images).last_hidden_state
       
        cls_output = encoder_outputs[:, 0, :]
        return self.img_projection(cls_output)
    
    def get_text_embeddings(self, captions):
       
        tokenized = self.tokenizer(
            captions,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128
        ).to(next(self.parameters()).device)
            
        text_outputs = self.decoder.transformer(
            input_ids=tokenized.input_ids,
            attention_mask=tokenized.attention_mask,
            output_hidden_states=True
        )
        
        # Use last hidden state of the last token as text representation
        last_token_ids = tokenized.attention_mask.sum(1) - 1
        batch_size = tokenized.input_ids.shape[0]
        last_hidden_states = text_outputs.last_hidden_state
        
        text_embeddings = []
        for i in range(batch_size):
            text_embeddings.append(last_hidden_states[i, last_token_ids[i], :])
            
        text_embeddings = torch.stack(text_embeddings)
        return self.txt_projection(text_embeddings)
    
    def contrastive_loss(self, img_embeds, txt_embeds, temperature=0.07):
        
        # for cosine similarity
        img_embeds = F.normalize(img_embeds, p=2, dim=1)
        txt_embeds = F.normalize(txt_embeds, p=2, dim=1)
        
        logits = torch.matmul(img_embeds, txt_embeds.t()) / temperature
        batch_size = img_embeds.shape[0]
        labels = torch.arange(batch_size, device=logits.device)
        
        # calculate loss (symmetric loss: image-to-text and text-to-image)
        i2t_loss = F.cross_entropy(logits, labels)
        t2i_loss = F.cross_entropy(logits.t(), labels)
        
        return (i2t_loss + t2i_loss) / 2.0
        
    def forward(self, images, labels=None):
        
        encoder_outputs = self.encoder(pixel_values=images).last_hidden_state
            
        cls_output = encoder_outputs[:, 0, :]
        img_features = self.connect(cls_output)
        batch_size = encoder_outputs.size(0)
        
        # Create input with both image and caption tokens
        img_caption_tokens = torch.zeros((batch_size, 2), dtype=torch.long, device=encoder_outputs.device)
        img_caption_tokens[:, 0] = self.img_token_id
        img_caption_tokens[:, 1] = self.caption_token_id
        
        # Training mode
        if labels is not None:
            
            # Convert string labels to token IDs if needed
            if isinstance(labels, list) and isinstance(labels[0], str):
                tokenized_captions = self.tokenizer(
                    labels,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=128
                ).to(encoder_outputs.device)
                caption_ids = tokenized_captions.input_ids
            else:
                caption_ids = labels
                
            # combining image and caption tokens with caption IDs
            input_ids = torch.cat([img_caption_tokens, caption_ids], dim=1)
            
            outputs = self.decoder(
                input_ids=input_ids,
                encoder_hidden_states=img_features.unsqueeze(1),
                labels=input_ids,  # Using input_ids as labels with shift
                return_dict=True
            )
            
            lm_loss = outputs.loss
            
            if self.contrastive_weight > 0:
                img_embeds = self.get_image_embeddings(images)
                
                if isinstance(labels, list) and isinstance(labels[0], str):
                    caption_texts = labels
                else:
                    caption_texts = self.tokenizer.batch_decode(caption_ids, skip_special_tokens=True)
                    
                txt_embeds = self.get_text_embeddings(caption_texts)
                
                # calculate contrastive loss
                contr_loss = self.contrastive_loss(img_embeds, txt_embeds)
                
                # combine losses
                total_loss = lm_loss + self.contrastive_weight * contr_loss # weight = 1 gave best results
                return total_loss, outputs.logits
                
            else:
                return lm_loss, outputs.logits
        
        # Inference mode
        else:
            
            generated = self.decoder.generate(
                input_ids=img_caption_tokens,
                encoder_hidden_states=img_features.unsqueeze(1),
                max_length=50,
                num_beams=4,
                early_stopping=True,
                pad_token_id=self.tokenizer.pad_token_id,
                no_repeat_ngram_size=2,
                temperature=0.7
            )
            
            # Decode the generated IDs
            generated_captions = self.tokenizer.batch_decode(generated, skip_special_tokens=True)
            
            return generated_captions

In [150]:
model = ImageCaptionModel(
    clip_model="openai/clip-vit-base-patch32", 
    gpt2_model="gpt2", 
    contrastive_weight=1
).to(DEVICE)

tokenizer = model.tokenizer

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_proj.bias', 'transformer.h.10.cros

In [168]:
transform=transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

train_dataset = CustomCaptionsDataset(
    root_dir = PATH_TO_ROOTDIR, 
    transform=transform, 
    split='train',
    tokenizer=tokenizer
)
val_dataset = CustomCaptionsDataset(
    root_dir = PATH_TO_ROOTDIR, 
    transform=transform, 
    split='val',
    tokenizer=tokenizer
)
test_dataset = CustomCaptionsDataset(
    root_dir = PATH_TO_ROOTDIR, 
    transform=transform, 
    split='test', 
    tokenizer=tokenizer
)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)


100%|██████████| 5715/5715 [00:00<00:00, 26359.95it/s]
100%|██████████| 946/946 [00:00<00:00, 26322.74it/s]
100%|██████████| 928/928 [00:00<00:00, 26520.72it/s]


## zero_shot_captioning (function)
Generate captions using the pre-trained SmolVLM model without training.
- **image_path (str)**: Path to the input image.
- **model (obj)**: The loaded pre-trained model instance.
- **processor (obj)**: The model's processor for input preparation.

Returns the generated caption as a string.

In [159]:

def zero_shot_captioning(image_path, model, processor):
    
    # Create input messages
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": image_path},
                {"type": "text", "text": "Generate a caption for this image."},
            ]
        }
    ]
    
    # Process inputs
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device, dtype=torch.bfloat16)
    
    # Generate caption
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False
    )
    
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    
    # Extract the assistant's response (after "Assistant: ")
    if "Assistant:" in caption:
        caption = caption.split("Assistant:")[1].strip()
    
    return caption


In [162]:
def evaluate_smolvlm(model, processor, dataloader, device, save_captions=True, save_path="smolvlm_captions.csv"):
    
    # Download necessary NLTK data
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    
    rouge_calc = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    
    references = []
    hypotheses = []
    ref_texts = []  # For ROUGE calculation
    hyp_texts = []  # For ROUGE calculation
    
    filenames = []
    generated_captions = []
    
    print("Generating captions for evaluation...")
    
    with torch.no_grad():
        for batch_idx, (images, caption_ids) in enumerate(dataloader):
            # Get filenames for this batch
            if hasattr(dataloader.dataset, 'image_paths'):
                batch_filenames = [os.path.basename(dataloader.dataset.image_paths[i]) 
                                  for i in range(batch_idx * dataloader.batch_size, 
                                               min((batch_idx + 1) * dataloader.batch_size, len(dataloader.dataset)))]
            else:
                # If filenames are not available, use indices
                batch_filenames = [f"img_{batch_idx * dataloader.batch_size + i}.jpg" for i in range(len(images))]
            
            if hasattr(dataloader.dataset, 'tokenizer'):
                reference_captions = dataloader.dataset.tokenizer.batch_decode(caption_ids, skip_special_tokens=True)
            else:
                reference_captions = [' '.join([str(token.item()) for token in caption]) for caption in caption_ids]
            
            batch_captions = []
            
            # For each image in the batch, we need to save it temporarily then use zero_shot_captioning
            for i, img in enumerate(images):
                # Convert tensor to PIL Image
                img_pil = transforms.ToPILImage()(img)
                
                temp_path = f"temp_img_{i}.jpg"
                img_pil.save(temp_path)
                
                # Generate caption using the zero_shot_captioning function
                caption = zero_shot_captioning(temp_path, model, processor)
                batch_captions.append(caption)
                
                if os.path.exists(temp_path):
                    os.remove(temp_path)
            
            for ref, hyp in zip(reference_captions, batch_captions):
                references.append([ref.split()])
                hypotheses.append(hyp.split())
                ref_texts.append(ref)
                hyp_texts.append(hyp)
            
            filenames.extend(batch_filenames)
            generated_captions.extend(batch_captions)
    
    # Save captions to CSV
    if save_captions:
        captions_df = pd.DataFrame({
            'filename': filenames,
            'generated_caption': generated_captions
        })
        captions_df.to_csv(save_path, index=False)
        print(f"Saved generated captions to {save_path}")
    
    print("Computing evaluation metrics...")
    
    # Compute BLEU score
    smoothing = SmoothingFunction().method1
    bleu = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing)
    
    # Compute METEOR score
    meteor_scores = []
    for ref, hyp in zip(references, hypotheses):
        meteor_scores.append(meteor_score(ref, hyp))
        
    meteor_avg = np.mean(meteor_scores) if meteor_scores else 0
    
    # Compute ROUGE-L score using rouge_score package
    rouge_scores = []
    for ref, hyp in zip(ref_texts, hyp_texts):
        score = rouge_calc.score(ref, hyp)
        rouge_scores.append(score['rougeL'].fmeasure)
      
       
    rouge_l_avg = np.mean(rouge_scores) if rouge_scores else 0
    
    results = {
        'bleu': bleu,
        'meteor': meteor_avg,
        'rouge_l': rouge_l_avg 
    }
    
    print("\nEvaluation Results:")
    print(f"BLEU: {results['bleu']:.4f}")
    print(f"METEOR: {results['meteor']:.4f}")
    print(f"ROUGE-L: {results['rouge_l']:.4f}")
    
    num_examples = min(3, len(hypotheses))
    
    print("\nExample generations:")
    for i in range(num_examples):
        print(f"Reference: {' '.join(references[i][0])}")
        print(f"Generated: {' '.join(hypotheses[i])}")
        print("-" * 50)
        print()
    
    return results

In [163]:
smolvlm_processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct")
smolvlm_model = AutoModelForImageTextToText.from_pretrained(
    "HuggingFaceTB/SmolVLM-256M-Instruct", 
    torch_dtype=torch.bfloat16, 
    device_map=DEVICE
)

# Evaluate
results = evaluate_smolvlm(
    model=smolvlm_model,
    processor=smolvlm_processor,
    dataloader=test_loader,
    device=DEVICE,
    save_path="smolvlm_evaluation.csv"
)

Generating captions for evaluation...


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The cur

Saved generated captions to smolvlm_evaluation.csv
Computing evaluation metrics...

Evaluation Results:
BLEU: 0.0034
METEOR: 0.0966
ROUGE-L: 0.1560

Example generations:
Reference: A large building with bars on the windows in front of it. There is people walking in front of the building. There is a street in front of the building with many cars on it.
Generated: A very colorful, distorted view of a building with many windows and columns.
--------------------------------------------------

Reference: A person is skiing through the snow. There is loose snow all around them from him jumping. The person is wearing a yellow snow suit. The person is holding two ski poles in their hands.
Generated: A colorful image of trees is shown with a spectrum of colors extending from red to orange to yellow to green to blue.
--------------------------------------------------

Reference: There is a bed in a room against a wall. There is a brown blanket on top of the bed. There is a small brown book shelf

## train_model (function)
Train the image captioning model with validation monitoring.
- **model (nn.Module)**: The neural network model to train.
- **train_loader (DataLoader)**: DataLoader with training data.
- **val_loader (DataLoader)**: DataLoader with validation data.
- **optimizer (torch.optim)**: Optimizer for weight updates.
- **criterion (nn.Module)**: Loss function.
- **device (str)**: Device to run training on ('cuda' or 'cpu').
- **epochs (int)**: Number of training epochs (default: 10).

Returns the trained model and training history dictionary with loss metrics.

In [152]:
def train_model(
    model, 
    train_loader, 
    val_loader, 
    optimizer, 
    criterion, 
    device, 
    epochs=10, 
):
    
    history = {
        "train_loss": [],
        "val_loss": [],
        "epoch_times": []
    }
    
    best_val_loss = float('inf')
    
    for epoch in range(epochs):
        
        epoch_start = time.time()
        print(f"\nEpoch {epoch+1}/{epochs}")
        
        # Training phase
        model.train()
        epoch_train_loss = 0.0
        num_train_batches = len(train_loader)
        
        for batch_idx, (images, captions) in enumerate(train_loader):

            if batch_idx % 10 == 0:
                progress = batch_idx / num_train_batches * 100
                print(f"Training: {progress:.1f}% complete", end="\r")
            
            images = images.to(device)
            captions = captions.to(device)
            
            loss, _ = model(images, captions)
            
            # Optimization step
            optimizer.zero_grad()
            loss.backward()
            
            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            # Accumulate batch loss
            epoch_train_loss += loss.item()
        
        # Calculate average training loss
        avg_train_loss = epoch_train_loss / num_train_batches
        history["train_loss"].append(avg_train_loss)
        
        # Validation phase
        model.eval()
        epoch_val_loss = 0.0
        num_val_batches = len(val_loader)
        
        with torch.no_grad():
            for batch_idx, (images, captions) in enumerate(val_loader):

                if batch_idx % 10 == 0:
                    progress = batch_idx / num_val_batches * 100
                    print(f"Validation: {progress:.1f}% complete", end="\r")
                
                images = images.to(device)
                captions = captions.to(device)
                
                loss, _ = model(images, captions)
                
                # Accumulate batch loss
                epoch_val_loss += loss.item()
        
        # Calculate average validation loss
        avg_val_loss = epoch_val_loss / num_val_batches
        history["val_loss"].append(avg_val_loss)
        
        # Calculate epoch time
        epoch_time = time.time() - epoch_start
        history["epoch_times"].append(epoch_time)
        
        # Print epoch summary
        print(f"\nEpoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Time: {epoch_time:.2f}s")
        
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'clip_gpt_image_captioner.pth')
        print(f"Model saved to clip_gpt_image_captioner.pth")
    
    # Training complete - load best model
    checkpoint = torch.load('clip_gpt_image_captioner.pth')
    model.load_state_dict(checkpoint)
    
    print(f"\nTraining completed. Best validation loss: {best_val_loss:.4f}")
    return model, history

In [153]:
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)

# main loss function
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

model, history = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=DEVICE,
        epochs=EPOCHS   
)


Epoch 1/5


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training: 97.8% complete

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation: 83.3% complete
Epoch 1/5 - Train Loss: 3.6215, Val Loss: 2.8614, Time: 110.24s
Model saved to clip_gpt_image_captioner.pth

Epoch 2/5


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training: 97.8% complete

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation: 83.3% complete
Epoch 2/5 - Train Loss: 2.6000, Val Loss: 2.6611, Time: 109.46s
Model saved to clip_gpt_image_captioner.pth

Epoch 3/5


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training: 97.8% complete

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation: 83.3% complete
Epoch 3/5 - Train Loss: 2.3119, Val Loss: 2.6628, Time: 109.58s
Model saved to clip_gpt_image_captioner.pth

Epoch 4/5


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training: 97.8% complete

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation: 83.3% complete
Epoch 4/5 - Train Loss: 2.1048, Val Loss: 2.6418, Time: 109.14s
Model saved to clip_gpt_image_captioner.pth

Epoch 5/5


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training: 97.8% complete

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation: 83.3% complete
Epoch 5/5 - Train Loss: 1.9172, Val Loss: 2.6471, Time: 109.29s
Model saved to clip_gpt_image_captioner.pth


  checkpoint = torch.load('clip_gpt_image_captioner.pth')



Training completed. Best validation loss: 2.6471


## evaluate_model (function)
Evaluate model performance using BLEU, ROUGE-L, METEOR.
- **model (nn.Module)**: Trained model.
- **dataloader (DataLoader)**: Test data loader.
- **device (str)**: 'cuda' or 'cpu'.
- **save_captions (bool)**: Whether to save generated captions to CSV (default: True).
- **save_path (str)**: Path to save generated captions (default: "generated_captions.csv").

Returns a dict containing BLEU, ROUGE-L, METEOR scores for the test set.

In [164]:
def evaluate_model(model, dataloader, device, save_captions=True, save_path="generated_captions.csv"):
    
    # Download necessary NLTK data
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    
    rouge_calc = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    
    model.eval()
    references = []
    hypotheses = []
    ref_texts = []  # For ROUGE calculation
    hyp_texts = []  # For ROUGE calculation
    
    # For saving captions
    filenames = []
    generated_captions = []
    
    print("Generating captions for evaluation...")
    with torch.no_grad():
        for batch_idx, (images, caption_ids) in enumerate(dataloader):
            images = images.to(device)
            
            # Get filenames for this batch
            if hasattr(dataloader.dataset, 'image_paths'):
                batch_filenames = [os.path.basename(dataloader.dataset.image_paths[dataloader.dataset.indices[i]]) 
                              if hasattr(dataloader.dataset, 'indices') else 
                              os.path.basename(dataloader.dataset.image_paths[i + batch_idx * dataloader.batch_size]) 
                              for i in range(len(images))]
            else:
                # If filenames are not available, use indices
                batch_filenames = [f"img_{batch_idx * dataloader.batch_size + i}.jpg" for i in range(len(images))]
            
            # Generate captions
            batch_captions = model(images)
            reference_captions = model.tokenizer.batch_decode(caption_ids, skip_special_tokens=True)
            
            # Store for evaluation metrics
            for ref, hyp in zip(reference_captions, batch_captions):
                references.append([ref.split()])
                hypotheses.append(hyp.split())
                ref_texts.append(ref)
                hyp_texts.append(hyp)
            
            # Store for CSV file
            filenames.extend(batch_filenames)
            generated_captions.extend(batch_captions)
    
    # Save captions to CSV
    if save_captions:
        captions_df = pd.DataFrame({
            'filename': filenames,
            'generated_caption': generated_captions
        })
        captions_df.to_csv(save_path, index=False)
        print(f"Saved generated captions to {save_path}")
    
    print("Computing evaluation metrics...")
    
    # Compute BLEU score
    smoothing = SmoothingFunction().method1
    bleu = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing)
    
    # Compute METEOR score
    meteor_scores = []
    for ref, hyp in zip(references, hypotheses):
        try:
            meteor_scores.append(meteor_score(ref, hyp))
        except:
            # Skip problematic pairs
            continue
    meteor_avg = np.mean(meteor_scores) if meteor_scores else 0
    
    # Compute ROUGE-L score using rouge_score package
    rouge_scores = []
    for ref, hyp in zip(ref_texts, hyp_texts):
        score = rouge_calc.score(ref, hyp)
        rouge_scores.append(score['rougeL'].fmeasure)
       
    rouge_l_avg = np.mean(rouge_scores) if rouge_scores else 0
    
    results = {
        'bleu': bleu,
        'meteor': meteor_avg,
        'rouge_l': rouge_l_avg 
    }
    
    print("\nEvaluation Results:")
    print(f"BLEU: {results['bleu']:.4f}")
    print(f"METEOR: {results['meteor']:.4f}")
    print(f"ROUGE-L: {results['rouge_l']:.4f}")
    
    num_examples = min(3, len(hypotheses))
    
    print("\nExample generations:")
    for i in range(num_examples):
        print(f"Reference: {' '.join(references[i][0])}")
        print(f"Generated: {' '.join(hypotheses[i])}")
        print("-" * 50)
        print()
    
    return results

In [165]:
results = evaluate_model(model, test_loader, DEVICE)

Generating captions for evaluation...


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The cur

Saved generated captions to generated_captions.csv
Computing evaluation metrics...

Evaluation Results:
BLEU: 0.0664
METEOR: 0.2518
ROUGE-L: 0.3140

Example generations:
Reference: A large building with bars on the windows in front of it. There is people walking in front of the building. There is a street in front of the building with many cars on it.
Generated: A white and blue bus is parked in front of a building. There is a large white building behind the bus.
--------------------------------------------------

Reference: A person is skiing through the snow. There is loose snow all around them from him jumping. The person is wearing a yellow snow suit. The person is holding two ski poles in their hands.
Generated: A man is wearing a black wet suit. He is holding a white surfboard in his hand. There is a large wave in the water behind the man.
--------------------------------------------------

Reference: There is a bed in a room against a wall. There is a brown blanket on top of the

# 