In [None]:
import os
import pandas as pd
from PIL import Image

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.notebook import tqdm

from transformers import Blip2Processor, Blip2ForConditionalGeneration
from transformers import AutoProcessor, BlipForConditionalGeneration
from peft import LoraConfig, get_peft_model

In [None]:
class ImageCaptionDataset(Dataset):
    """Image/caption pairs encoded for BLIP training.

    Each row of the metadata CSV must provide a ``file_name`` column (image
    path relative to ``image_folder``) and a ``text`` column (the caption).

    Args:
        csv_file: Path to the metadata CSV.
        image_folder: Directory that contains the image files.
        processor: Callable processor that accepts ``images=`` and ``text=``
            and returns a mapping of tensors with a leading batch dimension.
        max_length: Optional token length to pad/truncate captions to. When
            ``None`` (default) the processor's own default length is used.
    """

    def __init__(self, csv_file, image_folder, processor, max_length=None):
        self.data = pd.read_csv(csv_file)
        self.image_folder = image_folder
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _drop_batch_dim(encoding):
        """Remove only the leading batch axis from every tensor in ``encoding``.

        A bare ``squeeze()`` would also collapse any other size-1 axis, which
        silently changes the sample shape; ``squeeze(0)`` touches axis 0 only.
        """
        return {key: value.squeeze(0) for key, value in encoding.items()}

    def __getitem__(self, idx):
        """Return the processor encoding (without batch axis) for sample ``idx``."""
        row = self.data.iloc[idx]
        img_path = os.path.join(self.image_folder, row["file_name"])

        with Image.open(img_path) as im:
            # Palette images go through RGBA first, then everything is
            # normalised to 3-channel RGB.
            if im.mode == "P":
                im = im.convert("RGBA")
            image = im.convert("RGB")

        caption = row["text"]

        processor_kwargs = {
            "images": image,
            "text": caption,
            "padding": "max_length",
            # Without truncation a caption longer than the padded length keeps
            # its full size and default batch collation fails on ragged shapes.
            "truncation": True,
            "return_tensors": "pt",
        }
        if self.max_length is not None:
            processor_kwargs["max_length"] = self.max_length

        encoding = self.processor(**processor_kwargs)
        return self._drop_batch_dim(encoding)

In [None]:
# ============ Training Script ============ #
from transformers import get_cosine_schedule_with_warmup

def mask_pad_labels(input_ids, attention_mask):
    """Return LM labels equal to ``input_ids`` with padded positions set to -100.

    Positions where ``attention_mask`` is 0 are replaced by -100 so they can be
    excluded from the loss. ``input_ids`` is not modified; a new tensor is
    returned.
    """
    return input_ids.masked_fill(attention_mask == 0, -100)


def evaluate_loss(model, loader, device, desc="Val"):
    """Compute the mean per-batch loss of ``model`` over ``loader``.

    Runs under ``torch.no_grad()`` and leaves the model in eval mode; callers
    that continue training must switch back with ``model.train()``.

    Returns:
        The sum of batch losses divided by the number of batches (0.0 for an
        empty loader).
    """
    model.eval()
    running_loss = 0.0
    progress = tqdm(loader, desc=desc, leave=False)
    with torch.no_grad():
        for batch in progress:
            input_ids = batch["input_ids"].to(device)
            pixel_values = batch["pixel_values"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                labels=mask_pad_labels(input_ids, attention_mask),
            )

            batch_loss = outputs.loss.item()
            running_loss += batch_loss
            progress.set_postfix({"loss": f"{batch_loss:.4f}"})

    return running_loss / max(len(loader), 1)


def main(start_epoch=37, num_epochs=10, best_val_loss=None, lr=1e-5, batch_size=8):
    """Resume fine-tuning of the BLIP captioning checkpoint and keep the best model.

    Loads the checkpoint from ``/kaggle/working/best_model_epoch``, freezes all
    weights except two text-decoder layers and the LM prediction head, trains
    for ``num_epochs`` epochs and saves to ``best_model_epoch`` whenever the
    validation loss improves.

    Args:
        start_epoch: Number printed for the first epoch (resume bookkeeping only).
        num_epochs: How many epochs to run.
        best_val_loss: Validation loss a new checkpoint has to beat. When
            ``None`` it is measured on the loaded checkpoint before training,
            so the baseline uses the same loss definition as the loop. A value
            recorded by an earlier run that scored padded positions is not
            comparable with the padding-masked loss used here.
        lr: AdamW learning rate.
        batch_size: Batch size for both loaders.
    """
    # Paths
    train_csv = "/kaggle/input/biology-dataset/Biology-Dataset/train/metadata.csv"
    train_folder = "/kaggle/input/biology-dataset/Biology-Dataset/train/"

    val_csv = "/kaggle/input/biology-dataset/Biology-Dataset/val/metadata.csv"
    val_folder = "/kaggle/input/biology-dataset/Biology-Dataset/val/"

    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained(
        "/kaggle/working/best_model_epoch",
        device_map="auto"
    )

    # 1. Freeze everything.
    for param in model.parameters():
        param.requires_grad = False

    # 2. Unfreeze the selected text-decoder layers. ``parameters()`` recurses
    #    into sub-modules, so each layer's cross-attention is included and needs
    #    no separate pass.
    # NOTE(review): the previous comment said "last 2 layers", but these indices
    # select the last and the fourth-from-last layer. Indices are kept as they
    # were; confirm which was intended.
    decoder_layers = model.text_decoder.bert.encoder.layer
    for layer_index in (-1, -4):
        for param in decoder_layers[layer_index].parameters():
            param.requires_grad = True

    # 3. Unfreeze LM head (for vocab adaptation)
    for param in model.text_decoder.cls.predictions.parameters():
        param.requires_grad = True

    # Datasets
    train_dataset = ImageCaptionDataset(train_csv, train_folder, processor)
    val_dataset = ImageCaptionDataset(val_csv, val_folder, processor)

    # Dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
    )

    # Optimizer: only parameters that were unfrozen above.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable_params, lr=lr)
    print(f"Trainable params: {sum(p.numel() for p in trainable_params)}")

    # Baseline the checkpoint has to beat.
    if best_val_loss is None:
        best_val_loss = evaluate_loss(model, val_loader, device, desc="Baseline [Val]")
        print(f"Baseline Val Loss: {best_val_loss:.4f}")

    for epoch in range(start_epoch, start_epoch + num_epochs):
        print(f"\nEpoch {epoch}")

        # Training (evaluate_loss leaves the model in eval mode, so reset here).
        model.train()
        total_loss = 0.0
        train_bar = tqdm(train_loader, desc=f"Epoch {epoch} [Train]", leave=False)
        for batch in train_bar:
            input_ids = batch["input_ids"].to(device)
            pixel_values = batch["pixel_values"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                # Captions are padded to a fixed length; padded positions must
                # not be scored or they dominate the averaged loss.
                labels=mask_pad_labels(input_ids, attention_mask),
            )

            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            batch_loss = loss.item()
            total_loss += batch_loss
            train_bar.set_postfix({"loss": f"{batch_loss:.4f}"})

        avg_train_loss = total_loss / max(len(train_loader), 1)
        print(f"Epoch {epoch} | Avg Train Loss: {avg_train_loss:.4f}")

        # Validation
        avg_val_loss = evaluate_loss(
            model, val_loader, device, desc=f"Epoch {epoch} [Val]"
        )
        print(f"Epoch {epoch} | Avg Val Loss: {avg_val_loss:.4f}")

        # Save best checkpoint
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            model.save_pretrained("best_model_epoch")
            print("best_model_epoch saved")

In [None]:
# Entry point: start the training run defined by main() when this code is
# executed as the top-level module (not when it is imported).
if __name__ == "__main__":
    main()

In [None]:
!pip -q install pycocoevalcap

In [None]:
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.rouge.rouge import Rouge

import json

class ImageCaptionDataset(Dataset):
    """Evaluation dataset that yields raw RGB images with their captions.

    Reads a metadata CSV with ``file_name`` and ``text`` columns. Unlike a
    tensor-producing dataset, samples are plain dicts holding the decoded image,
    the caption and the sample index; the processor is stored but not applied
    here.

    NOTE: this definition reuses the name of the training dataset class defined
    earlier in the notebook and replaces it once this cell has run.
    """

    def __init__(self, csv_file, image_folder, processor):
        self.data = pd.read_csv(csv_file)
        self.image_folder = image_folder
        self.processor = processor

    def __len__(self):
        # One sample per metadata row.
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Load sample ``idx`` as ``{"image", "caption", "id"}``."""
        record = self.data.iloc[idx]
        path = os.path.join(self.image_folder, record["file_name"])

        with Image.open(path) as handle:
            # Palette-mode files are expanded to RGBA before the RGB conversion.
            source = handle.convert("RGBA") if handle.mode == "P" else handle
            image = source.convert("RGB")

        return {"image": image, "caption": record["text"], "id": idx}

def collate_fn(batch, processor):
    """Collate evaluation samples into ``(pixel_values, captions, ids)``.

    Args:
        batch: Sequence of dicts with ``"image"``, ``"caption"`` and ``"id"`` keys.
        processor: Callable invoked as ``processor(images=..., return_tensors="pt")``
            whose result exposes a ``pixel_values`` attribute.

    Returns:
        A tuple of the processor's ``pixel_values``, the list of captions and
        the list of ids, all in batch order.
    """
    images, captions, ids = [], [], []
    for sample in batch:
        images.append(sample["image"])
        captions.append(sample["caption"])
        ids.append(sample["id"])

    encoded = processor(images=images, return_tensors="pt")
    return encoded.pixel_values, captions, ids

In [None]:
# ---- Evaluation inputs: test-split metadata CSV and image directory ----
test_csv = "/kaggle/input/biology-dataset/Biology-Dataset/test/metadata.csv"
test_folder = "/kaggle/input/biology-dataset/Biology-Dataset/test"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The processor is taken from the public BLIP base checkpoint, while the model
# weights are read from the locally saved checkpoint directory.
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "/kaggle/working/best_model_epoch",
    device_map="auto"
)
model.eval()

# Test data is served in file order, four images per batch; the processor is
# bound into the collate step so images are encoded batch-wise.
test_dataset = ImageCaptionDataset(test_csv, test_folder, processor)
test_loader = DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=lambda b: collate_fn(b, processor)
)

# pycocoevalcap-style containers: sample id -> list of caption strings.
gts = {}   # references
res = {}   # model outputs

# Beam-search decoding settings shared by every batch.
generation_kwargs = dict(
    max_length=512,            # upper bound on generated sequence length
    num_beams=3,               # beam width
    early_stopping=True,       # end a beam search once enough finished beams exist
    repetition_penalty=1.2,    # discourage repeated tokens
    no_repeat_ngram_size=3,    # forbid repeating any 3-gram
)
cuda_available = torch.cuda.is_available()

print("Generating captions...")
with torch.inference_mode():
    for pixel_values, captions, ids in tqdm(test_loader, desc="Evaluating"):
        pixel_values = pixel_values.to(device)

        generated_ids = model.generate(pixel_values=pixel_values, **generation_kwargs)
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

        # References and hypotheses are lower-cased; hypotheses are also stripped.
        for idx, reference, hypothesis in zip(ids, captions, generated_text):
            gts[idx] = [reference.lower()]
            res[idx] = [hypothesis.strip().lower()]

        # Release cached CUDA blocks after every batch.
        if cuda_available:
            torch.cuda.empty_cache()

print(f"Generated {len(res)} captions")

# Persist the predictions (JSON object keyed by sample id).
with open("predictions_freezed2.json", "w") as f:
    json.dump(res, f, indent=2)

# ---- Metrics ----
# Each metric is computed independently; a failure is reported and the
# remaining metrics still run.
print("\n==== Evaluation Results ====")

# BLEU, orders 1 through 4
try:
    bleu_scorer = Bleu(4)
    bleu_score, _ = bleu_scorer.compute_score(gts, res)
    print(f"BLEU-1: {bleu_score[0]:.4f}")
    print(f"BLEU-2: {bleu_score[1]:.4f}")
    print(f"BLEU-3: {bleu_score[2]:.4f}")
    print(f"BLEU-4: {bleu_score[3]:.4f}")
except Exception as e:
    print(f"Error computing BLEU: {e}")

# ROUGE-L
try:
    rouge_scorer = Rouge()
    rouge_score, _ = rouge_scorer.compute_score(gts, res)
    print(f"ROUGE-L: {rouge_score:.4f}")
except Exception as e:
    print(f"Error computing ROUGE: {e}")

# CIDEr
try:
    cider_scorer = Cider()
    cider_score, _ = cider_scorer.compute_score(gts, res)
    print(f"CIDEr: {cider_score:.4f}")
except Exception as e:
    print(f"Error computing CIDEr: {e}")