In [1]:
import os
import requests
import torch
import pickle
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm # Ensure tqdm is imported
from transformers import BlipProcessor, BlipForQuestionAnswering, BitsAndBytesConfig
from datasets import load_dataset
import gc
from peft import LoraConfig, get_peft_model

2025-05-15 21:06:08.869891: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747343169.067854      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747343169.122489      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:

# --- Configuration ---
MODEL_NAME = "Salesforce/blip-vqa-base"
CSV_PATH = r"/kaggle/input/merged-training/merged_final.csv"
IMAGES_BASE_FOLDER = r"/kaggle/input/vrdatasets/abo-images-small"
OUTPUT_MODEL_DIR = "/kaggle/working/blip-finetuned"
TRACKING_FILE = "tracking_information.pkl"

R_LORA = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
# Updated TARGET_MODULES for BLIP. Common ones are query/key/value in attention layers.
# The exact names depend on the model architecture.
# You might need to inspect model.named_modules() to find precise names if these don't work.
# Common patterns for BLIP-like models (BERT-based text parts, ViT-based vision parts):
TARGET_MODULES = ["q_proj",
        "k_proj",
        "v_proj",
        "mlp.fc1",
        "mlp.fc2",
        "output.dense"]


IMAGE_SIZE = (384, 384)
QUESTION_MAX_LENGTH = 36
ANSWER_MAX_LENGTH = 16 # Increased from 8

BATCH_SIZE = 16
NUM_WORKERS = 2
LEARNING_RATE = 5e-5
NUM_EPOCHS = 3
PATIENCE = 10
GRADIENT_ACCUMULATION_STEPS = 1
LOG_INTERVAL = 40  # Print loss every 50 steps/batches

LOAD_IN_FP16 = True
LOAD_IN_8BIT = False
LOAD_IN_4BIT = False

In [3]:
# --- Setup ---
if 'model' in globals(): del model
if 'processor' in globals(): del processor
if 'optimizer' in globals(): del optimizer
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"Initial GPU Memory: Allocated={torch.cuda.memory_allocated(0)/1024**2:.2f}MB, Reserved={torch.cuda.memory_reserved(0)/1024**2:.2f}MB")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"CUDA Device Name: {torch.cuda.get_device_name(0)}")

torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

processor = BlipProcessor.from_pretrained(MODEL_NAME)
peft_config = LoraConfig(
    r=R_LORA,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none"
)

model_kwargs = {}
if torch.cuda.is_available():
    if LOAD_IN_FP16 and not (LOAD_IN_8BIT or LOAD_IN_4BIT):
        print("Loading model in float16...")
        model_kwargs["torch_dtype"] = torch.float16
    elif LOAD_IN_8BIT:
        print("Loading model in 8-bit...")
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model_kwargs["quantization_config"] = quantization_config
    elif LOAD_IN_4BIT:
        print("Loading model in 4-bit...")
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
            bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4",
        )
        model_kwargs["quantization_config"] = quantization_config
    else: print("Loading model in float32...")
else: print("Loading model on CPU in float32...")

model = BlipForQuestionAnswering.from_pretrained(MODEL_NAME, **model_kwargs)
print(f"Base model '{MODEL_NAME}' loaded.")

# --- (Optional) Calculate parameters of the BASE model before LoRA ---
base_total_params = sum(p.numel() for p in model.parameters())
base_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\n--- Base Model Parameters (Before LoRA) ---")
print(f"Total parameters: {base_total_params:,}")
print(f"Initially trainable parameters: {base_trainable_params:,}\n")

print("Applying PEFT LoRA adapters...")
model = get_peft_model(model, peft_config)

print("\n--- Model Parameters AFTER LoRA ---")
model.print_trainable_parameters()
total_params_after_lora = sum(p.numel() for p in model.parameters())
trainable_params_after_lora = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters (including frozen base + LoRA adapters): {total_params_after_lora:,}")
print(f"Trainable LoRA parameters (manually calculated): {trainable_params_after_lora:,}")
if total_params_after_lora > 0:
    print(f"Percentage of trainable parameters in current model object: {(trainable_params_after_lora / total_params_after_lora) * 100:.4f}%\n")

print(f"Moving model to {device}...")
model.to(device)
print("Model moved to device.")

if torch.cuda.is_available():
    print(f"After model loading & .to(device): Allocated={torch.cuda.memory_allocated(0)/1024**2:.2f}MB, Reserved={torch.cuda.memory_reserved(0)/1024**2:.2f}MB")

if hasattr(torch, 'compile') and os.environ.get("KAGGLE_RUNTIME_VERSION", "0").startswith("Python"):
    if device.type == 'cuda':
        print("Compiling the model... (this may take a moment)")
        try:
            model = torch.compile(model, mode="reduce-overhead")
            print("Model compiled successfully.")
        except Exception as e:
            print(f"Could not compile model: {e}. Proceeding without compilation.")
    else: print("Model is on CPU, skipping torch.compile.")
else: print("torch.compile not available or not in a suitable Kaggle environment. Proceeding without compilation.")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Initial GPU Memory: Allocated=0.00MB, Reserved=0.00MB
Using device: cuda
CUDA Device Name: Tesla P100-PCIE-16GB


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Loading model in float16...


config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Base model 'Salesforce/blip-vqa-base' loaded.

--- Base Model Parameters (Before LoRA) ---
Total parameters: 384,672,572
Initially trainable parameters: 384,672,572

Applying PEFT LoRA adapters...

--- Model Parameters AFTER LoRA ---
trainable params: 4,128,768 || all params: 388,801,340 || trainable%: 1.0619
Total parameters (including frozen base + LoRA adapters): 388,801,340
Trainable LoRA parameters (manually calculated): 4,128,768
Percentage of trainable parameters in current model object: 1.0619%

Moving model to cuda...
Model moved to device.
After model loading & .to(device): Allocated=776.91MB, Reserved=798.00MB
torch.compile not available or not in a suitable Kaggle environment. Proceeding without compilation.


In [4]:

class VQADataset(torch.utils.data.Dataset):
    def __init__(self, dataset, processor, images_folder):
        self.dataset = dataset
        self.processor = processor
        self.images_folder = images_folder
    def __len__(self): return len(self.dataset)
    def __getitem__(self, idx):
        item = self.dataset[idx]
        question = item['question'] # Original question
        answer = item['answer']
        rel_path = item['full_image_path']
        
        instructed_question = f"{question} Answer in one word."

        image_path = os.path.join(self.images_folder, *rel_path.replace('\\', '/').split('/'))
        try:
            image = Image.open(image_path).convert("RGB").resize(IMAGE_SIZE)
        except FileNotFoundError: raise FileNotFoundError(f"Image not found: {image_path}")
        except Exception as e: raise Exception(f"Could not load image {image_path}: {e}")
        
        # Use the 'instructed_question' when processing
        encoding = self.processor(
            images=image, 
            text=instructed_question, # <--- USE MODIFIED QUESTION HERE
            padding="max_length", 
            truncation=True, 
            max_length=QUESTION_MAX_LENGTH, # Max length for the (question + instruction)
            return_tensors="pt"
        )
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}

        labels = self.processor.tokenizer(
            text=answer, 
            max_length=ANSWER_MAX_LENGTH, 
            padding="max_length", 
            truncation=True, 
            return_tensors="pt"
        )["input_ids"].squeeze(0)
        encoding["labels"] = labels
        return encoding


In [5]:

print("Loading full dataset...")
full_ds = load_dataset("csv", data_files=CSV_PATH, split="train")
num_rows_to_keep = 10000000 # Increased from 50k
ds_subset = full_ds.select(range(len(full_ds) - min(num_rows_to_keep, len(full_ds)), len(full_ds))) if len(full_ds) > num_rows_to_keep else full_ds
print(f"Using subset of {len(ds_subset)} rows.")

split = ds_subset.shuffle(seed=42).train_test_split(test_size=0.1)
training_hf_dataset, valid_hf_dataset = split["train"], split["test"]
print(f"Split into: Training: {len(training_hf_dataset)}, Validation: {len(valid_hf_dataset)}")

def is_valid_example(example):
    # Ensure answer is a non-empty string and image path exists
    if not (isinstance(example.get('answer'), str) and example['answer'].strip() and example.get('full_image_path')):
        return False

    # Normalize the answer (strip and lowercase)
    ans = example['answer'].strip().lower()
    
    # Filter out yes/no questions
    return ans not in ["yes", "no"]
training_hf_dataset = training_hf_dataset.filter(is_valid_example)
valid_hf_dataset = valid_hf_dataset.filter(is_valid_example)
print(f"Filtered sizes: Training: {len(training_hf_dataset)}, Validation: {len(valid_hf_dataset)}")

if not training_hf_dataset or not valid_hf_dataset:
    print("Error: Training or validation dataset is empty. Exiting.")
    exit()

train_dataset = VQADataset(dataset=training_hf_dataset, processor=processor, images_folder=IMAGES_BASE_FOLDER)
valid_dataset = VQADataset(dataset=valid_hf_dataset, processor=processor, images_folder=IMAGES_BASE_FOLDER)

def collate_fn_safe(batch):
    batch = [item for item in batch if item is not None]
    return torch.utils.data.dataloader.default_collate(batch) if batch else None

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_fn_safe)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True, collate_fn=collate_fn_safe)

optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
scaler_enabled = torch.cuda.is_available() and (LOAD_IN_FP16 or not (LOAD_IN_8BIT or LOAD_IN_4BIT))
scaler = torch.cuda.amp.GradScaler(enabled=scaler_enabled)
autocast_enabled = scaler_enabled
autocast_dtype = torch.float16 if LOAD_IN_FP16 and torch.cuda.is_available() else None
print(f"GradScaler enabled: {scaler.is_enabled()}, AMP Autocast enabled: {autocast_enabled}, dtype: {autocast_dtype}")

tracking_information = []
min_eval_loss = float("inf")
early_stop_counter = 0


Loading full dataset...


Generating train split: 0 examples [00:00, ? examples/s]

Using subset of 131847 rows.
Split into: Training: 118662, Validation: 13185


Filter:   0%|          | 0/118662 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13185 [00:00<?, ? examples/s]

Filtered sizes: Training: 118622, Validation: 13180
GradScaler enabled: True, AMP Autocast enabled: True, dtype: torch.float16


  scaler = torch.cuda.amp.GradScaler(enabled=scaler_enabled)


In [6]:
# loading checkpoints

start_epoch = 0
checkpoint_path = os.path.join(OUTPUT_MODEL_DIR, "checkpoint.pth")
if os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
    if scaler.is_enabled() and checkpoint["scaler_state_dict"]:
        scaler.load_state_dict(checkpoint["scaler_state_dict"])
    start_epoch = checkpoint["epoch"] + 1
    early_stop_counter = checkpoint["early_stop_counter"]
    min_eval_loss = checkpoint["min_eval_loss"]
    tracking_information = checkpoint["tracking_information"]
    print(f"Resumed training from epoch {start_epoch}")
else:
    print("Starting training from scratch.")


Starting training from scratch.


In [7]:

# --- Training Loop ---
for epoch in range(start_epoch, NUM_EPOCHS):

    model.train()
    epoch_train_loss_accum = 0.0 # Accumulator for average epoch loss
    interval_loss_accum = 0.0    # Accumulator for interval loss
    interval_steps = 0           # Counter for steps in current interval
    
    optimizer.zero_grad()

    # Wrap train_dataloader with tqdm for progress bar
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} train", unit="batch")

    for i, batch in enumerate(progress_bar):
        if batch is None:
            print(f"Warning: Skipped an empty batch at step {i} in epoch {epoch+1}")
            continue
        
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.amp.autocast(device_type=device.type, dtype=autocast_dtype, enabled=autocast_enabled):
            outputs = model(**batch)
            loss = outputs.loss
            if GRADIENT_ACCUMULATION_STEPS > 1:
                loss_scaled = loss / GRADIENT_ACCUMULATION_STEPS # Use a different var for scaled loss if needed
            else:
                loss_scaled = loss

        current_loss_item = loss.item() # Get the unscaled loss for logging
        if scaler.is_enabled():
            scaler.scale(loss_scaled).backward()
        else:
            loss_scaled.backward()
        
        epoch_train_loss_accum += current_loss_item # Accumulate unscaled loss for epoch average
        interval_loss_accum += current_loss_item
        interval_steps += 1

        if (i + 1) % GRADIENT_ACCUMULATION_STEPS == 0 or (i + 1) == len(train_dataloader):
            if scaler.is_enabled():
                # Optional: Clip gradients before optimizer step if using scaler
                # scaler.unscale_(optimizer) # Unscale for clipping
                # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # Example clip
                scaler.step(optimizer)
                scaler.update()
            else:
                # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # Example clip
                optimizer.step()
            optimizer.zero_grad()

        # ★★★ Log interval loss ★★★
        if (i + 1) % LOG_INTERVAL == 0 or (i + 1) == len(train_dataloader):
            if interval_steps > 0:
                avg_interval_loss = interval_loss_accum / interval_steps
                # Update tqdm progress bar description
                progress_bar.set_postfix({"loss": f"{avg_interval_loss:.4f}", "lr": f"{optimizer.param_groups[0]['lr']:.2e}"})
                # Or print to console:
                # print(f"  Epoch {epoch+1}, Step [{i+1}/{len(train_dataloader)}], Avg Interval Loss: {avg_interval_loss:.4f}, LR: {optimizer.param_groups[0]['lr']:.2e}")
                interval_loss_accum = 0.0 # Reset for next interval
                interval_steps = 0

    # --- Validation Phase ---
    model.eval()
    eval_loss_accum = 0.0
    with torch.no_grad():
        for batch in tqdm(valid_dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} valid", unit="batch"):
            if batch is None: continue
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.amp.autocast(device_type=device.type, dtype=autocast_dtype, enabled=autocast_enabled):
                outputs = model(**batch)
                eval_loss_accum += outputs.loss.item()

    # Calculate average losses for the epoch
    # avg_train_loss_epoch = epoch_train_loss_accum / len(train_dataloader) # Based on number of batches
    # More accurate if some batches were skipped or if last batch is partial:
    num_actual_train_batches = sum(1 for b in train_dataloader if b is not None) if len(train_dataloader) > 0 else 0
    avg_train_loss_epoch = epoch_train_loss_accum / num_actual_train_batches if num_actual_train_batches > 0 else 0.0
    
    num_actual_valid_batches = sum(1 for b in valid_dataloader if b is not None) if len(valid_dataloader) > 0 else 0
    avg_eval_loss = eval_loss_accum / num_actual_valid_batches if num_actual_valid_batches > 0 else 0.0
    
    current_lr = optimizer.param_groups[0]["lr"]
    tracking_information.append((avg_train_loss_epoch, avg_eval_loss, current_lr))
    print(f"\nEpoch {epoch+1} Summary: Avg Train Loss={avg_train_loss_epoch:.4f}, Avg Valid Loss={avg_eval_loss:.4f}, LR={current_lr:.2e}")

    if avg_eval_loss < min_eval_loss:
        print(f"Validation loss improved from {min_eval_loss:.4f} to {avg_eval_loss:.4f}. Saving model to {OUTPUT_MODEL_DIR}")
        min_eval_loss = avg_eval_loss
        os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)
        save_path = os.path.join(OUTPUT_MODEL_DIR, "checkpoint.pth")
        torch.save({
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "scaler_state_dict": scaler.state_dict() if scaler.is_enabled() else None,
            "early_stop_counter": early_stop_counter,
            "min_eval_loss": min_eval_loss,
            "tracking_information": tracking_information
        }, save_path)
        
        processor.save_pretrained(OUTPUT_MODEL_DIR)
        model.save_pretrained(OUTPUT_MODEL_DIR)

        early_stop_counter = 0
    else:
        early_stop_counter += 1
        print(f"Validation loss did not improve. Early stopping counter: {early_stop_counter}/{PATIENCE}")
        if early_stop_counter >= PATIENCE:
            print("Early stopping triggered.")
            break
    scheduler.step()

# --- Save Tracking Information ---
tracking_file_dir = os.path.dirname(TRACKING_FILE)
if tracking_file_dir and not os.path.exists(tracking_file_dir):
    os.makedirs(tracking_file_dir, exist_ok=True)
with open(TRACKING_FILE, "wb") as f:
    pickle.dump(tracking_information, f)
print(f"Tracking information saved to {TRACKING_FILE}")
print("Training finished!")

Epoch 1/3 train:   0%|          | 0/7414 [00:00<?, ?batch/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Epoch 1/3 train: 100%|██████████| 7414/7414 [1:59:48<00:00,  1.03batch/s, loss=7.4597, lr=5.00e-05]
Epoch 1/3 valid: 100%|██████████| 824/824 [06:08<00:00,  2.23batch/s]



Epoch 1 Summary: Avg Train Loss=7.5272, Avg Valid Loss=7.4681, LR=5.00e-05
Validation loss improved from inf to 7.4681. Saving model to /kaggle/working/blip-finetuned


Epoch 2/3 train: 100%|██████████| 7414/7414 [1:59:37<00:00,  1.03batch/s, loss=7.4517, lr=4.50e-05]
Epoch 2/3 valid: 100%|██████████| 824/824 [06:08<00:00,  2.24batch/s]



Epoch 2 Summary: Avg Train Loss=7.4544, Avg Valid Loss=7.4598, LR=4.50e-05
Validation loss improved from 7.4681 to 7.4598. Saving model to /kaggle/working/blip-finetuned


Epoch 3/3 train: 100%|██████████| 7414/7414 [1:59:57<00:00,  1.03batch/s, loss=7.4487, lr=4.05e-05]
Epoch 3/3 valid: 100%|██████████| 824/824 [06:10<00:00,  2.22batch/s]



Epoch 3 Summary: Avg Train Loss=7.4450, Avg Valid Loss=7.4558, LR=4.05e-05
Validation loss improved from 7.4598 to 7.4558. Saving model to /kaggle/working/blip-finetuned
Tracking information saved to tracking_information.pkl
Training finished!


# Inference

In [8]:
# import os
# import torch
# import pickle
# from PIL import Image
# from torch.utils.data import DataLoader
# from tqdm import tqdm
# from transformers import BlipProcessor, BlipForQuestionAnswering, BitsAndBytesConfig
# from datasets import load_dataset
# import gc
# import csv
# from peft import PeftModel # For loading LoRA adapters

# # --- Configuration (Mostly from your training script) ---
# BASE_MODEL_NAME = "Salesforce/blip-vqa-base" # The original base model
# SAVED_LORA_MODEL_DIR = "/kaggle/input/r-8-1l-epochs-4" # Directory where your LoRA model and processor were saved
# CSV_PATH = r"/kaggle/input/vrdatasets/generated_questions/generated_questions/finalDataset.csv"
# IMAGES_BASE_FOLDER = r"/kaggle/input/vrdatasets/abo-images-small"
# OUTPUT_PREDICTIONS_CSV = "r-8-1l-epochs-4.csv" # CSV for this run

# IMAGE_SIZE = (384, 384)
# QUESTION_MAX_LENGTH = 32
# ANSWER_MAX_LENGTH = 8    # For model.generate()

# # Batch size for inference (can often be larger than training)
# INFERENCE_BATCH_SIZE = 16
# NUM_WORKERS = 2

# # Memory Saving Options for loading the base model (match what you used for inference previously)
# LOAD_IN_FP16_INFERENCE = True
# LOAD_IN_8BIT_INFERENCE = False
# LOAD_IN_4BIT_INFERENCE = False

# # --- Setup ---
# gc.collect()
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # 1) Load the processor (from saved LoRA model directory is preferred)
# try:
#     processor = BlipProcessor.from_pretrained(SAVED_LORA_MODEL_DIR)
#     print(f"Processor loaded successfully from {SAVED_LORA_MODEL_DIR}")
# except Exception as e:
#     print(f"Warning: Error loading processor from {SAVED_LORA_MODEL_DIR}: {e}")
#     print(f"Attempting to load processor from base model name: {BASE_MODEL_NAME}")
#     processor = BlipProcessor.from_pretrained(BASE_MODEL_NAME)

# # 2) Load the base model
# model_kwargs_inference = {}
# if torch.cuda.is_available():
#     if LOAD_IN_FP16_INFERENCE and not (LOAD_IN_8BIT_INFERENCE or LOAD_IN_4BIT_INFERENCE):
#         print("Loading base model in float16 for inference...")
#         model_kwargs_inference["torch_dtype"] = torch.float16
#     elif LOAD_IN_8BIT_INFERENCE: # Add other quantization options if needed
#         print("Loading base model in 8-bit for inference...")
#         quantization_config = BitsAndBytesConfig(load_in_8bit=True)
#         model_kwargs_inference["quantization_config"] = quantization_config
#     elif LOAD_IN_4BIT_INFERENCE:
#         print("Loading base model in 4-bit for inference...")
#         quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16)
#         model_kwargs_inference["quantization_config"] = quantization_config
#     else:
#         print("Loading base model in float32 for inference...")
# else:
#     print("Loading base model on CPU in float32 for inference...")

# try:
#     base_model = BlipForQuestionAnswering.from_pretrained(
#         BASE_MODEL_NAME,
#         **model_kwargs_inference
#     )
#     print(f"Base model '{BASE_MODEL_NAME}' loaded successfully.")
# except Exception as e:
#     print(f"Error loading base model: {e}")
#     exit()

# # 3) Load the LoRA PEFT model (Your fine-tuned model)
# try:
#     model = PeftModel.from_pretrained(base_model, SAVED_LORA_MODEL_DIR)
#     model = model.merge_and_unload() # Optional: merge for faster inference
#     print(f"LoRA adapters loaded successfully from {SAVED_LORA_MODEL_DIR} and merged.")
# except Exception as e:
#     print(f"Error loading LoRA adapters from {SAVED_LORA_MODEL_DIR}: {e}")
#     print("Ensure the SAVED_LORA_MODEL_DIR is correct and contains adapter files.")
#     exit()

# # 4) Move model to device and set to evaluation mode
# model.to(device)
# model.eval()
# print("Model moved to device and set to evaluation mode.")


# # --- VQADataset Class (from your training script) ---
# class VQADataset(torch.utils.data.Dataset):
#     def __init__(self, dataset, processor, images_folder):
#         self.dataset = dataset # This will be a Hugging Face Dataset object
#         self.processor = processor
#         self.images_folder = images_folder

#     def __len__(self):
#         return len(self.dataset)

#     def __getitem__(self, idx):
#         item = self.dataset[idx] # Access item from Hugging Face Dataset
#         question_text = item['question']
#         answer_text = item['answer']
#         rel_path = item['full_image_path']

#         image_path_parts = rel_path.replace('\\', '/').split('/')
#         image_path = os.path.join(self.images_folder, *image_path_parts)

#         try:
#             image = Image.open(image_path).convert("RGB").resize(IMAGE_SIZE)
#         except Exception as e:
#             # print(f"Warning: Could not load image {image_path}: {e}. Returning None.")
#             return None # Will be filtered by collate_fn

#         # For inference, we only need to process image and question.
#         # Labels are not strictly needed by model.generate but useful for loss if calculated.
#         encoding = self.processor(
#             images=image,
#             text=question_text,
#             padding="max_length",
#             truncation=True,
#             max_length=QUESTION_MAX_LENGTH,
#             return_tensors="pt"
#         )
#         # Squeeze batch dimension from processor output
#         encoding = {k: v.squeeze(0) for k, v in encoding.items()}

#         # Store original texts for CSV and metrics
#         encoding["original_question"] = question_text
#         encoding["original_answer"] = answer_text
#         return encoding

# # --- Load and Prepare Data (exactly as in your training script) ---
# print("Loading full dataset for splitting...")
# full_ds = load_dataset("csv", data_files=CSV_PATH, split="train")
# print(f"Full dataset loaded with {len(full_ds)} rows.")

# num_rows_to_keep = 50000000 # Or however many you used
# total_rows_in_full_ds = len(full_ds)

# if total_rows_in_full_ds <= num_rows_to_keep:
#     ds_subset = full_ds
# else:
#     start_index = total_rows_in_full_ds - num_rows_to_keep
#     ds_subset = full_ds.select(indices=range(start_index, total_rows_in_full_ds))

# print(f"Preparing to split the subset of {len(ds_subset)} rows.")
# split = ds_subset.shuffle(seed=42).train_test_split(test_size=0.1) # Use the same seed and test_size
# # training_hf_dataset = split["train"] # Not needed for this script
# valid_hf_dataset = split["test"] # This is the split we want to infer on

# print(f"Using validation split with {len(valid_hf_dataset)} rows for inference.")

# # Filter validation set (as in training script)
# def is_valid_example_for_inference(example):
#     # Slightly different from training: VQADataset getitem handles image loading errors
#     if not (isinstance(example.get('question'), str) and example['question'].strip()): return False
#     if not (isinstance(example.get('answer'), str) and example['answer'].strip()): return False # Still need answer for GT
#     rel_path = example.get('full_image_path')
#     if not rel_path: return False
#     return True

# valid_hf_dataset_filtered = valid_hf_dataset.filter(is_valid_example_for_inference)
# print(f"Filtered validation set size for inference: {len(valid_hf_dataset_filtered)}")

# if len(valid_hf_dataset_filtered) == 0:
#     print("Validation dataset is empty after filtering. Exiting.")
#     exit()

# # Custom collate_fn for inference
# def inference_collate_fn(batch):
#     batch = [item for item in batch if item is not None] # Filter out None items from VQADataset
#     if not batch:
#         return None

#     original_questions = [item.pop("original_question") for item in batch]
#     original_answers = [item.pop("original_answer") for item in batch]

#     # Default collate for the rest (pixel_values, input_ids, attention_mask)
#     collated_tensors = torch.utils.data.dataloader.default_collate(batch)

#     return {
#         **collated_tensors,
#         "original_question": original_questions,
#         "original_answer": original_answers
#     }

# # Create DataLoader for the validation set
# inference_dataset = VQADataset(dataset=valid_hf_dataset_filtered, processor=processor, images_folder=IMAGES_BASE_FOLDER)
# inference_dataloader = DataLoader(inference_dataset, batch_size=INFERENCE_BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, collate_fn=inference_collate_fn)

# # --- Inference Loop & CSV Writing ---
# print(f"\nStarting inference on the validation set ({len(inference_dataset)} samples)...")
# print(f"Predictions will be saved to: {OUTPUT_PREDICTIONS_CSV}")

# generate_kwargs = {
#     "max_new_tokens": ANSWER_MAX_LENGTH,
#     "num_beams": 3, # Example, adjust as needed
# }

# total_samples_processed = 0
# with open(OUTPUT_PREDICTIONS_CSV, 'w', newline='', encoding='utf-8') as csvfile:
#     csv_writer = csv.writer(csvfile)
#     csv_writer.writerow(["Question", "GroundTruth_Answer", "Predicted_Answer"])

#     with torch.no_grad():
#         for batch in tqdm(inference_dataloader, desc="Inferring on validation set"):
#             if batch is None: # Skipped batch from collate_fn
#                 continue

#             pixel_values = batch["pixel_values"].to(device)
#             input_ids = batch["input_ids"].to(device)
#             attention_mask = batch["attention_mask"].to(device)
#             original_questions_batch = batch["original_question"]
#             original_answers_batch = batch["original_answer"]

#             autocast_enabled = torch.cuda.is_available() and LOAD_IN_FP16_INFERENCE
#             with torch.amp.autocast(device_type=device.type, dtype=torch.float16 if autocast_enabled else None, enabled=autocast_enabled):
#                 generated_ids = model.generate(
#                     pixel_values=pixel_values,
#                     input_ids=input_ids,
#                     attention_mask=attention_mask,
#                     **generate_kwargs
#                 )

#             generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

#             for i in range(len(generated_texts)):
#                 question_text = original_questions_batch[i].strip()
#                 truth_text = original_answers_batch[i].strip()
#                 pred_text = generated_texts[i].strip()
#                 csv_writer.writerow([question_text, truth_text, pred_text])
            
#             total_samples_processed += len(generated_texts)

# print(f"\nInference complete. {total_samples_processed} predictions saved to {OUTPUT_PREDICTIONS_CSV}")
# print("You can now run the metrics calculation script/cell using this CSV file.")

In [9]:
# import pandas as pd
# from bert_score import score as bert_score_calc
# from rouge_score import rouge_scorer
# import nltk
# from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# from nltk.translate.meteor_score import meteor_score
# from sacrebleu.metrics import BLEU as SacreBLEU
# import os # Added for NLTK custom path

# # --- Configuration ---
# PREDICTIONS_CSV_PATH = "/kaggle/working/r-8-1l-epochs-4.csv" # IMPORTANT: Use the CSV from Part 1
# BERT_SCORE_MODEL_TYPE = "microsoft/deberta-xlarge-mnli"

# # # --- NLTK Resource Download (Robust version) ---
# # def download_nltk_resource(resource_name_for_download, resource_path_to_check):
# #     try:
# #         nltk.data.find(resource_path_to_check)
# #         print(f"NLTK resource '{resource_name_for_download}' (checked as '{resource_path_to_check}') already available.")
# #     except LookupError:
# #         print(f"NLTK resource '{resource_name_for_download}' (checked as '{resource_path_to_check}') not found. Downloading...")
# #         try:
# #             custom_nltk_path = os.path.join(os.getcwd(), "custom_nltk_data_for_metrics")
# #             os.makedirs(custom_nltk_path, exist_ok=True)
# #             if custom_nltk_path not in nltk.data.path:
# #                  nltk.data.path.append(custom_nltk_path)

# #             nltk.download(resource_name_for_download, download_dir=custom_nltk_path, quiet=False)
# #             nltk.data.find(resource_path_to_check) # Verify
# #             print(f"NLTK resource '{resource_name_for_download}' downloaded successfully to custom path.")
# #         except Exception as e_custom:
# #             print(f"Failed to download to custom path ({e_custom}). Trying default NLTK download paths...")
# #             try:
# #                 nltk.download(resource_name_for_download, quiet=False)
# #                 nltk.data.find(resource_path_to_check) # Verify
# #                 print(f"NLTK resource '{resource_name_for_download}' downloaded successfully to default path.")
# #             except Exception as e_default:
# #                  print(f"Failed to download NLTK resource '{resource_name_for_download}' to any path. Error: {e_default}")
# #                  print("Please try manually: import nltk; nltk.download('all') or specific packages.")

# # print("Checking and downloading NLTK resources for metrics...")
# # download_nltk_resource('punkt', 'tokenizers/punkt')
# # download_nltk_resource('wordnet', 'corpora/wordnet')
# # download_nltk_resource('omw-1.4', 'corpora/omw-1.4')
# # print("NLTK resource check complete for metrics.")


# # --- Load Predictions ---
# try:
#     df = pd.read_csv(PREDICTIONS_CSV_PATH)
#     if df.empty:
#         print(f"CSV file '{PREDICTIONS_CSV_PATH}' is empty. No metrics to calculate.")
#         df = None
# except FileNotFoundError:
#     print(f"CSV file '{PREDICTIONS_CSV_PATH}' not found.")
#     df = None
# except Exception as e:
#     print(f"Error loading CSV: {e}")
#     df = None

# if df is not None:
#     # Ensure columns exist before trying to access them
#     if "Predicted_Answer" not in df.columns or "GroundTruth_Answer" not in df.columns:
#         print(f"Error: CSV file '{PREDICTIONS_CSV_PATH}' must contain 'Predicted_Answer' and 'GroundTruth_Answer' columns.")
#         df = None
#     else:
#         predictions_list = df["Predicted_Answer"].astype(str).apply(lambda x: x if x.lower() != 'nan' else "").tolist()
#         references_list = df["GroundTruth_Answer"].astype(str).apply(lambda x: x if x.lower() != 'nan' else "").tolist()

#         if not predictions_list or not references_list or len(predictions_list) != len(references_list):
#             print("No valid predictions or references found, or lists have mismatched lengths.")
#             df = None # Skip calculations if lists are problematic

# if df is not None:
#     print(f"Loaded {len(predictions_list)} predictions and references for metrics calculation.")

#     # --- 0. Exact Match Accuracy --- <--- ADDED HERE
#     print("\nCalculating Exact Match Accuracy...")
#     exact_matches = 0
#     total_samples = len(predictions_list)
#     if total_samples > 0:
#         for pred, ref in zip(predictions_list, references_list):
#             # Clean both prediction and reference for fair comparison
#             clean_pred = pred.strip().lower()
#             clean_ref = ref.strip().lower()
#             if clean_pred == clean_ref:
#                 exact_matches += 1
#         accuracy = exact_matches / total_samples
#         print(f"  Exact Match Accuracy: {accuracy:.4f} ({exact_matches}/{total_samples})")
#     else:
#         print("  No samples to calculate accuracy.")


#     # --- 1. BERTScore ---
#     print("\nCalculating BERTScore...")
#     try:
#         P, R, F1 = bert_score_calc(
#             predictions_list, references_list, lang="en", model_type=BERT_SCORE_MODEL_TYPE,
#             verbose=False, device=None, baseline_path=None
#         )
#         print(f"  BERTScore: Precision={P.mean():.4f}, Recall={R.mean():.4f}, F1={F1.mean():.4f}")
#     except Exception as e:
#         print(f"  Error calculating BERTScore: {e}")

#     # --- 2. ROUGE Score ---
#     print("\nCalculating ROUGE scores...")
#     try:
#         rouge_types = ['rouge1', 'rouge2', 'rougeL']
#         scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
#         sum_rouge1_f, sum_rouge2_f, sum_rougeL_f = 0, 0, 0
#         count = 0
#         for pred, ref in zip(predictions_list, references_list):
#             score = scorer.score(ref, pred) # ROUGE scorer usually expects (reference, prediction)
#             sum_rouge1_f += score['rouge1'].fmeasure
#             sum_rouge2_f += score['rouge2'].fmeasure
#             sum_rougeL_f += score['rougeL'].fmeasure
#             count +=1
#         if count > 0:
#             print(f"  ROUGE-1 F1: {sum_rouge1_f / count:.4f}")
#             print(f"  ROUGE-2 F1: {sum_rouge2_f / count:.4f}")
#             print(f"  ROUGE-L F1: {sum_rougeL_f / count:.4f}")
#         else: print("  No samples for ROUGE.")
#     except Exception as e:
#         print(f"  Error calculating ROUGE: {e}")

#     # --- 3. SacreBLEU Score ---
#     print("\nCalculating SacreBLEU score...")
#     try:
#         references_for_sacrebleu = [[ref] for ref in references_list] # SacreBLEU expects list of lists for references
#         sacrebleu_metric = SacreBLEU()
#         bleu_score = sacrebleu_metric.corpus_score(predictions_list, references_for_sacrebleu)
#         print(f"  SacreBLEU: {bleu_score.score / 100:.4f} (Score: {bleu_score.score:.2f})")
#     except Exception as e:
#         print(f"  Error calculating SacreBLEU: {e}")

#     # --- 4. NLTK METEOR Score ---
#     print("\nCalculating METEOR score...")
#     try:
#         sum_meteor = 0
#         count = 0
#         for pred, ref in zip(predictions_list, references_list):
#             pred_tokens = nltk.word_tokenize(pred.lower()) # Tokenize for METEOR
#             ref_tokens = nltk.word_tokenize(ref.lower())   # Tokenize for METEOR
#             # meteor_score expects a list of reference token lists, and one hypothesis token list
#             if not pred_tokens and not ref_tokens: # both empty after tokenization
#                 sum_meteor += 1.0 # Perfect match for empty strings
#             elif not pred_tokens or not ref_tokens: # one is empty
#                 sum_meteor += 0.0
#             else:
#                  sum_meteor += meteor_score([ref_tokens], pred_tokens)
#             count +=1
#         if count > 0:
#             print(f"  METEOR: {sum_meteor / count:.4f}")
#         else:
#             print("  No samples for METEOR.")
#     except Exception as e:
#         print(f"  Error calculating METEOR: {e}")

#     print("\nMetrics calculation complete.")
# else:
#     print("Skipping metrics calculation as data could not be loaded or was empty, or columns were missing.")

In [10]:
# pip install bert_score

In [11]:
# pip install rouge_score

In [12]:
# pip install nltk

In [13]:
# pip install sacrebleu