In [1]:
!pip uninstall -y pylibcudagraph-cu12 rmm-cu12 && pip install  pandas numpy pillow torch torchvision transformers scikit-learn timeout-decorator peft accelerate datasets bitsandbytes bert-score rouge-score rapidfuzz sentence-transformers evaluate

[0mFound existing installation: rmm-cu12 25.2.0
Uninstalling rmm-cu12-25.2.0:
  Successfully uninstalled rmm-cu12-25.2.0
Collecting timeout-decorator
  Downloading timeout-decorator-0.5.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (fro

In [2]:
import os
import gc
from pathlib import Path
import pandas as pd
import numpy as np
from PIL import Image
import torch
from transformers import (
    BlipProcessor,
    BlipForQuestionAnswering,
    AutoProcessor,
    Trainer,
    TrainingArguments
)
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model
from tqdm import tqdm
from timeout_decorator import timeout, TimeoutError
from accelerate import Accelerator
from transformers.data.data_collator import default_data_collator
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    f1_score
)
from bert_score import BERTScorer, score
from rouge_score import rouge_scorer
from rapidfuzz.distance import Levenshtein
from sentence_transformers import SentenceTransformer, util
import evaluate

2025-05-16 09:23:33.082818: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747387413.305983      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747387413.372882      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
train_df = pd.read_csv('/kaggle/input/datasetfinal/train.csv')
val_df = pd.read_csv('/kaggle/input/datasetfinal/val.csv')
test_df = pd.read_csv('/kaggle/input/datasetfinal/test.csv')
print(f"Loaded training dataset with {len(train_df)} entries")
print(f"Loaded validation dataset with {len(val_df)} entries")
print(f"Loaded testing dataset with {len(test_df)} entries")

Loaded training dataset with 80001 entries
Loaded validation dataset with 20001 entries
Loaded testing dataset with 20001 entries


In [4]:
val_df["full_image_path"] = val_df["full_image_path"].apply(lambda p: p.replace("\\", "/"))
val_df["full_image_path"] = val_df["full_image_path"].apply(lambda p: str(Path("/kaggle/input") / p))
train_df["full_image_path"] = train_df["full_image_path"].apply(lambda p: p.replace("\\", "/"))
train_df["full_image_path"] = train_df["full_image_path"].apply(lambda p: str(Path("/kaggle/input") / p))
test_df["full_image_path"] = test_df["full_image_path"].apply(lambda p: str(Path("/kaggle/input") / p))
test_df["full_image_path"] = test_df["full_image_path"].apply(lambda p: p.replace("\\", "/"))

this class defines a dataset for visual question answering that takes a dataframe and a processor as inputs. it handles loading images with a timeout to avoid long delays and uses a fallback black image if the image file is missing or can’t be loaded. Each data point includes an image, a question, and an answer. The processor prepares the image and question text into tokenized inputs with attention masks, while the answer is separately tokenized as labels with a fixed maximum length. All tensors are squeezed to remove batch dimensions before returning, making the data ready for training or evaluation.

In [5]:
class VQADataset(torch.utils.data.Dataset):
    def __init__(self, df, processor):
        self.df = df
        self.processor = processor

    def __len__(self):
        return len(self.df)

    @timeout(10)  # Timeout after 10 seconds for image loading/processing
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = row['full_image_path']
        try:
            if not os.path.exists(image_path):
                print(f"Image not found: {image_path}")
                image = Image.new("RGB", (224, 224), (0, 0, 0))  # Fallback image
            else:
                image = Image.open(image_path).convert("RGB")
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            image = Image.new("RGB", (224, 224), (0, 0, 0))  # Fallback image
        
        question = row['question']
        answer = row['answer']

        # Process image and question with attention mask
        encoding = self.processor(
            images=image,
            text=question,
            padding="max_length",
            max_length=128,
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True
        )
        
        # Tokenize answer as labels with fixed length
        labels = self.processor.tokenizer(
            answer,
            padding="max_length",
            truncation=True,
            max_length=32,
            return_tensors="pt"
        )["input_ids"]

        # Remove batch dimension from tensors
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding["labels"] = labels.squeeze(0)

        return encoding

In [None]:
# initializing the accelerator for efficient training and inference across devices.  
# importing the blip processor class for processing images and questions with a fast tokenizer.  
# loading the blip model pretrained for question answering tasks.

accelerator = Accelerator()
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base", use_fast=True)
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

In [7]:
train_dataset = VQADataset(train_df, processor)
val_dataset = VQADataset(val_df, processor)

In [None]:
# creating a lora configuration with specified rank, alpha, target modules, dropout, and no bias.  
# applying lora parameter-efficient fine-tuning to the base model using the config.  
# printing confirmation that lora is applied to the model.  
# preparing the model with accelerator for optimized multi-device training or inference.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, lora_config)
print("LoRA applied to the model")

# Prepare model with Accelerator
model = accelerator.prepare(model)

LoRA applied to the model


In [None]:
# setting up training arguments with output directory, run name, number of epochs, batch size, gradient accumulation, learning rate, weight decay, logging configuration, saving strategy, mixed precision, and reporting disabled.  
# creating a trainer instance using the prepared model, training arguments, training and validation datasets, and a default data collator for batching.
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    run_name="blip_vqa_lora_finetune",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='/kaggle/working/logs',
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    remove_unused_columns=False,
    report_to="none"
)

# Create Trainer instance with default data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=default_data_collator,
)

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
if torch.cuda.is_available():
    print("GPU Memory Usage Before Training:")
    print(torch.cuda.memory_summary())

GPU Memory Usage Before Training:
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   1479 MiB |   1479 MiB |   1479 MiB |      0 B   |
|       from large pool |   1468 MiB |   1468 MiB |   1468 MiB |      0 B   |
|       from small pool |     10 MiB |     10 MiB |     10 MiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |   1479 MiB |   1479 MiB |   1479 MiB |      0 B   |
|       from large pool |   1468 MiB |   1468 MiB |   1468 MiB |      0 B   |
|       from small pool |     10 MiB |     10 MiB |     10 MiB |      0 B   |
|-----------------------------

In [11]:
trainer.train()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
10,10.22
20,10.02
30,9.8225
40,9.6396
50,9.4945
60,9.3697
70,9.2166
80,9.0908
90,8.9652
100,8.8752




TrainOutput(global_step=7500, training_loss=8.150731196085612, metrics={'train_runtime': 24966.6611, 'train_samples_per_second': 9.613, 'train_steps_per_second': 0.3, 'total_flos': 6.253478163804365e+16, 'train_loss': 8.150731196085612, 'epoch': 2.9991000899910008})

In [None]:
# evaluating the fine-tuned model on the test dataset using the trainer’s evaluate method.  
# printing the evaluation results to monitor model performance.  
# saving the fine-tuned model to the specified directory for future use or deployment.  
# confirming the model save location with a print statement.

eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)
trainer.save_model("/kaggle/working/fine_tuned_blip_vqa_lora")
print("Model saved to '/kaggle/working/fine_tuned_blip_vqa_lora'")



Evaluation Results: {'eval_runtime': 1150.2138, 'eval_samples_per_second': 17.389, 'eval_steps_per_second': 1.088, 'epoch': 2.9991000899910008}
Model saved to '/kaggle/working/fine_tuned_blip_vqa_lora'


In [None]:
skipped_entries = []

# Function to predict answer with timeout
@timeout(10) 
def predict_answer(image_path, question):
    try:
        if not os.path.exists(image_path):
            print(f"Image not found: {image_path}")
            return ""
        image = Image.open(image_path).convert("RGB")

        # Prepare inputs for BLIP VQA
        inputs = processor(images=image, text=question, return_tensors="pt", padding=True).to(device)

        # Use generate for inference
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=20)
        predicted_answer = processor.decode(output_ids[0], skip_special_tokens=True).strip()

        # Extract one-word answer (post-process if needed)
        predicted_answer = predicted_answer.split()[-1] if predicted_answer else ""

        del inputs, output_ids
        gc.collect()
        torch.cuda.empty_cache()
        return predicted_answer
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return ""

In [14]:
accelerator = Accelerator()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base",use_fast = True)
base_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
# base_model_copy = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
model = PeftModel.from_pretrained(base_model, "/kaggle/working/fine_tuned_blip_vqa_lora/")
model = accelerator.prepare(model)
model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): BlipForQuestionAnswering(
      (vision_model): BlipVisionModel(
        (embeddings): BlipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
        )
        (encoder): BlipEncoder(
          (layers): ModuleList(
            (0-11): 12 x BlipEncoderLayer(
              (self_attn): BlipAttention(
                (dropout): Dropout(p=0.0, inplace=False)
                (qkv): Linear(in_features=768, out_features=2304, bias=True)
                (projection): Linear(in_features=768, out_features=768, bias=True)
              )
              (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (mlp): BlipMLP(
                (activation_fn): GELUActivation()
                (fc1): Linear(in_features=768, out_features=3072, bias=True)
                (fc2): Linear(in_features=3072, out_features=768, bias=True)
              )
              (layer_norm2): Lay

In [15]:
import os
os.makedirs("/kaggle/working/resultCSV", exist_ok=True)

In [16]:
# Resume support with handling for empty CSV
start_idx = 0
pred_path = '/kaggle/working/resultCSV/predictions.csv'
if os.path.exists(pred_path):
    try:
        existing = pd.read_csv(pred_path)
        if not existing.empty:
            start_idx = len(existing)
            print(f"Resuming from index {start_idx}")
        else:
            print(f"Prediction file {pred_path} is empty, starting from index 0")
    except pd.errors.EmptyDataError:
        print(f"Prediction file {pred_path} is empty, starting from index 0")
else:
    print(f"No existing prediction file found, starting from index 0")

#Lists to store predictions and ground truth
df = test_df
predictions = []
ground_truths = df["answer"].tolist()
y_true = []
y_pred = []
results = []
all_preds = []
# Predict answers for each row
for idx, row in tqdm(df.reset_index(drop=True).iterrows(), total=len(df), desc="Processing"):
    if idx < start_idx:
        continue  # Skip already processed rows
    full_image_path = row["full_image_path"]
    question = row["question"]
    try:
        predicted = predict_answer(full_image_path, question)
    except TimeoutError:
        print(f"Timeout processing row {idx}: {full_image_path}")
        skipped_entries.append({"row": idx, "full_image_path": full_image_path, "question": question})
        predicted = ""
    predictions.append(predicted)

    y_true.append(str(ground_truths[idx]).lower())
    y_pred.append(str(predicted).lower())
    
    all_preds.append(str(predicted).lower())
    
    results.append({
        "img_path": full_image_path,
        "question": question,
        "true_answer": str(ground_truths[idx]).lower(),
        "predicted_answer": str(predicted).lower()
    })
    if (idx + 1) % 1000 == 0 or (idx + 1) == len(df):
        # Convert to DataFrame
        chunk_df = pd.DataFrame(results)

        # Save predictions
        if not os.path.exists(pred_path):
            chunk_df.to_csv(pred_path, mode='w', index=False, header=True)
        else:
            chunk_df.to_csv(pred_path, mode='a', index=False, header=False)

        print(f"Checkpoint saved at index {idx + 1}")
        # Reset for next chunk
        results = []
        y_true = []
        y_pred = []

No existing prediction file found, starting from index 0


Processing:   5%|▍         | 1000/20001 [09:08<2:57:00,  1.79it/s]

Checkpoint saved at index 1000


Processing:  10%|▉         | 2000/20001 [18:29<2:50:53,  1.76it/s]

Checkpoint saved at index 2000


Processing:  15%|█▍        | 3000/20001 [27:51<2:40:23,  1.77it/s]

Checkpoint saved at index 3000


Processing:  20%|█▉        | 4000/20001 [37:12<2:30:18,  1.77it/s]

Checkpoint saved at index 4000


Processing:  25%|██▍       | 5000/20001 [46:35<2:23:00,  1.75it/s]

Checkpoint saved at index 5000


Processing:  30%|██▉       | 6000/20001 [55:59<2:10:53,  1.78it/s]

Checkpoint saved at index 6000


Processing:  35%|███▍      | 7000/20001 [1:05:19<2:02:54,  1.76it/s]

Checkpoint saved at index 7000


Processing:  40%|███▉      | 8000/20001 [1:14:41<1:52:03,  1.78it/s]

Checkpoint saved at index 8000


Processing:  45%|████▍     | 9000/20001 [1:23:59<1:41:39,  1.80it/s]

Checkpoint saved at index 9000


Processing:  50%|████▉     | 10000/20001 [1:33:21<1:33:47,  1.78it/s]

Checkpoint saved at index 10000


Processing:  55%|█████▍    | 11000/20001 [1:42:42<1:26:23,  1.74it/s]

Checkpoint saved at index 11000


Processing:  60%|█████▉    | 12000/20001 [1:52:05<1:16:02,  1.75it/s]

Checkpoint saved at index 12000


Processing:  65%|██████▍   | 13000/20001 [2:01:26<1:05:47,  1.77it/s]

Checkpoint saved at index 13000


Processing:  70%|██████▉   | 14000/20001 [2:10:47<56:26,  1.77it/s]

Checkpoint saved at index 14000


Processing:  75%|███████▍  | 15000/20001 [2:20:07<47:01,  1.77it/s]

Checkpoint saved at index 15000


Processing:  80%|███████▉  | 16000/20001 [2:29:26<37:59,  1.76it/s]

Checkpoint saved at index 16000


Processing:  85%|████████▍ | 17000/20001 [2:38:45<27:49,  1.80it/s]

Checkpoint saved at index 17000


Processing:  90%|████████▉ | 18000/20001 [2:48:06<18:48,  1.77it/s]

Checkpoint saved at index 18000


Processing:  95%|█████████▍| 19000/20001 [2:57:20<09:06,  1.83it/s]

Checkpoint saved at index 19000


Processing: 100%|█████████▉| 20000/20001 [3:06:30<00:00,  1.81it/s]

Checkpoint saved at index 20000


Processing: 100%|██████████| 20001/20001 [3:06:30<00:00,  1.79it/s]

Checkpoint saved at index 20001





In [17]:
# Normalize case
preds = pd.read_csv('/kaggle/working/resultCSV/predictions.csv')
preds = preds.dropna()
# preds = preds.iloc[1:].reset_index(drop=True)
all_preds_BLIP = preds['predicted_answer'].tolist()
all_actuals_BLIP = preds['true_answer'].tolist()

In [18]:

preds_l = [p.lower() for p in all_preds_BLIP if isinstance(p, str)]
refs_l = [r.lower() for r in all_actuals_BLIP if isinstance(r, str)]


# Compute exact-match binary metrics
y_pred_bin = [int(p == r) for p, r in zip(preds_l, refs_l)]
y_true_bin = [1] * len(all_actuals_BLIP)

acc = accuracy_score(y_true_bin, y_pred_bin)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true_bin, y_pred_bin, average="binary", zero_division=0
)

# Print metrics
print(f"Exact-match Accuracy: {acc:.3f}")
print(f"Exact-match Precision: {prec:.3f}")
print(f"Exact-match Recall:    {rec:.3f}")
print(f"Exact-match F1:        {f1:.3f}\n")

# Save predictions and ground truths to CSV
# pred_ref_df = pd.DataFrame({
#     "Prediction": predictions,
#     "Ground_Truth": refs
# })
# df.to_csv('prediction_output.csv', index=False)

Exact-match Accuracy: 0.624
Exact-match Precision: 1.000
Exact-match Recall:    0.624
Exact-match F1:        0.768



In [19]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge1_scores = []
rougeL_scores = []

for pred, ref in zip(preds_l, refs_l):
    scores = scorer.score(ref, pred)
    rouge1_scores.append(scores["rouge1"].fmeasure)
    rougeL_scores.append(scores["rougeL"].fmeasure)

print(f"ROUGE Scores:")
print(f"ROUGE-1 F1: {np.mean(rouge1_scores):.4f}")
print(f"ROUGE-L F1: {np.mean(rougeL_scores):.4f}")

ROUGE Scores:
ROUGE-1 F1: 0.6367
ROUGE-L F1: 0.6367


loading the pretrained sentence transformer model for embedding sentences. It encodes lists of predicted and reference sentences into dense vector representations on the gpu. Then, it computes cosine similarity scores between corresponding pairs of predicted and reference embeddings. Finally, it prints out the average cosine similarity across all pairs as a measure of how close the predicted sentences are to the references in semantic space.

In [None]:

model = SentenceTransformer('all-MiniLM-L6-v2')
pred_embeds = model.encode(preds_l, convert_to_tensor=True, device='cuda')
ref_embeds = model.encode(refs_l, convert_to_tensor=True, device='cuda')

cos_sims = [
    util.cos_sim(pred_embeds[i], ref_embeds[i]).item()
    for i in range(len(preds_l))
]

print(f"Sentence-BERT Cosine Similarity:")
print(f"Average Cosine Similarity: {np.mean(cos_sims):.4f}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Sentence-BERT Cosine Similarity:
Average Cosine Similarity: 0.8165
