In [1]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert_score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.0.0->be

In [2]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... done
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=18af55bbac39439a520e952734fe220adbea07891ba8aabcee8a42fb34c0d209
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import torch
from transformers import BlipForQuestionAnswering, BlipProcessor
from PIL import Image
import csv
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import nltk

# Tokenizer data for nltk.word_tokenize, which the BLEU computation relies on.
# Older NLTK releases load 'punkt'; newer releases look for 'punkt_tab' instead,
# so fetch both to keep this cell working whichever NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')

# Check device (CUDA or CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the BLIP VQA checkpoint and its paired processor (image preprocessing
# plus tokenizer), then move the weights to the selected device.
model_name = "Salesforce/blip-vqa-base"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForQuestionAnswering.from_pretrained(model_name)
model.to(device)
# Inference only: switch off training-time behaviour such as dropout.
model.eval()

def load_eval_data(csv_path, images_root):
    """Read the evaluation CSV into a list of sample dicts.

    Args:
        csv_path: Path to a UTF-8 CSV file with a header row containing the
            columns "full_image_path", "question" and "answer".
        images_root: Directory that every "full_image_path" value is resolved
            against.

    Returns:
        A list of dicts with the keys "image_path" (images_root joined with the
        normalised relative path), "question" and "answer", in file order.

    Raises:
        OSError: If csv_path cannot be opened.
        KeyError: If a required column is missing from a row.
    """
    samples = []
    with open(csv_path, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            # Replace backslashes with forward slashes, then drop any leading
            # separator: os.path.join discards images_root entirely when the
            # second component is absolute (e.g. "\images\x.jpg" -> "/images/x.jpg").
            relative_path = row["full_image_path"].replace("\\", "/").lstrip("/")
            samples.append({
                "image_path": os.path.join(images_root, relative_path),
                "question": row["question"],
                "answer": row["answer"]
            })
    return samples



# --- Evaluation configuration ---
# Kaggle input locations: the question/answer CSV and the folder that the
# image paths listed in that CSV are resolved against.
CSV_PATH = "/kaggle/input/merged-training/merged_final.csv"
IMAGES_ROOT = "/kaggle/input/vrdatasets/abo-images-small"

# Number of samples sent through the model per step; tune to GPU memory.
BATCH_SIZE = 16

# Read every evaluation sample up front.
eval_data = load_eval_data(CSV_PATH, IMAGES_ROOT)

# Accumulators filled by the inference loop and consumed by the metrics.
predictions, references = [], []

# Metric helpers, built once and reused for every sample.
smooth_fn = SmoothingFunction().method1
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def preprocess_batch(batch_samples):
    """Turn one batch of samples into model inputs on the active device.

    Args:
        batch_samples: List of dicts, each providing "image_path" and
            "question".

    Returns:
        The processor output for the batch (padded, PyTorch tensors), moved
        to `device`.

    Raises:
        Any exception raised while opening or decoding an image, or by the
        processor; the caller decides whether to skip the batch.
    """
    images = []
    for sample in batch_samples:
        # The context manager closes the underlying file as soon as the RGB
        # copy exists, instead of leaving the handle to garbage collection.
        with Image.open(sample["image_path"]) as img:
            images.append(img.convert("RGB"))
    questions = [sample["question"] for sample in batch_samples]
    # Move to GPU by ensuring inputs are on the correct device
    inputs = processor(images=images, text=questions, return_tensors="pt", padding=True).to(device)
    return inputs

# Evaluation loop with progress bar
from tqdm import tqdm  # or use from tqdm.notebook import tqdm for notebooks

# Batches dropped because preprocessing or generation raised.
skipped_batches = 0

for i in tqdm(range(0, len(eval_data), BATCH_SIZE), desc="Running inference", unit="batch"):
    batch_samples = eval_data[i : i + BATCH_SIZE]
    batch_number = i // BATCH_SIZE + 1

    try:
        inputs = preprocess_batch(batch_samples)
    except Exception as e:
        print(f"❌ Error preprocessing batch {batch_number}: {e}")
        skipped_batches += 1
        continue

    try:
        # No gradients are needed for generation.
        with torch.no_grad():
            generated_ids = model.generate(**inputs)
        batch_predictions = processor.batch_decode(generated_ids, skip_special_tokens=True)
    except Exception as e:
        print(f"❌ Error during inference in batch {batch_number}: {e}")
        skipped_batches += 1
        continue

    # Record references only once the batch has produced predictions, so that
    # predictions[k] always pairs with references[k]. Appending references up
    # front would leave the two lists misaligned after any skipped batch.
    predictions.extend(batch_predictions)
    references.extend(s["answer"].strip() for s in batch_samples)

if skipped_batches:
    print(f"⚠️ Skipped {skipped_batches} batch(es); their samples are excluded from the metrics.")




# Every metric below pairs predictions[k] with references[k]. zip() would
# silently truncate on a length mismatch and score unrelated pairs, so fail
# loudly instead of reporting misleading numbers.
if len(predictions) != len(references):
    raise ValueError(
        f"predictions ({len(predictions)}) and references ({len(references)}) "
        "are misaligned; metrics would compare unrelated pairs."
    )

total = len(references)
if total == 0:
    # Avoids a ZeroDivisionError in the averages below.
    raise ValueError("No samples were evaluated; cannot compute metrics.")

# Calculate Exact Match (case-insensitive, surrounding whitespace ignored)
exact_matches = sum(
    1 for p, r in zip(predictions, references) if p.strip().lower() == r.strip().lower()
)

# Calculate BERTScore
P, R, F1 = bert_score(predictions, references, lang='en', rescale_with_baseline=True)

# Calculate BLEU scores
# NOTE(review): sentence_bleu uses its default n-gram weights here, which looks
# ill-suited to very short answers -- consider weights=(1, 0, 0, 0) and confirm
# the intended metric before changing it.
bleu_scores = []
for pred, ref in zip(predictions, references):
    pred_tokens = nltk.word_tokenize(pred.lower())
    ref_tokens = [nltk.word_tokenize(ref.lower())]
    bleu = sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smooth_fn)
    bleu_scores.append(bleu)

# Calculate ROUGE scores (scorer.score takes the reference first, then the prediction)
rouge1_scores = []
rougeL_scores = []
for pred, ref in zip(predictions, references):
    rouge_scores = scorer.score(ref, pred)
    rouge1_scores.append(rouge_scores['rouge1'].fmeasure)
    rougeL_scores.append(rouge_scores['rougeL'].fmeasure)

# Print results
print(f"Total samples evaluated: {total}")
print(f"Exact Match Accuracy: {exact_matches}/{total} = {exact_matches/total:.4f}")
print(f"BERTScore (F1): {F1.mean().item():.4f}")
print(f"BLEU Score (avg): {sum(bleu_scores)/len(bleu_scores):.4f}")
print(f"ROUGE-1 F1 Score (avg): {sum(rouge1_scores)/len(rouge1_scores):.4f}")
print(f"ROUGE-L F1 Score (avg): {sum(rougeL_scores)/len(rougeL_scores):.4f}")


2025-05-15 19:01:06.930095: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747335667.120718      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747335667.178498      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False

Using device: cuda


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Running inference: 100%|██████████| 8241/8241 [1:24:05<00:00,  1.63batch/s]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total samples evaluated: 131847
Exact Match Accuracy: 61456/131847 = 0.4661
BERTScore (F1): 0.7541
BLEU Score (avg): 0.0854
ROUGE-1 F1 Score (avg): 0.4864
ROUGE-L F1 Score (avg): 0.4864
