In [6]:
import torch
from datasets import load_dataset
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModel
from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import numpy as np
from tqdm import tqdm

In [2]:
!pip install sacrebleu

Defaulting to user installation because normal site-packages is not writeable
Collecting sacrebleu
  Using cached sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Using cached sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Installing collected packages: sacrebleu
Successfully installed sacrebleu-2.5.1


In [5]:
!pip install rouge_score; bert_score 

Defaulting to user installation because normal site-packages is not writeable
/bin/bash: line 1: bert_score: command not found


In [7]:
dataset = load_dataset("Fakhraddin/NLMCXR")
split_to_use = "validation"  # or "train", can also combine
data = dataset[split_to_use]

In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [9]:
preds, refs = [], []

for example in tqdm(data, desc="Generating captions"):
    image = example["image"]
    ref_text = example["text"] 
    
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=50)
    
    pred = processor.decode(output[0], skip_special_tokens=True)
    preds.append(pred)
    refs.append(ref_text)

Generating captions: 100%|██████████| 1505/1505 [02:52<00:00,  8.72it/s]


In [10]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased").to(device)
bert_model.eval()

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    emb = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()  # CLS token
    return emb

cosines = []
for p, r in zip(preds, refs):
    e1, e2 = get_embedding(p), get_embedding(r)
    cos = np.dot(e1, e2) / (np.linalg.norm(e1) * np.linalg.norm(e2))
    cosines.append(cos)

avg_cosine = np.mean(cosines)
print(f"Avg. Cosine Similarity: {avg_cosine:.4f}")

Avg. Cosine Similarity: 0.6542


In [11]:
avg_bleu = corpus_bleu(preds, [refs]).score
print(f"Avg. BLEU: {avg_bleu:.4f}")

Avg. BLEU: 0.0054


In [12]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1s, rouge2s, rougeLs = [], [], []
for r, p in zip(refs, preds):
    s = scorer.score(r, p)
    rouge1s.append(s['rouge1'].fmeasure)
    rouge2s.append(s['rouge2'].fmeasure)
    rougeLs.append(s['rougeL'].fmeasure)
avg_rouge1 = np.mean(rouge1s)
avg_rouge2 = np.mean(rouge2s)
avg_rougeL = np.mean(rougeLs)

print(f"Avg. ROUGE-1: {avg_rouge1:.4f}")
print(f"Avg. ROUGE-2: {avg_rouge2:.4f}")
print(f"Avg. ROUGE-L: {avg_rougeL:.4f}")

Avg. ROUGE-1: 0.0344
Avg. ROUGE-2: 0.0022
Avg. ROUGE-L: 0.0326


In [14]:
import evaluate

# Load metric
bertscore = evaluate.load("bertscore")

# Compute BERTScore
results = bertscore.compute(predictions=preds, references=refs, model_type="bert-base-uncased")

# Get average F1 score
avg_bertscore = sum(results["f1"]) / len(results["f1"])

print(f"Avg. BERTScore (F1): {avg_bertscore:.4f}")

Avg. BERTScore (F1): 0.3781


In [15]:
print("\n===== Final Evaluation Metrics (BLIP Base + NLMCXR) =====")
print(f"Avg. Cosine Similarity: {avg_cosine:.4f}")
print(f"Avg. BLEU:              {avg_bleu:.4f}")
print(f"Avg. ROUGE-1:           {avg_rouge1:.4f}")
print(f"Avg. ROUGE-2:           {avg_rouge2:.4f}")
print(f"Avg. ROUGE-L:           {avg_rougeL:.4f}")
print(f"Avg. BERTScore (F1):    {avg_bertscore:.4f}")


===== Final Evaluation Metrics (BLIP Base + NLMCXR) =====
Avg. Cosine Similarity: 0.6542
Avg. BLEU:              0.0054
Avg. ROUGE-1:           0.0344
Avg. ROUGE-2:           0.0022
Avg. ROUGE-L:           0.0326
Avg. BERTScore (F1):    0.3781
