In [15]:
!pip install sacrebleu
!pip install rouge-score
!pip install bert-score
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [16]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModel
from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import numpy as np
from tqdm import tqdm
import evaluate

In [19]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained("adibvafa/BLIP-MIMIC-CXR")
model = BlipForConditionalGeneration.from_pretrained("adibvafa/BLIP-MIMIC-CXR").to(device)
model.eval()

BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-23): 24 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (projection): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((1024,),

In [21]:
dataset = load_dataset("Fakhraddin/NLMCXR")
split_to_use = "validation"
data = dataset[split_to_use]
print(f"Loaded {len(data)} examples from {split_to_use} split.")

Loaded 1505 examples from validation split.


In [22]:
# here i am defining batch size
BATCH_SIZE = 64

# function to handle batching
def collate_fn(batch):
    images = [item["image"] for item in batch]
    texts = [item["text"] for item in batch]
    inputs = processor(images=images, return_tensors="pt")
    return inputs, texts

# i am creating DataLoader
eval_dataloader = DataLoader(data, batch_size=BATCH_SIZE, collate_fn=collate_fn)

preds, refs = [], []

print(f"Starting generation with batch size: {BATCH_SIZE}...")

# batched generation loop for the data
with torch.no_grad():
    for inputs, batch_texts in tqdm(eval_dataloader, desc="Generating batches"):
        inputs = inputs.to(device)
        output_ids = model.generate(**inputs, max_new_tokens=50)

        # decoding the whole batch at once
        batch_preds = processor.batch_decode(output_ids, skip_special_tokens=True)

        preds.extend(batch_preds)
        refs.extend(batch_texts)

print(f"Generated {len(preds)} captions.")

Starting generation with batch size: 64...


Generating batches: 100%|██████████| 24/24 [02:22<00:00,  5.94s/it]

Generated 1505 captions.





In [23]:
print("Computing Cosine Similarity...")

# loading bert-base-uncased for BERT score calculation
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased").to(device)
bert_model.eval()

def get_embeddings_batched(text_list, batch_size=128):
    """
    Extracts CLS embeddings for a list of texts in batches to speed up processing.
    """
    all_embs = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i : i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            outputs = bert_model(**inputs)
        embs = outputs.last_hidden_state[:, 0, :]
        all_embs.append(embs.cpu())
    return torch.cat(all_embs, dim=0)

# get embedding for all preds and refs
pred_embs = get_embeddings_batched(preds)
ref_embs = get_embeddings_batched(refs)

# calculate cosine similarity
cosine_sims = torch.nn.functional.cosine_similarity(pred_embs, ref_embs, dim=1)
avg_cosine = cosine_sims.mean().item()

print(f"Avg. Cosine Similarity: {avg_cosine:.4f}")

Computing Cosine Similarity...
Avg. Cosine Similarity: 0.7011


In [24]:
# --- BLEU ---
print("Computing BLEU...")
avg_bleu = corpus_bleu(preds, [refs]).score
print(f"Avg. BLEU: {avg_bleu:.4f}")

# --- ROUGE ---
print("Computing ROUGE...")
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1s, rouge2s, rougeLs = [], [], []

for r, p in zip(refs, preds):
    s = scorer.score(r, p)
    rouge1s.append(s['rouge1'].fmeasure)
    rouge2s.append(s['rouge2'].fmeasure)
    rougeLs.append(s['rougeL'].fmeasure)

avg_rouge1 = np.mean(rouge1s)
avg_rouge2 = np.mean(rouge2s)
avg_rougeL = np.mean(rougeLs)

# --- BERTScore (Batched) ---
print("Computing BERTScore...")
bertscore_metric = evaluate.load("bertscore")
results = bertscore_metric.compute(predictions=preds, references=refs, model_type="bert-base-uncased", batch_size=64, device=device)
avg_bertscore = np.mean(results["f1"])

print("\n===== Final Evaluation Metrics =====")
print(f"Avg. Cosine Similarity: {avg_cosine:.4f}")
print(f"Avg. BLEU:              {avg_bleu:.4f}")
print(f"Avg. ROUGE-1:           {avg_rouge1:.4f}")
print(f"Avg. ROUGE-2:           {avg_rouge2:.4f}")
print(f"Avg. ROUGE-L:           {avg_rougeL:.4f}")
print(f"Avg. BERTScore (F1):    {avg_bertscore:.4f}")

Computing BLEU...
Avg. BLEU: 0.0828
Computing ROUGE...
Computing BERTScore...


Downloading builder script: 0.00B [00:00, ?B/s]


===== Final Evaluation Metrics =====
Avg. Cosine Similarity: 0.7011
Avg. BLEU:              0.0828
Avg. ROUGE-1:           0.0504
Avg. ROUGE-2:           0.0020
Avg. ROUGE-L:           0.0402
Avg. BERTScore (F1):    0.4208


In [29]:
for i in range(5):
    print(f"Reference: {refs[i]}")
    print(f"Prediction: {preds[i]}\n")

Reference: 1. Increased opacity in the right upper lobe with XXXX associated atelectasis may represent focal consolidation or mass lesion with atelectasis. Recommend chest CT for further evaluation. 2. XXXX opacity overlying the left 5th rib may represent focal airspace disease. There is XXXX increased opacity within the right upper lobe with possible mass and associated area of atelectasis or focal consolidation. The cardiac silhouette is within normal limits. XXXX opacity in the left midlung overlying the posterior left 5th rib may represent focal airspace disease. No pleural effusion or pneumothorax. No acute bone abnormality.
Prediction: final report chest radiograph indication on hazelburn syndrome, hazelburn syndrome, and hazelburn syndrome, five years ago, now with four weeks of cough and green sputum production, including one hour of fever and brown sputum production, five

Reference: Status post left mastectomy. Heart size normal. Lungs are clear.
Prediction: final report ches