In [None]:
!pip install sacrebleu
!pip install rouge-score
!pip install evaluate
!pip install bert-score

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [10]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from tqdm import tqdm
import numpy as np
from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer
import evaluate

### Setting up model on GPU

In [9]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Checkpoint under evaluation.
model_path = "OpenGVLab/InternVL2_5-1B"

# trust_remote_code lets transformers run the custom code bundled with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    # Flash-attention kernels are CUDA-only, so only request them when the
    # model is actually going to a GPU; this keeps the CPU fallback usable.
    use_flash_attn=(device == "cuda"),
).to(device)
model.eval()

# Set img_context_token_id by hand; leaving it unset triggers an assertion error.
model.img_context_token_id = tokenizer.convert_tokens_to_ids('<IMG_CONTEXT>')
print(f"IMG_CONTEXT_TOKEN ID set to: {model.img_context_token_id}")

Using device: cuda
IMG_CONTEXT_TOKEN ID set to: 151667


### Downloading dataset

In [11]:
# Load the NLMCXR dataset and keep only the split that will be scored.
dataset = load_dataset("Fakhraddin/NLMCXR")
split_to_use = "validation"
data = dataset[split_to_use]
print(f"Loaded {len(data)} examples.")

# Per-channel mean / std consumed by the Normalize step of the image transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size=448):
    """Build the image preprocessing pipeline.

    Args:
        input_size: Side length, in pixels, of the square image produced by
            the resize step.

    Returns:
        A ``T.Compose`` that converts the image to RGB when needed, resizes it
        with bicubic interpolation, turns it into a tensor and normalizes it
        with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    def _ensure_rgb(img):
        # RGB images pass through untouched; any other mode is converted.
        if img.mode == 'RGB':
            return img
        return img.convert('RGB')

    target_size = (input_size, input_size)
    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize(target_size, interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)


# Pipeline instance with the default input size, used by the batching code.
transform = build_transform()

Loaded 1505 examples.


### Batched generation and prompts for InternVL

In [12]:
# Number of dataset rows per DataLoader batch during generation.
BATCH_SIZE = 32

# collate function for batching
def collate_fn(batch):
    """Turn a list of dataset rows into inputs for generation.

    Args:
        batch: Sequence of rows, each providing an ``"image"`` accepted by
            ``transform`` and a ``"text"`` reference caption.

    Returns:
        Tuple ``(model_inputs, pixel_values, raw_texts)``: the tokenizer output
        for the prompts, the transformed images stacked along a new leading
        batch dimension, and the reference texts in batch order.
    """
    # The image is represented in the prompt by 256 <IMG_CONTEXT> placeholders
    # wrapped in <img>...</img> (InternVL 2.5 uses 256 tokens per 448x448 image).
    image_placeholder = "".join(["<img>", "<IMG_CONTEXT>" * 256, "</img>"])
    instruction = "Describe this medical image."
    # NOTE(review): the prompt is raw text, not wrapped in a chat template —
    # confirm this is the intended prompting format for the model.
    prompt_text = f"{image_placeholder}\n{instruction}"

    # Every row shares the same prompt; only the image and reference differ.
    pixel_values = torch.stack([transform(row["image"]) for row in batch])
    prompts = [prompt_text for _ in batch]
    raw_texts = [row["text"] for row in batch]

    # Tokenize all prompts together so they come back as padded tensors.
    model_inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=2048,
    )

    return model_inputs, pixel_values, raw_texts

# Decoder-only generation continues from the last position of every row, so
# padding has to sit on the left. All prompts built by collate_fn are currently
# identical (so no padding is added), but this keeps batching correct if the
# prompts ever differ in length.
tokenizer.padding_side = "left"

# Upper bound on the number of tokens generated per caption.
MAX_NEW_TOKENS = 50

eval_dataloader = DataLoader(data, batch_size=BATCH_SIZE, collate_fn=collate_fn)

# Reset on every run of the cell so re-execution does not accumulate duplicates.
preds, refs = [], []
print(f"Starting generation with batch size: {BATCH_SIZE}...")

with torch.no_grad():
    for model_inputs, pixel_values, batch_texts in tqdm(eval_dataloader, desc="Generating"):
        input_ids = model_inputs["input_ids"].to(device)
        attention_mask = model_inputs["attention_mask"].to(device)

        # The model weights are bfloat16, so the images must match that dtype.
        pixel_values = pixel_values.to(device).to(torch.bfloat16)

        # Greedy decoding (do_sample=False) keeps the run deterministic.
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

        batch_preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        preds.extend(batch_preds)
        refs.extend(batch_texts)

# The metrics pair preds[i] with refs[i]; fail loudly if the lists ever drift apart.
assert len(preds) == len(refs), f"{len(preds)} predictions vs {len(refs)} references"
print(f"Generated {len(preds)} captions.")

Starting generation with batch size: 32...


Generating: 100%|██████████| 48/48 [03:03<00:00,  3.83s/it]

Generated 1505 captions.





### Computing performance metrics

In [14]:
# Report how many predictions are about to be scored.
print(f"Evaluating {len(preds)} samples...")


## Cosine similarity
print("Computing Cosine Similarity...")

# bert-base-uncased provides the text embeddings for the cosine metric.
eval_model_name = "bert-base-uncased"
eval_tokenizer = AutoTokenizer.from_pretrained(eval_model_name)
eval_model = AutoModel.from_pretrained(eval_model_name).to(device)
# Switch the embedding model to inference mode.
eval_model.eval()

def get_embeddings_batched(text_list, batch_size=128):
    """Embed texts with ``eval_model``, one mini-batch at a time.

    Args:
        text_list: List of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        A CPU tensor with one row per input text, holding the hidden state of
        the first (CLS) token from the model's last layer.
    """
    chunks = []
    with torch.no_grad():
        for start in range(0, len(text_list), batch_size):
            stop = start + batch_size
            # Inputs are padded to the longest text in the chunk and cut at 128 tokens.
            encoded = eval_tokenizer(
                text_list[start:stop],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=128,
            ).to(device)

            hidden_states = eval_model(**encoded).last_hidden_state
            # Keep only position 0 and move it off the device right away.
            chunks.append(hidden_states[:, 0, :].cpu())

    return torch.cat(chunks, dim=0)

# Embed predictions and references separately; row i of each tensor belongs to sample i.
pred_embs = get_embeddings_batched(preds)
ref_embs = get_embeddings_batched(refs)

# Row-wise cosine similarity between each prediction and its reference,
# then the mean over all samples.
# cosine_similarity expects tensors of shape (N, D)
cosine_sims = torch.nn.functional.cosine_similarity(pred_embs, ref_embs, dim=1)
avg_cosine = cosine_sims.mean().item()

print(f"Done. Avg Cosine: {avg_cosine:.4f}")

# BLEU score: one corpus-level value over all pairs, not a per-sample mean.
print("Computing BLEU...")
# sacrebleu expects references as a list of lists (for multiple refs per image)
# NOTE(review): sacreBLEU reports .score on a 0-100 scale, unlike the 0-1
# metrics computed alongside it — confirm before comparing the numbers.
avg_bleu = corpus_bleu(preds, [refs]).score

# ROUGE: score every (reference, prediction) pair, then average the F-measures.
print("Computing ROUGE...")
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# The reference is passed first and the prediction second.
pair_scores = [
    scorer.score(ref_text, pred_text)
    for ref_text, pred_text in zip(refs, preds)
]

# Per-sample F-measure for each ROUGE variant.
rouge1_scores = [pair['rouge1'].fmeasure for pair in pair_scores]
rouge2_scores = [pair['rouge2'].fmeasure for pair in pair_scores]
rougeL_scores = [pair['rougeL'].fmeasure for pair in pair_scores]

avg_rouge1 = np.mean(rouge1_scores)
avg_rouge2 = np.mean(rouge2_scores)
avg_rougeL = np.mean(rougeL_scores)

## BERTScore
print("Computing BERTScore...")
# Load the metric through the `evaluate` library and score all pairs in one call.
bertscore_metric = evaluate.load("bertscore")
results = bertscore_metric.compute(
    predictions=preds,
    references=refs,
    model_type="bert-base-uncased",
    batch_size=64,
    device=device
)

# Collapse the F1 values returned under "f1" into a single average.
avg_bertscore = np.mean(results["f1"])

# Final summary. Every metric except BLEU is a mean over samples; BLEU is a
# single corpus-level sacreBLEU score on a 0-100 scale, so it is labelled as
# such instead of being presented as another 0-1 average.
print("\n" + "="*40)
print("  Evaluation Report: InternVL2.5-1B")
print("="*40)
print(f"Avg. Cosine Similarity: {avg_cosine:.4f}")
print(f"Corpus BLEU (0-100):    {avg_bleu:.4f}")
print(f"Avg. ROUGE-1:           {avg_rouge1:.4f}")
print(f"Avg. ROUGE-2:           {avg_rouge2:.4f}")
print(f"Avg. ROUGE-L:           {avg_rougeL:.4f}")
print(f"Avg. BERTScore (F1):    {avg_bertscore:.4f}")
print("="*40)

Evaluating 1505 samples...
Computing Cosine Similarity...
Done. Avg Cosine: 0.6342
Computing BLEU...
Computing ROUGE...
Computing BERTScore...

  Evaluation Report: InternVL2.5-1B
Avg. Cosine Similarity: 0.6342
Avg. BLEU:              0.3246
Avg. ROUGE-1:           0.0740
Avg. ROUGE-2:           0.0060
Avg. ROUGE-L:           0.0552
Avg. BERTScore (F1):    0.3688
