In [1]:
!wget http://images.cocodataset.org/zips/val2017.zip
!unzip -q val2017.zip

--2025-11-24 23:00:42--  http://images.cocodataset.org/zips/val2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 54.231.161.249, 52.217.86.108, 3.5.10.213, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|54.231.161.249|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 815585330 (778M) [application/zip]
Saving to: ‘val2017.zip’


2025-11-24 23:01:30 (16.2 MB/s) - ‘val2017.zip’ saved [815585330/815585330]



In [5]:
import os
import torch
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification

In [9]:
# --- Benchmark CSV -----------------------------------------------------
# File read by the benchmark cell, plus the names of the columns it uses.
CSV_PATH = "/content/COCO_val_mcq_llama3.1_rephrased.csv"
IMAGE_COL = "image_path"              # per-row path of the image
LABEL_COL = "correct_answer"          # per-row index of the correct caption (0–3)
CAPTION_COL_TEMPLATE = "caption_{}"   # formatted with the choice index: caption_0, caption_1, ...

# Number of candidate captions read from each row.
NUM_CHOICES = 4

# --- Images ------------------------------------------------------------
# Directory the image paths are joined onto once the benchmark loop has
# stripped their leading prefix; "." because val2017.zip is unpacked into
# the working directory.
IMAGE_ROOT = "."

# --- Compute -----------------------------------------------------------
# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [7]:
# One name for the checkpoint so the processor (which prepares the text and
# image inputs) and the model weights are always loaded from the same one.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

processor = AutoProcessor.from_pretrained(CLIP_CHECKPOINT)

# Load the weights, then move the model onto the configured device.
model = AutoModelForZeroShotImageClassification.from_pretrained(CLIP_CHECKPOINT)
model = model.to(DEVICE)

In [None]:
# Put the model in evaluation mode before running inference.
model.eval()

# Load the multiple-choice benchmark: one row per image.
df = pd.read_csv(CSV_PATH)

# Fail fast, with a readable message, if the CSV lacks any column that the
# configuration names; otherwise the problem would only surface as a bare
# KeyError on the first row of the benchmark loop.
required_cols = [IMAGE_COL, LABEL_COL] + [
    CAPTION_COL_TEMPLATE.format(i) for i in range(NUM_CHOICES)
]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"{CSV_PATH} is missing expected column(s): {missing_cols}")

print(f"Loaded {len(df)} examples")

# ------------------------------------------------------------------
# Benchmark loop
# ------------------------------------------------------------------
# For every CSV row, CLIP scores the image against its NUM_CHOICES candidate
# captions. The highest-scoring caption is the prediction, which is compared
# with the index stored in LABEL_COL. Predictions and labels are also kept
# in all_preds / all_labels.
correct = 0
total = 0

# Leading directory to drop from the stored image paths before they are
# joined onto IMAGE_ROOT.
prefix = 'data/coco/images/'
prefix_len = len(prefix)

all_preds = []
all_labels = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    # 1. Load image
    # Strip the prefix only when it is actually present; slicing blindly
    # would chop the first prefix_len characters off a path that is stored
    # in a different layout and silently point at the wrong file.
    img_col_adapted = row[IMAGE_COL]
    if img_col_adapted.startswith(prefix):
        img_col_adapted = img_col_adapted[prefix_len:]
    img_path = os.path.join(IMAGE_ROOT, img_col_adapted)

    # convert() returns a fully loaded image, so the underlying file can be
    # closed immediately instead of leaving one open handle per example.
    with Image.open(img_path) as img_file:
        image = img_file.convert("RGB")

    # 2. Build list of candidate captions
    captions = [row[CAPTION_COL_TEMPLATE.format(i)] for i in range(NUM_CHOICES)]

    # 3. Run CLIP
    # truncation=True: CLIP's text encoder has a fixed maximum sequence
    # length (77 tokens for this checkpoint); over-long captions are cut to
    # it instead of raising a shape error inside the model.
    inputs = processor(
        text=captions,
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(DEVICE)

    with torch.no_grad():
        outputs = model(**inputs)
        # logits_per_image shape: (1, num_captions)
        logits = outputs.logits_per_image
        pred_idx = logits.argmax(dim=-1).item()

    # 4. Score the prediction against the stored answer index
    label = int(row[LABEL_COL])
    all_preds.append(pred_idx)
    all_labels.append(label)

    if pred_idx == label:
        correct += 1
    total += 1

# Guard against an empty CSV so the summary never divides by zero.
accuracy = correct / total if total > 0 else 0.0
print(f"Accuracy: {accuracy:.4f}  ({correct}/{total})")

Loaded 5914 examples


  0%|          | 0/5914 [00:00<?, ?it/s]