In [1]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
import json
import os

with open("../data/flickr_test_images.json", "r") as f:
    test_image_filenames = json.load(f)

image_dir = "../data/Flickr8k_Dataset/Flicker8k_Dataset"
test_image_paths = [os.path.join(image_dir, img_name) for img_name in test_image_filenames]

In [3]:
from tqdm import tqdm

blip_predictions = {}

for img_path in tqdm(test_image_paths):
    try:
        raw_image = Image.open(img_path).convert('RGB')
        inputs = processor(raw_image, return_tensors="pt").to(device)
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)

        img_name = os.path.basename(img_path)
        blip_predictions[img_name] = caption
    except Exception as e:
        print(f"Error processing {img_path}: {e}")


  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]`cache.key_cache[idx]` is deprecated and will be removed in v4.56.0. Use `cache.layers[idx].keys` instead.
`cache.value_cache[idx]` is deprecated and will be removed in v4.56.0. Use `cache.layers[idx].values` instead.
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [27:44<00:00,  1.66s/it]


In [4]:
with open("../data/flickr_encoded_captions.json", "r") as f:
    gt_results = json.load(f)

In [5]:
with open("../data/flickr_vocab.json", "r") as f:
    vocab_data = json.load(f)
    idx2word = {int(k): v for k, v in vocab_data['idx2word'].items()}  

with open("../data/flickr_encoded_captions.json", "r") as f:
    gt_encoded = json.load(f)

def decode_caption(encoded_caption, idx2word):
    decoded = []
    for idx in encoded_caption:
        word = idx2word.get(idx, '<unk>')
        if word not in ['<pad>', '<start>', '<end>']:  
            decoded.append(word)
    return decoded

In [6]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from tqdm import tqdm

references = []
hypotheses = []
meteor_scores = []

for img_name, pred_caption in tqdm(blip_predictions.items()):
    encoded_gt_captions = gt_encoded.get(img_name, [])
    if not encoded_gt_captions:
        continue
    
    decoded_gt_captions = [decode_caption(encoded_cap, idx2word) for encoded_cap in encoded_gt_captions]
    
    tokenized_refs = [[word.lower() for word in ref_caption] for ref_caption in decoded_gt_captions]
    
    tokenized_pred = pred_caption.strip().lower().split()
    
    references.append(tokenized_refs)
    hypotheses.append(tokenized_pred)
    
    meteor_per_ref = []
    for ref in tokenized_refs:
        try:
            meteor_per_ref.append(meteor_score([ref], tokenized_pred))
        except Exception as e:
            continue
    if meteor_per_ref:
        meteor_scores.append(max(meteor_per_ref))

smoothie = SmoothingFunction().method4
bleu1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0), smoothing_function=smoothie)
bleu2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
bleu3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie)
bleu4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
avg_bleu = (bleu1 + bleu2 + bleu3 + bleu4) / 4
avg_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0.0

print(f"\nBLEU-1: {bleu1:.4f}")
print(f"BLEU-2: {bleu2:.4f}")
print(f"BLEU-3: {bleu3:.4f}")
print(f"BLEU-4: {bleu4:.4f}")
print(f"Avg BLEU (1-4): {avg_bleu:.4f}")
print(f"METEOR: {avg_meteor:.4f}")

print(f"\nNumber of evaluated pairs: {len(references)}")
print(f"\nSample comparison:")
for i in range(min(3, len(references))):
    img_name = list(blip_predictions.keys())[i]
    print(f"\nImage: {img_name}")
    print(f"Ground Truth: {references[i][0][:10]}...")  
    print(f"Prediction: {hypotheses[i][:10]}...")      

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 179.99it/s]



BLEU-1: 0.6235
BLEU-2: 0.4824
BLEU-3: 0.3553
BLEU-4: 0.2515
Avg BLEU (1-4): 0.4282
METEOR: 0.4137

Number of evaluated pairs: 1000

Sample comparison:

Image: 3385593926_d3e9c21170.jpg
Ground Truth: ['the', 'dogs', 'are', 'in', 'the', 'snow', 'in', 'front', 'of', 'a']...
Prediction: ['two', 'dogs', 'playing', 'in', 'the', 'snow']...

Image: 2677656448_6b7e7702af.jpg
Ground Truth: ['a', 'brown', 'and', 'white', 'dog', 'swimming', 'towards', 'some', 'in', 'the']...
Prediction: ['a', 'man', 'in', 'a', 'pool', 'with', 'a', 'dog']...

Image: 311146855_0b65fdb169.jpg
Ground Truth: ['a', 'man', 'and', 'a', 'woman', 'in', 'festive', 'costumes', 'dancing']...
Prediction: ['man', 'wearing', 'a', 'yellow', 'and', 'green', 'costume']...
