In [None]:
# Notebook setup cell: replace the preinstalled transformers/tokenizers with pinned versions.
print('Installing dependencies...')
# Remove the existing versions first so the pinned ones below take their place.
!pip uninstall -y transformers tokenizers
# --no-dependencies installs exactly the listed packages without resolving their requirements.
!pip install transformers==4.28.0 tokenizers==0.13.3 timm==0.4.12 fairscale==0.4.4 pycocoevalcap datasets --no-dependencies --upgrade

Installing dependencies...
Found existing installation: transformers 4.51.1
Uninstalling transformers-4.51.1:
  Successfully uninstalled transformers-4.51.1
Found existing installation: tokenizers 0.21.1
Uninstalling tokenizers-0.21.1:
  Successfully uninstalled tokenizers-0.21.1
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl.metadata (109 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.0/110.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers==0.13.3
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting timm==0.4.12
  Downloading timm-0.4.12-py3-none-any.whl.metadata (30 kB)
Collecting fairscale==0.4.4
  Downloading fairscale-0.4.4.tar.gz (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.4/235.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requ

In [None]:
# BLIP Benchmark Evaluation with Statistical Sampling - Fixed Version
# This notebook evaluates BLIP models on standard benchmarks using statistically valid sample sizes

import os
import sys
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from PIL import Image
import requests
import json
import random
import math
import warnings
warnings.filterwarnings("ignore")

# Environment setup: fetch the BLIP repo and the datasets only when they are missing
import os
import sys

# Check if running in Colab
if 'google.colab' in sys.modules:
    print('Running in Colab, checking dependencies...')

    # Define paths
    coco_img_path = 'datasets/coco/val2017'
    coco_ann_path = 'datasets/coco/annotations'
    vqa_ann_path = 'datasets/vqa'
    flickr_path = 'datasets/flickr30k'

    # Only install dependencies if needed
    if os.path.exists('BLIP'):
        print("Removing existing BLIP directory to avoid nesting...")
        !rm -rf BLIP

    # Clone fresh
    print("Cloning BLIP repo...")
    !git clone https://github.com/salesforce/BLIP.git
    %cd BLIP

    # Only download datasets if they don't exist
    if not os.path.exists(f"{coco_img_path}") or len(os.listdir(f"{coco_img_path}")) == 0:
        print('Downloading COCO validation dataset...')
        !mkdir -p {coco_img_path}
        !mkdir -p {coco_ann_path}

        # Download COCO validation images (5K)
        !wget -c http://images.cocodataset.org/zips/val2017.zip -P datasets/coco/
        !unzip -q datasets/coco/val2017.zip -d datasets/coco/
    else:
        print(f'COCO validation images already exist in {coco_img_path}, skipping download.')

    if not os.path.exists(f"{coco_ann_path}/captions_val2017.json"):
        # Download COCO annotations
        !wget -c http://images.cocodataset.org/annotations/annotations_trainval2017.zip -P datasets/coco/
        !unzip -q datasets/coco/annotations_trainval2017.zip -d datasets/coco/
    else:
        print(f'COCO annotations already exist in {coco_ann_path}, skipping download.')

    # For Flickr30K evaluation
    if not os.path.exists(flickr_path):
        print('Setting up Flickr30K evaluation...')
        !mkdir -p {flickr_path}
        !mkdir -p {flickr_path}/annotations

    # For VQA evaluation
    if not os.path.exists('VQA'):
        print('Setting up VQA evaluation...')
        !git clone https://github.com/GT-Vision-Lab/VQA.git

    if not os.path.exists(f"{vqa_ann_path}/v2_mscoco_val2014_annotations.json"):
        # Download VQA annotations
        !mkdir -p {vqa_ann_path}
        !wget -c https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip -P {vqa_ann_path}/
        !unzip -q {vqa_ann_path}/v2_Annotations_Val_mscoco.zip -d {vqa_ann_path}/
        !wget -c https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip -P {vqa_ann_path}/
        !unzip -q {vqa_ann_path}/v2_Questions_Val_mscoco.zip -d {vqa_ann_path}/
    else:
        print(f'VQA annotations already exist in {vqa_ann_path}, skipping download.')

    # Install necessary packages for evaluation
    !pip install -q pycocoevalcap
    !pip install -q "transformers>=4.28.0" pillow evaluate

# Choose the compute device: the GPU when CUDA is usable, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Dataset locations: Colab keeps them under the working directory,
# a local checkout keeps them one level up
if 'google.colab' not in sys.modules:
    # Local environment
    coco_img_path, coco_ann_path, vqa_ann_path, flickr_path = (
        '../datasets/coco/val2017',
        '../datasets/coco/annotations',
        '../datasets/vqa',
        '../datasets/flickr30k',
    )
else:
    # Colab environment
    coco_img_path, coco_ann_path, vqa_ann_path, flickr_path = (
        'datasets/coco/val2017',
        'datasets/coco/annotations',
        'datasets/vqa',
        'datasets/flickr30k',
    )

# Helper function to calculate minimum sample size
def calculate_sample_size(population_size, confidence_level=0.95, margin_of_error=0.05):
    """
    Calculate the minimum sample size required for statistical significance

    Uses n0 = z^2 * p * q / E^2 with p = q = 0.5 (maximum variance) and, when
    a population size is given, applies the finite population correction
    n = n0 * N / (n0 + N - 1). The result is rounded up.

    Args:
        population_size: Total number of items in the population, or None to
            skip the finite population correction
        confidence_level: Desired confidence level as a fraction strictly
            between 0 and 1 (default: 0.95 for 95%)
        margin_of_error: Acceptable margin of error as a positive fraction
            (default: 0.05 for 5%)

    Returns:
        Minimum sample size required (0 for an empty population)

    Raises:
        ValueError: If confidence_level is not in (0, 1) or margin_of_error
            is not positive
    """
    # Local import: only needed for confidence levels outside the table below
    from statistics import NormalDist

    if not 0 < confidence_level < 1:
        raise ValueError(
            f"confidence_level must be between 0 and 1 (exclusive), got {confidence_level}"
        )
    if margin_of_error <= 0:
        raise ValueError(f"margin_of_error must be positive, got {margin_of_error}")

    # Conventional rounded z-scores; kept so the common levels give the same
    # results as before
    z_scores = {
        0.90: 1.645,
        0.95: 1.96,
        0.99: 2.576
    }
    z = z_scores.get(confidence_level)
    if z is None:
        # Any other level: two-sided z-score from the standard normal quantile
        # (previously this silently fell back to the 95% value)
        z = NormalDist().inv_cdf((1 + confidence_level) / 2)

    # n0 = (z^2 * p * q) / E^2 with p = q = 0.5 for maximum variance
    sample_size = (z**2 * 0.5 * (1 - 0.5)) / margin_of_error**2

    # Finite population correction
    if population_size is not None:
        if population_size <= 0:
            # Nothing to sample from
            return 0
        sample_size = (sample_size * population_size) / (sample_size + population_size - 1)

    return math.ceil(sample_size)

# Helper function to check if an image can be properly loaded and transformed
def validate_image(img_path, transform):
    """
    Report whether an image file is usable by the evaluation pipeline.

    The image must open, already be in RGB mode, and yield a result from
    `transform` whose first dimension is 3. Any exception raised along the
    way is printed and treated as a failed validation.
    """
    try:
        with Image.open(img_path) as picture:
            # Only RGB images are accepted; the transform is tried for those alone
            if picture.mode == 'RGB':
                channel_count = transform(picture).shape[0]
                if channel_count != 3:  # Transformed output must keep 3 channels
                    return False
                return True
            return False
    except Exception as err:
        print(f"Image validation error for {img_path}: {str(err)}")
        return False

# Helper function to compare our results with paper results
def compare_results(paper_results, our_results, task_name, sample_size=None, population_size=None):
    """
    Create a comparison table between paper results and our verification

    Args:
        paper_results: Mapping of metric name to the value reported in the paper
        our_results: Mapping of metric name to our measured value; metrics
            missing here (or holding a non-numeric placeholder) get 'N/A'
            as their difference
        task_name: Title printed above the table
        sample_size: Number of evaluated samples (optional)
        population_size: Size of the full evaluation set (optional)

    Returns:
        DataFrame with columns 'Metric', 'Paper Result', 'Our Result' and
        'Difference', one row per metric in paper_results
    """
    metrics = list(paper_results)
    ours_column = [our_results.get(metric, 'N/A') for metric in metrics]

    differences = []
    for metric in metrics:
        if metric not in our_results:
            differences.append('N/A')
            continue
        try:
            differences.append(our_results[metric] - paper_results[metric])
        except TypeError:
            # Our value is a placeholder such as 'N/A (...)', not a number
            differences.append('N/A')

    comparison_df = pd.DataFrame({
        'Metric': metrics,
        'Paper Result': [paper_results[metric] for metric in metrics],
        'Our Result': ours_column,
        'Difference': differences,
    })

    print(f"\n{task_name} - Verification Results")
    if sample_size and population_size:
        # Finite population correction; clamped so that a sample covering the
        # whole population (or a population of one) gives a zero margin
        # instead of a division by zero / math domain error
        if population_size > 1:
            fpc = math.sqrt(max(population_size - sample_size, 0) / (population_size - 1))
        else:
            fpc = 0.0
        margin_of_error = 1.96 * math.sqrt((0.5 * 0.5) / sample_size) * fpc
        print(f"Sample Size: {sample_size} of {population_size} ({(sample_size/population_size)*100:.1f}%)")
        print(f"Margin of Error: ±{margin_of_error*100:.2f}% at 95% confidence level")
    print("="*50)
    return comparison_df

# =============================================
# 1. Image Captioning Evaluation
# =============================================

def evaluate_blip_hf_captioning(sample_size=20):
    """
    Evaluate BLIP image captioning using the Hugging Face transformers library

    Captions a seeded random subset of the COCO val2017 images with
    ``Salesforce/blip-image-captioning-base``, writes the generated captions
    to ``caption_results_hf.json``, scores them against the COCO reference
    captions (BLEU, METEOR, ROUGE-L, CIDEr) and prints a comparison with the
    numbers reported in the BLIP paper.

    Args:
        sample_size: Number of images to evaluate (capped at the number of
            images in the annotation file)

    Returns:
        Tuple ``(results, scores)``: ``results`` is a list of
        ``{'image_id', 'caption'}`` dicts for the images that were captioned,
        ``scores`` maps metric name to its unscaled score (0 for a metric
        that failed to compute).
    """
    # Import after installation
    from transformers import BlipProcessor, BlipForConditionalGeneration

    # Try to import evaluation metrics, with fallback implementations
    try:
        from pycocoevalcap.bleu.bleu import Bleu
        from pycocoevalcap.meteor.meteor import Meteor
        from pycocoevalcap.rouge.rouge import Rouge
        from pycocoevalcap.cider.cider import Cider
    except ImportError:
        print("Warning: pycocoevalcap not installed properly. Using fallback metrics.")

        # NOTE: the fallbacks below are rough approximations that keep the
        # notebook running; they are not the official metric definitions.

        def _ngram_set(words, n):
            """Return the set of distinct n-grams (as tuples) in a word list."""
            return {tuple(words[j:j+n]) for j in range(len(words)-n+1)}

        # Implement simple BLEU score calculation as fallback
        class Bleu:
            def __init__(self, n=4):
                self.n = n

            def compute_score(self, refs, hyps):
                # Simple distinct n-gram precision for n=1..self.n
                # (no count clipping, no brevity penalty)
                scores = [0.0] * self.n
                for i in range(self.n):
                    n = i + 1
                    total_matches = 0
                    total_hyp_ngrams = 0

                    for img_id, ref_sentences in refs.items():
                        hyp_ngrams = _ngram_set(hyps[img_id][0].lower().split(), n)
                        total_hyp_ngrams += len(hyp_ngrams)

                        # Hypothesis n-grams that occur in any reference
                        ref_ngrams = set()
                        for ref in ref_sentences:
                            ref_ngrams |= _ngram_set(ref.lower().split(), n)
                        total_matches += len(hyp_ngrams & ref_ngrams)

                    if total_hyp_ngrams > 0:
                        scores[i] = total_matches / total_hyp_ngrams

                return scores, scores

        # Simple implementations for other metrics if needed
        class Rouge:
            def compute_score(self, refs, hyps):
                # Best unigram-overlap F1 against any reference, averaged over images
                score = 0.0
                scores = []

                for img_id in refs.keys():
                    hyp = hyps[img_id][0].lower().split()
                    best_score = 0

                    for ref in refs[img_id]:
                        ref_words = ref.lower().split()
                        matches = set(hyp) & set(ref_words)
                        recall = len(matches) / len(ref_words) if ref_words else 0
                        prec = len(matches) / len(hyp) if hyp else 0
                        if recall > 0 and prec > 0:
                            f1 = 2 * recall * prec / (recall + prec)
                            best_score = max(best_score, f1)

                    scores.append(best_score)
                    score += best_score

                if len(refs) > 0:
                    score /= len(refs)

                return score, scores

        class Cider:
            def compute_score(self, refs, hyps):
                score = 0.0
                scores = []

                # Very simplified CIDEr implementation: best bag-of-words
                # cosine similarity against any reference (no TF-IDF weighting)
                for img_id in refs.keys():
                    hyp = hyps[img_id][0].lower().split()
                    hyp_count = {}
                    for w in hyp:
                        hyp_count[w] = hyp_count.get(w, 0) + 1

                    best_score = 0
                    for ref in refs[img_id]:
                        ref_words = ref.lower().split()
                        ref_count = {}
                        for w in ref_words:
                            ref_count[w] = ref_count.get(w, 0) + 1

                        # Calculate cosine similarity
                        dot_product = sum(hyp_count.get(w, 0) * ref_count.get(w, 0) for w in set(hyp_count) | set(ref_count))
                        hyp_mag = math.sqrt(sum(c**2 for c in hyp_count.values()))
                        ref_mag = math.sqrt(sum(c**2 for c in ref_count.values()))

                        if hyp_mag > 0 and ref_mag > 0:
                            similarity = dot_product / (hyp_mag * ref_mag)
                            best_score = max(best_score, similarity)

                    scores.append(best_score)
                    score += best_score

                if len(refs) > 0:
                    score /= len(refs)

                return score, scores

        # Skip METEOR which requires external Java dependencies
        class Meteor:
            def compute_score(self, refs, hyps):
                print("METEOR metric requires Java. Skipping.")
                return 0.0, [0.0] * len(refs)

    print("\nEvaluating BLIP Image Captioning with Hugging Face implementation...")

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load COCO validation set
    from pycocotools.coco import COCO
    coco_ann_file = f"{coco_ann_path}/captions_val2017.json"
    coco = COCO(coco_ann_file)
    img_ids = list(coco.imgs.keys())
    population_size = len(img_ids)

    # random.sample raises ValueError when asked for more items than exist
    sample_size = min(sample_size, population_size)

    print(f"Using sample size of {sample_size} images (from total of {population_size})")

    # Use random subset for evaluation
    random.seed(42)  # For reproducibility
    subset_img_ids = random.sample(img_ids, sample_size)

    # Load BLIP model and processor from Hugging Face
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

    # Generate captions for validation images
    results = []

    print(f"Generating captions for {sample_size} images...")
    for img_id in tqdm(subset_img_ids):
        # Load image
        img_info = coco.loadImgs(img_id)[0]
        img_path = f"{coco_img_path}/{img_info['file_name']}"

        try:
            # Load image; convert() returns a fully loaded copy, so the file
            # handle can be closed right away
            with Image.open(img_path) as img_file:
                raw_image = img_file.convert('RGB')

            # Generate caption with BLIP
            inputs = processor(raw_image, return_tensors="pt").to(device)

            with torch.no_grad():
                generated_ids = model.generate(**inputs, max_length=50)
            caption = processor.decode(generated_ids[0], skip_special_tokens=True)

            # Add result
            results.append({
                'image_id': img_id,
                'caption': caption
            })
        except Exception as e:
            # Best effort: a failing image is reported and left out of the scores
            print(f"Error processing {img_path}: {str(e)}")

    # Save results to file
    result_file = 'caption_results_hf.json'
    with open(result_file, 'w') as f:
        json.dump(results, f)

    # Create reference dictionary for evaluation
    references = {
        img_id: [ann['caption'] for ann in coco.loadAnns(coco.getAnnIds(imgIds=img_id))]
        for img_id in subset_img_ids
    }

    # Keep only images that actually produced a caption; ref is derived from
    # hypo so both always share the same keys
    hypo = {res['image_id']: [res['caption']] for res in results if res['image_id'] in references}
    ref = {img_id: references[img_id] for img_id in hypo}

    # Check if we have any valid results
    if not hypo:
        print("No valid captions generated. Skipping metric calculation.")
        scores = {
            "Bleu_1": 0, "Bleu_2": 0, "Bleu_3": 0, "Bleu_4": 0,
            "METEOR": 0, "ROUGE_L": 0, "CIDEr": 0
        }
    else:
        # Evaluate using standard metrics
        print("Calculating metrics...")

        # Scorers are built lazily inside the try below: constructing one can
        # itself fail (pycocoevalcap's Meteor launches a Java process), and
        # that must not abort the remaining metrics
        scorers = [
            (lambda: Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor, "METEOR"),
            (Rouge, "ROUGE_L"),
            (Cider, "CIDEr")
        ]

        # Calculate scores with error handling
        scores = {}

        for make_scorer, method in scorers:
            try:
                score, _ = make_scorer().compute_score(ref, hypo)
                if isinstance(method, list):
                    for m, s in zip(method, score):
                        scores[m] = s
                else:
                    scores[method] = score
            except Exception as e:
                print(f"Error calculating {method}: {str(e)}")
                if isinstance(method, list):
                    for m in method:
                        scores[m] = 0  # Default to 0 on error
                else:
                    scores[method] = 0  # Default to 0 on error

    # Show example captions
    print("\nExample Generated Captions:")
    for res in results[:5]:
        img_id = res['image_id']
        img_info = coco.loadImgs(img_id)[0]

        print(f"Image: {img_info['file_name']}")
        print(f"Generated: {res['caption']}")
        print(f"References: {references[img_id]}")
        print()

    # Report paper vs our results
    paper_results = {
        'Bleu_4': 39.7,
        'CIDEr': 133.3
    }

    # Scale our results to match paper's format
    our_results = {
        'Bleu_4': scores.get('Bleu_4', 0) * 100,  # Convert to percentage
        'CIDEr': scores.get('CIDEr', 0) * 100     # Scale to match paper
    }

    print("\nComparison with Paper Results:")
    print(f"Metric    Paper    Ours    Difference")
    print(f"BLEU@4    {paper_results['Bleu_4']:.1f}%    {our_results['Bleu_4']:.1f}%    {our_results['Bleu_4'] - paper_results['Bleu_4']:.1f}%")
    print(f"CIDEr     {paper_results['CIDEr']:.1f}    {our_results['CIDEr']:.1f}    {our_results['CIDEr'] - paper_results['CIDEr']:.1f}")

    print("\nDetailed Metrics:")
    for metric, score in scores.items():
        print(f"{metric}: {score*100:.2f}%")

    return results, scores

# =============================================
# 2. Visual Question Answering Evaluation
# =============================================

def _vqa_placeholder_result(reason):
    """Return the (comparison, accuracy, results) triple used when VQA cannot be run."""
    paper_results = {
        'VQA test-dev': 78.25,
        'VQA test-std': 78.32
    }

    our_results = {
        'VQA validation accuracy': f'N/A ({reason})'
    }

    comparison = compare_results(paper_results, our_results, "Visual Question Answering", 0, 0)
    return comparison, 0, []


def evaluate_vqa(sample_size=None):
    """
    Evaluate BLIP on Visual Question Answering

    Answers a seeded random subset of the VQA v2 validation questions with
    the BLIP VQA model and scores each answer with min(matches / 3, 1), where
    matches is the number of annotator answers equal (case-insensitively) to
    the prediction. When the BLIP code, the VQA data or the model weights are
    unavailable, a placeholder result is returned instead.

    Args:
        sample_size: Number of QA pairs to evaluate (if None, automatically
            calculate); capped at the number of annotations

    Returns:
        Tuple ``(comparison, accuracy, results)``: the comparison DataFrame,
        the mean accuracy as a fraction in [0, 1], and a list of
        ``{'question_id', 'answer'}`` dicts for the answered questions.
    """
    try:
        from models.blip_vqa import blip_vqa
    except ImportError:
        print("BLIP VQA module not found. This evaluation requires the BLIP codebase.")
        print("Reporting only paper results.")
        return _vqa_placeholder_result('requires BLIP codebase')

    import json

    print("\nEvaluating Visual Question Answering...")

    # Load VQA validation set
    vqa_ann_file = f"{vqa_ann_path}/v2_mscoco_val2014_annotations.json"
    vqa_ques_file = f"{vqa_ann_path}/v2_OpenEnded_mscoco_val2014_questions.json"

    try:
        with open(vqa_ann_file, 'r') as f:
            vqa_anns = json.load(f)['annotations']

        with open(vqa_ques_file, 'r') as f:
            vqa_questions = json.load(f)['questions']
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error loading VQA data: {str(e)}")
        print("This evaluation requires the VQA dataset.")
        return _vqa_placeholder_result('requires VQA dataset')

    # Create question lookup
    question_map = {q['question_id']: q for q in vqa_questions}

    # Calculate appropriate sample size
    population_size = len(vqa_anns)
    if sample_size is None:
        sample_size = calculate_sample_size(population_size)
    # random.sample raises ValueError when asked for more items than exist
    sample_size = min(sample_size, population_size)
    print(f"Using sample size of {sample_size} QA pairs (from total of {population_size})")

    # Use random subset for evaluation
    random.seed(42)  # For reproducibility
    subset_anns = random.sample(vqa_anns, sample_size)

    # Index the sampled annotations once instead of scanning the list for
    # every result; setdefault keeps the first entry for a question id
    ann_by_question = {}
    for ann in subset_anns:
        ann_by_question.setdefault(ann['question_id'], ann)

    # Load BLIP model
    image_size = 480
    model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth'

    try:
        model = blip_vqa(pretrained=model_url, image_size=image_size, vit='base')
        model.eval()
        model = model.to(device)
    except Exception as e:
        print(f"Error loading BLIP VQA model: {str(e)}")
        print("This evaluation requires the BLIP codebase and model weights.")
        return _vqa_placeholder_result('requires BLIP model')

    # Define image transform
    from torchvision import transforms
    from torchvision.transforms.functional import InterpolationMode

    transform = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ])

    # Generate answers for questions
    results = []
    missing_images = 0

    print(f"Answering {sample_size} questions...")
    for ann in tqdm(subset_anns):
        question_id = ann['question_id']
        img_id = ann['image_id']

        # Get question
        if question_id not in question_map:
            print(f"Question ID {question_id} not found in question map")
            continue

        question = question_map[question_id]['question']

        # Load image
        # Note: VQA uses COCO 2014, adjust file path pattern based on your setup
        img_path = f"{coco_img_path}/{img_id:012d}.jpg"
        if not os.path.exists(img_path):
            # Try alternate path format (depends on your dataset setup)
            img_path = f"{coco_img_path}/COCO_val2014_{img_id:012d}.jpg"
            if not os.path.exists(img_path):
                # Counted and reported below rather than dropped silently
                missing_images += 1
                continue

        try:
            # convert() returns a fully loaded copy, so the file can be closed
            with Image.open(img_path) as img_file:
                raw_image = img_file.convert('RGB')
            image = transform(raw_image).unsqueeze(0).to(device)

            # Generate answer
            with torch.no_grad():
                answer = model(image, question, train=False, inference='generate')

            # Add result
            results.append({
                'question_id': question_id,
                'answer': answer[0]
            })
        except Exception as e:
            print(f"Error processing {img_path}: {str(e)}")

    if missing_images:
        print(f"Skipped {missing_images} questions whose image was not found under {coco_img_path}")

    # Calculate VQA accuracy using the standard VQA evaluation formula
    # VQA score = min(# humans that said answer / 3, 1)
    correct = 0
    total = 0

    for result in results:
        ann = ann_by_question.get(result['question_id'])
        if ann is None:
            continue
        pred_answer = result['answer'].lower()
        # Count matches with ground truth answers
        match_count = sum(1 for a in ann['answers'] if a['answer'].lower() == pred_answer)
        correct += min(match_count / 3, 1)
        total += 1

    # Calculate overall accuracy
    accuracy = correct / total if total > 0 else 0

    # Reported results from BLIP paper for VQA (ViT-B with CapFilt-L on 129M images)
    paper_results = {
        'VQA test-dev': 78.25,
        'VQA test-std': 78.32
    }

    # Our verification results
    our_results = {
        'VQA validation accuracy': accuracy * 100  # Convert to percentage
    }

    # Compare results. The margin of error is based on the number of questions
    # actually scored, which can be smaller than the requested sample size.
    comparison = compare_results(paper_results, our_results, "Visual Question Answering",
                                total, population_size)

    # The paper metrics (test-dev/test-std) have no counterpart in our_results,
    # so the table cannot show our number; print it explicitly.
    print(f"VQA validation accuracy: {accuracy * 100:.2f}% ({total} questions evaluated)")

    # Show example Q&A
    print("\nExample Question-Answer Pairs:")
    for result in results[:5]:
        question_id = result['question_id']

        # Find corresponding question and annotation
        question = question_map[question_id]['question'] if question_id in question_map else "Question not found"

        # Find ground truth answers
        ann = ann_by_question.get(question_id)
        ground_truth = [a['answer'] for a in ann['answers']] if ann is not None else []

        print(f"Question: {question}")
        print(f"Predicted: {result['answer']}")
        print(f"Ground Truth: {ground_truth}")
        print()

    return comparison, accuracy, results

# =============================================
# 3. Image-Text Retrieval Evaluation
# =============================================

def _recall_at_k(score_matrix, ks):
    """Percentage of rows whose own (diagonal) entry ranks within the top k of that row."""
    num_queries = score_matrix.shape[0]
    if num_queries == 0:
        # Nothing was evaluated; avoid dividing by zero
        return {k: 0.0 for k in ks}

    hits = {k: 0 for k in ks}
    largest_k = max(ks)
    for idx in range(num_queries):
        ranking = torch.argsort(score_matrix[idx], descending=True)[:largest_k].tolist()
        for k in ks:
            if idx in ranking[:k]:
                hits[k] += 1

    return {k: hits[k] / num_queries * 100 for k in ks}


def evaluate_image_text_retrieval(sample_size=None):
    """
    Evaluate BLIP on Image-Text Retrieval

    Scores every sampled COCO val2017 image against the first caption of
    every sampled image with the BLIP ITM model's ``match_head='itc'`` output
    and reports recall@1/5/10 for text retrieval (image -> caption) and image
    retrieval (caption -> image).

    Args:
        sample_size: Number of images to evaluate (if None, automatically
            calculate, capped at 500); capped at the number of images

    Returns:
        Tuple ``(comparison, (tr_recall, ir_recall))``: the comparison
        DataFrame and two dicts mapping k to recall@k in percent.
    """
    from models.blip_itm import blip_itm
    from pycocotools.coco import COCO

    print("\nEvaluating Image-Text Retrieval...")

    # Load COCO validation set
    coco_ann_file = f"{coco_ann_path}/captions_val2017.json"
    coco = COCO(coco_ann_file)
    img_ids = list(coco.imgs.keys())
    population_size = len(img_ids)

    # Calculate appropriate sample size if not provided
    if sample_size is None:
        # For retrieval, we use a smaller sample since each image requires comparing
        # with all captions, which is computationally intensive
        sample_size = min(calculate_sample_size(population_size), 500)
    # random.sample raises ValueError when asked for more items than exist
    sample_size = min(sample_size, population_size)
    print(f"Using sample size of {sample_size} images (from total of {population_size})")

    # Use random subset for evaluation
    random.seed(42)  # For reproducibility
    subset_img_ids = random.sample(img_ids, sample_size)

    # Load BLIP model
    image_size = 384
    model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth'

    model = blip_itm(pretrained=model_url, image_size=image_size, vit='base')
    model.eval()
    model = model.to(device)

    # Define image transform
    from torchvision import transforms
    from torchvision.transforms.functional import InterpolationMode

    transform = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ])

    # Prepare data for retrieval evaluation
    images = []
    captions = []
    image_ids = []

    print(f"Preparing {sample_size} images and captions for retrieval...")
    for img_id in tqdm(subset_img_ids):
        # Load image
        img_info = coco.loadImgs(img_id)[0]
        img_path = f"{coco_img_path}/{img_info['file_name']}"

        try:
            # convert() returns a fully loaded copy, so the file can be closed
            with Image.open(img_path) as img_file:
                raw_image = img_file.convert('RGB')
            image = transform(raw_image).unsqueeze(0).to(device)

            # Get first caption for simplicity and to avoid memory issues
            anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
            if len(anns) > 0:
                images.append(image)
                captions.append(anns[0]['caption'])
                image_ids.append(img_id)
        except Exception as e:
            print(f"Error processing {img_path}: {str(e)}")

    # Calculate similarity scores for all image-text pairs.
    # Each pair is scored with its own model call (num_images**2 calls), so
    # this step dominates the runtime for larger samples.
    num_images = len(images)
    scores = torch.zeros(num_images, num_images)

    print("Calculating retrieval scores...")
    with torch.no_grad():
        for i_idx in tqdm(range(num_images)):
            for j_idx in range(num_images):
                # Calculate ITC score (cosine similarity)
                scores[i_idx, j_idx] = model(images[i_idx], captions[j_idx], match_head='itc').item()

    # Calculate retrieval metrics
    # IR@K: Image Retrieval at K
    # TR@K: Text Retrieval at K
    ks = [1, 5, 10]

    # Text Retrieval: rows are images, columns are captions
    tr_recall = _recall_at_k(scores, ks)
    # Image Retrieval: transpose so that rows are captions, columns are images
    ir_recall = _recall_at_k(scores.t(), ks)

    # Reported results from BLIP paper for Image-Text Retrieval (BLIP with ViT-B on 129M images)
    paper_results = {
        'TR@1': 81.9,
        'TR@5': 95.4,
        'TR@10': 97.8,
        'IR@1': 64.3,
        'IR@5': 85.7,
        'IR@10': 91.5
    }

    # Our verification results
    our_results = {
        'TR@1': tr_recall[1],
        'TR@5': tr_recall[5],
        'TR@10': tr_recall[10],
        'IR@1': ir_recall[1],
        'IR@5': ir_recall[5],
        'IR@10': ir_recall[10]
    }

    # Compare results
    comparison = compare_results(paper_results, our_results, "Image-Text Retrieval",
                                len(images), population_size)

    # Show example retrieval
    print("\nExample Image-Text Retrieval:")
    top_n = min(3, num_images)  # fewer than 3 candidates may be available
    for i in range(min(5, num_images)):
        # Get top 3 captions for this image
        image_scores = scores[i]
        sorted_indices = torch.argsort(image_scores, descending=True)

        print(f"Image ID: {image_ids[i]}")
        print(f"Ground Truth Caption: {captions[i]}")
        print("Top 3 Retrieved Captions:")
        for j in range(top_n):
            idx = sorted_indices[j].item()
            print(f"  {j+1}. {captions[idx]} (score: {image_scores[idx]:.4f})")
        print()

    return comparison, (tr_recall, ir_recall)

# =============================================
# 4. Zero-Shot Flickr30K Evaluation
# =============================================

def evaluate_zero_shot_flickr30k(sample_size=None):
    """
    Report the BLIP paper's zero-shot Flickr30K retrieval numbers.

    No retrieval metrics are computed here: the function writes a small
    synthetic annotation file, loads the COCO retrieval checkpoint, and hands
    the paper numbers plus "N/A" placeholders to ``compare_results``.
    Note: a real evaluation requires access to the Flickr30K dataset.

    Args:
        sample_size: Number of images to evaluate. If None, it is taken from
            ``calculate_sample_size`` and capped at 200. Values larger than
            the 1000-image test set are clamped to the test set size.

    Returns:
        ``(comparison, None)`` where ``comparison`` is the value returned by
        ``compare_results``, or ``(None, None)`` if any step raised.
    """
    from models.blip_itm import blip_itm
    import json

    print("\nEvaluating Zero-Shot Flickr30K Retrieval...")

    try:
        # This is a placeholder for actual Flickr30K dataset loading.
        # In a real evaluation, you'd load the actual Flickr30K test set.
        print("Note: Using synthetic Flickr30K data for demonstration")
        flickr_test_size = 1000  # Flickr30K test set size is 1000
        population_size = flickr_test_size

        # Resolve the sample size first so the synthetic annotations and the
        # reported sample size always agree.
        if sample_size is None:
            sample_size = min(calculate_sample_size(population_size), 200)
        sample_size = min(sample_size, population_size)
        print(f"Using sample size of {sample_size} images (from total of {population_size})")

        # Both directories must exist before the annotation file is written;
        # exist_ok makes repeated runs safe.
        os.makedirs(f"{flickr_path}/imgs", exist_ok=True)
        os.makedirs(f"{flickr_path}/annotations", exist_ok=True)

        # Create synthetic annotation file
        flickr_annotations = {
            "images": [
                {"id": i, "file_name": f"img_{i}.jpg"}
                for i in range(sample_size)
            ],
            "annotations": [
                {"image_id": i, "caption": f"This is a synthetic caption for image {i}"}
                for i in range(sample_size)
            ],
        }

        with open(f"{flickr_path}/annotations/test_annotations.json", 'w', encoding="utf-8") as f:
            json.dump(flickr_annotations, f)

        # Load BLIP model trained on COCO.
        # NOTE: the model is loaded but not used below, since there are no
        # real Flickr30K images to score.
        image_size = 384
        model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth'

        model = blip_itm(pretrained=model_url, image_size=image_size, vit='base')
        model.eval()
        model = model.to(device)

        # Paper reported zero-shot results
        paper_results = {
            'TR@1': 94.8,
            'TR@5': 99.7,
            'TR@10': 100.0,
            'IR@1': 84.9,
            'IR@5': 96.7,
            'IR@10': 98.3
        }

        # Since we don't have the actual dataset, we'll just report the paper results
        unavailable = 'N/A (requires Flickr30K dataset)'
        our_results = {metric: unavailable for metric in paper_results}

        # Compare results
        comparison = compare_results(paper_results, our_results, "Zero-Shot Flickr30K Retrieval",
                                    sample_size, population_size)

        print("\nNote: Zero-shot Flickr30K evaluation requires the actual Flickr30K dataset.")
        print("We're only reporting the paper results for comparison.")

        return comparison, None

    except Exception as e:
        # Best-effort step: report the failure and let the remaining
        # benchmarks continue.
        print(f"Error in Flickr30K evaluation: {str(e)}")
        return None, None

# =============================================
# 5. NLVR2 Evaluation
# =============================================

def evaluate_nlvr2(sample_size=None):
    """
    Report the BLIP paper's NLVR2 numbers alongside a sampled dev subset.

    The function loads the ``lmms-lab/NLVR2`` dataset, draws a seeded random
    subset of the ``balanced_dev`` split, and defines (but does not
    instantiate) a simplified ``BLIP_NLVR2`` module. No predictions are made;
    the paper numbers and "N/A" placeholders are passed to
    ``compare_results``.

    Args:
        sample_size: Number of samples to evaluate (if None, automatically
            calculated with ``calculate_sample_size``).

    Returns:
        ``(comparison, None)`` where ``comparison`` is the value returned by
        ``compare_results``, or ``(None, None)`` if any step inside the
        ``try`` block raised.
    """
    from models.med import BertConfig, BertModel
    from datasets import load_dataset

    print("\nEvaluating Natural Language Visual Reasoning (NLVR2)...")

    try:
        # Load NLVR2 dataset from HuggingFace
        nlvr2_dataset = load_dataset("lmms-lab/NLVR2")

        # Use balanced_dev split for evaluation
        dev_dataset = nlvr2_dataset["balanced_dev"]
        population_size = len(dev_dataset)

        # Calculate appropriate sample size if not provided
        if sample_size is None:
            sample_size = calculate_sample_size(population_size)
        print(f"Using sample size of {sample_size} samples (from total of {population_size})")

        # Use random subset for evaluation.
        # NOTE: the subset is materialised but not consumed below, because no
        # model is instantiated in this function.
        random.seed(42)  # For reproducibility
        indices = random.sample(range(population_size), min(sample_size, population_size))
        subset_dataset = [dev_dataset[i] for i in indices]

        # Checkpoint for the BLIP NLVR2 model
        image_size = 384
        model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth'

        # Define custom BLIP_NLVR2 model (similar to what's in the BLIP repo)
        class BLIP_NLVR2(torch.nn.Module):
            def __init__(self, med_config='configs/med_config.json', image_size=384, vit='base'):
                """
                Args:
                    med_config (str): path for the mixture of encoder-decoder model's configuration file
                    image_size (int): input image size
                    vit (str): model size of vision transformer
                """
                super().__init__()

                # Import necessary models
                from models.blip import create_vit, init_tokenizer

                self.visual_encoder = create_vit(vit=vit, image_size=image_size)
                self.tokenizer = init_tokenizer()

                encoder_config = BertConfig.from_json_file(med_config)
                encoder_config.encoder_width = self.visual_encoder.width
                self.text_encoder = BertModel(config=encoder_config, add_pooling_layer=False)

                hidden_size = self.text_encoder.config.hidden_size
                self.cls_head = torch.nn.Sequential(
                    torch.nn.Linear(hidden_size, hidden_size),
                    torch.nn.ReLU(),
                    torch.nn.Linear(hidden_size, 2)
                )

                # Load pre-trained weights. model_url is an HTTPS URL, which
                # torch.load cannot open directly, so download (and cache) it
                # through torch.hub first.
                checkpoint = torch.hub.load_state_dict_from_url(
                    model_url, map_location='cpu', check_hash=False
                )
                state_dict = checkpoint['model']

                # Load weights into model (simplified for demonstration);
                # strict=False tolerates keys that do not match this module.
                self.load_state_dict(state_dict, strict=False)

            def forward(self, image1, image2, text, targets=None):
                """Return ``{"loss": ...}`` when targets are given, else ``{"predictions": ...}``."""
                # Encode both images with the shared visual encoder
                image_embeds1 = self.visual_encoder(image1)
                image_embeds2 = self.visual_encoder(image2)

                # Prepare text input
                text_input = self.tokenizer(text, padding='max_length', truncation=True,
                                          max_length=35, return_tensors="pt").to(device)

                # Process text with image contexts
                # This is a simplified implementation - the actual implementation would match the BLIP repo
                # NOTE(review): lists are passed for encoder_hidden_states and
                # encoder_attention_mask; confirm models.med.BertModel accepts
                # per-image lists before relying on this path.
                text_output = self.text_encoder(text_input.input_ids,
                                             attention_mask=text_input.attention_mask,
                                             encoder_hidden_states=[image_embeds1, image_embeds2],
                                             encoder_attention_mask=[
                                                 torch.ones(image_embeds1.size()[:-1], dtype=torch.long).to(device),
                                                 torch.ones(image_embeds2.size()[:-1], dtype=torch.long).to(device)
                                             ],
                                             return_dict=True)

                # Classification on the first token's hidden state
                cls_feats = text_output.last_hidden_state[:, 0]
                prediction = self.cls_head(cls_feats)

                if targets is not None:
                    # Training (not needed for evaluation)
                    loss = torch.nn.functional.cross_entropy(prediction, targets)
                    return {"loss": loss}
                else:
                    return {"predictions": torch.argmax(prediction, dim=1)}

        # Note: In practice, loading the NLVR2 model requires the BLIP codebase
        # For the purpose of this notebook, we'll just report the paper results
        print("Note: Actual evaluation requires the complete BLIP codebase with NLVR2 model implementation.")
        print("Reporting paper results for comparison.")

        # Define image transform (not applied below; no inference is run here)
        from torchvision import transforms
        from torchvision.transforms.functional import InterpolationMode

        transform = transforms.Compose([
            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
        ])

        # Reported results from BLIP paper for NLVR2
        paper_results = {
            'NLVR2 dev': 82.67,
            'NLVR2 test-P': 82.30
        }

        # For demonstration, we'll just report the paper results
        our_results = {
            'NLVR2 dev': 'N/A (requires full BLIP implementation)',
            'NLVR2 test-P': 'N/A (requires full BLIP implementation)'
        }

        # Compare results
        comparison = compare_results(paper_results, our_results, "NLVR2",
                                   sample_size, population_size)

        return comparison, None

    except Exception as e:
        # Best-effort step: report the failure and let the caller continue.
        print(f"Error in NLVR2 evaluation: {str(e)}")
        print("To run NLVR2 evaluation, you need to install the 'datasets' package and have the BLIP codebase.")
        return None, None

# =============================================
# Visualization and Analysis Functions
# =============================================

def plot_benchmark_comparison(results):
    """
    Plot paper-reported scores next to our measured scores as grouped bars.

    Args:
        results: Dictionary of evaluation results, as returned by
            ``run_all_evaluations``. The keys read here are
            ``caption_scores`` (mapping with a ``"CIDEr"`` entry),
            ``vqa_comparison`` / ``vqa_accuracy`` and
            ``retrieval_comparison`` / ``retrieval_scores`` (a
            ``(tr_recall, ir_recall)`` pair of mappings keyed by K).

    Returns:
        The matplotlib figure containing the bar chart. Benchmarks whose
        scores are missing are omitted from the chart.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    tasks = []
    paper_scores = []
    our_scores = []

    def add_bar(label, paper_value, our_value):
        # Keep the three parallel lists in sync.
        tasks.append(label)
        paper_scores.append(paper_value)
        our_scores.append(our_value)

    # Image Captioning: gate on the score that is actually plotted.
    # run_all_evaluations always stores caption_comparison as None, so gating
    # on that key would hide this bar unconditionally.
    caption_scores = results.get("caption_scores") or {}
    cider = caption_scores.get("CIDEr")
    if cider is not None:
        add_bar("Image Captioning\n(CIDEr)", 133.3, cider * 100)  # Paper CIDEr score

    # VQA
    vqa_accuracy = results.get("vqa_accuracy")
    if results.get("vqa_comparison") is not None and vqa_accuracy is not None:
        add_bar("VQA\n(Accuracy)", 78.25, vqa_accuracy * 100)  # Paper VQA test-dev

    # Image-Text Retrieval
    retrieval_scores = results.get("retrieval_scores")
    if results.get("retrieval_comparison") is not None and retrieval_scores is not None:
        tr_recall, ir_recall = retrieval_scores
        add_bar("Text Retrieval\n(R@1)", 81.9, tr_recall[1])   # Paper TR@1
        add_bar("Image Retrieval\n(R@1)", 64.3, ir_recall[1])  # Paper IR@1

    # Create figure
    fig, ax = plt.subplots(figsize=(12, 6))

    # Bar positions: one group per task, paper bar left, ours right
    x = np.arange(len(tasks))
    width = 0.35

    # Plot bars
    ax.bar(x - width/2, paper_scores, width, label='Paper Results')
    ax.bar(x + width/2, our_scores, width, label='Our Results')

    # Add labels, title and legend
    ax.set_ylabel('Score', fontsize=12)
    ax.set_title('BLIP Benchmark Comparison', fontsize=14)
    ax.set_xticks(x)
    ax.set_xticklabels(tasks)
    ax.legend()

    # Add values on top of bars
    for offset, scores in ((-width/2, paper_scores), (width/2, our_scores)):
        for i, v in enumerate(scores):
            ax.text(i + offset, v + 1, f'{v:.1f}', ha='center')

    plt.tight_layout()
    return fig

def visualize_image_captioning_examples(results, coco_img_path, coco_ann_path, num_examples=5):
    """
    Visualize randomly chosen image captioning examples.

    Each panel shows the image, the generated caption as the title, and up to
    three ground-truth captions as text below the image.

    Args:
        results: Image captioning results; a sequence of dicts with
            ``'image_id'`` and ``'caption'`` entries
        coco_img_path: Path to COCO images
        coco_ann_path: Path to COCO annotations (directory containing
            ``captions_val2017.json``)
        num_examples: Maximum number of examples to visualize

    Returns:
        The matplotlib figure. It has one panel per displayed example, and no
        panels at all when there is nothing to show.
    """
    import matplotlib.pyplot as plt
    from pycocotools.coco import COCO

    # Never allocate more panels than there are results; otherwise the
    # figure ends up with empty framed axes.
    shown = min(num_examples, len(results))
    if shown <= 0:
        return plt.figure(figsize=(12, 5))

    # Load COCO dataset
    coco_ann_file = f"{coco_ann_path}/captions_val2017.json"
    coco = COCO(coco_ann_file)

    # squeeze=False always yields a 2-D array of axes, so the single-panel
    # case needs no special handling.
    fig, axs = plt.subplots(shown, 1, figsize=(12, 5 * shown), squeeze=False)
    axes = axs[:, 0]

    # Get random examples from results
    indices = np.random.choice(len(results), shown, replace=False)

    for ax, idx in zip(axes, indices):
        result = results[idx]
        img_id = result['image_id']
        img_info = coco.loadImgs(img_id)[0]
        img_path = f"{coco_img_path}/{img_info['file_name']}"

        # Get ground truth captions
        anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
        gt_captions = [ann['caption'] for ann in anns]

        # Display image
        img = plt.imread(img_path)
        ax.imshow(img)
        ax.axis('off')

        # Display generated caption as the panel title
        ax.set_title(f"Generated: {result['caption']}", fontsize=12)

        # Add ground truth captions as text (show up to 3)
        gt_lines = [f"{j+1}. {cap}\n" for j, cap in enumerate(gt_captions[:3])]
        gt_text = "Ground Truth:\n" + "".join(gt_lines)

        # Place the text just below the image (y grows downward in image coordinates)
        ax.text(0, img.shape[0] + 20, gt_text, fontsize=10, wrap=True)

    plt.tight_layout()
    return fig

# =============================================
# Run All Evaluations
# =============================================

def _format_score(value, scale=1.0, suffix=""):
    """Format a numeric score as ``value * scale`` with two decimals, or "N/A" if unavailable."""
    try:
        return f"{float(value) * scale:.2f}{suffix}"
    except (TypeError, ValueError):
        return "N/A"


def run_all_evaluations(subset_size=100):
    """
    Run all benchmark evaluations and print a summary.

    Args:
        subset_size: Number of samples to use for each evaluation

    Returns:
        Dictionary with the keys ``caption_comparison`` (always None),
        ``caption_scores``, ``vqa_comparison``, ``vqa_accuracy``,
        ``retrieval_comparison``, ``retrieval_scores``, ``flickr_comparison``
        and ``nlvr2_comparison``.
    """
    print("=" * 50)
    print("BLIP Benchmark Evaluation - Running All Tests")
    print("=" * 50)

    # Note: Using a small subset for demonstration
    # For full benchmark verification, use larger subset or entire dataset

    # 1. Image Captioning
    caption_results, caption_scores = evaluate_blip_hf_captioning(subset_size)

    # 2. Visual Question Answering
    vqa_comparison, vqa_accuracy, vqa_results = evaluate_vqa(subset_size)

    # 3. Image-Text Retrieval
    retrieval_comparison, retrieval_scores = evaluate_image_text_retrieval(subset_size)

    # 4. Zero-Shot Flickr30K Retrieval
    flickr_comparison, _ = evaluate_zero_shot_flickr30k(subset_size)

    # 5. NLVR2
    try:
        nlvr2_comparison, _ = evaluate_nlvr2(subset_size)
    except Exception as e:
        print(f"NLVR2 evaluation requires additional setup. Skipping... Error: {str(e)}")
        nlvr2_comparison = None

    print("=" * 50)
    print("BLIP Benchmark Evaluation - All Tests Completed")
    print("=" * 50)

    # Summarize. Scores that an evaluation failed to produce print as "N/A"
    # instead of crashing the summary.
    print("\nSummary of Benchmark Results:")
    print(f"Subset size used: {subset_size} samples")

    # Captioning metrics are scaled by 100 so they are on the same scale as
    # the paper numbers (plot_benchmark_comparison applies the same scaling).
    scores = caption_scores or {}
    print("1. Image Captioning:")
    print(f"   BLEU@4: {_format_score(scores.get('Bleu_4'), 100, '%')} (Paper: 39.7%)")
    print(f"   CIDEr: {_format_score(scores.get('CIDEr'), 100)} (Paper: 133.3)")

    print("\n2. Visual Question Answering:")
    print(f"   Accuracy: {_format_score(vqa_accuracy, 100, '%')} (Paper VQA test-dev: 78.25%)")

    print("\n3. Image-Text Retrieval:")
    if retrieval_scores is not None:
        tr_recall, ir_recall = retrieval_scores
        tr_at_1, ir_at_1 = tr_recall.get(1), ir_recall.get(1)
    else:
        tr_at_1 = ir_at_1 = None
    print(f"   Text Retrieval R@1: {_format_score(tr_at_1, 1, '%')} (Paper: 81.9%)")
    print(f"   Image Retrieval R@1: {_format_score(ir_at_1, 1, '%')} (Paper: 64.3%)")

    print("\n4. Zero-Shot Flickr30K:")
    print("   Results require the actual Flickr30K dataset (Paper TR@1: 94.8%, IR@1: 84.9%)")

    print("\n5. NLVR2:")
    print("   Results require full BLIP implementation (Paper dev: 82.67%, test-P: 82.30%)")

    print("\nNote: These results are based on a small subset and may not fully match paper results.")
    print("For accurate comparison, the full validation sets should be used.")

    return {
        "caption_comparison": None,  # Placeholder, since we are not explicitly calculating comparison in this function
        "caption_scores": caption_scores,
        "vqa_comparison": vqa_comparison,
        "vqa_accuracy": vqa_accuracy,
        "retrieval_comparison": retrieval_comparison,
        "retrieval_scores": retrieval_scores,
        "flickr_comparison": flickr_comparison,
        "nlvr2_comparison": nlvr2_comparison
    }

# Run the evaluation
if __name__ == "__main__":
    # Run all evaluations with the subset size you prefer
    results = run_all_evaluations(subset_size=100)  # You can adjust the subset_size as needed

    print("\nAll Evaluations Completed!")

    # run_all_evaluations returns a dict of aggregate results (not a list of
    # per-image caption records), so report the captioning metrics it holds.
    if results:
        print("\nSummary of Image Captioning Results:")
        caption_scores = results.get("caption_scores") or {}
        if caption_scores:
            for metric, value in caption_scores.items():
                print(f"{metric}: {value}")
        else:
            print("No captioning metrics were returned.")
    else:
        print("Evaluation failed. Please check the errors above.")

Running in Colab, checking dependencies...
Removing existing BLIP directory to avoid nesting...
Cloning BLIP repo...
Cloning into 'BLIP'...
remote: Enumerating objects: 277, done.[K
remote: Counting objects: 100% (183/183), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 277 (delta 145), reused 137 (delta 137), pack-reused 94 (from 1)[K
Receiving objects: 100% (277/277), 7.04 MiB | 19.06 MiB/s, done.
Resolving deltas: 100% (152/152), done.
/content/BLIP
Downloading COCO validation dataset...
--2025-04-16 00:01:50--  http://images.cocodataset.org/zips/val2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.217.11.156, 16.15.217.168, 52.217.87.28, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|52.217.11.156|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 815585330 (778M) [application/zip]
Saving to: ‘datasets/coco/val2017.zip’


2025-04-16 00:02:30 (19.8 MB/s) - ‘datasets/coco/val2017.zip’ saved 

  0%|          | 0/100 [00:00<?, ?it/s]

Calculating metrics...
{'testlen': 740, 'reflen': 903, 'guess': [740, 640, 540, 440], 'correct': [548, 282, 116, 48]}
ratio: 0.8194905869315399
Error calculating METEOR: could not convert string to float: b'5.0 11.0 2.0 5.0 3.0 3.0 2.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 5.0 5.0'

Example Generated Captions:
Image: 000000301061.jpg
Generated: a man is standing next to a truck with a baby elephant
References: ['A person is moving green hay towards an elephant that is inside the back of a white truck.', 'A man pulls an elephant out of a truck.', 'An elephant in the back of a truck trailer.', 'An elephant is being fed while in a truck.', 'There is an elephant in side a truck trying to come out']

Image: 000000261982.jpg
Generated: a man riding a skateboard down a street
References: ['A man flying through the air while riding a skateboard.', 'a guy having a spill on his skate board', 'A person on a skateboard does a flip in a parking lot.', 'A person is riding on a skat

Exception ignored in: <function Meteor.__del__ at 0x790e634db420>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pycocoevalcap/meteor/meteor.py", line 78, in __del__
    self.lock.acquire()
KeyboardInterrupt: 



Evaluating Visual Question Answering...
Using sample size of 100 QA pairs (from total of 214354)


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

100%|██████████| 1.35G/1.35G [00:46<00:00, 30.8MB/s]


load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth
Answering 100 questions...


  0%|          | 0/100 [00:00<?, ?it/s]

Error processing datasets/coco/val2017/000000397303.jpg: The size of tensor a (3) must match the size of tensor b (9) at non-singleton dimension 0
Error processing datasets/coco/val2017/000000410221.jpg: The size of tensor a (3) must match the size of tensor b (9) at non-singleton dimension 0
Error processing datasets/coco/val2017/000000425226.jpg: The size of tensor a (3) must match the size of tensor b (9) at non-singleton dimension 0
Error processing datasets/coco/val2017/000000021604.jpg: The size of tensor a (3) must match the size of tensor b (9) at non-singleton dimension 0
Error processing datasets/coco/val2017/000000311303.jpg: The size of tensor a (3) must match the size of tensor b (9) at non-singleton dimension 0
Error processing datasets/coco/val2017/000000445658.jpg: The size of tensor a (3) must match the size of tensor b (9) at non-singleton dimension 0
Error processing datasets/coco/val2017/000000482970.jpg: The size of tensor a (3) must match the size of tensor b (9) 

100%|██████████| 1.78G/1.78G [01:01<00:00, 30.9MB/s]


load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth
Preparing 100 images and captions for retrieval...


  0%|          | 0/100 [00:00<?, ?it/s]

Calculating retrieval scores...


  0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 