In [6]:
from __future__ import annotations

import os
from dotenv import load_dotenv

import dspy
from datasets import load_dataset
from dspy.teleprompt.gepa.instruction_proposal import MultiModalInstructionProposer as DefaultMultiModalProposer

load_dotenv()

# Disable cache for fresh runs
dspy.configure_cache(enable_disk_cache=False, enable_memory_cache=False)


class ImageCaption(dspy.Signature):
    """Generate a caption for an image that describes the text visible in the image."""
    image: dspy.Image = dspy.InputField(desc="Image containing text that needs to be read and described")
    caption: str = dspy.OutputField(desc="A detailed caption describing the text and context visible in the image")


def load_textcaps_dataset(num_train: int = 20, num_val: int = 10, num_test: int = 10):
    """
    Load TextCaps dataset from HuggingFace.
    
    TextCaps is an image captioning dataset that requires reading text in images.
    """
    print("Loading TextCaps dataset...")
    
    try:
        # Try loading from HuggingFace
        dataset = load_dataset("lmms-lab/TextCaps", split="train")
    except Exception as e:
        print(f"Error loading from HuggingFace: {e}")
        print("Trying alternative dataset name...")
        try:
            dataset = load_dataset("textcaps", split="train")
        except Exception as e2:
            print(f"Error loading alternative: {e2}")
            raise Exception("Failed to load TextCaps dataset. Please check your internet connection and dataset availability.")

    
    # Convert to DSPy format
    train_set = []
    val_set = []
    test_set = []
    
    # Process train split
    processed = 0
    skipped = 0
    
    for i, example in enumerate(dataset.select(range(min(num_train, len(dataset))))):
        try:
            # Try to get image URL first (preferred for dspy.Image)
            image_url = None
            # Try URL fields first
            for key in ["flickr_original_url", "flickr_300k_url", "image_url", "url"]:
                if key in example and example[key] is not None:
                    url_value = example[key]
                    if isinstance(url_value, str) and url_value.strip():
                        image_url = url_value.strip()
                        break
            
            # If no URL, try image path
            image_path = None
            if not image_url:
                for key in ["image_path", "path", "file_name", "filename"]:
                    if key in example and example[key] is not None:
                        path_value = example[key]
                        if isinstance(path_value, str) and path_value.strip():
                            image_path = path_value.strip()
                            break
            
            # If we have a PIL Image but no URL/path, we'll need to skip or convert
            # For now, skip if no URL or path
            if not image_url and not image_path:
                skipped += 1
                if i < 3:  # Print first few for debugging
                    print(f"Example {i}: No image URL or path found. Keys: {list(example.keys())}")
                continue
            
            # Try multiple field name variations for caption
            caption = None
            for key in ["caption_str", "caption", "caption_text", "text", "text_caption", "str", "label", "reference_strs"]:
                if key in example and example[key] is not None:
                    caption_value = example[key]
                    # Handle list of captions (take first one)
                    if isinstance(caption_value, list):
                        if len(caption_value) > 0:
                            caption = str(caption_value[0]).strip()
                    else:
                        caption = str(caption_value).strip()
                    if caption:  # Only use if non-empty
                        break
            
            if not caption:
                skipped += 1
                if i < 3:  # Print first few for debugging
                    print(f"Example {i}: No caption found. Keys: {list(example.keys())}")
                continue
            
            # Create dspy.Image from URL or path
            if image_url:
                img = dspy.Image(url=image_url)
            elif image_path:
                if image_path.startswith("http"):
                    img = dspy.Image(url=image_path)
                else:
                    img = dspy.Image(path=image_path)
            else:
                skipped += 1
                if i < 3:
                    print(f"Example {i}: Could not create image from available data")
                continue
            
            train_set.append(
                dspy.Example(image=img, caption=caption).with_inputs("image")
            )
            processed += 1
            
        except Exception as e:
            skipped += 1
            if i < 3:  # Print first few errors for debugging
                print(f"Error processing example {i}: {e}")
                import traceback
                traceback.print_exc()
            continue
    
    print(f"Loaded {len(train_set)} training examples (processed: {processed}, skipped: {skipped})")
    
    if len(train_set) == 0:
        print("\nDebugging info:")
        if len(dataset) > 0:
            print(f"First example keys: {list(dataset[0].keys())}")
            print(f"First example values types: {[(k, type(v)) for k, v in dataset[0].items()]}")
    
    return train_set, val_set, test_set


def textcaps_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    """
    Metric for TextCaps: Compare predicted caption with gold caption.
    Uses simple string matching - can be enhanced with BLEU/ROUGE scores.
    """
    pred_caption = getattr(pred, "caption", "").strip().lower()
    gold_caption = getattr(gold, "caption", "").strip().lower()
    
    # Simple exact match
    if pred_caption == gold_caption:
        score = 1.0
        feedback = "Perfect match!"
    # Check for word overlap
    elif pred_caption and gold_caption:
        pred_words = set(pred_caption.split())
        gold_words = set(gold_caption.split())
        overlap = len(pred_words & gold_words)
        total = len(gold_words)
        score = overlap / total if total > 0 else 0.0
        feedback = f"Word overlap: {overlap}/{total} words. Expected: '{gold.caption}', Got: '{pred_caption}'"
    else:
        score = 0.0
        feedback = f"Missing caption. Expected: '{gold.caption}', Got: '{pred_caption}'"
    
    return dspy.Prediction(score=score, feedback=feedback)


# Configure LM and load dataset
dspy.settings.configure(
    lm=dspy.LM("gpt-5-nano", temperature=1.0, max_tokens=16000, cache=False)
)

trainset, valset, testset = load_textcaps_dataset(num_train=20, num_val=10, num_test=10)

if not trainset:
    raise Exception("No training data available. Please check the dataset loading.")

# Use validation set as train if no separate valset
if not valset:
    valset = trainset[:5]  # Use first 5 as validation
    trainset = trainset[5:]  # Rest as training

print(f"\nDataset: {len(trainset)} train, {len(valset)} val examples")

Loading TextCaps dataset...
Loaded 20 training examples (processed: 20, skipped: 0)

Dataset: 15 train, 5 val examples


In [3]:
# ============================================================================
# GEPA with Default Multimodal Proposer (single LLM, no reranking)
# ============================================================================

print("\n" + "="*70)
print("Running GEPA with Default Multimodal Proposer (single LLM)")
print("="*70)

# Use default multimodal proposer (single LLM, no reranking)
default_proposer = DefaultMultiModalProposer()

program = dspy.Predict(ImageCaption)

optimizer = dspy.GEPA(
    metric=textcaps_metric,
    auto="light",
    candidate_selection_strategy="current_best",
    instruction_proposer=default_proposer,
)

default_program = optimizer.compile(program, trainset=trainset, valset=valset)

# Evaluate on validation set
from dspy.evaluate import Evaluate
evaluate = Evaluate(metric=textcaps_metric, num_threads=1)
default_score = evaluate(default_program, valset=valset)

print(f"\nDefault Multimodal Proposer Final Score: {default_score:.2%}")

# Save results
default_program.save("gepa_textcaps_default.json")
print("Saved to: gepa_textcaps_default.json")


2025/11/16 20:27:39 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 400 metric calls of the program. This amounts to 20.00 full evals on the train+val set.
2025/11/16 20:27:39 INFO dspy.teleprompt.gepa.gepa: Using 5 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.



Running GEPA with Default Multimodal Proposer (single LLM)


GEPA Optimization:   0%|          | 0/400 [00:00<?, ?rollouts/s]2025/11/16 20:27:54 INFO dspy.evaluate.evaluate: Average Metric: 2.0277777777777777 / 5 (40.6%)
2025/11/16 20:27:54 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.40555555555555556
GEPA Optimization:   1%|▏         | 5/400 [00:14<19:22,  2.94s/rollouts]2025/11/16 20:27:54 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.40555555555555556


Average Metric: 1.80 / 3 (60.0%): 100%|██████████| 3/3 [00:12<00:00,  4.09s/it]

2025/11/16 20:28:06 INFO dspy.evaluate.evaluate: Average Metric: 1.8 / 3 (60.0%)





2025/11/16 20:28:25 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for self: Your task is to generate a concise, 1–2 sentence caption that accurately describes the image, with explicit incorporation of any clearly visible text. Do not add details that are not observable. When text is legible, mention the text or the branding as part of the scene description to support the caption.

Guidelines
- Visual analysis
  - Identify the main subjects (objects, people, scenes), their colors, shapes, quantities, and spatial relationships (what’s in foreground vs. background, left/right positions, scale).
  - Note any distinctive branding, logos, labels, numbers, or other readable text, and their prominence.
- Text processing
  - Extract legible text using OCR-like observation. If text is short and clearly legible, quote it verbatim in the caption; if longer, summarize the key words or brand name.
- Integration
  - Combine the visual description with the key, visible text to produce

Average Metric: 0.84 / 3 (27.9%): 100%|██████████| 3/3 [00:08<00:00,  2.81s/it]

2025/11/16 20:29:01 INFO dspy.evaluate.evaluate: Average Metric: 0.8363636363636364 / 3 (27.9%)





2025/11/16 20:29:25 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for self: - Task definition: Generate a concise, accurate caption for an image that describes the main scene and any clearly legible text within the image. Do not identify real people by name or guess identities beyond visible cues. Use both visual observations and visible text to inform the caption, but avoid exhaustively listing every word or item.

- Visual analysis guidance (what to look for and describe):
  - Identify the primary subject(s): objects, people, setting, actions, or events (e.g., beverage bottles, a book cover, a group of people at a sporting event).
  - Note salient relationships and actions: what the subjects are doing, how they’re arranged, and any motion or interaction.
  - Describe key visual features: colors, shapes, labels, logos, fonts, and layout that help define the scene (e.g., “a row of bottles with blue caps,” “a book cover with a large title”).
  - If text is visible, note

Average Metric: 1.40 / 3 (46.6%): 100%|██████████| 3/3 [00:10<00:00,  3.44s/it]

2025/11/16 20:29:57 INFO dspy.evaluate.evaluate: Average Metric: 1.398989898989899 / 3 (46.6%)





2025/11/16 20:30:17 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for self: You are an image captioning assistant whose task is to generate a concise caption that centers on the legible text visible in the image. Follow these rules:

- Primary goal: Create a caption that describes the image using only the text that is clearly readable in the image. Build the caption around the most prominent visible text (words, numbers, brand names, titles, logos).
- Visual-text integration: First extract the readable text, then craft a natural one-sentence caption that anchors on that text. If the text names a product, brand, movie, or work, mention it by name.
- Caption style: One sentence only, concise (roughly 6–14 words). Avoid listing many small visual details; focus on the text-driven description.
- Handling multiple text blocks: If several texts are visible, prioritize the most salient (largest or clearest). If no legible text is present, provide a brief generic caption about 

Average Metric: 0.79 / 3 (26.4%): 100%|██████████| 3/3 [00:16<00:00,  5.38s/it]

2025/11/16 20:30:43 INFO dspy.evaluate.evaluate: Average Metric: 0.792929292929293 / 3 (26.4%)





2025/11/16 20:31:03 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for self: - Task definition: For each image, produce a concise caption that (a) describes the main visual scene and objects, and (b) explicitly transcribes any legible text visible in the image. The caption should reflect what a viewer can read in the image and how the text relates to the scene.
- Visual analysis guidance:
  - Identify the primary objects (e.g., bottle, glass, table) and their spatial relationships (left/right of, on, next to).
  - Scan the image for readable text on objects, signage, or packaging. Note exact words, capitalization, and any numbers.
  - If text is partially legible or blurred, transcribe the clearly readable parts and indicate uncertainty for unclear portions.
  - Prioritize text that appears prominently or is central to identifying the product, brand, or context.
- Text processing rules:
  - Transcribe legible text exactly as it appears (including capitalization and punc

Average Metric: 1.17 / 3 (39.0%): 100%|██████████| 3/3 [00:08<00:00,  2.98s/it]

2025/11/16 20:31:47 INFO dspy.evaluate.evaluate: Average Metric: 1.1696969696969697 / 3 (39.0%)





2025/11/16 20:32:13 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for self: Goal:
- Generate a concise caption that describes only the visible text in an image. Do not attempt to describe non-text visuals in detail unless they are directly relevant to the text (e.g., a page layout or a label design that affects how the text is read).

What to do (step-by-step):
1) Extract visible text
   - Use OCR-like reasoning to identify legible words, phrases, headings, titles, logos containing text, and any dates or numbers.
   - Note which text is clearly readable and which parts are partially legible or blurred.

2) Determine the main textual subject
   - Identify the primary topic or purpose of the text (e.g., a page title, a brochure heading, a product label, a book page, etc.).
   - If possible, determine the language of the text from what is readable.

3) Write a concise caption (1–2 sentences)
   - Start with a brief description of what the text is about or what kind of tex

Average Metric: 1.78 / 3 (59.3%): 100%|██████████| 3/3 [00:12<00:00,  4.07s/it]

2025/11/16 20:32:39 INFO dspy.evaluate.evaluate: Average Metric: 1.7787878787878786 / 3 (59.3%)





2025/11/16 20:32:57 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for self: Task: Generate a concise, one- to two-sentence caption for the given image that centers on the text visible in the image and describes the scene that the text conveys. Do not dump a long transcription of everything you see; instead, integrate the key visible text into a brief description of the image.

Guidelines
- Visual analysis
  - Identify the main subject or scene (e.g., a book cover, a booklet page, a row of cans, a poster).
  - Note prominent, legible text and its placement, size, color, and font style; indicate language when relevant.
  - Mention any logos, titles, subtitles, or labels that define the context of the image.
  - If text is illegible due to image quality, state that clearly and still describe the overall scene.

- Text integration
  - Include the most informative visible text in the caption (e.g., the title, major subtitle, product name) without enumerating every word.
  -

Average Metric: 1.56 / 3 (51.9%): 100%|██████████| 3/3 [00:08<00:00,  2.79s/it]

2025/11/16 20:33:14 INFO dspy.evaluate.evaluate: Average Metric: 1.5555555555555556 / 3 (51.9%)





2025/11/16 20:33:30 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for self: You will be given an image. Produce a concise caption that accurately describes the main subject or scene, and include any clearly legible text that appears in the image if it helps identify the subject. Do not over-describe or invent details.

Guidelines:
- Visual analysis
  - Identify the primary subject(s) in the foreground and the overall setting.
  - Note any text that is legible on objects (labels, signage, posters, books, packaging, etc.), and transcribe it exactly.
  - Recognize logos, brands, and notable features (e.g., a framed collage, a bottle label, a sign with a title).
  - If there are multiple items, decide which is the main focus and describe that first.
- Text integration
  - If readable, incorporate key text into the caption (brand names, product names, dates, titles) in natural phrasing.
  - Do not add unrelated textual details that aren’t visible.
- Style and length
  - Use

Average Metric: 0.81 / 3 (26.9%): 100%|██████████| 3/3 [00:11<00:00,  3.71s/it]

2025/11/16 20:33:52 INFO dspy.evaluate.evaluate: Average Metric: 0.8060606060606059 / 3 (26.9%)





2025/11/16 20:34:10 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for self: You are an AI that generates a concise, descriptive caption for a single image. Your caption should reflect both the visible scene and any legible text present in the image. Follow these guidelines to analyze and compose the caption:

- Task definition
  - Produce one short caption (1–3 sentences) that describes what is happening or what the scene is, plus any clearly legible text you can extract.
  - Do not invent facts about people, places, or events that aren’t discernible in the image.

- Visual analysis (what to look for)
  - Identify the main subject(s) and the setting (indoor/outdoor, objects, activities, actions, states like “standing,” “on a shelf,” “at a stadium,” etc.).
  - Note salient details: number of items, colors, shapes, orientations, positions (left/right/center), and any notable textures or patterns.
  - Mention relationships and spatial layout (e.g., “books on a shelf,” “bo

Average Metric: 1.99 / 3 (66.4%): 100%|██████████| 3/3 [00:08<00:00,  2.95s/it]

2025/11/16 20:34:29 INFO dspy.evaluate.evaluate: Average Metric: 1.991919191919192 / 3 (66.4%)





2025/11/16 20:34:53 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for self: Task: Generate a concise caption for an image that accurately reflects the visible text and its context, not a full OCR dump or extraneous scene description.

Guidelines:
- Visual-text analysis
  - Identify the primary object in the image (e.g., beer bottle, monitor, book page) and the most legible text on or in relation to that object.
  - Use OCR-style reading to extract readable text blocks, prioritizing brand names, product/series names, edition/volume, titles, authors, and any dates visible.
  - If text is partially obscured or unreadable, note that some text is not legible and avoid guessing the missing words.
- Caption construction
  - Create a single concise sentence (or up to two short sentences) that describes the object and the key text content visible.
  - Do not reproduce long strings verbatim; paraphrase into natural-sounding English while preserving the essential information (e.g

Average Metric: 0.86 / 3 (28.6%): 100%|██████████| 3/3 [00:09<00:00,  3.06s/it]

2025/11/16 20:35:11 INFO dspy.evaluate.evaluate: Average Metric: 0.8585858585858586 / 3 (28.6%)





2025/11/16 20:35:34 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Proposed new text for self: You are an image captioning assistant. Your task is to write a single, concise caption that describes what is visible in the image and, when present, references any legible text shown in the image. Do not guess identities or events beyond what the image supports. Use the visible text to anchor the caption and integrate it with the visual content.

Guidelines:
- Read and transcribe legible text exactly as it appears (including capitalization and punctuation). Use this text to identify key subjects (e.g., book titles, author names, country names, logos, slogans).
- Analyze the visual scene: identify main subjects (people, objects), their actions, arrangement, colors, and any notable features (e.g., stacked books, jerseys with text, a person reading, a presentation).
- Integration of text and visuals:
  - If the image contains text that names a person, brand, title, or location, incorporate that 

Average Metric: 0.94 / 3 (31.3%): 100%|██████████| 3/3 [00:12<00:00,  4.30s/it]

2025/11/16 20:36:01 INFO dspy.evaluate.evaluate: Average Metric: 0.9393939393939394 / 3 (31.3%)





2025/11/16 20:36:24 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Proposed new text for self: Task: For each image, generate a single, concise caption that centers on the visible text within the image and explains what the text communicates about the scene. Do not attempt to describe every visual detail; use the legible words, logos, titles, or slogans to anchor a clear interpretation of the image.

Visual analysis guidelines:
- Identify all legible text in the image (e.g., names, titles, brands, slogans, dates) and transcribe them as accurately as possible.
- Note the source or context implied by the text (e.g., movie poster, stadium signage, product packaging, album cover).
- Recognize logos or brand marks that help identify the scene and connect to the text.

Textual integration guidelines:
- Construct a caption that incorporates the key text elements to convey the main idea or context of the image (e.g., “A framed Top Gun display,” “Fly Emirates signage visible at a stadium,” “Dyla

Average Metric: 1.82 / 3 (60.6%): 100%|██████████| 3/3 [00:10<00:00,  3.41s/it]

2025/11/16 20:36:42 INFO dspy.evaluate.evaluate: Average Metric: 1.816919191919192 / 3 (60.6%)





2025/11/16 20:37:08 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Proposed new text for self: You are asked to generate a concise, natural-language caption for an image that describes both the visible scene and any legible text on packaging, signs, or labels. Do not rely on guesswork about unreadable details; base your caption on what can be seen and read.

Guidelines:
- Identify the main items (e.g., bottle, can, glass, table) and their spatial relationships (on a table, beside, in front of).
- Read legible text and extract the essential identifiers. Prefer concise phrases like “A bottle of X,” “A can of Y root beer,” or “A glass of Z” and mention the variant if the text is clearly visible (e.g., Imperial White, Premium Lager, Alhambra).
- When multiple items are present, describe the primary item and describe the other items in relation to it (e.g., “A bottle of X sits beside a glass of beer.”).
- Domain knowledge to use: common product naming conventions (brand + product type/varian

Average Metric: 1.65 / 3 (54.8%): 100%|██████████| 3/3 [00:14<00:00,  4.71s/it]

2025/11/16 20:37:33 INFO dspy.evaluate.evaluate: Average Metric: 1.6454545454545455 / 3 (54.8%)





2025/11/16 20:37:56 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Proposed new text for self: You are an image-to-captioning assistant. Your task is to generate a concise, text-focused caption that accurately reflects both the visible scene and the key textual content in the image. Do not simply describe all visual details or reproduce long blocks of text. Instead, identify the main subject and extract the most salient visible text, then craft a short, one-sentence caption that conveys that content.

Guidelines

- Visual analysis:
  - Identify the primary object or scene (e.g., a book’s title page, a book cover, a row of beverage bottles).
  - Note layout cues (where the main text sits, any subtitles, branding bands, or edition information).

- Text extraction and emphasis:
  - Treat the most prominent visible text as the anchor of the caption (e.g., the main title, a subtitle, or a brand name).
  - If multiple languages or subtitles are visible, mention them briefly as part of the cap

Average Metric: 1.09 / 3 (36.5%): 100%|██████████| 3/3 [00:17<00:00,  5.87s/it]

2025/11/16 20:38:25 INFO dspy.evaluate.evaluate: Average Metric: 1.094949494949495 / 3 (36.5%)





2025/11/16 20:38:46 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Proposed new text for self: Task: Produce a concise caption that describes the text visible in the image, and the surrounding scene, based on legible OCR.

What to do
- Visual analysis focused on text: Detect and read all legible text on visible surfaces (labels, packaging, signs, posters). Transcribe the text exactly as it appears (case, punctuation, spacing) when readable.
- Text-aware scene description: Identify what objects carry the text (e.g., product cans, book spines, t-shirts, signs) and describe the overall scene (how many items, their arrangement, setting, colors) in a brief caption.
- Integrate text with visuals: Create a single caption (1–2 sentences) that combines the readable text with a high-level description of the scene. Prioritize the main, readable brand or product names and phrases, then add context about the scene.
- Examples of integration patterns:
  - If the brand name is clearly visible, mention

Average Metric: 1.48 / 3 (49.2%): 100%|██████████| 3/3 [00:16<00:00,  5.48s/it]

2025/11/16 20:39:12 INFO dspy.evaluate.evaluate: Average Metric: 1.4767676767676767 / 3 (49.2%)





2025/11/16 20:39:49 INFO dspy.teleprompt.gepa.gepa: Iteration 15: Proposed new text for self: Task: For each given image, generate a concise caption that describes the visible text in the image and its context within the layout. The caption should prioritize what the text says, the language, and how the text relates to other visual elements. Do not rely on or invent non-text visual details unless they help explain the text’s placement or meaning.

Guidelines:
- Visual analysis
  - Identify and describe legible text blocks, headings, labels, logos, and any page layout cues (e.g., spiral binding, margins, two-column formats, titles near the top).
  - Infer the type of document or scene from the text layout (e.g., brochure page, book page, poster, catalog spread, magazine spread).
  - Note orientation, color cues, and where text appears relative to images or diagrams.

- Text processing
  - Transcribe exactly what is legible, including punctuation, capitalization, and line breaks when the

Average Metric: 1.48 / 3 (49.3%): 100%|██████████| 3/3 [00:09<00:00,  3.30s/it]

2025/11/16 20:40:18 INFO dspy.evaluate.evaluate: Average Metric: 1.4777777777777776 / 3 (49.3%)





2025/11/16 20:40:48 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Proposed new text for self: You will generate a concise, caption-style description of the image. The caption should be a single fluent sentence that identifies the main subject and its context. If clearly legible text in the image helps identify the subject, include the essential words (for example a brand name, a title, or a name). Do not attempt to transcribe every word or describe every detail.

Visual analysis guidance:
- Determine the primary subject (e.g., poster, framed display, stack of cans, book cover) and its placement (on a wall, on a shelf, close-up).
- Identify readable text and decide whether mentioning it improves identification. Include only the key text that anchors the image (e.g., FRESCA, Top Gun, Dylan Thomas).
- If multiple items are visible, summarize the scene concisely (e.g., “many cans on a shelf,” “a framed collage on a wall”).
- Use domain knowledge to name formats (poster, LP cover, can, book

Average Metric: 1.68 / 3 (55.9%): 100%|██████████| 3/3 [00:08<00:00,  2.97s/it]

2025/11/16 20:41:03 INFO dspy.evaluate.evaluate: Average Metric: 1.676767676767677 / 3 (55.9%)





2025/11/16 20:41:25 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Proposed new text for self: You are to generate a concise caption for an image that foregrounds and accurately reflects the text visible in the image. Follow these rules:

- Focus on legible text: identify all readable words, phrases, numbers, brand names, titles, captions, and other textual elements present in the image.
- Read and reproduce text precisely when possible: preserve capitalization, punctuation, and formatting as observed. If some text is partially obscured or unclear, indicate this clearly (e.g., [TEXT partially visible: "..."]).
- Tie text to the visual scene: describe how the visible text relates to the objects or setting you see (e.g., a title page reading..., a bottle labeled..., a sign that says...).
- Do not make claims beyond what the image shows: avoid assumptions about people, actions, or events unless they are implied by the visible text or clearly depicted.
- When multiple text blocks exist, pri

Average Metric: 1.25 / 3 (41.8%): 100%|██████████| 3/3 [00:09<00:00,  3.07s/it]

2025/11/16 20:41:53 INFO dspy.evaluate.evaluate: Average Metric: 1.2545454545454546 / 3 (41.8%)





2025/11/16 20:42:20 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Proposed new text for self: Your task is to generate a caption for an image that describes the visible text content in the image, not a general description of the scene. Follow these steps:

- OCR-first: extract all legible text from the image (brand names, product names, titles, subtitles, lists, measurements, dates, etc.).
- Identify salience: determine the main text block (e.g., the book title, bottle label) and any supporting text that clarifies what the text refers to.
- Compose a concise caption (ideally 1–2 sentences). Report the most important visible text clearly in natural language. Include exact short text when it helps identify the item (e.g., “Grand Place”, “A&W”, “12 FL OZ (355 ml)”). Do not reproduce long verbatim descriptions of packaging or decoration.
- If multiple text blocks exist, summarize them rather than listing every word (e.g., “The label reads ‘A&W Root Beer’ and ‘12 FL OZ (355 ml)’”).
- If the

Average Metric: 1.25 / 3 (41.8%): 100%|██████████| 3/3 [00:15<00:00,  5.09s/it]

2025/11/16 20:42:46 INFO dspy.evaluate.evaluate: Average Metric: 1.2525252525252526 / 3 (41.8%)





2025/11/16 20:43:32 INFO dspy.teleprompt.gepa.gepa: Iteration 19: Proposed new text for self: Task: Generate a concise, objective caption for the given image that describes the main subject, its composition, and any clearly legible text present. Do not repeat long text verbatim; instead, summarize readable text when it adds essential context (e.g., brand names or famous titles) and paraphrase or omit extraneous strings. Focus on what a viewer would notice at a glance: the primary object(s), their orientation and arrangement, dominant colors and textures, and any visible labels or branding.

Guidelines:
- Visual analysis
  - Identify the central scene or object (e.g., a row of book spines, a monitor on a stand, a stack of books).
  - Describe layout and orientation (vertical spines, angled stack, overlapping items, perspective, lighting).
  - Note dominant colors, materials, and textures (e.g., glossy red spines, yellow covers, black monitor).
- Text handling
  - Detect any legible text

Average Metric: 1.24 / 3 (41.4%): 100%|██████████| 3/3 [00:11<00:00,  3.94s/it]

2025/11/16 20:43:53 INFO dspy.evaluate.evaluate: Average Metric: 1.2424242424242424 / 3 (41.4%)





2025/11/16 20:44:16 INFO dspy.teleprompt.gepa.gepa: Iteration 20: Proposed new text for self: Task: For each image provided, generate a concise caption that describes the text visible in the image.

Guidelines for visual and text processing
- OCR-first approach: Identify all legible text blocks (brand names, product names, headlines, slogans, signage, page titles, numbers, dates, languages). Prioritize the most prominent or informative text.
- Exact text handling: When text is clearly legible, mention the exact wording or a close paraphrase that reflects the visible characters, including capitalization and any distinctive punctuation.
- Text-centric captioning: Center the caption on what the text says, not on every visual detail. Include the brand or product name if it is clearly visible, and indicate the overall context if the text strongly implies it (e.g., a product label, a book page, or a stadium scoreboard).
- Multiple text blocks: If several text elements are visible, identify t

Average Metric: 1.27 / 3 (42.4%): 100%|██████████| 3/3 [00:17<00:00,  5.82s/it]

2025/11/16 20:44:43 INFO dspy.evaluate.evaluate: Average Metric: 1.2727272727272727 / 3 (42.4%)





2025/11/16 20:45:05 INFO dspy.teleprompt.gepa.gepa: Iteration 21: Proposed new text for self: Task
- Generate a concise caption for an image that centers on the main subject and explicitly references the legible text visible on or in the scene.

What to do
- Visual analysis
  - Identify the primary object or scene element that conveys the most information.
  - Use OCR-style reading to extract text that is clearly legible (brand names, product lines, model names, slogans, etc.).
  - Prioritize text that identifies the object (e.g., brand and product name) and its variant (if visible), rather than non-critical background details.
  - If text is partially obscured, describe the object and keep any legible text if it clearly identifies the item; otherwise describe the object without fabricating text.
- Text integration
  - Create a single, concise caption that combines the visual subject with the most salient visible text.
  - Preferred caption structure: "A [object/brand] [variant] [type]

Average Metric: 1.28 / 3 (42.7%): 100%|██████████| 3/3 [00:10<00:00,  3.43s/it]

2025/11/16 20:45:23 INFO dspy.evaluate.evaluate: Average Metric: 1.2818181818181817 / 3 (42.7%)





2025/11/16 20:45:44 INFO dspy.teleprompt.gepa.gepa: Iteration 22: Proposed new text for self: Your task is to generate a concise, objective caption for a given image that accurately reflects both the visible scene and any text that is legible within the image. Follow these rules:

- Visual analysis: Identify the main subjects (people, objects), their actions or posture, setting, counts, colors, and notable details. Describe only observable content; avoid inferring emotions, motives, or backstory.

- Text extraction and integration: Read all legible text in the image (OCR-friendly). Include key readable text in the caption when it helps identify the scene or subjects (e.g., logos, labels, or large, clear words). If multiple text elements exist, prioritize the most salient ones. Paraphrase long text and keep exact phrases only for clearly legible, important text.

- Caption construction: Write one or two short sentences in the present tense. Start with a broad description of the scene, t

Average Metric: 0.67 / 3 (22.2%): 100%|██████████| 3/3 [00:11<00:00,  3.80s/it]

2025/11/16 20:46:14 INFO dspy.evaluate.evaluate: Average Metric: 0.6666666666666666 / 3 (22.2%)





2025/11/16 20:46:56 INFO dspy.teleprompt.gepa.gepa: Iteration 23: Proposed new text for self: Task: For every provided image, generate a caption that precisely describes the text that is visibly present in the image. The caption should reproduce the exact words visible (including capitalization, punctuation, and line breaks when relevant) and indicate where the text appears (e.g., on a book cover, poster, spine, logo). If multiple text blocks exist, prioritize the most prominent text first and then mention other legible text as needed. Do not infer information not visible in the image; where text is unclear, label it as [unclear] or [illegible].

Visual analysis guidelines:
- Perform OCR-like extraction to identify all legible text in the image.
- Capture exact strings as they appear, preserving case and punctuation.
- Determine the prominence and placement of each text element (top/bottom, large title, author name, logo, slogan).
- Distinguish text from non-text visual elements, but u

Average Metric: 1.53 / 3 (51.1%): 100%|██████████| 3/3 [00:09<00:00,  3.06s/it]

2025/11/16 20:47:17 INFO dspy.evaluate.evaluate: Average Metric: 1.5333333333333332 / 3 (51.1%)





2025/11/16 20:47:38 INFO dspy.teleprompt.gepa.gepa: Iteration 24: Proposed new text for self: Task: Generate a caption that describes the text visible in the image.

Guidelines:
- Focus on the text present in the scene (labels, logos, slogans, numbers, signs, etc.). Do not rely solely on object recognition unless it ties into readable text.
- Visual-text analysis:
  - Identify all readable text blocks, note orientation, legibility, and any partially readable fragments.
  - Transcribe exactly what is legible. If a portion is unreadable, indicate it succinctly (e.g., "readable text includes '...' but '...' is blurred").
- Caption construction:
  - Create a natural, concise caption (1–3 clauses) that highlights the readable text. Examples:
    - "The label shows the word 'FRESCA' prominently." 
    - "Several cans with the text 'Original Citrus' and 'Sparkling Flavored Soda' are visible."
  - If there are multiple legible text elements, mention the most prominent one and optionally note o

Average Metric: 1.40 / 3 (46.6%): 100%|██████████| 3/3 [00:11<00:00,  3.80s/it]

2025/11/16 20:48:28 INFO dspy.evaluate.evaluate: Average Metric: 1.398989898989899 / 3 (46.6%)





2025/11/16 20:48:50 INFO dspy.teleprompt.gepa.gepa: Iteration 25: Proposed new text for self: Task: For every image input, generate a caption that accurately describes the text visible in the image, focusing on extracting and presenting the textual content with precision.

Guidelines:
- Visual/text extraction
  - Use OCR-style reasoning to identify all legible text blocks. Note languages present (e.g., Spanish, Italian, Basque) and indicate if the text appears bilingual.
  - Transcribe the exact visible text blocks as they appear, including capitalization and line breaks where they meaningfully separate blocks (e.g., title vs. subtitle vs. label).
  - Record relative locations of text (top, middle, bottom; left-right) and any visual cues that aid reading (color bands, backgrounds, borders).

- Caption construction
  - Create a concise caption whose core focus is the visible text. Begin with the most prominent text block (usually the title) and then mention other text blocks (subtitles,

Average Metric: 1.95 / 3 (65.0%): 100%|██████████| 3/3 [00:09<00:00,  3.10s/it]

2025/11/16 20:49:15 INFO dspy.evaluate.evaluate: Average Metric: 1.9494949494949496 / 3 (65.0%)





2025/11/16 20:49:31 INFO dspy.teleprompt.gepa.gepa: Iteration 26: Proposed new text for self: Task: Generate a concise, factual caption for a single image that accurately describes the main subject and any clearly legible text visible on objects within the image.

Guidelines:
- Visual focus: Identify the primary object(s) in the scene (e.g., bottle, monitor, glass) and describe their relation if more than one object is prominent.
- Text awareness: Perform OCR on the image and incorporate only clearly legible text into the caption. Include exact brand/product names as they appear (e.g., LG, Samuel Adams Imperial White, Alhambra Premium Lager, Premium Lager).
- Integration: Combine visual description with the extracted text in a natural, one-sentence caption. Example patterns:
  - "A bottle of [Brand] [Product] sits next to a glass on a table."
  - "The [object] displays the brand [Text] on its label."
- Avoid inference: Do not speculate about setting or context beyond what is visible (e

Average Metric: 0.99 / 3 (32.9%): 100%|██████████| 3/3 [00:11<00:00,  3.72s/it]

2025/11/16 20:49:59 INFO dspy.evaluate.evaluate: Average Metric: 0.9858585858585859 / 3 (32.9%)





2025/11/16 20:50:18 INFO dspy.teleprompt.gepa.gepa: Iteration 27: Proposed new text for self: You are a multimodal assistant required to generate a caption for an image with a strong emphasis on any text that is visible in the image. Follow these guidelines:

- Identify and extract all legible text (OCR). Note language, orientation, and clarity. If some text is rotated or partially obscured but still readable, transcribe what you can; if text is unreadable, indicate that clearly.
- Provide an accurate transcription or representative excerpt of the visible text, using quotation marks for exact strings. Do not invent words that aren’t legible.
- Describe the main visual content (people, objects, setting, colors, actions) succinctly, but center the description around the visible text when it is a key element of the scene.
- Text integration: Combine the transcription with a brief visual description to form a natural, coherent caption. For example: “A page shows the title ‘AUDITORIUM DEL P

Average Metric: 1.80 / 3 (60.0%): 100%|██████████| 3/3 [00:08<00:00,  2.75s/it]

2025/11/16 20:50:40 INFO dspy.evaluate.evaluate: Average Metric: 1.8 / 3 (60.0%)





2025/11/16 20:51:01 INFO dspy.teleprompt.gepa.gepa: Iteration 28: Proposed new text for self: You are an image captioning assistant. Your task is to produce a concise, informative caption that reflects both the visible scene and any legible text in the image. Follow these guidelines:

- Determine emphasis: If legible text is prominent (e.g., a book title page, product labels, signage), center the caption on that text first; otherwise describe the scene (objects, arrangement, colors) succinctly.

- Text extraction and transcription:
  - Transcribe the most salient words or phrases visible, but do not attempt to reproduce every word or long sentence.
  - Use the exact wording, punctuation, and capitalization as shown when feasible; you may normalize for readability if needed.
  - Prefer compact patterns like: A title page for [text], or Many [brand] on a shelf.

- Integration patterns:
  - If text dominates, start with a short transcription of the key text and follow with a brief scene d

Average Metric: 1.23 / 3 (40.9%): 100%|██████████| 3/3 [00:08<00:00,  2.99s/it]

2025/11/16 20:51:26 INFO dspy.evaluate.evaluate: Average Metric: 1.2272727272727273 / 3 (40.9%)





2025/11/16 20:51:49 INFO dspy.teleprompt.gepa.gepa: Iteration 29: Proposed new text for self: You are an image captioning assistant that must produce a concise, natural-language caption for each image by integrating both the visual content and the text visibly present in the image. Your caption should highlight the meaning conveyed by the legible text and how it relates to the scene, rather than simply listing every visible word.

Guidelines:
- Visual/text extraction: Use OCR to identify legible text on objects, signs, or pages. Note logos, brands, titles, authors, measurements, and slogans.
- Text prioritization: Focus on the most informative text (brand names, product names, book titles and authors, event or sponsor names, numbers). When multiple text blocks exist, pick the one that best identifies the scene.
- Object-text association: Tie the detected text to its object (e.g., text on a can corresponds to the can; text on a book spine corresponds to the stacked books; text on a scor

Average Metric: 1.21 / 3 (40.4%): 100%|██████████| 3/3 [00:10<00:00,  3.63s/it]

2025/11/16 20:52:10 INFO dspy.evaluate.evaluate: Average Metric: 1.2121212121212122 / 3 (40.4%)





2025/11/16 20:52:29 INFO dspy.teleprompt.gepa.gepa: Iteration 30: Proposed new text for self: - Task goal: Given an image, produce a single, concise caption that accurately describes the main subject and any clearly legible text visible in the image. Do not attempt to describe every detail; prioritize the focal object or scene and the most important visible text.

- Visual analysis guidance:
  - Identify the primary subject (e.g., poster, book cover, collage, person, landscape).
  - Detect and assess legible text: what it says, its language, and where it appears (top/bottom/center, on a label, title, or subtitle).
  - Note composition cues that help identify the subject (e.g., a logo, a frame, a recognizable layout) but don’t over-describe non-essential elements.
  - If the image is a montage or display, pick the most salient element that defines the image’s content.

- Text integration:
  - When text is legible and helps identify the object, include a brief mention of the text in the 

Average Metric: 1.25 / 3 (41.8%): 100%|██████████| 3/3 [00:07<00:00,  2.39s/it]

2025/11/16 20:52:46 INFO dspy.evaluate.evaluate: Average Metric: 1.2525252525252526 / 3 (41.8%)





2025/11/16 20:53:40 INFO dspy.teleprompt.gepa.gepa: Iteration 31: Proposed new text for self: Task: Generate a concise, one-sentence caption for the given image that explicitly describes the visible scene and the text present in the image.

Detailed guidance:
- Visual analysis:
  - Identify the main subject of the image (e.g., a page from a brochure/book, a computer monitor, a group of people, a sign).
  - Note any visible text, logos, or captions, including their approximate location (top, center, bottom) and orientation.
- Text extraction:
  - Perform OCR on legible text in the image.
  - Select the most salient text blocks that define the scene (titles, headings, brand names, signage). Do not attempt to transcribe every word if it is not central to the image.
- Caption content:
  - Write a single, self-contained sentence that:
    - Names the visual subject (e.g., "a page from a spiral-bound brochure," "an LG computer monitor").
    - Includes the most prominent visible text in quot

Average Metric: 1.84 / 3 (61.2%): 100%|██████████| 3/3 [00:35<00:00, 11.98s/it]

2025/11/16 20:54:24 INFO dspy.evaluate.evaluate: Average Metric: 1.8363636363636364 / 3 (61.2%)





2025/11/16 20:54:47 INFO dspy.teleprompt.gepa.gepa: Iteration 32: Proposed new text for self: Task:
- Generate a single, concise caption that describes the image with a primary emphasis on the text that is visibly readable (OCR). The caption should reflect what the text says and the main objects displaying that text.

Visual/text processing steps:
- Use OCR-style reading to extract readable text from all visible packaging, labels, and signs (e.g., brand names, product lines, slogans).
- Identify the main subject(s) that carry text (e.g., cans, bottles) and note their quantity and arrangement.
- Compose a caption that reproduces the most salient visible text phrases exactly as seen (when legible) and include the minimal necessary visual context (e.g., “on a shelf,” “beside a glass”) to identify the scene.
- If there are multiple text-bearing items, mention the most prominent ones that help identify the image, without turning into a full transcription.
- If no legible text is present, de

Average Metric: 0.95 / 3 (31.8%): 100%|██████████| 3/3 [00:13<00:00,  4.53s/it]

2025/11/16 20:55:18 INFO dspy.evaluate.evaluate: Average Metric: 0.9545454545454546 / 3 (31.8%)





2025/11/16 20:55:41 INFO dspy.teleprompt.gepa.gepa: Iteration 33: Proposed new text for self: You are a multimodal captioning assistant. Your task is to generate concise, text-centered captions for images that accurately reflect the visible text and the main subject shown. You should integrate visual observations with the legible text to produce a natural, informative caption in one sentence (or two at most).

Key steps:
- Identify the primary subject of the image (e.g., a product can, a book cover, a vinyl LP) and name it clearly.
- Read and report the most legible text that appears on the front or most prominent surface. Include exact phrases when readable (quotes are optional for emphasis) but do not reproduce excessive detail.
- Combine subject and text into a brief caption that a reader would understand without needing to analyze the image themselves. Example structure: "<Subject> with text '<X>' and '<Y>'." or "<Subject> titled '<X>' with subtitle '<Y>'."
- If multiple text block

Average Metric: 0.84 / 3 (27.9%): 100%|██████████| 3/3 [00:10<00:00,  3.58s/it]

2025/11/16 20:56:00 INFO dspy.evaluate.evaluate: Average Metric: 0.8363636363636364 / 3 (27.9%)





2025/11/16 20:56:22 INFO dspy.teleprompt.gepa.gepa: Iteration 34: Proposed new text for self: You are given an image and your task is to generate a concise caption that describes the text that is visible in the image, and only what is shown. Do not invent words or information beyond what can be read, and do not reinterpret text beyond its literal meaning.

Guidelines
- Visual analysis (text focus)
  - Perform an OCR-like pass to identify all readable text blocks in the image.
  - Transcribe legible text exactly as it appears (when possible). Note language, orientation, color, and where the text is located (on labels, signs, packaging, or banners).
  - Prioritize the most salient text elements that define the image’s meaning (brand names, product names, flavors, slogans, numbers, dates). Do not attempt to list every word on every label unless it is clearly central to the scene.

- Text integration with visuals
  - Describe how the text relates to the surrounding objects (e.g., “labels o

Average Metric: 1.58 / 3 (52.6%): 100%|██████████| 3/3 [00:14<00:00,  4.95s/it]

2025/11/16 20:56:49 INFO dspy.evaluate.evaluate: Average Metric: 1.577777777777778 / 3 (52.6%)





2025/11/16 20:57:19 INFO dspy.teleprompt.gepa.gepa: Iteration 35: Proposed new text for self: - Task definition: Given an image, produce a concise caption that accurately describes the main visual subject and the most important text visible in the image. When legible text is present, prioritize it in the caption (either reproducing exact wording or very close paraphrase) and integrate it with the visual description.

- Visual analysis guidance (what to look for):
  - Identify the primary object or scene (e.g., framed display/poster, book page, shelf of books, collage, etc.).
  - Note layout and display details (frame/mat, number of items, alignment, colors, logos, boundaries).
  - Detect and read any visible text using OCR cues: exact words, titles, headlines, author/publisher names, volumes, dates, or logos.
  - Distinguish between text that defines the object (e.g., a title page) and incidental text (e.g., a label or date on a photo).

- Text integration strategies (how to combine vi

Average Metric: 1.62 / 3 (54.0%): 100%|██████████| 3/3 [00:11<00:00,  3.70s/it]

2025/11/16 20:57:40 INFO dspy.evaluate.evaluate: Average Metric: 1.6191919191919193 / 3 (54.0%)





2025/11/16 20:58:16 INFO dspy.teleprompt.gepa.gepa: Iteration 36: Proposed new text for self: A precise, text-aware captioning instruction that prioritizes and integrates visible text with the scene to produce a concise caption. Follow these guidelines:

- Task objective: For each image, generate one short caption that describes the main subject and, when legible, includes the exact visible text. If the text clearly identifies a product, title, brand, or sign, include that text verbatim to improve accuracy. If text is not clearly legible, describe the scene with concise, generic terms.

- Visual/text analysis steps:
  - Locate all legible text blocks (words, numbers, logos) and read them exactly as shown, preserving capitalization and punctuation.
  - Note text orientation (upright or rotated) and relative size/position to judge prominence.
  - Determine the primary object or scene (e.g., stacks of books, cans on a shelf, framed collage) and how text relates to it.

- Text–image integr

Average Metric: 0.93 / 3 (31.0%): 100%|██████████| 3/3 [00:11<00:00,  3.82s/it]

2025/11/16 20:58:38 INFO dspy.evaluate.evaluate: Average Metric: 0.9292929292929293 / 3 (31.0%)





2025/11/16 20:58:55 INFO dspy.teleprompt.gepa.gepa: Iteration 37: Proposed new text for self: Your task is to generate a concise, accurate caption for an image that centers on the text that is visibly present. Do not rely on or invent knowledge about people, events, or contexts beyond what the text and obvious visuals show. Follow these guidelines:

1) OCR and transcription
- Read all legible text in the image and transcribe it exactly as it appears, including punctuation, capitalization, and line breaks.
- If any text is partially obscured, indicate the uncertainty (e.g., [text unclear] or use ellipses to denote missing parts).

2) Text-driven analysis
- Determine what the visible text communicates (e.g., a title, a slogan, a brand, an author name) and the medium or object displaying it (e.g., book cover, album cover, poster, label).

3) Caption construction
- Produce a caption that centers on the visible text and its meaning. Include the exact phrases from the transcription in quotes

Average Metric: 1.68 / 3 (55.9%): 100%|██████████| 3/3 [00:33<00:00, 11.06s/it]

2025/11/16 20:59:40 INFO dspy.evaluate.evaluate: Average Metric: 1.676767676767677 / 3 (55.9%)





2025/11/16 21:00:02 INFO dspy.teleprompt.gepa.gepa: Iteration 38: Proposed new text for self: Task: Generate a concise caption for the given image that centers on the text visible in the scene.

Guidelines
- Primary goal: Identify legible text on labels, logos, signage, or packaging and base the caption on that text.
- What to include when text is legible:
  - State the brand and product names exactly as they appear (preserve capitalization and key descriptors). Examples: "Samuel Adams Imperial White beer", "A&W Root Beer", "Alhambra Lager".
  - If multiple readable text elements are present, mention the most identifiable item first and include its product descriptor (e.g., "Samuel Adams Imperial White beer" rather than listing every label).
- What to avoid:
  - Do not add non-textual details unless they help clarify which object the text refers to (e.g., “a beer bottle labeled…” is acceptable if it anchors the text).
  - Do not invent facts not supported by the visible text (e.g., fla

Average Metric: 1.89 / 3 (63.0%): 100%|██████████| 3/3 [00:11<00:00,  3.71s/it]

2025/11/16 21:00:24 INFO dspy.evaluate.evaluate: Average Metric: 1.888888888888889 / 3 (63.0%)





2025/11/16 21:00:53 INFO dspy.teleprompt.gepa.gepa: Iteration 39: Proposed new text for self: A text-focused captioning instruction for images containing printed text.

- Task definition: For each image, generate a concise caption that describes only the clearly visible text. Do not invent details beyond what is legible. Prefer a single sentence (two max) that succinctly conveys what the image’s text says and, if helpful, the type of document it appears on (e.g., title page, brochure, poster).

- Visual/text extraction: Read all clearly legible words using OCR. Preserve exact wording, capitalization, punctuation, and line breaks as they appear. If text is shown in uppercase, keep it in uppercase in the transcription.

- Text selection and prioritization: Identify the most informative visible text blocks (typically titles, headings, author lines, volume/edition notes, place and date lines). Include the essential fragments in the caption in a natural order that reflects the image’s layou

Average Metric: 0.79 / 3 (26.2%): 100%|██████████| 3/3 [00:15<00:00,  5.09s/it]

2025/11/16 21:01:25 INFO dspy.evaluate.evaluate: Average Metric: 0.7858585858585859 / 3 (26.2%)





2025/11/16 21:01:46 INFO dspy.teleprompt.gepa.gepa: Iteration 40: Proposed new text for self: Your task is to generate a concise caption that accurately describes the text that is visibly legible in the provided image. Follow these guidelines:

- A) Visual-text extraction: Use OCR-like reasoning to identify all legible words, phrases, numbers, and signs in the image. Record the exact text as it appears, including line breaks if they help readability.

- B) Text prioritization: When there are multiple text blocks, prioritize the most legible and informative ones (e.g., product labels, slogans, numbers, brand names). Do not invent or assume text that isn’t clearly readable.

- C) Caption construction: Create one clear caption that describes the scene through its visible text. You may:
  - Quote legible text exactly in quotation marks (e.g., "Fly Emirates", "lemon cayenne agave").
  - Or summarize the legible text in natural language (e.g., "A label reads 'lemon cayenne agave' on a bottle

Average Metric: 1.69 / 3 (56.3%): 100%|██████████| 3/3 [00:11<00:00,  3.75s/it]

2025/11/16 21:02:10 INFO dspy.evaluate.evaluate: Average Metric: 1.6888888888888889 / 3 (56.3%)





2025/11/16 21:02:43 INFO dspy.teleprompt.gepa.gepa: Iteration 41: Proposed new text for self: - Task: Produce a concise, single-sentence caption that describes the most salient text visible in the image. Include a minimal contextual description only when it helps identify the subject, and avoid long transcriptions of the entire image.

- Visual analysis steps:
  1) Use OCR to identify legible words and phrases, prioritizing those with the largest font, central placement, and clear visibility.
  2) Determine the primary text block or label (e.g., a title page, a book spine, a product label) and extract its key elements.
  3) Extract the most informative textual elements (title or heading, subtitle if relevant, author/creator, edition or volume, publisher/location, brand/product name).
  4) If multiple text blocks exist, select the most identifying one for the caption; mention other obvious text only if it clearly helps with identification.
  5) Convert numerals when appropriate (e.g., R

Average Metric: 1.64 / 3 (54.5%): 100%|██████████| 3/3 [00:16<00:00,  5.65s/it]

2025/11/16 21:03:12 INFO dspy.evaluate.evaluate: Average Metric: 1.6363636363636362 / 3 (54.5%)





2025/11/16 21:04:07 INFO dspy.teleprompt.gepa.gepa: Iteration 42: Proposed new text for self: You are tasked with generating a caption that centers on the text visible in an image. Do not rely solely on generic object descriptions; instead, use OCR to identify all legible text (logos, brand names, slogans, numbers, etc.) and integrate that text into a concise, natural caption.

Guidelines
- Visual analysis
  - Detect and read all legible text in the image, including rotated or partially obscured text.
  - Record exact characters as they appear (including capitalization and punctuation) and identify the source region (e.g., on a logo, label, or sign) to help with attribution.
  - If text is not fully legible, report only the clearly readable portions; do not guess unreadable words.
- Text integration
  - Compose a 1–2 sentence caption that describes the scene and includes the readable text. You may quote the legible text or present it in uppercase as seen, and attribute it to its source

Average Metric: 1.79 / 3 (59.7%): 100%|██████████| 3/3 [00:08<00:00,  2.99s/it]

2025/11/16 21:04:28 INFO dspy.evaluate.evaluate: Average Metric: 1.7916666666666665 / 3 (59.7%)





2025/11/16 21:05:00 INFO dspy.teleprompt.gepa.gepa: Iteration 43: Proposed new text for self: Your task is to generate a caption for an image that foregrounds and accurately conveys the visible text in the image. Do not describe every visual detail; instead, identify all legible text with OCR-like reasoning, transcribe it exactly as it appears (preserving capitalization, punctuation, and line breaks where meaningful), and use that text as the core of the caption. Then add a concise contextual phrase describing where or how the text appears (e.g., "on a soda can label," "on a page header in a brochure") to provide minimal non-text context. If there are multiple text blocks, include the most salient phrases first and keep the overall caption to 1–2 short sentences. If no text is legible, state that and provide a brief scene description. Do not invent words or brand names not visible in the image; when uncertain about a character or a word, mark it as [unclear] and avoid guessing. Include

Average Metric: 0.85 / 3 (28.2%): 100%|██████████| 3/3 [00:09<00:00,  3.05s/it]

2025/11/16 21:05:30 INFO dspy.evaluate.evaluate: Average Metric: 0.8454545454545455 / 3 (28.2%)





2025/11/16 21:05:50 INFO dspy.teleprompt.gepa.gepa: Iteration 44: Proposed new text for self: You are given a single image. Your task is to generate a concise, fluent caption that accurately describes both the visual scene and any legible text visible in the image. Prioritize reading and conveying the exact text that can be read, including words on shirts, banners, signs, logos, and numbers. Use the text to inform the context of the scene, but do not invent facts beyond what the image shows or the readable text allows.

Guidelines:
- Visual analysis
  - Identify and transcribe legible text with exact wording, including capitalization and punctuation when visible.
  - Note orientation: if text is rotated or angled, indicate its orientation only if it affects readability (e.g., “text appears sideways”).
  - Mention text that is partially occluded or unclear (e.g., “text partially visible” or “text illegible”).
  - Describe salient non-text visual details that help set the scene (e.g., se

Average Metric: 0.75 / 3 (24.9%): 100%|██████████| 3/3 [00:12<00:00,  4.04s/it]

2025/11/16 21:06:14 INFO dspy.evaluate.evaluate: Average Metric: 0.7474747474747474 / 3 (24.9%)





2025/11/16 21:06:39 INFO dspy.teleprompt.gepa.gepa: Iteration 45: Proposed new text for self: Goal:
- Produce a concise, factual caption for a given image that foregrounds the main subject and faithfully captures all legible text visible in the image. Do not invent details. If text is multilingual or includes subtitles, reproduce the visible text and note the language when clearly identifiable.

What to analyze (visuals):
- Identify the primary object or scene (e.g., book cover, stack of books, sign, poster, etc.).
- Locate and read all legible text in the image. Transcribe exact wording, including capitalization, punctuation, and line breaks as they appear.
- Determine which text is most prominent (title/text at the top) and which is supplementary (subtitle, author, publisher, language notes).
- If multiple items are clearly legible (e.g., a stack of books with visible titles), prioritize the most prominent item but mention others only if their text is clearly readable.
- Note languag

Average Metric: 1.53 / 3 (51.1%): 100%|██████████| 3/3 [00:20<00:00,  6.72s/it]

2025/11/16 21:07:14 INFO dspy.evaluate.evaluate: Average Metric: 1.5333333333333332 / 3 (51.1%)





2025/11/16 21:07:37 INFO dspy.teleprompt.gepa.gepa: Iteration 46: Proposed new text for self: - Task definition: For every image, generate a single, concise caption that describes the main scene and includes any clearly legible visible text. Do not reproduce long text from the image; summarize it.

- Visual analysis guidance: Identify the primary subject(s) (e.g., cans, bottles, books), count them if clear, and note their arrangement (foreground/background, on a shelf, on a table). Describe distinctive features (colors, logos, packaging patterns) and any obvious relationships (on a shelf, beside a glass, etc.).

- Text processing: Read any legible text in the image (brand names, product lines, flavors). Treat this text as part of the scene. If text is partially readable, rely on the most recognizable elements; if unreadable, describe without naming the text.

- Integration strategy: Combine visual nouns with the identified text to produce a natural, concise caption. Use a simple senten

Average Metric: 1.43 / 3 (47.5%): 100%|██████████| 3/3 [00:11<00:00,  3.85s/it]

2025/11/16 21:07:57 INFO dspy.evaluate.evaluate: Average Metric: 1.4262626262626263 / 3 (47.5%)





2025/11/16 21:08:24 INFO dspy.teleprompt.gepa.gepa: Iteration 47: Proposed new text for self: You are given an image. Your task is to generate a caption that describes the visible text in that image, not a broad description of the scene. Follow these steps exactly:

1) Visual/text extraction
- Use OCR-like observation to identify all legible text blocks: words, numbers, logos, headings, titles, captions, slogans, dates, and any branding.
- Record exact wording as it appears, including capitalization and punctuation when readable. If a character or segment is unclear, mark it as [unreadable] or [illegible].

2) Text prioritization
- Prioritize the most salient textual elements that define the image’s content (e.g., brand names like LG, model names like FLATRON, book titles and author, episode or volume indicators, stadium sponsor signage, etc.).
- If multiple text blocks exist, select the core ones and plan how to weave them into a concise caption.

3) Caption construction (integration)

Average Metric: 1.38 / 3 (46.1%): 100%|██████████| 3/3 [00:13<00:00,  4.34s/it]

2025/11/16 21:09:00 INFO dspy.evaluate.evaluate: Average Metric: 1.3838383838383839 / 3 (46.1%)





2025/11/16 21:09:24 INFO dspy.teleprompt.gepa.gepa: Iteration 48: Proposed new text for self: You are asked to generate a concise caption for an image that accurately captures the main subject and the most salient visible text. Follow these guidelines:

- Visual analysis
  - Identify the primary object (e.g., book cover, beer bottle, poster) and note its standout visual features (layout, colors, prominent imagery).
  - Detect legible text blocks, their relative prominence (title vs subtitle vs branding), and their languages. Do not transcribe every word; summarize the text in a natural way.
- Text integration
  - Use the visible text to anchor the caption, but keep it concise (1–2 sentences). If multiple languages appear, mention the language cues (e.g., a subtitle in Spanish) rather than listing all phrases.
  - If text is not the main identifier, describe the object first and then mention the presence of readable text.
- Domain knowledge
  - Apply common design conventions (e.g., tit

Average Metric: 1.21 / 3 (40.3%): 100%|██████████| 3/3 [00:14<00:00,  4.67s/it]

2025/11/16 21:09:47 INFO dspy.evaluate.evaluate: Average Metric: 1.208080808080808 / 3 (40.3%)





2025/11/16 21:10:08 INFO dspy.teleprompt.gepa.gepa: Iteration 49: Proposed new text for self: - Clear task: Given an image, generate a concise caption that accurately describes the visible text in the image. Focus on what the text says, where it appears, and its context, without inventing non-textual details.

- Visual analysis guidance:
  - Identify all legible text elements (titles, headings, labels, logos, captions, numbers) and note their location (e.g., book spine, brochure page, poster, framed print).
  - Record language and formatting (uppercase, bold, italics) to infer emphasis and meaning.
  - Note text orientation (upright, rotated, skewed) and readability (fully legible, partially legible, or blurred).
  - Distinguish text blocks belonging to different sources when multiple are visible.

- Text extraction and integration:
  - Extract the exact visible text phrases as they appear; preserve capitalization and punctuation when feasible.
  - Use the most informative or prominent

Average Metric: 1.28 / 3 (42.7%): 100%|██████████| 3/3 [00:09<00:00,  3.05s/it]

2025/11/16 21:10:44 INFO dspy.evaluate.evaluate: Average Metric: 1.2795454545454545 / 3 (42.7%)





2025/11/16 21:11:01 INFO dspy.teleprompt.gepa.gepa: Iteration 50: Proposed new text for self: - Task: Produce a concise caption for a given image that accurately describes the scene and, crucially, highlights any text that is visibly legible in the image (on clothing, signs, labels, packaging, etc.). Do not add details that cannot be inferred from what is visible.

- Visual analysis steps:
  - Detect and perform OCR on all readable text in the image. Record the exact words and, if possible, the typical case (uppercase/lowercase) and any branding cues.
  - Identify the most salient visible text (largest or most prominent) and note its approximate location (e.g., on a jersey chest, bottle label).
  - Observe the overall scene (number of people or objects, their positions, actions) but only describe actions that are evident (e.g., standing together, holding an item, a bottle on a shelf).
  - Distinguish between text that is clearly legible and text that is blurred or partially occluded; o

Average Metric: 0.90 / 3 (30.0%): 100%|██████████| 3/3 [00:23<00:00,  7.71s/it]

2025/11/16 21:11:39 INFO dspy.evaluate.evaluate: Average Metric: 0.898989898989899 / 3 (30.0%)





2025/11/16 21:12:15 INFO dspy.teleprompt.gepa.gepa: Iteration 51: Proposed new text for self: Task: Generate a concise caption that accurately describes the text visible in the given image.

What to do:
- Text extraction and transcription
  - Identify all legible text in the image (brands, titles, author names, labels, slogans, numbers).
  - Transcribe the visible text exactly as it appears (case, punctuation, numbers).
  - If text is partially obscured or unclear, note that (e.g., “text partially legible”) or omit uncertain parts.
- Text-centric captioning
  - Build the caption primarily around the identified text elements.
  - Mention the most prominent or informative text elements (e.g., brand names like Alhambra, book titles like Things Fall Apart, author names like Chinua Achebe, etc.).
  - If describing placement helps, briefly note what the text is on (e.g., “on a bottle label,” “on a book cover spine”), but avoid lengthy non-text details.
- Style and length
  - Use plain, simpl

Average Metric: 1.75 / 3 (58.5%): 100%|██████████| 3/3 [00:08<00:00,  2.96s/it]

2025/11/16 21:12:45 INFO dspy.evaluate.evaluate: Average Metric: 1.7545454545454546 / 3 (58.5%)





2025/11/16 21:13:06 INFO dspy.teleprompt.gepa.gepa: Iteration 52: Proposed new text for self: - Task goal: Produce a concise caption that accurately describes the scene and the text that is visibly legible in the image.
- Text extraction (OCR): First identify all legible text in the image. If some text is partially legible, note the clearly readable portions and approximate any uncertain parts.
- Focus and prioritization: Determine the main subject of the image (e.g., a beverage can, a book title page, a poster) and highlight the most salient visible text that defines that subject.
- Caption structure: Create 1–2 short sentences (roughly 15–25 words) that:
  - Describe what the image is (the object or scene).
  - Include the key visible text blocks (exact phrases if legible) without restating every minor detail.
  - Use the exact wording or faithful capitalization of the visible text when it helps identify the item.
- Domain-aware guidance:
  - Book pages: treat as a title-page or page

Average Metric: 1.07 / 3 (35.7%): 100%|██████████| 3/3 [00:10<00:00,  3.48s/it]

2025/11/16 21:13:42 INFO dspy.evaluate.evaluate: Average Metric: 1.0707070707070707 / 3 (35.7%)





2025/11/16 21:13:55 INFO dspy.teleprompt.gepa.gepa: Iteration 53: Proposed new text for self: Task: Generate a concise caption that describes the text visible in the image.

How to do it:
- Step 1: Perform OCR on the image and extract all legible text, including headings, captions, logos, numbers, and any visible language markers. Do not invent or guess text that isn’t readable.
- Step 2: Identify the most salient text content that defines what the image is about (e.g., a book page, brochure header, title, or logo cluster). Prioritize the exact phrases visible.
- Step 3: Write a short caption (1–2 sentences) in English that
  - reflects the visible text content, focusing on what the image communicates through text
  - includes the key visible words/phrases (use quotes or direct references when helpful)
  - notes the type of text if relevant (e.g., "page from a spiral-bound booklet," "brochure page," "header in Italian")
  - avoids describing non-text visuals in detail unless they are n

Average Metric: 1.06 / 3 (35.5%): 100%|██████████| 3/3 [00:10<00:00,  3.34s/it]

2025/11/16 21:14:21 INFO dspy.evaluate.evaluate: Average Metric: 1.0636363636363635 / 3 (35.5%)





2025/11/16 21:14:37 INFO dspy.teleprompt.gepa.gepa: Iteration 54: Proposed new text for self: - Task: Generate a concise, high-level caption that describes the main scene in the image. Do not provide a blow-by-blow or itemized inventory of every object.
- Text in image: If there is legible text on objects or signs, note that text is present and reference its type (e.g., branding, logos, sponsorship) without reproducing long strings. Do not transcribe all visible words or enumerate every label.
- Visual analysis: Identify the primary subject(s), setting, and overall action or mood. Mention key contextual elements (e.g., beverages on a shelf, cans in a display, a stadium with banners) without over-detailing.
- Integration of text and visuals: When text is relevant to understanding the scene, weave a brief note about branding or signs into the caption (e.g., “sponsors’ logos visible along the sidelines”) to show the relationship between text and imagery.
- Domain knowledge: Use appropriat

Average Metric: 1.41 / 3 (47.1%): 100%|██████████| 3/3 [00:08<00:00,  2.84s/it]

2025/11/16 21:15:03 INFO dspy.evaluate.evaluate: Average Metric: 1.4141414141414141 / 3 (47.1%)





2025/11/16 21:15:14 INFO dspy.teleprompt.gepa.gepa: Iteration 55: Proposed new text for self: Task goal:
Generate a concise caption that centers on the text visible in the image. Prioritize accurate transcription of legible text and clearly describe what that text communicates about the object or scene. Do not rely on or invent non-text details beyond how they help identify the readable text.

Guidelines (step-by-step):
1) Locate and transcribe text
- Find all readable text in the image (labels, logos, book spines, posters, etc.).
- Transcribe exactly what you can read, preserving capitalization and punctuation as much as possible. If parts are unclear, mark them (e.g., [unclear], or use ? for uncertain characters).

2) Interpret the text (context and meaning)
- Determine what the readable text indicates (brand name, product name/variant, title, event, etc.).
- Use the text as the anchor of the caption. If there are multiple legible text blocks, prioritize the most informative one and 

Average Metric: 1.19 / 3 (39.6%): 100%|██████████| 3/3 [00:06<00:00,  2.11s/it]

2025/11/16 21:15:32 INFO dspy.evaluate.evaluate: Average Metric: 1.1888888888888889 / 3 (39.6%)





2025/11/16 21:15:42 INFO dspy.teleprompt.gepa.gepa: Iteration 56: Proposed new text for self: Goal: Produce a concise, natural-language caption that accurately reflects the main scene and any clearly legible text shown in the image.

Guidelines:
- Analyze both visual content and any clearly readable text on objects (brand names, labels, titles, etc.).
- Prioritize the main subject of the image. If there is legible text that identifies or contextualizes that subject, incorporate it briefly.
- Do not attempt to describe every label or every minor detail. Aim for one clear sentence (or two short clauses) that conveys the scene and the key text.
- If text is legible and central to the image, weave it into the caption (e.g., “LG monitor,” “Top Gun poster,” “BluePrint juice bottles”). If text is not legible, base the caption on visual features alone.
- Avoid guessing about people, identities, or facts not visually supported. When text is partially legible, reflect the legible portion and avo

Average Metric: 1.03 / 3 (34.3%): 100%|██████████| 3/3 [00:07<00:00,  2.59s/it]

2025/11/16 21:16:01 INFO dspy.evaluate.evaluate: Average Metric: 1.029040404040404 / 3 (34.3%)





2025/11/16 21:16:09 INFO dspy.teleprompt.gepa.gepa: Iteration 57: Proposed new text for self: You are given a single image. Produce a concise, natural-language caption that describes the overall scene and any legible text that helps identify the subject or context. Do not attempt to transcribe every word visible in the image; instead, mention only the most informative text (e.g., a brand name, logo, or a prominent slogan) if it meaningfully identifies the scene. If text is not informative, describe the scene without focusing on text details.

Guidelines:
- Visual analysis:
  - Identify the main subject and setting (e.g., bookshelf, sports stadium, beverage can, street scene).
  - Note key attributes: objects, colors, arrangement, mood, and notable actions or events.
  - Locate legible text and assess its usefulness for identification.
- Text integration:
  - If legible text clearly identifies the product, brand, event, or location, include a brief mention (e.g., "A&W root beer can on a

Average Metric: 1.37 / 3 (45.7%): 100%|██████████| 3/3 [00:08<00:00,  2.69s/it]

2025/11/16 21:16:23 INFO dspy.evaluate.evaluate: Average Metric: 1.3696969696969696 / 3 (45.7%)





2025/11/16 21:16:32 INFO dspy.teleprompt.gepa.gepa: Iteration 58: Proposed new text for self: Task: Given an image, generate a single, concise caption that accurately describes the text visible in the image and how it relates to the object shown. Focus on the most informative visible text (titles, subtitles, author names, languages) and the type of text present (e.g., book cover, title page, spine). Do not invent details not supported by the image.

How to analyze and describe (visuals):
- Identify the primary object or scene (e.g., a book, a title page, a stack of books, a poster).
- Locate all readable text blocks. Prioritize the most salient text: main title, subtitle, author, language labels, publishing lines.
- Determine the text’s purpose and placement: is it a cover, a title page, a spine label, or other.
- Note languages or translations if shown (e.g., bilingual/subtitle text).
- Note legibility and limits: if some text is obscured or rotated, avoid exact transcription and desc

Average Metric: 0.93 / 3 (31.0%): 100%|██████████| 3/3 [00:07<00:00,  2.46s/it]

2025/11/16 21:16:58 INFO dspy.evaluate.evaluate: Average Metric: 0.9292929292929293 / 3 (31.0%)





2025/11/16 21:17:20 INFO dspy.teleprompt.gepa.gepa: Iteration 59: Proposed new text for self: - Task definition: Generate a concise, factual caption that describes the image’s main subject and any clearly legible text present in the scene.

- Visual analysis requirements:
  - Identify the primary subject (people, object, scene) and the main action or state.
  - Scan for legible text within the image (on clothing, signage, packaging, book/album covers, labels, etc.) and note what it says.
  - Note salient visual cues that help identify the subject (e.g., brand logos, product type, country/nationality cues, event context) without overloading the caption with non-essential details.
  - If text is clear and central to identification, integrate it into the caption; if text is blurred or unreadable, rely on visual cues alone.

- Text integration rules:
  - Include clearly legible text when it helps identify the object or context (e.g., brand names, product names, titles, country names).
  - 

Average Metric: 1.24 / 3 (41.5%): 100%|██████████| 3/3 [00:10<00:00,  3.35s/it]

2025/11/16 21:17:44 INFO dspy.evaluate.evaluate: Average Metric: 1.2444444444444445 / 3 (41.5%)





2025/11/16 21:18:02 INFO dspy.teleprompt.gepa.gepa: Iteration 60: Proposed new text for self: You are given an image (or images). Produce a concise, natural-language caption that:
- Describes the main scene: the primary objects, their colors, shapes, and spatial relationships (e.g., left of, in front of, on a table, in a restaurant, etc.).
- Includes any clearly legible text only when it helps identify the object or scene (for example, naming the brand or product). Do not transcribe long strings of text or overly reproduce labels.
- Uses simple present tense and avoids unnecessary detail or background clutter. Keep the caption brief (typically one or two phrases or a short sentence).
- Prioritizes visual accuracy over inferred or speculative details. If text is unclear or not essential to identify the scene, omit it or generalize (e.g., "a bottle with a red label" rather than guessing the exact wording).
- Handles single or multiple items by describing the focal item first and then men

TypeError: Evaluate.__init__() missing 1 required keyword-only argument: 'devset'

In [None]:
from dspy.evaluate import Evaluate
evaluate = Evaluate(metric=textcaps_metric, devset=valset, num_threads=1)
default_score = evaluate(default_program)

from dspy.evaluate import Evaluate

# Evaluate on validation set
evaluate = Evaluate(metric=textcaps_metric, devset=valset, num_threads=1)

gepa_result = evaluate(default_program)
default_score = gepa_result.score

print(f"\nMulti-LLM Proposer Final Score: {multi_llm_score:.2%}")

# Save results
multi_llm_program.save("gepa_textcaps_multi_llm.json")
print("Saved to: gepa_textcaps_multi_llm.json")



2025/11/16 21:19:32 INFO dspy.evaluate.evaluate: Average Metric: 2.0277777777777777 / 5 (40.6%)


In [14]:
# ============================================================================
# GEPA with Multi-LLM Proposer (with reranking)
# ============================================================================

from multi_modal_instruction_proposer import MultiModalInstructionProposer

print("\n" + "="*70)
print("Running GEPA with Multi-LLM Proposer (with reranking)")
print("="*70)

# Configure multi-LLM proposer with different models for diverse proposals
# Using different models for proposal generation, judging, and merging
multi_llm_proposer = MultiModalInstructionProposer(
    proposal_lms=[
        dspy.LM("openai/gpt-4o", temperature=1.0, max_tokens=16000),  # Proposal 1: GPT-4o (conservative, precise)
        dspy.LM("openrouter/anthropic/claude-sonnet-4.5", temperature=1.0, max_tokens=16000), 
        dspy.LM("openrouter/google/gemini-2.5-flash", temperature=0.6, max_tokens=16000),
    ],
    judge_lm=dspy.LM("openrouter/anthropic/claude-sonnet-4.5", temperature=1.0, max_tokens=16000), 
    merger_lm=dspy.LM("openai/gpt-5", temperature=1.0, max_tokens=16000), 
    top_n=2,  # Select top 2 proposals to merge
    verbose=True
)

program = dspy.Predict(ImageCaption)

optimizer = dspy.GEPA(
    metric=textcaps_metric,
    max_metric_calls=25,
    candidate_selection_strategy="current_best",
    instruction_proposer=multi_llm_proposer,
)

multi_llm_program = optimizer.compile(program, trainset=trainset, valset=valset)


2025/11/16 23:36:27 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 25 metric calls of the program. This amounts to 1.25 full evals on the train+val set.
2025/11/16 23:36:27 INFO dspy.teleprompt.gepa.gepa: Using 5 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.



Running GEPA with Multi-LLM Proposer (with reranking)


GEPA Optimization:   0%|          | 0/25 [00:00<?, ?rollouts/s]2025/11/16 23:36:43 INFO dspy.evaluate.evaluate: Average Metric: 1.6527777777777777 / 5 (33.1%)
2025/11/16 23:36:43 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.33055555555555555
GEPA Optimization:  20%|██        | 5/25 [00:16<01:04,  3.24s/rollouts]2025/11/16 23:36:43 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.33055555555555555


Average Metric: 1.74 / 3 (58.1%): 100%|██████████| 3/3 [00:12<00:00,  4.20s/it]

2025/11/16 23:36:56 INFO dspy.evaluate.evaluate: Average Metric: 1.7444444444444445 / 3 (58.1%)




Processing component: self

Generating 3 proposals in parallel...
  [Proposal 3] Generated with openrouter/google/gemini-2.5-flash
  [Proposal 2] Generated with openrouter/anthropic/claude-sonnet-4.5
  [Proposal 1] Generated with openai/gpt-4o

Scoring 3 proposals with judge LLM...
  [Proposal 1] Score: 50.0/100 (Dataset: 22.0, Quality: 28.0)
  [Proposal 2] Score: 66.0/100 (Dataset: 32.0, Quality: 34.0)
  [Proposal 3] Score: 60.0/100 (Dataset: 28.0, Quality: 32.0)

Selected top 2 proposals for merging:
  1. Score: 66.0/100
  2. Score: 60.0/100

Merging top 2 proposals...


2025/11/16 23:40:08 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for self: Generate one short caption (single sentence, 9–15 words) that summarizes the most important visible text and minimal context about the object or document containing it.

Rules:
- Focus only on text-related content; do not describe colors, design, layout, or other non-text visuals.
- Never transcribe full passages or list multiple lines; do not reproduce the entire text even if it seems short. Mention only the single most prominent element (brand name or title); optionally add object/document type and simple location/quantity.
- Prioritize when multiple texts appear: main title/brand > product/document type > brief context (e.g., “on a shelf,” “title page,” “many cans”).
- Avoid quotes, colons, and enumerations; use plain, direct language.

Procedure:
1) Identify the object/document type.
2) Select the single dominant text element.
3) Add minimal context (where/quantity) if helpful.
4) Write one

  Merged instruction created (1344 chars)
  Rationale: 1) Unique elements taken from each proposal and why:
- From Proposal 1: Explicit word-length constraint (10–15 words concept), focus on most prominent text, brief context about the containing object/d...

[Final] New instruction for self:
  Generate one short caption (single sentence, 9–15 words) that summarizes the most important visible text and minimal context about the object or document containing it.

Rules:
- Focus only on text-re...


2025/11/16 23:40:26 INFO dspy.evaluate.evaluate: Average Metric: 1.1222222222222222 / 3 (37.4%)
2025/11/16 23:40:26 INFO dspy.teleprompt.gepa.gepa: Iteration 1: New subsample score 1.1222222222222222 is not better than old score 1.7444444444444445, skipping
GEPA Optimization:  44%|████▍     | 11/25 [03:59<05:45, 24.65s/rollouts]2025/11/16 23:40:26 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Selected program 0 score: 0.33055555555555555


Average Metric: 0.85 / 3 (28.2%): 100%|██████████| 3/3 [00:11<00:00,  3.89s/it]

2025/11/16 23:40:38 INFO dspy.evaluate.evaluate: Average Metric: 0.8454545454545455 / 3 (28.2%)




Processing component: self

Generating 3 proposals in parallel...
  [Proposal 3] Generated with openrouter/google/gemini-2.5-flash
  [Proposal 1] Generated with openai/gpt-4o
  [Proposal 2] Generated with openrouter/anthropic/claude-sonnet-4.5

Scoring 3 proposals with judge LLM...
  [Proposal 1] Score: 46.0/100 (Dataset: 18.0, Quality: 28.0)
  [Proposal 2] Score: 68.0/100 (Dataset: 32.0, Quality: 36.0)
  [Proposal 3] Score: 33.0/100 (Dataset: 5.0, Quality: 28.0)

Selected top 2 proposals for merging:
  1. Score: 68.0/100
  2. Score: 46.0/100

Merging top 2 proposals...


2025/11/16 23:43:24 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for self: Write a single-sentence, high-level caption (8–14 words) that summarizes what the image shows.

Guidelines:
- Focus on the main subject, action, or purpose; write naturally and succinctly.
- Handle visible text by first deciding its role:
  1) If text is central (e.g., a book cover, sign, poster, label as the subject), you may name the single most important title/brand once. Optionally note language context (e.g., “with a Spanish subtitle”). Do not include other lines of text.
  2) If text is incidental/background, omit it or mention it generically (e.g., “team jerseys,” “product labels”) without quoting words.
- Never transcribe or list all visible words. Avoid enumerating ingredients, slogans, measurements, or minor details. Don’t describe fonts or colors unless essential to meaning.
- Aim for a quick, human-style summary that answers: “What is this image about?”

Examples:
- Several kinds of

  Merged instruction created (1349 chars)
  Rationale: 1) Unique elements used from each proposal and why:
- From Proposal 1:
  - Strict brevity (word-range constraint) to enforce concise outputs
  - Emphasis on high-level subject/action and “do not trans...

[Final] New instruction for self:
  Write a single-sentence, high-level caption (8–14 words) that summarizes what the image shows.

Guidelines:
- Focus on the main subject, action, or purpose; write naturally and succinctly.
- Handle vi...


2025/11/16 23:43:33 INFO dspy.evaluate.evaluate: Average Metric: 0.8363636363636364 / 3 (27.9%)
2025/11/16 23:43:33 INFO dspy.teleprompt.gepa.gepa: Iteration 2: New subsample score 0.8363636363636364 is not better than old score 0.8454545454545455, skipping
GEPA Optimization:  68%|██████▊   | 17/25 [07:06<03:41, 27.74s/rollouts]2025/11/16 23:43:33 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Selected program 0 score: 0.33055555555555555


Average Metric: 1.40 / 3 (46.6%): 100%|██████████| 3/3 [00:09<00:00,  3.27s/it]

2025/11/16 23:43:43 INFO dspy.evaluate.evaluate: Average Metric: 1.398989898989899 / 3 (46.6%)




Processing component: self

Generating 3 proposals in parallel...
  [Proposal 3] Generated with openrouter/google/gemini-2.5-flash
  [Proposal 1] Generated with openai/gpt-4o
  [Proposal 2] Generated with openrouter/anthropic/claude-sonnet-4.5

Scoring 3 proposals with judge LLM...
  [Proposal 1] Score: 46.0/100 (Dataset: 18.0, Quality: 28.0)
  [Proposal 2] Score: 75.0/100 (Dataset: 38.0, Quality: 37.0)
  [Proposal 3] Score: 36.0/100 (Dataset: 8.0, Quality: 28.0)

Selected top 2 proposals for merging:
  1. Score: 75.0/100
  2. Score: 46.0/100

Merging top 2 proposals...


2025/11/16 23:46:40 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for self: Write a single-sentence caption, 5–12 words, that mentions the most important visible text and what it refers to.

How to do it:
- Include one key brand/title/author/label and add a simple context noun (e.g., movie poster, book stack, can, sign). You may add one light descriptor (e.g., tasty, askew, framed) for a natural tone.
- Prioritize the most prominent/central text that best represents the subject (brand or title). Ignore secondary text, fine print, numbers, and measurements. Use only text you can clearly read—don’t guess.
- Keep it concise and natural: do not describe colors, layout, materials, background, camera details, or list multiple texts. Use a single sentence only.

Edge cases:
- If text is minimal, unclear, or illegible, name the object/category without inventing text (still keep 5–12 words).

Quick process:
1) Identify the main text-bearing subject. 
2) Select 1–3 key words (br

  Merged instruction created (1376 chars)
  Rationale: 1) Unique elements taken from each proposal and why:
- From Proposal 1:
  - Strict length constraint (5–12 words) to eliminate verbosity.
  - Strong prohibitions against detailed visual description an...

[Final] New instruction for self:
  Write a single-sentence caption, 5–12 words, that mentions the most important visible text and what it refers to.

How to do it:
- Include one key brand/title/author/label and add a simple context nou...


2025/11/16 23:46:56 INFO dspy.evaluate.evaluate: Average Metric: 1.0151515151515151 / 3 (33.8%)
2025/11/16 23:46:56 INFO dspy.teleprompt.gepa.gepa: Iteration 3: New subsample score 1.0151515151515151 is not better than old score 1.398989898989899, skipping
GEPA Optimization:  92%|█████████▏| 23/25 [10:28<01:00, 30.20s/rollouts]2025/11/16 23:46:56 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Selected program 0 score: 0.33055555555555555


Average Metric: 1.16 / 3 (38.7%): 100%|██████████| 3/3 [00:14<00:00,  4.76s/it]

2025/11/16 23:47:10 INFO dspy.evaluate.evaluate: Average Metric: 1.1616161616161615 / 3 (38.7%)




Processing component: self

Generating 3 proposals in parallel...
  [Proposal 3] Generated with openrouter/google/gemini-2.5-flash
  [Proposal 2] Generated with openrouter/anthropic/claude-sonnet-4.5
  [Proposal 1] Generated with openai/gpt-4o

Scoring 3 proposals with judge LLM...
  [Proposal 1] Score: 50.0/100 (Dataset: 22.0, Quality: 28.0)
  [Proposal 2] Score: 74.0/100 (Dataset: 36.0, Quality: 38.0)
  [Proposal 3] Score: 40.0/100 (Dataset: 12.0, Quality: 28.0)

Selected top 2 proposals for merging:
  1. Score: 74.0/100
  2. Score: 50.0/100

Merging top 2 proposals...


2025/11/16 23:50:11 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for self: Write one short caption that conveys the main takeaway implied by the image’s visible text.

Output constraints:
- One sentence, ideally 8–14 words.
- No quotes unless the exact wording is the message.
- Do not list multiple text fragments, colors, fonts, or minor scene details.

How to decide what to say:
1) Find the single most important idea the visible text communicates in context.
2) Paraphrase that idea in plain words; avoid verbatim transcription and exhaustive label content.
3) Mention brands only if they are the subject of the image; otherwise generalize (e.g., “airline sponsor”).
4) If quantity is the point, give an approximate count (“about twenty books”).
5) Include numbers only when they carry meaning (e.g., a match score); omit decorative or repeated text.
6) If multiple texts appear, choose the one with highest salience (central, large, or core to the scene’s purpose) and summari

  Merged instruction created (1395 chars)
  Rationale: 1) Unique elements taken and why:
- From Proposal 1: Core emphasis on concise, high-level, conceptual captions; explicit length target; examples that transform literal text into a takeaway (e.g., spon...

[Final] New instruction for self:
  Write one short caption that conveys the main takeaway implied by the image’s visible text.

Output constraints:
- One sentence, ideally 8–14 words.
- No quotes unless the exact wording is the message...


2025/11/16 23:50:20 INFO dspy.evaluate.evaluate: Average Metric: 0.792929292929293 / 3 (26.4%)
2025/11/16 23:50:20 INFO dspy.teleprompt.gepa.gepa: Iteration 4: New subsample score 0.7929292929292929 is not better than old score 1.1616161616161615, skipping
GEPA Optimization:  92%|█████████▏| 23/25 [13:53<01:12, 36.24s/rollouts]


In [16]:
from dspy.evaluate import Evaluate

# Evaluate on validation set
evaluate = Evaluate(metric=textcaps_metric,devset=valset, num_threads=1)

multi_llm_result = evaluate(multi_llm_program)
multi_llm_score = multi_llm_result.score


# Save results
multi_llm_program.save("gepa_textcaps_multi_llm.json")
print("Saved to: gepa_textcaps_multi_llm.json")



2025/11/16 23:52:39 INFO dspy.evaluate.evaluate: Average Metric: 2.0277777777777777 / 5 (40.6%)


Saved to: gepa_textcaps_multi_llm.json


In [None]:
# ============================================================================
# Comparison
# ============================================================================

print(f"\n{'='*70}")
print("COMPARISON RESULTS")
print(f"{'='*70}")
print(f"Default Multimodal Proposer Score:  {default_score:.2%}")
print(f"Multi-LLM Proposer Score:          {multi_llm_score:.2%}")
print(f"Difference:                         {multi_llm_score - default_score:+.2%}")

if multi_llm_score > default_score:
    print(f"✓ Multi-LLM Proposer is BETTER by {multi_llm_score - default_score:.2%}")
elif default_score > multi_llm_score:
    print(f"✓ Default Proposer is BETTER by {default_score - multi_llm_score:.2%}")
else:
    print("Both proposers performed equally")
