In [13]:
import dspy

# Disable disk and memory cache
dspy.configure_cache(enable_disk_cache=False, enable_memory_cache=False)


In [5]:
from dotenv import load_dotenv
from dspy.teleprompt import GEPA
import os
load_dotenv()


os.environ["DSPY_CACHE_DIR"] = "/tmp/dspy_cache"
os.makedirs("/tmp/dspy_cache", exist_ok=True)



In [6]:
from dotenv import load_dotenv
from dspy.teleprompt import GEPA
import os
load_dotenv()

import dspy
from datasets import load_dataset
from multi_llm_proposer import MultiLLMProposalFn

# Initialize HotpotQA dataset
def init_hotpotqa_dataset(num_train=100, num_val=50, num_test=100):
    """Load and prepare HotpotQA dataset for DSPy."""
    print("Loading HotpotQA dataset...")
    
    # Load train split
    train_dataset = load_dataset("hotpot_qa", "distractor", split="train")
    train_dataset = train_dataset.select(range(0, min(num_train, len(train_dataset))))
    
    train_set = []
    for example in train_dataset:
        # Format context from HotpotQA
        context_text = "\n\n".join([
            f"Document {i+1}: {title}\n{' '.join(sentences)}"
            for i, (title, sentences) in enumerate(zip(example["context"]["title"], example["context"]["sentences"]))
        ])
        
        train_set.append(
            dspy.Example({
                "context": context_text,
                "question": example["question"],
                "answer": example["answer"],
            }).with_inputs("context", "question")
        )
    
    # Load validation split
    val_dataset = load_dataset("hotpot_qa", "distractor", split="validation")
    val_dataset = val_dataset.select(range(0, min(num_val, len(val_dataset))))
    
    val_set = []
    for example in val_dataset:
        context_text = "\n\n".join([
            f"Document {i+1}: {title}\n{' '.join(sentences)}"
            for i, (title, sentences) in enumerate(zip(example["context"]["title"], example["context"]["sentences"]))
        ])
        
        val_set.append(
            dspy.Example({
                "context": context_text,
                "question": example["question"],
                "answer": example["answer"],
            }).with_inputs("context", "question")
        )
    
    # Load test split
    test_dataset = load_dataset("hotpot_qa", "distractor", split="validation")
    test_dataset = test_dataset.select(range(0, min(num_test, len(test_dataset))))
    
    test_set = []
    for example in test_dataset:
        context_text = "\n\n".join([
            f"Document {i+1}: {title}\n{' '.join(sentences)}"
            for i, (title, sentences) in enumerate(zip(example["context"]["title"], example["context"]["sentences"]))
        ])
        
        test_set.append(
            dspy.Example({
                "context": context_text,
                "question": example["question"],
                "answer": example["answer"],
            }).with_inputs("context", "question")
        )
    
    print(f"Loaded {len(train_set)} training examples, {len(val_set)} validation examples, and {len(test_set)} test examples.")
    return train_set, val_set, test_set

train_set, val_set, test_set = init_hotpotqa_dataset(num_train=100, num_val=50, num_test=100)

# Configure base LM
lm = dspy.LM("openai/gpt-4.1-mini", temperature=1, max_tokens=32000)
dspy.configure(lm=lm)
os.environ["DSPY_CACHE_DIR"] = "/tmp/dspy_cache"
os.makedirs("/tmp/dspy_cache", exist_ok=True)


Loading HotpotQA dataset...
Loaded 100 training examples, 50 validation examples, and 100 test examples.


In [7]:
# Define the program for HotpotQA
class GenerateAnswer(dspy.Signature):
    """Answer the question based on the provided context. Provide a concise and accurate answer."""
    context = dspy.InputField(desc="Context documents containing relevant information")
    question = dspy.InputField(desc="The question to answer")
    answer = dspy.OutputField(desc="The answer to the question")

program = dspy.ChainOfThought(GenerateAnswer)

# Define metric with feedback (exact match for HotpotQA)
def metric_with_feedback(example, prediction, trace=None, pred_name=None, pred_trace=None):
    correct_answer = example['answer'].strip()
    predicted_answer = prediction.answer.strip() if hasattr(prediction, 'answer') else str(prediction).strip()
    
    # Exact match scoring
    score = int(correct_answer.lower() == predicted_answer.lower())
    
    feedback_text = ""
    if score == 1:
        feedback_text = f"Your answer is correct. The correct answer is '{correct_answer}'."
    else:
        feedback_text = f"Your answer is incorrect. You answered '{predicted_answer}', but the correct answer is '{correct_answer}'. "
        feedback_text += "Make sure to carefully read the context documents and extract the exact information needed to answer the question. "
        feedback_text += "Pay attention to details and ensure your answer matches the expected format."
    
    return dspy.Prediction(score=score, feedback=feedback_text)

# Initialize MultiLLMProposalFn
proposer = MultiLLMProposalFn(
    proposal_lms=[
        dspy.LM("openai/gpt-5", temperature=1.0, max_tokens=16000),  # Reasoning model proposal
        dspy.LM("openrouter/anthropic/claude-sonnet-4.5", temperature=1.0, max_tokens=16000), 
        dspy.LM("openrouter/google/gemini-2.5-flash", temperature=0.6, max_tokens=16000),
    ],
    judge_lm=dspy.LM("openrouter/anthropic/claude-sonnet-4.5", temperature=1.0, max_tokens=16000), 
    merger_lm=dspy.LM("openai/gpt-5", temperature=1.0, max_tokens=16000), 
    top_n=2,  
    verbose=True,
)


In [8]:
# Run GEPA optimization with MultiLLMProposalFn
optimizer = GEPA(
    metric=metric_with_feedback,
    reflection_lm=dspy.LM("openai/gpt-5", temperature=1.0, max_tokens=16000),  
    max_full_evals=1,
    num_threads=32,
    track_stats=True,
    reflection_minibatch_size=5,
    instruction_proposer=proposer,
)

print("Starting GEPA optimization with MultiLLMProposalFn for HotpotQA...")
print(f"Training set size: {len(train_set)}")
print(f"Validation set size: {len(val_set)}")

optimized_program = optimizer.compile(
    program,
    trainset=train_set,
    valset=val_set,
)

print("\nOptimized program instructions:")
print(optimized_program.predict.signature.instructions)


2025/11/06 17:12:00 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 150 metric calls of the program. This amounts to 1.00 full evals on the train+val set.
2025/11/06 17:12:00 INFO dspy.teleprompt.gepa.gepa: Using 50 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.


Starting GEPA optimization with MultiLLMProposalFn for HotpotQA...
Training set size: 100
Validation set size: 50


GEPA Optimization:   0%|          | 0/150 [00:00<?, ?rollouts/s]2025/11/06 17:12:10 INFO dspy.evaluate.evaluate: Average Metric: 11.0 / 50 (22.0%)
2025/11/06 17:12:10 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.22
GEPA Optimization:  33%|███▎      | 50/150 [00:09<00:18,  5.34rollouts/s]2025/11/06 17:12:10 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.22


Average Metric: 2.00 / 5 (40.0%): 100%|██████████| 5/5 [00:02<00:00,  2.39it/s] 

2025/11/06 17:12:12 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 5 (40.0%)




Processing component: predict

Generating 3 proposals in parallel...
  [Proposal 3] Generated with openrouter/google/gemini-2.5-flash
  [Proposal 2] Generated with openrouter/anthropic/claude-sonnet-4.5
  [Proposal 1] Generated with openai/gpt-5

Scoring 3 proposals with judge LLM...
  [Proposal 1] Score: 86.0/100 (Dataset: 44.0, Quality: 42.0)
  [Proposal 2] Score: 74.0/100 (Dataset: 38.0, Quality: 36.0)
  [Proposal 3] Score: 40.0/100 (Dataset: 18.0, Quality: 22.0)

Selected top 2 proposals for merging:
  1. Score: 86.0/100
  2. Score: 74.0/100

Merging top 2 proposals...


2025/11/06 17:14:46 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for predict: Task: Answer the question using only the provided context documents. Output a single, concise answer string.

Input:
- context: Numbered documents (Document 1, Document 2, …)
- question: A single question whose answer is found in the context

Rules and strategy:
1) Use only information explicitly stated in the context. Do not use outside knowledge or infer beyond what is written.
2) Identify the document(s) that directly satisfy the question’s constraint(s) (e.g., role, gender, title, year). Prefer the document that explicitly links the entity to the asked role/attribute (the authoritative relevant document).
3) Cross-check other documents only to select the most specific and complete form that matches the asked linkage (e.g., include middle names when that full form appears in the document identifying the role/attribute or a clearly linked bio).
4) Extract the minimal exact text span that d

  Merged instruction created (1824 chars)
  Rationale: 1) Unique elements taken from each proposal and why:
- From Proposal 1:
  - Role/attribute-linked disambiguation and the idea of an “authoritative relevant document” to decide the correct, fully speci...

[Final] New instruction for predict:
  Task: Answer the question using only the provided context documents. Output a single, concise answer string.

Input:
- context: Numbered documents (Document 1, Document 2, …)
- question: A single ques...


2025/11/06 17:14:49 INFO dspy.evaluate.evaluate: Average Metric: 4.0 / 5 (80.0%)
2025/11/06 17:14:49 INFO dspy.teleprompt.gepa.gepa: Iteration 1: New subsample score 4 is better than old score 2. Continue to full eval and add to candidate pool.
2025/11/06 17:14:55 INFO dspy.evaluate.evaluate: Average Metric: 31.0 / 50 (62.0%)
2025/11/06 17:14:55 INFO dspy.teleprompt.gepa.gepa: Iteration 1: New program is on the linear pareto front
2025/11/06 17:14:55 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Full valset score for new program: 0.62
2025/11/06 17:14:55 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Full train_val score for new program: 0.62
2025/11/06 17:14:55 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Individual valset scores for new program: [1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
2025/11/06 17:14:55 INFO dspy.teleprompt.gepa.gepa: Iteration 1: New valset pareto front score

Average Metric: 5.00 / 5 (100.0%): 100%|██████████| 5/5 [00:03<00:00,  1.45it/s]

2025/11/06 17:14:58 INFO dspy.evaluate.evaluate: Average Metric: 5.0 / 5 (100.0%)
2025/11/06 17:14:58 INFO dspy.teleprompt.gepa.gepa: Iteration 2: All subsample scores perfect. Skipping.
2025/11/06 17:14:58 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Reflective mutation did not propose a new candidate
GEPA Optimization:  77%|███████▋  | 115/150 [02:57<01:00,  1.73s/rollouts]2025/11/06 17:14:58 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Selected program 1 score: 0.62



Average Metric: 4.00 / 5 (80.0%): 100%|██████████| 5/5 [00:02<00:00,  2.28it/s] 

2025/11/06 17:15:00 INFO dspy.evaluate.evaluate: Average Metric: 4.0 / 5 (80.0%)




Processing component: predict

Generating 3 proposals in parallel...
  [Proposal 3] Generated with openrouter/google/gemini-2.5-flash
  [Proposal 2] Generated with openrouter/anthropic/claude-sonnet-4.5
  [Proposal 1] Generated with openai/gpt-5

Scoring 3 proposals with judge LLM...
  [Proposal 1] Score: 74.0/100 (Dataset: 38.0, Quality: 36.0)
  [Proposal 2] Score: 60.0/100 (Dataset: 28.0, Quality: 32.0)
  [Proposal 3] Score: 50.0/100 (Dataset: 22.0, Quality: 28.0)

Selected top 2 proposals for merging:
  1. Score: 74.0/100
  2. Score: 60.0/100

Merging top 2 proposals...


2025/11/06 17:19:07 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for predict: Task
Answer the question using only the provided context documents. Output a single, concise answer string.

Inputs
- context: Numbered documents (Document 1, Document 2, …)
- question: A single question whose answer is found in the context

Rules and strategy
1) Use only information explicitly stated in the context. Do not use outside knowledge or infer beyond what is written.

2) Select the authoritative relevant document:
   - Find the document that explicitly links the asked entity to the required role/attribute/time/filter.
   - Use other documents only to disambiguate entities or confirm which document is authoritative; never to augment or rewrite the answer.

3) Extract a single, minimal, contiguous exact text span from the authoritative document.
   - You may extract from the middle of a sentence; do not force full sentences.
   - Do not join non-adjacent fragments or reformat conten

  Merged instruction created (3106 chars)
  Rationale: 1) Unique elements taken and why
- From Proposal 1:
  - Strict single, minimal, contiguous exact text span rule to prevent non-contiguous synthesis—the root cause of the failure.
  - Preservation of i...

[Final] New instruction for predict:
  Task
Answer the question using only the provided context documents. Output a single, concise answer string.

Inputs
- context: Numbered documents (Document 1, Document 2, …)
- question: A single quest...


2025/11/06 17:19:09 INFO dspy.evaluate.evaluate: Average Metric: 4.0 / 5 (80.0%)
2025/11/06 17:19:09 INFO dspy.teleprompt.gepa.gepa: Iteration 3: New subsample score 4 is not better than old score 4, skipping
GEPA Optimization:  83%|████████▎ | 125/150 [07:08<02:20,  5.62s/rollouts]2025/11/06 17:19:09 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Selected program 1 score: 0.62


Average Metric: 2.00 / 5 (40.0%): 100%|██████████| 5/5 [00:02<00:00,  1.81it/s]

2025/11/06 17:19:12 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 5 (40.0%)




Processing component: predict

Generating 3 proposals in parallel...
  [Proposal 3] Generated with openrouter/google/gemini-2.5-flash
  [Proposal 2] Generated with openrouter/anthropic/claude-sonnet-4.5
  [Proposal 1] Generated with openai/gpt-5

Scoring 3 proposals with judge LLM...
  [Proposal 1] Score: 79.5/100 (Dataset: 41.5, Quality: 38.0)
  [Proposal 2] Score: 60.0/100 (Dataset: 28.0, Quality: 32.0)
  [Proposal 3] Score: 70.0/100 (Dataset: 34.0, Quality: 36.0)

Selected top 2 proposals for merging:
  1. Score: 79.5/100
  2. Score: 70.0/100

Merging top 2 proposals...


2025/11/06 17:23:06 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for predict: Task: Using only the provided numbered documents, answer the question with a single, concise answer string.

Rules:
1) Use only what is explicitly stated in the documents. No outside knowledge or inference.
2) Locate the authoritative evidence:
   - Prefer a single document that satisfies all question constraints (role/attribute/date/place/etc.).
   - If no single document satisfies all constraints, use one document (e.g., a list) to identify candidates and other document(s) to apply filters (e.g., nationality, death date) to select the correct entity.
3) Final answer span:
   - Extract a minimal, contiguous, verbatim text span from one document only.
   - Choose that document to be the one that provides the most specific, distinguishing constraints relevant to the question (or the clearest linkage if one doc alone satisfies all constraints).
4) Names and forms:
   - People: Return the full 

  Merged instruction created (2507 chars)
  Rationale: 1) Unique elements taken and why:
- From Proposal 1:
  - Authoritative-document focus and cross-document disambiguation with “final answer must be verbatim from one document” to prevent synthesis erro...

[Final] New instruction for predict:
  Task: Using only the provided numbered documents, answer the question with a single, concise answer string.

Rules:
1) Use only what is explicitly stated in the documents. No outside knowledge or infe...


2025/11/06 17:23:09 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 5 (40.0%)
2025/11/06 17:23:09 INFO dspy.teleprompt.gepa.gepa: Iteration 4: New subsample score 2 is not better than old score 2, skipping
GEPA Optimization:  90%|█████████ | 135/150 [11:08<02:17,  9.14s/rollouts]2025/11/06 17:23:09 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Selected program 1 score: 0.62


Average Metric: 3.00 / 5 (60.0%): 100%|██████████| 5/5 [00:02<00:00,  2.11it/s] 

2025/11/06 17:23:11 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 5 (60.0%)




Processing component: predict

Generating 3 proposals in parallel...
  [Proposal 3] Generated with openrouter/google/gemini-2.5-flash
  [Proposal 2] Generated with openrouter/anthropic/claude-sonnet-4.5
  [Proposal 1] Generated with openai/gpt-5

Scoring 3 proposals with judge LLM...
  [Proposal 1] Score: 83.0/100 (Dataset: 40.0, Quality: 43.0)
  [Proposal 2] Score: 53.0/100 (Dataset: 22.0, Quality: 31.0)
  [Proposal 3] Score: 54.0/100 (Dataset: 26.0, Quality: 28.0)

Selected top 2 proposals for merging:
  1. Score: 83.0/100
  2. Score: 54.0/100

Merging top 2 proposals...


2025/11/06 17:27:30 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for predict: Task: Answer the question using only the provided context documents. Output a single, concise answer string.

Input:
- context: Numbered documents (Document 1, Document 2, …)
- question: A single question whose answer is found in the context

Rules and strategy:
1) Parse the question to determine:
   - The requested answer type (e.g., person’s name, year, genus, family, place).
   - All constraints (role/attribute, gender, title, location, time, singular vs plural).

2) Identify the authoritative relevant document(s): those that directly and explicitly link the requested entity type to the asked role/attribute/location/time in one place. Prefer the document that states the exact linkage the question asks about.

3) Cross-check other documents only to:
   - Verify the entity satisfies all constraints.
   - Select the most specific and complete form that matches the asked linkage (e.g., includ

  Merged instruction created (3285 chars)
  Rationale: 1) Unique elements taken and why:
   - From Proposal 1:
     • Authoritative relevant document concept with explicit linkage: Ensures selection of the document that directly ties the entity to the ask...

[Final] New instruction for predict:
  Task: Answer the question using only the provided context documents. Output a single, concise answer string.

Input:
- context: Numbered documents (Document 1, Document 2, …)
- question: A single ques...


2025/11/06 17:27:37 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 5 (60.0%)
2025/11/06 17:27:37 INFO dspy.teleprompt.gepa.gepa: Iteration 5: New subsample score 3 is not better than old score 3, skipping
GEPA Optimization:  97%|█████████▋| 145/150 [15:36<01:04, 12.96s/rollouts]2025/11/06 17:27:37 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Selected program 1 score: 0.62


Average Metric: 5.00 / 5 (100.0%): 100%|██████████| 5/5 [00:01<00:00,  2.68it/s]

2025/11/06 17:27:39 INFO dspy.evaluate.evaluate: Average Metric: 5.0 / 5 (100.0%)
2025/11/06 17:27:39 INFO dspy.teleprompt.gepa.gepa: Iteration 6: All subsample scores perfect. Skipping.
2025/11/06 17:27:39 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Reflective mutation did not propose a new candidate
GEPA Optimization:  97%|█████████▋| 145/150 [15:38<00:32,  6.47s/rollouts]



Optimized program instructions:
Task: Answer the question using only the provided context documents. Output a single, concise answer string.

Input:
- context: Numbered documents (Document 1, Document 2, …)
- question: A single question whose answer is found in the context

Rules and strategy:
1) Use only information explicitly stated in the context. Do not use outside knowledge or infer beyond what is written.
2) Identify the document(s) that directly satisfy the question’s constraint(s) (e.g., role, gender, title, year). Prefer the document that explicitly links the entity to the asked role/attribute (the authoritative relevant document).
3) Cross-check other documents only to select the most specific and complete form that matches the asked linkage (e.g., include middle names when that full form appears in the document identifying the role/attribute or a clearly linked bio).
4) Extract the minimal exact text span that directly answers the question. Do not paraphrase or reword. Pre




In [9]:
# Evaluate the optimized program on test set
evaluate = dspy.Evaluate(
    devset=test_set,
    metric=lambda example, prediction, trace=None, pred_name=None, pred_trace=None: 
        int(example['answer'].strip().lower() == prediction.answer.strip().lower()) if hasattr(prediction, 'answer') else 0,
    num_threads=32,
    display_table=True,
    display_progress=True
)

print("\nEvaluating optimized program on test set...")
result = evaluate(optimized_program)
print(f"\nFinal accuracy: {result.score}%")
print(f"Correct: {result.score * len(test_set) / 100:.0f} / {len(test_set)}")



Evaluating optimized program on test set...
Average Metric: 65.00 / 100 (65.0%): 100%|██████████| 100/100 [00:08<00:00, 11.98it/s]

2025/11/06 17:27:48 INFO dspy.evaluate.evaluate: Average Metric: 65 / 100 (65.0%)





Unnamed: 0,context,question,example_answer,reasoning,pred_answer,<lambda>
0,Document 1: Ed Wood (film) Ed Wood is a 1994 American biographical...,Were Scott Derrickson and Ed Wood of the same nationality?,yes,Document 2 states Scott Derrickson is an American director. Docume...,Yes,✔️ [1]
1,"Document 1: Meet Corliss Archer Meet Corliss Archer, a program fro...",What government position was held by the woman who portrayed Corli...,Chief of Protocol,The woman who portrayed Corliss Archer in the film Kiss and Tell (...,United States ambassador to Ghana and to Czechoslovakia and Chief ...,✔️ [0]
2,Document 1: Andre Norton Award The Andre Norton Award for Young Ad...,"What science fantasy young adult series, told in first person, has...",Animorphs,"Document 9 states that ""Animorphs is a science fantasy series of y...",Animorphs,✔️ [1]
3,Document 1: Esma Sultan (daughter of Abdülaziz) Esma Sultan (21 Ma...,Are the Laleli Mosque and Esma Sultan Mansion located in the same ...,no,"Document 6 states that the Laleli Mosque is located in Laleli, Fat...",No,✔️ [1]
4,Document 1: Just Another Romantic Wrestling Comedy Just Another Ro...,"The director of the romantic comedy ""Big Stone Gap"" is based in wh...","Greenwich Village, New York City","Document 10 states that Adriana Trigiani wrote and directed ""Big S...","Greenwich Village, New York City",✔️ [1]
...,...,...,...,...,...,...
95,Document 1: Scott's Run Nature Preserve Scott's Run Nature Preserv...,Tysons Galleria is located in what county?,Fairfax County,Document 2 states that Tysons Galleria is located at 2001 Internat...,Fairfax County,✔️ [1]
96,"Document 1: Cardinal Health Cardinal Health, Inc. is a Fortune 500...",Bordan Tkachuk was the CEO of a company that provides what sort of...,IT products and services,Document 4 states that Bordan Tkachuk was the former CEO of Viglen...,IT products and services,✔️ [1]
97,"Document 1: Aryanto Yuniawan Aryanto Yuniawan (born June 18, 1977 ...","Which filmmaker was known for animation, Lev Yilmaz or Pamela B. G...",Levni Yilmaz,Document 7 states that Levni Yilmaz (born 1973) is a San Francisco...,Lev Yilmaz,✔️ [0]
98,Document 1: Visa policy of Hong Kong The visa policy of Hong Kong ...,In which city is the ambassador of the Rabat-Salé-Kénitra administ...,Beijing,Document 8 states that the Moroccan ambassador in Beijing is the o...,Beijing,✔️ [1]



Final accuracy: 65.0%
Correct: 65 / 100


In [14]:
# Run GEPA optimization with DEFAULT proposer (single LLM) for comparison
optimizer_default = GEPA(
    metric=metric_with_feedback,
    reflection_lm=dspy.LM("openai/gpt-5", temperature=1.0, max_tokens=16000),  # Single LLM for proposals
    max_full_evals=1,
    num_threads=32,
    track_stats=True,
    reflection_minibatch_size=3,
)

print("Starting GEPA optimization with DEFAULT proposer for comparison...")
optimized_program_default = optimizer_default.compile(
    program,
    trainset=train_set,
    valset=val_set,
)


2025/11/06 17:38:00 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 150 metric calls of the program. This amounts to 1.00 full evals on the train+val set.
2025/11/06 17:38:00 INFO dspy.teleprompt.gepa.gepa: Using 50 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.


Starting GEPA optimization with DEFAULT proposer for comparison...


GEPA Optimization:   0%|          | 0/150 [00:00<?, ?rollouts/s]2025/11/06 17:38:08 INFO dspy.evaluate.evaluate: Average Metric: 10.0 / 50 (20.0%)
2025/11/06 17:38:08 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.2
GEPA Optimization:  33%|███▎      | 50/150 [00:08<00:16,  6.02rollouts/s]2025/11/06 17:38:08 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.2


Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:02<00:00,  1.21it/s]

2025/11/06 17:38:10 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/11/06 17:38:51 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for predict: Task: Answer a single question using only the provided “context” documents. Return a concise, exact answer string with no extra text.

Guidelines:
1) Use only information explicitly stated in the provided context. Do not rely on outside knowledge or make inferences beyond what is written.
2) Find the document(s) that directly address the entity and attribute in the question. Cross-reference documents when necessary, but prefer the source that explicitly ties the attribute to the entity as framed by the question.
3) Return the answer in the most precise, fully specified form present in the context:
   - For names: include full names (e.g., middle names) if given (e.g., “Walter Darwin Coy” rather than “Walter Coy” when the full form is provided for the same person).
   - For ages: match the exact phrasing used in the context (e.g., “16-year-old” rather than “16”).
   - For years: return only t

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:02<00:00,  1.03it/s] 

2025/11/06 17:39:08 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/11/06 17:40:20 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for predict: Task: Answer a single question using only the provided “context” documents. Return a concise, exact answer string with no extra text.

Core rules:
1) Use only information explicitly stated in the provided context. Do not use outside knowledge or infer beyond what is written.
2) Identify the target entity and the attribute asked in the question, then locate the document(s) that mention both or clearly link them.
3) Prefer the source that explicitly ties the attribute to the entity as framed by the question. Cross-reference other documents only when necessary and only to the extent allowed below.

Answer precision:
4) Return the answer in the most precise, fully specified form that is supported by the context AND consistent with the source that ties the attribute to the entity.
   - Names: Preserve the exact spelling, capitalization, diacritics, hyphens, and stylization. Include middle names o

In [15]:
# Evaluate the default optimized program on test set
evaluate_default = dspy.Evaluate(
    devset=test_set,
    metric=lambda example, prediction, trace=None, pred_name=None, pred_trace=None: 
        int(example['answer'].strip().lower() == prediction.answer.strip().lower()) if hasattr(prediction, 'answer') else 0,
    num_threads=32,
    display_table=True,
    display_progress=True
)

print("\nEvaluating default optimized program on test set...")
result_default = evaluate_default(optimized_program_default)
print(f"\nFinal accuracy (default): {result_default.score}%")
print(f"Correct: {result_default.score * len(test_set) / 100:.0f} / {len(test_set)}")



Evaluating default optimized program on test set...
Average Metric: 63.00 / 100 (63.0%): 100%|██████████| 100/100 [00:11<00:00,  8.50it/s]

2025/11/06 17:43:10 INFO dspy.evaluate.evaluate: Average Metric: 63 / 100 (63.0%)





Unnamed: 0,context,question,example_answer,reasoning,pred_answer,<lambda>
0,Document 1: Ed Wood (film) Ed Wood is a 1994 American biographical...,Were Scott Derrickson and Ed Wood of the same nationality?,yes,"Scott Derrickson is described as an American director, screenwrite...",Yes,✔️ [1]
1,"Document 1: Meet Corliss Archer Meet Corliss Archer, a program fro...",What government position was held by the woman who portrayed Corli...,Chief of Protocol,The question asks for the government position held by the woman wh...,United States ambassador to Ghana,✔️ [0]
2,Document 1: Andre Norton Award The Andre Norton Award for Young Ad...,"What science fantasy young adult series, told in first person, has...",Animorphs,"The question asks for a science fantasy young adult series, told i...",Animorphs,✔️ [1]
3,Document 1: Esma Sultan (daughter of Abdülaziz) Esma Sultan (21 Ma...,Are the Laleli Mosque and Esma Sultan Mansion located in the same ...,no,"Document 6 states that the Laleli Mosque is located in Laleli, Fat...",No,✔️ [1]
4,Document 1: Just Another Romantic Wrestling Comedy Just Another Ro...,"The director of the romantic comedy ""Big Stone Gap"" is based in wh...","Greenwich Village, New York City","Document 10 states that ""Big Stone Gap"" (2014 film) was written an...","Greenwich Village, New York City",✔️ [1]
...,...,...,...,...,...,...
95,Document 1: Scott's Run Nature Preserve Scott's Run Nature Preserv...,Tysons Galleria is located in what county?,Fairfax County,"Document 2 states that Tysons Galleria is located in McLean, Virgi...",Fairfax County,✔️ [1]
96,"Document 1: Cardinal Health Cardinal Health, Inc. is a Fortune 500...",Bordan Tkachuk was the CEO of a company that provides what sort of...,IT products and services,The question asks what sort of products were provided by the compa...,"IT products and services, including storage systems, servers, work...",✔️ [0]
97,"Document 1: Aryanto Yuniawan Aryanto Yuniawan (born June 18, 1977 ...","Which filmmaker was known for animation, Lev Yilmaz or Pamela B. G...",Levni Yilmaz,The context indicates that Lev Yilmaz is a San Francisco based ind...,Lev Yilmaz,✔️ [0]
98,Document 1: Visa policy of Hong Kong The visa policy of Hong Kong ...,In which city is the ambassador of the Rabat-Salé-Kénitra administ...,Beijing,The Rabat-Salé-Kénitra administrative region is mentioned in Docum...,Rabat,✔️ [0]



Final accuracy (default): 63.0%
Correct: 63 / 100


In [12]:
# Final comparison
print("="*80)
print("FINAL COMPARISON: MultiLLMProposalFn vs Default GEPA")
print("="*80)

multi_llm_score = result.score
default_score = result_default.score

print(f"\n{'Metric':<30} {'MultiLLM':<25} {'Default':<25} {'Difference':<15}")
print("-" * 95)
print(f"{'Test Accuracy':<30} {multi_llm_score:>6.2f}%{'':>15} {default_score:>6.2f}%{'':>15} {multi_llm_score - default_score:>+6.2f}%")
print(f"{'Correct Answers':<30} {multi_llm_score * len(test_set) / 100:>6.1f}/{len(test_set):<4}{'':>15} {default_score * len(test_set) / 100:>6.1f}/{len(test_set):<4}{'':>15} {(multi_llm_score - default_score) * len(test_set) / 100:>+6.1f}")

print(f"\n{'='*80}")
if multi_llm_score > default_score:
    improvement = ((multi_llm_score / default_score) - 1) * 100 if default_score > 0 else 0
    print(f"✓ MultiLLMProposalFn is BETTER by {multi_llm_score - default_score:.2f} percentage points")
    print(f"  Relative improvement: {improvement:+.2f}%")
elif default_score > multi_llm_score:
    improvement = ((default_score / multi_llm_score) - 1) * 100 if multi_llm_score > 0 else 0
    print(f"✓ Default GEPA is BETTER by {default_score - multi_llm_score:.2f} percentage points")
    print(f"  Relative improvement: {improvement:+.2f}%")
else:
    print("Both approaches perform equally well")
print(f"{'='*80}")


FINAL COMPARISON: MultiLLMProposalFn vs Default GEPA

Metric                         MultiLLM                  Default                   Difference     
-----------------------------------------------------------------------------------------------
Test Accuracy                   65.00%                 62.00%                 +3.00%
Correct Answers                  65.0/100                   62.0/100                   +3.0

✓ MultiLLMProposalFn is BETTER by 3.00 percentage points
  Relative improvement: +4.84%
