In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import dspy

In [4]:
import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("test")
mlflow.dspy.autolog()




In [5]:
import random
from dspy.datasets import DataLoader

kwargs = dict(fields=("claim", "supporting_facts", "hpqa_id", "num_hops"), input_keys=("claim",))
hover = DataLoader().from_huggingface(dataset_name="vincentkoc/hover-parquet", split="train", trust_remote_code=True, **kwargs)

hpqa_ids = set()
hover = [
    dspy.Example(claim=x.claim, titles=list(set([y["key"] for y in x.supporting_facts]))).with_inputs("claim")
    for x in hover
    if x["num_hops"] == 3 and x["hpqa_id"] not in hpqa_ids and not hpqa_ids.add(x["hpqa_id"])
]

random.Random(0).shuffle(hover)
trainset, devset, testset = hover[:100], hover[100:200], hover[650:]

example = trainset[0]

print("Claim:", example.claim)
print("Pages that must be retrieved:", example.titles)

import ujson
import bm25s
import Stemmer
import os
from dspy.utils import download

# Download and extract data if not exists
if not os.path.exists("wiki.abstracts.2017.jsonl"):
    download("https://huggingface.co/dspy/cache/resolve/main/wiki.abstracts.2017.tar.gz")
    os.system("tar -xzvf wiki.abstracts.2017.tar.gz")

corpus = []
if os.path.exists("wiki.abstracts.2017.jsonl"):
    with open("wiki.abstracts.2017.jsonl") as f:
        for line in f:
            line = ujson.loads(line)
            corpus.append(f"{line['title']} | {' '.join(line['text'])}")

stemmer = Stemmer.Stemmer("english")
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

retriever = bm25s.BM25(k1=0.9, b=0.4)
retriever.index(corpus_tokens)

DOCS = {}

def search(query: str, k: int) -> list[str]:
    tokens = bm25s.tokenize(query, stopwords="en", stemmer=stemmer, show_progress=False)
    results, scores = retriever.retrieve(tokens, k=k, n_threads=1, show_progress=False)
    
    retrieved_docs = [corpus[doc] for doc in results[0]]
    
    for doc_str in retrieved_docs:
         if " | " in doc_str:
             title, text = doc_str.split(" | ", 1)
             DOCS[title] = text
             
    return retrieved_docs

# search("France", 5)

def search_wikipedia(query: str) -> list[str]:
    """Returns top-5 results and then the titles of the top-5 to top-30 results."""

    topK = search(query, 30)
    titles, topK = [f"`{x.split(' | ')[0]}`" for x in topK[5:30]], topK[:5]
    return topK + [f"Other retrieved pages have titles: {', '.join(titles)}."]

def lookup_wikipedia(title: str) -> str:
    """Returns the text of the Wikipedia page, if it exists."""

    if title in DOCS:
        return DOCS[title]

    results = [x for x in search(title, 10) if x.startswith(title + " | ")]
    if not results:
        return f"No Wikipedia page found for title: {title}"
    return results[0]

def top5_recall(example, pred, trace=None):
    gold_titles = example.titles
    recall = sum(x in pred.titles[:5] for x in gold_titles) / len(gold_titles)

    # If we're "bootstrapping" for optimization, return True if and only if the recall is perfect.
    if trace is not None:
        return recall >= 1.0
    
    # If we're just doing inference, just measure the recall.
    return recall

evaluate = dspy.Evaluate(devset=devset, metric=top5_recall, num_threads=16, display_progress=True, display_table=5)

instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
signature = dspy.Signature("claim -> titles: list[str]", instructions)
react = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=20)

  from .autonotebook import tqdm as notebook_tqdm


Claim: This director is known for his work on Miss Potter. The Academy of Motion Picture Arts and Sciences presents the award in which he was nominated for his work in "Babe".
Pages that must be retrieved: ['Academy Award for Best Director', 'Miss Potter', 'Chris Noonan']


                                                                                   

In [6]:
import warnings
warnings.filterwarnings("ignore", message=r"(?s)Pydantic serializer warnings:.*StreamingChoices")

In [7]:
from dspy.datasets import DataLoader
from datasets import load_dataset


student_lm = dspy.LM("openrouter/qwen/qwen3-8b")
teacher_lm = dspy.LM("openai/gpt-5")
dspy.configure(lm=student_lm)
# student_lm("this is a test")

In [9]:
react(claim="David Gregory was born in 1625.").titles[:3]

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## ne...rt with the search.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## ne...nswer is to finish.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer

['David Gregory (mathematician)', 'David Gregory (journalist)']

In [15]:
def safe_react(claim: str):
    try:
        return react(claim=claim)
    except Exception as e:
        return dspy.Prediction(titles=[])

eval_result = evaluate(safe_react)
print(eval_result)


Average Metric: 57.67 / 100 (57.7%): : 101it [05:35,  3.32s/it]                       

2025/12/11 09:31:35 INFO dspy.evaluate.evaluate: Average Metric: 57.666666666666664 / 100 (57.7%)





Unnamed: 0,claim,example_titles,trajectory,reasoning,pred_titles,top5_recall
0,The Church of England's movement that inspired the Trinity Episcop...,"[Oxford Movement, Trinity Episcopal Church (Houghton, Michigan), S...","{'thought_0': ""To verify the claim, I need to identify the specifi...",The claim states that the Church of England's movement inspiring T...,"[Oxford Movement, Samuel Rickards]",✔️ [0.667]
1,"Red, White & Crüe and this athlete both fight. The french fighter ...","[Bobby Stewart, Red, White &amp; Crüe, Mike Tyson]","{'thought_0': 'I need to find Wikipedia pages that mention ""Red, W...","The claim mentions ""Red, White & Crüe"" and a French fighter traine...","[Red, White & Crüe, Bobby Stewart, Mötley Crüe, Mike Tyson, French...",✔️ [0.667]
2,The writer/director/actor from Glen or Glenda and Fernand Rivers s...,"[Glen or Glenda, Ed Wood, Fernand Rivers]",{'thought_0': 'I need to verify the claim by identifying the indiv...,"The claim suggests that the creator of ""Glen or Glenda"" and ""Ferna...","[Ed Wood, Fernand Rivers]",✔️ [0.667]
3,The film by Sandi Sissel was released before The End of Suburbia.,"[The End of Suburbia, Sandi Sissel, Chicken Ranch (film)]",{'thought_0': 'I need to find the release date of Sandi Sissel\'s ...,"The claim states that a film by Sandi Sissel was released before ""...","[Sandi Sissel, Chicken Ranch, Paul Jacobs and the Nuclear Gang, Th...",✔️ [0.667]
4,The actor who played captain hook in the live production with Tayl...,"[Taylor Louderman, Christopher Walken, Peter Pan Live!]",{'thought_0': 'I need to identify the actor who played Captain Hoo...,The claim is verified. Taylor Louderman was in the 2013 NBC live p...,"[Taylor Louderman, Peter Pan Live!, Christopher Walken, The Deer H...",✔️ [1.000]


EvaluationResult(score=57.67, results=<list of 100 results>)


In [30]:
mlflow.dspy.autolog()



In [None]:
import dspy

from dspy.teleprompt.teleprompt import Teleprompter
from typing import Any

class NoLabelOptimizer(Teleprompter):
    def get_last_traces(self, eval_result):
        results = [result[1][1][-1] for result in eval_result.results]
        result_dict = [{"trajectory": result[1], "prediction": result[2]} for result in results]
        return result_dict
    
    def compile(self, student: dspy.Module, *, trainset: list[dspy.Example], teacher_lm: dspy.LM, bsz=20) -> dspy.Module:
        def wrap_student(student_prog):
            def wrapped_student(*args, **kwargs):
                with dspy.context(trace=[]):
                    result = student_prog(*args, **kwargs)
                    trace = dspy.settings.trace
                    return [result, trace]
            return wrapped_student
        evaluate = dspy.Evaluate(devset=trainset, metric=lambda e, p, t=None: 0, num_threads=16, display_progress=True)
        eval_result = evaluate(wrap_student(student))
        print(eval_result)

        return student, eval_result

    def get_params(self) -> dict[str, Any]:
        """
        Get the parameters of the teleprompter.

        Returns:
            The parameters of the teleprompter.
        """
        return self.__dict__

optimizer = NoLabelOptimizer()

optimized_react, eval_result = optimizer.compile(react, trainset=trainset[:20], teacher_lm=teacher_lm)


Average Metric: 0.00 / 20 (0.0%): 100%|██████████| 20/20 [02:04<00:00,  6.23s/it]

2025/12/11 09:33:52 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)



EvaluationResult(score=0.0, results=<list of 20 results>)


In [None]:
from collections import Counter
import rich
predictions = [result[1] for result in eval_result.results]
# print(type(predictions))
# print(type(predictions[0]))
# print(len(predictions[0]))
rich.print(predictions[0])
c = Counter(type(prediction) for prediction in predictions)
len_c = Counter(len(prediction) for prediction in predictions)
traces = [prediction[1] for prediction in predictions]
final_trajectories = [trace[-1] for trace in traces]
# print(final_trajectories[0])
# print(type(traces[0]))


# rich.print(get_last_traces(eval_result)[0])


In [8]:
def get_last_traces(eval_result):
    results = [result[1][1][-1] for result in eval_result.results]
    result_dict = [{"trajectory": result[1], "prediction": result[2]} for result in results]
    return result_dict

In [None]:
import rich

rich.print({"current_react_prompt": react.react.signature.instructions, "current_extraction_prompt": react.extract.predict.signature.instructions})

In [None]:
# mlflow.dspy.autolog(disable=True)

In [None]:
# ==============
# Initial single step 5x experiment
# ==============
import rich

class ReasonAboutTraces(dspy.Signature):
    """Given a set of trajectories, and the prompt that was used to generate them,
    reason about the traces and how you might improve the prompt from a strategy perspective without overfitting"""
    history: list[str] = dspy.InputField()
    current_state: dict[str, Any] = dspy.InputField()
    deviations_from_specification: str = dspy.OutputField(desc="Deviations from the specification given in the signature of the prompt")
    noticed_patterns: str = dspy.OutputField()
    generic_strategy: str = dspy.OutputField()
    improved_react_prompt: str = dspy.OutputField()
    improved_extraction_prompt: str = dspy.OutputField()

# get 20 trajectories
def wrap_student(student_prog):
    def wrapped_student(*args, **kwargs):
        with dspy.context(trace=[]):
            result = student_prog(*args, **kwargs)
            trace = dspy.settings.trace
            return [result, trace]
    return wrapped_student

instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
signature = dspy.Signature("claim -> titles: list[str]", instructions)
react = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=10)

current_trainset = trainset[:20]
evaluate_trainset = dspy.Evaluate(devset=current_trainset, metric=lambda e, p, t=None: 0, num_threads=25, display_progress=True)
evaluate = dspy.Evaluate(devset=devset, metric=top5_recall, num_threads=16, display_progress=False, max_errors=100)

eval_result = evaluate_trainset(wrap_student(react))

# append those to history
history = []
history.append({"current_react_prompt": react.react.signature.instructions, "current_extraction_prompt": react.extract.predict.signature.instructions})
history.append({"last_trajectories": get_last_traces(eval_result)})

current_state = {"current_react_prompt": react.react.signature.instructions, "current_extraction_prompt": react.extract.predict.signature.instructions}
# pass to reasoner and modify prompts
new_prompts = {}
with dspy.context(lm=teacher_lm):
    for i in range(5):
        reasoner = dspy.Predict(ReasonAboutTraces, seed=i)
        reasoner_result = reasoner(
            history=history,
            current_state=current_state,
        )
        new_prompts[i] = reasoner_result
        print(reasoner_result)

# evaluate these on ground truth
# eval_set_result = evaluate(react)
# print(eval_set_result)

import concurrent.futures
# with dspy.context(lm=student_lm):
#     original_score = evaluate(react).score

new_prompts[-1] = dspy.Prediction(improved_react_prompt=react.react.signature.instructions, improved_extraction_prompt=react.extract.predict.signature.instructions)

def evaluate_prompt_pair(num, reasoner_eval_result):
    instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
    signature = dspy.Signature("claim -> titles: list[str]", instructions)
    react_evaluate = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=10)
    react_evaluate.react.signature.instructions = reasoner_eval_result.improved_react_prompt
    react_evaluate.extract.predict.signature.instructions = reasoner_eval_result.improved_extraction_prompt
    with dspy.context(lm=student_lm):
        rich.print("="*10,
        f"Running evaliaton number {num}",
        # f"Current prompt: {new_react.react.signature.instructions}",
        # f"Current extraction prompt: {new_react.extract.predict.signature.instructions}",
        "="*10)
        new_full_evaluate = evaluate(react_evaluate)
    return num, new_full_evaluate.score

results = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    # Dictionary mapping future to num for tracking and ordering
    future_to_num = {
        executor.submit(evaluate_prompt_pair, num, prompt): num 
        for num, prompt in new_prompts.items()
    }
    for future in concurrent.futures.as_completed(future_to_num):
        num, score = future.result()
        results[num] = score

print("="*80)

rich.print("Original prompt score: ", results[-1])
results.pop(-1)
rich.print("New prompt scores: ", results)
rich.print("Average score: ", sum(results.values()) / len(results))


# run on 20 new trajectories

Average Metric: 0.00 / 20 (0.0%): 100%|██████████| 20/20 [00:00<00:00, 163.75it/s]

2025/12/11 16:08:55 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)



Prediction(
    deviations_from_specification='- Repeatedly passed arguments to the finish tool, e.g., {"titles": [...]}, {"title": ...}, {"type": ...}, {"parent_company": ...}, despite the spec requiring finish to take {} only. This caused execution errors.\n- Occasionally used finish as if it were an output channel for titles rather than a signal to stop.\n- Sometimes relied solely on search snippets without confirming via lookup_wikipedia when claims were multi-part or potentially ambiguous.\n- Minor premature stopping in a few cases without verifying all atomic parts of a compound claim (e.g., not checking both entities when the claim compares them).',
    noticed_patterns='- Positive patterns:\n  - Good use of disambiguation in queries (adding years or qualifiers like "(1990 film)").\n  - Often decomposed claims into sub-parts and retrieved key entity pages (people, places, works).\n  - Used town/city pages for census numbers and line terminus pages for rail claims appropriately.

2025/12/11 16:20:40 INFO dspy.evaluate.evaluate: Average Metric: 58.0 / 100 (58.0%)
2025/12/11 16:22:21 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'This individual had called their debut four-track extended play Write.. (EP). Eddie Vedder was born before them.', 'titles': ['Nam Woo-hyun', 'Write.. (EP)', 'Eddie Vedder']}) (input_keys={'claim'}): litellm.RateLimitError: RateLimitError: OpenrouterException - Provider returned error. Set `provide_traceback=True` for traceback.
2025/12/11 16:22:23 ERROR dspy.utils.parallelizer: Error for Example({'claim': "The father of the founder of St Hugh's College, Oxford, was a bishop in the Anglican Church.", 'titles': ["St Hugh's College, Oxford", 'Elizabeth Wordsworth', 'Christopher Wordsworth']}) (input_keys={'claim'}): litellm.RateLimitError: RateLimitError: OpenrouterException - Provider returned error. Set `provide_traceback=True` for traceback.
2025/12/11 16:22:25 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The



In [None]:
# ==============
# Evaluate from 5x experiment
# ==============

# with dspy.context(lm=student_lm):
#     original_score = evaluate(react).score
prompts_with_original = new_prompts.copy()
prompts_with_original[-1] = {"improved_react_prompt": react.react.signature.instructions, "improved_extraction_prompt": react.extract.predict.signature.instructions}

def evaluate_prompt_pair(num, reasoner_result):
    new_react = react.deepcopy()
    new_react.react.signature.instructions = reasoner_result.improved_react_prompt
    new_react.extract.predict.signature.instructions = reasoner_result.improved_extraction_prompt
    with mlflow.start_run(run_name=f"evaluate_prompt_pair_{num}"):
        with dspy.context(lm=student_lm):
            rich.print("="*80,
            f"Running evaliaton number {num}",
            f"Current prompt: {new_react.react.signature.instructions}",
            f"Current extraction prompt: {new_react.extract.predict.signature.instructions}",
            "="*80)
            new_full_evaluate = evaluate_trainset(wrap_student(new_react), devset=current_trainset[:1])
    return new_full_evaluate

for num, prompt in prompts_with_original.items():
    rich.print("="*80)
    rich.print(f"Running evaliaton number {num}")
    rich.print(f"Current prompt: {prompt.improved_react_prompt}")
    rich.print(f"Current extraction prompt: {prompt.improved_extraction_prompt}")
    res = evaluate_prompt_pair(num, prompt)
    rich.print(res)
    rich.print("="*80)


Average Metric: 0.00 / 1 (0.0%): 100%|██████████| 1/1 [00:00<00:00,  8.04it/s]

2025/12/11 13:30:34 INFO dspy.evaluate.evaluate: Average Metric: 0 / 1 (0.0%)



🏃 View run evaluate_prompt_pair_0 at: http://localhost:5000/#/experiments/1/runs/cdefc6cb9bc045bbbb24cfc2f348e881
🧪 View experiment at: http://localhost:5000/#/experiments/1


Average Metric: 0.00 / 1 (0.0%): 100%|██████████| 1/1 [00:00<00:00,  6.43it/s]

2025/12/11 13:30:34 INFO dspy.evaluate.evaluate: Average Metric: 0 / 1 (0.0%)



🏃 View run evaluate_prompt_pair_1 at: http://localhost:5000/#/experiments/1/runs/6ce8ea145e4049f593de16c86783bcda
🧪 View experiment at: http://localhost:5000/#/experiments/1


Average Metric: 0.00 / 1 (0.0%): 100%|██████████| 1/1 [01:10<00:00, 70.07s/it]

2025/12/11 13:31:44 INFO dspy.evaluate.evaluate: Average Metric: 0 / 1 (0.0%)



🏃 View run evaluate_prompt_pair_2 at: http://localhost:5000/#/experiments/1/runs/04dfd907c9004c53ae46863a6c939e70
🧪 View experiment at: http://localhost:5000/#/experiments/1


Average Metric: 0.00 / 1 (0.0%): 100%|██████████| 1/1 [02:37<00:00, 157.25s/it]

2025/12/11 13:34:21 INFO dspy.evaluate.evaluate: Average Metric: 0 / 1 (0.0%)



🏃 View run evaluate_prompt_pair_3 at: http://localhost:5000/#/experiments/1/runs/b40dd6cd9bb944268b6dcf7cde0a3bb0
🧪 View experiment at: http://localhost:5000/#/experiments/1


Average Metric: 0.00 / 1 (0.0%): 100%|██████████| 1/1 [26:39<00:00, 1599.29s/it]

2025/12/11 14:01:01 INFO dspy.evaluate.evaluate: Average Metric: 0 / 1 (0.0%)



🏃 View run evaluate_prompt_pair_4 at: http://localhost:5000/#/experiments/1/runs/11a1c7bc729344bfbd3f0da5fad21ebb
🧪 View experiment at: http://localhost:5000/#/experiments/1


AttributeError: 'dict' object has no attribute 'improved_react_prompt'

In [44]:
rich.print(new_prompts)

In [None]:
# ==============
# Initial iteration
# ==============

import rich
from typing import Any
from concurrent.futures import ThreadPoolExecutor
import concurrent
import json
from datetime import datetime

mlflow.dspy.autolog(disable=True)

# student_lm = dspy.LM("openrouter/qwen/qwen3-8b")
student_lm = dspy.LM("openai/gpt-5-nano")
teacher_lm = dspy.LM("openai/gpt-5")
dspy.configure(lm=student_lm)

class ReasonAboutTraces(dspy.Signature):
    """Given sets of trajectories and the prompts that were used to generate them
    reason about the traces and how you might improve the prompt from a strategy perspective without overfitting.
    
    Reason through the prompt evolution over time to try and construct the best possible prompt for the next iterations.
    """
    history: list[str] = dspy.InputField()
    current_state: dict[str, Any] = dspy.InputField()
    reasoning: str = dspy.OutputField()
    next_react_prompt: str = dspy.OutputField()
    next_extraction_prompt: str = dspy.OutputField()

# get 20 trajectories
def wrap_student(student_prog, seed=None):
    def wrapped_student(*args, **kwargs):
        with dspy.context(trace=[]):
            result = student_prog(*args, **kwargs, seed=seed)
            trace = dspy.settings.trace
            return [result, trace]
    return wrapped_student

def write_history(history, filename):
    with open(filename, "w") as f:
        jsons = []
        for h in history:
            # If h is not a dict but can be converted via **, do so
            try:
                jsons.append(json.dumps(h, indent=2))
            except TypeError:
                try:
                    jsons.append(json.dumps({**h}, indent=2))
                except Exception:
                    # fallback: string representation as error handling
                    jsons.append(repr(h))
        f.write('[\n' + ',\n'.join(jsons) + '\n]\n')

instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
signature = dspy.Signature("claim -> titles: list[str]", instructions)
# current_react = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=10)
evaluate_devset = dspy.Evaluate(devset=devset, metric=top5_recall, num_threads=25, display_progress=True, max_errors=50)

run_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
prog_save_dir = f"programs/{run_datetime}"
os.makedirs(prog_save_dir, exist_ok=True)

def run_iteration(seed):
    history = []
    programs_over_time = []
    current_react = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=10)
    reasoner = dspy.Predict(ReasonAboutTraces, seed=seed)
    for i in range(4):
        rich.print("="*80,
        f"Running iteration {i}",
        f"Current prompt: {current_react.react.signature.instructions}",
        f"Current extraction prompt: {current_react.extract.predict.signature.instructions}",
        "="*80)
        programs_over_time.append(current_react.deepcopy())
        current_react.save(f"{prog_save_dir}/iter_{seed}_program_{i}.json")
        current_trainset = trainset[i*20:(i+1)*20]
        evaluate_trainset = dspy.Evaluate(devset=current_trainset, metric=lambda e, p, t=None: 0, num_threads=25, display_progress=False, provide_traceback=True)

        # create new trajectories
        with dspy.context(lm=student_lm):
            first_trainset_eval = evaluate_trainset(wrap_student(react, seed=seed))

        # append those to history
        history.append({f"trajectories_iteration_{i}": get_last_traces(first_trainset_eval)})
        history.append({f"react_prompt_for_iteration_{i}": react.react.signature.instructions, f"extraction_prompt_for_iteration_{i}": react.extract.predict.signature.instructions})

        current_state = {"current_react_prompt": react.react.signature.instructions, "current_extraction_prompt": react.extract.predict.signature.instructions}
        # pass to reasoner and modify prompts
        with dspy.context(lm=teacher_lm):
            reasoner_result = reasoner(
                history=history,
                current_state=current_state,
            )
        # append the reasoning and new prompts
        history.append({f"prompt_optimization_iteration_{i}": {**reasoner_result}})

        current_react.react.signature.instructions = reasoner_result.next_react_prompt
        current_react.extract.predict.signature.instructions = reasoner_result.next_extraction_prompt

    # final_evaluate = evaluate_devset(current_react)
    write_history(history, f"{prog_save_dir}/prompt_history_{seed}.json")

    return programs_over_time


# with ThreadPoolExecutor(max_workers=8) as executor:
#     executor.map(run_iteration, range(5))

results = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    # Dictionary mapping future to num for tracking and ordering
    future_to_num = {
        executor.submit(run_iteration, i): i 
        for i in range(5)
    }
    for future in concurrent.futures.as_completed(future_to_num):
        num = future_to_num[future]
        programs = future.result()
        results[num] = programs
    
# import json

final_programs = [results[i][-1] for i in range(5)]

def evaluate_program(program):
    with dspy.context(lm=student_lm):
        final_evaluate = evaluate_devset(program)
    return final_evaluate.score

instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
signature = dspy.Signature("claim -> titles: list[str]", instructions)
original_react = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=10)
original_score = evaluate_program(original_react).score

scores = []
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    future_to_num = {
        executor.submit(evaluate_program, program): i 
        for i, program in enumerate(final_programs)
    }
    for future in concurrent.futures.as_completed(future_to_num):
        num = future_to_num[future]
        scores.append(future.result())


rich.print("="*80)
rich.print("Original prompt score: ", original_score)
rich.print("New prompt scores: ", scores)
rich.print("Average score: ", sum(scores) / len(scores))


# run on 20 new trajectories

2025/12/12 13:08:57 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The opera that Julien (opera) is the sequel, has no more acts than the opera Le roi malgré lui.', 'titles': ['Louise (opera)', 'Le roi malgré lui', 'Julien (opera)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.



[A

Average Metric: 0.00 / 0 (0%):  46%|████▌     | 46/100 [10:46<13:44, 15.26s/it]

2025/12/12 13:08:57 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'A club has hosted matches at the all-seater BayArena since 1958. This club has Willibert Kremer as a scout.', 'titles': ['Bayer 04 Leverkusen', 'Willibert Kremer', 'BayArena']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  32%|███▏      | 32/100 [10:46<13:42, 12.09s/it]

2025/12/12 13:08:58 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)
2025/12/12 13:08:58 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)
2025/12/12 13:08:58 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)
2025/12/12 13:08:58 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)


2025/12/12 13:08:58 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)


2025/12/12 13:09:00 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Keurig Green Mountain specializes in coffee. The company which PICkit is a family of programmers for PIC microcontrollers was made does not specializes in.', 'titles': ['PICkit', 'Microchip Technology', 'Keurig Green Mountain']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:09:00 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)






2025/12/12 13:09:01 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)


2025/12/12 13:09:01 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)


2025/12/12 13:09:01 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)
2025/12/12 13:09:05 ERROR dspy.utils.parallelizer: Error for Example({'claim': "The people who migrated during the Northern and Southern dynasties are the world's largest people group. Their customs and etiquette are the traditional behaviors observed while eating in the Greater China Region.", 'titles': ['Northern and Southern dynasties', 'Customs and etiquette in Chinese dining', 'Han Chinese']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A

2025/12/12 13:09:06 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The musician who died in 1978 was associated with an American Opera Company located in Jackson, Mississippi. He composed the opera A Bayou Legend.', 'titles': ['William Grant Still', 'A Bayou Legend', 'Mississippi Opera']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  33%|███▎      | 33/100 [10:56<12:40, 11.35s/it]

2025/12/12 13:09:17 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'When this short, part of "The Tracy Ullman Show", and featuring the first television appearance of Homer Simpson, aired on "The Simpsons 138th Epsiode Spectacular," Julie Kavner was the voice actress who did the voice of the character named after Matt Groening\'s mother.', 'titles': ['Good Night (The Simpsons short)', 'Homer Simpson', 'Marge Simpson']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.



[A
[A

2025/12/12 13:09:20 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'This woman directed Goodbye First Love. She won the Silver Bear for Best Director for a film staring Tom Courtenay.', 'titles': ['Goodbye First Love', 'Things to Come (2016 film)', 'Mia Hansen-Løve']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:09:33 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Luis Téllez served as Secretary of Energy for the president that served from December 1, 1994- November 30, 2000 as Mexican President. That president headed the San Andres Accord.', 'titles': ['Luis Téllez', 'San Andrés Accords', 'Ernesto Zedillo']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.



[A
[A

2025/12/12 13:09:38 ERROR dspy.utils.parallelizer: Error for Example({'claim': "One of Nevada's commercial airports was named after the same career United States Army officer that Fort Reno (Oklahoma) is also named after.", 'titles': ['Reno–Tahoe International Airport', 'Jesse L. Reno', 'Fort Reno (Oklahoma)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  34%|███▍      | 34/100 [11:28<19:12, 17.46s/it]

2025/12/12 13:09:39 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Federated Auto Parts 400 is an annual Monster Energy NASCAR Cup Series stock car race held at the Richmond Raceway in Richmond, Virginia, being the second of two races in the spring. The first one of this two races was sponsored from 2007 to 2011 by a brand that is owned by Diageo.', 'titles': ['Toyota Owners 400', 'Federated Auto Parts 400', 'Crown Royal']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:09:42 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'There is an airport across the street from the U.S. Coast Guard Air Station San Diego. That airport and the McCarran International Airport are not located in the same place.', 'titles': ['Coast Guard Air Station San Diego', 'McCarran International Airport', 'San Diego International Airport']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.



[A
[A

2025/12/12 13:09:47 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'George Garnett Dunning was active in animation films. He was more active in animation films than the director of The Gay Bride.', 'titles': ['The Gay Bride', 'Jack Conway (filmmaker)', 'George Dunning']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Average Metric: 0.00 / 0 (0%):  76%|███████▌  | 76/100 [11:37<03:40,  9.17s/it]






2025/12/12 13:09:52 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'This woman directed Goodbye First Love. She won the Silver Bear for Best Director for a film staring Tom Courtenay.', 'titles': ['Goodbye First Love', 'Things to Come (2016 film)', 'Mia Hansen-Løve']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A

2025/12/12 13:09:53 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The director of Red Amnesia and Justin Reardon were not both producers. Red Amenesia was a 2014 Chinese thriller film.', 'titles': ['Wang Xiaoshuai', 'Red Amnesia', 'Justin Reardon']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:10:00 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Keith County, Nebraska is located over the 174000 sq mi water area. This area supplies water the the Texas High Plains AVA.', 'titles': ['Ogallala, Nebraska', 'Texas High Plains AVA', 'Ogallala Aquifer']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:10:00 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The film, directed by a former assistant directer of academy award winner Kim Ki-duk on "Rough Cut", starring Song Kang-ho in the title role and selected as the South Korean entry for the Best Foreign Language Film at the 90th Academy Awards, debuted in 2017.', 'titles': ['Jang Hoon', 'A Taxi Driver', 'Kim Ki-duk']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:10:00 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The alt-rock band, who released "My Type", was a group of Elektra Records recording artists that are known to be an indie pop band.', 'titles': ['Saint Motel', 'My Type', 'Saintmotelevision']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A

2025/12/12 13:10:03 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The vehicle that shared it chassis with the Mitsubishi Carisma was the vehicle that won the Auto Trader RAC British Touring Car Championship. It was marketed and produced by a Swedish manufacturer.', 'titles': ['Mitsubishi Carisma', 'Volvo S40', '1998 British Touring Car Championship']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A

2025/12/12 13:10:06 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The opera that Julien (opera) is the sequel, has no more acts than the opera Le roi malgré lui.', 'titles': ['Louise (opera)', 'Le roi malgré lui', 'Julien (opera)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:10:07 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'George Garnett Dunning was active in animation films. He was more active in animation films than the director of The Gay Bride.', 'titles': ['The Gay Bride', 'Jack Conway (filmmaker)', 'George Dunning']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:10:13 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Pelle Almqvist is younger than the saxophonist who played synthesizers on the track "Got a Hold on Me".', 'titles': ['Got a Hold on Me', 'Steve Winwood', 'Pelle Almqvist']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  35%|███▌      | 35/100 [12:02<24:25, 22.54s/it]

2025/12/12 13:10:13 ERROR dspy.utils.parallelizer: Error for Example({'claim': "One of Nevada's commercial airports was named after the same career United States Army officer that Fort Reno (Oklahoma) is also named after.", 'titles': ['Reno–Tahoe International Airport', 'Jesse L. Reno', 'Fort Reno (Oklahoma)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:10:14 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'This woman directed Goodbye First Love. She won the Silver Bear for Best Director for a film staring Tom Courtenay.', 'titles': ['Goodbye First Love', 'Things to Come (2016 film)', 'Mia Hansen-Løve']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:10:18 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'A science fiction Western television show stars an Canadian , director, producer, writer, singer, musician, voice artist and stand-up comedian. Laura Jane Laughlin appeared on this show.', 'titles': ['Legend (TV series)', 'Laura Jane Laughlin', 'John de Lancie']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:10:19 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The notable song from "Oviyum",  by a composer of the Punjabi House soundtr



2025/12/12 13:10:20 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The director who gave Lee Van Cleef a role in "For a Few Dollars More" was the Italian, Jon Paul Puno.', 'titles': ['Sergio Leone', 'Lee Van Cleef', 'Jon Paul Puno']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:10:21 ERROR dspy.utils.parallelizer: Error for Example({'claim': "The people who migrated during the Northern and Southern dynasties are the world's largest people group. Their customs and etiquette are the traditional behaviors observed while eating in the Greater China Region.", 'titles': ['Northern and Southern dynasties', 'Customs and etiquette in Chinese dining', 'Han Chinese']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:10:23 ERROR dspy.utils.parallelizer: Error for Example({'claim': "One of Nevada's commercial airports was named after the same career United States Army officer that Fort Reno (Oklahoma) is also named after.", 'titles': ['Reno–Tahoe International Airport', 'Jesse L. Reno', 'Fort Reno (Oklahoma)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:10:26 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'A club has hosted matches at the all-seater BayArena since 1958. This club has Willibert Kremer as a scout.', 'titles': ['Bayer 04 Leverkusen', 'Willibert Kremer', 'BayArena']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:10:26 ERROR dspy.utils.parallelizer: Error for Example({'claim': "Keith-Lee-Castle played as the owner of the doll in the 2004 release. It and Child's Play 3 are both films.", 'titles': ['Keith-Lee Castle', "Child's Play 3", 'Seed of Chucky']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:10:26 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The alt-rock band, who released "My Type", was a group of Elektra Records recording artists that are known to be an indie pop band.', 'titles': ['Saint Motel', 'My Type', 'Saintmotelevision']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A

2025/12/12 13:10:26 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The film by Sandi Sissel was released before The End of Suburbia.', 'titles': ['Chicken Ranch (film)', 'Sandi Sissel', 'The End of Suburbia']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:10:33 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Latin singer Aaliyah had a video director that directed the music video Soothe My Soul, and also had to face allegation of illegal marriage with R. Kelly.', 'titles': ['Aaliyah', 'Soothe My Soul', 'Warren Fu']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:10:35 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The actor appeared in the 2011 Indian film Ra.One and in the film Revolver that also starred Jason Statham and Ray Liotta.', 'titles': ['Tom Wu', 'Revolver (2005 film)', 'Ra.One']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:10:37 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The director who gave Lee Van Cleef a role in "For a Few Dollars More" was the Italian, Jon Paul Puno.', 'titles': ['Sergio Leone', 'Lee Van Cleef',

Average Metric: 0.00 / 0 (0%):  36%|███▌      | 36/100 [12:26<24:31, 22.99s/it]

2025/12/12 13:10:46 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The band that had former Dutch member Spencer Ludwig is in the pop genre whilst Tweaker is not.', 'titles': ['Tweaker (band)', 'Capital Cities (band)', 'Spencer Ludwig']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:10:48 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'There is an airport across the street from the U.S. Coast Guard Air Station San Diego. That airport and the McCarran International Airport are not located in the same place.', 'titles': ['Coast Guard Air Station San Diego', 'McCarran International Airport', 'San Diego International Airport']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A

2025/12/12 13:10:53 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Keurig Green Mountain specializes in coffee. The company which PICkit is a family of programmers for PIC microcontrollers was made does not specializes in.', 'titles': ['PICkit', 'Microchip Technology', 'Keurig Green Mountain']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  37%|███▋      | 37/100 [12:43<22:09, 21.11s/it]

2025/12/12 13:10:54 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The film by Sandi Sissel was released before The End of Suburbia.', 'titles': ['Chicken Ranch (film)', 'Sandi Sissel', 'The End of Suburbia']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:10:58 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Keith County, Nebraska is located over the 174000 sq mi water area. This area supplies water the the Texas High Plains AVA.', 'titles': ['Ogallala, Nebraska', 'Texas High Plains AVA', 'Ogallala Aquifer']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:11:00 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)


2025/12/12 13:11:01 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'A club has hosted matches at the all-seater BayArena since 1958. This club has Willibert Kremer as a scout.', 'titles': ['Bayer 04 Leverkusen', 'Willibert Kremer', 'BayArena']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:11:01 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)


2025/12/12 13:11:01 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)
2025/12/12 13:11:01 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)
2025/12/12 13:11:01 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)


2025/12/12 13:11:01 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)


2025/12/12 13:11:02 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)
2025/12/12 13:11:03 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'George Garnett Dunning was active in animation films. He was more active in animation films than the director of The Gay Bride.', 'titles': ['The Gay Bride', 'Jack Conway (filmmaker)', 'George Dunning']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  38%|███▊      | 38/100 [12:52<18:08, 17.56s/it]

2025/12/12 13:11:12 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The band that had former Dutch member Spencer Ludwig is in the pop genre whilst Tweaker is not.', 'titles': ['Tweaker (band)', 'Capital Cities (band)', 'Spencer Ludwig']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:11:14 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The vehicle that shared it chassis with the Mitsubishi Carisma was the vehicle that won the Auto Trader RAC British Touring Car Championship. It was marketed and produced by a Swedish manufacturer.', 'titles': ['Mitsubishi Carisma', 'Volvo S40', '1998 British Touring Car Championship']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:11:14 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'National Contest Journal is published with an independent volunteer editor. The periodical the prelude of Birthright (Robinson novel) was published in is not.', 'titles': ['National Contest Journal', 'Birthright (Robinson novel)', 'Doctor Who Magazine']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  39%|███▉      | 39/100 [13:03<15:56, 15.68s/it]

2025/12/12 13:11:20 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The indie band She Wants Revenge and the band that released the album Speakeasy (Freeze the Atlantic album) play music in the genre of rock. Both bands come from different countries.', 'titles': ['Speakeasy (Freeze the Atlantic album)', 'She Wants Revenge', 'Freeze the Atlantic']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A

2025/12/12 13:11:26 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Between the Irish writer writingThe Four-Chambered Heart and Odysseas Elytis, Odysseas Elytis was awarded the Nobel Prize in Literature.', 'titles': ['The Four-Chambered Heart', 'Anaïs Nin', 'Odysseas Elytis']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:11:27 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'A historic fishing town in south Ghana has the N1 passing through it. That town hosts the Fancy Dress Festival.', 'titles': ['Fancy Dress Festival', 'N1 road (Ghana)', 'Winneba']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  40%|████      | 40/100 [13:16<14:45, 14.76s/it]

2025/12/12 13:11:29 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)
2025/12/12 13:11:40 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The opera that Julien (opera) is the sequel, has no more acts than the opera Le roi malgré lui.', 'titles': ['Louise (opera)', 'Le roi malgré lui', 'Julien (opera)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.






Average Metric: 0.00 / 0 (0%):  38%|███▊      | 38/100 [13:30<05:09,  4.99s/it]

2025/12/12 13:11:40 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Jessica Lange starred as an American attorney in the HBO film Michael Sucsy created.', 'titles': ['Grey Gardens (2009 film)', 'Michael Sucsy', 'Phelan Beale']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


[A[A[A[A

[A[A


[A[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

Average Metric: 0.00 / 0 (0%):  76%|███████▌  | 76/100 [13:30<04:15, 10.66s/it]






2025/12/12 13:11:40 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The movie, based on a story written by brothers who are interred at Alter St.-Matthäus-Kirchhof, is not loosely based off the Brother Grimm\'s "Iron Henry".', 'titles': ['Alter St.-Matthäus-Kirchhof', 'The Princess and the Frog', 'The Frog Prince']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:11:49 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Between the Irish writer writingThe Four-Chambered Heart and Odysseas Elytis, Odysseas Elytis was awarded the Nobel Prize in Literature.', 'titles': ['The Four-Chambered Heart', 'Anaïs Nin', 'Odysseas Elytis']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:11:51 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Keith Scholey  co-directed "African Cats" and another documentary with Nicholas for Disneynature. That documentary and Aliens of the Deep were not filmed in the same locations.', 'titles': ['Keith Scholey', 'Aliens of the Deep', 'Bears (film)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:11:55 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'George Garnett Dunning was active in animation films. He was more active in animati

2025/12/12 13:11:56 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)
2025/12/12 13:12:02 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The Miss Universe 2015 was held at a venue inside a casino in Las Vegas. This venue is located in a casino that is owned and operated by Dubuque Greyhound Park & Casino.', 'titles': ['The AXIS', 'Miss Universe 2015', 'Planet Hollywood Las Vegas']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:12:08 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The actor who starred in the 1965 film Frankenstein Meets the Space Monster also appeared on the third episode of the ninth season of a sitcom that was the 187th episode overall.', 'titles': ['Lou Cutell', 'Frankenstein Meets the Space Monster', 'Last Time in New York']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:12:09 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'When this short, part of "The Tracy Ullman Show", and featuring the first television appearance of Homer Simpson, aired on "The Simpsons 138th Epsiode Spectacular," Julie Kavner was the voice actress who did the voice of the character named after Matt Groening\'s mother.', 'titles': ['Good Night (The Simpsons short)', 'Homer Simpson', 'Marge Simpson']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for trac




[A[A[A


[A[A[A

2025/12/12 13:12:09 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The band that had former Dutch member Spencer Ludwig is in the pop genre whilst Tweaker is not.', 'titles': ['Tweaker (band)', 'Capital Cities (band)', 'Spencer Ludwig']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:12:13 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)
2025/12/12 13:12:15 ERROR dspy.utils.parallelizer: Error for Example({'claim': "The child actor, who played the character Fenmore Baldwin, plays in a series that follows a group of friends who run an Irish bar in it's Always Sunny in Philadelphia.", 'titles': ["It's Always Sunny in Philadelphia", 'Robbie Tucker', 'Fenmore Baldwin']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:12:15 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'This woman directed Goodbye First Love. She won the Silver Bear for Best Director for a film staring Tom Courtenay.', 'titles': ['Goodbye First Love', 'Things to Come (2016 film)', 'Mia Hansen-Løve']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  41%|████      | 41/100 [14:04<24:25, 24.84s/it]

2025/12/12 13:12:18 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'This woman directed Goodbye First Love. She won the Silver Bear for Best Director for a film staring Tom Courtenay.', 'titles': ['Goodbye First Love', 'Things to Come (2016 film)', 'Mia Hansen-Løve']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:12:18 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The actor appeared in the 2011 Indian film Ra.One and in the film Revolver that also starred Jason Statham and Ray Liotta.', 'titles': ['Tom Wu', 'Revolver (2005 film)', 'Ra.One']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:12:23 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The Miss Universe 2015 was held at a venue inside a casino in Las Vegas. This venue is located in a casino that is owned and operated by Dubuque Greyhound Park & Casino.', 'titles': ['The AXIS', 'Miss Universe 2015', 'Planet Hollywood Las Vegas']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  42%|████▏     | 42/100 [14:13<19:15, 19.93s/it]

2025/12/12 13:12:24 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'George Hamilton IV performed the song "Abilene" in a 1963 movie. The actress who co-starred with Linda Evans in this movie was Canadian.', 'titles': ['Abilene (song)', 'Ruta Lee', 'Hootenanny Hoot']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


2025/12/12 13:12:30 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)
2025/12/12 13:12:39 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The faster roller coaster, between the ride next to Tidal Wave and Green Lantern has a top speed of 65 mph.', 'titles': ['Tidal Wave (Six Flags Magic Mountain)', "The Riddler's Revenge", 'Green Lantern (Six Flags Great Adventure)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:12:42 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The creator of the fantasy-action game FTL:2448 is best known for his work in the creation of a role-playing game. The game was first published in 1982 by Tri Tac Games.', 'titles': ['Richard Tucholka', 'Fringeworthy', 'FTL:2448']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:12:43 ERROR dspy.utils.parallelizer: Error for Example({'claim': "King of RUS, who had the King Magnus' Halt railway station named after him, launched aggressive military campaigns in this region that was at times independent of external control and was known to the Norse as Southern Isles.", 'titles': ["King Magnus' Halt railway station", 'Kingdom of the Isles', 'Magnus Barefoot']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:12:44 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The film by Sandi Sissel was released before The End of Suburbia.', 'titles': ['Chicken Ranch (film)', 'Sandi Sissel', 'The End of Suburbia']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:12:48 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The actor who starred in the 1965 film Frankenstein Meets the Space Monster also appeared on the third episode of the ninth season of a sitcom that was the 187th episode overall.', 'titles': ['Lou Cutell', 'Frankenstein Meets the Space Monster', 'Last Time in New York']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:12:54 ERROR dspy.utils.parallelizer: Error for Example({'claim': "The 1999 French Open - Women's Doubles runner-up was born on 7 June 1981. She was also a trainer on The Biggest Loser (season 12).", 'titles': ["1999 French Open – Women's Doubles", 'Anna Kournikova', 'The Biggest Loser (season 12)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:13:00 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Luis Téllez served as Secretary o



2025/12/12 13:13:05 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The alt-rock band, who released "My Type", was a group of Elektra Records recording artists that are known to be an indie pop band.', 'titles': ['Saint Motel', 'My Type', 'Saintmotelevision']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:13:05 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The famous opera comique that was based on a Prosper Merimee novella, in which Nell Rankin was particularly admired for her performance in the title role, perhaps the most famous "opéra comique", is a tragedy not a comedy.', 'titles': ['Carmen', 'Opéra comique', 'Nell Rankin']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:13:12 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Keith County, Nebraska is located over the 174000 sq mi water area. This area supplies water the the Texas High Plains AVA.', 'titles': ['Ogallala, Nebraska', 'Texas High Plains AVA', 'Ogallala Aquifer']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:13:17 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The Miss Universe 2015 was held at a venue inside a casino in Las Vegas. This venue is located in a casino that is owned and operated by Dubuque Greyhound Park & Casino.', 'titles': ['The AXIS', 'Miss Universe 2015', 'Planet Hollywood Las Vegas']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:13:20 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Keith County, Nebraska is located over the 174000 sq mi water area. This area supplies water the the Texas High Plains AVA.', 'titles': ['Ogallala, Nebraska', 'Texas High Plains AVA', 'Ogallala Aquifer']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  43%|████▎     | 43/100 [15:09<29:20, 30.89s/it]

2025/12/12 13:13:21 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The alt-rock band, who released "My Type", was a group of Elektra Records recording artists that are known to be an indie pop band.', 'titles': ['Saint Motel', 'My Type', 'Saintmotelevision']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  44%|████▍     | 44/100 [15:10<20:25, 21.88s/it]

2025/12/12 13:13:24 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The creator of the fantasy-action game FTL:2448 is best known for his work in the creation of a role-playing game. The game was first published in 1982 by Tri Tac Games.', 'titles': ['Richard Tucholka', 'Fringeworthy', 'FTL:2448']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  45%|████▌     | 45/100 [15:14<15:01, 16.38s/it]

2025/12/12 13:13:31 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The indie band She Wants Revenge and the band that released the album Speakeasy (Freeze the Atlantic album) play music in the genre of rock. Both bands come from different countries.', 'titles': ['Speakeasy (Freeze the Atlantic album)', 'She Wants Revenge', 'Freeze the Atlantic']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




  0%|          | 0/100 [00:00<?, ?it/s]
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A
[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A

2025/12/12 13:13:35 ERROR dspy.utils.parallelizer: Error for Example({'claim': "The people who migrated during the Northern and Southern dynasties are the world's largest people group. Their customs and etiquette are the traditional behaviors observed while eating in the Greater China Region.", 'titles': ['Northern and Southern dynasties', 'Customs and etiquette in Chinese dining', 'Han Chinese']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  46%|████▌     | 46/100 [15:24<13:10, 14.64s/it]

2025/12/12 13:13:35 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'When this short, part of "The Tracy Ullman Show", and featuring the first television appearance of Homer Simpson, aired on "The Simpsons 138th Epsiode Spectacular," Julie Kavner was the voice actress who did the voice of the character named after Matt Groening\'s mother.', 'titles': ['Good Night (The Simpsons short)', 'Homer Simpson', 'Marge Simpson']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  76%|███████▌  | 76/100 [15:25<04:52, 12.18s/it]






2025/12/12 13:13:53 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'There is an airport across the street from the U.S. Coast Guard Air Station San Diego. That airport and the McCarran International Airport are not located in the same place.', 'titles': ['Coast Guard Air Station San Diego', 'McCarran International Airport', 'San Diego International Airport']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:13:56 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The city where radio personality Ye Sha is from has a higher population than Hengyang.', 'titles': ['Hengyang', 'Shanghai', 'Ye Sha']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:13:56 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'George Hamilton IV performed the song "Abilene" in a 1963 movie. The actress who co-starred with Linda Evans in this movie was Canadian.', 'titles': ['Abilene (song)', 'Ruta Lee', 'Hootenanny Hoot']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.



[A
[A


[A[A[A


[A[A[A

2025/12/12 13:14:01 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The man associated with Sports & Wine and Nic Offer are both considered to be musicians.', 'titles': ['Sports &amp; Wine', 'Nic Offer', 'Ben Folds']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:14:01 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Between the Irish writer writingThe Four-Chambered Heart and Odysseas Elytis, Odysseas Elytis was awarded the Nobel Prize in Literature.', 'titles': ['The Four-Chambered Heart', 'Anaïs Nin', 'Odysseas Elytis']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:14:02 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The creator of the fantasy-action game FTL:2448 is best known for his work in the creation of a role-playing game. The game was first published in 1982 by Tri Tac Games.', 'titles



2025/12/12 13:14:07 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Liberal Arts school Emory University was founded before the school that William Ridley Wills is a graduate of.', 'titles': ['Emory University', 'Vanderbilt University', 'William Ridley Wills']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:14:13 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The area the Oregon Portage Railroad ran to is located farther out than Lake Worth Lagoon.', 'titles': ['Oregon Portage Railroad', 'Lake Worth Lagoon', 'Cascade Locks and Canal']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:14:16 ERROR dspy.utils.parallelizer: Error for Example({'claim': "King of RUS, who had the King Magnus' Halt railway station named after him, launched aggressive military campaigns in this region that was at times independent of ext

Average Metric: 0.00 / 0 (0%):  47%|████▋     | 47/100 [16:16<22:49, 25.83s/it]

2025/12/12 13:14:27 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The magazine that ranked Lifted Research Group at #5 on its Hot 500 list of fastest-growing companies had a longer lifespan than Optimize.', 'titles': ['Entrepreneur (magazine)', 'Optimize (magazine)', 'Lifted Research Group']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  48%|████▊     | 48/100 [16:17<15:47, 18.22s/it]


[A[A[A


[A[A[A


[A[A[A


[A[A[A

2025/12/12 13:14:38 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The city where radio personality Ye Sha is from has a higher population than Hengyang.', 'titles': ['Hengyang', 'Shanghai', 'Ye Sha']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:14:38 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'When this short, part of "The Tracy Ullman Show", and featuring the first television appearance of Homer Simpson, aired on "The Simpsons 138th Epsiode Spectacular," Julie Kavner was the voice actress who did the voice of the character named after Matt Groening\'s mother.', 'titles': ['Good Night (The Simpsons short)', 'Homer Simpson', 'Marge Simpson']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  49%|████▉     | 49/100 [16:28<13:40, 16.08s/it]

2025/12/12 13:14:42 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Keith County, Nebraska is located over the 174000 sq mi water area. This area supplies water the the Texas High Plains AVA.', 'titles': ['Ogallala, Nebraska', 'Texas High Plains AVA', 'Ogallala Aquifer']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:14:44 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'There is an airport across the street from the U.S. Coast Guard Air Station San Diego. That airport and the McCarran International Airport are not located in the same place.', 'titles': ['Coast Guard Air Station San Diego', 'McCarran International Airport', 'San Diego International Airport']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:14:50 ERROR dspy.utils.parallelizer: Error for Example({'claim': "The people who migrated during the Northe




[A[A[A


[A[A[A
[A
[A


[A[A[A


[A[A[A

2025/12/12 13:14:55 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Tome Sizemore had a role in a movie that starred Melinda Lopez and Bruce Payne. Richard Nord worked on this film.', 'titles': ['Richard Nord', 'Tom Sizemore', 'Passenger 57']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:14:59 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The famous opera comique that was based on a Prosper Merimee novella, in which Nell Rankin was particularly admired for her performance in the title role, perhaps the most famous "opéra comique", is a tragedy not a comedy.', 'titles': ['Carmen', 'Opéra comique', 'Nell Rankin']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:15:00 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Liberal Arts school Emory University was founded before the school that William Ridley Wills is a graduate of.', 'titles': ['Emory University', 'Vanderbilt University', 'William Ridley Wills']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:15:04 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Keith Scholey  co-directed "African Cats" and another documentary w




[A[A[A


[A[A[A

2025/12/12 13:15:22 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Latin singer Aaliyah had a video director that directed the music video Soothe My Soul, and also had to face allegation of illegal marriage with R. Kelly.', 'titles': ['Aaliyah', 'Soothe My Soul', 'Warren Fu']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  76%|███████▌  | 76/100 [17:12<05:25, 13.58s/it]







[A[A[A


[A[A[A

2025/12/12 13:15:28 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The vehicle that shared it chassis with the Mitsubishi Carisma was the vehicle that won the Auto Trader RAC British Touring Car Championship. It was marketed and produced by a Swedish manufacturer.', 'titles': ['Mitsubishi Carisma', 'Volvo S40', '1998 British Touring Car Championship']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:15:29 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The birthplace of Korean Li Ye (speed skater) has a greater population that Huainan. They were born on born the 26 December 1983.', 'titles': ['Huainan', 'Li Ye (speed skater)', 'Changchun']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

Average Metric: 0.67 / 1 (66.7%):   1%|          | 1/100 [01:56<3:11:51, 116.27s/it]

2025/12/12 13:15:32 ERROR dspy.utils.parallelizer: Error for Example({'claim': "King of RUS, who had the King Magnus' Halt railway station named after him, launched aggressive military campaigns in this region that was at times independent of external control and was known to the Norse as Southern Isles.", 'titles': ["King Magnus' Halt railway station", 'Kingdom of the Isles', 'Magnus Barefoot']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:15:37 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Linzhou, Henan is bigger than area where the Liaoning Finance and Trade College is located.', 'titles': ['Linzhou, Henan', 'Liaoning Finance and Trade College', 'Xingcheng']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:15:40 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Liberal Arts school Emory University was founded before the school that William Ridley Wills is a graduate of.', 'titles': ['Emory University', 'Vanderbilt University', 'William Ridley Wills']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:15:40 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The indie band She Wants Revenge and the band that released the album Speakeasy (Freeze the Atlantic album) play music in the genre of rock. Both bands come from different countries.', 'titles': ['Speakeasy (Freeze the Atlantic album)', 'She Wants Revenge', 'Freeze the Atlantic']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:15:41 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'This individual played with Steve Denton in the June 1983 Dallas



[A[A

[A[A

2025/12/12 13:15:58 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Luis Téllez served as Secretary of Energy for the president that served from December 1, 1994- November 30, 2000 as Mexican President. That president headed the San Andres Accord.', 'titles': ['Luis Téllez', 'San Andrés Accords', 'Ernesto Zedillo']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:15:59 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Luis Téllez served as Secretary of Energy for the president that served from December 1, 1994- November 30, 2000 as Mexican President. That president headed the San Andres Accord.', 'titles': ['Luis Téllez', 'San Andrés Accords', 'Ernesto Zedillo']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


Average Metric: 0.00 / 0 (0%):  76%|███████▌  | 76/100 [17:48<05:37, 14.06s/it]




Average Metric: 1.00 / 2 (50.0%):   2%|▏         | 2/100 [02:26<1:46:54, 65.45s/it] 

2025/12/12 13:16:00 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Luis Téllez served as Secretary of Energy for the president that served from December 1, 1994- November 30, 2000 as Mexican President. That president headed the San Andres Accord.', 'titles': ['Luis Téllez', 'San Andrés Accords', 'Ernesto Zedillo']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:16:05 ERROR dspy.utils.parallelizer: Error for Example({'claim': "King of RUS, who had the King Magnus' Halt railway station named after him, launched aggressive military campaigns in this region that was at times independent of external control and was known to the Norse as Southern Isles.", 'titles': ["King Magnus' Halt railway station", 'Kingdom of the Isles', 'Magnus Barefoot']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:16:06 ERROR dspy.utils.parallelizer:


[A
Average Metric: 1.67 / 3 (55.6%):   3%|▎         | 3/100 [02:43<1:10:33, 43.65s/it]


[A[A[A


[A[A[A


[A[A[A


[A[A[A
[A
[A

2025/12/12 13:16:26 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'A science fiction Western television show stars an Canadian , director, producer, writer, singer, musician, voice artist and stand-up comedian. Laura Jane Laughlin appeared on this show.', 'titles': ['Legend (TV series)', 'Laura Jane Laughlin', 'John de Lancie']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A

2025/12/12 13:16:36 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Between the Irish writer writingThe Four-Chambered Heart and Odysseas Elytis, Odysseas Elytis was awarded the Nobel Prize in Literature.', 'titles': ['The Four-Chambered Heart', 'Anaïs Nin', 'Odysseas Elytis']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:16:38 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'There is an airport across the street from the U.S. Coast Guard Air Station San Diego. That airport and the McCarran International Airport are not located in the same place.', 'titles': ['Coast Guard Air Station San Diego', 'McCarran International Airport', 'San Diego International Airport']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:16:39 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The Miss Universe 2015 was held at 



[A[A

[A[A

[A[A

[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A

2025/12/12 13:16:56 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Between the Irish writer writingThe Four-Chambered Heart and Odysseas Elytis, Odysseas Elytis was awarded the Nobel Prize in Literature.', 'titles': ['The Four-Chambered Heart', 'Anaïs Nin', 'Odysseas Elytis']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:16:57 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The city where radio personality Ye Sha is from has a higher population than Hengyang.', 'titles': ['Hengyang', 'Shanghai', 'Ye Sha']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:17:00 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The film by Sandi Sissel was released before The End of Suburbia.', 'titles': ['Chicken Ranch (film)', 'Sandi Sissel', 'The End of Suburbia']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:17:01 ERROR dspy.utils.parallelizer: Error for Example({'claim': "King of RUS, who had the King Magnus' Halt railway station named after him, launched aggressive military campaigns in this region that was at times independent of external control and was known to the Norse as Southern Isles.", 'titles': ["King Magnus' Halt rail


[A
[A


[A[A[A


[A[A[A

2025/12/12 13:17:08 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Liberal Arts school Emory University was founded before the school that William Ridley Wills is a graduate of.', 'titles': ['Emory University', 'Vanderbilt University', 'William Ridley Wills']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

2025/12/12 13:17:16 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The city where radio personality Ye Sha is from has a higher population than Hengyang.', 'titles': ['Hengyang', 'Shanghai', 'Ye Sha']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

Average Metric: 2.00 / 4 (50.0%):   4%|▍         | 4/100 [03:48<1:23:12, 52.00s/it]

2025/12/12 13:17:23 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The actor appeared in the 2011 Indian film Ra.One and in the film Revolver that also starred Jason Statham and Ray Liotta.', 'titles': ['Tom Wu', 'Revolver (2005 film)', 'Ra.One']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 3.67 / 6 (61.1%):   6%|▌         | 6/100 [03:51<35:42, 22.79s/it]  

2025/12/12 13:17:27 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Keith Scholey  co-directed "African Cats" and another documentary with Nicholas for Disneynature. That documentary and Aliens of the Deep were not filmed in the same locations.', 'titles': ['Keith Scholey', 'Aliens of the Deep', 'Bears (film)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:17:29 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The man associated with Sports & Wine and Nic Offer are both considered to be musicians.', 'titles': ['Sports &amp; Wine', 'Nic Offer', 'Ben Folds']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:17:30 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The dog breed that is a cousin of the German longhaired pointer was developed in Pescara and not the Bracco Italiano.', 'titles': ['German Longh



[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

2025/12/12 13:17:37 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The film, directed by a former assistant directer of academy award winner Kim Ki-duk on "Rough Cut", starring Song Kang-ho in the title role and selected as the South Korean entry for the Best Foreign Language Film at the 90th Academy Awards, debuted in 2017.', 'titles': ['Jang Hoon', 'A Taxi Driver', 'Kim Ki-duk']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:17:37 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'George Garnett Dunning was active in animation films. He was more active in animation films than the director of The Gay Bride.', 'titles': ['The Gay Bride', 'Jack Conway (filmmaker)', 'George Dunning']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 6.00 / 9 (66.7%):   8%|▊         | 8/100 [04:06<21:14, 13.85s/it]

2025/12/12 13:17:45 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'This actor starred in Drunk Parents. The character he played in "Spider-Man" played football.', 'titles': ['Joe Manganiello', 'Drunk Parents', 'Flash Thompson']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:17:49 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The indie band She Wants Revenge and the band that released the album Speakeasy (Freeze the Atlantic album) play music in the genre of rock. Both bands come from different countries.', 'titles': ['Speakeasy (Freeze the Atlantic album)', 'She Wants Revenge', 'Freeze the Atlantic']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.



[A
[A


[A[A[A


Average Metric: 7.00 / 10 (70.0%):  10%|█         | 10/100 [04:24<17:28, 11.65s/it]


[A[A[A


Average Metric: 7.67 / 11 (69.7%):  11%|█         | 11/100 [04:32<15:57, 10.76s/it]

2025/12/12 13:18:10 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Liberal Arts school Emory University was founded before the school that William Ridley Wills is a graduate of.', 'titles': ['Emory University', 'Vanderbilt University', 'William Ridley Wills']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.



[A
[A

2025/12/12 13:18:13 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Keith Scholey  co-directed "African Cats" and another documentary with Nicholas for Disneynature. That documentary and Aliens of the Deep were not filmed in the same locations.', 'titles': ['Keith Scholey', 'Aliens of the Deep', 'Bears (film)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:18:15 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The actor who starred in the 1965 film Frankenstein Meets the Space Monster also appeared on the third episode of the ninth season of a sitcom that was the 187th episode overall.', 'titles': ['Lou Cutell', 'Frankenstein Meets the Space Monster', 'Last Time in New York']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.



[A
[A

[A[A

[A[A
[A
[A

2025/12/12 13:18:23 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'This actor starred in Drunk Parents. The character he played in "Spider-Man" played football.', 'titles': ['Joe Manganiello', 'Drunk Parents', 'Flash Thompson']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:18:24 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The birthplace of Korean Li Ye (speed skater) has a greater population that Huainan. They were born on born the 26 December 1983.', 'titles': ['Huainan', 'Li Ye (speed skater)', 'Changchun']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A
[A
[A

2025/12/12 13:18:37 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The birthplace of Korean Li Ye (speed skater) has a greater population that Huainan. They were born on born the 26 December 1983.', 'titles': ['Huainan', 'Li Ye (speed skater)', 'Changchun']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A

[A[A

[A[A
[A
[A

2025/12/12 13:18:50 ERROR dspy.utils.parallelizer: Error for Example({'claim': "The 1999 French Open - Women's Doubles runner-up was born on 7 June 1981. She was also a trainer on The Biggest Loser (season 12).", 'titles': ["1999 French Open – Women's Doubles", 'Anna Kournikova', 'The Biggest Loser (season 12)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:18:50 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Linzhou, Henan is bigger than area where the Liaoning Finance and Trade College is located.', 'titles': ['Linzhou, Henan', 'Liaoning Finance and Trade College', 'Xingcheng']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A

2025/12/12 13:18:56 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The indie band She Wants Revenge and the band that released the album Speakeasy (Freeze the Atlantic album) play music in the genre of rock. Both bands come from different countries.', 'titles': ['Speakeasy (Freeze the Atlantic album)', 'She Wants Revenge', 'Freeze the Atlantic']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:19:01 ERROR dspy.utils.parallelizer: Error for Example({'claim': "The 1999 French Open - Women's Doubles runner-up was born on 7 June 1981. She was also a trainer on The Biggest Loser (season 12).", 'titles': ["1999 French Open – Women's Doubles", 'Anna Kournikova', 'The Biggest Loser (season 12)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


Average Metric: 8.00 / 12 (66.7%):  12%|█▏        | 12/100 [05:31<34:30, 23.53s/it]

2025/12/12 13:19:06 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'This actor starred in Drunk Parents. The character he played in "Spider-Man" played football.', 'titles': ['Joe Manganiello', 'Drunk Parents', 'Flash Thompson']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.



[A
[A

2025/12/12 13:19:09 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The city where radio personality Ye Sha is from has a higher population than Hengyang.', 'titles': ['Hengyang', 'Shanghai', 'Ye Sha']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.



[A
Average Metric: 8.00 / 13 (61.5%):  13%|█▎        | 13/100 [05:38<27:36, 19.04s/it]

[A[A

Average Metric: 9.33 / 15 (62.2%):  15%|█▌        | 15/100 [05:54<18:23, 12.99s/it]

2025/12/12 13:19:37 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'George Hamilton IV performed the song "Abilene" in a 1963 movie. The actress who co-starred with Linda Evans in this movie was Canadian.', 'titles': ['Abilene (song)', 'Ruta Lee', 'Hootenanny Hoot']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 9.67 / 16 (60.4%):  16%|█▌        | 16/100 [06:05<17:24, 12.43s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

2025/12/12 13:19:47 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'This actor starred in Drunk Parents. The character he played in "Spider-Man" played football.', 'titles': ['Joe Manganiello', 'Drunk Parents', 'Flash Thompson']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 10.67 / 17 (62.7%):  17%|█▋        | 17/100 [06:21<18:38, 13.48s/it]

[A[A

[A[A

2025/12/12 13:19:56 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The actor who starred in the 1965 film Frankenstein Meets the Space Monster also appeared on the third episode of the ninth season of a sitcom that was the 187th episode overall.', 'titles': ['Lou Cutell', 'Frankenstein Meets the Space Monster', 'Last Time in New York']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 11.33 / 18 (63.0%):  18%|█▊        | 18/100 [06:25<14:28, 10.60s/it]

[A[A

[A[A

[A[A

Average Metric: 11.67 / 19 (61.4%):  19%|█▉        | 19/100 [06:34<13:47, 10.22s/it]

[A[A

[A[A
[A
[A

2025/12/12 13:20:13 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Keith Scholey  co-directed "African Cats" and another documentary with Nicholas for Disneynature. That documentary and Aliens of the Deep were not filmed in the same locations.', 'titles': ['Keith Scholey', 'Aliens of the Deep', 'Bears (film)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:20:14 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'A historic fishing town in south Ghana has the N1 passing through it. That town hosts the Fancy Dress Festival.', 'titles': ['Fancy Dress Festival', 'N1 road (Ghana)', 'Winneba']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:20:26 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The man associated with Sports & Wine and Nic Offer are both considered to be musicians.', 'titles': ['Sports &amp



[A[A

[A[A


[A[A[A


[A[A[A

[A[A

[A[A


[A[A[A


Average Metric: 12.33 / 21 (58.7%):  21%|██        | 21/100 [07:02<14:34, 11.07s/it]

2025/12/12 13:20:41 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The dog breed that is a cousin of the German longhaired pointer was developed in Pescara and not the Bracco Italiano.', 'titles': ['German Longhaired Pointer', 'Large Münsterländer', 'Bracco Italiano']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

Average Metric: 13.33 / 23 (58.0%):  23%|██▎       | 23/100 [07:15<10:45,  8.39s/it]


[A[A[A


[A[A[A

[A[A

[A[A

2025/12/12 13:20:52 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The actor appeared in the 2011 Indian film Ra.One and in the film Revolver that also starred Jason Statham and Ray Liotta.', 'titles': ['Tom Wu', 'Revolver (2005 film)', 'Ra.One']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:20:53 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'When this short, part of "The Tracy Ullman Show", and featuring the first television appearance of Homer Simpson, aired on "The Simpsons 138th Epsiode Spectacular," Julie Kavner was the voice actress who did the voice of the character named after Matt Groening\'s mother.', 'titles': ['Good Night (The Simpsons short)', 'Homer Simpson', 'Marge Simpson']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:20:56 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The 


[A
[A


[A[A[A


[A[A[A
[A
[A
[A
[A


[A[A[A


[A[A[A

2025/12/12 13:21:11 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The dog breed that is a cousin of the German longhaired pointer was developed in Pescara and not the Bracco Italiano.', 'titles': ['German Longhaired Pointer', 'Large Münsterländer', 'Bracco Italiano']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 14.00 / 24 (58.3%):  24%|██▍       | 24/100 [07:40<16:36, 13.11s/it]

[A[A

Average Metric: 14.67 / 25 (58.7%):  25%|██▌       | 25/100 [07:40<11:36,  9.29s/it]

2025/12/12 13:21:15 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The man associated with Sports & Wine and Nic Offer are both considered to be musicians.', 'titles': ['Sports &amp; Wine', 'Nic Offer', 'Ben Folds']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:21:21 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'George Hamilton IV performed the song "Abilene" in a 1963 movie. The actress who co-starred with Linda Evans in this movie was Canadian.', 'titles': ['Abilene (song)', 'Ruta Lee', 'Hootenanny Hoot']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A
[A
[A

[A[A

[A[A
[A
[A


[A[A[A


[A[A[A
[A
[A

2025/12/12 13:21:48 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The area the Oregon Portage Railroad ran to is located farther out than Lake Worth Lagoon.', 'titles': ['Oregon Portage Railroad', 'Lake Worth Lagoon', 'Cascade Locks and Canal']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):   2%|▏         | 2/100 [55:39<45:26:56, 1669.56s/it]


2025/12/12 13:21:49 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Tome Sizemore had a role in a movie that starred Melinda Lopez and Bruce Payne. Richard Nord worked on this film.', 'titles': ['Richard Nord', 'Tom Sizemore', 'Passenger 57']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 15.33 / 26 (59.0%):  26%|██▌       | 26/100 [08:16<21:27, 17.39s/it]


[A[A[A


[A[A[A

[A[A

[A[A

[A[A

[A[A


[A[A[A


[A[A[A

2025/12/12 13:21:59 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'This actor starred in Drunk Parents. The character he played in "Spider-Man" played football.', 'titles': ['Joe Manganiello', 'Drunk Parents', 'Flash Thompson']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.



[A
[A


[A[A[A


Average Metric: 17.00 / 28 (60.7%):  28%|██▊       | 28/100 [08:41<16:36, 13.84s/it]

[A[A

[A[A
[A
Average Metric: 17.67 / 29 (60.9%):  29%|██▉       | 29/100 [08:52<15:24, 13.02s/it]


[A[A[A


[A[A[A


[A[A[A


[A[A[A
[A
[A

[A[A

[A[A
[A
[A


[A[A[A


[A[A[A

[A[A

Average Metric: 18.00 / 30 (60.0%):  30%|███       | 30/100 [09:22<21:12, 18.18s/it]
[A
[A

2025/12/12 13:22:59 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The dog breed that is a cousin of the German longhaired pointer was developed in Pescara and not the Bracco Italiano.', 'titles': ['German Longhaired Pointer', 'Large Münsterländer', 'Bracco Italiano']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:23:00 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The faster roller coaster, between the ride next to Tidal Wave and Green Lantern has a top speed of 65 mph.', 'titles': ['Tidal Wave (Six Flags Magic Mountain)', "The Riddler's Revenge", 'Green Lantern (Six Flags Great Adventure)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 18.67 / 31 (60.2%):  31%|███       | 31/100 [09:26<16:02, 13.95s/it]


[A[A[A


[A[A[A


[A[A[A


[A[A[A

[A[A

[A[A

[A[A

Average Metric: 20.00 / 33 (60.6%):  33%|███▎      | 33/100 [09:52<14:08, 12.66s/it]

2025/12/12 13:23:26 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Tome Sizemore had a role in a movie that starred Melinda Lopez and Bruce Payne. Richard Nord worked on this film.', 'titles': ['Richard Nord', 'Tom Sizemore', 'Passenger 57']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:23:27 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The birthplace of Korean Li Ye (speed skater) has a greater population that Huainan. They were born on born the 26 December 1983.', 'titles': ['Huainan', 'Li Ye (speed skater)', 'Changchun']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:23:30 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Linzhou, Henan is bigger than area where the Liaoning Finance and Trade College is located.', 'titles': ['Linzhou, Henan', 'Liaoning Finance and Trade College', 'Xingcheng'



[A[A

[A[A


[A[A[A


Average Metric: 20.67 / 34 (60.8%):  34%|███▍      | 34/100 [10:07<14:45, 13.41s/it]

[A[A

[A[A

2025/12/12 13:23:50 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Latin singer Aaliyah had a video director that directed the music video Soothe My Soul, and also had to face allegation of illegal marriage with R. Kelly.', 'titles': ['Aaliyah', 'Soothe My Soul', 'Warren Fu']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




2025/12/12 13:23:54 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The actor who starred in the 1965 film Frankenstein Meets the Space Monster also appeared on the third episode of the ninth season of a sitcom that was the 187th episode overall.', 'titles': ['Lou Cutell', 'Frankenstein Meets the Space Monster', 'Last Time in New York']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 21.67 / 36 (60.2%):  36%|███▌      | 36/100 [10:31<12:35, 11.81s/it]

[A[A

Average Metric: 22.33 / 37 (60.4%):  37%|███▋      | 37/100 [10:32<08:53,  8.47s/it]

2025/12/12 13:24:06 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The faster roller coaster, between the ride next to Tidal Wave and Green Lantern has a top speed of 65 mph.', 'titles': ['Tidal Wave (Six Flags Magic Mountain)', "The Riddler's Revenge", 'Green Lantern (Six Flags Great Adventure)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 22.67 / 38 (59.6%):  38%|███▊      | 38/100 [10:34<06:51,  6.64s/it]
[A
[A


[A[A[A


[A[A[A

[A[A

[A[A


[A[A[A


[A[A[A

2025/12/12 13:24:19 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Latin singer Aaliyah had a video director that directed the music video Soothe My Soul, and also had to face allegation of illegal marriage with R. Kelly.', 'titles': ['Aaliyah', 'Soothe My Soul', 'Warren Fu']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A

2025/12/12 13:24:21 ERROR dspy.utils.parallelizer: Error for Example({'claim': "The 1999 French Open - Women's Doubles runner-up was born on 7 June 1981. She was also a trainer on The Biggest Loser (season 12).", 'titles': ["1999 French Open – Women's Doubles", 'Anna Kournikova', 'The Biggest Loser (season 12)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 22.67 / 39 (58.1%):  39%|███▉      | 39/100 [10:59<12:13, 12.02s/it]
[A
[A

2025/12/12 13:24:36 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The birthplace of Korean Li Ye (speed skater) has a greater population that Huainan. They were born on born the 26 December 1983.', 'titles': ['Huainan', 'Li Ye (speed skater)', 'Changchun']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:24:38 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The area the Oregon Portage Railroad ran to is located farther out than Lake Worth Lagoon.', 'titles': ['Oregon Portage Railroad', 'Lake Worth Lagoon', 'Cascade Locks and Canal']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A


[A[A[A


[A[A[A
[A
Average Metric: 23.33 / 40 (58.3%):  40%|████      | 40/100 [11:15<13:15, 13.26s/it]


[A[A[A


Average Metric: 24.00 / 41 (58.5%):  41%|████      | 41/100 [11:21<10:47, 10.97s/it]
[A
[A

[A[A

[A[A

[A[A

[A[A

2025/12/12 13:25:06 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'A science fiction Western television show stars an Canadian , director, producer, writer, singer, musician, voice artist and stand-up comedian. Laura Jane Laughlin appeared on this show.', 'titles': ['Legend (TV series)', 'Laura Jane Laughlin', 'John de Lancie']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A

[A[A

[A[A


[A[A[A


[A[A[A


[A[A[A


Average Metric: 25.00 / 42 (59.5%):  42%|████▏     | 42/100 [11:57<17:59, 18.62s/it]

[A[A

[A[A
[A
Average Metric: 26.00 / 43 (60.5%):  43%|████▎     | 43/100 [12:02<13:36, 14.33s/it]
[A
[A
[A
[A

2025/12/12 13:25:51 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'A science fiction Western television show stars an Canadian , director, producer, writer, singer, musician, voice artist and stand-up comedian. Laura Jane Laughlin appeared on this show.', 'titles': ['Legend (TV series)', 'Laura Jane Laughlin', 'John de Lancie']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


Average Metric: 27.33 / 45 (60.7%):  45%|████▌     | 45/100 [12:28<11:45, 12.83s/it]

[A[A

Average Metric: 28.00 / 46 (60.9%):  46%|████▌     | 46/100 [12:31<08:53,  9.88s/it]

[A[A

Average Metric: 29.00 / 47 (61.7%):  47%|████▋     | 47/100 [12:34<06:52,  7.79s/it]
[A
[A

2025/12/12 13:26:13 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'George Hamilton IV performed the song "Abilene" in a 1963 movie. The actress who co-starred with Linda Evans in this movie was Canadian.', 'titles': ['Abilene (song)', 'Ruta Lee', 'Hootenanny Hoot']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.



[A
[A

2025/12/12 13:26:26 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The faster roller coaster, between the ride next to Tidal Wave and Green Lantern has a top speed of 65 mph.', 'titles': ['Tidal Wave (Six Flags Magic Mountain)', "The Riddler's Revenge", 'Green Lantern (Six Flags Great Adventure)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.
2025/12/12 13:26:29 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The man associated with Sports & Wine and Nic Offer are both considered to be musicians.', 'titles': ['Sports &amp; Wine', 'Nic Offer', 'Ben Folds']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A

[A[A

[A[A

2025/12/12 13:26:32 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Tome Sizemore had a role in a movie that starred Melinda Lopez and Bruce Payne. Richard Nord worked on this film.', 'titles': ['Richard Nord', 'Tom Sizemore', 'Passenger 57']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A


[A[A[A


[A[A[A
[A
[A


[A[A[A


[A[A[A

[A[A

[A[A

[A[A

Average Metric: 29.67 / 49 (60.5%):  48%|████▊     | 48/100 [13:15<15:17, 17.64s/it]
[A
[A

2025/12/12 13:27:08 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'Tome Sizemore had a role in a movie that starred Melinda Lopez and Bruce Payne. Richard Nord worked on this film.', 'titles': ['Richard Nord', 'Tom Sizemore', 'Passenger 57']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.





[A[A[A


[A[A[A
[A
[A
[A
[A

[A[A

[A[A
[A
[A

2025/12/12 13:27:31 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The faster roller coaster, between the ride next to Tidal Wave and Green Lantern has a top speed of 65 mph.', 'titles': ['Tidal Wave (Six Flags Magic Mountain)', "The Riddler's Revenge", 'Green Lantern (Six Flags Great Adventure)']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.


Average Metric: 30.33 / 50 (60.7%):  50%|█████     | 50/100 [14:06<17:46, 21.34s/it]

[A[A

Average Metric: 31.67 / 52 (60.9%):  52%|█████▏    | 52/100 [14:24<12:15, 15.32s/it]


[A[A[A


Average Metric: 32.33 / 53 (61.0%):  53%|█████▎    | 53/100 [14:34<10:41, 13.66s/it]
[A
[A


[A[A[A


[A[A[A
[A
Average Metric: 33.00 / 54 (61.1%):  54%|█████▍    | 54/100 [14:54<11:55, 15.56s/it]


[A[A[A


[A[A[A


[A[A[A


[A[A[A

[A[A

Average Metric: 33.67 / 56 (60.1%):  56%|█████▌    | 56/100 [15:12<08:42, 11.88s/it]

[A[A

[A[A
[A
Average Metric: 34.00 / 57 (59.6%):  57%|█████▋    | 57/100 [15:24<08:35, 11.99s/it]
[A
[A
[A
[A
[A
Average Metric: 35.00 / 58 (60.3%):  58%|█████▊    | 58/100 [15:43<09:49, 14.04s/it]


[A[A[A


Average Metric: 35.67 / 60 (59.4%):  60%|██████    | 60/100 [15:48<05:21,  8.05s/it]

[A[A

[A[A

[A[A

Average Metric: 37.67 / 63 (59.8%):  63%|██████▎   | 63/100 [16:00<02:56,  4.78s/it]

[A[A

[A[A
[A
Average Metric: 38.3

2025/12/12 13:31:01 INFO dspy.evaluate.evaluate: Average Metric: 70.33333333333333 / 100 (70.3%)



Average Metric: 43.33 / 72 (60.2%):  72%|███████▏  | 72/100 [17:27<05:28, 11.74s/it]

[A[A

[A[A

[A[A

[A[A


[A[A[A


[A[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

Average Metric: 44.00 / 73 (60.3%):  73%|███████▎  | 73/100 [18:00<08:07, 18.06s/it]

2025/12/12 13:31:34 ERROR dspy.utils.parallelizer: Error for Example({'claim': 'The dog breed that is a cousin of the German longhaired pointer was developed in Pescara and not the Bracco Italiano.', 'titles': ['German Longhaired Pointer', 'Large Münsterländer', 'Bracco Italiano']}) (input_keys={'claim'}): 'list' object has no attribute 'titles'. Set `provide_traceback=True` for traceback.




[A[A

[A[A


[A[A[A


Average Metric: 69.00 / 100 (69.0%): : 102it [18:12, 10.71s/it]

2025/12/12 13:31:46 INFO dspy.evaluate.evaluate: Average Metric: 69.0 / 100 (69.0%)



Average Metric: 47.33 / 79 (59.9%):  79%|███████▉  | 79/100 [18:34<01:52,  5.38s/it]

[A[A

Average Metric: 48.33 / 80 (60.4%):  80%|████████  | 80/100 [18:37<01:34,  4.71s/it]
[A
[A

[A[A

[A[A

[A[A

Average Metric: 49.00 / 81 (60.5%):  81%|████████  | 81/100 [18:39<01:17,  4.08s/it]

[A[A

Average Metric: 49.67 / 82 (60.6%):  82%|████████▏ | 82/100 [18:48<01:38,  5.45s/it]

[A[A

[A[A

[A[A

Average Metric: 50.67 / 83 (61.0%):  83%|████████▎ | 83/100 [18:56<01:45,  6.19s/it]

[A[A

Average Metric: 51.00 / 84 (60.7%):  84%|████████▍ | 84/100 [18:58<01:18,  4.90s/it]

[A[A

Average Metric: 51.33 / 85 (60.4%):  85%|████████▌ | 85/100 [19:05<01:25,  5.72s/it]

[A[A

Average Metric: 51.33 / 86 (59.7%):  86%|████████▌ | 86/100 [19:09<01:10,  5.01s/it]

[A[A

Average Metric: 51.33 / 87 (59.0%):  87%|████████▋ | 87/100 [19:29<02:04,  9.60s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A
[A
Average Metric: 52.00 / 88 (59.1%):  88%|████████▊ | 88/100 [19:44<02:

2025/12/12 13:35:15 INFO dspy.evaluate.evaluate: Average Metric: 73.0 / 100 (73.0%)





[A[A

Average Metric: 57.67 / 100 (57.7%): : 102it [22:38, 13.32s/it]

2025/12/12 13:36:12 INFO dspy.evaluate.evaluate: Average Metric: 57.666666666666664 / 100 (57.7%)





[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

Average Metric: 62.67 / 100 (62.7%): : 101it [25:19, 15.05s/it]

2025/12/12 13:38:53 INFO dspy.evaluate.evaluate: Average Metric: 62.666666666666664 / 100 (62.7%)





In [64]:
# print(len(history))
with open("prompt_history.json", "w") as f:
    jsons = []
    for h in history:
        # If h is not a dict but can be converted via **, do so
        try:
            jsons.append(json.dumps(h, indent=2))
        except TypeError:
            try:
                jsons.append(json.dumps({**h}, indent=2))
            except Exception:
                # fallback: string representation as error handling
                jsons.append(repr(h))
    f.write('[\n' + ',\n'.join(jsons) + '\n]\n')

In [None]:
# ==============
# Eval perf over time
# ==============
RUN_ID="20251212_130856"

run_dir = f"programs/{RUN_ID}"




In [None]:
import concurrent.futures
with dspy.context(lm=student_lm):
    original_score = evaluate(react).score

def evaluate_prompt_pair(num, reasoner_result):
    new_react = react.deepcopy()
    new_react.react.signature.instructions = reasoner_result.improved_react_prompt
    new_react.extract.predict.signature.instructions = reasoner_result.improved_extraction_prompt
    with dspy.context(lm=student_lm):
        new_full_evaluate = evaluate(new_react)
    return num, new_full_evaluate.score

# Prefer ThreadPoolExecutor for notebook compatibility; ProcessPoolExecutor can misbehave in notebooks
results = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    # Dictionary mapping future to num for tracking and ordering
    future_to_num = {
        executor.submit(evaluate_prompt_pair, num, prompt): num 
        for num, prompt in new_prompts.items()
    }
    for future in concurrent.futures.as_completed(future_to_num):
        num, score = future.result()
        results[num] = score

In [None]:
with dspy.context(lm=student_lm):
    initial_evaluation = evaluate(initial_react)
    