In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import dspy

In [4]:
import mlflow
# Point MLflow at the tracking server URI below and log under the "test" experiment.
mlflow.set_tracking_uri("http://localhost:5001")
mlflow.set_experiment("test")
# Enable MLflow's DSPy autologging (a later cell turns it off again with autolog(disable=True)).
mlflow.dspy.autolog()




In [5]:
import random
from dspy.datasets import DataLoader

# Only these columns are loaded from the dataset; `claim` is the single input field.
kwargs = dict(fields=("claim", "supporting_facts", "hpqa_id", "num_hops"), input_keys=("claim",))
hover = DataLoader().from_huggingface(
    dataset_name="vincentkoc/hover-parquet",
    split="train",
    trust_remote_code=True,
    **kwargs,
)

# Keep 3-hop claims only, and at most one claim per `hpqa_id` (first occurrence wins).
hpqa_ids = set()
three_hop_examples = []
for record in hover:
    if record["num_hops"] != 3 or record["hpqa_id"] in hpqa_ids:
        continue
    hpqa_ids.add(record["hpqa_id"])
    # Gold labels are the de-duplicated page keys of the supporting facts.
    gold_page_keys = list(set([fact["key"] for fact in record.supporting_facts]))
    three_hop_examples.append(
        dspy.Example(claim=record.claim, titles=gold_page_keys).with_inputs("claim")
    )
hover = three_hop_examples

# Fixed-seed shuffle, then slice into splits.
# NOTE(review): indices 200-649 are in none of the three splits - confirm this gap is intended.
random.Random(0).shuffle(hover)
trainset = hover[:100]
devset = hover[100:200]
testset = hover[650:]

example = trainset[0]

print("Claim:", example.claim)
print("Pages that must be retrieved:", example.titles)

import ujson
import bm25s
import Stemmer
import os
from dspy.utils import download

# Download and extract the abstracts dump only when the extracted file is not already present.
if not os.path.exists("wiki.abstracts.2017.jsonl"):
    download("https://huggingface.co/dspy/cache/resolve/main/wiki.abstracts.2017.tar.gz")
    os.system("tar -xzvf wiki.abstracts.2017.tar.gz")

# One "<title> | <text>" string per abstract; `search` splits hits on this " | " separator.
corpus = []
# NOTE(review): if the file is still missing here (e.g. extraction failed) the corpus
# silently stays empty - confirm that this is the intended fallback.
if os.path.exists("wiki.abstracts.2017.jsonl"):
    # Read the JSON-lines file as UTF-8 explicitly so decoding does not depend on the
    # platform's default locale encoding.
    with open("wiki.abstracts.2017.jsonl", encoding="utf-8") as f:
        for line in f:
            record = ujson.loads(line)
            corpus.append(f"{record['title']} | {' '.join(record['text'])}")

# Tokenize with English stopword removal and stemming; `search` applies the same settings to queries.
stemmer = Stemmer.Stemmer("english")
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

# Build the BM25 index over the tokenized corpus.
retriever = bm25s.BM25(k1=0.9, b=0.4)
retriever.index(corpus_tokens)

# Cache of page text keyed by title; filled as a side effect of every `search` call.
DOCS = {}

def search(query: str, k: int) -> list[str]:
    """Return the top-`k` BM25 hits for `query` as "title | text" strings.

    Each hit that contains the " | " separator is also stored in `DOCS`
    (title -> text) so that later lookups by title can reuse it.
    """
    query_tokens = bm25s.tokenize(query, stopwords="en", stemmer=stemmer, show_progress=False)
    doc_ids, _scores = retriever.retrieve(query_tokens, k=k, n_threads=1, show_progress=False)

    # Row 0 holds the corpus indices retrieved for the single query.
    hits = [corpus[doc_id] for doc_id in doc_ids[0]]

    for hit in hits:
        page_title, separator, page_text = hit.partition(" | ")
        if separator:
            DOCS[page_title] = page_text

    return hits

# search("France", 5)

def search_wikipedia(query: str) -> list[str]:
    """Returns top-5 results and then the titles of the top-5 to top-30 results."""
    # The docstring above is shown to the agent as this tool's description, so it is kept verbatim.

    hits = search(query, 30)
    full_results = hits[:5]
    # Hits 6-30 are reduced to their back-quoted titles and folded into one summary string.
    other_titles = ", ".join(f"`{hit.split(' | ')[0]}`" for hit in hits[5:30])
    return full_results + [f"Other retrieved pages have titles: {other_titles}."]

def lookup_wikipedia(title: str) -> str:
    """Returns the text of the Wikipedia page, if it exists."""
    # The docstring is part of what the ReAct agent is shown as this tool's description,
    # so it is kept verbatim; implementation notes live in comments instead.

    # Fast path: a previous `search` call already cached this page's text.
    if title in DOCS:
        return DOCS[title]

    # Otherwise search by title and accept only a hit whose title matches exactly.
    prefix = title + " | "
    for doc in search(title, 10):
        if doc.startswith(prefix):
            # Return the text only, without the "title | " prefix, so the result has the
            # same shape as the cached path above. Previously the first lookup of a page
            # returned "title | text" while any later lookup returned just "text".
            return doc[len(prefix):]

    return f"No Wikipedia page found for title: {title}"

def top5_recall(example, pred, trace=None):
    """Fraction of `example.titles` found among the first five `pred.titles`.

    With `trace` set (bootstrapping for optimization) the result is a bool that
    is True only for perfect recall; otherwise the raw recall is returned.
    """
    expected = example.titles

    hits = 0
    for title in expected:
        if title in pred.titles[:5]:
            hits += 1
    score = hits / len(expected)

    # Plain inference: report the recall itself.
    if trace is None:
        return score

    # Bootstrapping: accept the example only when every gold title was retrieved.
    return score >= 1.0

# Evaluator over the dev split, scored with top5_recall on 16 threads, with a progress bar
# and a 5-row preview table.
evaluate = dspy.Evaluate(devset=devset, metric=top5_recall, num_threads=16, display_progress=True, display_table=5)

# Task instructions attached to the `claim -> titles` signature.
instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
signature = dspy.Signature("claim -> titles: list[str]", instructions)
# ReAct agent that may call the two Wikipedia tools, capped at 20 iterations.
react = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=20)

  from .autonotebook import tqdm as notebook_tqdm


Claim: This director is known for his work on Miss Potter. The Academy of Motion Picture Arts and Sciences presents the award in which he was nominated for his work in "Babe".
Pages that must be retrieved: ['Academy Award for Best Director', 'Miss Potter', 'Chris Noonan']


                                                                                   

In [6]:
from dspy.datasets import DataLoader
from datasets import load_dataset


# Two LMs: the student is installed as the default below; the teacher is only used where a
# cell switches to it explicitly with dspy.context(lm=teacher_lm).
student_lm = dspy.LM("openrouter/qwen/qwen3-8b")
teacher_lm = dspy.LM("openai/gpt-5")
dspy.configure(lm=student_lm)

In [7]:
# Quick smoke test of the agent on a single claim; display at most the first three predicted titles.
react(claim="David Gregory was born in 1625.").titles[:3]

['David Gregory (physician)']

In [8]:
def safe_react(claim: str):
    """Run the ReAct agent on one claim, tolerating failures.

    Args:
        claim: The claim to collect relevant Wikipedia titles for.

    Returns:
        The agent's prediction, or `dspy.Prediction(titles=[])` if the agent raised,
        so that a single failing example cannot abort a whole evaluation run.
    """
    try:
        return react(claim=claim)
    except Exception as e:
        # Still best-effort, but report the failure instead of hiding it: a silently
        # swallowed error is indistinguishable from a genuine zero-recall prediction.
        print(f"Error in program: {e}")
        return dspy.Prediction(titles=[])

# Baseline: score the error-tolerant agent on the dev split and print the aggregate result.
# NOTE: `eval_result` is rebound by the optimizer cell further down.
eval_result = evaluate(safe_react)
print(eval_result)


Average Metric: 58.33 / 100 (58.3%): 100%|██████████| 100/100 [00:06<00:00, 16.29it/s]

2025/12/01 19:02:35 INFO dspy.evaluate.evaluate: Average Metric: 58.3333333333333 / 100 (58.3%)





Unnamed: 0,claim,example_titles,trajectory,reasoning,pred_titles,_patched
0,The Church of England's movement that inspired the Trinity Episcop...,"[Samuel Rickards, Oxford Movement, Trinity Episcopal Church (Hough...","{'thought_0': ""I need to verify if the Trinity Episcopal Church in...","The claim links the Trinity Episcopal Church in Houghton, Michigan...","[Trinity Episcopal Church (Houghton, Michigan), Oxford Movement, S...",✔️ [1.000]
1,"Red, White & Crüe and this athlete both fight. The french fighter ...","[Red, White &amp; Crüe, Mike Tyson, Bobby Stewart]",{'thought_0': 'I need to identify the French fighter trained by Bo...,"The claim conflates a music album (""Red, White & Crüe"" by Mötley C...","[Red, White & Crüe, Mötley Crüe, French Stewart, Bobby Nash, Fight...",✔️ [0.000]
2,The writer/director/actor from Glen or Glenda and Fernand Rivers s...,"[Fernand Rivers, Ed Wood, Glen or Glenda]",{'thought_0': 'I need to verify the claim by checking the careers ...,"The claim suggests that the individuals associated with ""Glen or G...","[Glen or Glenda, Ed Wood, Fernand Rivers, Berlingot and Company, T...",✔️ [1.000]
3,The film by Sandi Sissel was released before The End of Suburbia.,"[The End of Suburbia, Chicken Ranch (film), Sandi Sissel]",{'thought_0': 'I need to determine the release dates of Sandi Siss...,"The claim states that a film by Sandi Sissel was released before ""...","[Chicken Ranch, Paul Jacobs and the Nuclear Gang, Mother Teresa (1...",✔️ [0.333]
4,The actor who played captain hook in the live production with Tayl...,"[Taylor Louderman, Peter Pan Live!, Christopher Walken]",{'thought_0': 'I need to identify the actor who played Captain Hoo...,"The actor who played Captain Hook in the 2014 ""Peter Pan Live!"" pr...","[Peter Pan Live!, Christopher Walken, The Deer Hunter]",✔️ [0.667]


EvaluationResult(score=58.33, results=<list of 100 results>)


In [25]:
# Turn MLflow's DSPy autologging back off for the cells that follow.
mlflow.dspy.autolog(disable=True)

In [14]:
import dspy

from dspy.teleprompt.teleprompt import Teleprompter
from typing import Any

class NoLabelOptimizer(Teleprompter):
    """Teleprompter that, for now, only runs the student and records its traces.

    No labels are consulted: the student is run over the trainset with a
    constant-zero metric, purely so the per-example traces can be collected.
    """

    def compile(
        self, student: dspy.Module, *, trainset: list[dspy.Example], teacher_lm: dspy.LM
    ) -> tuple[dspy.Module, Any]:
        """Run `student` on every example in `trainset` and capture the traces.

        Args:
            student: Program to run; it is returned unmodified.
            trainset: Examples to run the student on.
            teacher_lm: Accepted for the caller's convenience; not used here yet.

        Returns:
            A `(student, eval_result)` pair, where `eval_result` is whatever
            `dspy.Evaluate` returns. Each recorded prediction in it is the
            two-item list `[result, trace]` produced by the wrapper below.
        """

        def wrap_student(student_prog):
            def wrapped_student(*args, **kwargs):
                # Start from an empty trace so only this call's steps are captured.
                with dspy.context(trace=[]):
                    result = student_prog(*args, **kwargs)
                    trace = dspy.settings.trace
                    return [result, trace]
            return wrapped_student

        # The metric always returns 0: the evaluator is only used as a parallel runner.
        # Named `trace_evaluator` so it does not shadow the notebook-level `evaluate`.
        trace_evaluator = dspy.Evaluate(
            devset=trainset, metric=lambda e, p, t=None: 0, num_threads=16, display_progress=True
        )
        eval_result = trace_evaluator(wrap_student(student))
        print(eval_result)

        return student, eval_result

    def get_params(self) -> dict[str, Any]:
        """
        Get the parameters of the teleprompter.

        Returns:
            The parameters of the teleprompter (the instance's `__dict__`).
        """
        return self.__dict__

optimizer = NoLabelOptimizer()

# Collect traces on the first 20 training examples; this rebinds the notebook-level `eval_result`.
optimized_react, eval_result = optimizer.compile(react, trainset=trainset[:20], teacher_lm=teacher_lm)


Average Metric: 0.00 / 10 (0.0%):  45%|████▌     | 9/20 [00:00<00:08,  1.33it/s]

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## ne...tailed information.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## ne...o verify the claim.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer

Average Metric: 0.00 / 11 (0.0%):  55%|█████▌    | 11/20 [00:36<00:30,  3.37s/it]

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## re...tles in the output.\n"}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## ne...e titles collected.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer

Average Metric: 0.00 / 12 (0.0%):  60%|██████    | 12/20 [00:41<00:28,  3.59s/it]

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## re...ovel by Montgomery.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## ne...the actors checked.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer

Average Metric: 0.00 / 13 (0.0%):  65%|██████▌   | 13/20 [00:51<00:31,  4.46s/it]

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## re...without that error.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## ne...Social Distortion."\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer

Average Metric: 0.00 / 14 (0.0%):  70%|███████   | 14/20 [01:14<00:46,  7.71s/it]

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## re... the most relevant.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## ne...has been collected.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer

Average Metric: 0.00 / 15 (0.0%):  75%|███████▌  | 15/20 [01:32<00:48,  9.61s/it]

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## re...et/AirTran Airways.\n"}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## ne...the collected data.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer

Average Metric: 0.00 / 16 (0.0%):  80%|████████  | 16/20 [01:38<00:35,  8.95s/it]

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## re...thoven\'s opinions.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## ne... proceed with that.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer

Average Metric: 0.00 / 17 (0.0%):  85%|████████▌ | 17/20 [02:11<00:43, 14.60s/it]

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## re...est related titles.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## ne... info is collected.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer

Average Metric: 0.00 / 18 (0.0%):  90%|█████████ | 18/20 [02:29<00:30, 15.41s/it]

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## re... list those titles.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## ne...ge. Let\'s do that.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer

Average Metric: 0.00 / 18 (0.0%):  95%|█████████▌| 19/20 [02:38<00:13, 13.92s/it]

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## re... and "Trick-Trick".\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## ne...the necessary info.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer

Average Metric: 0.00 / 19 (0.0%): : 21it [02:50, 10.11s/it]                      

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## re...t people and shows.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(


Average Metric: 0.00 / 20 (0.0%): : 22it [03:01,  8.26s/it]

  PydanticSerializationUnexpectedValue(Expected 9 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='[[ ## re...de those two bands.\n'}), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...finish_reason': 'stop'}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
2025/12/02 01:23:00 INFO dspy.evaluate.evaluate: Average Metric: 0 / 20 (0.0%)



EvaluationResult(score=0.0, results=<list of 20 results>)


In [15]:
from collections import Counter
# Index 1 of each evaluation result is the wrapped student's return value: a two-item
# list (see the printed type and length below) whose second item is the trace.
predictions = [result[1] for result in eval_result.results]
print(type(predictions))
print(type(predictions[0]))
print(len(predictions[0]))
print(predictions[0][1])
# Sanity tallies of prediction types and lengths; neither is used again in this cell.
c = Counter(type(prediction) for prediction in predictions)
len_c = Counter(len(prediction) for prediction in predictions)
traces = [prediction[1] for prediction in predictions]
# Keep only the last entry of every trace.
# NOTE(review): trace[-1] raises IndexError if any trace is empty - confirm that cannot happen.
final_trajectories = [trace[-1] for trace in traces]
print(final_trajectories[0])
# print(type(traces[0]))


<class 'list'>
<class 'list'>
2
[(Predict(StringSignature(claim, trajectory -> next_thought, next_tool_name, next_tool_args
    instructions="Find all Wikipedia titles relevant to verifying (or refuting) the claim.\n\nYou are an Agent. In each episode, you will be given the fields `claim` as input. And you can see your past trajectory so far.\nYour goal is to use one or more of the supplied tools to collect any necessary information for producing `titles`.\n\nTo do this, you will interleave next_thought, next_tool_name, and next_tool_args in each turn, and also when finishing the task.\nAfter each tool call, you receive a resulting observation, which gets appended to your trajectory.\n\nWhen writing next_thought, you may reason about the current situation and plan for future steps.\nWhen selecting the next_tool_name and its next_tool_args, the tool must be one of:\n\n(1) search_wikipedia, whose description is <desc>Returns top-5 results and then the titles of the top-5 to top-30 result

In [35]:
import rich
# Signature for asking an LM to critique collected trajectories and propose a better prompt.
# NOTE(review): the class docstring is presumably consumed by DSPy as the signature's
# instructions, i.e. it is runtime prompt text - confirm before rewording it.
class ReasonAboutTraces(dspy.Signature):
    """Given a set of trajectories, and the prompt that was used to generate them,
    reason about the traces and how you might improve the prompt from a strategy perspective without overfitting"""
    # NOTE(review): annotated as str, but the code below passes a list (final_trajectories[:20]) -
    # confirm the intended type.
    trajectories: str = dspy.InputField()
    # The instructions that produced the trajectories.
    old_prompt: str = dspy.InputField()
    deviations_from_specification: str = dspy.OutputField(desc="Deviations from the specification given in the signature of the prompt")
    noticed_patterns: str = dspy.OutputField()
    generic_strategy: str = dspy.OutputField()
    # The only output field read by the generation loop below.
    improved_prompt: str = dspy.OutputField()

# Inputs shared by every teacher call: the first 20 final trajectories and the current instructions.
inputs = {
    "trajectories": final_trajectories[:20],
    "old_prompt": signature.instructions,
}

# Earlier experiment, kept for reference: vary how many trajectories the teacher sees.
# input_range = [1,3,5,10,20]

# all_range_prompts = {}

# for i in input_range:
#     all_range_prompts[i] = {
#         "trajectories": final_trajectories[:i],
#         "old_prompt": signature.instructions,
#     }

# all_range_prompts = {}
# with dspy.context(lm=teacher_lm):
#     for i in input_range:
#         inputs = all_range_prompts[i]
#         prompt = reasoner(**inputs).improved_prompt
#         all_range_prompts[i] = prompt
#         rich.print(prompt)

# Improved prompt per seed, generated by the teacher LM from identical inputs.
all_20_prompts = {}

with dspy.context(lm=teacher_lm):
    for i in range(5):
        print("Running ", i)

        # A fresh predictor per iteration so each call carries its own seed.
        reasoner = dspy.Predict(ReasonAboutTraces, seed=i)
        prompt = reasoner(**inputs).improved_prompt
        all_20_prompts[i] = prompt
        rich.print(prompt)



    


Running  0


Running  1


Running  2


Running  3


Running  4


In [19]:
import warnings
# Silence warnings whose message matches the pattern below (Pydantic serializer warnings
# that mention StreamingChoices); (?s) lets `.*` span the newlines of a multi-line message.
warnings.filterwarnings("ignore", message=r"(?s)Pydantic serializer warnings:.*StreamingChoices")


In [None]:
# Small evaluator over the first 10 training examples; only referenced from commented-out code below.
evaluate_trainset = dspy.Evaluate(devset=trainset[:10], metric=top5_recall, num_threads=16, display_progress=True)
# Rebinds the notebook-level `evaluate`: same dev split and metric, now tolerating up to
# 50 errors and without the preview table.
evaluate = dspy.Evaluate(devset=devset, metric=top5_recall, num_threads=16, display_progress=True, max_errors=50)

# new_signature = signature.with_instructions(outputs.improved_prompt)
# improved_react = dspy.ReAct(new_signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=10)

# Baseline agent with the original instructions; note max_iters=10 here versus 20 for the earlier `react`.
initial_react = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=10)
# NOTE(review): `no_cache_lm` is not used anywhere in the visible cells - confirm whether it is still needed.
no_cache_lm = dspy.LM("openrouter/qwen/qwen3-8b", cache=None)

def make_react_safe(program):
    """Wrap `program` so that an exception becomes an empty-titles prediction.

    The returned callable forwards all positional and keyword arguments to
    `program`. If the call raises, the error is printed and
    `dspy.Prediction(titles=[])` is returned instead.
    """

    def safe_program(*args, **kwargs):
        try:
            outcome = program(*args, **kwargs)
        except Exception as err:
            print(f"Error in program: {err}")
            outcome = dspy.Prediction(titles=[])
        return outcome

    return safe_program

# Scores keyed by prompt index; key -1 temporarily holds the original prompt's baseline score.
results = {}
with dspy.context(lm=student_lm):
    # initial_react_evaluate = evaluate_trainset(initial_react)
    # improved_react_evaluate = evaluate_trainset(improved_react)
    
    # print(initial_react_evaluate.score)
    # print(improved_react_evaluate.score)
    # Baseline: the original instructions on the full dev split.
    initial_full_evaluate = evaluate(make_react_safe(initial_react))
    results[-1] = initial_full_evaluate.score

    import concurrent.futures

    def evaluate_prompt_pair(num, prompt):
        """Score one candidate prompt on the dev split and return `(num, score)`."""
        new_signature = dspy.Signature("claim -> titles: list[str]", prompt)
        new_react = dspy.ReAct(new_signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=10)
        new_full_evaluate = evaluate(make_react_safe(new_react))
        return num, new_full_evaluate.score

    # Prefer ThreadPoolExecutor for notebook compatibility; ProcessPoolExecutor can misbehave in notebooks
    # NOTE(review): these evaluations run in worker threads - confirm the dspy.context(lm=student_lm)
    # override is visible there. The globally configured LM is also student_lm, so the
    # scores should be the same either way.
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        # Dictionary mapping future to num for tracking and ordering
        future_to_num = {
            executor.submit(evaluate_prompt_pair, num, prompt): num 
            for num, prompt in all_20_prompts.items()
        }
        # Collect scores in completion order; future.result() re-raises any worker exception.
        for future in concurrent.futures.as_completed(future_to_num):
            num, score = future.result()
            results[num] = score

print("="*80)
rich.print("Original prompt score: ", results[-1])
# Drop the baseline so that only candidate-prompt scores remain for the summary.
# NOTE(review): the average below divides by len(results); it raises ZeroDivisionError
# when all_20_prompts is empty.
results.pop(-1)
rich.print("New prompt scores: ", results)
rich.print("Average score: ", sum(results.values()) / len(results))


Average Metric: 61.00 / 100 (61.0%): 100%|██████████| 100/100 [00:09<00:00, 10.84it/s]

2025/12/02 14:44:57 INFO dspy.evaluate.evaluate: Average Metric: 60.99999999999998 / 100 (61.0%)



Average Metric: 0.67 / 1 (66.7%):   0%|          | 0/100 [00:02<?, ?it/s]
[A

[A[A


Average Metric: 0.67 / 1 (66.7%):   1%|          | 1/100 [00:02<03:25,  2.08s/it]
[A


Average Metric: 1.33 / 2 (66.7%):   1%|          | 1/100 [00:02<03:25,  2.08s/it]
[A


Average Metric: 1.33 / 2 (66.7%):   2%|▏         | 2/100 [00:02<02:02,  1.25s/it]
[A


[A[A[A

Average Metric: 1.67 / 3 (55.6%):   2%|▏         | 2/100 [00:03<02:02,  1.25s/it]
[A


[A[A[A

Average Metric: 1.67 / 3 (55.6%):   3%|▎         | 3/100 [00:03<01:28,  1.10it/s]
[A


[A[A[A

Average Metric: 2.33 / 4 (58.3%):   3%|▎         | 3/100 [00:03<01:28,  1.10it/s]
[A


[A[A[A

Average Metric: 2.33 / 4 (58.3%):   4%|▍         | 4/100 [00:04<01:18,  1.22it/s]
[A


[A[A[A

Average Metric: 3.00 / 5 (60.0%):   4%|▍         | 4/100 [00:05<01:18,  1.22it/s]
[A


[A[A[A

Average Metric: 3.00 / 5 (60.0%):   5%|▌         | 5/100 [00:05<01:28,  1.07it/s]
[A


[A[A[A

Average Metric: 3.67 / 6 (61.1%):   5%|▌  

2025/12/02 14:45:31 INFO dspy.evaluate.evaluate: Average Metric: 61.3333333333333 / 100 (61.3%)



Average Metric: 61.33 / 100 (61.3%): 100%|██████████| 100/100 [00:30<00:00,  3.31it/s]

2025/12/02 14:45:31 INFO dspy.evaluate.evaluate: Average Metric: 61.33333333333329 / 100 (61.3%)



Average Metric: 61.00 / 100 (61.0%): 100%|██████████| 100/100 [00:30<00:00,  3.32it/s]

2025/12/02 14:45:31 INFO dspy.evaluate.evaluate: Average Metric: 60.99999999999998 / 100 (61.0%)



Average Metric: 62.67 / 100 (62.7%): 100%|██████████| 100/100 [00:30<00:00,  3.30it/s]
Average Metric: 55.00 / 100 (55.0%): 100%|██████████| 100/100 [00:30<00:00,  3.31it/s]

2025/12/02 14:45:31 INFO dspy.evaluate.evaluate: Average Metric: 62.66666666666664 / 100 (62.7%)





2025/12/02 14:45:31 INFO dspy.evaluate.evaluate: Average Metric: 54.99999999999999 / 100 (55.0%)




In [None]:
# Evaluate the baseline agent directly, without the make_react_safe wrapper used above:
# exceptions reach the evaluator, which is configured with max_errors=50.
with dspy.context(lm=student_lm):
    initial_evaluation = evaluate(initial_react)
    