In [9]:
import dspy
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShot, BootstrapFewShotWithRandomSearch, BootstrapFinetune
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key for the LM client comes from — confirm.
load_dotenv()

True

## Set up models

In [7]:
# Model identifier and retrieval endpoint, named so they are easy to spot and swap.
LM_MODEL_NAME = 'gpt-3.5-turbo-instruct'
COLBERT_URL = 'http://20.102.90.50:2017/wiki17_abstracts'

# Language-model client and ColBERTv2 retrieval client.
turbo = dspy.OpenAI(model=LM_MODEL_NAME)
colbertv2 = dspy.ColBERTv2(url=COLBERT_URL)

# Register both clients in dspy's global settings.
dspy.settings.configure(lm=turbo, rm=colbertv2)

# Data

In [3]:
# Training questions paired with their gold answers.
train_pairs = [
    ('Who was the director of the 2009 movie featuring Peter Outerbridge as William Easton?', 'Kevin Greutert'),
    ('The heir to the Du Pont family fortune sponsored what wrestling team?', 'Foxcatcher'),
    ('In what year was the star of To Hell and Back born?', '1925'),
    ('Which award did the first book of Gary Zukav receive?', 'U.S. National Book Award'),
    ('What documentary about the Gilgo Beach Killer debuted on A&E?', 'The Killing Season'),
    ('Which author is English: John Braine or Studs Terkel?', 'John Braine'),
    ('Who produced the album that included a re-recording of "Lithium"?', 'Butch Vig'),
]

# Wrap each pair in a dspy.Example; only `question` is marked as an input field.
train = [dspy.Example(question=q, answer=a).with_inputs('question') for q, a in train_pairs]

In [4]:
# Development questions paired with their gold answers.
dev_pairs = [
    ('Who has a broader scope of profession: E. L. Doctorow or Julia Peterkin?', 'E. L. Doctorow'),
    ('Right Back At It Again contains lyrics co-written by the singer born in what city?', 'Gainesville, Florida'),
    ('What year was the party of the winner of the 1971 San Francisco mayoral election founded?', '1828'),
    ('Anthony Dirrell is the brother of which super middleweight title holder?', 'Andre Dirrell'),
    ('The sports nutrition business established by Oliver Cookson is based in which county in the UK?', 'Cheshire'),
    ('Find the birth date of the actor who played roles in First Wives Club and Searching for the Elephant.', 'February 13, 1980'),
    ('Kyle Moran was born in the town on what river?', 'Castletown River'),
    ("The actress who played the niece in the Priest film was born in what city, country?", 'Surrey, England'),
    ('Name the movie in which the daughter of Noel Harrison plays Violet Trefusis.', 'Portrait of a Marriage'),
    ('What year was the father of the Princes in the Tower born?', '1442'),
    ('What river is near the Crichton Collegiate Church?', 'the River Tyne'),
    ('Who purchased the team Michael Schumacher raced for in the 1995 Monaco Grand Prix in 2000?', 'Renault'),
    ('André Zucca was a French photographer who worked with a German propaganda magazine published by what Nazi organization?', 'the Wehrmacht'),
]

# Wrap each pair in a dspy.Example; only `question` is marked as an input field.
dev = [dspy.Example(question=q, answer=a).with_inputs('question') for q, a in dev_pairs]

# Evaluation

In [14]:
# Number of worker threads used for evaluation (and reused by the optimizers below).
NUM_THREADS = 32

# Define the metric in this cell: it is used on the next line, and relying on a later
# cell to define it makes a fresh-kernel "Restart & Run All" fail with NameError.
metric_EM = dspy.evaluate.answer_exact_match

# Reusable evaluator: scores a program on the dev set with the exact-match metric,
# showing a progress bar and a results table of up to 15 rows.
evaluate_hotpot = Evaluate(devset=dev, metric=metric_EM, num_threads=NUM_THREADS, display_progress=True, display_table=15)

# Chain of Thought

In [5]:
class CoT(dspy.Module):
    """Question answering with a single chain-of-thought predictor."""

    def __init__(self):
        super().__init__()

        # Declared as an attribute so it can later be compiled (e.g., taught a prompt).
        self.generate_answer = dspy.ChainOfThought('question -> answer')

    def forward(self, question):
        """Run the chain-of-thought predictor on `question` and return its prediction."""
        prediction = self.generate_answer(question=question)
        return prediction

In [10]:
# Exact-match metric used both to bootstrap demonstrations and to score programs.
metric_EM = dspy.evaluate.answer_exact_match

# Few-shot bootstrapper limited to two bootstrapped demonstrations.
optimizer = BootstrapFewShot(
    metric=metric_EM,
    max_bootstrapped_demos=2,
)

# Compile a fresh CoT program against the training examples.
cot_student = CoT()
cot_compiled = optimizer.compile(cot_student, trainset=train)

 29%|█████████████████████████████████████████▏                                                                                                      | 2/7 [00:02<00:05,  1.12s/it]

Bootstrapped 2 full traces after 3 examples in round 0.





In [11]:
# Print the most recent prompt/completion recorded by the `turbo` LM client.
turbo.inspect_history(n=1)





Given the fields `question`, produce the fields `answer`.

---

Follow the following format.

Question: ${question}
Reasoning: Let's think step by step in order to ${produce the answer}. We ...
Answer: ${answer}

---

Question: Who produced the album that included a re-recording of "Lithium"?
Answer: Butch Vig

---

Question: Which award did the first book of Gary Zukav receive?
Answer: U.S. National Book Award

---

Question: Which author is English: John Braine or Studs Terkel?
Answer: John Braine

---

Question: Who was the director of the 2009 movie featuring Peter Outerbridge as William Easton?
Answer: Kevin Greutert

---

Question: In what year was the star of To Hell and Back born?
Answer: 1925

---

Question: What documentary about the Gilgo Beach Killer debuted on A&E?
Answer: The Killing Season

---

Question: The heir to the Du Pont family fortune sponsored what wrestling team?
Reasoning: Let's think step by step in order to[32m find the answer. We know that the Du Pont

In [13]:
# Score the compiled CoT program on the dev set; kept as the last expression so the score is displayed.
evaluate_hotpot(cot_compiled)

Average Metric: 3 / 13  (23.1): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  8.45it/s]
  df = df.applymap(truncate_cell)
 'False' 'False' 'False' 'False' 'False' 'False']' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  df.loc[:, metric_name] = df[metric_name].apply(


Average Metric: 3 / 13  (23.1%)


Unnamed: 0,question,example_answer,rationale,pred_answer,answer_exact_match
0,Who has a broader scope of profession: E. L. Doctorow or Julia Peterkin?,E. L. Doctorow,"find the answer. We know that E. L. Doctorow is a novelist, editor, and professor, while Julia Peterkin is a novelist and short story writer....",E. L. Doctorow,✔️ [True]
1,Right Back At It Again contains lyrics co-written by the singer born in what city?,"Gainesville, Florida","find the answer. We know that the song ""Right Back At It Again"" was released in 2013, so we can look up the songs that...","Ocala, Florida",False
2,What year was the party of the winner of the 1971 San Francisco mayoral election founded?,1828,find the answer. We know that the 1971 San Francisco mayoral election was held in 1971. We can then look up the winner of that...,1969,False
3,Anthony Dirrell is the brother of which super middleweight title holder?,Andre Dirrell,"find the answer. We know that Anthony Dirrell is a professional boxer, and specifically a super middleweight. We can then look up the current super...",Andre Dirrell,✔️ [True]
4,The sports nutrition business established by Oliver Cookson is based in which county in the UK?,Cheshire,find the answer. We know that Oliver Cookson is the founder of a sports nutrition business. We can then look up the location of the...,Cheshire,✔️ [True]
5,Find the birth date of the actor who played roles in First Wives Club and Searching for the Elephant.,"February 13, 1980",find the answer. We know that the actor played roles in both First Wives Club and Searching for the Elephant. We can then look up...,"September 18, 1961",False
6,Kyle Moran was born in the town on what river?,Castletown River,"find the answer. We know that Kyle Moran was born in a town, so we can look up the towns that he could have been...",Hudson River,False
7,"The actress who played the niece in the Priest film was born in what city, country?","Surrey, England","find the answer. We know that the actress who played the niece in the Priest film is a well-known actress, so we can look up...","London, England",False
8,Name the movie in which the daughter of Noel Harrison plays Violet Trefusis.,Portrait of a Marriage,"find the answer. We know that Noel Harrison is an actor, and that he has a daughter. We can then look up the movies that...",The Trials of Oscar Wilde,False
9,What year was the father of the Princes in the Tower born?,1442,"find the answer. We know that the Princes in the Tower were born in 1470 and 1473, and their father was born when he was...",1457,False


23.08

# RAG

In [17]:
class RAG(dspy.Module):
    """Retrieval-augmented QA: write a search query, retrieve passages, then answer."""

    def __init__(self, num_passages=3):
        super().__init__()

        # Three sub-modules: the retriever (k=num_passages), a query writer, and an answer writer.
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_query = dspy.ChainOfThought("question -> search_query")
        self.generate_answer = dspy.ChainOfThought("context, question -> answer")

    def forward(self, question):
        """Answer `question` using passages retrieved for a generated search query."""
        # Turn the question into a search query, then fetch passages for it.
        query_prediction = self.generate_query(question=question)
        retrieved = self.retrieve(query_prediction.search_query)

        # Answer from the retrieved passages together with the original question.
        return self.generate_answer(context=retrieved.passages, question=question)

Out of curiosity, evaluate the uncompiled (zero-shot) version of this RAG program.

In [18]:
# Score a fresh, uncompiled RAG program on the dev set; display_table=0 suppresses the per-example table.
evaluate_hotpot(RAG(), display_table=0)

Average Metric: 2 / 13  (15.4): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:03<00:00,  3.45it/s]


Average Metric: 2 / 13  (15.4%)


  df = df.applymap(truncate_cell)


15.38

In [19]:
# Random-search variant of the few-shot bootstrapper: up to 2 bootstrapped demos per predictor,
# 8 candidate programs, evaluated with NUM_THREADS threads.
optimizer2 = BootstrapFewShotWithRandomSearch(metric=metric_EM, max_bootstrapped_demos=2, num_candidate_programs=8, num_threads=NUM_THREADS)
# NOTE(review): `valset=dev` is the same set `evaluate_hotpot` scores on, so the score reported for the
# selected program is likely optimistic — consider holding out a separate validation split.
rag_compiled = optimizer2.compile(RAG(), trainset=train, valset=dev)

Going to sample between 1 and 2 traces per predictor.
Will attempt to train 8 candidate sets.


Average Metric: 2 / 13  (15.4): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 2270.02it/s]


Average Metric: 2 / 13  (15.4%)
Score: 15.38 for set: [0, 0]
New best score: 15.38 for seed -3
Scores so far: [15.38]
Best score: 15.38


Average Metric: 5 / 13  (38.5): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  6.74it/s]


Average Metric: 5 / 13  (38.5%)
Score: 38.46 for set: [7, 7]
New best score: 38.46 for seed -2
Scores so far: [15.38, 38.46]
Best score: 38.46


 29%|█████████████████████████████████████████▏                                                                                                      | 2/7 [00:04<00:11,  2.29s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 4 / 13  (30.8): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:03<00:00,  3.76it/s]


Average Metric: 4 / 13  (30.8%)
Score: 30.77 for set: [7, 7]
Scores so far: [15.38, 38.46, 30.77]
Best score: 38.46
Average of max per entry across top 1 scores: 0.38461538461538464
Average of max per entry across top 2 scores: 0.46153846153846156
Average of max per entry across top 3 scores: 0.6153846153846154
Average of max per entry across top 5 scores: 0.6153846153846154
Average of max per entry across top 8 scores: 0.6153846153846154
Average of max per entry across top 9999 scores: 0.6153846153846154


 57%|██████████████████████████████████████████████████████████████████████████████████▎                                                             | 4/7 [00:06<00:04,  1.53s/it]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 4 / 13  (30.8): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:03<00:00,  3.47it/s]


Average Metric: 4 / 13  (30.8%)
Score: 30.77 for set: [7, 7]
Scores so far: [15.38, 38.46, 30.77, 30.77]
Best score: 38.46
Average of max per entry across top 1 scores: 0.38461538461538464
Average of max per entry across top 2 scores: 0.46153846153846156
Average of max per entry across top 3 scores: 0.6153846153846154
Average of max per entry across top 5 scores: 0.6923076923076923
Average of max per entry across top 8 scores: 0.6923076923076923
Average of max per entry across top 9999 scores: 0.6923076923076923


 14%|████████████████████▌                                                                                                                           | 1/7 [00:01<00:11,  1.95s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 4 / 13  (30.8): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:03<00:00,  3.99it/s]


Average Metric: 4 / 13  (30.8%)
Score: 30.77 for set: [7, 7]
Scores so far: [15.38, 38.46, 30.77, 30.77, 30.77]
Best score: 38.46
Average of max per entry across top 1 scores: 0.38461538461538464
Average of max per entry across top 2 scores: 0.46153846153846156
Average of max per entry across top 3 scores: 0.6153846153846154
Average of max per entry across top 5 scores: 0.7692307692307693
Average of max per entry across top 8 scores: 0.7692307692307693
Average of max per entry across top 9999 scores: 0.7692307692307693


 29%|█████████████████████████████████████████▏                                                                                                      | 2/7 [00:02<00:05,  1.08s/it]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 5 / 13  (38.5): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  9.22it/s]


Average Metric: 5 / 13  (38.5%)
Score: 38.46 for set: [7, 7]
Scores so far: [15.38, 38.46, 30.77, 30.77, 30.77, 38.46]
Best score: 38.46
Average of max per entry across top 1 scores: 0.38461538461538464
Average of max per entry across top 2 scores: 0.6153846153846154
Average of max per entry across top 3 scores: 0.6153846153846154
Average of max per entry across top 5 scores: 0.7692307692307693
Average of max per entry across top 8 scores: 0.7692307692307693
Average of max per entry across top 9999 scores: 0.7692307692307693


 14%|████████████████████▌                                                                                                                           | 1/7 [00:00<00:05,  1.10it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 3 / 13  (23.1): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:03<00:00,  3.40it/s]


Average Metric: 3 / 13  (23.1%)
Score: 23.08 for set: [7, 7]
Scores so far: [15.38, 38.46, 30.77, 30.77, 30.77, 38.46, 23.08]
Best score: 38.46
Average of max per entry across top 1 scores: 0.38461538461538464
Average of max per entry across top 2 scores: 0.6153846153846154
Average of max per entry across top 3 scores: 0.6153846153846154
Average of max per entry across top 5 scores: 0.7692307692307693
Average of max per entry across top 8 scores: 0.8461538461538461
Average of max per entry across top 9999 scores: 0.8461538461538461


 14%|████████████████████▌                                                                                                                           | 1/7 [00:01<00:10,  1.76s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 4 / 13  (30.8): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:02<00:00,  5.02it/s]


Average Metric: 4 / 13  (30.8%)
Score: 30.77 for set: [7, 7]
Scores so far: [15.38, 38.46, 30.77, 30.77, 30.77, 38.46, 23.08, 30.77]
Best score: 38.46
Average of max per entry across top 1 scores: 0.38461538461538464
Average of max per entry across top 2 scores: 0.6153846153846154
Average of max per entry across top 3 scores: 0.6153846153846154
Average of max per entry across top 5 scores: 0.7692307692307693
Average of max per entry across top 8 scores: 0.8461538461538461
Average of max per entry across top 9999 scores: 0.8461538461538461


 43%|█████████████████████████████████████████████████████████████▋                                                                                  | 3/7 [00:05<00:07,  1.82s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 3 / 9  (33.3):  69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 9/13 [00:02<00:00,  4.84it/s]

Backing off 1.0 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 13  (30.8): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:06<00:00,  1.99it/s]


Average Metric: 4 / 13  (30.8%)
Score: 30.77 for set: [7, 7]
Scores so far: [15.38, 38.46, 30.77, 30.77, 30.77, 38.46, 23.08, 30.77, 30.77]
Best score: 38.46
Average of max per entry across top 1 scores: 0.38461538461538464
Average of max per entry across top 2 scores: 0.6153846153846154
Average of max per entry across top 3 scores: 0.6153846153846154
Average of max per entry across top 5 scores: 0.7692307692307693
Average of max per entry across top 8 scores: 0.8461538461538461
Average of max per entry across top 9999 scores: 0.8461538461538461


 29%|█████████████████████████████████████████▏                                                                                                      | 2/7 [00:01<00:04,  1.08it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 2 / 3  (66.7):  23%|█████████████████████████▊                                                                                      | 3/13 [00:01<00:03,  3.09it/s]

Backing off 0.2 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.3 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.0 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.0 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 5  (40.0):  38%|███████████████████████████████████████████                                                                     | 5/13 [00:02<00:03,  2.28it/s]

Backing off 0.2 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.3 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 6  (33.3):  46%|███████████████████████████████████████████████████▋                                                            | 6/13 [00:02<00:03,  1.90it/s]

Backing off 0.6 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.4 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.1 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 3.8 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 5 / 9  (55.6):  69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 9/13 [00:04<00:02,  1.68it/s]

Backing off 2.7 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 5 / 13  (38.5): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:09<00:00,  1.44it/s]


Average Metric: 5 / 13  (38.5%)
Score: 38.46 for set: [7, 7]
Scores so far: [15.38, 38.46, 30.77, 30.77, 30.77, 38.46, 23.08, 30.77, 30.77, 38.46]
Best score: 38.46
Average of max per entry across top 1 scores: 0.38461538461538464
Average of max per entry across top 2 scores: 0.6153846153846154
Average of max per entry across top 3 scores: 0.7692307692307693
Average of max per entry across top 5 scores: 0.8461538461538461
Average of max per entry across top 8 scores: 0.9230769230769231
Average of max per entry across top 9999 scores: 0.9230769230769231


 57%|██████████████████████████████████████████████████████████████████████████████████▎                                                             | 4/7 [00:04<00:03,  1.20s/it]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 1 / 5  (20.0):  38%|███████████████████████████████████████████                                                                     | 5/13 [00:03<00:04,  1.75it/s]

Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.3 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.2 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 7  (28.6):  54%|████████████████████████████████████████████████████████████▎                                                   | 7/13 [00:04<00:04,  1.41it/s]

Backing off 0.1 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.2 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 3 / 8  (37.5):  62%|████████████████████████████████████████████████████████████████████▉                                           | 8/13 [00:05<00:04,  1.20it/s]

Backing off 0.2 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 3.3 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 3 / 10  (30.0):  77%|████████████████████████████████████████████████████████████████████████████████████▌                         | 10/13 [00:07<00:02,  1.14it/s]

Backing off 2.8 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 13  (30.8): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:12<00:00,  1.08it/s]

Average Metric: 4 / 13  (30.8%)
Score: 30.77 for set: [7, 7]
Scores so far: [15.38, 38.46, 30.77, 30.77, 30.77, 38.46, 23.08, 30.77, 30.77, 38.46, 30.77]
Best score: 38.46
Average of max per entry across top 1 scores: 0.38461538461538464
Average of max per entry across top 2 scores: 0.6153846153846154
Average of max per entry across top 3 scores: 0.7692307692307693
Average of max per entry across top 5 scores: 0.8461538461538461
Average of max per entry across top 8 scores: 0.9230769230769231
Average of max per entry across top 9999 scores: 1.0
11 candidate programs found.





In [22]:
# Score the compiled RAG program on the dev set; kept as the last expression so the score is displayed.
evaluate_hotpot(rag_compiled)

Average Metric: 5 / 13  (38.5): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 1486.04it/s]

Average Metric: 5 / 13  (38.5%)



 'False' 'False' 'False' '✔️ [True]' '✔️ [True]' 'False']' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  df.loc[:, metric_name] = df[metric_name].apply(


Unnamed: 0,question,example_answer,rationale,pred_answer,answer_exact_match
0,Who has a broader scope of profession: E. L. Doctorow or Julia Peterkin?,E. L. Doctorow,"determine who has a broader scope of profession. We know that E. L. Doctorow was an American novelist, editor, and professor, while Julia Peterkin was...",E. L. Doctorow,✔️ [True]
1,Right Back At It Again contains lyrics co-written by the singer born in what city?,"Gainesville, Florida","produce the answer. We know that the song ""Right Back At It Again"" is from the album ""Common Courtesy"" by A Day to Remember. We...","The singer was born in Ocala, Florida.",False
2,What year was the party of the winner of the 1971 San Francisco mayoral election founded?,1828,find the year the party of the winner of the 1971 San Francisco mayoral election was founded. We know that the winner of the election...,1849,False
3,Anthony Dirrell is the brother of which super middleweight title holder?,Andre Dirrell,"produce the answer. We know that Anthony Dirrell is a professional boxer and the younger brother of Andre Dirrell, who is also a professional boxer....",Andre Dirrell,✔️ [True]
4,The sports nutrition business established by Oliver Cookson is based in which county in the UK?,Cheshire,"produce the answer. We know that Oliver Cookson is a UK entrepreneur who established the sports nutrition business Myprotein. We also know that in 2011,...",Cheshire,✔️ [True]
5,Find the birth date of the actor who played roles in First Wives Club and Searching for the Elephant.,"February 13, 1980","find the birth date of the actor. We will first need to identify the actor's name, which is Jo Dong-hyuk. Then, we will need to...","December 11, 1977",False
6,Kyle Moran was born in the town on what river?,Castletown River,"produce the answer. We know that Kyle Moran was born in Dundalk, Ireland and that there are three different pieces of information about people named...",The town of Dundalk is located near the Moran River.,False
7,"The actress who played the niece in the Priest film was born in what city, country?","Surrey, England",produce the answer. We know that the actress who played the niece in the Priest film was Jackie Joseph. We also know that Jackie Joseph...,"Sammie Jacqueline Joseph was born in Los Angeles, California, United States.",False
8,Name the movie in which the daughter of Noel Harrison plays Violet Trefusis.,Portrait of a Marriage,"produce the answer. We know that Cathryn Harrison is the daughter of Noel Harrison, who was an actor. We also know that Violet Trefusis was...","The movie is ""The Soul's Gymnasium"".",False
9,What year was the father of the Princes in the Tower born?,1442,determine the year of birth of the father of the Princes in the Tower. We know that the Princes in the Tower were born during...,1441,False


38.46

In [24]:
# Run the compiled RAG program on one dev question, then print the last prompt/completion
# recorded by the `turbo` LM client to see what was retrieved and how it was answered.
rag_compiled("What year was the party of the winner of the 1971 San Francisco mayoral election founded?")
turbo.inspect_history(n=1)





Given the fields `context`, `question`, produce the fields `answer`.

---

Question: Who produced the album that included a re-recording of "Lithium"?
Answer: Butch Vig

Question: In what year was the star of To Hell and Back born?
Answer: 1925

Question: Which award did the first book of Gary Zukav receive?
Answer: U.S. National Book Award

Question: Which author is English: John Braine or Studs Terkel?
Answer: John Braine

Question: What documentary about the Gilgo Beach Killer debuted on A&E?
Answer: The Killing Season

Question: Who was the director of the 2009 movie featuring Peter Outerbridge as William Easton?
Answer: Kevin Greutert

Question: The heir to the Du Pont family fortune sponsored what wrestling team?
Answer: Foxcatcher

---

Follow the following format.

Context: ${context}

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: ${answer}

---

Context:
[1] «San Diego mayoral election, 1971 | The 1971 San Die

# Multi-Hop Retrieval and Reasoning

In [33]:
from dsp.utils.utils import deduplicate

class MultiHop(dspy.Module):
    """Two-hop retrieve-then-read question-answering program.

    The first hop writes a search query from the question alone. The second
    hop writes another query after seeing the passages found by the first.
    The passages from both hops are then handed to a chain-of-thought answer
    step.
    """

    def __init__(self, num_passages=3):
        """Create the retriever and the three chain-of-thought steps.

        Args:
            num_passages: forwarded to ``dspy.Retrieve`` as ``k`` and used
                for both retrieval calls.
        """
        super().__init__()

        # A single retriever instance serves both hops.
        self.retrieve = dspy.Retrieve(k=num_passages)

        # Hop 1 query writer: sees only the question.
        self.generate_query = dspy.ChainOfThought("question -> search_query")

        # Hop 2 query writer: additionally sees the passages gathered so far.
        self.generate_query_from_context = dspy.ChainOfThought("context, question -> search_query")

        # Final step: answers the question from the combined context.
        self.generate_answer = dspy.ChainOfThought("context, question -> answer")

    def forward(self, question):
        """Answer ``question`` using passages collected over two retrieval hops.

        Args:
            question: the question text, passed unchanged to every step.

        Returns:
            Whatever ``self.generate_answer`` returns for the combined context
            (its signature declares an ``answer`` output field).
        """
        # Hop 1: query from the question only, then retrieve.
        first_hop = self.generate_query(question=question)
        passages = list(self.retrieve(first_hop.search_query).passages)

        # Hop 2: the follow-up query is conditioned on the hop-1 passages.
        second_hop = self.generate_query_from_context(
            context=deduplicate(passages),
            question=question,
        )
        passages.extend(self.retrieve(second_hop.search_query).passages)

        # Both hops' passages go through `deduplicate` before the answer step.
        return self.generate_answer(context=deduplicate(passages), question=question)

In [29]:
# Compile a fresh MultiHop program with `optimizer2` (created earlier in the notebook),
# using `train` as the training set and `dev` as the validation set for scoring candidates.
# NOTE(review): because `dev` drives candidate selection here, a later score reported on
# `dev` for `multihop_compiled` would be optimistic — confirm a separate held-out set is used.
# NOTE(review): the log below is dominated by "Backing off ... GPT3.request" retries,
# presumably API rate limiting — consider fewer parallel threads and clearing this output.
multihop_compiled = optimizer2.compile(MultiHop(), trainset=train, valset=dev)

Average Metric: 4 / 13  (30.8): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:03<00:00,  3.25it/s]
  df = df.applymap(truncate_cell)


Average Metric: 4 / 13  (30.8%)
Score: 30.77 for set: [0, 0, 0]
New best score: 30.77 for seed -3
Scores so far: [30.77]
Best score: 30.77


Average Metric: 7 / 13  (53.8): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  7.29it/s]


Average Metric: 7 / 13  (53.8%)
Score: 53.85 for set: [7, 7, 7]
New best score: 53.85 for seed -2
Scores so far: [30.77, 53.85]
Best score: 53.85


 43%|█████████████████████████████████████████████████████████████▋                                                                                  | 3/7 [00:08<00:11,  2.79s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 7 / 13  (53.8): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:05<00:00,  2.46it/s]


Average Metric: 7 / 13  (53.8%)
Score: 53.85 for set: [7, 7, 7]
Scores so far: [30.77, 53.85, 53.85]
Best score: 53.85
Average of max per entry across top 1 scores: 0.5384615384615384
Average of max per entry across top 2 scores: 0.8461538461538461
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


 57%|██████████████████████████████████████████████████████████████████████████████████▎                                                             | 4/7 [00:07<00:05,  1.79s/it]


Bootstrapped 2 full traces after 5 examples in round 0.


  0%|                                                                                                                                                       | 0/13 [00:00<?, ?it/s]

Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 1 / 1  (100.0):   8%|████████▌                                                                                                      | 1/13 [00:04<00:50,  4.18s/it]

Backing off 1.0 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.0 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.0 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.3 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.4 sec

Average Metric: 2 / 2  (100.0):  15%|█████████████████                                                                                              | 2/13 [00:06<00:37,  3.38s/it]

Backing off 1.0 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.3 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 3 / 3  (100.0):  23%|█████████████████████████▌                                                                                     | 3/13 [00:08<00:26,  2.64s/it]

Backing off 1.0 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.0 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.4 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 4  (100.0):  31%|██████████████████████████████████▏                                                                            | 4/13 [00:10<00:19,  2.14s/it]

Backing off 0.8 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 3.9 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 3.7 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.3 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.2 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 5  (80.0):  38%|███████████████████████████████████████████                                                                     | 5/13 [00:12<00:16,  2.10s/it]

Backing off 0.3 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.4 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 5 / 6  (83.3):  46%|███████████████████████████████████████████████████▋                                                            | 6/13 [00:13<00:12,  1.74s/it]

Backing off 0.5 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.5 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 5 / 7  (71.4):  54%|████████████████████████████████████████████████████████████▎                                                   | 7/13 [00:14<00:09,  1.64s/it]

Backing off 3.4 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 6 / 8  (75.0):  62%|████████████████████████████████████████████████████████████████████▉                                           | 8/13 [00:16<00:08,  1.61s/it]

Backing off 3.9 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.4 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 6 / 9  (66.7):  69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 9/13 [00:18<00:06,  1.69s/it]

Backing off 4.2 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 7 / 13  (53.8): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:24<00:00,  1.89s/it]


Average Metric: 7 / 13  (53.8%)
Score: 53.85 for set: [7, 7, 7]
Scores so far: [30.77, 53.85, 53.85, 53.85]
Best score: 53.85
Average of max per entry across top 1 scores: 0.5384615384615384
Average of max per entry across top 2 scores: 0.8461538461538461
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


 14%|████████████████████▌                                                                                                                           | 1/7 [00:02<00:17,  2.99s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


  0%|                                                                                                                                                       | 0/13 [00:00<?, ?it/s]

Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.0 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.4 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 1 / 2  (50.0):  15%|█████████████████▏                                                                                              | 2/13 [00:03<00:15,  1.40s/it]

Backing off 0.9 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.2 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.0 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.5 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 3  (66.7):  23%|█████████████████████████▊                                                                                      | 3/13 [00:06<00:20,  2.04s/it]

Backing off 0.3 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 3.9 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.3 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.6 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.4 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 3.4 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.3 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.4 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 3 / 4  (75.0):  31%|██████████████████████████████████▍                                                                             | 4/13 [00:07<00:16,  1.88s/it]

Backing off 1.9 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.9 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 4.7 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.2 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 3 / 5  (60.0):  38%|███████████████████████████████████████████                                                                     | 5/13 [00:10<00:18,  2.30s/it]

Backing off 1.3 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.3 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 8.0 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 6  (66.7):  46%|███████████████████████████████████████████████████▋                                                            | 6/13 [00:12<00:15,  2.24s/it]

Backing off 1.2 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 7  (57.1):  54%|████████████████████████████████████████████████████████████▎                                                   | 7/13 [00:14<00:12,  2.01s/it]

Backing off 0.4 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 8  (50.0):  62%|████████████████████████████████████████████████████████████████████▉                                           | 8/13 [00:15<00:08,  1.64s/it]

Backing off 15.4 seconds after 5 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 6.3 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 6 / 13  (46.2): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:34<00:00,  2.63s/it]


Average Metric: 6 / 13  (46.2%)
Score: 46.15 for set: [7, 7, 7]
Scores so far: [30.77, 53.85, 53.85, 53.85, 46.15]
Best score: 53.85
Average of max per entry across top 1 scores: 0.5384615384615384
Average of max per entry across top 2 scores: 0.8461538461538461
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


 14%|████████████████████▌                                                                                                                           | 1/7 [00:01<00:06,  1.16s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


  0%|                                                                                                                                                       | 0/13 [00:00<?, ?it/s]

Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 2  (100.0):  15%|█████████████████                                                                                              | 2/13 [00:03<00:17,  1.62s/it]

Backing off 0.2 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.3 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.4 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 3 / 3  (100.0):  23%|█████████████████████████▌                                                                                     | 3/13 [00:06<00:19,  1.97s/it]

Backing off 0.1 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.4 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 3 / 4  (75.0):  31%|██████████████████████████████████▍                                                                             | 4/13 [00:06<00:13,  1.51s/it]

Backing off 0.6 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.5 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.0 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 5  (80.0):  38%|███████████████████████████████████████████                                                                     | 5/13 [00:07<00:10,  1.33s/it]

Backing off 1.3 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.3 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 6  (66.7):  46%|███████████████████████████████████████████████████▋                                                            | 6/13 [00:09<00:10,  1.47s/it]

Backing off 1.9 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.3 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 5 / 7  (71.4):  54%|████████████████████████████████████████████████████████████▎                                                   | 7/13 [00:10<00:08,  1.36s/it]

Backing off 0.0 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.4 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 6 / 8  (75.0):  62%|████████████████████████████████████████████████████████████████████▉                                           | 8/13 [00:12<00:06,  1.31s/it]

Backing off 1.7 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 6 / 9  (66.7):  69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 9/13 [00:13<00:05,  1.27s/it]

Backing off 3.9 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 7.3 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 8 / 13  (61.5): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:21<00:00,  1.67s/it]


Average Metric: 8 / 13  (61.5%)
Score: 61.54 for set: [7, 7, 7]
New best score: 61.54 for seed 2
Scores so far: [30.77, 53.85, 53.85, 53.85, 46.15, 61.54]
Best score: 61.54
Average of max per entry across top 1 scores: 0.6153846153846154
Average of max per entry across top 2 scores: 0.8461538461538461
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


 14%|████████████████████▌                                                                                                                           | 1/7 [00:01<00:07,  1.24s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


  0%|                                                                                                                                                       | 0/13 [00:00<?, ?it/s]

Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.0 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 0 / 1  (0.0):   8%|████████▋                                                                                                        | 1/13 [00:02<00:31,  2.64s/it]

Backing off 0.1 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 0 / 2  (0.0):  15%|█████████████████▍                                                                                               | 2/13 [00:03<00:15,  1.43s/it]

Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.2 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.7 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.8 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 sec

Average Metric: 0 / 3  (0.0):  23%|██████████████████████████                                                                                       | 3/13 [00:05<00:19,  1.93s/it]

Backing off 0.1 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 3.4 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.6 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 3.9 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 0 / 4  (0.0):  31%|██████████████████████████████████▊                                                                              | 4/13 [00:07<00:16,  1.86s/it]

Backing off 2.7 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 3.1 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.4 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 6  (33.3):  46%|███████████████████████████████████████████████████▋                                                            | 6/13 [00:09<00:09,  1.36s/it]

Backing off 3.2 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 3 / 7  (42.9):  54%|████████████████████████████████████████████████████████████▎                                                   | 7/13 [00:10<00:07,  1.29s/it]

Backing off 0.6 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.3 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.5 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 6.2 seconds after 5 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 6 / 13  (46.2): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:20<00:00,  1.60s/it]


Average Metric: 6 / 13  (46.2%)
Score: 46.15 for set: [7, 7, 7]
Scores so far: [30.77, 53.85, 53.85, 53.85, 46.15, 61.54, 46.15]
Best score: 61.54
Average of max per entry across top 1 scores: 0.6153846153846154
Average of max per entry across top 2 scores: 0.8461538461538461
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


 14%|████████████████████▌                                                                                                                           | 1/7 [00:02<00:14,  2.44s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


  0%|                                                                                                                                                       | 0/13 [00:00<?, ?it/s]

Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.3 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.0 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.0 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.3 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 sec

Average Metric: 1 / 1  (100.0):   8%|████████▌                                                                                                      | 1/13 [00:04<00:53,  4.47s/it]

Backing off 1.0 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 2  (100.0):  15%|█████████████████                                                                                              | 2/13 [00:05<00:27,  2.51s/it]

Backing off 1.5 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {'max_tokens': 75, 'n': 1, 'temperature': 0.0}
Backing off 1.3 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.3 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.2 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.2 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.9 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 4  (50.0):  31%|██████████████████████████████████▍                                                                             | 4/13 [00:07<00:12,  1.40s/it]

Backing off 0.6 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.8 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 3.4 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.4 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.2 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 5  (40.0):  38%|███████████████████████████████████████████                                                                     | 5/13 [00:09<00:12,  1.52s/it]

Backing off 1.1 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.4 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 4.6 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 3.6 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 3 / 6  (50.0):  46%|███████████████████████████████████████████████████▋                                                            | 6/13 [00:12<00:16,  2.30s/it]

Backing off 4.5 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 3 / 7  (42.9):  54%|████████████████████████████████████████████████████████████▎                                                   | 7/13 [00:13<00:10,  1.78s/it]

Backing off 5.4 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 8  (50.0):  62%|████████████████████████████████████████████████████████████████████▉                                           | 8/13 [00:14<00:07,  1.54s/it]

Backing off 5.3 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 6 / 13  (46.2): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:21<00:00,  1.67s/it]


Average Metric: 6 / 13  (46.2%)
Score: 46.15 for set: [7, 7, 7]
Scores so far: [30.77, 53.85, 53.85, 53.85, 46.15, 61.54, 46.15, 46.15]
Best score: 61.54
Average of max per entry across top 1 scores: 0.6153846153846154
Average of max per entry across top 2 scores: 0.8461538461538461
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


 57%|██████████████████████████████████████████████████████████████████████████████████▎                                                             | 4/7 [00:07<00:05,  1.86s/it]


Bootstrapped 2 full traces after 5 examples in round 0.


  0%|                                                                                                                                                       | 0/13 [00:00<?, ?it/s]

Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.0 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.0 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.1 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 1 / 1  (100.0):   8%|████████▌                                                                                                      | 1/13 [00:04<00:50,  4.18s/it]

Backing off 0.1 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 2  (100.0):  15%|█████████████████                                                                                              | 2/13 [00:05<00:27,  2.48s/it]

Backing off 0.8 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.9 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.8 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.1 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 3  (66.7):  23%|█████████████████████████▊                                                                                      | 3/13 [00:06<00:19,  1.95s/it]

Backing off 0.8 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.4 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.4 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.0 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.9 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.3 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.3 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 3.8 sec

Average Metric: 3 / 4  (75.0):  31%|██████████████████████████████████▍                                                                             | 4/13 [00:10<00:23,  2.63s/it]

Backing off 1.1 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 6.1 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.4 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 5  (80.0):  38%|███████████████████████████████████████████                                                                     | 5/13 [00:12<00:18,  2.26s/it]

Backing off 1.4 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 4.7 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 6  (66.7):  46%|███████████████████████████████████████████████████▋                                                            | 6/13 [00:14<00:16,  2.35s/it]

Backing off 7.5 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 7  (57.1):  54%|████████████████████████████████████████████████████████████▎                                                   | 7/13 [00:15<00:11,  1.98s/it]

Backing off 5.1 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 5.6 seconds after 5 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 5 / 8  (62.5):  62%|████████████████████████████████████████████████████████████████████▉                                           | 8/13 [00:18<00:11,  2.28s/it]

Backing off 8.9 seconds after 5 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 7 / 10  (70.0):  77%|████████████████████████████████████████████████████████████████████████████████████▌                         | 10/13 [00:25<00:08,  2.96s/it]

Backing off 21.4 seconds after 6 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 8 / 13  (61.5): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:50<00:00,  3.90s/it]


Average Metric: 8 / 13  (61.5%)
Score: 61.54 for set: [7, 7, 7]
Scores so far: [30.77, 53.85, 53.85, 53.85, 46.15, 61.54, 46.15, 46.15, 61.54]
Best score: 61.54
Average of max per entry across top 1 scores: 0.6153846153846154
Average of max per entry across top 2 scores: 0.9230769230769231
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


 14%|████████████████████▌                                                                                                                           | 1/7 [00:01<00:06,  1.13s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 9 / 13  (69.2): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  8.64it/s]


Average Metric: 9 / 13  (69.2%)
Score: 69.23 for set: [7, 7, 7]
New best score: 69.23 for seed 6
Scores so far: [30.77, 53.85, 53.85, 53.85, 46.15, 61.54, 46.15, 46.15, 61.54, 69.23]
Best score: 69.23
Average of max per entry across top 1 scores: 0.6923076923076923
Average of max per entry across top 2 scores: 0.9230769230769231
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


 43%|█████████████████████████████████████████████████████████████▋                                                                                  | 3/7 [00:04<00:05,  1.39s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


  0%|                                                                                                                                                       | 0/13 [00:00<?, ?it/s]

Backing off 0.1 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 1 / 1  (100.0):   8%|████████▌                                                                                                      | 1/13 [00:04<00:53,  4.45s/it]

Backing off 0.2 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 2  (100.0):  15%|█████████████████                                                                                              | 2/13 [00:05<00:28,  2.56s/it]

Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.0 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.6 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 3  (66.7):  23%|█████████████████████████▊                                                                                      | 3/13 [00:06<00:17,  1.72s/it]

Backing off 1.8 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.3 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.4 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.0 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.6 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.6 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.6 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 2 / 4  (50.0):  31%|██████████████████████████████████▍                                                                             | 4/13 [00:08<00:18,  2.06s/it]

Backing off 2.9 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 3 / 5  (60.0):  38%|███████████████████████████████████████████                                                                     | 5/13 [00:09<00:13,  1.68s/it]

Backing off 0.8 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.1 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 2.3 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.9 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 7.9 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.6 seconds after 3 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.9 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 0.6 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 6  (66.7):  46%|███████████████████████████████████████████████████▋                                                            | 6/13 [00:15<00:20,  2.99s/it]

Backing off 2.0 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 7  (57.1):  54%|████████████████████████████████████████████████████████████▎                                                   | 7/13 [00:16<00:13,  2.33s/it]

Backing off 9.2 seconds after 5 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 1.5 seconds after 2 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 5.1 seconds after 4 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 8  (50.0):  62%|████████████████████████████████████████████████████████████████████▉                                           | 8/13 [00:20<00:14,  2.90s/it]

Backing off 0.4 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {'max_tokens': 75, 'n': 1, 'temperature': 0.0}


Average Metric: 4 / 9  (44.4):  69%|█████████████████████████████████████████████████████████████████████████████▌                                  | 9/13 [00:21<00:09,  2.37s/it]

Backing off 0.4 seconds after 5 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 4 / 10  (40.0):  77%|████████████████████████████████████████████████████████████████████████████████████▌                         | 10/13 [00:25<00:08,  2.92s/it]

Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}
Backing off 19.3 seconds after 6 tries calling function <function GPT3.request at 0x1129777e0> with kwargs {}


Average Metric: 6 / 13  (46.2): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:48<00:00,  3.75s/it]

Average Metric: 6 / 13  (46.2%)
Score: 46.15 for set: [7, 7, 7]
Scores so far: [30.77, 53.85, 53.85, 53.85, 46.15, 61.54, 46.15, 46.15, 61.54, 69.23, 46.15]
Best score: 69.23
Average of max per entry across top 1 scores: 0.6923076923076923
Average of max per entry across top 2 scores: 0.9230769230769231
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0
11 candidate programs found.





In [30]:
# Score the compiled multi-hop program on the held-out `dev` examples.
# The call is the cell's last expression, so its return value (the aggregate
# score) is what the notebook displays beneath the per-example results table.
evaluate_hotpot(multihop_compiled, devset=dev)

Average Metric: 9 / 13  (69.2): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 2084.41it/s]

Average Metric: 9 / 13  (69.2%)



 'False' '✔️ [True]' '✔️ [True]' '✔️ [True]' '✔️ [True]' '✔️ [True]']' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  df.loc[:, metric_name] = df[metric_name].apply(


Unnamed: 0,question,example_answer,rationale,pred_answer,answer_exact_match
0,Who has a broader scope of profession: E. L. Doctorow or Julia Peterkin?,E. L. Doctorow,"determine who has a broader scope of profession. We know that E. L. Doctorow is an American novelist, editor, and professor, while Julia Peterkin is...",E. L. Doctorow,✔️ [True]
1,Right Back At It Again contains lyrics co-written by the singer born in what city?,"Gainesville, Florida","find the city where the singer was born. We know that the song ""Right Back At It Again"" is by A Day to Remember, and...",Belleville,False
2,What year was the party of the winner of the 1971 San Francisco mayoral election founded?,1828,"find the year the party was founded. We know that the winner of the 1971 San Francisco mayoral election was Pete Wilson, and we know...",1998,False
3,Anthony Dirrell is the brother of which super middleweight title holder?,Andre Dirrell,"find the name of the super middleweight title holder. We know that Anthony Dirrell is the younger brother of Andre Dirrell, who is also a...",Andre Dirrell,✔️ [True]
4,The sports nutrition business established by Oliver Cookson is based in which county in the UK?,Cheshire,find the county where the sports nutrition business is based. We know that the business is called Myprotein and that it was established by Oliver...,Cheshire,✔️ [True]
5,Find the birth date of the actor who played roles in First Wives Club and Searching for the Elephant.,"February 13, 1980","find the birth date. We know that the actor played roles in First Wives Club and Searching for the Elephant, and we know that her...","January 14, 1971",False
6,Kyle Moran was born in the town on what river?,Castletown River,"find the town where Kyle Moran was born. We know that Kyle Moran is an Irish footballer, and we know that he was born on...",Castletown River,✔️ [True]
7,"The actress who played the niece in the Priest film was born in what city, country?","Surrey, England","find the city and country of birth. We know that the actress is Francesca Cardinale, and we know that she is the niece of Claudia...","Buenos Aires, Argentina",False
8,Name the movie in which the daughter of Noel Harrison plays Violet Trefusis.,Portrait of a Marriage,"find the movie. We know that the daughter of Noel Harrison is Cathryn Harrison, and we know that she played Violet Trefusis. We also know...",Portrait of a Marriage,✔️ [True]
9,What year was the father of the Princes in the Tower born?,1442,"find the year of birth. We know that the father of the Princes in the Tower is Edward IV, and we know that he died...",1442,✔️ [True]


69.23

In [32]:
# Run the compiled program once on a fresh question. The prediction itself is
# discarded here; the call is made so that the LM history below contains the
# prompts generated for this question.
multihop_compiled(question="Who purchased the team Michael Schumacher raced for in the 1995 Monaco Grand Prix in 2000?")
# Dump the 3 most recent LM interactions recorded on `turbo` so the prompts
# (including any bootstrapped demonstrations) can be read directly.
# NOTE(review): assumes one program call issues at least 3 LM requests, so that
# all three entries belong to this question — confirm against the module's hops.
turbo.inspect_history(n=3)





Given the fields `question`, produce the fields `search_query`.

---

Follow the following format.

Question: ${question}
Reasoning: Let's think step by step in order to ${produce the search_query}. We ...
Search Query: ${search_query}

---

Question: In what year was the star of To Hell and Back born?
Reasoning: Let's think step by step in order to find the year the star of To Hell and Back was born. We need to know the name of the star, then we can search for their birth year.
Search Query: To Hell and Back star birth year

---

Question: Who purchased the team Michael Schumacher raced for in the 1995 Monaco Grand Prix in 2000?
Reasoning: Let's think step by step in order to[32m find out who purchased the team Michael Schumacher raced for in the 1995 Monaco Grand Prix in 2000. We need to know the name of the team, then we can search for who purchased it in 2000.
Search Query: 1995 Monaco Grand Prix team purchase 2000[0m







Given the fields `context`, `question`, produce the