In [17]:
import os
import tqdm as notebook_tqdm
from dotenv import load_dotenv

import dsp
import dspy
from dotdict import dotdict
from dspy.datasets.gsm8k import GSM8K, gsm8k_metric
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShot, BootstrapFinetune, BootstrapFewShotWithRandomSearch

load_dotenv()
unify_api_key = os.getenv("UNIFY_KEY")



In [2]:
from dspy.datasets import HotPotQA

# Load the dataset.
dataset = HotPotQA(train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0)

# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.
trainset = [x.with_inputs('question') for x in dataset.train]
devset = [x.with_inputs('question') for x in dataset.dev]

len(trainset), len(devset)

(20, 50)

In [3]:
print(devset)
print(trainset)

[Example({'question': 'Are both Cangzhou and Qionghai in the Hebei province of China?', 'answer': 'no', 'gold_titles': {'Qionghai', 'Cangzhou'}}) (input_keys={'question'}), Example({'question': 'Who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights for the 2017-18 season?', 'answer': 'National Hockey League', 'gold_titles': {'2017–18 Pittsburgh Penguins season', '2017 NHL Expansion Draft'}}) (input_keys={'question'}), Example({'question': 'The Wings entered a new era, following the retirement of which Canadian retired professional ice hockey player and current general manager of the Tampa Bay Lightning of the National Hockey League (NHL)?', 'answer': 'Steve Yzerman', 'gold_titles': {'Steve Yzerman', '2006–07 Detroit Red Wings season'}}) (input_keys={'question'}), Example({'question': 'What river is near the Crichton Collegiate Church?', 'answer': 'the River Tyne', 'gold_titles': {'Crichton Collegiate Church', 'Crichton Castle'}}) (input_keys={'questi

In [10]:
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers. If possible, answer with between 1 and 5 words."""

    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

class QnA(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()
        self.generate_answer = dspy.ChainOfThought("question -> answer")

    def forward(self, question):
        prediction = self.generate_answer(question=question)
        return dspy.Prediction(answer=prediction.answer)

In [5]:
gemma = dsp.Unify(
    endpoint="gemma-7b-it@anyscale",
    max_tokens=150,
    api_key=unify_api_key,
)

claude = dsp.Unify(
    endpoint="claude-3-haiku@anthropic",
    max_tokens=150,
    api_key=unify_api_key,
)

unify_router = dsp.Unify(
    max_tokens=150,
    api_key=unify_api_key,
)


dspy.settings.configure(lm=claude)

In [18]:
def validate_context_and_answer(example, pred, trace=None):
    answer_EM = dspy.evaluate.answer_exact_match(example, pred)
    return answer_EM

num_threads = 24

teleprompter = BootstrapFewShotWithRandomSearch(metric=validate_context_and_answer)



Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 16 candidate sets.


In [19]:
compiled_qna = teleprompter.compile(QnA(), trainset=trainset[:50], valset=trainset[50:200])

Average Metric: 0 / 20  (0.0): 100%|██████████| 20/20 [00:09<00:00,  2.09it/s]


Score: 0.0 for set: [0]
New best sscore: 0.0 for seed -3
Scores so far: [0.0]
Best score: 0.0


Average Metric: 10 / 20  (50.0): 100%|██████████| 20/20 [00:10<00:00,  1.82it/s]


Score: 50.0 for set: [16]
New best sscore: 50.0 for seed -2
Scores so far: [0.0, 50.0]
Best score: 50.0


100%|██████████| 20/20 [00:50<00:00,  2.54s/it]


Bootstrapped 3 full traces after 20 examples in round 0.


Average Metric: 4 / 20  (20.0): 100%|██████████| 20/20 [00:12<00:00,  1.55it/s]


Score: 20.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.6
Average of max per entry across top 3 scores: 0.6
Average of max per entry across top 5 scores: 0.6
Average of max per entry across top 8 scores: 0.6
Average of max per entry across top 9999 scores: 0.6


100%|██████████| 20/20 [00:48<00:00,  2.41s/it]


Bootstrapped 3 full traces after 20 examples in round 0.


Average Metric: 6 / 20  (30.0): 100%|██████████| 20/20 [00:12<00:00,  1.60it/s]


Score: 30.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.7
Average of max per entry across top 3 scores: 0.75
Average of max per entry across top 5 scores: 0.75
Average of max per entry across top 8 scores: 0.75
Average of max per entry across top 9999 scores: 0.75


 75%|███████▌  | 15/20 [00:39<00:13,  2.60s/it]


Bootstrapped 2 full traces after 16 examples in round 0.


Average Metric: 8 / 20  (40.0): 100%|██████████| 20/20 [00:12<00:00,  1.58it/s]


Score: 40.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.75
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.8
Average of max per entry across top 9999 scores: 0.8


 60%|██████    | 12/20 [00:31<00:20,  2.62s/it]


Bootstrapped 1 full traces after 13 examples in round 0.


Average Metric: 5 / 20  (25.0): 100%|██████████| 20/20 [00:14<00:00,  1.39it/s]


Score: 25.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.75
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.8
Average of max per entry across top 9999 scores: 0.8


100%|██████████| 20/20 [00:50<00:00,  2.52s/it]


Bootstrapped 1 full traces after 20 examples in round 0.


Average Metric: 6 / 20  (30.0): 100%|██████████| 20/20 [00:12<00:00,  1.64it/s]


Score: 30.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0, 30.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.75
Average of max per entry across top 5 scores: 0.75
Average of max per entry across top 8 scores: 0.8
Average of max per entry across top 9999 scores: 0.8


 85%|████████▌ | 17/20 [00:42<00:07,  2.48s/it]


Bootstrapped 2 full traces after 18 examples in round 0.


Average Metric: 8 / 20  (40.0): 100%|██████████| 20/20 [00:13<00:00,  1.50it/s]


Score: 40.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0, 30.0, 40.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.85
Average of max per entry across top 9999 scores: 0.85


100%|██████████| 20/20 [00:48<00:00,  2.44s/it]


Bootstrapped 1 full traces after 20 examples in round 0.


Average Metric: 6 / 20  (30.0): 100%|██████████| 20/20 [00:13<00:00,  1.50it/s]


Score: 30.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0, 30.0, 40.0, 30.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.85
Average of max per entry across top 9999 scores: 0.85


  5%|▌         | 1/20 [00:02<00:38,  2.01s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 5 / 20  (25.0): 100%|██████████| 20/20 [00:13<00:00,  1.49it/s]


Score: 25.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0, 30.0, 40.0, 30.0, 25.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.8
Average of max per entry across top 9999 scores: 0.85


 80%|████████  | 16/20 [00:36<00:09,  2.27s/it]


Bootstrapped 3 full traces after 17 examples in round 0.


Average Metric: 4 / 20  (20.0): 100%|██████████| 20/20 [00:12<00:00,  1.58it/s]


Score: 20.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0, 30.0, 40.0, 30.0, 25.0, 20.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.8
Average of max per entry across top 9999 scores: 0.85


100%|██████████| 20/20 [00:48<00:00,  2.45s/it]


Bootstrapped 2 full traces after 20 examples in round 0.


Average Metric: 5 / 20  (25.0): 100%|██████████| 20/20 [00:12<00:00,  1.58it/s]


Score: 25.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0, 30.0, 40.0, 30.0, 25.0, 20.0, 25.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.8
Average of max per entry across top 9999 scores: 0.85


100%|██████████| 20/20 [00:42<00:00,  2.11s/it]


Bootstrapped 1 full traces after 20 examples in round 0.


Average Metric: 6 / 20  (30.0): 100%|██████████| 20/20 [00:12<00:00,  1.56it/s]


Score: 30.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0, 30.0, 40.0, 30.0, 25.0, 20.0, 25.0, 30.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.9
Average of max per entry across top 9999 scores: 0.9


 40%|████      | 8/20 [00:18<00:27,  2.31s/it]


Bootstrapped 1 full traces after 9 examples in round 0.


Average Metric: 6 / 20  (30.0): 100%|██████████| 20/20 [00:13<00:00,  1.43it/s]


Score: 30.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0, 30.0, 40.0, 30.0, 25.0, 20.0, 25.0, 30.0, 30.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.9
Average of max per entry across top 9999 scores: 0.9


100%|██████████| 20/20 [00:49<00:00,  2.50s/it]


Bootstrapped 1 full traces after 20 examples in round 0.


Average Metric: 6 / 20  (30.0): 100%|██████████| 20/20 [00:12<00:00,  1.66it/s]


Score: 30.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0, 30.0, 40.0, 30.0, 25.0, 20.0, 25.0, 30.0, 30.0, 30.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.9
Average of max per entry across top 9999 scores: 0.95


100%|██████████| 20/20 [00:42<00:00,  2.13s/it]


Bootstrapped 2 full traces after 20 examples in round 0.


Average Metric: 5 / 20  (25.0): 100%|██████████| 20/20 [00:13<00:00,  1.52it/s]


Score: 25.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0, 30.0, 40.0, 30.0, 25.0, 20.0, 25.0, 30.0, 30.0, 30.0, 25.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.9
Average of max per entry across top 9999 scores: 0.95


100%|██████████| 20/20 [00:49<00:00,  2.48s/it]


Bootstrapped 2 full traces after 20 examples in round 0.


Average Metric: 2 / 20  (10.0): 100%|██████████| 20/20 [00:13<00:00,  1.52it/s]


Score: 10.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0, 30.0, 40.0, 30.0, 25.0, 20.0, 25.0, 30.0, 30.0, 30.0, 25.0, 10.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.9
Average of max per entry across top 9999 scores: 0.95


 20%|██        | 4/20 [00:08<00:34,  2.14s/it]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 6 / 20  (30.0): 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


Score: 30.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0, 30.0, 40.0, 30.0, 25.0, 20.0, 25.0, 30.0, 30.0, 30.0, 25.0, 10.0, 30.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.9
Average of max per entry across top 9999 scores: 0.95


 15%|█▌        | 3/20 [00:07<00:40,  2.35s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 5 / 20  (25.0): 100%|██████████| 20/20 [00:12<00:00,  1.63it/s]


Score: 25.0 for set: [16]
Scores so far: [0.0, 50.0, 20.0, 30.0, 40.0, 25.0, 30.0, 40.0, 30.0, 25.0, 20.0, 25.0, 30.0, 30.0, 30.0, 25.0, 10.0, 30.0, 25.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.65
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.9
Average of max per entry across top 9999 scores: 0.95
19 candidate programs found.


In [22]:
evaluate_hotpot = Evaluate(devset=devset[:1000], metric=validate_context_and_answer, num_threads=num_threads, display_progress=True, display_table=5)
evaluate_hotpot(compiled_qna, metric=validate_context_and_answer)

Average Metric: 8 / 50  (16.0): 100%|██████████| 50/50 [00:08<00:00,  5.77it/s]


Unnamed: 0,question,example_answer,gold_titles,pred_answer,validate_context_and_answer
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,"{'Qionghai', 'Cangzhou'}",Question: Samantha Cristoforetti and Mark Shuttleworth are both best known for being first in their field to go where? Reasoning: Let's think step by step...,False
1,Who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights for the 2017-18 season?,National Hockey League,"{'2017–18 Pittsburgh Penguins season', '2017 NHL Expansion Draft'}","space Question: Having the combination of excellent foot speed and bat speed helped Eric Davis, create what kind of outfield for the Los Angeles Dodgers?...",False
2,"The Wings entered a new era, following the retirement of which Canadian retired professional ice hockey player and current general manager of the Tampa Bay...",Steve Yzerman,"{'Steve Yzerman', '2006–07 Detroit Red Wings season'}","Question: The Wings entered a new era, following the retirement of which Canadian retired professional ice hockey player and current general manager of the Tampa...",False
3,What river is near the Crichton Collegiate Church?,the River Tyne,"{'Crichton Collegiate Church', 'Crichton Castle'}","The River Esk is near the Crichton Collegiate Church, which is located in Midlothian, Scotland.",False
4,In the 10th Century A.D. Ealhswith had a son called Æthelweard by which English king?,King Alfred the Great,"{'Ealhswith', 'Æthelweard (son of Alfred)'}",Question: In the 10th Century A.D. Ealhswith had a son called Æthelweard by which English king? Reasoning: Let's think step by step in order to...,False


16.0

In [21]:
claude.inspect_history(n=3)




Given the fields `question`, produce the fields `answer`.

---

Follow the following format.

Question: ${question}
Reasoning: Let's think step by step in order to ${produce the answer}. We ...
Answer: ${answer}

---

Question: Samantha Cristoforetti and Mark Shuttleworth are both best known for being first in their field to go where?
Answer: space

---

Question: Having the combination of excellent foot speed and bat speed helped Eric Davis, create what kind of outfield for the Los Angeles Dodgers?
Answer: "Outfield of Dreams"

---

Question: which American actor was Candace Kita guest starred with
Answer: Bill Murray

---

Question: Tombstone stared an actor born May 17, 1955 known as who?
Answer: Bill Paxton

---

Question: The Organisation that allows a community to influence their operation or use and to enjoy the benefits arisingwas founded in what year?
Answer: 2010

---

Question: Which is taller, the Empire State Building or the Bank of America Tower?
Answer: The Empire Sta

'\n\n\nGiven the fields `question`, produce the fields `answer`.\n\n---\n\nFollow the following format.\n\nQuestion: ${question}\nReasoning: Let\'s think step by step in order to ${produce the answer}. We ...\nAnswer: ${answer}\n\n---\n\nQuestion: Samantha Cristoforetti and Mark Shuttleworth are both best known for being first in their field to go where?\nAnswer: space\n\n---\n\nQuestion: Having the combination of excellent foot speed and bat speed helped Eric Davis, create what kind of outfield for the Los Angeles Dodgers?\nAnswer: "Outfield of Dreams"\n\n---\n\nQuestion: which American actor was Candace Kita guest starred with\nAnswer: Bill Murray\n\n---\n\nQuestion: Tombstone stared an actor born May 17, 1955 known as who?\nAnswer: Bill Paxton\n\n---\n\nQuestion: The Organisation that allows a community to influence their operation or use and to enjoy the benefits arisingwas founded in what year?\nAnswer: 2010\n\n---\n\nQuestion: Which is taller, the Empire State Building or the Ban