In [2]:
import logging
import os

import dspy
# Configure GPT-4o as the default LM for DSPy. The key is read from the
# environment so it never has to appear in the notebook source.
gpt4o = dspy.LM('openai/gpt-4o', api_key=os.getenv('OPENAI_API_KEY'), temperature=0.7)
dspy.configure(lm=gpt4o)
# Report only whether the key is present. Printing the value itself would store
# the secret in the notebook's saved cell output.
print('OPENAI_API_KEY is set:', bool(os.getenv('OPENAI_API_KEY')))




[REDACTED: an OpenAI API key was printed here and saved with the notebook output. Revoke and rotate that key.]


In [3]:
import random
from dspy.datasets import DataLoader

# Columns to pull from the HoVer dataset; only `claim` is marked as an input.
kwargs = dict(
    fields=("claim", "supporting_facts", "hpqa_id", "num_hops"),
    input_keys=("claim",),
)
hover = DataLoader().from_huggingface(
    dataset_name="hover-nlp/hover",
    split="train",
    trust_remote_code=True,
    **kwargs,
)

# Keep 3-hop claims only, and only the first claim seen for each hpqa_id.
hpqa_ids = set()
deduped = []
for row in hover:
    if row["num_hops"] != 3 or row["hpqa_id"] in hpqa_ids:
        continue
    hpqa_ids.add(row["hpqa_id"])
    # Gold labels are the distinct page keys among the supporting facts.
    gold_titles = list({fact["key"] for fact in row.supporting_facts})
    deduped.append(dspy.Example(claim=row.claim, titles=gold_titles).with_inputs("claim"))
hover = deduped

# Fixed-seed shuffle so the splits are the same on every run.
random.Random(0).shuffle(hover)
# Note: indices 200..649 are not assigned to any of the three splits.
trainset, devset, testset = hover[:100], hover[100:200], hover[650:]

Downloading data: 100%|██████████| 9.21M/9.21M [00:00<00:00, 18.4MB/s]
Downloading data: 100%|██████████| 2.15M/2.15M [00:00<00:00, 12.3MB/s]
Downloading data: 100%|██████████| 899k/899k [00:00<00:00, 21.7MB/s]
Generating train split: 100%|██████████| 18171/18171 [00:00<00:00, 23248.37 examples/s]
Generating validation split: 100%|██████████| 4000/4000 [00:00<00:00, 28531.03 examples/s]
Generating test split: 100%|██████████| 4000/4000 [00:00<00:00, 44299.79 examples/s]


In [6]:
import pickle

# Persist each split with pickle. The files are binary pickle data even though
# they carry a .txt extension.
for split_path, split_examples in (
    ('trainset.txt', trainset),
    ('devset.txt', devset),
    ('testset.txt', testset),
):
    with open(split_path, 'wb') as split_file:
        pickle.dump(split_examples, split_file)


In [7]:
# Show the first training example: the claim and its gold page titles.
example = trainset[0]

print(f"Claim: {example.claim}")
print(f"Pages that must be retrieved: {example.titles}")

Claim: This director is known for his work on Miss Potter. The Academy of Motion Picture Arts and Sciences presents the award in which he was nominated for his work in "Babe".
Pages that must be retrieved: ['Chris Noonan', 'Miss Potter', 'Academy Award for Best Director']


In [8]:
# Cache of every passage retrieved so far: page title -> passage text.
# Filled in as a side effect of search().
DOCS = {}

def search(query: str, k: int) -> list[str]:
    """Retrieve the top-k passages for a query from the ColBERTv2 endpoint.

    Every passage of the form "<title> | <text>" is also recorded in the
    module-level DOCS cache under its title. A passage without the " | "
    separator is still returned but is not cached, so one malformed passage
    no longer raises ValueError and aborts the whole call.

    Args:
        query: Free-text search query.
        k: Number of passages to request.

    Returns:
        The retrieved passage strings, in the order the retriever returned them.
    """
    hits = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')(query, k=k)
    passages = [hit['text'] for hit in hits]

    for passage in passages:
        # partition() splits on the first separator only, like split(" | ", 1),
        # but yields an empty separator instead of failing when it is absent.
        title, separator, text = passage.partition(" | ")
        if separator:
            DOCS[title] = text

    return passages

In [9]:
def search_wikipedia(query: str) -> list[str]:
    """Returns top-5 results and then the titles of the top-5 to top-30 results."""
    # NOTE(review): the docstring above is left verbatim because dspy.ReAct
    # presumably shows it to the LM as the tool description. Confirm before rewording.

    retrieved = search(query, 30)

    # The first five passages are returned in full.
    top_passages = retrieved[:5]

    # The remaining passages (positions 5..29) contribute only their
    # backtick-quoted titles.
    other_titles = []
    for passage in retrieved[5:30]:
        other_titles.append(f"`{passage.split(' | ')[0]}`")

    summary = f"Other retrieved pages have titles: {', '.join(other_titles)}."
    return top_passages + [summary]

def lookup_wikipedia(title: str) -> str:
    """Returns the text of the Wikipedia page, if it exists."""
    # NOTE(review): the docstring above is left verbatim because dspy.ReAct
    # presumably shows it to the LM as the tool description. Confirm before rewording.

    # Fast path: the page was already seen by an earlier search() call.
    if title in DOCS:
        return DOCS[title]

    # Otherwise search by title and take the first passage whose title matches
    # exactly. The "<title> | " prefix is stripped so a cache miss returns the
    # same thing as a cache hit (text only). Previously a miss returned
    # "title | text", so the output depended on what happened to be cached.
    prefix = title + " | "
    for passage in search(title, 10):
        if passage.startswith(prefix):
            return passage[len(prefix):]

    return f"No Wikipedia page found for title: {title}"

In [10]:
# Task description attached to the signature as its instructions.
instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."

# One input field (`claim`) and one output field (`titles`, a list of strings).
signature = dspy.Signature("claim -> titles: list[str]", instructions)

# ReAct agent that may call the two Wikipedia tools, with max_iters set to 20.
react = dspy.ReAct(
    signature,
    tools=[search_wikipedia, lookup_wikipedia],
    max_iters=20,
)

In [11]:
# Smoke-test the agent on one claim and show at most the first three titles.
sample_prediction = react(claim="David Gregory was born in 1625.")
sample_prediction.titles[:3]


['David Gregory (physician)']

In [12]:
def top5_recall(example, pred, trace=None):
    """Recall of the gold titles within the first five predicted titles.

    Args:
        example: Object whose `titles` attribute lists the gold page titles.
        pred: Object whose `titles` attribute lists the predicted titles.
            A missing (None) prediction is treated as an empty list.
        trace: None for plain evaluation; any other value switches to the
            strict pass/fail mode used when bootstrapping for optimization.

    Returns:
        When `trace` is None, the fraction of gold titles found in the top
        five predictions (a float in [0, 1]). Otherwise a bool that is True
        only when every gold title was found. An example with no gold titles
        scores 0.0 (False in strict mode) instead of raising
        ZeroDivisionError.
    """
    gold_titles = example.titles
    # Slice once rather than once per gold title.
    top5 = (pred.titles or [])[:5]

    if gold_titles:
        recall = sum(title in top5 for title in gold_titles) / len(gold_titles)
    else:
        recall = 0.0

    # If we're "bootstrapping" for optimization, return True if and only if the recall is perfect.
    if trace is not None:
        return recall >= 1.0

    # If we're just doing inference, just measure the recall.
    return recall

# Evaluator that scores a program on the dev split with top5_recall.
evaluate = dspy.Evaluate(
    devset=devset,
    metric=top5_recall,
    num_threads=16,
    display_progress=True,
    display_table=5,
)

In [13]:
def safe_react(claim: str):
    """Run the ReAct agent on a claim, falling back to an empty prediction on error.

    Evaluation is best-effort: one failing example must not abort the run.
    Each failure is logged as a warning so that a low score caused by
    exceptions can be told apart from one caused by wrong answers.

    Args:
        claim: The claim to find supporting Wikipedia titles for.

    Returns:
        Whatever `react` returns, or `dspy.Prediction(titles=[])` if it raised.
    """
    try:
        return react(claim=claim)
    except Exception as e:
        logging.getLogger(__name__).warning(
            "react failed for claim %r: %s: %s", claim, type(e).__name__, e
        )
        return dspy.Prediction(titles=[])

# Score the exception-tolerant agent wrapper on the dev split.
evaluate(safe_react)

Average Metric: 3.00 / 73 (4.1%):  73%|███████▎  | 73/100 [03:45<01:27,  3.25s/it] 