In [1]:
from dotenv import load_dotenv
import dspy
from dspy.datasets import HotPotQA
from dspy.evaluate import answer_exact_match
from hotpotqa_agent import hotpotqa_agent

In [2]:
# Build the HotPotQA dataset wrapper, requesting 3000 training examples.
# NOTE(review): eval_seed presumably fixes the sampling of the evaluation
# splits -- confirm against the DSPy Dataset base class.
hotpot_qa_ds = HotPotQA(train_size=3000, eval_seed=2025)
# Bare expression so the notebook displays the object's repr.
hotpot_qa_ds

  from .autonotebook import tqdm as notebook_tqdm
`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'hotpot_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'hotpot_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'hotpot_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
`trust_remote_code` is not supported anym

<dspy.datasets.hotpotqa.HotPotQA at 0x1038c4ce0>

In [3]:
# Load environment variables from the local .env file (the model provider's
# credentials are expected to live there).
load_dotenv(".env")

# Configure DSPY with a language model
# The sampling temperature is an LM-level generation setting, so it is passed
# to dspy.LM. Handing it to dspy.configure() only stores it as a global
# setting and does not change how the LM samples.
lm = dspy.LM("gemini/gemini-2.5-flash", cache=False, temperature=0.7)
dspy.configure(lm=lm)

In [4]:
def get_pred_message_responses(pred):
    """Rebuild the prompt/response conversation for every agent step of ``pred``.

    The number of steps is ``len(pred.trajectory) // 4`` (the trajectory is
    assumed to hold four entries per step -- TODO confirm against the agent).
    Conversations are read from DSPy's ``GLOBAL_HISTORY``: the newest record is
    skipped and the ``pred_steps`` records before it are used, oldest first.
    NOTE(review): the skipped newest record is presumably the agent's final
    answer-extraction call -- verify.

    Args:
        pred: prediction object exposing a sized ``trajectory`` attribute.

    Returns:
        A list with one conversation per step. Each conversation is a shallow
        copy of the record's ``"messages"`` list with the model's reply
        appended as ``{"role": "assistant", "content": ...}``.

    Raises:
        ValueError: if the history holds fewer than ``pred_steps + 1`` records.
            Without this check the negative index would wrap around and
            silently return unrelated turns.
    """
    from dspy.clients.base_lm import GLOBAL_HISTORY

    pred_steps = len(pred.trajectory) // 4
    if pred_steps == 0:
        return []

    history_len = len(GLOBAL_HISTORY)
    if history_len < pred_steps + 1:
        raise ValueError(
            f"LM history has {history_len} records but {pred_steps + 1} are "
            "required to recover the agent steps"
        )

    # Index of the first step's record; the final record (history_len - 1) is skipped.
    first_index = history_len - pred_steps - 1

    message_responses = []
    for history_step_index in range(first_index, first_index + pred_steps):
        record = GLOBAL_HISTORY[history_step_index]
        # Copy so that appending the reply does not mutate the stored history.
        conversation = record["messages"].copy()
        response = record["response"].choices[0].message.content
        conversation.append({"role": "assistant", "content": response})
        message_responses.append(conversation)

    return message_responses

In [5]:
from datasets import Dataset


# Convert your pred_message_responses to the format needed for HF dataset
def create_hf_dataset(message_responses_list, question, trace_exact_match):
    """Build a HuggingFace ``Dataset`` holding one row per conversation.

    Every row carries the conversation itself, the shared ``question``, the
    conversation's position in ``message_responses_list`` (as ``step``) and the
    shared ``trace_exact_match`` flag.
    """
    rows = [
        {
            "conversations": turns,
            "question": question,
            "step": position,
            "trace_exact_match": trace_exact_match,
        }
        for position, turns in enumerate(message_responses_list)
    ]
    return Dataset.from_list(rows)

In [6]:
import os, json, itertools

# Run configuration for trace collection.
NUM_QUESTIONS = 1500  # upper bound on the training-set index processed by the loop below
QUESTION_SAVE_INTERVAL = 100  # a checkpoint is written every this many questions
DATASET_SAVE_DIR = "hotpotqa_agent_training_data"  # directory holding checkpoint_<n>.json files
# Create the checkpoint directory up front; exist_ok makes re-running the cell safe.
os.makedirs(DATASET_SAVE_DIR, exist_ok=True)


def save_checkpoint(data, n):
    """Write ``data`` as JSON to ``checkpoint_<n>.json`` in DATASET_SAVE_DIR.

    The payload is first written to a sibling ``.tmp`` file and then moved into
    place with ``os.replace``. An interrupted or failed dump therefore never
    leaves a truncated ``checkpoint_<n>.json`` behind, and an existing
    checkpoint with the same name stays intact until the new one is complete.

    Args:
        data: JSON-serialisable object to persist.
        n: question count used in the checkpoint file name.

    Raises:
        TypeError / ValueError: propagated from ``json.dump`` when ``data``
            cannot be serialised. The temporary file is removed first.
    """
    p = os.path.join(DATASET_SAVE_DIR, f"checkpoint_{n}.json")
    # The ".tmp" suffix keeps the partial file out of checkpoint discovery,
    # which only considers names ending in ".json".
    tmp_path = f"{p}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f)
        os.replace(tmp_path, p)
    except BaseException:
        # Also covers KeyboardInterrupt from a kernel interrupt; clean up and re-raise.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    print(f"Saved checkpoint at question {n} to {p}")


def load_checkpoint(p):
    """Read the JSON checkpoint at path ``p`` and return the decoded object.

    The file is opened as UTF-8 explicitly so that decoding does not depend on
    the platform's locale encoding.
    """
    with open(p, encoding="utf-8") as f:
        return json.load(f)


def find_latest_checkpoint():
    """Locate the checkpoint with the highest question number in DATASET_SAVE_DIR.

    Only files named ``checkpoint_<digits>.json`` are considered. Other files
    that merely share the prefix and suffix (for example
    ``checkpoint_final.json``) are skipped instead of raising ``ValueError``.

    Returns:
        ``(n, path)`` where ``n`` is the highest question number found and
        ``path`` is the path of that file as it exists on disk, or
        ``(0, None)`` when no checkpoint file is present.
    """
    prefix, suffix = "checkpoint_", ".json"
    best_n, best_name = 0, None
    for name in os.listdir(DATASET_SAVE_DIR):
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        stem = name[len(prefix) : -len(suffix)]
        # Accept plain ASCII digit runs only; anything else is not a numbered checkpoint.
        if not (stem.isascii() and stem.isdigit()):
            continue
        n = int(stem)
        if best_name is None or n > best_n:
            best_n, best_name = n, name
    if best_name is None:
        return 0, None
    # Join the real file name so the returned path always refers to an existing file.
    return best_n, os.path.join(DATASET_SAVE_DIR, best_name)


# Resume from the newest checkpoint when one exists; otherwise start with an
# empty list of entries at question 0.
start_n, ckpt_path = find_latest_checkpoint()
if ckpt_path and os.path.exists(ckpt_path):
    dataset_entries = load_checkpoint(ckpt_path)
    print(f"Found latest checkpoint at question {start_n}: {ckpt_path}")
    print(
        f"Resuming from question {start_n} with {len(dataset_entries)} existing entries"
    )
else:
    print("No checkpoint found, starting from the beginning")
    dataset_entries, start_n = [], 0

# Skip the first start_n training examples (already covered by the checkpoint)
# and stop at index NUM_QUESTIONS. The stop bound is clamped to start_n so that
# resuming from a checkpoint beyond NUM_QUESTIONS gives an empty run rather
# than a ValueError from a negative islice() length.
train_iter = itertools.islice(
    hotpot_qa_ds.train, start_n, max(start_n, NUM_QUESTIONS)
)

processed = 0
for i, ex in enumerate(train_iter, start=start_n):
    q = ex["question"]
    print(f"Question #{i}: {q}")
    try:
        pred = hotpotqa_agent(question=q)
        tem = answer_exact_match(ex, pred)
        print(f"Trace exact match: {tem}")
        # One dataset row per agent step, all tagged with the trace-level match flag.
        for step, convo in enumerate(get_pred_message_responses(pred)):
            dataset_entries.append(
                {
                    "conversations": convo,
                    "question": q,
                    "answer": ex["answer"],
                    "step": step,
                    "trace_exact_match": tem,
                }
            )
    except Exception as e:
        # Best effort: a failing question is reported and skipped so the run continues.
        print(f"Error processing question {i}: {e}")

    processed += 1
    if (
        i + 1
    ) % QUESTION_SAVE_INTERVAL == 0:  # (i+1) is the 1-based count of seen examples
        save_checkpoint(dataset_entries, i + 1)

# final save if last block didn't land on interval; skipped when nothing new was
# processed, because the checkpoint we resumed from already holds this data
if processed and (start_n + processed) % QUESTION_SAVE_INTERVAL:
    save_checkpoint(dataset_entries, start_n + processed)


No checkpoint found, starting from the beginning
Question #0: The city where the Anubis Shrine was found was known to the ancient Egyptians as what?
Trace exact match: True
Question #1: Which is farther south, Palm Beach International Airport or Fairbanks International Airport?
Trace exact match: True
Question #1: Which is farther south, Palm Beach International Airport or Fairbanks International Airport?
Trace exact match: False
Question #2: What type of district does Electoral district of Ashford and Keswick, South Australia have in common?
Trace exact match: False
Question #2: What type of district does Electoral district of Ashford and Keswick, South Australia have in common?
Trace exact match: False
Question #3: FreeMIDI was a program made for the operating system developed by what company?
Trace exact match: False
Question #3: FreeMIDI was a program made for the operating system developed by what company?
Trace exact match: True
Question #4: What member of the country music group



Trace exact match: False
Question #205: What film did Peter Best make in 1986 that was set in the Australian Outback and in New York City?
Trace exact match: True
Question #206: What animation studio are both Susanne Blakeslee and the game Kingdom Hearts associated with?
Trace exact match: True
Question #206: What animation studio are both Susanne Blakeslee and the game Kingdom Hearts associated with?
Trace exact match: True
Question #207: What genre of films were the films The Devil Came on Horseback and Comic Book Confidential?
Trace exact match: True
Question #207: What genre of films were the films The Devil Came on Horseback and Comic Book Confidential?
Trace exact match: False
Question #208: Love's Travel Stops and Country Stores is a chain of truck stops that include which fast-food restaurant established in NC in 1960?
Trace exact match: False
Question #208: Love's Travel Stops and Country Stores is a chain of truck stops that include which fast-food restaurant established in N



Trace exact match: False
Question #254: In what hotel venue did Irina Antonenko compete for Miss Universe in 2010?
Trace exact match: True
Question #255: Is Platycerium a genus of ferns and Faucaria a word that means animal mouth?
Trace exact match: True
Question #255: Is Platycerium a genus of ferns and Faucaria a word that means animal mouth?
Trace exact match: False
Question #256: As of 2016, which band released more albums, The Last Shadow Puppets or The Classic Crime?
Trace exact match: False
Question #256: As of 2016, which band released more albums, The Last Shadow Puppets or The Classic Crime?
Trace exact match: False
Question #257: Were E. M. Forster and Ludwig Renn both from England?
Trace exact match: False
Question #257: Were E. M. Forster and Ludwig Renn both from England?
Trace exact match: False
Question #258: Junko Takeuchi is the Japanese voice of the title character in the British-American television series created by whom?
Trace exact match: False
Question #258: Junk



Trace exact match: False
Question #308: In 2009, what Bowl did the University that calls their home football field Memorial Stadium play in?
Trace exact match: False
Question #309: Lucila Salao, a Filipino sprinter, competed in the women's 4 x 100 meters relay at an international multi-sport event that was held in what city?
Trace exact match: False
Question #309: Lucila Salao, a Filipino sprinter, competed in the women's 4 x 100 meters relay at an international multi-sport event that was held in what city?
Trace exact match: True
Question #310: Which non profit practice and medical research group based in Rochester, Minnesota was founded by William Worrall Mayo?
Trace exact match: True
Question #310: Which non profit practice and medical research group based in Rochester, Minnesota was founded by William Worrall Mayo?
Trace exact match: True
Question #311: In what Rock film did Sylvestor Stallone star with a family member?
Trace exact match: True
Question #311: In what Rock film did S



Trace exact match: False
Question #754: Who was a professor of theology at the University of Leiden as well as a student of Theodore Beza?
Trace exact match: False
Question #755: What is the population of the city that has the basilica that The Basilica of St. Josaphat was modeled after?
Trace exact match: False
Question #755: What is the population of the city that has the basilica that The Basilica of St. Josaphat was modeled after?
Trace exact match: True
Question #756: The Uncluded was a folk rap group formed by which half of The Moldy Peaches?
Trace exact match: True
Question #756: The Uncluded was a folk rap group formed by which half of The Moldy Peaches?
Trace exact match: True
Question #757: Are Ulf Merbold and Michael Foale both astronauts?
Trace exact match: True
Question #757: Are Ulf Merbold and Michael Foale both astronauts?
Trace exact match: False
Question #758: Who is the star of "Swimming Pool" who also has appeared in "Scooby-Doo" and "Wedding Crashers"?
Trace exact 



Trace exact match: False
Question #892:  An actor who is best known for his role as David Ruffin in The Temptations, also appeared in a 1993 American comedy sports film that was directed by who?
Trace exact match: True
Question #893: Robert Langdon's films based on Dan Brown's novels were released in a different chronological order than the novels, though the last movies released was also the last book released. Which book was it?
Trace exact match: True
Question #893: Robert Langdon's films based on Dan Brown's novels were released in a different chronological order than the novels, though the last movies released was also the last book released. Which book was it?
Trace exact match: True
Question #894: Mega Python vs. Gatoroid starred what singer who was born in October of 1971?
Trace exact match: True
Question #894: Mega Python vs. Gatoroid starred what singer who was born in October of 1971?
Trace exact match: True
Question #895: Which French car manufacturer supplied the technolog



Error processing question 982: normalize() argument 2 must be str, not None
Question #983: Which Air Force Base was the last assignment for the 820th Strategic Aerospace Division and is located south of the Canada-United States border?
Trace exact match: True
Question #984: What did the husband of Duchess Victoria do for a living? 
Trace exact match: True
Question #984: What did the husband of Duchess Victoria do for a living? 
Trace exact match: False
Question #985: Peter Hugh Pocklington is an advocate of an economic system that is free from intervention by which party ?
Trace exact match: False
Question #985: Peter Hugh Pocklington is an advocate of an economic system that is free from intervention by which party ?
Trace exact match: True
Question #986: As of 2010, what was the population where the Village of Westbury is located?
Trace exact match: True
Question #986: As of 2010, what was the population where the Village of Westbury is located?
Trace exact match: False
Question #987



Error processing question 1453: normalize() argument 2 must be str, not None
Question #1454: Oliver Muirhead directed "A Christmas Held Captive", which is a type of musical comedy stage production that includes songs, gags, slapstick comedy, and dancing, and was developed where?
Trace exact match: True
Question #1455: When was the former American football coach and a former college baseball player who made Air raid offense popular born? 
Trace exact match: False
Question #1456: Where did Grant Gregory play his home games in 2009?
Trace exact match: False
Question #1457: What other actress starred with American actress and scream queen Danielle Harris in the movie Wish Upon a Star?
Trace exact match: True
Question #1458: Which businessman, born in 1853, made his fortune in Kimberley, South Africa?
Trace exact match: False
Question #1459: Prostitution Information Center is located in the red-light district that contains approximately how many cabins?
Trace exact match: False
Question #14

In [7]:
# training_dataset = Dataset.from_list(dataset_entries)

# training_dataset.push_to_hub(
#     "rshn-krn/hotpotqa-agent-training-data",
#     private=False,
#     commit_message="GemHotPotQA agent training trace conversations",
# )

In [17]:
# Turn the collected per-step records into a HuggingFace Dataset.
training_dataset = Dataset.from_list(dataset_entries)

# Upload the dataset to the Hub under the given repository id.
# private=False requests a publicly visible repository.
training_dataset.push_to_hub(
    "rshn-krn/hotpotqa-agent-training-data-2",
    private=False,
    commit_message="Gemini HotPotQA agent training trace conversations",
)

Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 68.38ba/s]
Processing Files (0 / 0)                : |          |  0.00B /  0.00B            
[A
Processing Files (1 / 1)                : 100%|██████████| 4.48MB / 4.48MB,   ???B/s  
Processing Files (1 / 1)                : 100%|██████████| 4.48MB / 4.48MB,  0.00B/s  
New Data Upload                         : |          |  0.00B /  0.00B,  0.00B/s  
                                        : 100%|██████████| 4.48MB / 4.48MB            
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.45 shards/s]


CommitInfo(commit_url='https://huggingface.co/datasets/rshn-krn/hotpotqa-agent-training-data-2/commit/c279eff63fee4a4add4794fb44c97b78d2234b51', commit_message='Gemini HotPotQA agent training trace conversations', commit_description='', oid='c279eff63fee4a4add4794fb44c97b78d2234b51', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rshn-krn/hotpotqa-agent-training-data-2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rshn-krn/hotpotqa-agent-training-data-2'), pr_revision=None, pr_num=None)

In [9]:
# Display the dataset summary (column names and row count).
training_dataset

Dataset({
    features: ['conversations', 'question', 'answer', 'step', 'trace_exact_match'],
    num_rows: 4794
})

In [10]:
# Inspect the first record to check the stored conversation format.
training_dataset[0]

{'conversations': [{'content': 'Your input fields are:\n1. `question` (str): \n2. `trajectory` (str):\nYour output fields are:\n1. `next_thought` (str): \n2. `next_tool_name` (Literal[\'evaluate_math\', \'search_wikipedia\', \'search_web\', \'finish\']): \n3. `next_tool_args` (dict[str, Any]):\nAll interactions will be structured in the following way, with the appropriate values filled in.\n\nInputs will have the following structure:\n\n[[ ## question ## ]]\n{question}\n\n[[ ## trajectory ## ]]\n{trajectory}\n\nOutputs will be a JSON object with the following fields.\n\n{\n  "next_thought": "{next_thought}",\n  "next_tool_name": "{next_tool_name}        # note: the value you produce must exactly match (no extra characters) one of: evaluate_math; search_wikipedia; search_web; finish",\n  "next_tool_args": "{next_tool_args}        # note: the value you produce must adhere to the JSON schema: {\\"type\\": \\"object\\", \\"additionalProperties\\": true}"\n}\nIn adhering to this structure, 

In [None]:
# Keep only the rows whose whole trace ended in an exact-match answer.
# The column already holds booleans, so it is used directly as the predicate
# instead of being compared with `== True`.
training_dataset.filter(lambda example: example["trace_exact_match"])

Filter: 100%|██████████| 4794/4794 [00:00<00:00, 10154.69 examples/s]


Dataset({
    features: ['conversations', 'question', 'answer', 'step', 'trace_exact_match'],
    num_rows: 1704
})

In [14]:
# Inspect the message list stored for the second record.
training_dataset[1]["conversations"]

[{'content': 'Your input fields are:\n1. `question` (str): \n2. `trajectory` (str):\nYour output fields are:\n1. `next_thought` (str): \n2. `next_tool_name` (Literal[\'evaluate_math\', \'search_wikipedia\', \'search_web\', \'finish\']): \n3. `next_tool_args` (dict[str, Any]):\nAll interactions will be structured in the following way, with the appropriate values filled in.\n\n[[ ## question ## ]]\n{question}\n\n[[ ## trajectory ## ]]\n{trajectory}\n\n[[ ## next_thought ## ]]\n{next_thought}\n\n[[ ## next_tool_name ## ]]\n{next_tool_name}        # note: the value you produce must exactly match (no extra characters) one of: evaluate_math; search_wikipedia; search_web; finish\n\n[[ ## next_tool_args ## ]]\n{next_tool_args}        # note: the value you produce must adhere to the JSON schema: {"type": "object", "additionalProperties": true}\n\n[[ ## completed ## ]]\nIn adhering to this structure, your objective is: \n        Given the fields `question`, produce the fields `answer`.\n        