# OpenAI Fine-tuning Demo with HotPotQA

Finetuning demo with DSPy.

## Notebook Preparation

Magic commands and secrets.

In [1]:
%load_ext autoreload
%autoreload 2

import os

assert "DSP_CACHEDIR" in os.environ
assert "OPENAI_API_KEY" in os.environ

# Altenatively, you can set the environment variables in code
# os.environ["DSP_CACHEDIR"] = <YOUR_CACHE_DIR>
# os.environ["OPENAI_API_KEY"] = <OPENAI_API_KEY>

## Task Setup

In [2]:
import dspy
import dspy.evaluate
from dspy.datasets import HotPotQA
from dspy.evaluate import Evaluate
from dsp.utils.utils import deduplicate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
mini = "gpt-4o-mini-2024-07-18"
base_temp = 0.9

lm = dspy.TrainableOpenAI(model=mini, max_tokens=500, temperature=base_temp)

colbert_v2_endpoint = "http://20.102.90.50:2017/wiki17_abstracts"
colbertv2 = dspy.ColBERTv2(url=colbert_v2_endpoint)

dspy.settings.configure(rm=colbertv2, lm=lm) 

In [4]:
metric = dspy.evaluate.answer_exact_match

In [5]:
# Load and configure the datasets.
TRAIN_SIZE = 500
EVAL_SIZE = 500

hotpot_dataset = HotPotQA(train_seed=1, eval_seed=2023, test_size=0, keep_details="type")
trainset = [x.with_inputs('question') for x in hotpot_dataset.train][:EVAL_SIZE]
devset = [x.with_inputs('question') for x in hotpot_dataset.dev][:EVAL_SIZE]

# Set up metrics
NUM_THREADS = 12

In [6]:
kwargs = dict(num_threads=NUM_THREADS, display_progress=True)
evaluate = Evaluate(devset=devset, metric=metric, **kwargs)

In [7]:
class BasicMH(dspy.Module):
    def __init__(self, passages_per_hop=3):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_query = [dspy.ChainOfThought("context, question -> search_query") for _ in range(2)]
        self.generate_answer = dspy.ChainOfThought("context, question -> answer")

    def forward(self, question, return_trace=False):
        context = []
        for hop in range(2):
            search_query = self.generate_query[hop](context=context, question=question).search_query
            passages = self.retrieve(search_query).passages
            context = deduplicate(context + passages)

        x = self.generate_answer(context=context, question=question).copy(context=context)
        
        if return_trace:
            return x, dspy.settings.trace
        return x

In [8]:
from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch

program_params = {
    "passages_per_hop": 3,
}

COMPILE = False

if COMPILE:
    max_bootstrapped_demos, max_labeled_demos, num_candidate_programs = 3,3,6
    config = dict(max_bootstrapped_demos=max_bootstrapped_demos, num_candidate_programs=num_candidate_programs, num_threads=NUM_THREADS)
    teleprompter = BootstrapFewShotWithRandomSearch(metric=metric, **config)
    basicmh_bs = teleprompter.compile(BasicMH(**program_params), trainset=trainset[:100], valset=devset[:150])
    basicmh_bs.save(f"basicmh_{max_bootstrapped_demos}_{max_labeled_demos}_{num_candidate_programs}.json")

    baseline_eval = evaluate(BasicMH(**program_params), devset=devset[:300])
    bs_eval = evaluate(basicmh_bs, devset=devset[:300])
else:
    basicmh_bs = BasicMH(**program_params)
    basicmh_bs.load("basicmh_3_3_6.json")

In [9]:
from dspy.teleprompt.finetune_teleprompter import bootstrap_data_for_round, convert_to_prompt_completion_data
import ujson

samples = 200

dspy.settings.configure(experimental=True)
dc_kwargs = {
    "sampling_temperature": base_temp,
    "sampling_temperature_delta":0.0001,
    "num_threads": NUM_THREADS,
}

dataset_filenames = {"trainset_data.jsonl": trainset[:samples], "devset_data.jsonl": devset[:int(samples/4)]}

def write_data(data, filename):
    # get the bootstrapped data for num_rounds=1, but using the callback
    data = bootstrap_data_for_round(basicmh_bs, data, metric, sampling_round=1, **dc_kwargs)

    # Post process the data to remove any entries with no score
    filtered_data = [d for d in data if d["score"]]

    # Convert the data to prompt completion format
    dataset = convert_to_prompt_completion_data(filtered_data, program=basicmh_bs, exclude_demos=True)[:10]
    
    # Format the data for finetuning using the LM
    print("Writing dataset with length", len(dataset), "to", filename)
    with open(filename, "w") as f:
        ujson.dump(dataset, f)

for key, data in dataset_filenames.items():
    write_data(data, key)

Average Metric: 113 / 200  (56.5): 100%|██████████| 200/200 [00:00<00:00, 296.95it/s]


Writing dataset with length 10 to trainset_data.jsonl


Average Metric: 30 / 50  (60.0): 100%|██████████| 50/50 [00:00<00:00, 308.83it/s]

Writing dataset with length 10 to devset_data.jsonl





In [10]:
from dsp.modules.lm import TrainingMethod

future_lm = lm.get_finetune(method=TrainingMethod.SFT, train_path="trainset_data.jsonl", eval_path="devset_data.jsonl", hyperparameters={"n_epochs": 1})
finetuned_lm = future_lm.result()
finetuned_lm.kwargs["temperature"] = 0.0

No errors found

10 examples are missing a system message
Dataset has ~4375 tokens that will be charged for during training
By default, you'll train for 10 epochs on this dataset
By default, you'll be charged for ~43750 tokens
No errors found
Uploaded train data to OpenAI
Uploaded val data to OpenAI
Waiting for training to complete


In [11]:
assert finetuned_lm.kwargs["model"] != lm.kwargs["model"]

In [12]:
basicmh_bs_ft = BasicMH(**program_params)
basicmh_bs_ft._set_all_predictor_lms(finetuned_lm)

In [13]:
RECOMPILE_FT_MODEL = True

if RECOMPILE_FT_MODEL:
    max_bootstrapped_demos, max_labeled_demos, num_candidate_programs = 3,3,6
    config = dict(max_bootstrapped_demos=max_bootstrapped_demos, num_candidate_programs=num_candidate_programs, num_threads=NUM_THREADS)
    bsfsrs_teleprompter = BootstrapFewShotWithRandomSearch(metric=metric, **config)
    basicmh_bs_ft_bs = bsfsrs_teleprompter.compile(student=basicmh_bs_ft, trainset=trainset[:100], valset=devset[:250])
    basicmh_bs_ft_bs.save('mini_bs_ft_bs_hpqa_100.json')
    basicmh_bs_ft_bs._set_all_predictor_lms(finetuned_lm)
else:
    basicmh_bs_ft_bs = BasicMH(**program_params)
    basicmh_bs_ft_bs.load('mini_bs_ft_bs_hpqa_100.json')
    basicmh_bs_ft_bs._set_all_predictor_lms(finetuned_lm)

Going to sample between 1 and 3 traces per predictor.
Will attempt to bootstrap 6 candidate sets.


Average Metric: 103 / 250  (41.2): 100%|██████████| 250/250 [01:44<00:00,  2.38it/s]


Score: 41.2 for set: [0, 0, 0]
New best score: 41.2 for seed -3
Scores so far: [41.2]
Best score: 41.2


Average Metric: 135 / 250  (54.0): 100%|██████████| 250/250 [00:32<00:00,  7.76it/s]


Score: 54.0 for set: [16, 16, 16]
New best score: 54.0 for seed -2
Scores so far: [41.2, 54.0]
Best score: 54.0


  5%|▌         | 5/100 [00:22<07:06,  4.49s/it]


Bootstrapped 3 full traces after 6 examples in round 0.


Average Metric: 144 / 250  (57.6): 100%|██████████| 250/250 [01:37<00:00,  2.57it/s]


Score: 57.6 for set: [16, 16, 16]
New best score: 57.6 for seed -1
Scores so far: [41.2, 54.0, 57.6]
Best score: 57.6
Average of max per entry across top 1 scores: 0.576
Average of max per entry across top 2 scores: 0.628
Average of max per entry across top 3 scores: 0.64
Average of max per entry across top 5 scores: 0.64
Average of max per entry across top 8 scores: 0.64
Average of max per entry across top 9999 scores: 0.64


  3%|▎         | 3/100 [00:14<08:02,  4.98s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 140 / 250  (56.0): 100%|██████████| 250/250 [01:50<00:00,  2.26it/s]


Score: 56.0 for set: [16, 16, 16]
Scores so far: [41.2, 54.0, 57.6, 56.0]
Best score: 57.6
Average of max per entry across top 1 scores: 0.576
Average of max per entry across top 2 scores: 0.64
Average of max per entry across top 3 scores: 0.664
Average of max per entry across top 5 scores: 0.672
Average of max per entry across top 8 scores: 0.672
Average of max per entry across top 9999 scores: 0.672


  3%|▎         | 3/100 [00:12<06:36,  4.08s/it]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 137 / 250  (54.8): 100%|██████████| 250/250 [01:39<00:00,  2.53it/s]


Score: 54.8 for set: [16, 16, 16]
Scores so far: [41.2, 54.0, 57.6, 56.0, 54.8]
Best score: 57.6
Average of max per entry across top 1 scores: 0.576
Average of max per entry across top 2 scores: 0.64
Average of max per entry across top 3 scores: 0.668
Average of max per entry across top 5 scores: 0.684
Average of max per entry across top 8 scores: 0.684
Average of max per entry across top 9999 scores: 0.684


  2%|▏         | 2/100 [00:05<04:15,  2.60s/it]


Bootstrapped 1 full traces after 3 examples in round 0.




KeyboardInterrupt: 

In [None]:
TEST_SIZE = 300
baseline_eval = evaluate(BasicMH(**program_params), devset=devset[:TEST_SIZE])
bs_eval = evaluate(basicmh_bs, devset=devset[:TEST_SIZE])
bs_ft_eval = evaluate(basicmh_bs_ft, devset=devset[:TEST_SIZE])
bs_ft_bs_eval = evaluate(basicmh_bs_ft_bs, devset=devset[:TEST_SIZE])

print(f"Results for HotPotQA finetuning gpt-4o-mini with rejection sampling N={samples} and up to 1 attempts for each example with one model for all predictors. Tested on first {TEST_SIZE} of devset.")
print(f"Non-finetuned model: {baseline_eval}")
print(f"Non-finetuned bootstrapped model: {bs_eval}")
print(f"Finetuned model: {bs_ft_eval}")
print(f"Finetuned model with bootstrapping: {bs_ft_bs_eval}")