In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

import zipfile
import os

import dotenv

dotenv.load_dotenv()
import dspy

from dspy.datasets import HotPotQA
import re
from dspy.evaluate import Evaluate

from dsp.utils import EM
from dsp.utils.utils import deduplicate
import collections

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import openai

openai.api_key = os.environ.get('OPENAI_API_KEY')

mini = "gpt-4o-mini-2024-07-18"
base_temp = 0.9

lm = dspy.OpenAIModel(model=mini, max_tokens=500, temperature=base_temp)

colbert_v2_endpoint = "http://20.102.90.50:2017/wiki17_abstracts"
colbertv2 = dspy.ColBERTv2(url=colbert_v2_endpoint)

dspy.settings.configure(rm=colbertv2, lm=lm) 

In [4]:
import dspy.evaluate

metric = dspy.evaluate.answer_exact_match

In [5]:
# Load and configure the datasets.
TRAIN_SIZE = 500
EVAL_SIZE = 500

hotpot_dataset = HotPotQA(train_seed=1, eval_seed=2023, test_size=0, keep_details="type")
trainset = [x.with_inputs('question') for x in hotpot_dataset.train][:EVAL_SIZE]
devset = [x.with_inputs('question') for x in hotpot_dataset.dev][:EVAL_SIZE]

# Set up metrics
NUM_THREADS = 12

In [6]:
kwargs = dict(num_threads=NUM_THREADS, display_progress=True)
evaluate = Evaluate(devset=devset, metric=metric, **kwargs)

In [7]:
class BasicMH(dspy.Module):
    def __init__(self, passages_per_hop=3):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_query = [dspy.ChainOfThought("context, question -> search_query") for _ in range(2)]
        self.generate_answer = dspy.ChainOfThought("context, question -> answer")

    def forward(self, question, return_trace=False):
        context = []
        for hop in range(2):
            search_query = self.generate_query[hop](context=context, question=question).search_query
            passages = self.retrieve(search_query).passages
            context = deduplicate(context + passages)

        x = self.generate_answer(context=context, question=question).copy(context=context)
        
        if return_trace:
            return x, dspy.settings.trace
        return x

In [8]:
from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch

program_params = {
    "passages_per_hop": 3,
}

COMPILE = False

if COMPILE:
    max_bootstrapped_demos, max_labeled_demos, num_candidate_programs = 3,3,6
    config = dict(max_bootstrapped_demos=max_bootstrapped_demos, num_candidate_programs=num_candidate_programs, num_threads=NUM_THREADS)
    teleprompter = BootstrapFewShotWithRandomSearch(metric=metric, **config)
    basicmh_bs = teleprompter.compile(BasicMH(**program_params), trainset=trainset[:100], valset=devset[:150])
    basicmh_bs.save(f"basicmh_{max_bootstrapped_demos}_{max_labeled_demos}_{num_candidate_programs}.json")

    baseline_eval = evaluate(BasicMH(**program_params), devset=devset[:300])
    bs_eval = evaluate(basicmh_bs, devset=devset[:300])
else:
    basicmh_bs = BasicMH(**program_params)
    basicmh_bs.load("basicmh_3_3_6.json")

In [9]:
from collections import Counter
from dspy.teleprompt.finetune_teleprompter import bootstrap_data_multiple_rounds, DataCollectionCallback, build_prompt_completion_data_from_trace
from typing import Callable
import ujson

samples = 500

callback = DataCollectionCallback(num_correct=1, max_attempts=2)

dspy.settings.configure(experimental=True)
dc_kwargs = {
    # "exclude_demos":True, 
    "sampling_temperature_base": base_temp,
    "sampling_temperature_delta":0.0001,
    "next_round_dataset_callback": callback.move_on_callback_correct_with_max,
    "num_threads": NUM_THREADS,
}

dataset_filenames = {"trainset_data.jsonl": trainset[:samples], "devset_data.jsonl": devset[:int(samples/4)]}

def write_data(data, filename):
    # get the bootstrapped data for num_rounds=1, but using the callback
    dataset = bootstrap_data_multiple_rounds(basicmh_bs, data, metric, num_rounds=1, **dc_kwargs)
    # Flatmap the lists of prompt completions gives by each trace
    dataset = sum([build_prompt_completion_data_from_trace(result["trace"], program=basicmh_bs, exclude_demos=True) for result in dataset if result["score"]], [])
    # Format the data for finetuning using the LM
    dataset = lm.format_data_for_vanilla_finetuning(dataset)
    print("Writing dataset with length", len(dataset), "to", filename)
    with open(filename, "w") as f:
        for line in dataset:
            f.write(ujson.dumps(line) + "\n")

for key, data in dataset_filenames.items():
    write_data(data, key)

Average Metric: 259 / 500  (51.8): 100%|██████████| 500/500 [00:01<00:00, 321.91it/s]


Writing dataset with length 777 to trainset_data.jsonl


Average Metric: 76 / 125  (60.8): 100%|██████████| 125/125 [00:00<00:00, 350.95it/s]


Writing dataset with length 228 to devset_data.jsonl


In [10]:
import concurrent.futures
# lm.()
# lm.get_finetune("trainset_data.jsonl", "devset_data.jsonl", method="SFT")
future_lm = dspy.OpenAIModel.load_from_job_id("ftjob-tmccSJv170p4gNFJUYwXdCSq")
finetuned_lm = future_lm
finetuned_lm.kwargs["temperature"] = 0.0

In [11]:
assert finetuned_lm.kwargs["model"] != lm.kwargs["model"]

In [12]:
basicmh_bs_ft = BasicMH(**program_params)
basicmh_bs_ft._set_all_predictor_lms(finetuned_lm)

In [15]:
RECOMPILE_FT_MODEL = True

if RECOMPILE_FT_MODEL:
    max_bootstrapped_demos, max_labeled_demos, num_candidate_programs = 3,3,6
    config = dict(max_bootstrapped_demos=max_bootstrapped_demos, num_candidate_programs=num_candidate_programs, num_threads=NUM_THREADS)
    bsfsrs_teleprompter = BootstrapFewShotWithRandomSearch(metric=metric, **config)
    basicmh_bs_ft_bs = bsfsrs_teleprompter.compile(student=basicmh_bs_ft, trainset=trainset[:100], valset=devset[:250])
    basicmh_bs_ft_bs.save('mini_bs_ft_bs_hpqa_100.json')
    basicmh_bs_ft_bs._set_all_predictor_lms(finetuned_lm)
else:
    basicmh_bs_ft_bs = BasicMH(**program_params)
    basicmh_bs_ft_bs.load('mini_bs_ft_bs_hpqa_100.json')
    basicmh_bs_ft_bs._set_all_predictor_lms(finetuned_lm)

Going to sample between 1 and 3 traces per predictor.
Will attempt to bootstrap 6 candidate sets.


Average Metric: 106 / 250  (42.4): 100%|██████████| 250/250 [00:03<00:00, 71.56it/s]  


Score: 42.4 for set: [0, 0, 0]
New best score: 42.4 for seed -3
Scores so far: [42.4]
Best score: 42.4


Average Metric: 134 / 250  (53.6): 100%|██████████| 250/250 [00:29<00:00,  8.54it/s]


Score: 53.6 for set: [16, 16, 16]
New best score: 53.6 for seed -2
Scores so far: [42.4, 53.6]
Best score: 53.6


  6%|▌         | 6/100 [00:26<07:02,  4.50s/it]


Bootstrapped 3 full traces after 7 examples in round 0.


Average Metric: 89 / 162  (54.9):  64%|██████▍   | 161/250 [01:00<00:34,  2.62it/s]

Average Metric: 4 / 8  (50.0):   3%|▎         | 8/250 [10:14<5:10:00, 76.86s/it]

Average Metric: 90 / 163  (55.2):  65%|██████▌   | 163/250 [01:00<00:26,  3.30it/s]




Average Metric: 142 / 250  (56.8): 100%|██████████| 250/250 [01:31<00:00,  2.74it/s]


Score: 56.8 for set: [16, 16, 16]
New best score: 56.8 for seed -1
Scores so far: [42.4, 53.6, 56.8]
Best score: 56.8
Average of max per entry across top 1 scores: 0.568
Average of max per entry across top 2 scores: 0.64
Average of max per entry across top 3 scores: 0.656
Average of max per entry across top 5 scores: 0.656
Average of max per entry across top 8 scores: 0.656
Average of max per entry across top 9999 scores: 0.656


  3%|▎         | 3/100 [00:15<08:17,  5.13s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 133 / 250  (53.2): 100%|██████████| 250/250 [01:49<00:00,  2.28it/s]


Score: 53.2 for set: [16, 16, 16]
Scores so far: [42.4, 53.6, 56.8, 53.2]
Best score: 56.8
Average of max per entry across top 1 scores: 0.568
Average of max per entry across top 2 scores: 0.64
Average of max per entry across top 3 scores: 0.664
Average of max per entry across top 5 scores: 0.676
Average of max per entry across top 8 scores: 0.676
Average of max per entry across top 9999 scores: 0.676


  1%|          | 1/100 [00:04<07:14,  4.39s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 136 / 250  (54.4): 100%|██████████| 250/250 [01:49<00:00,  2.29it/s]


Score: 54.4 for set: [16, 16, 16]
Scores so far: [42.4, 53.6, 56.8, 53.2, 54.4]
Best score: 56.8
Average of max per entry across top 1 scores: 0.568
Average of max per entry across top 2 scores: 0.624
Average of max per entry across top 3 scores: 0.664
Average of max per entry across top 5 scores: 0.684
Average of max per entry across top 8 scores: 0.684
Average of max per entry across top 9999 scores: 0.684


  2%|▏         | 2/100 [00:06<05:32,  3.39s/it]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 135 / 250  (54.0): 100%|██████████| 250/250 [01:46<00:00,  2.34it/s]


Score: 54.0 for set: [16, 16, 16]
Scores so far: [42.4, 53.6, 56.8, 53.2, 54.4, 54.0]
Best score: 56.8
Average of max per entry across top 1 scores: 0.568
Average of max per entry across top 2 scores: 0.624
Average of max per entry across top 3 scores: 0.648
Average of max per entry across top 5 scores: 0.68
Average of max per entry across top 8 scores: 0.688
Average of max per entry across top 9999 scores: 0.688


  2%|▏         | 2/100 [00:10<08:25,  5.16s/it]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 125 / 250  (50.0): 100%|██████████| 250/250 [13:00<00:00,  3.12s/it]


Score: 50.0 for set: [16, 16, 16]
Scores so far: [42.4, 53.6, 56.8, 53.2, 54.4, 54.0, 50.0]
Best score: 56.8
Average of max per entry across top 1 scores: 0.568
Average of max per entry across top 2 scores: 0.624
Average of max per entry across top 3 scores: 0.648
Average of max per entry across top 5 scores: 0.68
Average of max per entry across top 8 scores: 0.688
Average of max per entry across top 9999 scores: 0.688


  1%|          | 1/100 [17:22<28:40:26, 1042.69s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 133 / 250  (53.2): 100%|██████████| 250/250 [01:35<00:00,  2.61it/s]


Score: 53.2 for set: [16, 16, 16]
Scores so far: [42.4, 53.6, 56.8, 53.2, 54.4, 54.0, 50.0, 53.2]
Best score: 56.8
Average of max per entry across top 1 scores: 0.568
Average of max per entry across top 2 scores: 0.624
Average of max per entry across top 3 scores: 0.648
Average of max per entry across top 5 scores: 0.68
Average of max per entry across top 8 scores: 0.688
Average of max per entry across top 9999 scores: 0.688


  8%|▊         | 8/100 [00:44<08:35,  5.60s/it]


Bootstrapped 3 full traces after 9 examples in round 0.


Average Metric: 143 / 250  (57.2): 100%|██████████| 250/250 [01:47<00:00,  2.33it/s]

Score: 57.2 for set: [16, 16, 16]
New best score: 57.2 for seed 5
Scores so far: [42.4, 53.6, 56.8, 53.2, 54.4, 54.0, 50.0, 53.2, 57.2]
Best score: 57.2
Average of max per entry across top 1 scores: 0.572
Average of max per entry across top 2 scores: 0.64
Average of max per entry across top 3 scores: 0.66
Average of max per entry across top 5 scores: 0.688
Average of max per entry across top 8 scores: 0.692
Average of max per entry across top 9999 scores: 0.692
9 candidate programs found.
[('retrieve', <dspy.retrieve.retrieve.Retrieve object at 0x12e43f670>), ('generate_query[0]', Predict(StringSignature(context, question -> rationale, search_query
    instructions='Given the fields `context`, `question`, produce the fields `search_query`.'
    context = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Context:', 'desc': '${context}'})
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix




In [16]:
TEST_SIZE = 300
baseline_eval = evaluate(BasicMH(**program_params), devset=devset[:TEST_SIZE])
bs_eval = evaluate(basicmh_bs, devset=devset[:TEST_SIZE])
bs_ft_eval = evaluate(basicmh_bs_ft, devset=devset[:TEST_SIZE])
bs_ft_bs_eval = evaluate(basicmh_bs_ft_bs, devset=devset[:TEST_SIZE])

print(f"Results for HotPotQA finetuning gpt-4o-mini with rejection sampling N={samples} and up to 1 attempts for each example with one model for all predictors. Tested on first {TEST_SIZE} of devset.")
print(f"Non-finetuned model: {baseline_eval}")
print(f"Non-finetuned bootstrapped model: {bs_eval}")
print(f"Finetuned model: {bs_ft_eval}")
print(f"Finetuned model with bootstrapping: {bs_ft_bs_eval}")

[autoreload of dsp.modules.lm failed: Traceback (most recent call last):
  File "/Users/isaac.miller/projects/dspy-finetuning/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/Users/isaac.miller/projects/dspy-finetuning/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 613, in _exec
  File "<frozen importlib._bootstrap_external>", line 850, in exec_module
  File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
  File "/Users/isaac.miller/projects/dspy-finetuning/dilara-dspy/dsp/modules/lm.py", line 152, in <module>
    class TrainableLM(LM, ABC):
  File "/Users/isaac.miller/

AttributeError: 'float' object has no attribute '_get_lm_info_str'