In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

import zipfile
import os

import dotenv

dotenv.load_dotenv()
import dspy

from dspy.datasets import HotPotQA
import re
from dspy.evaluate import Evaluate

from dsp.utils import EM
from dsp.utils.utils import deduplicate
import collections

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import openai

# The API key is read from the environment; `.get` yields None when the
# variable is unset rather than raising here.
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Base model identifier and the sampling temperature reused by later cells.
mini = "gpt-4o-mini-2024-07-18"
base_temp = 0.9

lm = dspy.OpenAIModel(
    model=mini,
    max_tokens=500,
    temperature=base_temp,
)

# ColBERTv2 retrieval endpoint (path suggests Wikipedia 2017 abstracts).
colbert_v2_endpoint = "http://20.102.90.50:2017/wiki17_abstracts"
colbertv2 = dspy.ColBERTv2(url=colbert_v2_endpoint)

# Register the language model and retriever as the DSPy defaults.
dspy.settings.configure(
    rm=colbertv2,
    lm=lm,
)

In [4]:
import dspy.evaluate

# Scoring function shared by the evaluator, the teleprompters and the
# data-bootstrapping step in the cells below.
metric = dspy.evaluate.answer_exact_match

In [5]:
# Load and configure the datasets.
TRAIN_SIZE = 500
EVAL_SIZE = 500

hotpot_dataset = HotPotQA(train_seed=1, eval_seed=2023, test_size=0, keep_details="type")
# Mark `question` as the only input field, then truncate each split to its own
# size constant (the train split previously reused EVAL_SIZE by mistake).
trainset = [x.with_inputs('question') for x in hotpot_dataset.train][:TRAIN_SIZE]
devset = [x.with_inputs('question') for x in hotpot_dataset.dev][:EVAL_SIZE]

# Thread count passed to the evaluator, the teleprompters and the
# data-bootstrapping step in later cells.
NUM_THREADS = 12

In [6]:
# Options forwarded to the evaluator: thread count and a progress bar.
kwargs = {
    "num_threads": NUM_THREADS,
    "display_progress": True,
}
# Reusable evaluator; later cells override `devset` per call.
evaluate = Evaluate(devset=devset, metric=metric, **kwargs)

In [7]:
class BasicMH(dspy.Module):
    """Multi-hop question answering program.

    For each hop, a chain-of-thought predictor writes a search query from the
    question and the context gathered so far, the retriever fetches passages
    for that query, and the passages are merged into the context through
    `deduplicate`. A final chain-of-thought predictor answers the question
    from the accumulated context.

    Args:
        passages_per_hop: Number of passages requested from the retriever on
            every hop (passed as `k` to `dspy.Retrieve`).
        num_hops: Number of query/retrieve rounds. Defaults to 2, the value
            that was previously hard-coded, so existing callers and saved
            two-hop programs keep the same structure.
    """

    def __init__(self, passages_per_hop=3, num_hops=2):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        # One independent query generator per hop.
        self.generate_query = [
            dspy.ChainOfThought("context, question -> search_query")
            for _ in range(num_hops)
        ]
        self.generate_answer = dspy.ChainOfThought("context, question -> answer")

    def forward(self, question, return_trace=False):
        """Answer `question` after running every retrieval hop.

        Args:
            question: The question to answer.
            return_trace: When true, also return `dspy.settings.trace`.

        Returns:
            The answer prediction copied with the final `context` attached,
            or a `(prediction, trace)` tuple when `return_trace` is true.
        """
        context = []
        # Iterating the generators keeps the hop count defined in one place.
        for generate_query in self.generate_query:
            search_query = generate_query(context=context, question=question).search_query
            passages = self.retrieve(search_query).passages
            context = deduplicate(context + passages)

        prediction = self.generate_answer(context=context, question=question).copy(context=context)

        if return_trace:
            return prediction, dspy.settings.trace
        return prediction

In [8]:
from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch

program_params = {
    "passages_per_hop": 3,
}

# Set to True to re-run the optimizer; otherwise the saved program is loaded.
COMPILE = False

# Optimizer settings. They are defined outside the branch so that the save
# path and the load path are always derived from the same values instead of
# one being a separately maintained string literal.
# NOTE(review): max_labeled_demos only appears in the file name; it is not
# passed to BootstrapFewShotWithRandomSearch below — confirm that is intended.
max_bootstrapped_demos, max_labeled_demos, num_candidate_programs = 3, 3, 6
basicmh_bs_path = f"basicmh_{max_bootstrapped_demos}_{max_labeled_demos}_{num_candidate_programs}.json"

if COMPILE:
    config = dict(max_bootstrapped_demos=max_bootstrapped_demos, num_candidate_programs=num_candidate_programs, num_threads=NUM_THREADS)
    teleprompter = BootstrapFewShotWithRandomSearch(metric=metric, **config)
    basicmh_bs = teleprompter.compile(BasicMH(**program_params), trainset=trainset[:100], valset=devset[:150])
    basicmh_bs.save(basicmh_bs_path)

    # Compare the uncompiled program against the freshly compiled one.
    baseline_eval = evaluate(BasicMH(**program_params), devset=devset[:300])
    bs_eval = evaluate(basicmh_bs, devset=devset[:300])
else:
    basicmh_bs = BasicMH(**program_params)
    basicmh_bs.load(basicmh_bs_path)

In [9]:
from collections import Counter
from dspy.teleprompt.finetune_teleprompter import bootstrap_data_multiple_rounds, DataCollectionCallback, build_prompt_completion_data_from_trace
from typing import Callable
import ujson

# Number of training examples to bootstrap from; the dev file uses a quarter.
samples = 500

callback = DataCollectionCallback(num_correct=1, max_attempts=2)

dspy.settings.configure(experimental=True)

# Keyword arguments forwarded to bootstrap_data_multiple_rounds.
# ("exclude_demos" is intentionally not set here.)
dc_kwargs = dict(
    sampling_temperature_base=base_temp,
    sampling_temperature_delta=0.0001,
    next_round_dataset_callback=callback.move_on_callback_correct_with_max,
    num_threads=NUM_THREADS,
)

# Output file name -> examples used to generate that file.
dataset_filenames = {
    "trainset_data.jsonl": trainset[:samples],
    "devset_data.jsonl": devset[:samples // 4],
}

def write_data(data, filename):
    """Bootstrap fine-tuning data from `data` and write it to `filename` as JSONL.

    Runs one round of `bootstrap_data_multiple_rounds` on `basicmh_bs`, keeps
    only results with a truthy score, expands each kept trace into
    prompt/completion records (demos excluded), formats them with
    `lm.format_data_for_vanilla_finetuning`, and writes one JSON object per
    line.

    Args:
        data: Examples to bootstrap from.
        filename: Path of the JSONL file to create or overwrite.
    """
    # Get the bootstrapped data for num_rounds=1, but using the callback.
    results = bootstrap_data_multiple_rounds(basicmh_bs, data, metric, num_rounds=1, **dc_kwargs)
    # Flatten the per-trace record lists in a single pass; the previous
    # sum(list_of_lists, []) re-copied the accumulated list on every step.
    records = [
        record
        for result in results
        if result["score"]
        for record in build_prompt_completion_data_from_trace(result["trace"], program=basicmh_bs, exclude_demos=True)
    ]
    # Format the data for finetuning using the LM.
    formatted = lm.format_data_for_vanilla_finetuning(records)
    print("Writing dataset with length", len(formatted), "to", filename)
    with open(filename, "w", encoding="utf-8") as f:
        for line in formatted:
            f.write(ujson.dumps(line) + "\n")

# Generate one JSONL file per entry of dataset_filenames.
for filename, examples in dataset_filenames.items():
    write_data(examples, filename)

Average Metric: 257 / 500  (51.4): 100%|██████████| 500/500 [00:01<00:00, 342.65it/s]


Writing dataset with length 771 to trainset_data.jsonl


Average Metric: 76 / 125  (60.8): 100%|██████████| 125/125 [00:43<00:00,  2.87it/s]

Writing dataset with length 228 to devset_data.jsonl





In [12]:
import concurrent.futures

# Start a fine-tuning job from the two JSONL files named in dataset_filenames
# (method="SFT" is presumably supervised fine-tuning — confirm against the LM API).
future_lm = lm.get_finetune("trainset_data.jsonl", "devset_data.jsonl", method="SFT")
# Top-level await relies on the notebook kernel's running event loop.
# NOTE(review): the saved output of this cell is an AttributeError mentioning
# `_asyncio.Future` and `_condition`; confirm which kind of future
# get_finetune returns and that this cell succeeds on a fresh kernel.
finetuned_lm = await future_lm
# Use temperature 0.0 for the fine-tuned model from here on.
finetuned_lm.kwargs["temperature"] = 0.0

AttributeError: '_asyncio.Future' object has no attribute '_condition'

In [None]:
# Sanity check: fine-tuning must yield a model identifier different from the
# base model's; the message reports the offending identifier on failure.
assert finetuned_lm.kwargs["model"] != lm.kwargs["model"], (
    f"fine-tuned LM still reports the base model id {lm.kwargs['model']!r}"
)

In [None]:
# Fresh BasicMH instance (nothing is loaded into it) paired with the
# fine-tuned LM; _set_all_predictor_lms presumably assigns finetuned_lm to
# every predictor of the program — verify against the DSPy version in use.
basicmh_bs_ft = BasicMH(**program_params)
basicmh_bs_ft._set_all_predictor_lms(finetuned_lm)

In [None]:
# Set to False to load the previously saved program instead of recompiling.
RECOMPILE_FT_MODEL = True

if not RECOMPILE_FT_MODEL:
    basicmh_bs_ft_bs = BasicMH(**program_params)
    basicmh_bs_ft_bs.load('mini_bs_ft_bs_hpqa_100.json')
else:
    max_bootstrapped_demos, max_labeled_demos, num_candidate_programs = 3, 3, 6
    config = {
        "max_bootstrapped_demos": max_bootstrapped_demos,
        "num_candidate_programs": num_candidate_programs,
        "num_threads": NUM_THREADS,
    }
    bsfsrs_teleprompter = BootstrapFewShotWithRandomSearch(metric=metric, **config)
    basicmh_bs_ft_bs = bsfsrs_teleprompter.compile(
        student=basicmh_bs_ft,
        trainset=trainset[:100],
        valset=devset[:250],
    )
    basicmh_bs_ft_bs.save('mini_bs_ft_bs_hpqa_100.json')

# In both cases the resulting program is pointed at the fine-tuned LM.
basicmh_bs_ft_bs._set_all_predictor_lms(finetuned_lm)

In [None]:
# Number of dev examples used for the final comparison.
TEST_SIZE = 300

# Evaluate the four program variants in a fixed order. The value returned by
# each _get_lm_info_str() call is discarded here.
baseline_eval = evaluate(BasicMH(**program_params), devset=devset[:TEST_SIZE])
baseline_eval._get_lm_info_str()

bs_eval = evaluate(basicmh_bs, devset=devset[:TEST_SIZE])
bs_eval._get_lm_info_str()

bs_ft_eval = evaluate(basicmh_bs_ft, devset=devset[:TEST_SIZE])
bs_ft_eval._get_lm_info_str()

bs_ft_bs_eval = evaluate(basicmh_bs_ft_bs, devset=devset[:TEST_SIZE])
bs_ft_bs_eval._get_lm_info_str()

# Summary header followed by one line per variant.
print(f"Results for HotPotQA finetuning gpt-4o-mini with rejection sampling N={samples} and up to 1 attempts for each example with one model for all predictors. Tested on first {TEST_SIZE} of devset.")
for label, outcome in (
    ("Non-finetuned model", baseline_eval),
    ("Non-finetuned bootstrapped model", bs_eval),
    ("Finetuned model", bs_ft_eval),
    ("Finetuned model with bootstrapping", bs_ft_bs_eval),
):
    print(f"{label}: {outcome}")