In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and local source edits take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

import zipfile
import os

import dotenv

dotenv.load_dotenv()
import dspy

from dspy.datasets import HotPotQA
import re
from dspy.evaluate import Evaluate

from dsp.utils import EM
from dsp.utils.utils import deduplicate
import collections

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import openai

# Read the API key from the environment (dotenv.load_dotenv() is called in the
# imports cell); os.environ.get yields None when the variable is unset.
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Base (non-finetuned) model name, and the sampling temperature that is reused
# for the default LM here and for the data-collection settings further down.
mini = "gpt-4o-mini-2024-07-18"
base_temp = 0.9

lm = dspy.OpenAIModel(model=mini, max_tokens=500, temperature=base_temp)

# Retrieval model reached over HTTP.
# NOTE(review): the endpoint is a hard-coded IP address — confirm it is still reachable.
colbert_v2_endpoint = "http://20.102.90.50:2017/wiki17_abstracts"
colbertv2 = dspy.ColBERTv2(url=colbert_v2_endpoint)

# Register the retriever and the LM as the defaults in dspy.settings.
dspy.settings.configure(rm=colbertv2, lm=lm) 

In [4]:
import dspy.evaluate


# Signature for an LM-based judge that compares a predicted answer with the gold one.
# NOTE(review): dspy presumably uses a Signature's docstring and field order when
# building the prompt, so treat both as runtime text — confirm before editing them.
class AnswerCorrectnessSignature(dspy.Signature):
    """Verify that the predicted answer matches the gold answer."""

    # Inputs: the question plus the reference and candidate answers.
    question = dspy.InputField()
    gold_answer = dspy.InputField(desc="correct answer for question")
    predicted_answer = dspy.InputField(desc="predicted answer for question")
    # Output: the verdict as text; the field description asks for 'True or False'.
    is_correct = dspy.OutputField(desc='True or False')

class AnswerCorrectness(dspy.Module):
    """Judge module: asks an LM whether a predicted answer matches an example's gold answer."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought predictor built from the correctness signature.
        self.evaluate_correctness = dspy.ChainOfThought(AnswerCorrectnessSignature)

    def forward(self, example, predicted_answer):
        """Evaluate `predicted_answer` against `example`.

        Args:
            example: Object exposing `question` and `answer` attributes.
            predicted_answer: The candidate answer to be judged.

        Returns:
            Whatever the underlying ChainOfThought predictor returns for the
            correctness signature.
        """
        judge_inputs = {
            "question": example.question,
            "gold_answer": example.answer,
            "predicted_answer": predicted_answer,
        }
        # The judge runs under its own low-temperature, short-output LM instead of
        # whatever LM is configured globally.
        judge_lm = dspy.OpenAI(model=mini, max_tokens=100, temperature=0.1)
        with dspy.context(lm=judge_lm):
            return self.evaluate_correctness(**judge_inputs)
    
# Judge instance used only by the disabled LM-based metric below.
evaluator = AnswerCorrectness()

# Alternative LM-judged metric, currently disabled: it would count a prediction as
# correct only when the judge's verdict contains "true" and does not contain "false".
# def metric(example, pred, trace=None):
#     result = evaluator(example=example, predicted_answer=pred.answer).is_correct.lower()
#     return "true" in result and not "false" in result

# Active metric: dspy's answer_exact_match.
metric = dspy.evaluate.answer_exact_match

In [5]:
# Load and configure the datasets.
# TRAIN_SIZE / EVAL_SIZE cap how many examples are kept from each split.
TRAIN_SIZE = 500
EVAL_SIZE = 500

hotpot_dataset = HotPotQA(train_seed=1, eval_seed=2023, test_size=0, keep_details="type")
# Mark 'question' as the only input field of every example. The train split is
# capped by TRAIN_SIZE (it was previously capped by EVAL_SIZE, which left
# TRAIN_SIZE unused and tied the two sizes together).
trainset = [x.with_inputs('question') for x in hotpot_dataset.train][:TRAIN_SIZE]
devset = [x.with_inputs('question') for x in hotpot_dataset.dev][:EVAL_SIZE]

# Thread count shared by evaluation, teleprompter compilation and data collection.
NUM_THREADS = 12

In [6]:
# Options for the shared evaluator: run on NUM_THREADS threads and show progress.
kwargs = {"num_threads": NUM_THREADS, "display_progress": True}
# Evaluator bound to the dev split and the active metric; later cells call it with
# a program and, where needed, a narrower `devset=` slice.
evaluate = Evaluate(devset=devset, metric=metric, **kwargs)

In [7]:
class BasicMH(dspy.Module):
    """Multi-hop retrieve-then-answer program.

    On each hop a search query is generated from the question and the context
    collected so far, passages are retrieved for that query, and they are merged
    (with de-duplication) into the context. After the last hop an answer is
    generated from the accumulated context.
    """

    def __init__(self, passages_per_hop=3, max_hops=2):
        """Build the retriever and the per-hop predictors.

        Args:
            passages_per_hop: Passed to dspy.Retrieve as `k`.
            max_hops: Number of query/retrieve rounds; one query generator is
                created per hop. Defaults to 2, the value that used to be
                hard-coded, so existing callers and saved states are unaffected.
        """
        super().__init__()
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_query = [dspy.ChainOfThought("context, question -> search_query") for _ in range(max_hops)]
        self.generate_answer = dspy.ChainOfThought("context, question -> answer")

    def forward(self, question, return_trace=False):
        """Answer `question` using the configured number of retrieval hops.

        Args:
            question: The question text.
            return_trace: When true, also return `dspy.settings.trace`.

        Returns:
            The answer prediction with the final `context` attached, or a
            `(prediction, trace)` tuple when `return_trace` is true.
        """
        context = []
        # Iterate over the generators themselves so the number of hops can never
        # drift from the number of predictors created in __init__.
        for generate_query in self.generate_query:
            search_query = generate_query(context=context, question=question).search_query
            passages = self.retrieve(search_query).passages
            context = deduplicate(context + passages)

        prediction = self.generate_answer(context=context, question=question).copy(context=context)

        if return_trace:
            return prediction, dspy.settings.trace
        return prediction

In [8]:
# program = BasicMH()
# program(question="What is the capital of France?")

In [9]:
from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch

# Constructor arguments shared by every BasicMH instance in this notebook.
program_params = {
    "passages_per_hop": 3,
}

# Set to True to re-run the few-shot search; otherwise the saved program is loaded.
COMPILE = False

# Search hyperparameters live outside the branch so that the checkpoint name is
# derived from the same values whether the program is compiled or loaded.
max_bootstrapped_demos, max_labeled_demos, num_candidate_programs = 3,3,6
basicmh_bs_path = f"basicmh_{max_bootstrapped_demos}_{max_labeled_demos}_{num_candidate_programs}.json"

if COMPILE:
    # max_labeled_demos is passed explicitly: it is part of the checkpoint name,
    # but was previously left out of the config, so the teleprompter silently
    # fell back to its own default.
    config = dict(
        max_bootstrapped_demos=max_bootstrapped_demos,
        max_labeled_demos=max_labeled_demos,
        num_candidate_programs=num_candidate_programs,
        num_threads=NUM_THREADS,
    )
    teleprompter = BootstrapFewShotWithRandomSearch(metric=metric, **config)
    basicmh_bs = teleprompter.compile(BasicMH(**program_params), trainset=trainset[:100], valset=devset[:150])
    basicmh_bs.save(basicmh_bs_path)

    # Compare the uncompiled program with the compiled one on the same dev slice.
    baseline_eval = evaluate(BasicMH(**program_params), devset=devset[:300])
    bs_eval = evaluate(basicmh_bs, devset=devset[:300])
else:
    basicmh_bs = BasicMH(**program_params)
    basicmh_bs.load(basicmh_bs_path)

In [10]:
from collections import Counter
from dspy.teleprompt.finetune_teleprompter import bootstrap_data_multiple_rounds, DataCollectionCallback, build_prompt_completion_data_from_trace
from typing import Callable
import ujson

# Number of examples taken from each split for data collection.
samples = 100

callback = DataCollectionCallback(num_correct=1, max_attempts=1)

dspy.settings.configure(experimental=True)
# Keyword arguments forwarded to bootstrap_data_multiple_rounds.
dc_kwargs = {
    # "exclude_demos":True, 
    "sampling_temperature_base": base_temp,
    "sampling_temperature_delta":0.0001,
    "next_round_dataset_callback": callback.move_on_callback_correct_with_max,
    "num_threads": NUM_THREADS,
}
basicmh_bs = BasicMH(**program_params)
basicmh_bs.load("basicmh_3_3_6.json")


def _collect_finetune_data(program, examples, out_path):
    """Bootstrap traces for `examples`, keep the scored ones, and write them as JSONL.

    Args:
        program: The dspy program to run over the examples.
        examples: Examples to bootstrap from.
        out_path: Destination file; one JSON document is written per line.

    Returns:
        The formatted records, as returned by lm.format_data_for_vanilla_finetuning.
    """
    results = bootstrap_data_multiple_rounds(program, examples, metric, num_rounds=1, **dc_kwargs)
    # NOTE: the bootstrap call appears to return every attempt, not only the
    # successful ones, so results are filtered on a truthy "score" here.
    # Flattening in a single comprehension avoids the quadratic sum(lists, []).
    pairs = [
        pair
        for result in results
        if result["score"]
        for pair in build_prompt_completion_data_from_trace(result["trace"], program=program, exclude_demos=True)
    ]
    records = lm.format_data_for_vanilla_finetuning(pairs)
    print(len(records))
    with open(out_path, "w") as f:
        for record in records:
            f.write(ujson.dumps(record) + "\n")
    return records


trainset_data = _collect_finetune_data(basicmh_bs, trainset[:samples], "trainset_data.jsonl")
# Whole-dataset JSON string; not read again in the visible cells, kept so the
# name stays defined.
ts_json = ujson.dumps(trainset_data)

devset_data = _collect_finetune_data(basicmh_bs, devset[:samples], "devset_data.jsonl")
# Same as ts_json, for the dev split.
ds_json = ujson.dumps(devset_data)

Average Metric: 62 / 100  (62.0): 100%|██████████| 100/100 [00:00<00:00, 351.07it/s]


186


Average Metric: 56 / 100  (56.0): 100%|██████████| 100/100 [00:00<00:00, 343.97it/s]


168


In [11]:
# Start fine-tuning with method="SFT" from the two JSONL files written by the
# data-collection cell (presumably training data first, validation data second —
# confirm against the start_training signature).
# NOTE(review): the saved output of this cell is an APIConnectionError, so the call
# did not complete in the recorded run; check connectivity before re-running.
lm.start_training("trainset_data.jsonl", "devset_data.jsonl", method="SFT")

APIConnectionError: Connection error.

In [None]:
# from dspy.teleprompt.finetune_teleprompter import BootstrapFinetune
# ft:gpt-4o-mini-2024-07-18:anyscale:nosys:9vUm0BYv
# teleprompter = BootstrapFinetune(training_kwargs={}, seed=314)

In [None]:
# Work-in-progress placeholder for fine-tuning through a BootstrapFinetune
# teleprompter. Only the teacher assignment is live code; the compile/save and
# load branches below are still commented out.
TRAIN_FROM_SCRATCH = False

if TRAIN_FROM_SCRATCH:
    # The bootstrapped few-shot program acts as the teacher.
    teacher = basicmh_bs
    
    # something like
    # Go through all the possible methods + save/load
    # change to future[lm] first


    # TODO: Working here
    # teleprompter = BootstrapFinetune(training_kwargs={}, seed=314)
#     kwargs = {"num_threads": NUM_THREADS, "combine": True}
#     # NOTE: LM IS NOT SET PROPERLY
#     basicmh_bs_ft = teleprompter.compile(BasicMH(**program_params), teacher=teacher, dataset=trainset[:samples], metric=metric, data_collection_kwargs=dc_kwargs, **kwargs)
#     basicmh_bs_ft.save('mini_ft_hpqa_100.json')
# else:
#     basicmh_bs_ft = BasicMH(**program_params)
#     basicmh_bs_ft.load('mini_ft_hpqa_100.json')

In [None]:
# Copy the base LM and point the copy at the fine-tuned model id; the remaining
# kwargs (presumably including the base temperature) carry over from `lm`.
lm2 = lm.copy()
lm2.kwargs["model"] = "ft:gpt-4o-mini-2024-07-18:anyscale:nosys:9vUm0BYv"

# Fresh, uncompiled program; judging by its name, the call below assigns lm2 to
# every predictor in the program — TODO confirm.
basicmh_bs_ft = BasicMH(**program_params)
basicmh_bs_ft._set_all_predictor_lms(lm2)
# Disabled debugging helpers: inspect the bound model / make it the global default.
# print(basicmh_bs_ft.predictors()[0].lm.kwargs["model"])
# dspy.settings.configure(lm=basicmh_bs_ft.predictors()[0].lm)

In [None]:
# Set to True to re-run the few-shot search on top of the fine-tuned program;
# otherwise the previously saved program state is loaded.
RECOMPILE_FT_MODEL = False

if RECOMPILE_FT_MODEL:
    max_bootstrapped_demos, max_labeled_demos, num_candidate_programs = 3,3,6
    # max_labeled_demos is passed explicitly; it was previously assigned but left
    # out of the config, so the teleprompter used its own default instead.
    config = dict(
        max_bootstrapped_demos=max_bootstrapped_demos,
        max_labeled_demos=max_labeled_demos,
        num_candidate_programs=num_candidate_programs,
        num_threads=NUM_THREADS,
    )
    bsfsrs_teleprompter = BootstrapFewShotWithRandomSearch(metric=metric, **config)
    basicmh_bs_ft_bs = bsfsrs_teleprompter.compile(student=basicmh_bs_ft, trainset=trainset[:100], valset=devset[:250])
    basicmh_bs_ft_bs.save('mini_bs_ft_bs_hpqa_100.json')
else:
    basicmh_bs_ft_bs = BasicMH(**program_params)
    basicmh_bs_ft_bs.load('mini_bs_ft_bs_hpqa_100.json')

In [None]:
# Final comparison: every program variant is scored on the same dev slice.
TEST_SIZE = 300
eval_examples = devset[:TEST_SIZE]

# Programs that run on the base model, at temperature 0.
base_eval_lm = dspy.OpenAI(model=mini, max_tokens=500, temperature=0.0)
with dspy.context(lm=base_eval_lm):
    baseline_eval = evaluate(BasicMH(**program_params), devset=eval_examples)
    bs_eval = evaluate(basicmh_bs, devset=eval_examples)

# Programs that run on the fine-tuned model, at temperature 0.
# NOTE(review): basicmh_bs_ft had lm2 bound to its predictors in an earlier cell; if a
# predictor-level LM takes precedence over dspy.context, that program is evaluated
# with lm2's settings rather than this LM's — confirm.
finetuned_eval_lm = dspy.OpenAI(model="ft:gpt-4o-mini-2024-07-18:anyscale:nosys:9vUm0BYv", max_tokens=500, temperature=0.0)
with dspy.context(lm=finetuned_eval_lm):
    bs_ft_eval = evaluate(basicmh_bs_ft, devset=eval_examples)
    bs_ft_bs_eval = evaluate(basicmh_bs_ft_bs, devset=eval_examples)

# Summary of the four scores.
print(f"Results for HotPotQA finetuning gpt-4o-mini with rejection sampling N={samples} and up to 1 attempts for each example with one model for all predictors. Tested on first {TEST_SIZE} of devset.")
print(f"Non-finetuned model: {baseline_eval}")
print(f"Non-finetuned bootstrapped model: {bs_eval}")
print(f"Finetuned model: {bs_ft_eval}")
print(f"Finetuned model with bootstrapping: {bs_ft_bs_eval}")