In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

import zipfile
import os

import dotenv

dotenv.load_dotenv()
import dspy

from dspy.datasets import HotPotQA
import re
from dspy.evaluate import Evaluate

from dsp.utils import EM
from dsp.utils.utils import deduplicate
import collections

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import openai

# --- Model / retriever configuration ---------------------------------------
mini = "gpt-4o-mini-2024-07-18"  # model id passed to every LM built in this notebook
base_temp = 0.9                  # temperature for prompt_model and for data collection
colbert_v2_endpoint = "http://20.102.90.50:2017/wiki17_abstracts"

# The API key is read from the environment (dotenv.load_dotenv() ran in the
# imports cell); the lookup yields None when the variable is not set.
openai.api_key = os.getenv('OPENAI_API_KEY')

prompt_model = dspy.OpenAIModel(model=mini, max_tokens=500, temperature=base_temp)

# Only the retrieval model is registered globally here; language models are
# supplied later through dspy.context(...) or dspy.settings.configure(lm=...).
colbertv2 = dspy.ColBERTv2(url=colbert_v2_endpoint)
dspy.settings.configure(rm=colbertv2)

In [4]:
import dspy.evaluate


# Signature for an LM-based judge: given a question, its gold answer and a
# predicted answer, the model fills in `is_correct`.
# NOTE(review): dspy presumably uses the class docstring and the `desc` strings
# as prompt text — treat them as runtime strings and change them with care.
class AnswerCorrectnessSignature(dspy.Signature):
    """Verify that the predicted answer matches the gold answer."""

    question = dspy.InputField()
    gold_answer = dspy.InputField(desc="correct answer for question")
    predicted_answer = dspy.InputField(desc="predicted answer for question")
    # Free-text verdict; the `metric` function lower-cases it and looks for
    # the substrings "true" / "false".
    is_correct = dspy.OutputField(desc='True or False')

class AnswerCorrectness(dspy.Module):
    """Module that asks an LM whether a predicted answer matches the gold answer."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought predictor built from the judge signature.
        self.evaluate_correctness = dspy.ChainOfThought(AnswerCorrectnessSignature)

    def forward(self, example, predicted_answer):
        """Run the judge for one example.

        Args:
            example: Object exposing `question` and `answer` attributes.
            predicted_answer: The answer to be checked against `example.answer`.

        Returns:
            The predictor's output for the AnswerCorrectnessSignature fields.
        """
        question = example.question
        gold_answer = example.answer
        # A dedicated LM (built from `mini`, low temperature, 100 max tokens)
        # is used only for the duration of this call.
        judge_lm = dspy.OpenAI(model=mini, max_tokens=100, temperature=0.1)
        with dspy.context(lm=judge_lm):
            verdict = self.evaluate_correctness(
                question=question,
                gold_answer=gold_answer,
                predicted_answer=predicted_answer,
            )
            return verdict
    
# Single shared judge instance; the `metric` function defined below calls it.
evaluator = AnswerCorrectness()

def metric(example, pred, trace=None):
    """Judge `pred.answer` against `example` using the LM-based `evaluator`.

    Args:
        example: Gold example; forwarded to `evaluator` unchanged.
        pred: Prediction object exposing an `answer` attribute.
        trace: Accepted as part of the metric signature; not used here.

    Returns:
        bool: True only when the judge's lower-cased `is_correct` text contains
        "true" and does not contain "false".
    """
    judgement = evaluator(example=example, predicted_answer=pred.answer)
    verdict_text = judgement.is_correct.lower()
    # Any mention of "false" vetoes the verdict, even if "true" is also present.
    if "false" in verdict_text:
        return False
    return "true" in verdict_text

# NOTE: this rebinds `metric`, replacing the LM-judge function defined above.
# Everything that reads `metric` after this line uses
# dspy.evaluate.answer_exact_match instead of the judge.
metric = dspy.evaluate.answer_exact_match

In [5]:
# Load and configure the datasets.
TRAIN_SIZE = 500  # number of training examples kept
EVAL_SIZE = 500   # number of dev examples kept

hotpot_dataset = HotPotQA(train_seed=1, eval_seed=2023, test_size=0, keep_details="type")
# Mark `question` as the only input field, then truncate each split to its own
# size constant (the training split was previously cut with EVAL_SIZE).
trainset = [x.with_inputs('question') for x in hotpot_dataset.train][:TRAIN_SIZE]
devset = [x.with_inputs('question') for x in hotpot_dataset.dev][:EVAL_SIZE]

# Worker-thread count shared by evaluation, bootstrapping and data collection.
NUM_THREADS = 12

In [6]:
# Shared evaluation harness: scores a program on `devset` with `metric`,
# using NUM_THREADS workers and a visible progress bar.
kwargs = {"num_threads": NUM_THREADS, "display_progress": True}
evaluate = Evaluate(devset=devset, metric=metric, **kwargs)

In [7]:
class BasicMH(dspy.Module):
    """Multi-hop question answering: retrieve over several hops, then answer.

    Each hop generates a search query from the question and the passages
    gathered so far, retrieves passages for that query, and merges them
    (deduplicated) into the running context. A final chain-of-thought
    predictor answers the question from the accumulated context.
    """

    def __init__(self, passages_per_hop=3, max_hops=2):
        """Build the retriever and the per-hop predictors.

        Args:
            passages_per_hop: Number of passages requested from the retriever
                on each hop.
            max_hops: Number of query/retrieve rounds. Defaults to 2, the value
                that used to be hard-coded, so existing callers get the same
                set of predictors as before.
        """
        super().__init__()
        self.max_hops = max_hops
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        # One independent query generator per hop.
        self.generate_query = [
            dspy.ChainOfThought("context, question -> search_query")
            for _ in range(max_hops)
        ]
        self.generate_answer = dspy.ChainOfThought("context, question -> answer")

    def forward(self, question, return_trace=False):
        """Answer `question` after `max_hops` rounds of retrieval.

        Args:
            question: The question text.
            return_trace: When True, also return `dspy.settings.trace`.

        Returns:
            The answer prediction with the accumulated `context` attached, or a
            `(prediction, dspy.settings.trace)` tuple when `return_trace` is True.
        """
        context = []
        for hop in range(self.max_hops):
            search_query = self.generate_query[hop](context=context, question=question).search_query
            passages = self.retrieve(search_query).passages
            # Merge the new passages into the running context without duplicates.
            context = deduplicate(context + passages)

        prediction = self.generate_answer(context=context, question=question).copy(context=context)

        if return_trace:
            return prediction, dspy.settings.trace
        return prediction

In [8]:
from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch

program_params = {
    "passages_per_hop": 3,
}

# Set to True to re-run prompt optimisation; otherwise the previously saved
# program is loaded from disk.
COMPILE = False

# Optimiser settings. They also determine the checkpoint file name, so both
# branches below derive the path from the same values instead of repeating a
# hard-coded literal that could drift out of sync.
max_bootstrapped_demos, max_labeled_demos, num_candidate_programs = 3,3,6
basicmh_bs_path = f"basicmh_{max_bootstrapped_demos}_{max_labeled_demos}_{num_candidate_programs}.json"

if COMPILE:
    with dspy.context(lm=dspy.OpenAI(model=mini, max_tokens=500, temperature=0.0)):
        # NOTE(review): max_labeled_demos appears in the file name but is not
        # forwarded to the optimiser below — confirm this is intended.
        config = dict(max_bootstrapped_demos=max_bootstrapped_demos, num_candidate_programs=num_candidate_programs, num_threads=NUM_THREADS)
        teleprompter = BootstrapFewShotWithRandomSearch(metric=metric, **config)
        basicmh_bs = teleprompter.compile(BasicMH(**program_params), trainset=trainset[:100], valset=devset[:250])
        basicmh_bs.save(basicmh_bs_path)

        # NOTE(review): devset[:300] contains the devset[:250] slice passed as
        # `valset` above, so these scores are not on fully held-out data.
        baseline_eval = evaluate(BasicMH(**program_params), devset=devset[:300])
        bs_eval = evaluate(basicmh_bs, devset=devset[:300])
else:
    basicmh_bs = BasicMH(**program_params)
    basicmh_bs.load(basicmh_bs_path)

In [9]:
from collections import Counter
from dspy.teleprompt.finetune_teleprompter import bootstrap_multiple_prompt_completion_data
from typing import Callable

# Number of training examples handed to the finetuning step as its dataset
# (trainset[:samples]); also echoed in the final results summary.
samples = 500
# Cap on data-collection rounds; read by DataCollectionCallback when it is
# constructed.
max_attempts = 1


class DataCollectionCallback:
    """Decides which examples get another data-collection round.

    Keeps a running count of rounds and returns no examples once
    `max_attempts` rounds have been seen.
    """

    def __init__(self):
        self.num_attempts = 0             # rounds completed so far
        self.num_correct = 2              # per-example count needed to stop retrying
        self.max_attempts = max_attempts  # module-level cap, captured at construction

    def move_on_callback_correct_with_max(self, dataset_copy, data):
        """Return the examples that should be retried in the next round.

        Args:
            dataset_copy: Iterable of hashable examples.
            data: Iterable of records; each record's "example" entry is counted.

        Returns:
            list: Empty once the attempt cap is reached; otherwise the examples
            from `dataset_copy` that occur fewer than `num_correct` times in `data`.
        """
        hits = Counter(record["example"] for record in data)
        pending = [ex for ex in dataset_copy if hits.get(ex, 0) < self.num_correct]
        self.num_attempts += 1
        return [] if self.num_attempts >= self.max_attempts else pending


callback = DataCollectionCallback()

# Options forwarded to the finetuning step as `data_collection_kwargs`.
# The dataset and metric are not set here; they are passed to compile() directly.
dc_kwargs = dict(
    include=None,
    exclude_demos=True,
    temperature=base_temp,
    temperature_delta=0.0001,
    move_on_callback=callback.move_on_callback_correct_with_max,
    num_threads=NUM_THREADS,
)

# results = bootstrap_multiple_prompt_completion_data(program, **kwargs)

In [10]:
from dspy.teleprompt.finetune_teleprompter import BootstrapFinetune

# Finetuning optimiser used by the training cell below. Note that this reuses
# the name `teleprompter`, which the COMPILE branch above binds to a
# BootstrapFewShotWithRandomSearch instance.
teleprompter = BootstrapFinetune(training_kwargs={}, seed=314)

In [11]:
# Set to True to run finetuning; otherwise the previously saved finetuned
# program is loaded from disk.
TRAIN_FROM_SCRATCH = False

if TRAIN_FROM_SCRATCH:
    # The bootstrapped few-shot program acts as the teacher.
    teacher = basicmh_bs
    lm = dspy.OpenAIModel(model=mini, max_tokens=500, temperature=base_temp)
    # for predictor in teacher.predictors():
    #     predictor.lm = lm
    # Side effect: this replaces the globally configured LM for later cells too.
    dspy.settings.configure(lm=lm)
    # TODO: Working here
    # teleprompter = BootstrapFinetune(training_kwargs={}, seed=314)
    # NOTE: this rebinds `kwargs`, which an earlier cell used for Evaluate's options.
    kwargs = {"num_threads": NUM_THREADS, "combine": True}
    # NOTE: LM IS NOT SET PROPERLY
    basicmh_bs_ft = teleprompter.compile(BasicMH(**program_params), teacher=teacher, dataset=trainset[:samples], metric=metric, data_collection_kwargs=dc_kwargs, **kwargs)
    # NOTE(review): the file name says "100" while the dataset slice is
    # trainset[:samples] — confirm the name is still accurate.
    basicmh_bs_ft.save('mini_ft_hpqa_100.json')
else:
    basicmh_bs_ft = BasicMH(**program_params)
    basicmh_bs_ft.load('mini_ft_hpqa_100.json')

In [12]:
# Take the LM attached to the program's first predictor, print its model id,
# and make it the globally configured LM for the cells that follow.
finetuned_lm = basicmh_bs_ft.predictors()[0].lm
print(finetuned_lm.kwargs["model"])
dspy.settings.configure(lm=finetuned_lm)

ft:gpt-4o-mini-2024-07-18:anyscale::9tkXPhLI


In [13]:
# Flip to False to skip optimisation and load the saved program instead.
RECOMPILE_FT_MODEL = True

if not RECOMPILE_FT_MODEL:
    basicmh_bs_ft_bs = BasicMH(**program_params)
    basicmh_bs_ft_bs.load('mini_bs_ft_bs_hpqa_100.json')
else:
    # Few-shot bootstrapping with random search, applied on top of the
    # finetuned program.
    max_bootstrapped_demos, max_labeled_demos, num_candidate_programs = 3,3,6
    config = {
        "max_bootstrapped_demos": max_bootstrapped_demos,
        "num_candidate_programs": num_candidate_programs,
        "num_threads": NUM_THREADS,
    }
    bsfsrs_teleprompter = BootstrapFewShotWithRandomSearch(metric=metric, **config)
    basicmh_bs_ft_bs = bsfsrs_teleprompter.compile(
        student=basicmh_bs_ft,
        trainset=trainset[:100],
        valset=devset[:250],
    )
    basicmh_bs_ft_bs.save('mini_bs_ft_bs_hpqa_100.json')

Going to sample between 1 and 3 traces per predictor.
Will attempt to bootstrap 6 candidate sets.


Average Metric: 9 / 10  (90.0):   4%|▎         | 9/250 [00:00<00:01, 136.55it/s]

Average Metric: 188 / 250  (75.2): 100%|██████████| 250/250 [00:00<00:00, 257.17it/s]


Score: 75.2 for set: [0, 0, 0]
New best sscore: 75.2 for seed -3
Scores so far: [75.2]
Best score: 75.2


Average Metric: 183 / 250  (73.2): 100%|██████████| 250/250 [00:00<00:00, 533.70it/s]


Score: 73.2 for set: [16, 16, 16]
Scores so far: [75.2, 73.2]
Best score: 75.2


  4%|▍         | 4/100 [00:00<00:00, 173.77it/s]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 184 / 250  (73.6): 100%|██████████| 250/250 [00:00<00:00, 296.67it/s]


Score: 73.6 for set: [16, 16, 16]
Scores so far: [75.2, 73.2, 73.6]
Best score: 75.2
Average of max per entry across top 1 scores: 0.752
Average of max per entry across top 2 scores: 0.784
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.8
Average of max per entry across top 8 scores: 0.8
Average of max per entry across top 9999 scores: 0.8


  3%|▎         | 3/100 [00:00<00:00, 240.60it/s]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 182 / 250  (72.8): 100%|██████████| 250/250 [00:00<00:00, 260.19it/s]


Score: 72.8 for set: [16, 16, 16]
Scores so far: [75.2, 73.2, 73.6, 72.8]
Best score: 75.2
Average of max per entry across top 1 scores: 0.752
Average of max per entry across top 2 scores: 0.784
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.812
Average of max per entry across top 8 scores: 0.812
Average of max per entry across top 9999 scores: 0.812


  1%|          | 1/100 [00:00<00:00, 154.99it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 178 / 250  (71.2): 100%|██████████| 250/250 [00:00<00:00, 310.60it/s]


Score: 71.2 for set: [16, 16, 16]
Scores so far: [75.2, 73.2, 73.6, 72.8, 71.2]
Best score: 75.2
Average of max per entry across top 1 scores: 0.752
Average of max per entry across top 2 scores: 0.784
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.824
Average of max per entry across top 8 scores: 0.824
Average of max per entry across top 9999 scores: 0.824


  1%|          | 1/100 [00:00<00:00, 451.92it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 182 / 250  (72.8): 100%|██████████| 250/250 [00:00<00:00, 305.13it/s]


Score: 72.8 for set: [16, 16, 16]
Scores so far: [75.2, 73.2, 73.6, 72.8, 71.2, 72.8]
Best score: 75.2
Average of max per entry across top 1 scores: 0.752
Average of max per entry across top 2 scores: 0.784
Average of max per entry across top 3 scores: 0.8
Average of max per entry across top 5 scores: 0.824
Average of max per entry across top 8 scores: 0.832
Average of max per entry across top 9999 scores: 0.832


  2%|▏         | 2/100 [00:00<00:00, 148.98it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 186 / 250  (74.4): 100%|██████████| 250/250 [00:00<00:00, 289.52it/s]


Score: 74.4 for set: [16, 16, 16]
Scores so far: [75.2, 73.2, 73.6, 72.8, 71.2, 72.8, 74.4]
Best score: 75.2
Average of max per entry across top 1 scores: 0.752
Average of max per entry across top 2 scores: 0.796
Average of max per entry across top 3 scores: 0.812
Average of max per entry across top 5 scores: 0.828
Average of max per entry across top 8 scores: 0.84
Average of max per entry across top 9999 scores: 0.84


  1%|          | 1/100 [00:00<00:00, 258.21it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 185 / 250  (74.0): 100%|██████████| 250/250 [00:00<00:00, 346.57it/s]


Score: 74.0 for set: [16, 16, 16]
Scores so far: [75.2, 73.2, 73.6, 72.8, 71.2, 72.8, 74.4, 74.0]
Best score: 75.2
Average of max per entry across top 1 scores: 0.752
Average of max per entry across top 2 scores: 0.796
Average of max per entry across top 3 scores: 0.816
Average of max per entry across top 5 scores: 0.836
Average of max per entry across top 8 scores: 0.848
Average of max per entry across top 9999 scores: 0.848


  3%|▎         | 3/100 [00:00<00:00, 258.47it/s]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 186 / 250  (74.4): 100%|██████████| 250/250 [00:02<00:00, 113.88it/s]

Score: 74.4 for set: [16, 16, 16]
Scores so far: [75.2, 73.2, 73.6, 72.8, 71.2, 72.8, 74.4, 74.0, 74.4]
Best score: 75.2
Average of max per entry across top 1 scores: 0.752
Average of max per entry across top 2 scores: 0.796
Average of max per entry across top 3 scores: 0.812
Average of max per entry across top 5 scores: 0.832
Average of max per entry across top 8 scores: 0.844
Average of max per entry across top 9999 scores: 0.848
9 candidate programs found.
[('retrieve', <dspy.retrieve.retrieve.Retrieve object at 0x124d86610>), ('generate_query[0]', Predict(StringSignature(context, question -> rationale, search_query
    instructions='Given the fields `context`, `question`, produce the fields `search_query`.'
    context = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Context:', 'desc': '${context}'})
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${que




In [16]:
TEST_SIZE = 300
# NOTE(review): this slice overlaps devset[:250], which earlier cells pass as
# `valset` to the optimisers — the scores are not on fully held-out data.
test_split = devset[:TEST_SIZE]
zero_temp_lm = dspy.OpenAI(model=mini, max_tokens=500, temperature=0.0)

# The two non-finetuned programs are scored under a temporary zero-temperature LM.
with dspy.context(lm=zero_temp_lm):
    baseline_eval = evaluate(BasicMH(**program_params), devset=test_split)
    bs_eval = evaluate(basicmh_bs, devset=test_split)
# The two finetuned programs are scored outside that temporary LM context.
bs_ft_eval = evaluate(basicmh_bs_ft, devset=test_split)
bs_ft_bs_eval = evaluate(basicmh_bs_ft_bs, devset=test_split)

print(f"Results for HotPotQA finetuning gpt-4o-mini with rejection sampling N={samples} and up to {max_attempts} attempts for each example with one model for all predictors. Tested on first {TEST_SIZE} of devset.")
print(f"Non-finetuned model: {baseline_eval}")
print(f"Non-finetuned bootstrapped model: {bs_eval}")
print(f"Finetuned model: {bs_ft_eval}")
print(f"Finetuned model with bootstrapping: {bs_ft_bs_eval}")

Average Metric: 219 / 300  (73.0): 100%|██████████| 300/300 [00:00<00:00, 4214.14it/s]
Average Metric: 227 / 300  (75.7): 100%|██████████| 300/300 [00:00<00:00, 2704.22it/s]
Average Metric: 216 / 300  (72.0): 100%|██████████| 300/300 [00:00<00:00, 2398.68it/s]
Average Metric: 222 / 300  (74.0): 100%|██████████| 300/300 [00:00<00:00, 2448.68it/s]

Results for HotPotQA finetuning gpt-4o-mini with rejection sampling N=500 and up to 1 attempts for each example with one model for all predictors. Tested on first 300 of devset.
Non-finetuned model: 73.0
Non-finetuned bootstrapped model: 75.67
Finetuned model: 72.0
Finetuned model with bootstrapping: 74.0



