In [18]:
import glob
import os
import pandas as pd
import random
from dotenv import load_dotenv

import dspy
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

In [2]:
# Point the DSP notebook cache at ./cache (relative to the working directory).
# NOTE(review): this cell's execution count (2) is lower than the import cell's (18),
# but on a fresh top-to-bottom run `import dspy` executes first. Confirm the variable
# is still honoured when set after the import, or move this assignment above it.
os.environ["DSP_NOTEBOOK_CACHEDIR"] = os.path.join('.', 'cache')

In [19]:
# Load environment variables from a local .env file.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# dspy.OpenAI clients below — confirm.
load_dotenv()

True

## Initialize Models

In [3]:
# Base ("student") language model: a chat model with completions capped at 250 tokens.
turbo = dspy.OpenAI(model='gpt-3.5-turbo-1106', max_tokens=250, model_type='chat')

# Register it as the LM in the global DSPy settings.
dspy.settings.configure(lm=turbo)

In [4]:
# Second chat model; it is passed to the optimizer further down as the teacher LM
# (via `teacher_settings`) rather than being set as the global default.
gpt4T = dspy.OpenAI(model='gpt-4-1106-preview', max_tokens=350, model_type='chat')

In [22]:
# When True, the few-shot program is compiled with the optimizer; when False,
# it is loaded from a previously saved JSON file instead.
RUN_FROM_SCRATCH = True

# ScoNe Data

In [5]:
!git clone https://github.com/selenashe/ScoNe.git

Cloning into 'ScoNe'...
remote: Enumerating objects: 77, done.[K
remote: Counting objects: 100% (77/77), done.[Ks:  51% (40/77)[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 77 (delta 42), reused 42 (delta 20), pack-reused 0[K
Receiving objects: 100% (77/77), 116.25 KiB | 1.25 MiB/s, done.
Resolving deltas: 100% (42/42), done.


In [6]:
def load_scone(dirname):
    """Load every ScoNe CSV file in ``dirname`` as a list of ``dspy.Example`` objects.

    Each ``*.csv`` file is read with its first column as the index and tagged
    with a ``category`` equal to the file's base name without the extension.
    Every row is then turned into a binary Yes/No example whose question embeds
    the hypothesis sentence.

    Args:
        dirname: Directory containing the ScoNe ``*.csv`` files.

    Returns:
        A list of ``dspy.Example`` objects with the fields ``context``,
        ``question``, ``answer`` ("Yes" or "No") and ``category``, where
        ``context`` and ``question`` are marked as inputs.

    Raises:
        ValueError: If ``dirname`` contains no ``*.csv`` files.
    """
    # glob returns paths in a filesystem-dependent order. Sorting keeps the
    # resulting example order stable across machines, so a seeded shuffle of
    # the returned list is actually reproducible.
    filenames = sorted(glob.glob(os.path.join(dirname, "*.csv")))
    if not filenames:
        raise ValueError(f"No CSV files found in {dirname!r}")

    dfs = []
    for filename in filenames:
        df = pd.read_csv(filename, index_col=0)
        # The category is the file name without its ".csv" extension.
        df['category'] = os.path.splitext(os.path.basename(filename))[0]
        dfs.append(df)
    data_df = pd.concat(dfs)

    def as_example(row):
        """Convert one DataFrame row into a Yes/No ``dspy.Example``."""
        # The 'one_scoped' file is from an earlier dataset, MoNLI, and
        # so is formatted a bit differently:
        suffix = '' if row['category'] == 'one_scoped' else '_edited'
        # Reformat the hypothesis to be an embedded clause in a question:
        # lower-case its first character and drop the surrounding periods.
        hkey = 'sentence2' + suffix
        question = row[hkey][0].lower() + row[hkey][1: ].strip(".")
        question = f"Can we logically conclude for sure that {question}?"
        # Binary task formulation: only 'entailment' maps to "Yes".
        label = "Yes" if row['gold_label' + suffix] == 'entailment' else "No"
        return dspy.Example({
            "context": row['sentence1' + suffix],
            "question": question,
            "answer": label,
            "category": row['category']
        }).with_inputs("context", "question")

    return list(data_df.apply(as_example, axis=1).values)

## Train, Dev, Test

In [7]:
# Load the full ScoNe training split and shuffle it with a fixed seed.
all_train = load_scone("ScoNe/scone_nli/train")
random.seed(1)
random.shuffle(all_train)

# The first 200 shuffled examples are the training set; the next 50 are the dev set.
train = all_train[:200]
dev = all_train[200:250]

len(train), len(dev)

(200, 50)

In [34]:
# Preview a few dev examples instead of dumping the whole 50-item list.
dev[:5]

[Example({'context': 'The people are not happy when they play instruments.', 'question': 'Can we logically conclude for sure that the people are not happy when they play accordions?', 'answer': 'No', 'category': 'one_not_scoped'}) (input_keys={'question', 'context'}),
 Example({'context': 'the boy does not play a guitar', 'question': 'Can we logically conclude for sure that the boy does not play an instrument?', 'answer': 'No', 'category': 'one_scoped'}) (input_keys={'question', 'context'}),
 Example({'context': 'The three children not holding plants are not in school today.', 'question': 'Can we logically conclude for sure that the three children not holding poppies are not in school today?', 'answer': 'Yes', 'category': 'one_scoped_one_not_scoped'}) (input_keys={'question', 'context'}),
 Example({'context': 'A woman did not like cheeseburger and did not like clothes.', 'question': 'Can we logically conclude for sure that a woman did not like food and did not like clothes?', 'answer':

In [8]:
random.seed(1)

# Plain string literal: the original f-string had no placeholders.
test = load_scone(dirname="ScoNe/scone_nli/test")

# We're developing a system for the full ScoNe benchmark, but we'll
# evaluate only on one of the hardest and most informative ScoNe
# categories for now -- examples with a single negation that plays
# a crucial role in the reasoning:
test = [ex for ex in test if ex.category == "one_scoped"]

In [9]:
# Check the Yes/No label balance of the filtered test set.
pd.Series([ex.answer for ex in test]).value_counts()

No     100
Yes    100
Name: count, dtype: int64

# Evaluation

In [10]:
# Metric used for both evaluation and optimization: DSPy's built-in answer exact match.
# NOTE(review): presumably compares the predicted `answer` with the gold `answer` — confirm.
scone_accuracy = dspy.evaluate.metrics.answer_exact_match

In [11]:
# Evaluator over the filtered test split (passed as `devset`), single-threaded,
# with a progress bar and `display_table=0`.
evaluator = Evaluate(devset=test, num_threads=1, display_progress=True, display_table=0)

# Zero Shot Chain of Thought

In [12]:
# The parenthesised, adjacent string literals at the top of the class body are
# concatenated by Python into one string, which becomes the class docstring.
# NOTE(review): DSPy presumably uses this docstring as the task instructions in
# the prompt, so treat its text as behaviour rather than documentation — confirm.
class ScoNeSignature(dspy.Signature):
    ("""You are given some context (a premise) and a question (a hypothesis). """
    """You must indicate with Yes/No answer whether we can logically """
    """conclude the hypothesis from the premise.""")

    # Inputs: the premise (`context`) and the question built from the hypothesis.
    context = dspy.InputField()
    question = dspy.InputField()
    # Output: the answer, described to the model as "Yes or No".
    answer = dspy.OutputField(desc="Yes or No")

In [13]:
class ScoNeCoT(dspy.Module):
    """Chain-of-thought program for ScoNe built on ``ScoNeSignature``."""

    def __init__(self):
        super().__init__()
        # Single predictor: a ChainOfThought module over the ScoNe signature.
        self.generate_answer = dspy.ChainOfThought(ScoNeSignature)

    def forward(self, context, question):
        """Run the predictor on one context/question pair and return its result."""
        return self.generate_answer(context=context, question=question)

In [14]:
# Uncompiled instance of the program, used as the zero-shot baseline.
cot_zeroshot = ScoNeCoT()

In [20]:
# Score the zero-shot program on the filtered test split with the exact-match metric.
evaluator(cot_zeroshot, metric=scone_accuracy)


  0%|                                                                                                                                                      | 0/200 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   0%|                                                                                                                      | 0/200 [00:01<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   0%|▌                                                                                                             | 1/200 [00:01<04:11,  1.26s/it][A
Average Metric: 1 / 2  (50.0):   0%|▌                                                                                                              | 1/200 [00:02<04:11,  1.26s/it][A
Average Metric: 1 / 2  (50.0):   1%|█                                                                                                              | 2/200 [00:02<04:36,  1.40s/it][A
Average Metric: 2 / 3  (66.7):   1%|█                                               

Average Metric: 99 / 200  (49.5%)



  df = df.applymap(truncate_cell)


49.5

# Bootstrap Few Shot Optimization with Random Search

We use a "teacher" model (in this case GPT-4) to generate "demonstrations". A demonstration includes the intermediate "thoughts" (or other LLM responses) alongside the final answer. These thoughts help guide the LLM to reason in a similar way on test examples. The optimizer then uses those demonstrations as few-shot examples in the prompt. Examples of this are shown at the end.

In [21]:
# Random-search optimizer over bootstrapped few-shot demonstrations.
# With these settings it reports sampling between 1 and 8 traces per predictor
# and attempting to train 10 candidate sets. Candidates are scored with
# `scone_accuracy`, and GPT-4 (`gpt4T`) is supplied as the teacher LM.
bootstrap_optimizer = BootstrapFewShotWithRandomSearch(
    max_bootstrapped_demos=8,
    max_labeled_demos=8,
    num_candidate_programs=10,
    num_threads=8,
    metric=scone_accuracy,
    teacher_settings=dict(lm=gpt4T))

Going to sample between 1 and 8 traces per predictor.
Will attempt to train 10 candidate sets.


In [23]:
# Either compile the few-shot program now (expensive: many LM calls) or load
# previously saved demonstrations from disk.
# NOTE(review): the from-scratch branch does not save the compiled program in
# the portion of the notebook visible here — confirm it is saved elsewhere so
# the `else` branch has a file to load.
if RUN_FROM_SCRATCH:
    # Optimize starting from the zero-shot program, using `train` to bootstrap
    # demonstrations and `dev` as the validation set.
    cot_fewshot = bootstrap_optimizer.compile(cot_zeroshot, trainset=train, valset=dev)
else:
    cot_fewshot = ScoNeCoT()
    cot_fewshot.load("scone-cot_fewshot-turbo-gpt4-demos.json")


  0%|                                                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   0%|                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:42,  1.15it/s][A
Average Metric: 2 / 2  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:42,  1.15it/s][A
Average Metric: 3 / 3  (100.0):   4%|████▍                                                                                                          | 2/50 [00:01<00:41,  1.15it/s][A
Average Metric: 3 / 3  (100.0):   6%|██████▋                                        

Average Metric: 24 / 50  (48.0%)
Score: 48.0 for set: [0]
New best score: 48.0 for seed -3
Scores so far: [48.0]
Best score: 48.0



  0%|                                                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   0%|                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:24,  1.98it/s][A
Average Metric: 2 / 2  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:24,  1.98it/s][A
Average Metric: 2 / 2  (100.0):   4%|████▍                                                                                                          | 2/50 [00:00<00:14,  3.25it/s][A
Average Metric: 3 / 3  (100.0):   4%|████▍                                          

Average Metric: 29 / 50  (58.0%)
Score: 58.0 for set: [8]
New best score: 58.0 for seed -2
Scores so far: [48.0, 58.0]
Best score: 58.0



  0%|                                                                                                                                                      | 0/200 [00:00<?, ?it/s][A
  0%|▋                                                                                                                                             | 1/200 [00:04<15:37,  4.71s/it][A
  1%|█▍                                                                                                                                            | 2/200 [00:08<13:44,  4.16s/it][A
  2%|██▏                                                                                                                                           | 3/200 [00:12<12:49,  3.91s/it][A
  2%|██▊                                                                                                                                           | 4/200 [00:14<11:23,  3.49s/it][A
  2%|███▌                                                                           

Bootstrapped 8 full traces after 12 examples in round 0.



  0%|                                                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 0 / 1  (0.0):   0%|                                                                                                                         | 0/50 [00:00<?, ?it/s][A
Average Metric: 0 / 1  (0.0):   2%|██▎                                                                                                              | 1/50 [00:00<00:41,  1.18it/s][A
Average Metric: 1 / 2  (50.0):   2%|██▏                                                                                                             | 1/50 [00:00<00:41,  1.18it/s][A
Average Metric: 2 / 3  (66.7):   4%|████▍                                                                                                           | 2/50 [00:01<00:40,  1.18it/s][A
Average Metric: 2 / 3  (66.7):   6%|██████▋                                         

Average Metric: 30 / 50  (60.0%)
Score: 60.0 for set: [8]
New best score: 60.0 for seed -1
Scores so far: [48.0, 58.0, 60.0]
Best score: 60.0
Average of max per entry across top 1 scores: 0.6
Average of max per entry across top 2 scores: 0.82
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 0.92
Average of max per entry across top 8 scores: 0.92
Average of max per entry across top 9999 scores: 0.92



  0%|                                                                                                                                                      | 0/200 [00:00<?, ?it/s][A
  0%|▋                                                                                                                                             | 1/200 [00:04<13:41,  4.13s/it][A
  1%|█▍                                                                                                                                            | 2/200 [00:08<14:18,  4.33s/it][A
  2%|██▏                                                                                                                                           | 3/200 [00:11<11:55,  3.63s/it][A
  2%|██▊                                                                                                                                           | 4/200 [00:14<11:11,  3.42s/it][A
  2%|███▌                                                                           

Bootstrapped 7 full traces after 10 examples in round 0.



  0%|                                                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 0 / 1  (0.0):   0%|                                                                                                                         | 0/50 [00:00<?, ?it/s][A
Average Metric: 0 / 1  (0.0):   2%|██▎                                                                                                              | 1/50 [00:00<00:46,  1.06it/s][A
Average Metric: 1 / 2  (50.0):   2%|██▏                                                                                                             | 1/50 [00:01<00:46,  1.06it/s][A
Average Metric: 1 / 2  (50.0):   4%|████▍                                                                                                           | 2/50 [00:01<00:23,  2.06it/s][A
Average Metric: 2 / 3  (66.7):   4%|████▍                                           

Average Metric: 36 / 50  (72.0%)
Score: 72.0 for set: [8]
New best score: 72.0 for seed 0
Scores so far: [48.0, 58.0, 60.0, 72.0]
Best score: 72.0
Average of max per entry across top 1 scores: 0.72
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.98
Average of max per entry across top 9999 scores: 0.98



  0%|                                                                                                                                                      | 0/200 [00:00<?, ?it/s][A
  0%|▋                                                                                                                                             | 1/200 [00:02<08:40,  2.61s/it][A
  1%|█▍                                                                                                                                            | 2/200 [00:05<10:04,  3.05s/it][A
  2%|██▏                                                                                                                                           | 3/200 [00:09<10:22,  3.16s/it][A


Bootstrapped 3 full traces after 4 examples in round 0.



  0%|                                                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   0%|                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:42,  1.16it/s][A
Average Metric: 1 / 2  (50.0):   2%|██▏                                                                                                             | 1/50 [00:00<00:42,  1.16it/s][A
Average Metric: 2 / 3  (66.7):   4%|████▍                                                                                                           | 2/50 [00:00<00:41,  1.16it/s][A
Average Metric: 3 / 4  (75.0):   6%|██████▋                                         

Average Metric: 28 / 50  (56.0%)
Score: 56.0 for set: [8]
Scores so far: [48.0, 58.0, 60.0, 72.0, 56.0]
Best score: 72.0
Average of max per entry across top 1 scores: 0.72
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.98
Average of max per entry across top 9999 scores: 0.98



  0%|                                                                                                                                                      | 0/200 [00:00<?, ?it/s][A
  0%|▋                                                                                                                                             | 1/200 [00:02<07:51,  2.37s/it][A


Bootstrapped 1 full traces after 2 examples in round 0.



  0%|                                                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   0%|                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:37,  1.31it/s][A
Average Metric: 2 / 2  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:37,  1.31it/s][A
Average Metric: 3 / 3  (100.0):   4%|████▍                                                                                                          | 2/50 [00:00<00:36,  1.31it/s][A
Average Metric: 4 / 4  (100.0):   6%|██████▋                                        

Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 22 / 30  (73.3):  58%|███████████████████████████████████████████████████████████████▏                                             | 29/50 [00:04<00:03,  6.59it/s][A
Average Metric: 22 / 30  (73.3):  60%|█████████████████████████████████████████████████████████████████▍                                           | 30/50 [00:04<00:03,  6.18it/s][A
Average Metric: 22 / 31  (71.0):  60%|█████████████████████████████████████████████████████████████████▍                                           | 30/50 [00:04<00:03,  6.18it/s][A
Average Metric: 23 / 32  (71.9):  62%|███████████████████████████████████████████████████████████████████▌                                         | 31/50 [00:04<00:03,  6.18it/s][A
Average Metric: 23 / 32  (71.9):  64%|█████████████████████████████████████████████████████████████████████▊                                       | 32/50 [00:04<00:02,  7.23it/s][A
Average Metric: 23 / 33  (69.7):  64%|██████████████████████████████████████████████

Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.3 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.4 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 24 / 36  (66.7):  70%|████████████████████████████████████████████████████████████████████████████▎                                | 35/50 [00:05<00:02,  5.54it/s][A
Average Metric: 24 / 36  (66.7):  72%|██████████████████████████████████████████████████████████████████████████████▍                              | 36/50 [00:05<00:03,  4.17it/s][A
Average Metric: 25 / 37  (67.6):  72%|██████████████████████████████████████████████████████████████████████████████▍                              | 36/50 [00:05<00:03,  4.17it/s][A
Average Metric: 25 / 38  (65.8):  74%|████████████████████████████████████████████████████████████████████████████████▋                            | 37/50 [00:06<00:03,  4.17it/s][A
Average Metric: 25 / 38  (65.8):  76%|██████████████████████████████████████████████████████████████████████████████████▊                          | 38/50 [00:06<00:02,  4.05it/s][A

Backing off 1.3 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.2 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 26 / 39  (66.7):  76%|██████████████████████████████████████████████████████████████████████████████████▊                          | 38/50 [00:06<00:02,  4.05it/s][A
Average Metric: 26 / 39  (66.7):  78%|█████████████████████████████████████████████████████████████████████████████████████                        | 39/50 [00:06<00:03,  3.56it/s][A

Backing off 0.1 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 1.7 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.4 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 27 / 40  (67.5):  78%|█████████████████████████████████████████████████████████████████████████████████████                        | 39/50 [00:07<00:03,  3.56it/s][A
Average Metric: 27 / 40  (67.5):  80%|███████████████████████████████████████████████████████████████████████████████████████▏                     | 40/50 [00:07<00:03,  2.80it/s][A
Average Metric: 28 / 41  (68.3):  80%|███████████████████████████████████████████████████████████████████████████████████████▏                     | 40/50 [00:07<00:03,  2.80it/s][A
Average Metric: 28 / 41  (68.3):  82%|█████████████████████████████████████████████████████████████████████████████████████████▍                   | 41/50 [00:07<00:02,  3.03it/s][A

Backing off 3.0 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 29 / 42  (69.0):  82%|█████████████████████████████████████████████████████████████████████████████████████████▍                   | 41/50 [00:08<00:02,  3.03it/s][A
Average Metric: 29 / 42  (69.0):  84%|███████████████████████████████████████████████████████████████████████████████████████████▌                 | 42/50 [00:08<00:03,  2.22it/s][A

Backing off 0.0 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 30 / 43  (69.8):  84%|███████████████████████████████████████████████████████████████████████████████████████████▌                 | 42/50 [00:09<00:03,  2.22it/s][A
Average Metric: 30 / 43  (69.8):  86%|█████████████████████████████████████████████████████████████████████████████████████████████▋               | 43/50 [00:09<00:03,  2.15it/s][A
Average Metric: 31 / 44  (70.5):  86%|█████████████████████████████████████████████████████████████████████████████████████████████▋               | 43/50 [00:09<00:03,  2.15it/s][A

Backing off 0.4 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 32 / 45  (71.1):  88%|███████████████████████████████████████████████████████████████████████████████████████████████▉             | 44/50 [00:09<00:02,  2.15it/s][A
Average Metric: 32 / 45  (71.1):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████           | 45/50 [00:09<00:01,  2.94it/s][A

Backing off 2.9 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 33 / 46  (71.7):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████           | 45/50 [00:10<00:01,  2.94it/s][A
Average Metric: 33 / 46  (71.7):  92%|████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 46/50 [00:10<00:01,  2.15it/s][A
Average Metric: 34 / 47  (72.3):  92%|████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 46/50 [00:10<00:01,  2.15it/s][A
Average Metric: 34 / 48  (70.8):  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 47/50 [00:11<00:01,  2.15it/s][A
Average Metric: 34 / 48  (70.8):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 48/50 [00:11<00:01,  1.86it/s][A
Average Metric: 34 / 49  (69.4):  96%|██████████████████████████████████████████████

Average Metric: 35 / 50  (70.0%)
Score: 70.0 for set: [8]
Scores so far: [48.0, 58.0, 60.0, 72.0, 56.0, 70.0]
Best score: 72.0
Average of max per entry across top 1 scores: 0.72
Average of max per entry across top 2 scores: 0.94
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.98
Average of max per entry across top 9999 scores: 0.98



  0%|                                                                                                                                                      | 0/200 [00:00<?, ?it/s][A
  0%|▋                                                                                                                                             | 1/200 [00:02<09:47,  2.95s/it][A
  1%|█▍                                                                                                                                            | 2/200 [00:06<11:24,  3.46s/it][A
  2%|██▏                                                                                                                                           | 3/200 [00:10<11:57,  3.64s/it][A
  2%|██▊                                                                                                                                           | 4/200 [00:13<10:41,  3.27s/it][A


Bootstrapped 4 full traces after 5 examples in round 0.



  0%|                                                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   0%|                                                                                                                       | 0/50 [00:01<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   2%|██▏                                                                                                            | 1/50 [00:01<01:07,  1.38s/it][A
Average Metric: 2 / 2  (100.0):   2%|██▏                                                                                                            | 1/50 [00:01<01:07,  1.38s/it][A
Average Metric: 2 / 2  (100.0):   4%|████▍                                                                                                          | 2/50 [00:01<00:42,  1.13it/s][A
Average Metric: 3 / 3  (100.0):   4%|████▍                                          

Backing off 0.2 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 12 / 33  (36.4):  64%|█████████████████████████████████████████████████████████████████████▊                                       | 32/50 [00:05<00:02,  7.32it/s][A
Average Metric: 12 / 33  (36.4):  66%|███████████████████████████████████████████████████████████████████████▉                                     | 33/50 [00:05<00:02,  6.17it/s][A
Average Metric: 13 / 34  (38.2):  66%|███████████████████████████████████████████████████████████████████████▉                                     | 33/50 [00:06<00:02,  6.17it/s][A
Average Metric: 13 / 34  (38.2):  68%|██████████████████████████████████████████████████████████████████████████                                   | 34/50 [00:06<00:02,  5.83it/s][A
Average Metric: 13 / 35  (37.1):  68%|██████████████████████████████████████████████████████████████████████████                                   | 34/50 [00:06<00:02,  5.83it/s][A
Average Metric: 13 / 35  (37.1):  70%|██████████████████████████████████████████████

Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.0 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.0 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 13 / 36  (36.1):  70%|████████████████████████████████████████████████████████████████████████████▎                                | 35/50 [00:06<00:03,  4.35it/s][A


Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 1.9 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}


Average Metric: 13 / 36  (36.1):  72%|██████████████████████████████████████████████████████████████████████████████▍                              | 36/50 [00:06<00:03,  3.70it/s][A
Average Metric: 14 / 37  (37.8):  72%|██████████████████████████████████████████████████████████████████████████████▍                              | 36/50 [00:07<00:03,  3.70it/s][A
Average Metric: 14 / 37  (37.8):  74%|████████████████████████████████████████████████████████████████████████████████▋                            | 37/50 [00:07<00:03,  3.41it/s][A

Backing off 0.4 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.4 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 14 / 38  (36.8):  74%|████████████████████████████████████████████████████████████████████████████████▋                            | 37/50 [00:08<00:03,  3.41it/s][A
Average Metric: 14 / 38  (36.8):  76%|██████████████████████████████████████████████████████████████████████████████████▊                          | 38/50 [00:08<00:05,  2.34it/s][A
Average Metric: 15 / 39  (38.5):  76%|██████████████████████████████████████████████████████████████████████████████████▊                          | 38/50 [00:08<00:05,  2.34it/s][A
Average Metric: 15 / 39  (38.5):  78%|█████████████████████████████████████████████████████████████████████████████████████                        | 39/50 [00:08<00:04,  2.49it/s][A

Backing off 0.8 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.1 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.4 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 15 / 40  (37.5):  78%|█████████████████████████████████████████████████████████████████████████████████████                        | 39/50 [00:09<00:04,  2.49it/s][A
Average Metric: 15 / 40  (37.5):  80%|███████████████████████████████████████████████████████████████████████████████████████▏                     | 40/50 [00:09<00:05,  1.81it/s][A
Average Metric: 16 / 41  (39.0):  80%|███████████████████████████████████████████████████████████████████████████████████████▏                     | 40/50 [00:09<00:05,  1.81it/s][A
Average Metric: 16 / 41  (39.0):  82%|█████████████████████████████████████████████████████████████████████████████████████████▍                   | 41/50 [00:09<00:04,  2.19it/s][A

Backing off 1.8 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 1.0 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.2 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.9 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 17 / 42  (40.5):  82%|█████████████████████████████████████████████████████████████████████████████████████████▍                   | 41/50 [00:10<00:04,  2.19it/s][A
Average Metric: 17 / 42  (40.5):  84%|███████████████████████████████████████████████████████████████████████████████████████████▌                 | 42/50 [00:10<00:04,  1.74it/s][A
Average Metric: 18 / 43  (41.9):  84%|███████████████████████████████████████████████████████████████████████████████████████████▌                 | 42/50 [00:10<00:04,  1.74it/s][A
Average Metric: 18 / 43  (41.9):  86%|█████████████████████████████████████████████████████████████████████████████████████████████▋               | 43/50 [00:10<00:03,  1.85it/s][A

Backing off 0.9 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 19 / 44  (43.2):  86%|█████████████████████████████████████████████████████████████████████████████████████████████▋               | 43/50 [00:11<00:03,  1.85it/s][A
Average Metric: 19 / 44  (43.2):  88%|███████████████████████████████████████████████████████████████████████████████████████████████▉             | 44/50 [00:11<00:03,  1.89it/s][A

Backing off 0.6 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 19 / 45  (42.2):  88%|███████████████████████████████████████████████████████████████████████████████████████████████▉             | 44/50 [00:11<00:03,  1.89it/s][A
Average Metric: 19 / 45  (42.2):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████           | 45/50 [00:11<00:02,  1.94it/s][A

Backing off 0.1 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 20 / 46  (43.5):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████           | 45/50 [00:12<00:02,  1.94it/s][A
Average Metric: 20 / 46  (43.5):  92%|████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 46/50 [00:12<00:02,  1.89it/s][A

Backing off 1.7 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 4.5 seconds after 4 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 20 / 47  (42.6):  92%|████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 46/50 [00:13<00:02,  1.89it/s][A
Average Metric: 20 / 47  (42.6):  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 47/50 [00:13<00:02,  1.46it/s][A
Average Metric: 20 / 48  (41.7):  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 47/50 [00:14<00:02,  1.46it/s][A
Average Metric: 20 / 48  (41.7):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 48/50 [00:14<00:01,  1.39it/s][A
Average Metric: 21 / 49  (42.9):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 48/50 [00:14<00:01,  1.39it/s][A
Average Metric: 21 / 49  (42.9):  98%|██████████████████████████████████████████████

Average Metric: 21 / 50  (42.0%)
Score: 42.0 for set: [8]
Scores so far: [48.0, 58.0, 60.0, 72.0, 56.0, 70.0, 42.0]
Best score: 72.0
Average of max per entry across top 1 scores: 0.72
Average of max per entry across top 2 scores: 0.94
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0



  0%|                                                                                                                                                      | 0/200 [00:00<?, ?it/s][A
  0%|▋                                                                                                                                             | 1/200 [00:02<09:50,  2.97s/it][A
  1%|█▍                                                                                                                                            | 2/200 [00:07<13:30,  4.10s/it][A
  2%|██▏                                                                                                                                           | 3/200 [00:10<11:44,  3.58s/it][A
  2%|██▊                                                                                                                                           | 4/200 [00:15<12:23,  3.79s/it][A


Bootstrapped 4 full traces after 5 examples in round 0.



  0%|                                                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   0%|                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:48,  1.01it/s][A
Average Metric: 1 / 2  (50.0):   2%|██▏                                                                                                             | 1/50 [00:01<00:48,  1.01it/s][A
Average Metric: 1 / 3  (33.3):   4%|████▍                                                                                                           | 2/50 [00:01<00:47,  1.01it/s][A
Average Metric: 1 / 3  (33.3):   6%|██████▋                                         

Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 19 / 35  (54.3):  68%|██████████████████████████████████████████████████████████████████████████                                   | 34/50 [00:05<00:03,  4.93it/s][A
Average Metric: 19 / 35  (54.3):  70%|████████████████████████████████████████████████████████████████████████████▎                                | 35/50 [00:05<00:02,  6.31it/s][A
Average Metric: 19 / 36  (52.8):  70%|████████████████████████████████████████████████████████████████████████████▎                                | 35/50 [00:06<00:02,  6.31it/s][A
Average Metric: 19 / 36  (52.8):  72%|██████████████████████████████████████████████████████████████████████████████▍                              | 36/50 [00:06<00:02,  5.52it/s][A
Average Metric: 19 / 37  (51.4):  72%|██████████████████████████████████████████████████████████████████████████████▍                              | 36/50 [00:06<00:02,  5.52it/s][A
Average Metric: 20 / 38  (52.6):  74%|██████████████████████████████████████████████

Backing off 0.2 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 1.0 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 20 / 39  (51.3):  76%|██████████████████████████████████████████████████████████████████████████████████▊                          | 38/50 [00:07<00:02,  5.33it/s][A
Average Metric: 20 / 39  (51.3):  78%|█████████████████████████████████████████████████████████████████████████████████████                        | 39/50 [00:07<00:03,  3.43it/s][A
Average Metric: 21 / 40  (52.5):  78%|█████████████████████████████████████████████████████████████████████████████████████                        | 39/50 [00:07<00:03,  3.43it/s][A
Average Metric: 21 / 40  (52.5):  80%|███████████████████████████████████████████████████████████████████████████████████████▏                     | 40/50 [00:07<00:03,  2.94it/s][A

Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.1 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.7 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 22 / 41  (53.7):  80%|███████████████████████████████████████████████████████████████████████████████████████▏                     | 40/50 [00:08<00:03,  2.94it/s][A
Average Metric: 22 / 41  (53.7):  82%|█████████████████████████████████████████████████████████████████████████████████████████▍                   | 41/50 [00:08<00:03,  2.66it/s][A
Average Metric: 23 / 42  (54.8):  82%|█████████████████████████████████████████████████████████████████████████████████████████▍                   | 41/50 [00:08<00:03,  2.66it/s][A
Average Metric: 23 / 42  (54.8):  84%|███████████████████████████████████████████████████████████████████████████████████████████▌                 | 42/50 [00:08<00:03,  2.64it/s][A

Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 1.9 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.8 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 23 / 43  (53.5):  84%|███████████████████████████████████████████████████████████████████████████████████████████▌                 | 42/50 [00:09<00:03,  2.64it/s][A
Average Metric: 23 / 43  (53.5):  86%|█████████████████████████████████████████████████████████████████████████████████████████████▋               | 43/50 [00:09<00:04,  1.53it/s][A
Average Metric: 24 / 44  (54.5):  86%|█████████████████████████████████████████████████████████████████████████████████████████████▋               | 43/50 [00:10<00:04,  1.53it/s][A

Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 25 / 45  (55.6):  88%|███████████████████████████████████████████████████████████████████████████████████████████████▉             | 44/50 [00:10<00:03,  1.53it/s][A
Average Metric: 25 / 45  (55.6):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████           | 45/50 [00:10<00:02,  1.94it/s][A

Backing off 1.3 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 25 / 46  (54.3):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████           | 45/50 [00:11<00:02,  1.94it/s][A
Average Metric: 25 / 46  (54.3):  92%|████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 46/50 [00:11<00:02,  1.43it/s][A
Average Metric: 26 / 47  (55.3):  92%|████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 46/50 [00:12<00:02,  1.43it/s][A
Average Metric: 26 / 47  (55.3):  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 47/50 [00:12<00:01,  1.75it/s][A

Backing off 1.2 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 26 / 48  (54.2):  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 47/50 [00:13<00:01,  1.75it/s][A
Average Metric: 26 / 48  (54.2):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 48/50 [00:13<00:01,  1.36it/s][A
Average Metric: 27 / 49  (55.1):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 48/50 [00:14<00:01,  1.36it/s][A
Average Metric: 27 / 49  (55.1):  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 49/50 [00:14<00:00,  1.34it/s][A
Average Metric: 28 / 50  (56.0):  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 49/50 [00:14<00:00,  1.34it/s][A
Average Metric: 28 / 50  (56.0): 100%|██████████████████████████████████████████████

Average Metric: 28 / 50  (56.0%)
Score: 56.0 for set: [8]
Scores so far: [48.0, 58.0, 60.0, 72.0, 56.0, 70.0, 42.0, 56.0]
Best score: 72.0
Average of max per entry across top 1 scores: 0.72
Average of max per entry across top 2 scores: 0.94
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0



  0%|                                                                                                                                                      | 0/200 [00:00<?, ?it/s][A
  0%|▋                                                                                                                                             | 1/200 [00:04<16:17,  4.91s/it][A
  1%|█▍                                                                                                                                            | 2/200 [00:08<13:51,  4.20s/it][A
  2%|██▏                                                                                                                                           | 3/200 [00:12<12:42,  3.87s/it][A
  2%|██▊                                                                                                                                           | 4/200 [00:14<11:20,  3.47s/it][A
  2%|███▌                                                                           

Bootstrapped 5 full traces after 6 examples in round 0.



  0%|                                                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 0 / 1  (0.0):   0%|                                                                                                                         | 0/50 [00:01<?, ?it/s][A
Average Metric: 0 / 1  (0.0):   2%|██▎                                                                                                              | 1/50 [00:01<00:55,  1.12s/it][A
Average Metric: 1 / 2  (50.0):   2%|██▏                                                                                                             | 1/50 [00:01<00:55,  1.12s/it][A
Average Metric: 1 / 2  (50.0):   4%|████▍                                                                                                           | 2/50 [00:01<00:26,  1.81it/s][A
Average Metric: 2 / 3  (66.7):   4%|████▍                                           

Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.8 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 19 / 35  (54.3):  68%|██████████████████████████████████████████████████████████████████████████                                   | 34/50 [00:06<00:02,  7.11it/s][A
Average Metric: 19 / 35  (54.3):  70%|████████████████████████████████████████████████████████████████████████████▎                                | 35/50 [00:06<00:04,  3.30it/s][A
Average Metric: 20 / 36  (55.6):  70%|████████████████████████████████████████████████████████████████████████████▎                                | 35/50 [00:06<00:04,  3.30it/s][A

Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 20 / 37  (54.1):  72%|██████████████████████████████████████████████████████████████████████████████▍                              | 36/50 [00:07<00:04,  3.30it/s][A
Average Metric: 20 / 37  (54.1):  74%|████████████████████████████████████████████████████████████████████████████████▋                            | 37/50 [00:07<00:05,  2.36it/s][A
Average Metric: 21 / 38  (55.3):  74%|████████████████████████████████████████████████████████████████████████████████▋                            | 37/50 [00:07<00:05,  2.36it/s][A
Average Metric: 21 / 38  (55.3):  76%|██████████████████████████████████████████████████████████████████████████████████▊                          | 38/50 [00:07<00:05,  2.32it/s][A
Average Metric: 22 / 39  (56.4):  76%|██████████████████████████████████████████████████████████████████████████████████▊                          | 38/50 [00:08<00:05,  2.32it/s][A
Average Metric: 22 / 39  (56.4):  78%|██████████████████████████████████████████████

Backing off 0.1 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.2 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 1.6 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.8 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.3 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 23 / 40  (57.5):  78%|█████████████████████████████████████████████████████████████████████████████████████                        | 39/50 [00:08<00:04,  2.37it/s][A
Average Metric: 23 / 40  (57.5):  80%|███████████████████████████████████████████████████████████████████████████████████████▏                     | 40/50 [00:08<00:04,  2.27it/s][A
Average Metric: 24 / 41  (58.5):  80%|███████████████████████████████████████████████████████████████████████████████████████▏                     | 40/50 [00:09<00:04,  2.27it/s][A
Average Metric: 24 / 41  (58.5):  82%|█████████████████████████████████████████████████████████████████████████████████████████▍                   | 41/50 [00:09<00:03,  2.34it/s][A

Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 1.4 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.7 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 25 / 42  (59.5):  82%|█████████████████████████████████████████████████████████████████████████████████████████▍                   | 41/50 [00:10<00:03,  2.34it/s][A
Average Metric: 25 / 42  (59.5):  84%|███████████████████████████████████████████████████████████████████████████████████████████▌                 | 42/50 [00:10<00:04,  1.67it/s][A
Average Metric: 26 / 43  (60.5):  84%|███████████████████████████████████████████████████████████████████████████████████████████▌                 | 42/50 [00:10<00:04,  1.67it/s][A
Average Metric: 26 / 43  (60.5):  86%|█████████████████████████████████████████████████████████████████████████████████████████████▋               | 43/50 [00:10<00:03,  1.84it/s][A

Backing off 1.5 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 1.8 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 27 / 44  (61.4):  86%|█████████████████████████████████████████████████████████████████████████████████████████████▋               | 43/50 [00:11<00:03,  1.84it/s][A
Average Metric: 27 / 44  (61.4):  88%|███████████████████████████████████████████████████████████████████████████████████████████████▉             | 44/50 [00:11<00:03,  1.88it/s][A

Backing off 1.0 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 3.1 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 28 / 45  (62.2):  88%|███████████████████████████████████████████████████████████████████████████████████████████████▉             | 44/50 [00:12<00:03,  1.88it/s][A
Average Metric: 28 / 45  (62.2):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████           | 45/50 [00:12<00:04,  1.25it/s][A
Average Metric: 29 / 46  (63.0):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████           | 45/50 [00:13<00:04,  1.25it/s][A
Average Metric: 29 / 46  (63.0):  92%|████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 46/50 [00:13<00:02,  1.44it/s][A

Backing off 2.1 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 30 / 47  (63.8):  92%|████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 46/50 [00:13<00:02,  1.44it/s][A
Average Metric: 30 / 47  (63.8):  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 47/50 [00:13<00:02,  1.36it/s][A
Average Metric: 31 / 48  (64.6):  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 47/50 [00:14<00:02,  1.36it/s][A
Average Metric: 31 / 48  (64.6):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 48/50 [00:14<00:01,  1.48it/s][A
Average Metric: 32 / 49  (65.3):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 48/50 [00:15<00:01,  1.48it/s][A
Average Metric: 32 / 49  (65.3):  98%|██████████████████████████████████████████████

Average Metric: 33 / 50  (66.0%)
Score: 66.0 for set: [8]
Scores so far: [48.0, 58.0, 60.0, 72.0, 56.0, 70.0, 42.0, 56.0, 66.0]
Best score: 72.0
Average of max per entry across top 1 scores: 0.72
Average of max per entry across top 2 scores: 0.94
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0



  0%|                                                                                                                                                      | 0/200 [00:00<?, ?it/s][A
  0%|▋                                                                                                                                             | 1/200 [00:02<08:49,  2.66s/it][A
  1%|█▍                                                                                                                                            | 2/200 [00:04<08:02,  2.44s/it][A


Bootstrapped 2 full traces after 3 examples in round 0.



  0%|                                                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   0%|                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:38,  1.27it/s][A
Average Metric: 2 / 2  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:38,  1.27it/s][A
Average Metric: 2 / 3  (66.7):   4%|████▍                                                                                                           | 2/50 [00:00<00:37,  1.27it/s][A
Average Metric: 3 / 4  (75.0):   6%|██████▋                                         

Backing off 0.4 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 10 / 17  (58.8):  32%|██████████████████████████████████▉                                                                          | 16/50 [00:02<00:04,  7.59it/s][A

Backing off 0.0 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 11 / 18  (61.1):  34%|█████████████████████████████████████                                                                        | 17/50 [00:03<00:04,  7.59it/s][A
Average Metric: 11 / 18  (61.1):  36%|███████████████████████████████████████▏                                                                     | 18/50 [00:03<00:06,  5.00it/s][A
Average Metric: 11 / 19  (57.9):  36%|███████████████████████████████████████▏                                                                     | 18/50 [00:03<00:06,  5.00it/s][A
Average Metric: 11 / 19  (57.9):  38%|█████████████████████████████████████████▍                                                                   | 19/50 [00:03<00:06,  4.66it/s][A
Average Metric: 11 / 20  (55.0):  38%|█████████████████████████████████████████▍                                                                   | 19/50 [00:03<00:06,  4.66it/s][A
Average Metric: 11 / 20  (55.0):  40%|███████████████████████████████████████████▌  

Backing off 0.0 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 1.4 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 1.0 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 1.2 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 11 / 21  (52.4):  40%|███████████████████████████████████████████▌                                                                 | 20/50 [00:04<00:05,  5.14it/s][A
Average Metric: 11 / 21  (52.4):  42%|█████████████████████████████████████████████▊                                                               | 21/50 [00:04<00:09,  3.13it/s][A

Backing off 0.4 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.2 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 11 / 22  (50.0):  42%|█████████████████████████████████████████████▊                                                               | 21/50 [00:04<00:09,  3.13it/s][A
Average Metric: 12 / 23  (52.2):  44%|███████████████████████████████████████████████▉                                                             | 22/50 [00:05<00:08,  3.13it/s][A
Average Metric: 12 / 23  (52.2):  46%|██████████████████████████████████████████████████▏                                                          | 23/50 [00:05<00:10,  2.55it/s][A
Average Metric: 13 / 24  (54.2):  46%|██████████████████████████████████████████████████▏                                                          | 23/50 [00:05<00:10,  2.55it/s][A
Average Metric: 13 / 25  (52.0):  48%|████████████████████████████████████████████████████▎                                                        | 24/50 [00:05<00:10,  2.55it/s][A
Average Metric: 13 / 25  (52.0):  50%|██████████████████████████████████████████████

Backing off 1.0 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 2.0 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 1.5 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 2.2 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 13 / 26  (50.0):  50%|██████████████████████████████████████████████████████▌                                                      | 25/50 [00:06<00:08,  2.96it/s][A
Average Metric: 13 / 26  (50.0):  52%|████████████████████████████████████████████████████████▋                                                    | 26/50 [00:06<00:08,  2.97it/s][A

Backing off 3.4 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 13 / 27  (48.1):  52%|████████████████████████████████████████████████████████▋                                                    | 26/50 [00:06<00:08,  2.97it/s][A
Average Metric: 13 / 27  (48.1):  54%|██████████████████████████████████████████████████████████▊                                                  | 27/50 [00:06<00:08,  2.82it/s][A

Backing off 0.2 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 14 / 28  (50.0):  54%|██████████████████████████████████████████████████████████▊                                                  | 27/50 [00:07<00:08,  2.82it/s][A
Average Metric: 14 / 28  (50.0):  56%|█████████████████████████████████████████████████████████████                                                | 28/50 [00:07<00:08,  2.60it/s][A

Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 3.8 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 14 / 29  (48.3):  56%|█████████████████████████████████████████████████████████████                                                | 28/50 [00:08<00:08,  2.60it/s][A
Average Metric: 14 / 29  (48.3):  58%|███████████████████████████████████████████████████████████████▏                                             | 29/50 [00:08<00:13,  1.60it/s][A
Average Metric: 14 / 30  (46.7):  58%|███████████████████████████████████████████████████████████████▏                                             | 29/50 [00:08<00:13,  1.60it/s][A
Average Metric: 14 / 30  (46.7):  60%|█████████████████████████████████████████████████████████████████▍                                           | 30/50 [00:08<00:10,  1.82it/s][A
Average Metric: 14 / 31  (45.2):  60%|█████████████████████████████████████████████████████████████████▍                                           | 30/50 [00:08<00:10,  1.82it/s][A

Backing off 2.3 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 15 / 32  (46.9):  62%|███████████████████████████████████████████████████████████████████▌                                         | 31/50 [00:09<00:10,  1.82it/s][A
Average Metric: 15 / 32  (46.9):  64%|█████████████████████████████████████████████████████████████████████▊                                       | 32/50 [00:09<00:07,  2.26it/s][A
Average Metric: 15 / 33  (45.5):  64%|█████████████████████████████████████████████████████████████████████▊                                       | 32/50 [00:09<00:07,  2.26it/s][A
Average Metric: 15 / 33  (45.5):  66%|███████████████████████████████████████████████████████████████████████▉                                     | 33/50 [00:09<00:06,  2.51it/s][A

Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 2.2 seconds after 4 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.5 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 16 / 34  (47.1):  66%|███████████████████████████████████████████████████████████████████████▉                                     | 33/50 [00:10<00:06,  2.51it/s][A
Average Metric: 16 / 34  (47.1):  68%|██████████████████████████████████████████████████████████████████████████                                   | 34/50 [00:10<00:07,  2.00it/s][A

Backing off 7.7 seconds after 4 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 17 / 35  (48.6):  68%|██████████████████████████████████████████████████████████████████████████                                   | 34/50 [00:10<00:07,  2.00it/s][A
Average Metric: 17 / 35  (48.6):  70%|████████████████████████████████████████████████████████████████████████████▎                                | 35/50 [00:10<00:07,  2.08it/s][A

Backing off 0.0 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 17 / 36  (47.2):  70%|████████████████████████████████████████████████████████████████████████████▎                                | 35/50 [00:11<00:07,  2.08it/s][A
Average Metric: 17 / 36  (47.2):  72%|██████████████████████████████████████████████████████████████████████████████▍                              | 36/50 [00:11<00:07,  1.81it/s][A
Average Metric: 17 / 37  (45.9):  72%|██████████████████████████████████████████████████████████████████████████████▍                              | 36/50 [00:11<00:07,  1.81it/s][A

Backing off 1.6 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 3.3 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 17 / 38  (44.7):  74%|████████████████████████████████████████████████████████████████████████████████▋                            | 37/50 [00:12<00:07,  1.81it/s][A
Average Metric: 17 / 38  (44.7):  76%|██████████████████████████████████████████████████████████████████████████████████▊                          | 38/50 [00:12<00:05,  2.13it/s][A
Average Metric: 17 / 39  (43.6):  76%|██████████████████████████████████████████████████████████████████████████████████▊                          | 38/50 [00:12<00:05,  2.13it/s][A
Average Metric: 17 / 39  (43.6):  78%|█████████████████████████████████████████████████████████████████████████████████████                        | 39/50 [00:12<00:05,  2.16it/s][A

Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 5.9 seconds after 4 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 12.9 seconds after 5 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 17 / 40  (42.5):  78%|█████████████████████████████████████████████████████████████████████████████████████                        | 39/50 [00:13<00:05,  2.16it/s][A
Average Metric: 17 / 40  (42.5):  80%|███████████████████████████████████████████████████████████████████████████████████████▏                     | 40/50 [00:13<00:05,  1.85it/s][A
Average Metric: 18 / 41  (43.9):  80%|███████████████████████████████████████████████████████████████████████████████████████▏                     | 40/50 [00:13<00:05,  1.85it/s][A

Backing off 2.1 seconds after 3 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 19 / 42  (45.2):  82%|█████████████████████████████████████████████████████████████████████████████████████████▍                   | 41/50 [00:14<00:04,  1.85it/s][A
Average Metric: 19 / 42  (45.2):  84%|███████████████████████████████████████████████████████████████████████████████████████████▌                 | 42/50 [00:14<00:03,  2.10it/s][A
Average Metric: 20 / 43  (46.5):  84%|███████████████████████████████████████████████████████████████████████████████████████████▌                 | 42/50 [00:14<00:03,  2.10it/s][A
Average Metric: 20 / 43  (46.5):  86%|█████████████████████████████████████████████████████████████████████████████████████████████▋               | 43/50 [00:14<00:02,  2.38it/s][A

Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 21 / 44  (47.7):  86%|█████████████████████████████████████████████████████████████████████████████████████████████▋               | 43/50 [00:14<00:02,  2.38it/s][A
Average Metric: 21 / 44  (47.7):  88%|███████████████████████████████████████████████████████████████████████████████████████████████▉             | 44/50 [00:14<00:02,  2.44it/s][A
Average Metric: 22 / 45  (48.9):  88%|███████████████████████████████████████████████████████████████████████████████████████████████▉             | 44/50 [00:16<00:02,  2.44it/s][A
Average Metric: 22 / 45  (48.9):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████           | 45/50 [00:16<00:03,  1.63it/s][A
Average Metric: 23 / 46  (50.0):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████           | 45/50 [00:16<00:03,  1.63it/s][A
Average Metric: 23 / 47  (48.9):  92%|██████████████████████████████████████████████

Average Metric: 23 / 50  (46.0%)
Score: 46.0 for set: [8]
Scores so far: [48.0, 58.0, 60.0, 72.0, 56.0, 70.0, 42.0, 56.0, 66.0, 46.0]
Best score: 72.0
Average of max per entry across top 1 scores: 0.72
Average of max per entry across top 2 scores: 0.94
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0



  0%|                                                                                                                                                      | 0/200 [00:00<?, ?it/s][A
  0%|▋                                                                                                                                             | 1/200 [00:02<09:28,  2.86s/it][A
  1%|█▍                                                                                                                                            | 2/200 [00:06<11:26,  3.47s/it][A
  2%|██▏                                                                                                                                           | 3/200 [00:10<11:47,  3.59s/it][A
  2%|██▊                                                                                                                                           | 4/200 [00:15<13:45,  4.21s/it][A
  2%|███▌                                                                           

Bootstrapped 6 full traces after 8 examples in round 0.



  0%|                                                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   0%|                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:39,  1.24it/s][A
Average Metric: 2 / 2  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:39,  1.24it/s][A
Average Metric: 2 / 2  (100.0):   4%|████▍                                                                                                          | 2/50 [00:00<00:20,  2.32it/s][A
Average Metric: 3 / 3  (100.0):   4%|████▍                                          

Average Metric: 32 / 50  (64.0%)
Score: 64.0 for set: [8]
Scores so far: [48.0, 58.0, 60.0, 72.0, 56.0, 70.0, 42.0, 56.0, 66.0, 46.0, 64.0]
Best score: 72.0
Average of max per entry across top 1 scores: 0.72
Average of max per entry across top 2 scores: 0.94
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0



  0%|                                                                                                                                                      | 0/200 [00:00<?, ?it/s][A
  0%|▋                                                                                                                                             | 1/200 [00:03<11:58,  3.61s/it][A
  1%|█▍                                                                                                                                            | 2/200 [00:06<09:57,  3.02s/it][A
  2%|██▏                                                                                                                                           | 3/200 [00:10<12:30,  3.81s/it][A
  2%|██▊                                                                                                                                           | 4/200 [00:13<11:15,  3.45s/it][A
  2%|███▌                                                                           

Bootstrapped 4 full traces after 6 examples in round 0.



  0%|                                                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   0%|                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:38,  1.27it/s][A
Average Metric: 2 / 2  (100.0):   2%|██▏                                                                                                            | 1/50 [00:00<00:38,  1.27it/s][A
Average Metric: 2 / 3  (66.7):   4%|████▍                                                                                                           | 2/50 [00:00<00:37,  1.27it/s][A
Average Metric: 3 / 4  (75.0):   6%|██████▋                                         

Backing off 0.4 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 20 / 42  (47.6):  82%|█████████████████████████████████████████████████████████████████████████████████████████▍                   | 41/50 [00:06<00:01,  7.79it/s][A
Average Metric: 20 / 42  (47.6):  84%|███████████████████████████████████████████████████████████████████████████████████████████▌                 | 42/50 [00:06<00:01,  5.24it/s][A
Average Metric: 20 / 43  (46.5):  84%|███████████████████████████████████████████████████████████████████████████████████████████▌                 | 42/50 [00:06<00:01,  5.24it/s][A
Average Metric: 20 / 43  (46.5):  86%|█████████████████████████████████████████████████████████████████████████████████████████████▋               | 43/50 [00:06<00:01,  5.80it/s][A
Average Metric: 21 / 44  (47.7):  86%|█████████████████████████████████████████████████████████████████████████████████████████████▋               | 43/50 [00:06<00:01,  5.80it/s][A
Average Metric: 21 / 44  (47.7):  88%|██████████████████████████████████████████████

Backing off 0.1 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.9 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 22 / 46  (47.8):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████           | 45/50 [00:07<00:00,  6.06it/s][A
Average Metric: 22 / 46  (47.8):  92%|████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 46/50 [00:07<00:01,  3.18it/s][A
Average Metric: 23 / 47  (48.9):  92%|████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 46/50 [00:07<00:01,  3.18it/s][A
Average Metric: 23 / 47  (48.9):  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 47/50 [00:07<00:00,  3.67it/s][A
Average Metric: 24 / 48  (50.0):  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 47/50 [00:07<00:00,  3.67it/s][A
Average Metric: 24 / 48  (50.0):  96%|██████████████████████████████████████████████

Average Metric: 25 / 50  (50.0%)
Score: 50.0 for set: [8]
Scores so far: [48.0, 58.0, 60.0, 72.0, 56.0, 70.0, 42.0, 56.0, 66.0, 46.0, 64.0, 50.0]
Best score: 72.0
Average of max per entry across top 1 scores: 0.72
Average of max per entry across top 2 scores: 0.94
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0



  0%|                                                                                                                                                      | 0/200 [00:00<?, ?it/s][A
  0%|▋                                                                                                                                             | 1/200 [00:04<14:56,  4.51s/it][A
  1%|█▍                                                                                                                                            | 2/200 [00:07<12:53,  3.90s/it][A
  2%|██▏                                                                                                                                           | 3/200 [00:13<15:21,  4.68s/it][A
  2%|██▊                                                                                                                                           | 4/200 [00:16<13:05,  4.01s/it][A
  2%|███▌                                                                           

Bootstrapped 8 full traces after 9 examples in round 0.



  0%|                                                                                                                                                       | 0/50 [00:00<?, ?it/s][A
Average Metric: 0 / 1  (0.0):   0%|                                                                                                                         | 0/50 [00:00<?, ?it/s][A
Average Metric: 0 / 1  (0.0):   2%|██▎                                                                                                              | 1/50 [00:00<00:44,  1.09it/s][A
Average Metric: 1 / 2  (50.0):   2%|██▏                                                                                                             | 1/50 [00:00<00:44,  1.09it/s][A
Average Metric: 2 / 3  (66.7):   4%|████▍                                                                                                           | 2/50 [00:01<00:43,  1.09it/s][A
Average Metric: 2 / 3  (66.7):   6%|██████▋                                         

Backing off 0.6 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}
Backing off 0.2 seconds after 1 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 23 / 45  (51.1):  88%|███████████████████████████████████████████████████████████████████████████████████████████████▉             | 44/50 [00:07<00:01,  5.38it/s][A
Average Metric: 23 / 45  (51.1):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████           | 45/50 [00:07<00:00,  6.11it/s][A
Average Metric: 24 / 46  (52.2):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████           | 45/50 [00:07<00:00,  6.11it/s][A
Average Metric: 24 / 46  (52.2):  92%|████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 46/50 [00:07<00:00,  4.72it/s][A

Backing off 1.0 seconds after 2 tries calling function <function GPT3.request at 0x117742200> with kwargs {}



Average Metric: 25 / 47  (53.2):  92%|████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 46/50 [00:08<00:00,  4.72it/s][A
Average Metric: 25 / 47  (53.2):  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 47/50 [00:08<00:01,  2.97it/s][A
Average Metric: 26 / 48  (54.2):  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 47/50 [00:08<00:01,  2.97it/s][A
Average Metric: 26 / 48  (54.2):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 48/50 [00:08<00:00,  2.61it/s][A
Average Metric: 26 / 49  (53.1):  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 48/50 [00:10<00:00,  2.61it/s][A
Average Metric: 26 / 49  (53.1):  98%|██████████████████████████████████████████████

Average Metric: 27 / 50  (54.0%)
Score: 54.0 for set: [8]
Scores so far: [48.0, 58.0, 60.0, 72.0, 56.0, 70.0, 42.0, 56.0, 66.0, 46.0, 64.0, 50.0, 54.0]
Best score: 72.0
Average of max per entry across top 1 scores: 0.72
Average of max per entry across top 2 scores: 0.94
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0
13 candidate programs found.





In [24]:
# Score `cot_fewshot` with the `scone_accuracy` metric via `evaluator` (all
# three names are defined in earlier cells). The call is left as the cell's
# last expression so that its return value is shown as the cell output.
# NOTE(review): `cot_fewshot` is presumably the program produced by the
# optimizer run whose log precedes this cell — confirm against its definition.
evaluator(cot_fewshot, metric=scone_accuracy)


  0%|                                                                                                                                                      | 0/200 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   0%|                                                                                                                      | 0/200 [00:00<?, ?it/s][A
Average Metric: 1 / 1  (100.0):   0%|▌                                                                                                             | 1/200 [00:00<02:57,  1.12it/s][A
Average Metric: 2 / 2  (100.0):   0%|▌                                                                                                             | 1/200 [00:01<02:57,  1.12it/s][A
Average Metric: 2 / 2  (100.0):   1%|█                                                                                                             | 2/200 [00:01<02:27,  1.34it/s][A
Average Metric: 3 / 3  (100.0):   1%|█                                              

Average Metric: 173 / 200  (86.5%)





86.5

In [25]:
# Save `cot_fewshot` to a JSON file; the path is relative, so the file lands
# in the kernel's current working directory.
# NOTE(review): presumably this stores the selected demos so the program can
# be reloaded later without re-running the optimization — confirm.
cot_fewshot.save("scone-cot_fewshot-turbo-gpt4-demos.json")

# Examples

In [26]:
# Print what the `turbo` LM was most recently asked, so the full prompt
# (instructions, format template, and the example being answered) can be read.
# NOTE(review): n=1 presumably limits the dump to the single latest call — confirm.
turbo.inspect_history(n=1)





You are given some context (a premise) and a question (a hypothesis). You must indicate with Yes/No answer whether we can logically conclude the hypothesis from the premise.

---

Follow the following format.

Context: ${context}

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: Yes or No

---

Context: It is not true that there is not a single person walking in the city.

Question: Can we logically conclude for sure that it is not true that there is not a single celebrity walking in the city?

Reasoning: Let's think step by step in order to produce the answer. We know that the statement negates the absence of any person walking in the city, which means there is at least one person walking in the city. However, we do not have information about the status of the person or persons walking in the city. They could be celebrities or they could be non-celebrities. Without specific information about the individuals' celebrity st