In [1]:
import os
from pyprojroot import here
from datasets import Dataset

In [2]:
base_path = os.path.join("evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh")

In [3]:
from tqdm.notebook import tqdm


t4d = (
    lambda y_i, y_pred_i: y_pred_i
    and y_i in y_pred_i
    and y_i == str(y_pred_i.translate(str.maketrans("", "", ".'"))[2:])
)
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()\".")
) == y_pred_i.translate(str.maketrans("", "", '.(),"'))

In [4]:
def calculate_correct_prediction_count(benchmark, y: list[str], y_pred: list[str]):
    correct_preds = 0
    for y_i, y_pred_i in tqdm(zip(y, y_pred), desc="Calculating..."):
        if benchmark == "t4d":
            eval_fn = t4d
        elif benchmark == "bbh":
            eval_fn = bbh

        if eval_fn(y_i, y_pred_i):
            correct_preds += 1
        else:
            print(y_i)
            print("-" * 100)
            print(y_pred_i)
            print("-+=" * 100)
    return correct_preds

# boolean_expressions

In [18]:
subset = 'boolean_expressions'

In [19]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh/bbh-boolean_expressions/bbh-boolean_expressions_eval')

In [20]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [21]:
print(dataset[0]["reasoning"])

```
{
    "Step 1: Break down the statement into simpler components": {
        "Component 1": "True and False",
        "Component 2": "( not True )"
    },
    "Step 2: Evaluate the truth value of Component 1": {
        "Applying logical rule": "Boolean algebra",
        "Truth value": "False"
    },
    "Step 3: Evaluate the truth value of Component 2": {
        "Applying logical rule": "Negation",
        "Truth value": "False"
    },
    "Step 4: Determine the order of operations": {
        "Operator precedence": "NOT, AND, OR",
        "Order of evaluation": "Component 2, then Component 1, then OR operation"
    },
    "Step 5: Apply the order of operations to evaluate the statement": {
        "Applying logical rule": "Operator precedence",
        "Intermediate result": "False OR False"
    },
    "Step 6: Evaluate the final truth value of the statement": {
        "Applying logical rule": "Boolean algebra",
        "Truth value": "False"
    },
    "Step 7: Analyze the stat

In [22]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"]) + 1) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

True
----------------------------------------------------------------------------------------------------
False."
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
True
----------------------------------------------------------------------------------------------------
False.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
True
----------------------------------------------------------------------------------------------------
False.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-

0.984

In [13]:
subset = 'causal_judgement'

In [14]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-causal_judgement/bbh-causal_judgement_eval')

In [15]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 187
})

In [16]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Identify the primary event or action that triggered the problem": {
        "Description": "Determine the main event that led to the issue.",
        "Action": "Identify the simultaneous login of Alice and Zoe at 9 am."
    },
    "Step 2: Identify the unspoken rules or conditions that led to this situation": {
        "Description": "Understand the underlying rules or conditions that caused the problem.",
        "Action": "Recognize the rule that an empty email is sent if two people are logged in at the same time."
    },
    "Step 3: Sequence the events leading up to the problem": {
        "Description": "List the events in chronological order.",
        "Action": [
            "Alice logs in at 9 am.",
            "Zoe logs in at 9 am.",
            "An empty email is sent immediately."
        ]
    },
    "Step 4: Analytical Thinking - Evaluate the problem from different viewpoints": {
        "Description": "Consider different perspectives and challenge a

In [17]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[:3].strip()

In [18]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 187it [00:00, 29266.23it/s]

No, Yes.

Yes, No.

Yes, No.

No, Yes"

No, Yes.

Yes, No.

No, Yes.

No, Yes.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

No, Yes.

No, Yes."

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

No, Yes.

Yes, No.

Yes, No.

No, Yes.

No, Yes.

No, Yes.

No, Yes.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

No, Yes.

Yes, No.

No, Yes.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

No, Yes.

Yes, No.

No, Yes.

Yes, No.

Yes, No.

Yes, No.






0.732620320855615

# causal_judgement

In [23]:
subset = "causal_judgement"

In [24]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh/bbh-causal_judgement/bbh-causal_judgement_eval')

In [25]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 187
})

In [26]:
print(dataset[0]["reasoning"])

Here's the filled JSON for the given task:

```
{
    "Step 1: Define the problem": {
        "Problem statement": "Did Zoe cause an empty email to be sent from the central computer to a non-existent email address?",
        "Key factors": ["Zoe's actions (logging in at 9 am)", "Central computer system (sending an empty email when two users are logged in simultaneously)", "Simultaneous login (Alice and Zoe logging in at the same time)"]
    },
    "Step 2: Identify key assumptions": {
        "Assumptions": ["Central computer system is functioning normally (i.e., it sends an empty email when two users are logged in simultaneously)", "Empty email is not caused by an external factor (e.g., a hacker or system malfunction)"]
    },
    "Step 3: Simplify the problem": {
        "Key factors to focus on": ["Simultaneous login of two users (Alice and Zoe)", "Central computer system's behavior (sending an empty email when two users are logged in simultaneously)"]
    },
    "Step 4: Analyze th

In [27]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

No
----------------------------------------------------------------------------------------------------
Yes.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
No
----------------------------------------------------------------------------------------------------
Yes.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
Yes
----------------------------------------------------------------------------------------------------
No.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+

0.6898395721925134

# date_understanding

In [28]:
subset = 'date_understanding'

In [29]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh/bbh-date_understanding/bbh-date_understanding_eval')

In [30]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [31]:
print(dataset[0]["reasoning"])

Here is the filled JSON reasoning structure for the given task:

```
{
    "Step 1: Identify the core date-related problem": "Find the date one week ago from today",
    "Step 2: Identify the given date": "Jan 21, 2011",
    "Step 3: Identify the time period to subtract": "One week (7 days)",
    "Step 4: Calculate the date one day ago from the given date": "Jan 20, 2011",
    "Step 5: Subtract six more days from the date one day ago": "Jan 14, 2011",
    "Step 6: Determine the correct month and year for the calculated date": "January 2011",
    "Step 7: Format the calculated date in MM/DD/YYYY format": "01/14/2011",
    "Step 8: Compare the calculated date with the given options": "The closest option is (B) 01/15/2011, but it is one day ahead of the calculated date",
    "Step 9: Select the correct option that matches the calculated date": "None of the options exactly match the calculated date, but the closest one is (B) 01/15/2011"
}

Alternatively,

{
    "Problem Explanation": {
  

In [32]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [35]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"]) + 4) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(C)
----------------------------------------------------------------------------------------------------
B.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
(A)
----------------------------------------------------------------------------------------------------
C.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
(B)
----------------------------------------------------------------------------------------------------
in the correct format (MM/DD/YYYY) and matches option (B)."
-+=-+=-+=-+=-+=-+

0.9

In [34]:
def map_fn(ins):
    if not ins["answer_pred"] or len(ins["answer_pred"]) > 4:
        print(ins["input"])
        print("-" * 100)
        print(ins["target"], ins["answer_pred"])
        print("-" * 100)
        if not ins["answer_pred"]:
            print(ins["trajectory"])
        print("-+" * 100)

dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Yesterday, Jan 21, 2011, Jane ate 2 pizzas and 5 wings. What is the date 10 days ago in MM/DD/YYYY?
Options:
(A) 01/18/2011
(B) 01/12/2011
(C) 01/12/2069
(D) 01/13/2011
(E) 05/12/2010
(F) 08/12/2010
----------------------------------------------------------------------------------------------------
(B) B (However, note that this is an approximation as none of the given options exactly match the calculated answer).
----------------------------------------------------------------------------------------------------
-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Today is the first day of 2007. What is the date today in MM/DD/YYYY?
Options:
(A) 05/01/2007
(B) 01/01/2007
(C) 01/01/1951
(D) 01/01/2096
(E) 01/22/2007
(F) 12/18/2006
----------------------------------------------------------------------------------------------------
(B) in the

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [17]:
subset = 'disambiguation_qa'

In [18]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-disambiguation_qa/bbh-disambiguation_qa_eval')

In [19]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [20]:
print(dataset[0]["reasoning"])

```
{
    "Step 1: Identify the pronoun and its potential antecedents": {
        "Pronoun": "they",
        "Potential antecedents": ["The worker", "The pedestrian"]
    },
    "Step 2: Analyze the sentence structure to understand the pronoun's relationship with potential antecedents": {
        "Sentence structure": "The worker told the pedestrian that they were repairing the sidewalk as quickly as possible.",
        "Relationship between pronoun and potential antecedents": "The pronoun 'they' is the subject of the subordinate clause, and its antecedent could be either 'The worker' or 'The pedestrian'."
    },
    "Step 3: Examine the grammatical evidence to support or refute each potential antecedent": {
        "Grammatical evidence for each potential antecedent": [
            {"Antecedent": "The worker", "Evidence": "The worker is the main subject of the sentence, and the verb 'told' is in the active voice, suggesting that the worker is performing the action."},
            {"An

In [21]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [22]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"]) + 1) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(A), C.

(B), C.

(C), B.

(A), C.

(A), C.

(B), C.

(A), C.

(A), C.

(A), C.

(C), B.

(C), B.

(A), C.

(C), B.

(C), B.

(C), A.

(C), B.

(C), B.

(A), C.

(A), (C) Ambiguous.

(B), C.

(B), option (B) because the sentence's structure and the use of the word 'bought' imply that the accountant is the recipient of the car and therefore the one who needs it. This interpretation is consistent with standard grammatical rules and the logic of the situation."

(B), C.

(C), B.

(C), A.

(A), C.

(A), C.

(C), B.

(A), (C) Ambiguous.

(C), B.

(A), C.

(C), B.

(C), B.

(C), B.

(A), C.

(A), C.

(C), A.

(C), A.

(B), C.

(C), B.

(C), B.

(B), A.

(A), C.

(B), C.

(C), A.

(A), C.

(B), C.

(C), A.

(B), C.

(C), A.

(A), C.

(C), B.

(A), C.

(A), C.

(B), C.



0.788

# dyck_languages

In [36]:
subset = 'dyck_languages'

In [37]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh/bbh-dyck_languages/bbh-dyck_languages_eval')

In [38]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [39]:
print(dataset[0]["reasoning"])

Here's the filled JSON for the given reasoning structure:

```
{
    "Step 1: Identify the input sequence": {
        "Sequence": "{ ( < [ < > ]"
    },
    "Step 2: Analyze the sequence from different perspectives": {
        "Innermost parentheses": "The innermost parentheses are < >.",
        "Outermost parentheses": "The outermost parentheses are { }.",
        "Matching pairs of parentheses": "The matching pairs are ( ), < >, and [ ]."
    },
    "Step 3: Simplify the sequence by focusing on one type of parenthesis at a time": {
        "Innermost parentheses": "The innermost < > are already closed.",
        "Outermost parentheses": "The outermost { needs to be closed with } at the end."
    },
    "Step 4: Break down the sequence into smaller parts": {
        "Identify matching pairs of parentheses": "The pairs are ( ), < >, and [ ].",
        "Identify lone parentheses": "There is a lone { that needs to be closed."
    },
    "Step 5: Determine the core issue with the given s

In [40]:
def map_fn(ins):
    find = "Input: "
    index = ins["input"].find(find)
    
    return {
        "target": ins["input"][index + len(find):] + " " + ins["target"]
    }

dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [41]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()\"")
).replace(" ", "") == y_pred_i.translate(str.maketrans("", "", '.(),"')).replace(" ", "")

In [42]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

{ ( < [ < > ] > ) }
----------------------------------------------------------------------------------------------------
{ ( < [ < > ] ) }.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
{ ( < > ) } ( ( [ ] ) < [ ( [ [ ] ] [ { } ] { } [ < { [ ] } > ] ( ) ) ] > )
----------------------------------------------------------------------------------------------------
{ ( < > ) } ( ( [ ] ) < [ ( [ [ ] ] [ { } ] { } [ < { [ ] } > ] ( ) ) ] ) ) ).
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
< 

0.544

# logical_deduction_three_objects

In [43]:
subset = 'logical_deduction_three_objects'

In [44]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh/bbh-logical_deduction_three_objects/bbh-logical_deduction_three_objects_eval')

In [45]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [46]:
print(dataset[0]["reasoning"])

Here's the filled JSON for the above reasoning structure:

```
{
    "Step 1: Identify the core issue": "The task is to determine the cheapest fruit among plums, apples, and loquats.",
    "Step 2: Analyze the given statements": [
        "Statement 1: The loquats are the cheapest.",
        "Statement 2: The plums are less expensive than the apples."
    ],
    "Step 3: Eliminate or simplify irrelevant information": "There is no irrelevant information to be ignored or simplified.",
    "Step 4: Determine the logical consistency of the statements": "The statements are logically consistent with each other, as the loquats being the cheapest does not contradict the plums being less expensive than the apples.",
    "Step 5: Compare the prices of the fruits": [
        "Price comparison 1: Loquats are cheaper than plums.",
        "Price comparison 2: Plums are less expensive than apples.",
        "Price comparison 3: Loquats are cheaper than apples (based on the previous comparisons)."
  

In [47]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [48]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(C)
----------------------------------------------------------------------------------------------------
A.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
(C)
----------------------------------------------------------------------------------------------------
A.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=


0.992

# penguins_in_a_table

In [49]:
subset = 'penguins_in_a_table'

In [50]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh/bbh-penguins_in_a_table/bbh-penguins_in_a_table_eval')

In [51]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 146
})

In [52]:
print(dataset[0]["reasoning"])

Here is the filled JSON for the above reasoning structure:

```
{
    "Step 1: Identify the relevant data in the table": {
        "Task": "List the name and age of each penguin in the table",
        "Data": [
            {"Name": "Louis", "Age": 7},
            {"Name": "Bernard", "Age": 5},
            {"Name": "Vincent", "Age": 9},
            {"Name": "Gwen", "Age": 8}
        ]
    },
    "Step 2: Determine the age criteria": {
        "Task": "Identify the age threshold for the penguins (less than 8 years old)",
        "Age threshold": 8
    },
    "Step 3: Delete the specified penguin from the table": {
        "Task": "Remove the penguin named Bernard from the table",
        "Updated table": [
            {"Name": "Louis", "Age": 7},
            {"Name": "Vincent", "Age": 9},
            {"Name": "Gwen", "Age": 8}
        ]
    },
    "Step 4: Count the penguins that meet the age criteria": {
        "Task": "Count the number of penguins that are less than 8 years old",
    

In [53]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [54]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(A)
----------------------------------------------------------------------------------------------------
correct and supported by the data. The data clearly shows that only one penguin, Gwen, meets the criteria of being more than 5 years old and weighing more than 12 kg."
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
(C)
----------------------------------------------------------------------------------------------------
E is incorrect, the correct answer is not among the first 4 options, but since only the letter is asked, the correct letter is not among the first 4, so the correct letter is (E).
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=

0.9657534246575342

# reasoning_about_colored_objects

In [55]:
subset = 'reasoning_about_colored_objects'

In [56]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh/bbh-reasoning_about_colored_objects/bbh-reasoning_about_colored_objects_eval')

In [57]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [58]:
print(dataset[0]["reasoning"])

Here is the filled JSON for the above reasoning structure:

```
{
    "Step 1: Read and understand the task": "The task is to count the number of objects on the nightstand that are neither yellow nor green.",
    "Step 2: Identify the key elements of the task (e.g., colors, objects, location)": "The key elements are the colors (yellow and green), the objects (necklace, fidget spinner, keychain, sheet of paper, and stress ball), and the location (nightstand).",
    "Step 3: Determine the core issue or problem that needs to be addressed (e.g., counting objects that are neither yellow nor green)": "The core issue is to count the objects that are neither yellow nor green.",
    "Step 4: Simplify the problem by focusing only on the relevant characteristics (e.g., colors)": "The relevant characteristic is the color of the objects.",
    "Step 5: Break down the problem into smaller parts (e.g., counting yellow objects, counting green objects, counting remaining objects)": "The problem can be 

In [59]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [61]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"]) + 1) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(M)
----------------------------------------------------------------------------------------------------
confirmed to be (M) silver."
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
(A)
----------------------------------------------------------------------------------------------------
B.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
(J)
----------------------------------------------------------------------------------------------------
(F).
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-

0.968

# ruin_names

In [62]:
subset = 'ruin_names'

In [63]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh/bbh-ruin_names/bbh-ruin_names_eval')

In [64]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [65]:
print(dataset[0]["reasoning"])

Here is the filled JSON for the above reasoning structure:

```
{
    "Step 1: Identify the core issue or problem in identifying a humorous edit of the artist or movie name 'star wars'": "The core issue is to determine which option is a playful and amusing alteration of the original name 'star wars'.",
    "Step 2: Determine the key assumptions underlying the concept of a humorous edit and how they apply to the given options": {
        "Assumptions about humor": "Humor often involves wordplay, satire, or absurdity.",
        "Assumptions about editing": "A humorous edit should be a creative and unexpected alteration of the original name.",
        "Relevance to options": "The options should be evaluated based on their level of creativity, unexpectedness, and playfulness."
    },
    "Step 3: Analyze the options from different perspectives, questioning the assumptions about what makes an edit humorous": {
        "Perspective 1: Wordplay": "Option C, 'star warts', uses wordplay by repl

In [66]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()",'))[0]

In [67]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(B)
----------------------------------------------------------------------------------------------------
C.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
(B)
----------------------------------------------------------------------------------------------------
A.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
(D)
----------------------------------------------------------------------------------------------------
A.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+

0.864

# salient_translation_error_detection

In [68]:
subset = 'salient_translation_error_detection'

In [69]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh/bbh-salient_translation_error_detection/bbh-salient_translation_error_detection_eval')

In [70]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [71]:
print(dataset[0]["reasoning"])

Here is the filled JSON for the above reasoning structure:

```
{
  "Step 1: Understand the task and error types": {
    "Task description": "Identify the error in the translation from German to English",
    "Error types": [
      "Named Entities",
      "Numerical Values",
      "Modifiers or Adjectives",
      "Negation or Antonyms",
      "Facts",
      "Dropped Content"
    ]
  },
  "Step 2: Analyze the source text": {
    "Source text": "Der Potsdamer Platz ist ein platzartiger Verkehrsknotenpunkt in den Berliner Ortsteilen Mitte und Tiergarten im Bezirk Mitte zwischen der alten Innenstadt im Osten und dem neuen Berliner Westen.",
    "Key phrases and entities": [
      "Potsdamer Platz",
      "platzartiger Verkehrsknotenpunkt",
      "Berliner Ortsteilen",
      "Mitte",
      "Tiergarten",
      "Bezirk Mitte",
      "alten Innenstadt",
      "Osten",
      "neuen Berliner Westen"
    ]
  },
  "Step 3: Analyze the translation": {
    "Translation text": "Potsdamer Platz is a s

In [72]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [73]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(A)
----------------------------------------------------------------------------------------------------
D.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
(D)
----------------------------------------------------------------------------------------------------
F.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
(C)
----------------------------------------------------------------------------------------------------
F.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+

0.656

# sports_understanding

In [5]:
subset = 'sports_understanding'

In [6]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh/bbh-sports_understanding/bbh-sports_understanding_eval')

In [7]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [8]:
print(dataset[0]["reasoning"])

```
{
    "Assumptions about Tyreek Hill and context": "We assume that Tyreek Hill is a professional football player, specifically a wide receiver, and that the sentence refers to a play in a football game.",
    "Simplified sentence focusing on essential elements": "Tyreek Hill caught a screen pass.",
    "Analysis from different perspectives (player skills, game situation, team strategy)": {
        "Player skills": "Tyreek Hill is known for his speed, agility, and catching ability, which makes it plausible for him to catch a screen pass.",
        "Game situation": "A screen pass is often used in situations where the defense is blitzing or pressuring the quarterback, and the offense needs a quick and safe way to get the ball out to a receiver.",
        "Team strategy": "The team's strategy may involve using screen passes to get the ball to their playmakers, such as Tyreek Hill, in space and allow them to make plays."
    },
    "Core issue affecting plausibility": "The core issue i

In [9]:
set(dataset["answer_pred"]), set(dataset["target"])

({'False',
  'False.',
  'Highly plausible.',
  'Implausible.',
  'Low.',
  'No',
  'No.',
  'Plausible but uncertain.',
  'Plausible in certain contexts.',
  'Plausible.',
  'Somewhat plausible.',
  "The sentence 'Courtland Sutton hit a triple' is unlikely, but not impossible, without further context.",
  "The sentence 'Jayson Tatum nutmegged the defender' is somewhat plausible but unlikely in a professional soccer context.",
  "The sentence 'Santi Cazorla called for the screen' is plausible.",
  'The sentence is partially plausible.',
  'The sentence is plausible but potentially misleading.',
  'The sentence is plausible but unlikely.',
  'The sentence is plausible in certain contexts or scenarios, such as Tyler Glasnow being a batter or a spectator, but less plausible in the context of him being a pitcher.',
  'The sentence is plausible.',
  'The sentence is potentially plausible in a baseball context, but without additional information or context, its plausibility cannot be confirm

In [10]:
def map_fn(instance):
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '."')),
    }

dataset = dataset.map(map_fn)

set(dataset["answer_pred"])

{'False',
 'Highly plausible',
 'Implausible',
 'Low',
 'No',
 'Plausible',
 'Plausible but uncertain',
 'Plausible in certain contexts',
 'Somewhat plausible',
 "The sentence 'Courtland Sutton hit a triple' is unlikely, but not impossible, without further context",
 "The sentence 'Jayson Tatum nutmegged the defender' is somewhat plausible but unlikely in a professional soccer context",
 "The sentence 'Santi Cazorla called for the screen' is plausible",
 'The sentence is partially plausible',
 'The sentence is plausible',
 'The sentence is plausible but potentially misleading',
 'The sentence is plausible but unlikely',
 'The sentence is plausible in certain contexts or scenarios, such as Tyler Glasnow being a batter or a spectator, but less plausible in the context of him being a pitcher',
 'The sentence is potentially plausible',
 'The sentence is potentially plausible in a baseball context, but without additional information or context, its plausibility cannot be confirmed',
 'The s

In [11]:
# Plausible (Yes)
plausible_yes = [
    "Plausible",
    "Highly plausible",
    "The sentence 'Santi Cazorla called for the screen' is plausible",
    "The sentence is plausible",
    "Yes",
    "True",
]

# Implausible (No)
implausible_no = [
    "False",
    "Implausible",
    "No",
    "Unlikely",
    "The sentence is unlikely to be plausible",
    "implausible",
]

# Indeterminate
indeterminate = [
    "There is insufficient evidence to determine plausibility",
    "The sentence is plausible but unlikely",
    "The sentence is plausible in certain contexts or scenarios, such as Tyler Glasnow being a batter or a spectator, but less plausible in the context of him being a pitcher",
    "The sentence is potentially plausible",
    "The sentence is potentially plausible in a baseball context, but without additional information or context, its plausibility cannot be confirmed",
    "The sentence is partially plausible",
    "Somewhat plausible",
    "Plausible but uncertain",
    "Plausible in certain contexts",
    "Uncertain",
    "The sentence 'Courtland Sutton hit a triple' is unlikely, but not impossible, without further context",
    "The sentence 'Jayson Tatum nutmegged the defender' is somewhat plausible but unlikely in a professional soccer context",
    "Yes, but unlikely",
    "Yes, but with low probability",
    "Low",
    "Unlikely but not impossible",
    "Unlikely, but not impossible",
]


def map_fn(ins):
    for yes in plausible_yes:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "yes"
            }

    for no in implausible_no:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "no"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'Low',
 'Plausible but uncertain',
 'Plausible in certain contexts',
 'Somewhat plausible',
 "The sentence 'Courtland Sutton hit a triple' is unlikely, but not impossible, without further context",
 "The sentence 'Jayson Tatum nutmegged the defender' is somewhat plausible but unlikely in a professional soccer context",
 'The sentence is partially plausible',
 'The sentence is plausible but potentially misleading',
 'The sentence is plausible but unlikely',
 'The sentence is plausible in certain contexts or scenarios, such as Tyler Glasnow being a batter or a spectator, but less plausible in the context of him being a pitcher',
 'The sentence is potentially plausible',
 'The sentence is potentially plausible in a baseball context, but without additional information or context, its plausibility cannot be confirmed',
 'There is insufficient evidence to determine plausibility',
 'Uncertain',
 'Unlikely but not impossible',
 'Unlikely, but not impossible',
 'Yes, but unlikely',
 'Yes, but 

In [12]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

no
----------------------------------------------------------------------------------------------------
yes
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
yes
----------------------------------------------------------------------------------------------------
no
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
no
----------------------------------------------------------------------------------------------------
yes
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+

0.752

# tracking_shuffled_objects_three_objects

In [84]:
subset = 'tracking_shuffled_objects_three_objects'

In [85]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh/bbh-tracking_shuffled_objects_three_objects/bbh-tracking_shuffled_objects_three_objects_eval')

In [86]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [87]:
print(dataset[0]["reasoning"])

To solve the task, I will follow the step-by-step reasoning plan in JSON format.

```
{
    "Initial Partnerships": {
        "Alice's Partner": "Ophelia",
        "Bob's Partner": "Lola",
        "Claire's Partner": "Izzi"
    },
    "Partners after Bob and Claire switch": {
        "Alice's Partner": "Ophelia",
        "Bob's Partner": "Izzi",
        "Claire's Partner": "Lola"
    },
    "Partners after Claire and Alice switch": {
        "Alice's Partner": "Lola",
        "Bob's Partner": "Izzi",
        "Claire's Partner": "Ophelia"
    },
    "Partners after Alice and Bob switch": {
        "Alice's Partner": "Izzi",
        "Bob's Partner": "Lola",
        "Claire's Partner": "Ophelia"
    },
    "Is the final partner for Alice one of the options?": {
        "Option A (Ophelia)": "No",
        "Option B (Lola)": "No",
        "Option C (Izzi)": "Yes"
    },
    "Conclusion": "Alice's final partner is Izzi."
}
```

Alternatively, here is the filled JSON for the more detailed pla

In [88]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [89]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(A)
----------------------------------------------------------------------------------------------------
B
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
(C)
----------------------------------------------------------------------------------------------------
A.
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
(C)
----------------------------------------------------------------------------------------------------
A."
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+

0.976

# web_of_lies

In [90]:
subset = 'web_of_lies'

In [91]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh/bbh-web_of_lies/bbh-web_of_lies_eval')

In [92]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [93]:
print(dataset[0]["reasoning"])

To solve the task, we will follow the step-by-step reasoning plan and fill in the values for the corresponding keys.


```
{
    "Step 1: Analyze the statements from different perspectives":
    {
        "Identify the speakers and their statements": 
        [
            "Raymond tells the truth", 
            "Sal says Raymond lies", 
            "Alexis says Sal lies", 
            "Helene says Alexis lies", 
            "Elanor says Helene lies"
        ],
        "Consider the assumptions about who lies and who tells the truth": 
        [
            "If Raymond tells the truth, then Sal lies", 
            "If Sal lies, then Alexis tells the truth", 
            "If Alexis tells the truth, then Helene lies", 
            "If Helene lies, then Elanor tells the truth"
        ],
        "Evaluate the consistency of the statements": 
        [
            "The statements are consistent if Raymond tells the truth"
        ]
    },
    
    "Step 2: Simplify the chain of statements"

In [94]:
set(dataset["answer_pred"]), set(dataset["target"])

({'Andree tells the truth if Kandi tells the truth, otherwise Andree lies.',
  'Cannot be determined.',
  'False.',
  'No',
  'No definitive answer can be provided due to the paradoxical nature of the statements.',
  'No.',
  None,
  'True.',
  'Yes',
  'Yes, Conception tells the truth.',
  'Yes.',
  'no.',
  'true.'},
 {'No', 'Yes'})

In [97]:
def map_fn(instance):
    if not instance["answer_pred"]:
        return {"answer_pred": instance["answer_pred"]}

    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '."*')),
    }

dataset = dataset.map(map_fn)

set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'Andree tells the truth if Kandi tells the truth, otherwise Andree lies',
 'Cannot be determined',
 'False',
 'No',
 'No definitive answer can be provided due to the paradoxical nature of the statements',
 None,
 'True',
 'Yes',
 'Yes, Conception tells the truth',
 'no',
 'true'}

In [99]:
print(dataset.filter(lambda x: x["answer_pred"] == None)[0]["reasoning"])

```
{
    "Step 1: Identify the core issue or question": "Does Alexis tell the truth?",
    "Step 2: Break down the statements into smaller parts": {
        "Millicent's statement": "Millicent tells the truth",
        "Millie's statement": "Millicent tells the truth",
        "Michael's statement": "Millie tells the truth",
        "Antwan's statement": "Michael lies",
        "Alexis's statement": "Antwan tells the truth"
    },
    "Step 3: Analyze the relationships between the statements": {
        "Relationship between Millicent and Millie's statements": "Both Millicent and Millie claim that Millicent tells the truth, so they agree",
        "Relationship between Millie and Michael's statements": "Michael claims that Millie tells the truth, which means Michael agrees with Millie's statement about Millicent",
        "Relationship between Michael and Antwan's statements": "Antwan claims that Michael lies, which directly contradicts Michael's statement",
        "Relationship betw

In [106]:
# Truth (Yes)
truth_yes = [
    'True.',
    'True',
  'Yes',
  'Yes, Conception tells the truth',
  'Yes.',
  'true'
]

# False (No)
false_no = [
    'False',
 'No',
'no',
]

indeterminate = [
    'Andree tells the truth if Kandi tells the truth, otherwise Andree lies',
    'Cannot be determined',
    'No definitive answer can be provided due to the paradoxical nature of the statements',
 None,
 
]



def map_fn(ins):
    for yes in truth_yes:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "Yes"
            }

    for no in false_no:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "No"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'Andree tells the truth if Kandi tells the truth, otherwise Andree lies',
 'Cannot be determined',
 'No',
 'No definitive answer can be provided due to the paradoxical nature of the statements',
 None,
 'Yes'}

In [107]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

No
----------------------------------------------------------------------------------------------------
Yes
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
No
----------------------------------------------------------------------------------------------------
Yes
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
No
----------------------------------------------------------------------------------------------------
Yes
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+

0.848

# word_sorting

In [5]:
subset = 'word_sorting'

In [6]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_5/bbh/bbh-word_sorting/bbh-word_sorting_eval')

In [7]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [8]:
print(dataset[0]["reasoning"])

Here is the filled JSON for the above reasoning structure:

```
{
    "Step 1: Identify the core issue or problem": 
        "The main challenge in sorting the given list of words alphabetically is to determine the correct order of the words 'slurp', 'raytheon', and 'gloucester' based on their letters or sounds.",

    "Step 2: Break down the problem into smaller parts": 
        "The list of words can be simplified by breaking them down into their first letters or sounds. The first letter of 'slurp' is 's', 'raytheon' is 'r', and 'gloucester' is 'g'. This makes them easier to sort.",

    "Step 3: Compare the first letter of each word": 
        "The alphabetical order of the first letter of each word is: 'g' comes before 'r', and 'r' comes before 's'. So, the initial order is: gloucester, raytheon, slurp.",

    "Step 4: Compare subsequent letters": 
        "Since the first letters are different, there is no need to compare subsequent letters in this case. The initial order remains 

In [9]:
answer_pred_list = [x.translate(str.maketrans("", "", ".'")) for x in dataset["answer_pred"] if x and '[' in x]
len(answer_pred_list)

89

In [10]:
answer_pred_list[0].translate(str.maketrans("", "", "'\"[]")).split(", ")

['chlorate',
 'glidden',
 'incentive',
 'judicatory',
 'lavoisier',
 'manatee',
 'spurt']

In [11]:
set(dataset["answer_pred"])

{'',
 'Yes.',
 '["aberdeen", "analogue", "deciduous", "easel", "sprightly", "swaziland"].',
 '["abner", "abramson", "amity", "automate", "exquisite", "fruitful", "gurgle", "none", "shampoo", "shorten", "waterproof"].',
 '["accelerate", "bauer", "county", "nail", "nominee", "o\'connell", "phony", "poole", "putnam", "quantify", "raisin", "venice"].',
 '["acidify", "antagonism", "asteria"].',
 '["acquisitive", "annuity", "autocracy", "bruno", "custody", "dare", "exploitation", "lodge", "militant", "quench", "somatic", "thunderclap", "ventricle"].',
 '["admixture", "catwalk", "chateaux", "coordinate", "equine", "higgins", "irremediable", "malthusian", "offertory", "panamanian", "pecos", "reluctant", "shelve", "suction", "tunis"].',
 '["adopt", "afghan", "friday", "glimmer", "multitudinous", "pacifist", "wage", "worcestershire"].',
 '["afro", "blackbird", "blame", "calyx", "elgin", "emphases", "implacable", "jura", "mayapple", "perquisite", "vii", "whit"].',
 '["agamemnon", "clench", "depre

In [12]:
dataset.filter(lambda x: x["answer_pred"] == None), dataset.filter(lambda x: x["answer_pred"] == ''), dataset.filter(lambda x: x["answer_pred"] == '['), dataset.filter(lambda x: x["answer_pred"] == 'Yes.')

(Dataset({
     features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 41
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 7
 }))

In [13]:
import re 

def map_fn(ins):
    answer_pred = ins.get("answer_pred")
    reasoning = ins.get("reasoning", "")

    if reasoning is None:
        reasoning = ""

    if answer_pred is None or answer_pred == '':
        marker = "The final answer is:"

        marker_index = reasoning.find(marker)

        if marker_index != -1:
            trajectory = reasoning[:marker_index]
            answer = reasoning[marker_index + len(marker):]
   
            cleaned_answer = answer.replace("`", "").strip()
            
            cleaned_trajectory = trajectory.strip()

            if not cleaned_answer:
                 cleaned_answer = None

            return {
                "trajectory": cleaned_trajectory,
                "answer_pred": cleaned_answer
            }
        else:
            return {
                "trajectory": reasoning.strip(),
                "answer_pred": None
            }
    else:
        return ins

dataset = dataset.map(map_fn)

In [14]:
dataset.filter(lambda x: x["answer_pred"] == None), dataset.filter(lambda x: x["answer_pred"] == ''), dataset.filter(lambda x: x["answer_pred"] == '['), dataset.filter(lambda x: x["answer_pred"] == 'Yes.')

(Dataset({
     features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 7
 }))

In [15]:
import re 

def map_fn(ins):
    answer_pred = ins.get("answer_pred")
    reasoning = ins.get("reasoning", "")

    if reasoning is None:
        reasoning = ""

    if not answer_pred or answer_pred in ['[', 'Yes.']:
        marker = "The final answer is"

        marker_index = reasoning.find(marker)

        if marker_index != -1:
            trajectory = reasoning[:marker_index]
            answer = reasoning[marker_index + len(marker):]
   
            cleaned_answer = answer.replace("`", "").strip()
            
            cleaned_trajectory = trajectory.strip()

            if not cleaned_answer:
                 cleaned_answer = None

            return {
                "trajectory": cleaned_trajectory,
                "answer_pred": cleaned_answer
            }
        else:
            return {
                "trajectory": reasoning.strip(),
                "answer_pred": None
            }
    else:
        return ins

dataset = dataset.map(map_fn)

In [16]:
dataset.filter(lambda x: x["answer_pred"] == None), dataset.filter(lambda x: x["answer_pred"] == ''), dataset.filter(lambda x: x["answer_pred"] == '['), dataset.filter(lambda x: x["answer_pred"] == 'Yes.')

(Dataset({
     features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 7
 }))

In [17]:
def map_fn(ins):
    if ins["answer_pred"] == None:
        return {
            "answer_pred": ins["answer_pred"]
        }
        
    answer_pred = ins["answer_pred"].encode().decode('unicode_escape').replace('.', '')
    refined_answer = answer_pred
    
    try:
        if "[" in answer_pred:
            refined_answer = " ".join([re.sub(r"^'|'$", "", word) for word in answer_pred.translate(str.maketrans("", "", "[]")).replace('"', "").split(", ")])
        elif "," in answer_pred:
            refined_answer = " ".join([re.sub(r"^'|'$", "", word) for word in answer_pred.replace('"', "").split(", ")])
        elif "1" in answer_pred:
            refined_answer = " ".join(pair.split(" ")[1] for pair in answer_pred.split("\n"))
        elif "-" in answer_pred:
            refined_answer = " ".join(pair.split(" ")[1] for pair in answer_pred.split("\n"))
        else:
            refined_answer = " ".join(answer_pred.split("\n"))
    except Exception:
        refined_answer = answer_pred
        
    return {
        "answer_pred": refined_answer.lower()
    }


dataset = dataset.map(map_fn)

In [19]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"]) + 8) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

confess croupier daffy dockyard duty household hypothesis info loam mandate mantic minstrelsy nepotism peccary sawtimber serenade silver summate triode
----------------------------------------------------------------------------------------------------

    confess,
    croupier,
    daffy,
    dockyard,
    duty,
    household,
    hypothesis,
    info,
    loam,
    mandate,
    mantic,
    minstrelsy,
    nepotism,
    peccary,
    sawtimber,
    serenade,
    silver,
    summate,
    triode

-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
charcuterie crucifix diatom footfall greenberg impenetrable muddle spoken synchronous
----------------------------------------------------------------------------------------------------
yes"
-+=-+=

0.904

In [123]:
dataset.save_to_disk(os.path.join(os.path.dirname(path), "bbh-word_sorting_eval_refined"))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]