In [1]:
import os
from pyprojroot import here
from datasets import Dataset

In [2]:
base_path = os.path.join("evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh")

In [104]:
from tqdm.notebook import tqdm


t4d = (
    lambda y_i, y_pred_i: y_pred_i
    and y_i in y_pred_i
    and y_i == str(y_pred_i.translate(str.maketrans("", "", ".'"))[2:])
)
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))

In [4]:
def calculate_correct_prediction_count(benchmark, y: list[str], y_pred: list[str]):
    correct_preds = 0
    for y_i, y_pred_i in tqdm(zip(y, y_pred), desc="Calculating..."):
        if benchmark == "t4d":
            eval_fn = t4d
        elif benchmark == "bbh":
            eval_fn = bbh

        if eval_fn(y_i, y_pred_i):
            correct_preds += 1
        else:
            print(f"{y_i}, {y_pred_i}\n")
    return correct_preds

# boolean_expressions

In [5]:
subset = 'boolean_expressions'

In [8]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-boolean_expressions/bbh-boolean_expressions_eval')

In [9]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [10]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Break down the logical statement into smaller components": {
        "Sub-step 1.1": "Identify the main logical operations (AND, OR, NOT).",
        "Sub-step 1.2": "Separate the statement into individual logical expressions: 'True and False', 'not True', and the entire expression 'True and False or ( not True )'."
    },
    "Step 2: Evaluate each logical expression": {
        "Sub-step 2.1": "Evaluate the truth value of 'True and False'.",
        "Sub-step 2.2": "Evaluate the truth value of 'not True'.",
        "Sub-step 2.3": "Evaluate the truth value of the entire expression 'True and False or ( not True )'."
    },
    "Step 3: Apply logical rules and truth tables": {
        "Sub-step 3.1": "Use the truth table for the AND operation to evaluate 'True and False'. Result: False.",
        "Sub-step 3.2": "Use the truth table for the NOT operation to evaluate 'not True'. Result: False.",
        "Sub-step 3.3": "Use the truth table for the OR operation to e

In [11]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"]) + 1) / dataset.num_rows

Calculating...: 250it [00:00, 265866.13it/s]

False, True.

True, False.

True, False."

True, False.

False, True.

False, True.

True, False."

True, False"

False, True.

True, False"






0.964

# causal_judgement

In [13]:
subset = 'causal_judgement'

In [14]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-causal_judgement/bbh-causal_judgement_eval')

In [15]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 187
})

In [16]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Identify the primary event or action that triggered the problem": {
        "Description": "Determine the main event that led to the issue.",
        "Action": "Identify the simultaneous login of Alice and Zoe at 9 am."
    },
    "Step 2: Identify the unspoken rules or conditions that led to this situation": {
        "Description": "Understand the underlying rules or conditions that caused the problem.",
        "Action": "Recognize the rule that an empty email is sent if two people are logged in at the same time."
    },
    "Step 3: Sequence the events leading up to the problem": {
        "Description": "List the events in chronological order.",
        "Action": [
            "Alice logs in at 9 am.",
            "Zoe logs in at 9 am.",
            "An empty email is sent immediately."
        ]
    },
    "Step 4: Analytical Thinking - Evaluate the problem from different viewpoints": {
        "Description": "Consider different perspectives and challenge a

In [17]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[:3].strip()

In [18]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 187it [00:00, 29266.23it/s]

No, Yes.

Yes, No.

Yes, No.

No, Yes"

No, Yes.

Yes, No.

No, Yes.

No, Yes.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

No, Yes.

No, Yes."

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

No, Yes.

Yes, No.

Yes, No.

No, Yes.

No, Yes.

No, Yes.

No, Yes.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

No, Yes.

Yes, No.

No, Yes.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

No, Yes.

Yes, No.

No, Yes.

Yes, No.

Yes, No.

Yes, No.






0.732620320855615

# date_understanding

In [1]:
subset = 'date_understanding'

In [7]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-date_understanding/bbh-date_understanding_eval')

In [8]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [9]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Identify the given date": {
        "Description": "Extract the given date from the task description.",
        "Action": "Identify the date mentioned in the task (Jan 21, 2011)."
    },
    "Step 2: Determine the required calculation": {
        "Description": "Understand the specific date calculation required.",
        "Action": "Identify that the task requires calculating the date one week ago from the given date."
    },
    "Step 3: Break down the date calculation": {
        "Description": "Plan the steps to subtract one week from the given date.",
        "Action": "Subtract 7 days from the given date."
    },
    "Step 4: Perform the date arithmetic": {
        "Description": "Execute the date subtraction.",
        "Action": "Calculate the new date by subtracting 7 days from Jan 21, 2011. The result is Jan 14, 2011."
    },
    "Step 5: Format the result": {
        "Description": "Format the resulting date in MM/DD/YYYY.",
        "Action": "Convert th

In [10]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [12]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 186148.77it/s]

(C), B.

(D), None of the options match the calculated date.

(B), E.

(B), E.

(B), C.

(E), that none of the given options match the calculated date one week ago from Tue, 7/9/1972.

(B), A.

(C), None of the given options match the correct date."

(E), None of the options match the calculated date 10/09/1924.

(B), E."

(D), None of the options match the calculated date.

(D), B.

(E), None of the options match the date 11/23/2001.

(B), not listed among the given options.

(B), None of the options match the calculated date.

(E), C.

(E), not among the given options.

(D), F.

(A), B.

(B), The correct date today is not listed in the options provided.

(E), D.

(D), not listed among the given options.

(A), D."

(A), C.

(B), F"

(B), C.

(D), that none of the options match the calculated date.

(B), D.

(B), F.

(A), F.

(A), that none of the options match the calculated date of 04/14/1985.

(C), D.

(C), (F) 12/20/2014."

(B), C.

(C), B"

(D), E.

(D), that the calculated date f




0.812

# disambiguation_qa

In [5]:
subset = 'disambiguation_qa'

In [6]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-disambiguation_qa/bbh-disambiguation_qa_eval')

In [7]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [8]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1 - Identify Core Pronoun Ambiguity": {
        "Description": "Identify the core pronoun reference issue in the sentence that needs to be resolved.",
        "Action": "Locate the pronoun 'they' in the sentence and identify potential antecedents.",
        "Value": "The pronoun 'they' could refer to either 'the worker' or 'the pedestrian'."
    },
    "Step 2 - Analyze Underlying Sentence Structures and Factors": {
        "Description": "Analyze the underlying grammatical structures, contextual clues, or semantic factors contributing to the pronoun's ambiguity or clarity.",
        "Action": "Examine the sentence structure and context to understand the roles of 'the worker' and 'the pedestrian'.",
        "Value": "The sentence structure suggests that 'they' is the subject of the verb 'were repairing', which is part of the clause 'that they were repairing the sidewalk as quickly as possible'."
    },
    "Step 3 - Gather Relevant Linguistic Data": {
        "Descr

In [9]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [10]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 161642.67it/s]

(A), B.

(A), C.

(A), C"

(A), C"

(A), C.

(A), C"

(C), B.

(C), A.

(C), B.

(C), B."

(C), B.

(C), B"

(C), A."

(C), B."

(A), (C) Ambiguous.

(A), C.

(A), (C) Ambiguous."

(B), C.

(C), B"

(C), A.

(A), C"

(B), (C) Ambiguous.

(C), B."

(C), B.

(C), A."

(B), C.

(B), C.

(A), C.

(B), C"

(A), C.

(A), C"

(B), C.

(B), C.

(A), C.

(C), B.

(B), C.

(B), C"

(C), B.

(B), C."

(B), (C) Ambiguous."

(B), C.

(C), A.

(A), C."

(C), A.

(A), B.

(A), C.

(C), A.

(A), (C) Ambiguous."

(A), C.

(C), B"

(C), B.

(A), B.

(A), B.

(A), C.

(A), C**

(C), B.

(A), C.

(A), C.

(A), B.

(A), (C) Ambiguous.

(C), A.

(C), B"

(C), A."

(A), C.

(C), B"

(C), B."

(A), C.

(C), (A) The nurse smelled awful."

(A), (C) Ambiguous.

(C), B."

(B), C.






0.716

# dyck_languages

In [18]:
subset = 'dyck_languages'

In [19]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-dyck_languages/bbh-dyck_languages_eval')

In [20]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [21]:
print(dataset[0]["reasoning"])

```json
{
    "Simplify the Problem": {
        "Focus on one type of bracket at a time": "We will focus on closing the square brackets first, then the parentheses."
    },
    "Break Down the Problem": {
        "Divide the sequence into smaller segments": "The sequence is '[ ['. We can break it down into two segments: the outer square brackets and the inner square brackets.",
        "Ensure each segment is properly closed before moving to the next": "We will close the inner square brackets first, then the outer square brackets."
    },
    "Critical Thinking": {
        "Analyze the sequence from different perspectives": "We need to ensure that each opening bracket has a corresponding closing bracket.",
        "Consider the hierarchy of nested brackets": "The inner square brackets should be closed before the outer square brackets.",
        "Evaluate the evidence of open and closed brackets at each point": "Currently, we have two open square brackets and no closed brackets."
    },

In [None]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 124889.95it/s]

(B), C.

(C), (C) 06/18/2016.

(E), D.

(A), C.

(E), A.

(B), F.

(C), A.

(E), B.

(B), D.

(E), D.

(A), (A) 02/29/2008.

(C), B.

(A), (A) 04/27/2004.

(A), (A) 12/22/1929.

(A), B.

(C), D.

(C), F.

(A), (A) 01/02/1930.

(A), (A) 04/29/2002.

(A), (A) 01/16/2010.

(A), (A) 02/23/1973.

(D), B.

(B), D.

(B), F.

(E), B.

(A), (A) 09/08/2003.

(A), (A) 11/29/2002.

(A), (A) 09/09/1909.

(A), B.

(D), C.

(C), B.

(B), (D) 09/06/2020.

(F), B.

(B), C.

(D), (E) 08/28/2021.

(A), (A) 12/02/2007.

(A), (A) 03/07/2016.

(A), (A) 06/11/2019.

(D), B.

(B), that none of the given options match the correct date one year ago from today.

(F), A.

(F), E.

(B), (E) 12/11/1929, as it is the closest option to the correct date a month ago from 12/31/1929.

(A), B.

(A), B.

(A), (A) 02/28/2015.

(F), B.

(B), (A) 11/25/1933.

(E), C.

(C), (A) 09/02/2021.

(F), (F) 10/22/2002.

(D), F.

(A), (A) 11/01/2019.

(A), B.

(A), (A) 06/20/2019.

(C), D.

(E), A.






0.84

# formal_fallacies

In [5]:
subset = 'formal_fallacies'

In [6]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-formal_fallacies/bbh-formal_fallacies_eval')

In [7]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [8]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Identify Key Assumptions": {
        "Description": "List the underlying assumptions about the relationships between supporters, experts, and backers of different football clubs as stated in the problem.",
        "Action": "Extract and list all assumptions from the given premises.",
        "Assumptions": [
            "Every supporter of Tottenham Hotspur is not an expert of Trabzonspor AŞ and not a backer of US Sassuolo Calcio.",
            "Every backer of US Sassuolo Calcio who is an expert of Trabzonspor AŞ is a supporter of Tottenham Hotspur or a devotee of FC Zenit."
        ]
    },
    "Step 2: Critical Analysis": {
        "Description": "Analyze the argument from different perspectives, question the assumptions made about the supporters' relationships, and evaluate the logical consistency of the given statements.",
        "Action": "Question each assumption and evaluate the logical consistency of the statements.",
        "Analysis": [
            "

In [10]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 191136.71it/s]

invalid, valid.

valid, invalid."

invalid, valid.

invalid, valid.

valid, invalid.

invalid, valid.

invalid, valid.

invalid, valid."

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid."

invalid, valid.

invalid, valid.

invalid, valid."

invalid, valid.

valid, invalid."

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

valid, invalid.

valid, invalid.

valid, invalid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid."

invalid, valid.

invalid, valid.

invalid, valid.

valid, invalid.

valid, invalid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

valid, invalid."

invalid, valid.

invalid, valid.

invalid, valid.

valid, invalid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid."

invalid, valid.

invalid, valid.

invalid, valid.

invali




0.728

# geometric_shapes

In [11]:
subset = 'geometric_shapes'

In [12]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-geometric_shapes/bbh-geometric_shapes_eval')

In [13]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [14]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Understand the SVG Path": {
        "Description": "Identify the SVG path commands and coordinates provided in the path element.",
        "Action": "Break down the SVG path commands into smaller, understandable segments.",
        "Result": "The SVG path commands are: M 22.00,62.00 L 46.00,65.00 L 64.00,60.00 L 91.00,42.00 L 92.00,24.00 L 46.00,19.00 L 22.00,62.00"
    },
    "Step 2: Simplify the SVG Path Data": {
        "Description": "Simplify the SVG path data to make it easier to identify the shape.",
        "Action": "Remove any unnecessary commands or coordinates that do not affect the shape.",
        "Result": "The simplified SVG path commands are: M 22,62 L 46,65 L 64,60 L 91,42 L 92,24 L 46,19 L 22,62"
    },
    "Step 3: Analyze the SVG Path Commands": {
        "Description": "Analyze the SVG path commands step by step.",
        "Action": "Interpret each command and its corresponding coordinates to understand the shape being drawn.",
        "Res

In [15]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [16]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 80672.10it/s]

(G), J"

(C), G.

(K), H."

(G), (D) kite."

(C), G.

(I), J.

(K), J.

(K), J."

(C), J"

(C), G"

(K), D.

(B), C"

(K), I.

(K), I."

(K), I"

(D), J"

(K), D.

(K), H"

(C), G."

(K), D."

(K), D"

(C), J"

(K), J"

(C), J"

(K), D"

(F), E"

(D), J."

(C), G.

(K), A.

(K), J.

(C), G"

(K), I.

(K), H"

(D), J."

(B), G."

(C), J"

(F), G"

(K), H"

(K), D"

(G), J.

(G), D.

(C), J.

(B), J.

(K), A"

(C), G"

(K), D.

(G), J"

(C), J"

(F), G.

(D), H."

(D), J.

(C), B.

(F), B.

(F), (J) triangle.

(C), J.

(K), D"

(F), G.

(K), D."

(K), I.

(F), B.

(K), D.

(K), I"

(C), G.

(F), B.

(F), (B) heptagon.

(K), I"

(B), C.

(G), J"

(B), J"

(C), G"

(G), J"

(C), G"

(F), B.

(B), D."

(F), G"

(B), J.

(K), I."

(K), (D) kite."

(C), G"

(F), B."

(K), D"

(F), G.

(C), G"

(K), D.

(B), G.

(F), C"

(K), D"

(B), D"

(C), G"

(C), G."

(B), D"

(B), G"

(K), I"

(G), D"

(K), D."

(G), J.

(K), A.

(K), H.

(K), H"

(B), G."

(B), J.

(F), B"

(K), A."






0.588

# hyperbaton

In [5]:
subset = 'hyperbaton'

In [6]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-hyperbaton/bbh-hyperbaton_eval')

In [7]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [8]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Identify the Adjectives in Each Sentence": {
        "Description": "List all the adjectives in each sentence option.",
        "Action": "Extract adjectives from each sentence.",
        "Result": {
            "Sentence A": ["wonderful", "big", "circular", "orange", "Pakistani", "smoking"],
            "Sentence B": ["circular", "wonderful", "smoking", "Pakistani", "big", "orange"]
        }
    },
    "Step 2: Categorize the Adjectives": {
        "Description": "Categorize the adjectives into different types (e.g., opinion, size, age, shape, color, origin, material, purpose).",
        "Action": "Assign each adjective to its respective category.",
        "Result": {
            "Sentence A": {
                "opinion": "wonderful",
                "size": "big",
                "shape": "circular",
                "color": "orange",
                "origin": "Pakistani",
                "purpose": "smoking"
            },
            "Sentence B": {
       

In [9]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [10]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 321550.44it/s]

(A), B.

(A), B.

(B), A"

(A), B.

(B), A.

(A), B.

(A), B."

(A), B.

(B), A.

(B), A.

(A), B.

(A), B.

(A), B.

(A), B.






0.944

# logical_deduction_five_objects

In [6]:
subset = 'logical_deduction_five_objects'

In [7]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-logical_deduction_five_objects/bbh-logical_deduction_five_objects_eval')

In [8]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [9]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Identify Given Relationships": {
        "Description": "List all the given relationships between the fruits.",
        "Action": "Extract and list the comparative statements from the problem.",
        "Value": [
            "Watermelons are more expensive than cantaloupes.",
            "Mangoes are less expensive than pears.",
            "Apples are the second-cheapest.",
            "Watermelons are less expensive than mangoes."
        ]
    },
    "Step 2: Analyze Relationships": {
        "Description": "Analyze each relationship to understand the relative pricing of the fruits.",
        "Action": "Break down each statement to understand the 'more expensive than' and 'less expensive than' relationships.",
        "Value": [
            "Watermelons > Cantaloupes",
            "Mangoes < Pears",
            "Apples are the second-cheapest",
            "Watermelons < Mangoes"
        ]
    },
    "Step 3: Establish Initial Order": {
        "Description":

In [10]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [11]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 261164.63it/s]

(B), C.

(E), C.

(E), C.

(B), C.

(C), D.

(B), (C) The purple book is the rightmost.

(E), C.

(C), D.

(C), E"

(E), C"

(B), D"

(A), C"

(E), A.

(E), C.

(C), D.

(E), D.

(E), D.

(B), D.

(D), A.






0.924

# logical_deduction_seven_objects

In [12]:
subset = 'logical_deduction_seven_objects'

In [13]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-logical_deduction_seven_objects/bbh-logical_deduction_seven_objects_eval')

In [14]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [15]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Identify clear positional clues": {
        "Description": "Identify the birds with clear positional clues.",
        "Action": "List the birds with specific positions mentioned in the clues.",
        "Result": "The owl is the second from the right. The cardinal is the fourth from the left. The raven is the second from the left."
    },
    "Step 2: Place birds with clear positions": {
        "Description": "Place the birds with clear positions on the branch.",
        "Action": "Assign the positions based on the clear clues provided.",
        "Result": "Positions: [Raven, _, Cardinal, _, _, Owl, _]"
    },
    "Step 3: Analyze relative positions": {
        "Description": "Analyze the relative positions of the remaining birds.",
        "Action": "List the relative positions of the birds (e.g., to the left of, to the right of).",
        "Result": "The falcon is to the left of the blue jay. The quail is to the left of the falcon. The robin is to the left of t

In [16]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [17]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 212995.33it/s]

(C), D"

(B), (A).

(A), B."

(F), B.

(A), F.

(C), correct."

(D), C.

(C), G.

(E), B.

(G), E"

(D), F.

(G), C.

(B), A.

(A), C.

(E), B.

(G), B.

(E), (A) The hawk is the second from the left"

(B), D.

(E), G.

(F), (D) The black book is the third from the left.

(F), A"

(G), (F) The red book is the fourth from the left.

(G), C.

(C), E.

(F), E"

(F), C.

(E), C.

(B), F.

(D), G.

(E), C.

(G), A.

(E), C"

(E), B.

(F), D"

(C), F"

(A), C.

(E), F"

(G), E.

(F), D."

(A), G.

(E), A.

(B), F.

(E), C.

(G), E.

(E), G.

(B), G.






0.816

# logical_deduction_three_objects

In [18]:
subset = 'logical_deduction_three_objects'

In [19]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-logical_deduction_three_objects/bbh-logical_deduction_three_objects_eval')

In [20]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [21]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Identify the cheapest fruit as given in the problem statement": {
        "Action": "Read the problem statement to find the explicit declaration of the cheapest fruit.",
        "Expected Outcome": "Identify the fruit that is explicitly stated as the cheapest.",
        "Result": "The loquats are the cheapest."
    },
    "Step 2: Compare the prices of the remaining two fruits based on the information provided": {
        "Action": "Analyze the cost relationship between the remaining two fruits as described in the problem statement.",
        "Expected Outcome": "Determine the relative cost of the remaining two fruits.",
        "Result": "The plums are less expensive than the apples."
    },
    "Step 3: Order the fruits from cheapest to most expensive": {
        "Action": "Use the information from Step 1 and Step 2 to order the fruits from cheapest to most expensive.",
        "Expected Outcome": "Create a list of fruits ordered by their cost.",
        "Resul

In [22]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [23]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 271019.90it/s]

(C), A.

(A), B.

(B), logically sound."






0.988

# movie_recommendation

In [5]:
subset = 'movie_recommendation'

In [6]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-movie_recommendation/bbh-movie_recommendation_eval')

In [7]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [8]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Brainstorm Movie Similarities": {
        "Description": "Generate a list of characteristics and themes that make The Shawshank Redemption, Forrest Gump, Dances with Wolves, and Mr. Holland's Opus similar.",
        "Action": "Identify common themes, genres, and emotional impacts.",
        "Result": "Common themes include redemption, personal growth, overcoming adversity, and emotional depth."
    },
    "Step 2: Explore Different Perspectives": {
        "Description": "Consider different genres, themes, or emotional impacts that could be relevant when comparing the given movies to the options.",
        "Action": "List various perspectives and their relevance to the given movies.",
        "Result": "Perspectives include drama, historical context, character development, and emotional resonance."
    },
    "Step 3: Critical Movie Analysis": {
        "Description": "Analyze the movies from various angles such as plot, characters, and cinematography.",
        

In [9]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [10]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 183381.60it/s]

(A), B.

(D), B"

(A), that none of the options (A, B, C, D, E) are similar to the given movies (Goodfellas, Raiders of the Lost Ark, Star Wars Episode IV - A New Hope, The Silence of the Lambs).

(A), C.

(A), C.

(D), B.

(C), A"

(E), D.

(C), B"

(A), D"

(A), E.

(C), B.

(B), A.

(B), C."

(C), A.

(A), B.

(B), D.

(B), A.

(A), D.

(A), B.

(D), C.

(B), C.

(A), B.

(C), A.

(A), B."

(C), D.

(A), D.

(D), E.

(D), C.

(A), B"

(C), B.

(A), B.

(D), A.

(A), D"

(A), C.

(D), A"

(C), A.

(D), E.

(A), D."

(B), A.

(A), B"

Monsters, Inc, B"

(D), C.

(D), A.

(C), None of the options are similar to the given movies.

(B), C"

(A), D"

(D), B.

(D), B.

(D), C.

(A), None of the options are similar to the given set of movies.

(D), B.

(B), D.

(C), B.

(C), A.

(D), A.

(C), B.

(A), D.

(B), D.

(C), D.

(B), C.

(A), B.

(C), None of the options are similar to the given movies.

(B), C.

(B), D"

(B), D.

(C), A.

(A), B.

(C), A.

(D), None of the options are closely si




0.716

# penguins_in_a_table

In [11]:
subset = 'penguins_in_a_table'

In [12]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-penguins_in_a_table/bbh-penguins_in_a_table_eval')

In [13]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 146
})

In [14]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Parse the table": {
        "Description": "Extract the data from the table into a structured format.",
        "Action": "Read the table and convert it into a list of dictionaries where each dictionary represents a penguin with keys: name, age, height, weight.",
        "Result": [
            {"name": "Louis", "age": 7, "height": 50, "weight": 11},
            {"name": "Bernard", "age": 5, "height": 80, "weight": 13},
            {"name": "Vincent", "age": 9, "height": 60, "weight": 11},
            {"name": "Gwen", "age": 8, "height": 70, "weight": 15}
        ]
    },
    "Step 2: Remove Bernard from the table": {
        "Description": "Filter out the penguin named Bernard from the list.",
        "Action": "Iterate through the list and remove the dictionary where the name is 'Bernard'.",
        "Result": [
            {"name": "Louis", "age": 7, "height": 50, "weight": 11},
            {"name": "Vincent", "age": 9, "height": 60, "weight": 11},
            

In [15]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [16]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 146it [00:00, 80574.79it/s]

(C), B"

(A), B.






0.9863013698630136

# reasoning_about_colored_objects

In [17]:
subset = 'reasoning_about_colored_objects'

In [18]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-reasoning_about_colored_objects/bbh-reasoning_about_colored_objects_eval')

In [19]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [20]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1 - Identify all objects": {
        "Description": "List all the objects mentioned on the nightstand.",
        "Action": "Create a list of all objects.",
        "Objects": ["black necklace", "green fidget spinner", "blue keychain", "yellow sheet of paper", "red stress ball"]
    },
    "Step 2 - Separate objects by color": {
        "Description": "Classify each object by its color.",
        "Action": "Create a list of objects that are yellow and a list of objects that are green.",
        "Yellow objects": ["yellow sheet of paper"],
        "Green objects": ["green fidget spinner"]
    },
    "Step 3 - Identify objects that are neither yellow nor green": {
        "Description": "Determine which objects are not yellow or green.",
        "Action": "Create a list of objects that are neither yellow nor green.",
        "Neither yellow nor green objects": ["black necklace", "blue keychain", "red stress ball"]
    },
    "Step 4 - Count the relevant objects": {
   

In [21]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [22]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 112063.27it/s]

(B), the option selected in Step 4.",

(B), C.

(A), B.

(B), C.

(D), B.

(E), confirmed to be correct.",

(A), B"

(F), L"

(C), the option that matches the count of remaining yellow items.",

(F), K"






0.96

# ruin_names

In [23]:
subset = 'ruin_names'

In [24]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-ruin_names/bbh-ruin_names_eval')

In [25]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [26]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Brainstorm Humorous Edits": {
        "Description": "Generate a list of humorous modifications to the artist or movie name 'star wars'.",
        "Action": "List potential humorous edits based on puns, misspellings, phonetic humor, and pop culture references."
    },
    "Step 2: Break Down the Problem": {
        "Description": "Divide the task into smaller parts.",
        "Action": "Identify the types of humor (puns, misspellings, etc.) and analyze each option accordingly."
    },
    "Step 3: Critical Analysis of Options": {
        "Description": "Evaluate each option by considering the use of humor, the play on words, and how it deviates from the original name.",
        "Action": "For each option, note the type of humor used and how it alters the original name."
    },
    "Step 4: Creative Interpretation": {
        "Description": "Think outside the box to understand the humor in each option.",
        "Action": "Consider unconventional wordplays, phonet

In [27]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [28]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 125292.87it/s]

(B), C.

(B), A.

(B), D.

(B), C.

(A), C.

(A), B.

(C), B.

(A), D.

(D), A"

(D), B.

(C), D.

(A), D."

(B), D.

(C), A.

(A), B.

(B), D"

(C), D.

(A), C.

(A), C.

(D), B.

(B), C.

rita, sue and bob poo, D.

(D), B.

(C), A."

(C), B.

(D), B.

(C), B"

(A), C.

(B), A.

(A), B.

(C), A.

(B), D.

(A), D.

(D), B.

(C), D.

(D), C."

(C), B.

(A), C.

(A), D.

(B), D.

(C), A."

(D), C.

(A), B.

(B), A.

(A), D.

(D), B.

(D), B."

(D), B.

(B), D.

(A), C.

(B), C.

(C), A.

(A), D.

(B), C.

(A), D.

(D), C.

(B), C"

(B), C.

dearth, wind, & fire, G.

(C), A.

(A), B.

(C), A.

(C), B.






0.748

# salient_translation_error_detection

In [29]:
subset = 'salient_translation_error_detection'

In [30]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-salient_translation_error_detection/bbh-salient_translation_error_detection_eval')

In [31]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [32]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Understand the Task": {
        "Description": "Read and understand the task requirements and the types of errors that could occur in the translation.",
        "Action": "Identify the error types: Named Entities, Numerical Values, Modifiers or Adjectives, Negation or Antonyms, Facts, Dropped Content."
    },
    "Step 2: Analyze the Source Text": {
        "Description": "Carefully read and analyze the source text to understand its content and structure.",
        "Action": "Identify key elements such as named entities, numerical values, modifiers, negations, facts, and significant clauses."
    },
    "Step 3: Analyze the Translation": {
        "Description": "Carefully read and analyze the translation to understand its content and structure.",
        "Action": "Identify key elements such as named entities, numerical values, modifiers, negations, facts, and significant clauses."
    },
    "Step 4: Compare Source Text and Translation": {
        "Description"

In [33]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [34]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 188423.36it/s]

(A), E.

(F), D.

(A), F.

(F), D."

(D), F.

(B), D.

(A), E.

(F), E."

(F), E.

(A), D.

(F), D.

(F), D.

(A), E.

(F), D.

(E), D. The error is that 'Landkreis Konstanz' is changed to 'district of Constance'."

(F), D.

(C), A.

(C), A."

(F), D"

(E), B.

(A), E."

(C), F.

(C), D.

(D), E.

(F), A.

(F), D.

(D), F.

(A), D.

(B), E.

(F), D.

(D), E.

(C), A.

(A), D.

(A), B.

(F), B.

(A), B.

(E), D.

(A), E.

(F), D.

(F), E.

(A), D.

(D), E.

(F), C.

(C), F.

(F), D.

(F), (A) Modifiers or Adjectives.

(C), A.

(B), E.

(C), A.

(C), B.

(B), E.

(A), F.

(F), D.

(C), D.

(C), B.

(E), D.

(A), D.

(A), E.

(E), D.

(F), D.

(A), D.

(E), D.

(A), F.

(F), D.

(F), D.

(C), A.

(F), D.

(A), D.

(A), E.

(A), E.

(B), D.

(C), B.

(D), E.

(A), D.

(D), E.

(A), D.

(F), D.

(C), A.

(A), D.

(C), (A) Modifiers or Adjectives.

(A), D.

(A), F.

(C), (A) Modifiers or Adjectives.

(F), D.

(A), E.

(F), D."






0.656

# snarks

In [35]:
subset = 'snarks'

In [36]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-snarks/bbh-snarks_eval')

In [37]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 178
})

In [38]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Understand the literal meaning of each statement": {
        "Statement A": "Working the printer is too complex for me",
        "Statement B": "Working the microprocessor is too complex for me"
    },
    "Step 2: Analyze the context and tone for any signs of sarcasm": {
        "Context of Statement A": "Statement A is made in a context where the task of using a printer is generally considered simple.",
        "Context of Statement B": "Statement B is made in a context where working with a microprocessor is generally considered complex.",
        "Tone of Statement A": "The tone of Statement A suggests exaggeration and frustration, which could indicate sarcasm.",
        "Tone of Statement B": "The tone of Statement B is more straightforward and could be taken at face value."
    },
    "Step 3: Identify the core issue or problem that needs to be addressed": {
        "Core issue of Statement A": "The main point of Statement A is the perceived difficulty of us

In [39]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [40]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 178it [00:00, 214968.65it/s]

(B), A.

(A), B.

(A), B.

(A), B.

(B), A"

(B), A"

(A), The statement cannot be determined as sarcastic due to insufficient context.

(B), A.

(B), A.

(B), A.

(B), A.

(B), A.

(B), A.

(B), A"

(B), A.

(B), A.

(B), A"

(B), A"

(B), A.

(B), A.

(A), B.

(B), A.

(B), A."






0.8707865168539326

# sports_understanding

In [61]:
subset = 'sports_understanding'

In [62]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-sports_understanding/bbh-sports_understanding_eval')

In [63]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [64]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1 - Critical Thinking for Plausibility": {
        "Description": "Analyze the sentence from different angles, question assumptions about the context, and evaluate the information's credibility. Focus on logical reasoning to determine if the sentence is plausible, considering potential biases or errors in interpretation.",
        "Action": "Identify and list potential biases or errors in the interpretation of the sentence. Consider the context of American football and the role of Tyreek Hill."
    },
    "Step 2 - Data and Information for Context": {
        "Description": "Identify relevant data or information that can provide context for the sentence. Consider available data sources, such as sports statistics or game reports, and how they can be analyzed to assess the sentence's plausibility.",
        "Action": "Gather relevant data sources such as game reports, player statistics, and team strategies. Tyreek Hill is known for his speed and agility, making him a 

In [65]:
set(dataset["answer_pred"]), set(dataset["target"])

({'False.',
  'False. Andrei Svechnikov is a professional ice hockey player, not a baseball player. Therefore, the sentence "Andrei Svechnikov took ball four" is not plausible.',
  'False. Pete Alonso has not participated in the World Series, and his historical performance does not indicate a high likelihood of hitting a triple in such a context."',
  'False. The statement \'Juan Soto did a double stepover\' is not plausible given his role as a baseball player and the nature of the sport."',
  'False."',
  'Not Plausible.',
  'The sentence "Adam Thielen scored in added time" is plausible.',
  'The sentence "Aleksander Barkov passed the puck" is plausible.',
  'The sentence "Allen Robinson gained five yards" is plausible.',
  'The sentence "Andres Iniesta performed a give and go" is plausible.',
  'The sentence "Anthony Davis beat the buzzer" is plausible.',
  'The sentence "Blake Snell hit a single" is not plausible.',
  'The sentence "Bryce Harper fumbled the ball" is not plausible.',

In [66]:
def map_fn(instance):
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '."')),
    }

dataset = dataset.map(map_fn)

set(dataset["answer_pred"])

{'False',
 'False Andrei Svechnikov is a professional ice hockey player, not a baseball player Therefore, the sentence Andrei Svechnikov took ball four is not plausible',
 'False Pete Alonso has not participated in the World Series, and his historical performance does not indicate a high likelihood of hitting a triple in such a context',
 "False The statement 'Juan Soto did a double stepover' is not plausible given his role as a baseball player and the nature of the sport",
 'Not Plausible',
 "The sentence 'Ben Simmons was called for the goal tend' is plausible",
 "The sentence 'Drew Brees went for it on fourth down' is plausible given his historical decision-making and the strategic context of football games",
 "The sentence 'Tyreek Hill caught the screen pass' is plausible",
 'The sentence Adam Thielen scored in added time is plausible',
 'The sentence Aleksander Barkov passed the puck is plausible',
 'The sentence Allen Robinson gained five yards is plausible',
 'The sentence Andres

In [67]:
# Plausible (Yes)
plausible_yes = [
    "The sentence 'Ben Simmons was called for the goal tend' is plausible",
    "The sentence 'Drew Brees went for it on fourth down' is plausible given his historical decision-making and the strategic context of football games",
    "The sentence 'Tyreek Hill caught the screen pass' is plausible",
    'The sentence Adam Thielen scored in added time is plausible',
    'The sentence Aleksander Barkov passed the puck is plausible',
    'The sentence Allen Robinson gained five yards is plausible',
    'The sentence Andres Iniesta performed a give and go is plausible',
    'The sentence Anthony Davis beat the buzzer is plausible',
    'The sentence Caris LeVert scored a reverse dunk is plausible',
    'The sentence Caris LeVert scored a reverse layup is plausible',
    'The sentence Collin Sexton hit the buzzer beater is plausible',
    'The sentence Dejounte Murray took a side-step three is plausible',
    'The sentence Deshaun Watson was flagged on the play is plausible if the context and actions align with NFL rules that would result in a penalty or infraction',
    'The sentence Drew Brees was flagged on the play is plausible',
    'The sentence Elias Lindholm beat the buzzer is plausible',
    'The sentence Francisco Lindor walked on ball four is plausible',
    'The sentence Jamison Crowder drew a flag on the play is plausible',
    'The sentence Jayson Tatum nutmegged the defender is plausible',
    'The sentence Jayson Tatum took a side-step three in the NBA Championship is plausible',
    'The sentence Jayson Tatum was called for the goal tend is plausible',
    'The sentence Jerry Jeudy killed the powerplay is plausible',
    'The sentence John Tavares earned a trip to the penalty box in the Stanley Cup is plausible',
    'The sentence Jonas Valanciunas beat the buzzer is plausible',
    'The sentence Juan Soto took ball four is plausible',
    'The sentence Kendrick Nunn took a charge is plausible',
    'The sentence Kyle Tucker stepped on first base is plausible',
    'The sentence Matthew Stafford launched a hail mary is plausible',
    'The sentence Mike Williams fumbled the ball in the Superbowl is plausible',
    'The sentence Neymar did a maradona on the defender in the Champions League Semifinal is plausible',
    'The sentence Norman Powell committed a blocking foul is plausible',
    'The sentence Patrick Kane backhanded a shot in the Stanley Cup is plausible',
    'The sentence Pedro struck out the side is plausible',
    'The sentence Pepe converted the first down is plausible',
    'The sentence Philip Rivers drove into the restricted area is plausible',
    'The sentence Philip Rivers launched a hail mary is plausible',
    'The sentence Pierre-Luc Dubois skated backwards is plausible',
    'The sentence Robert Woods converted the first down is plausible',
    'The sentence Robert Woods killed the powerplay is plausible',
    'The sentence Ryan Nugent-Hopkins killed the powerplay is plausible',
    "The sentence Ryan O'Reilly wristed a shot is plausible",
    'The sentence Sterling Shepard converted the first down is plausible',
    'The sentence Steven Stamkos hit the slant pass is plausible',
    'The sentence Teuvo Teravainen shot the puck is plausible',
    'The sentence Tuukka Rask killed the powerplay is plausible',
    'The sentence Willian killed the powerplay is plausible, especially in a sports or gaming context where killed is used metaphorically to indicate that Willian effectively neutralized or ended the powerplay',
    'The sentence is plausible',
    'The sentence is plausible in a sports context, specifically referring to taking a three-pointer in basketball',
    'The sentence is plausible**',
    'The statement is plausible',
    'The statement Bastian Schweinsteiger scored in added time is plausible',
    'The statement David Pastrnak skated backwards is plausible',
    'The statement Javier Mascherano took a left footed shot is plausible',
    'The statement Mikal Bridges scored a windmill dunk is plausible',
    'The statement Toni Kroos performed a give and go is plausible',
    'True',
    'Yes',
    'yes',
    'The sentence Jakub Vrana skated backwards is plausible**'
]

# Implausible (No)
implausible_no = [
    'False',
    'False Andrei Svechnikov is a professional ice hockey player, not a baseball player Therefore, the sentence Andrei Svechnikov took ball four is not plausible',
    'False Pete Alonso has not participated in the World Series, and his historical performance does not indicate a high likelihood of hitting a triple in such a context',
    "False The statement 'Juan Soto did a double stepover' is not plausible given his role as a baseball player and the nature of the sport",
    'Not Plausible',
    'The sentence Blake Snell hit a single is not plausible',
    'The sentence Bryce Harper fumbled the ball is not plausible',
    'The sentence Carles Puyol did a maradona on the defender is not plausible',
    'The sentence Carlos Tevez skated backwards is not plausible',
    'The sentence Carson Wentz caught the screen pass is implausible',
    'The sentence Clint Capela got into the endzone is not plausible',
    'The sentence Dani Alves took the snap is not plausible',
    'The sentence David Silva took a throw in is not plausible',
    'The sentence Dougie Hamilton hit the buzzer beater is not plausible',
    'The sentence Draymond Green threw a touchdown is not plausible',
    'The sentence Elias Lindholm took the snap is not plausible',
    'The sentence Emmanuel Sanders got a base hit is not plausible',
    'The sentence Fred VanVleet passed the puck is not plausible',
    'The sentence Gerard Pique scored a corner kick is not plausible',
    'The sentence Gerrit Cole set the hard screen is implausible',
    'The sentence Gleyber Torres scored a bicycle kick is not plausible',
    'The sentence Igor Shesterkin launched a hail mary is not plausible',
    'The sentence James Karinchak crossed the blue line is not plausible',
    'The sentence Jaylen Brown committed a three-second violation is not plausible',
    'The sentence Jordan Binnington scored in the third period is not plausible',
    "The sentence Justin Herbert maradona'd the defender is not plausible",
    'The sentence Ketel Marte got into the endzone is not plausible',
    'The sentence Kevin Durant hit a walkoff homer is not plausible',
    'The sentence Mario Gomez scored a reverse layup is not plausible',
    'The sentence Mark Stone hit a triple is not plausible',
    'The sentence Mookie Betts scored on the power play is not plausible',
    'The sentence Mookie Betts skated behind the net is not plausible',
    'The sentence Nick Foles lost control of the puck is not plausible',
    'The sentence Ryan Tannehill hit a triple is not plausible',
    'The sentence Santi Cazorla earned a red card in the Champions League Final is not plausible',
    'The sentence Sergio Busquets got on base is not plausible',
    'The sentence Thomas Muller hit a triple is not plausible',
    'The sentence Tristan Jarry dunked the ball is not plausible',
    'The sentence Tyler Glasnow scored a penalty kick is not plausible',
    'The sentence Walker Buehler earned a trip to the penalty box is not plausible',
    'The sentence Zach LaVine shot the puck is not plausible',
    'The statement Juan Mata scored a bicycle kick in the Champions League Final is not plausible',
    'The statement Robert Lewandowski threw a touchdown is not plausible',
    'The statement is not plausible',
    'no',
    'not plausible',
    'The sentence Carson Wentz took to the ice is not highly plausible based on the available information',
    'The sentence Mookie Betts took a side-step three is not plausible The phrase side-step three is not a recognized term or action in baseball, and the use of three in this context is ambiguous and does not make logical sense',
    'The sentence is implausible',
    'The sentence is not plausible'
]

# Indeterminate
indeterminate = [
    'The sentence Carson Wentz took to the ice is not highly plausible based on the available information',
    'The sentence Didier Drogba got into the endzone is plausible but unlikely without specific evidence',
    'The sentence Mitchell Robinson airballed the shot is plausible but not highly probable',
    'The sentence Nerlens Noel was out at home is plausible if interpreted in a context where out refers to being unavailable or not playing in a game, and at home refers to the location of the game',
    'The sentence Sonny Gray was out at second is plausible but highly unusual',
    'The sentence Yaya Toure scored a freekick is plausible but not highly likely based on available data',
    'The statement is plausible but not confirmed by available data',
    'that the sentence is plausible but less likely'
]


def map_fn(ins):
    for yes in plausible_yes:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "yes"
            }

    for no in implausible_no:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "no"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'The sentence Didier Drogba got into the endzone is plausible but unlikely without specific evidence',
 'The sentence Mitchell Robinson airballed the shot is plausible but not highly probable',
 'The sentence Nerlens Noel was out at home is plausible if interpreted in a context where out refers to being unavailable or not playing in a game, and at home refers to the location of the game',
 'The sentence Sonny Gray was out at second is plausible but highly unusual',
 'The sentence Yaya Toure scored a freekick is plausible but not highly likely based on available data',
 'The statement is plausible but not confirmed by available data',
 'no',
 'that the sentence is plausible but less likely',
 'yes'}

In [68]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

no, yes

yes, no

yes, The statement is plausible but not confirmed by available data

yes, no

no, yes

yes, no

yes, no

yes, no

yes, no

no, yes

yes, no

yes, no

yes, no

yes, no

yes, The sentence Mitchell Robinson airballed the shot is plausible but not highly probable

yes, that the sentence is plausible but less likely

yes, no

no, The sentence Didier Drogba got into the endzone is plausible but unlikely without specific evidence

no, yes

no, yes

no, yes

no, yes

no, yes

yes, no

no, yes

yes, no

no, yes

no, yes

no, yes

yes, no

no, yes

yes, no

yes, no

yes, no

yes, no

no, The sentence Nerlens Noel was out at home is plausible if interpreted in a context where out refers to being unavailable or not playing in a game, and at home refers to the location of the game

yes, The sentence Sonny Gray was out at second is plausible but highly unusual

yes, no

yes, no

no, yes

no, yes

no, yes

no, yes

yes, no

no, yes

yes, no

yes, no

yes, no

yes, The sentence Yaya 

0.788

# temporal_sequences

In [69]:
subset = 'temporal_sequences'

In [70]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-temporal_sequences/bbh-temporal_sequences_eval')

In [71]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [72]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: List all known time intervals": {
        "description": "Identify and list all the time intervals during which Tiffany's activities are known.",
        "action": "Extract the time intervals from the provided information.",
        "values": [
            {"start": "6am", "end": "9am", "activity": "Reading at the library"},
            {"start": "9am", "end": "10am", "activity": "Driving to the water park"},
            {"start": "12pm", "end": "6pm", "activity": "Buying a phone at the electronics store"},
            {"start": "6pm", "end": "10pm", "activity": "Working out at the gym"}
        ]
    },
    "Step 2: Identify gaps between these intervals": {
        "description": "Determine the time gaps between the known intervals where Tiffany was not occupied.",
        "action": "Calculate the time gaps between the end of one activity and the start of the next.",
        "values": [
            {"start": "10am", "end": "12pm"}
        ]
    },
    "Step 3: M

In [73]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [74]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(A), D.

(C), D"

(C), D.

(D), (C) 11am to 5pm.

(D), B.

(A), D"



0.976

In [72]:
subset = 'tracking_shuffled_objects_five_objects'

In [73]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-tracking_shuffled_objects_five_objects/bbh-tracking_shuffled_objects_five_objects_eval')

In [74]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [75]:
print(dataset[0]["reasoning"])

```json
{
    "Break down the problem into sequential swaps": {
        "Identify each swap of items between pairs and track these exchanges step by step": {
            "1. Dave and Eve switch partners: Dave gets Melissa, Eve gets Lola",
            "2. Dave and Alice switch partners: Dave gets Patrick, Alice gets Melissa",
            "3. Eve and Alice switch partners: Eve gets Patrick, Alice gets Lola",
            "4. Claire and Bob switch partners: Claire gets Sam, Bob gets Jamie",
            "5. Dave and Alice switch partners: Dave gets Lola, Alice gets Patrick"
        }
    },
    "Critical Thinking for Tracking": {
        "Analyze the sequence of swaps from the perspective of each participant involved": {
            "Alice: Patrick -> Melissa -> Lola -> Patrick",
            "Bob: Sam -> Jamie",
            "Claire: Jamie -> Sam",
            "Dave: Lola -> Melissa -> Patrick -> Lola",
            "Eve: Melissa -> Lola -> Patrick"
        },
        "Question and verify the

In [37]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [38]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 255937.52it/s]

(C), B.

(B), E.

(B), (A) red ball.

(C), D.

(E), (D) orange ball.

(D), A.

(D), (A) brown present.

(E), B.

(A), C.

(D), B.

(E), A.

(D), E.

(E), (C) white ball.

(B), C.

(A), C.

(D), C.

(C), (A) brown present.

(C), D.

(B), C.

(A), B.

(C), E.

(E), D.

(D), C.

(E), C.

(A), C.

(C), A.

(C), D.

(E), D.

(B), E.

(B), E.

(C), D.

(B), D.

(E), (D) Patrick.

(E), (B) red present.

(E), D.

(E), D.

(A), B.

(E), (A) benchwarmer.

(B), D.

(A), C.

(A), B.

(E), D.

(A), D.

(B), C.

(D), (A) orange ball.

(D), C.

(A), C.

(A), (E) striker.

(B), E.

(D), B.

(E), C.

(D), (A) green present.

(C), B.

(A), B.

(E), D.

(D), (A) striker.

(D), C.

(A), B.

(C), B.

(D), B.

(D), (C) green present.






0.756

# tracking_shuffled_objects_seven_objects

In [75]:
subset = 'tracking_shuffled_objects_seven_objects'

In [76]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-tracking_shuffled_objects_seven_objects/bbh-tracking_shuffled_objects_seven_objects_eval')

In [77]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [78]:
print(dataset[0]["reasoning"])

```json
{
    "Initial Positions": {
        "Alice": "cheerleader",
        "Bob": "left winger",
        "Claire": "goalkeeper",
        "Dave": "right midfielder",
        "Eve": "center midfielder",
        "Fred": "benchwarmer",
        "Gertrude": "striker"
    },
    "Position after swap 1 (Fred and Claire)": {
        "Alice": "cheerleader",
        "Bob": "left winger",
        "Claire": "benchwarmer",
        "Dave": "right midfielder",
        "Eve": "center midfielder",
        "Fred": "goalkeeper",
        "Gertrude": "striker"
    },
    "Position after swap 2 (Gertrude and Alice)": {
        "Alice": "striker",
        "Bob": "left winger",
        "Claire": "benchwarmer",
        "Dave": "right midfielder",
        "Eve": "center midfielder",
        "Fred": "goalkeeper",
        "Gertrude": "cheerleader"
    },
    "Position after swap 3 (Fred and Dave)": {
        "Alice": "striker",
        "Bob": "left winger",
        "Claire": "benchwarmer",
        "Dave": "goalk

In [79]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [80]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(F), D.

(F), E.

(B), G.

(D), E.

(C), D.

(A), C.



0.976

# tracking_shuffled_objects_three_objects

In [81]:
subset = 'tracking_shuffled_objects_three_objects'

In [82]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-tracking_shuffled_objects_three_objects/bbh-tracking_shuffled_objects_three_objects_eval')

In [83]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [84]:
print(dataset[0]["reasoning"])

```json
{
    "Initial Partnerships": {
        "Alice": "Ophelia",
        "Bob": "Lola",
        "Claire": "Izzi"
    },
    "Partner Switch 1": {
        "Description": "Bob and Claire switch partners",
        "Alice": "Ophelia",
        "Bob": "Izzi",
        "Claire": "Lola"
    },
    "Partner Switch 2": {
        "Description": "Claire and Alice switch partners",
        "Alice": "Lola",
        "Bob": "Izzi",
        "Claire": "Ophelia"
    },
    "Partner Switch 3": {
        "Description": "Alice and Bob switch partners",
        "Alice": "Izzi",
        "Bob": "Lola",
        "Claire": "Ophelia"
    },
    "Final Partnerships": {
        "Alice": "Izzi",
        "Bob": "Lola",
        "Claire": "Ophelia"
    },
    "Conclusion": {
        "Alice's final partner": "Izzi"
    }
}
```

The final answer is C.


In [85]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [86]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(A), B.

(A), B.

(A), B.

(C), A.

(C), B.

(C), B.

(C), B.

(C), B.



0.968

# web_of_lies

In [93]:
subset = 'web_of_lies'

In [94]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-web_of_lies/bbh-web_of_lies_eval')

In [95]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [96]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1 - Analyze Raymond's Statement": {
        "Description": "Evaluate the initial statement that Raymond tells the truth.",
        "Action": "Assume Raymond tells the truth and analyze the implications.",
        "Result": "Assume Raymond tells the truth."
    },
    "Step 2 - Analyze Sal's Statement": {
        "Description": "Evaluate Sal's statement that Raymond lies.",
        "Action": "Determine the truth value of Sal's statement based on the assumption about Raymond.",
        "Result": "If Raymond tells the truth, then Sal's statement that Raymond lies is false."
    },
    "Step 3 - Analyze Alexis's Statement": {
        "Description": "Evaluate Alexis's statement that Sal lies.",
        "Action": "Determine the truth value of Alexis's statement based on the truth value of Sal's statement.",
        "Result": "Since Sal's statement is false, Alexis's statement that Sal lies is true."
    },
    "Step 4 - Analyze Helene's Statement": {
        "Description"

In [97]:
set(dataset["answer_pred"]), set(dataset["target"])

({'Alejandro tells the truth.',
  'Alexis tells the truth.',
  'Amberly does not tell the truth.',
  'Amberly is not telling the truth.',
  'Amberly tells the truth.',
  'Andree tells the truth.',
  'Antwan tells the truth.',
  'Audrie tells the truth.',
  'Bernita does not tell the truth.',
  'Bernita is not telling the truth.',
  'Christie does not tell the truth.',
  'Christie tells the truth.',
  'Conception does not tell the truth.',
  'Conception tells the truth.',
  'Crista tells the truth.',
  'Dallas is telling the truth.',
  'Dallas tells the truth.',
  'Delbert does not tell the truth.',
  'Delbert tells the truth.',
  'Delfina does not tell the truth.',
  'Delfina lies.',
  'Delfina tells the truth.',
  'Elanor does not tell the truth.',
  'Elanor tells the truth.',
  'False"',
  'False.',
  'False."',
  'False.**',
  'Fidel does not tell the truth.',
  'Fidel tells the truth.',
  'Fletcher tells the truth.',
  'Gwenn does not tell the truth.',
  'Gwenn tells the truth.',
 

In [100]:
def map_fn(instance):
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '."*')),
    }

dataset = dataset.map(map_fn)

set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'False',
 'Ka is telling the truth',
 'Leda lies',
 'No',
 'Shenna is telling the truth',
 'Vina tells the truth',
 'Yes'}

In [102]:
# Truth (Yes)
truth_yes = [
    "True",
    "Amberly tells the truth",
    "Andree tells the truth",
    "Christie tells the truth",
    "Conception tells the truth",
    "Delbert tells the truth",
    "Delfina tells the truth",
    "Maybelle tells the truth",
    "Millie tells the truth",
    "Shalonda tells the truth",
    "Sima tells the truth",
    "Alexis tells the truth",
    "Alejandro tells the truth",
    "Antwan tells the truth",
    "Audrie tells the truth",
    "Crista tells the truth",
    "Dallas is telling the truth",
    "Dallas tells the truth",
    "Elanor tells the truth",
    "Fidel tells the truth",
    "Fletcher tells the truth",
    "Gwenn tells the truth",
    "Inga tells the truth",
    "Jamey tells the truth",
    "Jaymie tells the truth",
    "Jerry tells the truth",
    "Jim tells the truth",
    "Ka tells the truth",
    "Kristian tells the truth",
    "Leda tells the truth",
    "Lorine tells the truth",
    "Michaela tells the truth",
    "Millicent tells the truth",
    "Osvaldo tells the truth",
    "Phoebe tells the truth",
    "Rashida tells the truth",
    "Ryan tells the truth",
    "Sal tells the truth",
    "Shaunda tells the truth",
    "Shenna tells the truth",
    "Sherrie tells the truth",
    "Teressa tells the truth",
    "Yoland tells the truth",
    "Yes",
    "Yes, Alexis tells the truth",
    "Yes, Bernita tells the truth",
    "Yes, Maybelle tells the truth",
    'Ka is telling the truth',
    "true",
    'Shenna is telling the truth',
    'Vina tells the truth',
]

# False (No)
false_no = [
    "False",
    "Michaela does not tell the truth",
    "Amberly does not tell the truth",
    "Amberly is not telling the truth",
    "Bernita does not tell the truth",
    "Bernita is not telling the truth",
    "Christie does not tell the truth",
    "Conception does not tell the truth",
    "Delbert does not tell the truth",
    "Delfina does not tell the truth",
    "Delfina lies",
    "Elanor does not tell the truth",
    "Fidel does not tell the truth",
    "Gwenn does not tell the truth",
    "Helene does not tell the truth",
    "Inga does not tell the truth",
    "Jamey does not tell the truth",
    "Jaymie does not tell the truth",
    "Jerry does not tell the truth",
    "Jim does not tell the truth",
    "Jim lies",
    "Ka does not tell the truth",
    "Kandi does not tell the truth",
    "Kandi is lying",
    "Lorine does not tell the truth",
    "Millicent does not tell the truth",
    "Millicent is not telling the truth",
    "Millie does not tell the truth",
    "No, Alexis does not tell the truth",
    "No, Ka does not tell the truth",
    "No, Shalonda does not tell the truth",
    "Rashida does not tell the truth",
    "Raymond does not tell the truth",
    "Ryan does not tell the truth",
    "Sal does not tell the truth",
    "Shalonda does not tell the truth",
    "Shaunda does not tell the truth",
    "Shenna does not tell the truth",
    "Tamika does not tell the truth",
    "Teressa does not tell the truth",
    "Vina does not tell the truth",
    "Willian does not tell the truth",
    "No",
    'Leda lies',
]



def map_fn(ins):
    for yes in truth_yes:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "Yes"
            }

    for no in false_no:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "No"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'No', 'Yes'}

In [105]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

Yes, No

Yes, No

No, Yes

No, Yes

No, Yes

Yes, No

No, Yes

No, Yes

Yes, No

No, Yes

No, Yes

Yes, No

No, Yes

No, Yes

Yes, No

No, Yes

No, Yes

Yes, No

Yes, No

No, Yes

No, Yes

Yes, No

Yes, No



0.908

# word_sorting

In [106]:
subset = 'word_sorting'

In [107]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-word_sorting/bbh-word_sorting_eval')

In [108]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [109]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Identify the list of words to be sorted": {
        "Description": "List the words that need to be sorted alphabetically.",
        "Action": "Identify the words: slurp, raytheon, gloucester."
    },
    "Step 2: Understand the sorting criteria": {
        "Description": "Determine the criteria for sorting. In this case, it is alphabetical order.",
        "Action": "Confirm that the sorting is based on alphabetical order."
    },
    "Step 3: Break down the sorting task": {
        "Description": "Break down the sorting task into smaller, more manageable steps.",
        "Action": "Plan to compare each word with every other word to determine the correct order."
    },
    "Step 4: Compare the first pair of words": {
        "Description": "Compare the first two words alphabetically.",
        "Action": "Compare 'slurp' and 'raytheon'. 'raytheon' comes before 'slurp'."
    },
    "Step 5: Compare the next pair of words": {
        "Description": "Compare the next

In [110]:
answer_pred_list = [x.translate(str.maketrans("", "", ".'")) for x in dataset["answer_pred"] if x and '[' in x]
len(answer_pred_list)

58

In [111]:
answer_pred_list[0].translate(str.maketrans("", "", "[]")).split(", ")

['chlorate',
 'glidden',
 'incentive',
 'judicatory',
 'lavoisier',
 'manatee',
 'spurt']

In [112]:
set(dataset["answer_pred"])

{'"campfire", "contrast", "crowfoot", "purgatory", "scrupulous".',
 "'across', 'admixture', 'directrix', 'flight', 'gut', 'indicate', 'marshal', 'predacious', 'quagmire', 'smuggle', 'vantage'.",
 None,
 '["abo", "armful", "bonaventure", "cremate", "dictatorial", "embryology", "frond", "gasify", "guiana", "herman", "indistinguishable", "oscillatory", "pancreatic", "passenger", "referential", "stockholder", "through", "tip"].',
 '["acuity", "anticonvulsant", "carrageen", "discovery", "disseminate", "drafty", "embolden", "glamour", "hangout", "hasty", "magnificent", "pewee", "proscenium", "registrar", "scrub", "supposable", "sushi", "you\'d"].',
 '["adipic", "antique", "athlete", "atonic", "catch", "encumber", "lauderdale", "neutrino", "olivia", "persona", "sovereignty", "specify", "statuette", "whiteface"].',
 '["aeneas", "colombo", "foothold", "fox", "garry", "glycerine", "inviolate", "lucre", "magnanimity", "nevada", "notoriety", "plebiscite", "pompey", "quagmire", "scription", "satani

In [113]:
for instance in dataset.filter(lambda x: x["answer_pred"] == None):
    print(instance["trajectory"])

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

```json
{
    "Step 1: Generate and Test Alphabetization Strategies": {
        "Description": "Brainstorm different methods for sorting the words (e.g., using a sorting algorithm, manual comparison), and apply each method to the list to see which one effectively sorts the words alphabetically.",
        "Methods to Consider": [
            "Bubble Sort",
            "Merge Sort",
            "Quick Sort",
            "Manual Comparison"
        ],
        "Action": "Choose a method and apply it to the list."
    },
    "Step 2: Simplify the Sorting Task": {
        "Description": "Break down the sorting process into simpler steps. For example, start by sorting a smaller subset of the words, or focus on sorting based on the first letter only before considering subsequent letters.",
        "Subset Example": "Sort the first five words: odessa, dance, formulae, dietetic, mantle.",
        "Action": "Sort a smaller subset and then expand to the full list."
    },
    "Step 3: Divide the L

In [114]:
import re

def map_fn(ins):
    if ins["answer_pred"] == None:
        text = "The final answer is:\n"
        pattern = fr"(?<={re.escape(text)}).*"
    
        response = ins["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response, re.DOTALL).group(0).translate(str.maketrans("", "", "`")).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": ins["trajectory"],
        "answer_pred": ins["answer_pred"]
    }

dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [117]:
print(dataset.filter(lambda x: x["answer_pred"] == None)[0]["reasoning"])

```json
{
    "Step 1: List the words": {
        "Description": "Write down the list of words to be sorted.",
        "Action": "List the words: wagging, cabdriver, astronomic, pivot, loch, coherent"
    },
    "Step 2: Compare the first two words": {
        "Description": "Compare the first two words alphabetically.",
        "Action": "Compare 'wagging' and 'cabdriver'"
    },
    "Step 3: Swap if necessary": {
        "Description": "If the second word comes before the first word alphabetically, swap them.",
        "Action": "Swap 'wagging' and 'cabdriver' to get: cabdriver, wagging, astronomic, pivot, loch, coherent"
    },
    "Step 4: Compare the next pair": {
        "Description": "Compare the next pair of words alphabetically.",
        "Action": "Compare 'wagging' and 'astronomic'"
    },
    "Step 5: Continue comparing and swapping": {
        "Description": "Continue comparing and swapping adjacent words until the list is sorted.",
        "Action": "Repeat the compariso

In [119]:
def map_fn(ins):
    if ins["answer_pred"] == None:
        return {
            "answer_pred": ins["answer_pred"]
        }
        
    answer_pred = ins["answer_pred"].encode().decode('unicode_escape').replace('.', '')
    refined_answer = answer_pred
    
    if "[" in answer_pred:
        refined_answer = " ".join([re.sub(r"^'|'$", "", word) for word in answer_pred.translate(str.maketrans("", "", "[]")).replace('"', "").split(", ")])
    elif "," in answer_pred:
        refined_answer = " ".join([re.sub(r"^'|'$", "", word) for word in answer_pred.replace('"', "").split(", ")])
    elif "9" in answer_pred:
        refined_answer = " ".join(pair.split(" ")[1] for pair in answer_pred.split("\n"))
    elif "-" in answer_pred:
        refined_answer = " ".join(pair.split(" ")[1] for pair in answer_pred.split("\n"))

    return {
        "answer_pred": refined_answer
    }


dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [120]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

confess croupier daffy dockyard duty household hypothesis info loam mandate mantic minstrelsy nepotism peccary sawtimber serenade silver summate triode, croupier daffy dockyard duty hypothesis household info loam mandate mantic minstrelsy nepotism peccary serenade silver summate triode

bologna cottrell crackle cure doubtful entropy extoller gloria litigant procedural summand tyke, bologna crackle cure cottrell doubtful entropy extoller gloria litigant procedural summand tyke

geld phase thunder, phase geld thunder

adonis birdseed citizen contaminant convair extensive fateful frighten judaica scrubby soothe southeastern stormy suppose trillion trundle, adonis birdseed citizen convair contaminant extensive fateful frighten judaica scrubby soothe southeastern stormy suppose trillion trundle

belize bolshevism cost dance deadline dietetic formulae foster hesitant huddle judson mantle odessa palace progeny proust rackety resplendent thirdhand warmth, belize bolshevism cost dance deadline 

0.764

In [121]:
dataset.save_to_disk(os.path.join(os.path.dirname(path), "bbh-word_sorting_eval_refined"))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]