In [1]:
import os
from pyprojroot import here
from datasets import Dataset

In [2]:
# Directory holding the BBH evaluation logs; later cells pass it through
# pyprojroot's `here(...)` together with the per-subset folder names.
base_path = "/".join(
    ("evals", "logs", "phased_self_discover", "llama", "structured", "few_shot_0", "bbh")
)

In [3]:
from tqdm.notebook import tqdm


def t4d(y_i, y_pred_i):
    """Score one T4D prediction against its target.

    Returns ``y_pred_i`` itself when it is falsy (``None`` or an empty string),
    ``False`` when the target does not occur in the prediction, and otherwise
    whether the target equals the prediction with periods and apostrophes
    removed and its first two characters dropped.
    """
    if not y_pred_i:
        return y_pred_i
    if y_i not in y_pred_i:
        return False
    without_punctuation = y_pred_i.translate(str.maketrans("", "", ".'"))
    # Skip the two leading characters of the cleaned prediction before comparing.
    return y_i == str(without_punctuation[2:])
def bbh(y_i, y_pred_i):
    """Score one BBH prediction by exact match after stripping punctuation.

    The target has parentheses and double quotes removed; the prediction has
    periods, parentheses, commas and double quotes removed. A falsy prediction
    (``None`` or ``""``) is returned unchanged, which callers treat as a miss.
    """
    if not y_pred_i:
        return y_pred_i
    expected = y_i.translate(str.maketrans("", "", "()\""))
    predicted = y_pred_i.translate(str.maketrans("", "", '.(),"'))
    return expected == predicted

In [4]:
def calculate_correct_prediction_count(benchmark, y: list[str], y_pred: list[str]):
    """Count how many predictions the benchmark's scorer accepts.

    Every rejected pair is printed (target, a dashed rule, prediction, a
    ``-+=`` rule) so mismatches can be inspected by eye.

    Args:
        benchmark: ``"t4d"`` or ``"bbh"``; selects the module-level scorer of
            the same name. The scorer is looked up when this function is
            called, so a ``bbh`` redefined in a later cell is the one used.
        y: Target answers.
        y_pred: Predicted answers, paired with ``y`` by position. ``zip`` stops
            at the shorter of the two sequences.

    Returns:
        The number of pairs for which the scorer returned a truthy value.

    Raises:
        ValueError: If ``benchmark`` is not a supported name. Previously this
            surfaced as an ``UnboundLocalError`` on the first loop iteration.
    """
    # The scorer does not depend on the pair being scored, so resolve it once.
    if benchmark == "t4d":
        eval_fn = t4d
    elif benchmark == "bbh":
        eval_fn = bbh
    else:
        raise ValueError(
            f"Unknown benchmark {benchmark!r}; expected 't4d' or 'bbh'."
        )

    correct_preds = 0
    for y_i, y_pred_i in tqdm(zip(y, y_pred), desc="Calculating..."):
        if eval_fn(y_i, y_pred_i):
            correct_preds += 1
        else:
            print(y_i)
            print("-" * 100)
            print(y_pred_i)
            print("-+=" * 100)
    return correct_preds

# boolean_expressions

In [5]:
subset = 'boolean_expressions'

In [6]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-boolean_expressions/bbh-boolean_expressions_eval')

In [7]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [8]:
print(dataset[0]["reasoning"])

Here's the filled JSON for the given reasoning structure:

```
{
    "Understanding the Statement": {
        "Break down the statement into its basic components": {
            "Identify the logical operators used": "AND, OR, NOT",
            "Identify the truth values involved": "True, False"
        },
        "Define the truth values": {
            "Definition of True": "A statement that is correct or accurate",
            "Definition of False": "A statement that is incorrect or inaccurate"
        }
    },
    "Analyzing the Negation Operator": {
        "Understanding the effect of 'not True'": {
            "Applying the negation operator to True": "NOT True",
            "Resulting truth value": "False"
        }
    },
    "Applying Logical Rules and Operators": {
        "Understanding the OR operator": {
            "Effect of OR on True and False": "Returns True if at least one of the operands is True",
            "Effect of OR on (not True)": "Returns False if the othe

In [13]:
# NOTE(review): the "+ 1" adds one to the scorer's match count before dividing
# by the number of rows. Presumably a manual credit for a printed mismatch that
# was judged correct on inspection -- confirm and record which row, because the
# adjustment is not derived from the data and is lost on a re-run.
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"]) + 1) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

False, True.

False, correct based on the order of operations and boolean logic rules"

True, False.

True, False.

True, False.



0.984

In [13]:
subset = 'causal_judgement'

In [14]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-causal_judgement/bbh-causal_judgement_eval')

In [15]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 187
})

In [16]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Identify the primary event or action that triggered the problem": {
        "Description": "Determine the main event that led to the issue.",
        "Action": "Identify the simultaneous login of Alice and Zoe at 9 am."
    },
    "Step 2: Identify the unspoken rules or conditions that led to this situation": {
        "Description": "Understand the underlying rules or conditions that caused the problem.",
        "Action": "Recognize the rule that an empty email is sent if two people are logged in at the same time."
    },
    "Step 3: Sequence the events leading up to the problem": {
        "Description": "List the events in chronological order.",
        "Action": [
            "Alice logs in at 9 am.",
            "Zoe logs in at 9 am.",
            "An empty email is sent immediately."
        ]
    },
    "Step 4: Analytical Thinking - Evaluate the problem from different viewpoints": {
        "Description": "Consider different perspectives and challenge a

In [17]:
def bbh(y_i, y_pred_i):
    """Score a short (Yes/No style) prediction against its target.

    The target has parentheses removed. The prediction has periods,
    parentheses and double quotes removed, is cut to its first three
    characters, and is then stripped of surrounding whitespace before the
    comparison. A falsy prediction is returned unchanged.
    """
    if not y_pred_i:
        return y_pred_i
    expected = y_i.translate(str.maketrans("", "", "()"))
    leading = y_pred_i.translate(str.maketrans("", "", '.()"'))[:3]
    return expected == leading.strip()

In [18]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 187it [00:00, 29266.23it/s]

No, Yes.

Yes, No.

Yes, No.

No, Yes"

No, Yes.

Yes, No.

No, Yes.

No, Yes.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

No, Yes.

No, Yes."

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

No, Yes.

Yes, No.

Yes, No.

No, Yes.

No, Yes.

No, Yes.

No, Yes.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

No, Yes.

Yes, No.

No, Yes.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

Yes, No.

No, Yes.

Yes, No.

No, Yes.

Yes, No.

Yes, No.

Yes, No.






0.732620320855615

# date_understanding

In [5]:
subset = 'date_understanding'

In [6]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-date_understanding/bbh-date_understanding_eval')

In [7]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [8]:
print(dataset[0]["reasoning"])

```
{
    "Step 1: Identify the specific date information given in the task": {
        "Date given in the task": "Jan 21, 2011",
        "Relevant information for date calculation": "yesterday's date is given, and we need to find the date one week ago from today"
    },
    "Step 2: Eliminate irrelevant information": {
        "Irrelevant information": "Jane ate 2 pizzas and 5 wings",
        "Relevant information for date calculation": "Jan 21, 2011, and the fact that yesterday's date is given"
    },
    "Step 3: Identify the individual components of the date": {
        "Month": "January",
        "Day": "21",
        "Year": "2011"
    },
    "Step 4: Determine the reference point for the calculation": {
        "Exact date given in the task": "Jan 21, 2011 (yesterday's date)",
        "Reference point for calculation": "today's date, which is Jan 22, 2011"
    },
    "Step 5: Calculate the date one week ago": {
        "Starting date": "Jan 22, 2011",
        "Subtract one week":

In [9]:
def bbh(y_i, y_pred_i):
    """Score a multiple-choice prediction by its first remaining character.

    The target has parentheses removed (``"(A)"`` becomes ``"A"``). The
    prediction has periods, parentheses and double quotes removed, and only
    its first character is compared with the target.

    A falsy prediction (``None`` or ``""``) is returned unchanged; callers
    treat it as a miss.
    """
    if not y_pred_i:
        return y_pred_i
    expected = y_i.translate(str.maketrans("", "", "()"))
    cleaned = y_pred_i.translate(str.maketrans("", "", '.()"'))
    # Slice rather than index: a prediction made only of stripped characters
    # (for example ".") leaves an empty string, and [0] raised IndexError.
    # NOTE(review): only the first character is checked, so free text such as
    # "None of the options" still scores as the letter "N".
    return expected == cleaned[:1]

In [10]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(C), B.

(A), C.

(B), None

(F), A.

(E), B.

(D), E.

(E), B.

(A), 11/29/2001.

(C), A.

(C), 08/25/2021.

(F), C.

(A), D.

(E), D.

(D), 12/02/1962.

(D), None of the options.

(F), B.

(D), F.

(B), C.

(F), 10/22/2001

(D), F.

(D), 07/10/1972.

(E), C.

(E), D.

(C), None of the given options.

(F), E.

(B), D.

(B), F.

(B), C.

(A), B.

(C), F.

(C), F.



0.876

In [16]:
def map_fn(ins):
    """Print rows whose predicted answer is missing or longer than four characters.

    For such a row the input, the target/prediction pair and, when the
    prediction is missing, the trajectory are printed between separator
    rules. Nothing is returned.
    """
    answer = ins["answer_pred"]
    # A non-empty answer of at most four characters is not reported.
    if answer and len(answer) <= 4:
        return

    rule = "-" * 100
    print(ins["input"])
    print(rule)
    print(ins["target"], answer)
    print(rule)
    if not answer:
        print(ins["trajectory"])
    print("-+" * 100)

dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Jane quited her job on Mar 20, 2020. 176 days have passed since then. What is the date yesterday in MM/DD/YYYY?
Options:
(A) 09/11/2094
(B) 09/11/2020
(C) 09/10/2020
(D) 08/14/2020
(E) 10/09/2020
(F) 09/17/2020
----------------------------------------------------------------------------------------------------
(B) None
----------------------------------------------------------------------------------------------------
Here's the filled JSON for the given reasoning structure:

```
{
    "Step 1: Identify the initial date and the number of days passed": {
        "Initial date": "March 20, 2020",
        "Number of days passed": 176
    },
    "Step 2: Break down the number of days into smaller parts": {
        "Number of weeks": 25,
        "Number of remaining days": 1
    },
    "Step 3: Calculate the number of months and remaining days": {
        "Number of months": 6,
        "Number of remaining days": 0
    },
    "Step 4: Determine the new month and day after adding the calcula

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [17]:
subset = 'disambiguation_qa'

In [18]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-disambiguation_qa/bbh-disambiguation_qa_eval')

In [19]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [20]:
print(dataset[0]["reasoning"])

```
{
    "Step 1: Identify the pronoun and its potential antecedents": {
        "Pronoun": "they",
        "Potential antecedents": ["The worker", "The pedestrian"]
    },
    "Step 2: Analyze the sentence structure to understand the pronoun's relationship with potential antecedents": {
        "Sentence structure": "The worker told the pedestrian that they were repairing the sidewalk as quickly as possible.",
        "Relationship between pronoun and potential antecedents": "The pronoun 'they' is the subject of the subordinate clause, and its antecedent could be either 'The worker' or 'The pedestrian'."
    },
    "Step 3: Examine the grammatical evidence to support or refute each potential antecedent": {
        "Grammatical evidence for each potential antecedent": [
            {"Antecedent": "The worker", "Evidence": "The worker is the main subject of the sentence, and the verb 'told' is in the active voice, suggesting that the worker is performing the action."},
            {"An

In [21]:
def bbh(y_i, y_pred_i):
    """Score a multiple-choice prediction by its first remaining character.

    The target has parentheses removed (``"(A)"`` becomes ``"A"``). The
    prediction has periods, parentheses and double quotes removed, and only
    its first character is compared with the target.

    A falsy prediction (``None`` or ``""``) is returned unchanged; callers
    treat it as a miss.
    """
    if not y_pred_i:
        return y_pred_i
    expected = y_i.translate(str.maketrans("", "", "()"))
    cleaned = y_pred_i.translate(str.maketrans("", "", '.()"'))
    # Slice rather than index: a prediction made only of stripped characters
    # (for example ".") leaves an empty string, and [0] raised IndexError.
    # NOTE(review): only the first character is checked, so a free-text answer
    # such as "Ambiguous" still scores as the letter "A".
    return expected == cleaned[:1]

In [22]:
# NOTE(review): the "+ 1" adds one to the scorer's match count before dividing
# by the number of rows. Presumably a manual credit for a printed mismatch that
# was judged correct on inspection -- confirm and record which row, because the
# adjustment is not derived from the data and is lost on a re-run.
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"]) + 1) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(A), C.

(B), C.

(C), B.

(A), C.

(A), C.

(B), C.

(A), C.

(A), C.

(A), C.

(C), B.

(C), B.

(A), C.

(C), B.

(C), B.

(C), A.

(C), B.

(C), B.

(A), C.

(A), (C) Ambiguous.

(B), C.

(B), option (B) because the sentence's structure and the use of the word 'bought' imply that the accountant is the recipient of the car and therefore the one who needs it. This interpretation is consistent with standard grammatical rules and the logic of the situation."

(B), C.

(C), B.

(C), A.

(A), C.

(A), C.

(C), B.

(A), (C) Ambiguous.

(C), B.

(A), C.

(C), B.

(C), B.

(C), B.

(A), C.

(A), C.

(C), A.

(C), A.

(B), C.

(C), B.

(C), B.

(B), A.

(A), C.

(B), C.

(C), A.

(A), C.

(B), C.

(C), A.

(B), C.

(C), A.

(A), C.

(C), B.

(A), C.

(A), C.

(B), C.



0.788

# dyck_languages

In [23]:
subset = 'dyck_languages'

In [24]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-dyck_languages/bbh-dyck_languages_eval')

In [25]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [26]:
print(dataset[0]["reasoning"])

To solve the given task, we will follow the provided step-by-step reasoning plan in JSON format.

```
{
    "Step 1: Analyze the input sequence": {
        "Input sequence": "{ ( < [ < > ]",
        "Identify the types of parentheses": ["{", "(", "<", "[", ">"],
        "Count the number of opening parentheses": {
            "{": 1,
            "(": 1,
            "<": 2,
            "[": 1
        }
    },
    "Step 2: Identify matching pairs of parentheses": {
        "List of matching pairs": ["{ }", "( )", "< >", "[ ]"],
        "Identify pairs that can be closed immediately": ["< >"]
    },
    "Step 3: Break down the sequence into smaller parts": {
        "Focus on one type of parenthesis at a time": ["{", "(", "<", "[", "]"],
        "Start with innermost parentheses": "< >"
    },
    "Step 4: Explore possible closing sequences": {
        "List of possible closing sequences": ["] >", ") >", "} >", "] )", "} )", "] ) >", "} ) >"],
        "Apply each sequence to the input": [

In [43]:
def map_fn(ins):
    """Rebuild ``target`` as the prompt's sequence followed by the original target.

    The text after the first ``"Input: "`` marker in ``ins["input"]`` is joined
    to ``ins["target"]`` with a single space.

    Args:
        ins: A dataset row providing at least ``"input"`` and ``"target"``.

    Returns:
        A dict with the new ``"target"`` value. When the marker is absent the
        target is returned unchanged.

    Note:
        Applying this to an already-mapped row prepends the sequence a second
        time, so the enclosing cell is not safe to re-run on its own output.
    """
    marker = "Input: "
    _, found, sequence = ins["input"].partition(marker)
    if not found:
        # The previous str.find() version got -1 here and silently sliced the
        # prompt from offset len(marker) - 1, producing a garbage prefix.
        return {"target": ins["target"]}
    return {"target": sequence + " " + ins["target"]}

dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [51]:
def bbh(y_i, y_pred_i):
    """Score a bracket-sequence prediction, ignoring spacing and stray punctuation.

    Double quotes and spaces are removed from the target; periods, commas,
    double quotes and spaces are removed from the prediction. The two results
    must then be identical.

    Round brackets are kept on both sides. The earlier version stripped "("
    and ")" as well, so sequences that differed only in their parentheses
    compared equal and were counted as correct.

    A falsy prediction (``None`` or ``""``) is returned unchanged; callers
    treat it as a miss.
    """
    if not y_pred_i:
        return y_pred_i
    expected = y_i.translate(str.maketrans("", "", '" '))
    predicted = y_pred_i.translate(str.maketrans("", "", '.," '))
    return expected == predicted

In [52]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

{ ( < [ < > ] > ) }, { ( [ < > ] ) }.

{ ( < > ) } ( ( [ ] ) < [ ( [ [ ] ] [ { } ] { } [ < { [ ] } > ] ( ) ) ] > ), { ( < > ) } ( ( ) < [ ( [ [ ] ] [ { } ] { } [ < { [ ] } > ] ( ) ) ] ).

< ( < { [ { } < ( { ( < < < { [ ( [ ( [ { { < [ { } < ( ) > ] > } } ] ) ] ) ] } > < > > ( ( < { } > ) ) > ) } ) > ] } > ) >, < ( < { [ { } < ( { ( < < < { [ ( [ ( [ { { < [ { } < ( ) > ] > } } ] ) ] ) ] } > < > > ( ( < { } > ) ) > ) } ) > ] } > )"

< { [ < > ] ( ( ( ( { { } } ) ) ) ) } >, justified by the step-by-step plan and the verification of correctness.",

[ < < [ [ ] ( ) { < > ( [ { } { < > } { } ] ) } [ [ [ ( [ ( ) [ [ { < [ { { } } < { { < ( ) > } } > ] > } ] ] ] ) ] < < [ [ ( < < ( ) > > ) ] ] > > [ ] ] ] ] < ( [ ] ) > { ( ( < { } > ) ) } > > ], [ < < [ [ ] ( ) { < > ( [ { } { < > } { } ] ) } ] [ [ [ ( [ ( ) [ [ { < [ { { } } < { { < ( ) > } } > ] > } ] ] ] ) ] < < [ [ ( < < ( ) > > ) ] ] > > [ ] ] ] ] < ( [ ] ) > { ( ( < { } > ) ) } > > ].

< [ { ( ( < ( ( ) ) > ) ) } ] >, < [ { ( ( < ( ( )

0.492

# hyperbaton

In [53]:
subset = 'hyperbaton'

In [54]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-hyperbaton/bbh-hyperbaton_eval')

In [55]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [56]:
print(dataset[0]["reasoning"])

Here's the filled JSON for the above reasoning structure:

```
{
    "Step 1: Identify the core rules of adjective order in English grammar": {
        "Rules": [
            "Opinion (e.g. wonderful, lovely)",
            "Size (e.g. big, small)",
            "Shape (e.g. circular, square)",
            "Origin (e.g. Pakistani, American)",
            "Color (e.g. orange, blue)",
            "Qualifier (e.g. smoking, non-smoking)"
        ],
        "Description": "The core rules of adjective order in English grammar dictate that adjectives should be ordered in a specific sequence, starting with opinion, followed by size, shape, origin, color, and qualifier."
    },
    "Step 2: Break down the adjectives into categories (e.g. size, shape, origin)": {
        "Adjective categories": [
            {
                "Category": "Opinion",
                "Adjectives": ["wonderful"]
            },
            {
                "Category": "Size",
                "Adjectives": ["big"]
    

In [57]:
def bbh(y_i, y_pred_i):
    """Score a multiple-choice prediction by its first remaining character.

    The target has parentheses removed (``"(A)"`` becomes ``"A"``). The
    prediction has periods, parentheses and double quotes removed, and only
    its first character is compared with the target.

    A falsy prediction (``None`` or ``""``) is returned unchanged; callers
    treat it as a miss.
    """
    if not y_pred_i:
        return y_pred_i
    expected = y_i.translate(str.maketrans("", "", "()"))
    cleaned = y_pred_i.translate(str.maketrans("", "", '.()"'))
    # Slice rather than index: a prediction made only of stripped characters
    # (for example ".") leaves an empty string, and [0] raised IndexError.
    return expected == cleaned[:1]

In [58]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(A), Neither A nor B.

(A), B.

(A), None of the above options. However, based on the instructions to choose one of the options, the closest one would be option (B).

(A), B.

(B), A.

(A), B.

(A), B.

(A), B.

(B), A.

(A), B.

(A), B.

(B), A.

(B), A.

(A), B.

(A), B.

(A), B.

(A), B.

(B), A.

(A), B.

(A), B.

(A), B.

(A), B.

(A), B.



0.908

# logical_deduction_five_objects

In [59]:
subset = 'logical_deduction_five_objects'

In [60]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-logical_deduction_five_objects/bbh-logical_deduction_five_objects_eval')

In [61]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [62]:
print(dataset[0]["reasoning"])

Here's the filled JSON for the given reasoning structure:

```
{
    "Step 1: Identify the known relationships between the fruits": {
        "Watermelons vs Cantaloupes": "Watermelons are more expensive than cantaloupes",
        "Mangoes vs Pears": "Mangoes are less expensive than pears",
        "Apples": "Apples are the second-cheapest",
        "Watermelons vs Mangoes": "Watermelons are less expensive than mangoes"
    },
    "Step 2: Determine the underlying factors contributing to the price relationships": {
        "Most expensive fruit": "Pears",
        "Least expensive fruit": "Not explicitly stated, but can be deduced from other relationships",
        "Fruits that are more expensive than others": "Watermelons are more expensive than cantaloupes, mangoes are less expensive than pears, watermelons are less expensive than mangoes"
    },
    "Step 3: Analyze the price relationships from different perspectives": {
        "Assumptions about the relative prices of each fruit": 

In [10]:
def bbh(y_i, y_pred_i):
    """Score a multiple-choice prediction by its first remaining character.

    The target has parentheses removed (``"(A)"`` becomes ``"A"``). The
    prediction has periods, parentheses and double quotes removed, and only
    its first character is compared with the target.

    A falsy prediction (``None`` or ``""``) is returned unchanged; callers
    treat it as a miss.
    """
    if not y_pred_i:
        return y_pred_i
    expected = y_i.translate(str.maketrans("", "", "()"))
    cleaned = y_pred_i.translate(str.maketrans("", "", '.()"'))
    # Slice rather than index: a prediction made only of stripped characters
    # (for example ".") leaves an empty string, and [0] raised IndexError.
    return expected == cleaned[:1]

In [63]:
# NOTE(review): the "+ 1" adds one to the scorer's match count before dividing
# by the number of rows. Presumably a manual credit for a printed mismatch that
# was judged correct on inspection -- confirm and record which row, because the
# adjustment is not derived from the data and is lost on a re-run.
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"]) + 1) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(B), D.

(B), D.

(E), C.

(D), A.

(C), one of the given options, and it is option (C)."

(E), C.

(E), A.

(C), B.

(E), C.

(C), E.

(C), D.

(E), D.

(E), D.

(C), E.

(D), E.

(D), E.

(C), B.

(B), A.

(B), C.

(D), B.

(D), consistent with all the given statements, including 'Joe finished last', 'Ana finished second', 'Mya finished second-to-last', and 'Eve finished below Amy'."

(C), E.



0.916

In [12]:
subset = 'logical_deduction_seven_objects'

In [13]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-logical_deduction_seven_objects/bbh-logical_deduction_seven_objects_eval')

In [14]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [15]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Identify clear positional clues": {
        "Description": "Identify the birds with clear positional clues.",
        "Action": "List the birds with specific positions mentioned in the clues.",
        "Result": "The owl is the second from the right. The cardinal is the fourth from the left. The raven is the second from the left."
    },
    "Step 2: Place birds with clear positions": {
        "Description": "Place the birds with clear positions on the branch.",
        "Action": "Assign the positions based on the clear clues provided.",
        "Result": "Positions: [Raven, _, Cardinal, _, _, Owl, _]"
    },
    "Step 3: Analyze relative positions": {
        "Description": "Analyze the relative positions of the remaining birds.",
        "Action": "List the relative positions of the birds (e.g., to the left of, to the right of).",
        "Result": "The falcon is to the left of the blue jay. The quail is to the left of the falcon. The robin is to the left of t

In [16]:
def bbh(y_i, y_pred_i):
    """Score a multiple-choice prediction by its first remaining character.

    The target has parentheses removed (``"(A)"`` becomes ``"A"``). The
    prediction has periods, parentheses and double quotes removed, and only
    its first character is compared with the target.

    A falsy prediction (``None`` or ``""``) is returned unchanged; callers
    treat it as a miss.
    """
    if not y_pred_i:
        return y_pred_i
    expected = y_i.translate(str.maketrans("", "", "()"))
    cleaned = y_pred_i.translate(str.maketrans("", "", '.()"'))
    # Slice rather than index: a prediction made only of stripped characters
    # (for example ".") leaves an empty string, and [0] raised IndexError.
    return expected == cleaned[:1]

In [17]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 212995.33it/s]

(C), D"

(B), (A).

(A), B."

(F), B.

(A), F.

(C), correct."

(D), C.

(C), G.

(E), B.

(G), E"

(D), F.

(G), C.

(B), A.

(A), C.

(E), B.

(G), B.

(E), (A) The hawk is the second from the left"

(B), D.

(E), G.

(F), (D) The black book is the third from the left.

(F), A"

(G), (F) The red book is the fourth from the left.

(G), C.

(C), E.

(F), E"

(F), C.

(E), C.

(B), F.

(D), G.

(E), C.

(G), A.

(E), C"

(E), B.

(F), D"

(C), F"

(A), C.

(E), F"

(G), E.

(F), D."

(A), G.

(E), A.

(B), F.

(E), C.

(G), E.

(E), G.

(B), G.






0.816

# logical_deduction_three_objects

In [64]:
subset = 'logical_deduction_three_objects'

In [65]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-logical_deduction_three_objects/bbh-logical_deduction_three_objects_eval')

In [66]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [67]:
print(dataset[0]["reasoning"])

Here's the filled JSON for the given reasoning structure:

```
{
    "Step 1: Identify the core issue": 
        "The task requires identifying the cheapest fruit among plums, apples, and loquats based on the given statements.",

    "Step 2: Simplify the given information": 
        "The key points are: (1) The loquats are the cheapest, and (2) The plums are less expensive than the apples.",

    "Step 3: Analyze the statements from different perspectives": 
        "The first statement directly states that loquats are the cheapest. The second statement compares plums and apples, but since loquats are already established as the cheapest, this comparison only helps to establish the order of the more expensive fruits.",

    "Step 4: Evaluate the evidence provided": 
        "The evidence directly supports option (C) The loquats are the cheapest. It contradicts options (A) The plums are the cheapest and (B) The apples are the cheapest, as the statements clearly place loquats as the chea

In [68]:
def bbh(y_i, y_pred_i):
    """Score a multiple-choice prediction by its first remaining character.

    The target has parentheses removed (``"(A)"`` becomes ``"A"``). The
    prediction has periods, parentheses and double quotes removed, and only
    its first character is compared with the target.

    A falsy prediction (``None`` or ``""``) is returned unchanged; callers
    treat it as a miss.
    """
    if not y_pred_i:
        return y_pred_i
    expected = y_i.translate(str.maketrans("", "", "()"))
    cleaned = y_pred_i.translate(str.maketrans("", "", '.()"'))
    # Slice rather than index: a prediction made only of stripped characters
    # (for example ".") leaves an empty string, and [0] raised IndexError.
    return expected == cleaned[:1]

In [69]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(B), based on logical deduction and consistent with the given information. Therefore, the oranges are the second-most expensive."

(A), C.

(C), A.

(C), A.

(A), consistent with the given statements and logical deductions. The falcon is indeed the leftmost bird."



0.98

In [5]:
subset = 'movie_recommendation'

In [6]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-movie_recommendation/bbh-movie_recommendation_eval')

In [7]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [8]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Brainstorm Movie Similarities": {
        "Description": "Generate a list of characteristics and themes that make The Shawshank Redemption, Forrest Gump, Dances with Wolves, and Mr. Holland's Opus similar.",
        "Action": "Identify common themes, genres, and emotional impacts.",
        "Result": "Common themes include redemption, personal growth, overcoming adversity, and emotional depth."
    },
    "Step 2: Explore Different Perspectives": {
        "Description": "Consider different genres, themes, or emotional impacts that could be relevant when comparing the given movies to the options.",
        "Action": "List various perspectives and their relevance to the given movies.",
        "Result": "Perspectives include drama, historical context, character development, and emotional resonance."
    },
    "Step 3: Critical Movie Analysis": {
        "Description": "Analyze the movies from various angles such as plot, characters, and cinematography.",
        

In [9]:
def bbh(y_i, y_pred_i):
    """Score a multiple-choice prediction by its first remaining character.

    The target has parentheses removed (``"(A)"`` becomes ``"A"``). The
    prediction has periods, parentheses and double quotes removed, and only
    its first character is compared with the target.

    A falsy prediction (``None`` or ``""``) is returned unchanged; callers
    treat it as a miss.
    """
    if not y_pred_i:
        return y_pred_i
    expected = y_i.translate(str.maketrans("", "", "()"))
    cleaned = y_pred_i.translate(str.maketrans("", "", '.()"'))
    # Slice rather than index: a prediction made only of stripped characters
    # (for example ".") leaves an empty string, and [0] raised IndexError.
    return expected == cleaned[:1]

In [10]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 183381.60it/s]

(A), B.

(D), B"

(A), that none of the options (A, B, C, D, E) are similar to the given movies (Goodfellas, Raiders of the Lost Ark, Star Wars Episode IV - A New Hope, The Silence of the Lambs).

(A), C.

(A), C.

(D), B.

(C), A"

(E), D.

(C), B"

(A), D"

(A), E.

(C), B.

(B), A.

(B), C."

(C), A.

(A), B.

(B), D.

(B), A.

(A), D.

(A), B.

(D), C.

(B), C.

(A), B.

(C), A.

(A), B."

(C), D.

(A), D.

(D), E.

(D), C.

(A), B"

(C), B.

(A), B.

(D), A.

(A), D"

(A), C.

(D), A"

(C), A.

(D), E.

(A), D."

(B), A.

(A), B"

Monsters, Inc, B"

(D), C.

(D), A.

(C), None of the options are similar to the given movies.

(B), C"

(A), D"

(D), B.

(D), B.

(D), C.

(A), None of the options are similar to the given set of movies.

(D), B.

(B), D.

(C), B.

(C), A.

(D), A.

(C), B.

(A), D.

(B), D.

(C), D.

(B), C.

(A), B.

(C), None of the options are similar to the given movies.

(B), C.

(B), D"

(B), D.

(C), A.

(A), B.

(C), A.

(D), None of the options are closely si




0.716

# multistep_arithmetic_two

In [70]:
subset = 'multistep_arithmetic_two'

In [71]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-multistep_arithmetic_two/bbh-multistep_arithmetic_two_eval')

In [72]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [73]:
print(dataset[0]["reasoning"])

Here's the filled JSON for the above reasoning structure:

```
{
    "Step 1: Identify the core mathematical operations":
        "The core mathematical operations are multiplication, addition, and subtraction.",

    "Step 2: Break down the expression into smaller parts":
        "The expression can be broken down into two smaller parts: (-6 * -1 - 2 + -2) and (9 - 4 + -1 - 7). These expressions inside the parentheses can be evaluated separately.",

    "Step 3: Evaluate the expressions inside the parentheses":
        {
            "Step 3.1: Evaluate the first set of parentheses":
                "(-6 * -1 - 2 + -2) = ?",
            "Step 3.2: Evaluate the second set of parentheses":
                "(9 - 4 + -1 - 7) = ?"
        },

    "Step 4: Simplify the expressions inside the parentheses":
        {
            "Step 4.1: Simplify the first set of parentheses":
                "(-6 * -1) = 6, then 6 - 2 = 4, then 4 + -2 = 2",
            "Step 4.2: Simplify the second set of 

In [76]:
# NOTE(review): the "+ 3" adds three to the scorer's match count before
# dividing by the number of rows. Presumably manual credit for printed
# mismatches that were judged correct on inspection -- confirm and record which
# rows, because the adjustment is not derived from the data.
# NOTE(review): no `bbh` definition appears in this section, so this call uses
# whichever `bbh` is currently bound in the kernel. The execution counter jumps
# from 73 to 76 here, so a scorer cell may have been deleted -- confirm which
# scorer produced the figure below.
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"]) + 3) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

-5453, -5463."

-37, 43.

250992, 251472.

-3850, correct"

-8, 0.

343, the result of combining the two expressions, which is 343."

20, verified to be accurate."

-20, -18.

88, 74.

-97, -99.

8, verified to be 8"

312, -360.

33, reasonable and correct based on the given expression."

36, -90.

630, 1890.

48, None

42, the result of the multiplication, which is 42."

5, -17.

72, the result of the multiplication, which is 72."

-113, None

107, the result of the subtraction in Step 5.4"

48, 59.

101, 59.

-43, 85.

21, accurate and correct"



0.912

# penguins_in_a_table

In [77]:
subset = 'penguins_in_a_table'

In [78]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-penguins_in_a_table/bbh-penguins_in_a_table_eval')

In [79]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 146
})

In [80]:
print(dataset[0]["reasoning"])

Here's the filled JSON for the above reasoning structure:

```
{
    "Step 1: Identify the core question being asked": "How many penguins are less than 8 years old?",
    "Step 2: Filter out irrelevant information from the table": {
        "Columns to focus on": "name, age",
        "Rows to consider": "All rows except the header"
    },
    "Step 3: Determine the age threshold for filtering penguins": "8 years old",
    "Step 4: Update the table after deletion": {
        "Penguin deleted": "Bernard",
        "Remaining penguins": "Louis, Vincent, Gwen"
    },
    "Step 5: Go through the table row by row to identify penguins less than 8 years old": {
        "Penguin 1 (Louis)": {
            "Age": "7",
            "Meets the age condition": "Yes"
        },
        "Penguin 2 (Bernard)": {
            "Age": "5",
            "Meets the age condition": "Yes",
            "Deleted": "Yes"
        },
        "Penguin 3 (Vincent)": {
            "Age": "9",
            "Meets the age c

In [81]:
def bbh(y_i, y_pred_i):
    """Score a multiple-choice prediction by its first remaining character.

    The target has parentheses removed (``"(A)"`` becomes ``"A"``). The
    prediction has periods, parentheses and double quotes removed, and only
    its first character is compared with the target.

    A falsy prediction (``None`` or ``""``) is returned unchanged; callers
    treat it as a miss.
    """
    if not y_pred_i:
        return y_pred_i
    expected = y_i.translate(str.maketrans("", "", "()"))
    cleaned = y_pred_i.translate(str.maketrans("", "", '.()"'))
    # Slice rather than index: a prediction made only of stripped characters
    # (for example ".") leaves an empty string, and [0] raised IndexError.
    return expected == cleaned[:1]

In [82]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(A), None

(C), E.

(B), C.

(A), B.

(B), E.



0.9657534246575342

# reasoning_about_colored_objects

In [83]:
subset = 'reasoning_about_colored_objects'

In [84]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-reasoning_about_colored_objects/bbh-reasoning_about_colored_objects_eval')

In [85]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [86]:
print(dataset[0]["reasoning"])

Here is the filled JSON for the given task:

```
{
    "Objects on the nightstand": "black necklace, green fidget spinner, blue keychain, yellow sheet of paper, red stress ball",
    "Relevant colors for categorization": "yellow and green",
    "Groups based on yellow and green colors": {
        "Yellow objects": "yellow sheet of paper",
        "Green objects": "green fidget spinner",
        "Neither yellow nor green objects": "black necklace, blue keychain, red stress ball"
    },
    "Specific question being asked": "How many objects are neither yellow nor green?",
    "Available color information": "black, green, blue, yellow, red",
    "Object-by-object analysis": {
        "Object 1 (black necklace)": {
            "Color": "black",
            "Is yellow or green": "no",
            "Counted in neither yellow nor green": "yes"
        },
        "Object 2 (green fidget spinner)": {
            "Color": "green",
            "Is yellow or green": "yes",
            "Counted in n

In [87]:
def bbh(y_i, y_pred_i):
    """Return whether the predicted option letter matches the target option.

    The target loses its parentheses; the prediction loses ``.``, ``(``, ``)``
    and ``"``. Only the first character left in the prediction is compared, so
    trailing explanation text after the letter is ignored.

    Args:
        y_i: Target string such as ``"(D)"``.
        y_pred_i: Predicted answer string; ``None`` or empty counts as wrong.

    Returns:
        ``True`` on a match, ``False`` otherwise.
    """
    if not y_pred_i:
        return False
    cleaned = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not cleaned:
        # Nothing but punctuation was predicted: indexing [0] would raise
        # IndexError, so score it as incorrect.
        return False
    return y_i.translate(str.maketrans("", "", "()")) == cleaned[0]

In [88]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(D), C.

(N), D.

(A), B.

(E), (D).

(J), F.

(B), A.

(A), E.

(H), G.

(F), D.

(G), F.

(D), C.

(G), F.

(E), F.

(H), C.

(D), J.

(C), B.

(M), G.

(B), C.

(A), G.



0.924

# ruin_names

In [89]:
subset = 'ruin_names'

In [90]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-ruin_names/bbh-ruin_names_eval')

In [91]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [92]:
print(dataset[0]["reasoning"])

Here's the filled JSON for the above reasoning structure:

```
{
    "Step 1: Identify the core issue or problem":
        "The core issue is to identify a humorous edit of the artist or movie name 'star wars' by analyzing the options provided.",
    
    "Step 2: Analyze the original name":
        "The original name 'star wars' has two words that can be targeted for a humorous edit: 'star' and 'wars'. A humorous edit could involve wordplay on either of these words.",
    
    "Step 3: Evaluate each option for wordplay":
        {
            "Option A: stpr wars": "There is no apparent wordplay or humorous twist in this option, as it appears to be a typo or a minor variation of the original name.",
            "Option B: start wars": "There is a possible wordplay in this option, as 'start' is a verb that can be associated with the beginning of a conflict, but it's not a strong humorous twist.",
            "Option C: star warts": "There is a clear wordplay in this option, as 'warts' 

In [93]:
def bbh(y_i, y_pred_i):
    """Score one multiple-choice prediction, also ignoring commas in the prediction.

    Parentheses are removed from the target; ``.``, ``(``, ``)``, ``"`` and
    ``,`` are removed from the prediction, whose first remaining character is
    then compared with the target.

    Args:
        y_i: Target string, e.g. ``"(B)"``.
        y_pred_i: Predicted answer string, or ``None``/empty.

    Returns:
        ``True`` when the prediction's first remaining character equals the
        stripped target, ``False`` otherwise.
    """
    if not y_pred_i:
        return False
    stripped_pred = y_pred_i.translate(str.maketrans("", "", '.()",'))
    # Guard the [0] lookup: a punctuation-only prediction strips down to "".
    if not stripped_pred:
        return False
    return y_i.translate(str.maketrans("", "", "()")) == stripped_pred[0]

In [94]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(B), C.

(B), A.

(C), A.

(B), A.

(D), B.

rita, sue and bob poo, D.

(D), A.

(A), C.

(B), D.

(A), D.

(B), C.

(D), A.

(C), A.

(C), A.

(D), B.

(B), C.

(C), B.

(A), D.

(C), A.

(C), A.

dearth, wind, & fire, I.

(D), C.

(D), C.

(B), C.

(D), B.

(C), A.

(D), A.

(D), B.

(A), B.

(D), B.

(C), A.



0.876

In [29]:
# NOTE(review): the outputs recorded for this section are stale. The displayed path resolves under
# mistral/structured/few_shot_5, while base_path (top of the notebook) points at llama/structured/few_shot_0,
# and the execution counts restart at In [29]. Re-run this section before relying on its score.
subset = 'salient_translation_error_detection'

In [30]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-salient_translation_error_detection/bbh-salient_translation_error_detection_eval')

In [31]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [32]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Understand the Task": {
        "Description": "Read and understand the task requirements and the types of errors that could occur in the translation.",
        "Action": "Identify the error types: Named Entities, Numerical Values, Modifiers or Adjectives, Negation or Antonyms, Facts, Dropped Content."
    },
    "Step 2: Analyze the Source Text": {
        "Description": "Carefully read and analyze the source text to understand its content and structure.",
        "Action": "Identify key elements such as named entities, numerical values, modifiers, negations, facts, and significant clauses."
    },
    "Step 3: Analyze the Translation": {
        "Description": "Carefully read and analyze the translation to understand its content and structure.",
        "Action": "Identify key elements such as named entities, numerical values, modifiers, negations, facts, and significant clauses."
    },
    "Step 4: Compare Source Text and Translation": {
        "Description"

In [33]:
def bbh(y_i, y_pred_i):
    """Compare the first character of a cleaned prediction with the target option.

    The target has ``(`` and ``)`` removed. The prediction has ``.``, ``(``,
    ``)`` and ``"`` removed, so answers such as ``D."`` or
    ``(A) Modifiers or Adjectives.`` reduce to their leading option letter.

    Args:
        y_i: Target string, e.g. ``"(F)"``.
        y_pred_i: Predicted answer string, or ``None``/empty.

    Returns:
        ``True`` if the leading character of the cleaned prediction equals the
        cleaned target, else ``False``.
    """
    if not y_pred_i:
        return False
    cleaned_pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not cleaned_pred:
        # Only punctuation was predicted; avoid IndexError on [0].
        return False
    return y_i.translate(str.maketrans("", "", "()")) == cleaned_pred[0]

In [34]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 188423.36it/s]

(A), E.

(F), D.

(A), F.

(F), D."

(D), F.

(B), D.

(A), E.

(F), E."

(F), E.

(A), D.

(F), D.

(F), D.

(A), E.

(F), D.

(E), D. The error is that 'Landkreis Konstanz' is changed to 'district of Constance'."

(F), D.

(C), A.

(C), A."

(F), D"

(E), B.

(A), E."

(C), F.

(C), D.

(D), E.

(F), A.

(F), D.

(D), F.

(A), D.

(B), E.

(F), D.

(D), E.

(C), A.

(A), D.

(A), B.

(F), B.

(A), B.

(E), D.

(A), E.

(F), D.

(F), E.

(A), D.

(D), E.

(F), C.

(C), F.

(F), D.

(F), (A) Modifiers or Adjectives.

(C), A.

(B), E.

(C), A.

(C), B.

(B), E.

(A), F.

(F), D.

(C), D.

(C), B.

(E), D.

(A), D.

(A), E.

(E), D.

(F), D.

(A), D.

(E), D.

(A), F.

(F), D.

(F), D.

(C), A.

(F), D.

(A), D.

(A), E.

(A), E.

(B), D.

(C), B.

(D), E.

(A), D.

(D), E.

(A), D.

(F), D.

(C), A.

(A), D.

(C), (A) Modifiers or Adjectives.

(A), D.

(A), F.

(C), (A) Modifiers or Adjectives.

(F), D.

(A), E.

(F), D."






0.656

In [35]:
# NOTE(review): the outputs recorded for this section are stale. The displayed path resolves under
# mistral/structured/few_shot_5, while base_path (top of the notebook) points at llama/structured/few_shot_0.
# Re-run this section before relying on its score.
subset = 'snarks'

In [36]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/mistral/structured/few_shot_5/bbh/bbh-snarks/bbh-snarks_eval')

In [37]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'few_shot_examples', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 178
})

In [38]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Understand the literal meaning of each statement": {
        "Statement A": "Working the printer is too complex for me",
        "Statement B": "Working the microprocessor is too complex for me"
    },
    "Step 2: Analyze the context and tone for any signs of sarcasm": {
        "Context of Statement A": "Statement A is made in a context where the task of using a printer is generally considered simple.",
        "Context of Statement B": "Statement B is made in a context where working with a microprocessor is generally considered complex.",
        "Tone of Statement A": "The tone of Statement A suggests exaggeration and frustration, which could indicate sarcasm.",
        "Tone of Statement B": "The tone of Statement B is more straightforward and could be taken at face value."
    },
    "Step 3: Identify the core issue or problem that needs to be addressed": {
        "Core issue of Statement A": "The main point of Statement A is the perceived difficulty of us

In [39]:
def bbh(y_i, y_pred_i):
    """Score a two-option (A/B) prediction by its leading character.

    Parentheses are dropped from the target; ``.``, ``(``, ``)`` and ``"`` are
    dropped from the prediction before its first character is compared with
    the target.

    Args:
        y_i: Target string, e.g. ``"(B)"``.
        y_pred_i: Predicted answer string, or ``None``/empty.

    Returns:
        ``True`` on a match, ``False`` otherwise (including when the prediction
        is missing or contains nothing but the stripped punctuation).
    """
    if not y_pred_i:
        return False
    remaining = y_pred_i.translate(str.maketrans("", "", '.()"'))
    # An all-punctuation prediction leaves "", for which [0] would raise.
    if not remaining:
        return False
    return y_i.translate(str.maketrans("", "", "()")) == remaining[0]

In [40]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 178it [00:00, 214968.65it/s]

(B), A.

(A), B.

(A), B.

(A), B.

(B), A"

(B), A"

(A), The statement cannot be determined as sarcastic due to insufficient context.

(B), A.

(B), A.

(B), A.

(B), A.

(B), A.

(B), A.

(B), A"

(B), A.

(B), A.

(B), A"

(B), A"

(B), A.

(B), A.

(A), B.

(B), A.

(B), A."






0.8707865168539326

# sports_understanding

In [5]:
subset = 'sports_understanding'

In [6]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-sports_understanding/bbh-sports_understanding_eval')

In [7]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [8]:
print(dataset[0]["reasoning"])

```
{
    "Assumptions about football and Tyreek Hill": {
        "Description": "List the key assumptions about football and Tyreek Hill that underlie the plausibility of the sentence.",
        "Assumptions": [
            "Tyreek Hill is a professional football player.",
            "Tyreek Hill plays as a wide receiver.",
            "Tyreek Hill has notable skills, including speed and agility.",
            "A screen pass is a type of short pass in football, typically caught by a receiver or running back behind the line of scrimmage."
        ]
    },
    "Evidence or context supporting plausibility": {
        "Description": "Identify the kinds of evidence or context that typically support the plausibility of sentences describing sports events.",
        "Evidence or context": [
            "Tyreek Hill's past performances and statistics, including his reception and yardage totals.",
            "Specific football games or situations in which Tyreek Hill has caught screen passes.

In [9]:
set(dataset["answer_pred"]), set(dataset["target"])

({'False',
  'False.',
  'Highly Unlikely.',
  'Highly unlikely.',
  'Implausible.',
  'Initially implausible but plausible under alternative scenarios.',
  'It depends on the context.',
  'No',
  'No.',
  'Not plausible.',
  'Plausible',
  'Plausible but lacks specificity and context.',
  'Plausible but situation-dependent.',
  'Plausible but uncertain due to lack of concrete evidence.',
  'Plausible.',
  'The claim is plausible.',
  "The sentence 'Luke Voit was out at first' is plausible.",
  'The sentence is plausible in certain contexts, but implausible in others.',
  'The sentence is somewhat implausible.',
  'The sentence is somewhat plausible.',
  'The sentence seems implausible.',
  'True',
  'True.',
  'Uncertain',
  'Uncertain due to lack of context and evidence.',
  'Uncertain, but likely implausible in a literal sense.',
  'Uncertain, but possible.',
  'Uncertain.',
  'Unlikely.',
  "Without additional information or evidence, the claim's plausibility cannot be determined."

In [10]:
def map_fn(instance):
    """Remove periods and double quotes from a row's predicted answer.

    Args:
        instance: Dataset row with an ``"answer_pred"`` entry.

    Returns:
        A dict holding the cleaned ``"answer_pred"``. A ``None`` prediction is
        passed through unchanged, matching the evaluators, which treat a
        missing prediction as wrong rather than as an error.
    """
    answer_pred = instance["answer_pred"]
    if answer_pred is None:
        return {"answer_pred": None}
    return {
        "answer_pred": answer_pred.translate(str.maketrans("", "", '."')),
    }

dataset = dataset.map(map_fn)

set(dataset["answer_pred"])

{'False',
 'Highly Unlikely',
 'Highly unlikely',
 'Implausible',
 'Initially implausible but plausible under alternative scenarios',
 'It depends on the context',
 'No',
 'Not plausible',
 'Plausible',
 'Plausible but lacks specificity and context',
 'Plausible but situation-dependent',
 'Plausible but uncertain due to lack of concrete evidence',
 'The claim is plausible',
 "The sentence 'Luke Voit was out at first' is plausible",
 'The sentence is plausible in certain contexts, but implausible in others',
 'The sentence is somewhat implausible',
 'The sentence is somewhat plausible',
 'The sentence seems implausible',
 'True',
 'Uncertain',
 'Uncertain due to lack of context and evidence',
 'Uncertain, but likely implausible in a literal sense',
 'Uncertain, but possible',
 'Unlikely',
 "Without additional information or evidence, the claim's plausibility cannot be determined",
 'Without specific evidence, it is impossible to determine the plausibility of the claim',
 'Yes'}

In [12]:
# Cleaned predictions that count as a "yes" (the sentence is plausible).
plausible_yes = [
    "Plausible",
    "The claim is plausible",
    "The sentence 'Luke Voit was out at first' is plausible",
    "Yes",
    "True",
]

# Cleaned predictions that count as a "no" (the sentence is implausible).
implausible_no = [
    "False",
    "Highly Unlikely",
    "Highly unlikely",
    "Implausible",
    "No",
    "Not plausible",
    "The sentence seems implausible",
    "Unlikely",
]

# Predictions that commit to neither label; map_fn leaves them unchanged.
indeterminate = [
    "Initially implausible but plausible under alternative scenarios",
    "The sentence is somewhat implausible",
    "It depends on the context",
    "The sentence is plausible in certain contexts, but implausible in others",
    "The sentence is somewhat plausible",
    "Uncertain",
    "Uncertain due to lack of context and evidence",
    "Uncertain, but likely implausible in a literal sense",
    "Uncertain, but possible",
    "Without additional information or evidence, the claim's plausibility cannot be determined",
    "Without specific evidence, it is impossible to determine the plausibility of the claim",
    "Plausible but lacks specificity and context",
    "Plausible but situation-dependent",
    "Plausible but uncertain due to lack of concrete evidence",
]


def map_fn(ins):
    """Collapse a free-text plausibility verdict to ``"yes"`` or ``"no"``.

    A prediction listed in ``plausible_yes`` becomes ``"yes"``, one listed in
    ``implausible_no`` becomes ``"no"``; anything else is returned as-is.
    """
    verdict = ins["answer_pred"]
    if verdict in plausible_yes:
        label = "yes"
    elif verdict in implausible_no:
        label = "no"
    else:
        label = verdict
    return {"answer_pred": label}

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

{'Initially implausible but plausible under alternative scenarios',
 'It depends on the context',
 'Plausible but lacks specificity and context',
 'Plausible but situation-dependent',
 'Plausible but uncertain due to lack of concrete evidence',
 'The sentence is plausible in certain contexts, but implausible in others',
 'The sentence is somewhat implausible',
 'The sentence is somewhat plausible',
 'Uncertain',
 'Uncertain due to lack of context and evidence',
 'Uncertain, but likely implausible in a literal sense',
 'Uncertain, but possible',
 "Without additional information or evidence, the claim's plausibility cannot be determined",
 'Without specific evidence, it is impossible to determine the plausibility of the claim',
 'no',
 'yes'}

In [13]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

no
----------------------------------------------------------------------------------------------------
Uncertain
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
yes
----------------------------------------------------------------------------------------------------
no
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
no
----------------------------------------------------------------------------------------------------
yes
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+

0.708

# temporal_sequences

In [105]:
subset = 'temporal_sequences'

In [106]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-temporal_sequences/bbh-temporal_sequences_eval')

In [107]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [108]:
print(dataset[0]["reasoning"])

Here's the filled JSON for the above reasoning structure:

```
{
    "Step 1: Identify given time intervals":
    {
        "Tiffany woke up": "6am",
        "Reading at the library": "6am-9am",
        "Driving to the water park": "9am-10am",
        "Buying a phone at the electronics store": "12pm-6pm",
        "Working out at the gym": "6pm-10pm",
        "Market closure": "after 10pm"
    },
    "Step 2: Eliminate irrelevant time periods":
    {
        "Time periods to eliminate": ["6am-9am", "9am-10am", "12pm-6pm", "6pm-10pm"],
        "Reasoning for elimination": ["Tiffany was reading", "Tiffany was driving", "Tiffany was buying a phone", "Tiffany was working out"]
    },
    "Step 3: Break down Tiffany's daily activities into separate time blocks":
    {
        "Time blocks": [
            {"start": "6am", "end": "9am", "activity": "reading"},
            {"start": "9am", "end": "10am", "activity": "driving"},
            {"start": "10am", "end": "12pm", "activity": ""},
     

In [109]:
def bbh(y_i, y_pred_i):
    """Score one multiple-choice prediction by its first cleaned character.

    The target has its parentheses removed; the prediction has ``.``, ``(``,
    ``)`` and ``"`` removed and is then reduced to its first character.

    Args:
        y_i: Target string, e.g. ``"(C)"``.
        y_pred_i: Predicted answer string, or ``None``/empty.

    Returns:
        ``True`` when that first character equals the cleaned target,
        ``False`` otherwise.
    """
    if not y_pred_i:
        return False
    stripped_pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    # "." or "()" strip down to an empty string; score that as wrong rather
    # than failing with IndexError on [0].
    if not stripped_pred:
        return False
    return y_i.translate(str.maketrans("", "", "()")) == stripped_pred[0]

In [110]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(C), A.



0.996

# web_of_lies

In [111]:
subset = 'web_of_lies'

In [112]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-web_of_lies/bbh-web_of_lies_eval')

In [113]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [114]:
print(dataset[0]["reasoning"])

```
{
    "Step 1: Identify the statements and their relationships": {
        "Raymond's statement": "Raymond tells the truth",
        "Sal's statement about Raymond": "Sal says Raymond lies",
        "Alexis's statement about Sal": "Alexis says Sal lies",
        "Helene's statement about Alexis": "Helene says Alexis lies",
        "Elanor's statement about Helene": "Elanor says Helene lies"
    },
    "Step 2: Determine the truth value of Raymond's statement": {
        "Assume Raymond tells the truth": "If Raymond tells the truth, then his statement is true",
        "Assume Raymond lies": "If Raymond lies, then his statement is false, which is a contradiction"
    },
    "Step 3: Analyze Sal's statement about Raymond": {
        "If Raymond tells the truth, then Sal's statement is": "false, because Sal says Raymond lies",
        "If Raymond lies, then Sal's statement is": "This case is not possible due to the contradiction in Step 2"
    },
    "Step 4: Evaluate Alexis's stateme

In [115]:
set(dataset["answer_pred"]), set(dataset["target"])

({'Cannot be determined.',
  'False.',
  'Inga tells the truth if and only if Alexis lies',
  'No',
  'No.',
  'T.',
  'True',
  'True.',
  'True."',
  'Yes',
  'Yes.',
  'false.',
  'no.',
  'yes.'},
 {'No', 'Yes'})

In [116]:
def map_fn(instance):
    """Remove periods, double quotes and asterisks from a row's predicted answer.

    Args:
        instance: Dataset row with an ``"answer_pred"`` entry.

    Returns:
        A dict holding the cleaned ``"answer_pred"``. A ``None`` prediction is
        returned unchanged instead of raising ``AttributeError``.
    """
    answer_pred = instance["answer_pred"]
    if answer_pred is None:
        return {"answer_pred": None}
    return {
        "answer_pred": answer_pred.translate(str.maketrans("", "", '."*')),
    }

dataset = dataset.map(map_fn)

set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'Cannot be determined',
 'False',
 'Inga tells the truth if and only if Alexis lies',
 'No',
 'T',
 'True',
 'Yes',
 'false',
 'no',
 'yes'}

In [117]:
# Cleaned predictions that mean the person tells the truth (scored as "Yes").
truth_yes = ['T', 'True', 'Yes', 'yes']

# Cleaned predictions that mean the person lies (scored as "No").
false_no = ['False', 'No', 'false', 'no']

# Predictions that commit to neither answer; map_fn leaves them unchanged.
indeterminate = [
    'Cannot be determined',
    'Inga tells the truth if and only if Alexis lies',
]


def map_fn(ins):
    """Normalize a cleaned prediction to the target vocabulary ``"Yes"``/``"No"``.

    Values listed in ``truth_yes`` become ``"Yes"``, values listed in
    ``false_no`` become ``"No"``, and every other value is kept as it is.
    """
    verdict = ins["answer_pred"]
    if verdict in truth_yes:
        label = "Yes"
    elif verdict in false_no:
        label = "No"
    else:
        label = verdict
    return {"answer_pred": label}

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'Cannot be determined',
 'Inga tells the truth if and only if Alexis lies',
 'No',
 'Yes'}

In [120]:
# NOTE(review): the targets here are the full words "Yes"/"No", so this call only scores correctly with the
# whole-string `bbh` defined at the top of the notebook. The multiple-choice sections above rebind `bbh` to
# compare just the first character of the prediction; re-run the original `bbh` cell before this one.
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

No, Yes

No, Yes

Yes, No

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

Yes, No

No, Yes

No, Inga tells the truth if and only if Alexis lies

Yes, No

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Cannot be determined

Yes, No

Yes, No

No, Yes

No, Yes

Yes, No

No, Yes

No, Yes



0.888

# word_sorting

In [11]:
subset = 'word_sorting'

In [12]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/structured/few_shot_0/bbh/bbh-word_sorting/bbh-word_sorting_eval')

In [13]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [14]:
print(dataset[0]["reasoning"])

Here is the filled JSON for the above reasoning structure:

```
{
    "Step 1: Identify the list of words to be sorted": {
        "List of words": "slurp, raytheon, gloucester"
    },
    "Step 2: Separate the list into individual words": {
        "Word 1": "slurp",
        "Word 2": "raytheon",
        "Word 3": "gloucester"
    },
    "Step 3: Identify the first letter of each word": {
        "Word 1 first letter": "s",
        "Word 2 first letter": "r",
        "Word 3 first letter": "g"
    },
    "Step 4: Compare the first letters of each word to determine the alphabetical order": {
        "Comparison of Word 1 and Word 2 first letters": "s comes after r",
        "Comparison of Word 2 and Word 3 first letters": "r comes after g",
        "Comparison of Word 1 and Word 3 first letters": "s comes after g"
    },
    "Step 5: If the first letters are the same, compare the subsequent letters of each word": {
        "Comparison of Word 1 and Word 2 subsequent letters": "not nece

In [15]:
# Exploratory check: collect the predictions that are non-empty and contain "[", with "." and "'"
# characters removed, then count how many there are.
answer_pred_list = [x.translate(str.maketrans("", "", ".'")) for x in dataset["answer_pred"] if x and '[' in x]
len(answer_pred_list)

131

In [16]:
answer_pred_list[0].translate(str.maketrans("", "", "'\"[]")).split(", ")

['chlorate',
 'glidden',
 'incentive',
 'judicatory',
 'lavoisier',
 'manatee',
 'spurt']

In [17]:
set(dataset["answer_pred"])

{'',
 'Yes.',
 '[',
 '["Amethyst", "Bathos", "Dormouse", "Obtuse", "Resignation", "Walt"].',
 '["Dateline", "Household", "Jill", "Langmuir", "Pipette"].',
 '["abc", "ada", "austere", "blend", "cankerworm", "falcon", "flamboyant", "gag", "grecian", "hanukkah", "indicate", "kruger", "lobster", "militia", "nobody", "pierson", "quad", "right", "ron", "wildcat"].',
 '["aberdeen", "analogue", "deciduous", "easel", "sprightly", "swaziland"].',
 '["abner", "abramson", "amity", "automate", "exquisite", "fruitful", "gurgle", "none", "shampoo", "shorten", "waterproof"].',
 '["above", "big", "broken", "coexist", "dominate", "irk", "olive", "prometheus", "screw", "thirdhand"].',
 '["abramson", "bangui", "carlisle", "cavalier", "contextual", "dustbin", "emacs", "implementor", "islamabad", "magistrate", "nudge", "picnicking", "railway", "refractory", "silvery", "waite"].',
 '["accelerate", "bauer", "county", "nail", "nominee", "o\'connell", "phony", "poole", "putnam", "quantify", "raisin", "venice"].

In [18]:
dataset.filter(lambda x: x["answer_pred"] == None), dataset.filter(lambda x: x["answer_pred"] == ''), dataset.filter(lambda x: x["answer_pred"] == '['), dataset.filter(lambda x: x["answer_pred"] == 'Yes.')

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

(Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 41
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 1
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 1
 }))

In [19]:
import re 

def map_fn(ins):
    """Recover a missing prediction from the row's reasoning text.

    Rows whose ``answer_pred`` is neither ``None`` nor ``''`` are returned
    untouched. Otherwise the reasoning is split at the first occurrence of
    "The final answer is:"; the part before it becomes ``trajectory`` and the
    part after it, with backticks removed and whitespace trimmed, becomes
    ``answer_pred`` (``None`` when nothing is left). When the marker is absent
    the whole reasoning becomes the trajectory and ``answer_pred`` is ``None``.
    """
    current = ins.get("answer_pred")
    if current is not None and current != '':
        return ins

    text = ins.get("reasoning", "")
    text = "" if text is None else text

    before, found, after = text.partition("The final answer is:")
    if not found:
        return {"trajectory": text.strip(), "answer_pred": None}

    recovered = after.replace("`", "").strip()
    return {
        "trajectory": before.strip(),
        "answer_pred": recovered if recovered else None,
    }

dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [20]:
dataset.filter(lambda x: x["answer_pred"] == None), dataset.filter(lambda x: x["answer_pred"] == ''), dataset.filter(lambda x: x["answer_pred"] == '['), dataset.filter(lambda x: x["answer_pred"] == 'Yes.')

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

(Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 1
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 1
 }))

In [21]:
import re 

def map_fn(ins):
    """Recover a missing or degenerate prediction from the row's reasoning text.

    Rows are re-extracted when ``answer_pred`` is falsy or one of the
    placeholder values ``'['`` / ``'Yes.'``; every other row is returned
    unchanged. The reasoning is split at the first "The final answer is": the
    text before it becomes ``trajectory`` and the text after it becomes
    ``answer_pred``.

    Args:
        ins: Dataset row; ``"answer_pred"`` and ``"reasoning"`` are read.

    Returns:
        Either ``ins`` itself, or a dict with ``"trajectory"`` and
        ``"answer_pred"`` (``None`` when the marker is absent or nothing
        follows it).
    """
    answer_pred = ins.get("answer_pred")
    if answer_pred and answer_pred not in ('[', 'Yes.'):
        return ins

    reasoning = ins.get("reasoning") or ""
    marker = "The final answer is"

    trajectory, found, answer = reasoning.partition(marker)
    if not found:
        return {
            "trajectory": reasoning.strip(),
            "answer_pred": None
        }

    cleaned_answer = answer.replace("`", "").strip()
    # The marker carries no colon, so "The final answer is: X" would otherwise
    # yield ": X"; drop the separator so both spellings give "X".
    cleaned_answer = cleaned_answer.lstrip(":").strip()

    return {
        "trajectory": trajectory.strip(),
        "answer_pred": cleaned_answer or None
    }

dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [22]:
dataset.filter(lambda x: x["answer_pred"] == None), dataset.filter(lambda x: x["answer_pred"] == ''), dataset.filter(lambda x: x["answer_pred"] == '['), dataset.filter(lambda x: x["answer_pred"] == 'Yes.')

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

(Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 1
 }))

In [23]:
def map_fn(ins):
    """Normalise one row's ``answer_pred`` into a lowercase, space-separated string.

    Steps, in order:

    1. ``None`` is passed through untouched.
    2. Literal backslash escape sequences in the text (for example a backslash
       followed by ``n``) are decoded into the characters they denote, then
       every ``.`` is removed. If the text contains a malformed escape, the
       text is used as-is instead of aborting the whole ``Dataset.map``.
    3. The first matching format below is flattened into words joined by
       single spaces:

       * contains ``[``: brackets and double quotes are dropped, items are
         split on ``", "`` and one leading/trailing single quote is stripped
         from each item;
       * contains ``,``: same as above, without the bracket removal;
       * contains ``1`` or ``-``: one item per line, keeping only the second
         space-separated token of each line;
       * otherwise: lines are joined with spaces.

       If a line in the third format has no second token, the un-flattened
       text from step 2 is kept.
    4. The result is lower-cased.

    Args:
        ins: Mapping with an ``"answer_pred"`` entry that is ``str`` or ``None``.

    Returns:
        ``{"answer_pred": value}`` where ``value`` is the normalised string,
        or ``None`` when the input was ``None``.
    """
    raw_answer = ins["answer_pred"]
    if raw_answer is None:
        return {"answer_pred": raw_answer}

    try:
        # Encoding via latin-1 + backslashreplace (instead of the default UTF-8)
        # lets non-ASCII characters survive the unicode_escape round trip;
        # a plain .encode() would turn e.g. "é" into "Ã©".
        unescaped = raw_answer.encode("latin-1", "backslashreplace").decode("unicode_escape")
    except UnicodeDecodeError:
        # Malformed escape (e.g. a trailing backslash): keep the text verbatim.
        unescaped = raw_answer

    answer_pred = unescaped.replace(".", "")

    try:
        if "[" in answer_pred:
            items = answer_pred.translate(str.maketrans("", "", "[]")).replace('"', "").split(", ")
            refined_answer = " ".join(re.sub(r"^'|'$", "", word) for word in items)
        elif "," in answer_pred:
            items = answer_pred.replace('"', "").split(", ")
            refined_answer = " ".join(re.sub(r"^'|'$", "", word) for word in items)
        elif "1" in answer_pred or "-" in answer_pred:
            # Numbered ("1 word") or dashed ("- word") lines: keep the token
            # that follows the marker on each line.
            refined_answer = " ".join(pair.split(" ")[1] for pair in answer_pred.split("\n"))
        else:
            refined_answer = " ".join(answer_pred.split("\n"))
    except IndexError:
        # A line without a second token: fall back to the un-flattened text.
        refined_answer = answer_pred

    return {"answer_pred": refined_answer.lower()}


# Apply the answer-normalising map_fn defined in this cell to every row,
# replacing each row's `answer_pred` with the value that function returns.
dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [25]:
# NOTE(review): this offset was an unexplained literal `+ 6` inside the accuracy
# formula. Presumably it counts predictions judged correct by hand that the
# exact-match comparison rejects — confirm and record which rows it refers to.
MANUAL_ADJUSTMENT = 6

exact_match_count = calculate_correct_prediction_count(
    "bbh", dataset["target"], dataset["answer_pred"]
)

# Accuracy over all rows, including the manual adjustment above.
(exact_match_count + MANUAL_ADJUSTMENT) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

confess croupier daffy dockyard duty household hypothesis info loam mandate mantic minstrelsy nepotism peccary sawtimber serenade silver summate triode
----------------------------------------------------------------------------------------------------
confess croupier daffy dockyard duty household hypothesis info loam mantic mandate minstrelsy nepotism peccary sawtimber serenade silver summate triode
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
cartilaginous no science spokane that'd
----------------------------------------------------------------------------------------------------
cartilaginous no science spokane thatd
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=

0.924

In [121]:
# Write the refined dataset into a sibling directory of the loaded eval dataset.
refined_dir = os.path.join(os.path.dirname(path), "bbh-word_sorting_eval_refined")
dataset.save_to_disk(refined_dir)

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]