In [1]:
import os
from pyprojroot import here
from datasets import Dataset

In [2]:
base_path = os.path.join("evals", "logs", "mistral", "phaseII", "bbh")

In [132]:
from tqdm import tqdm


# Translation tables are built once here instead of on every comparison.
_T4D_PRED_STRIP = str.maketrans("", "", ".'")
_BBH_TARGET_STRIP = str.maketrans("", "", "()")
_BBH_PRED_STRIP = str.maketrans("", "", '.()"')


def t4d(y_i, y_pred_i):
    """Return True when a T4D prediction matches the target ``y_i``.

    A prediction matches when it contains the target verbatim AND, after
    removing ``.`` and ``'`` and dropping its first two characters, it is
    exactly equal to the target.
    NOTE(review): the two dropped characters are presumably an option prefix
    such as ``"A "`` -- confirm against the T4D answer format.

    A missing or empty prediction (``None`` / ``""``) never matches.
    """
    if not y_pred_i:
        return False
    return y_i in y_pred_i and y_i == y_pred_i.translate(_T4D_PRED_STRIP)[2:]


def bbh(y_i, y_pred_i):
    """Return True when a BBH prediction equals the target ``y_i`` exactly.

    Parentheses are removed from the target; ``.``, ``(``, ``)`` and ``"`` are
    removed from the prediction; the two cleaned strings must then be equal.
    No whitespace or single-quote normalisation is applied.

    A missing or empty prediction (``None`` / ``""``) never matches.
    """
    if not y_pred_i:
        return False
    return y_i.translate(_BBH_TARGET_STRIP) == y_pred_i.translate(_BBH_PRED_STRIP)

In [6]:
def calculate_correct_prediction_count(benchmark, y: list[str], y_pred: list[str]):
    """Count how many predictions in ``y_pred`` match their targets in ``y``.

    Args:
        benchmark: ``"t4d"`` or ``"bbh"``; selects the module-level matcher of
            the same name.
        y: Target answers.
        y_pred: Predicted answers, paired with ``y`` by position (``zip``
            stops at the shorter of the two).

    Returns:
        The number of pairs for which the matcher returned a truthy value.

    Raises:
        ValueError: If ``benchmark`` is not one of the supported names.

    Side effects:
        Prints every mismatching ``target, prediction`` pair and shows a tqdm
        progress bar.
    """
    # Resolve the matcher once per call rather than once per row. The globals
    # are looked up at call time, so a `bbh` redefined in a later cell is
    # still the one that gets used.
    if benchmark == "t4d":
        eval_fn = t4d
    elif benchmark == "bbh":
        eval_fn = bbh
    else:
        raise ValueError(
            f"Unknown benchmark {benchmark!r}; expected 't4d' or 'bbh'"
        )

    correct_preds = 0
    for y_i, y_pred_i in tqdm(zip(y, y_pred), desc="Calculating..."):
        if eval_fn(y_i, y_pred_i):
            correct_preds += 1
        else:
            # Log the miss so failure patterns can be inspected by eye.
            print(f"{y_i}, {y_pred_i}\n")
    return correct_preds

# boolean_expressions

In [3]:
subset = 'boolean_expressions'

In [4]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-boolean_expressions/bbh-boolean_expressions_eval')

In [5]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [6]:
print(dataset[0]["reasoning"])

```json
{
    "Identify the problem type": {
        "Determine if the problem is a logical evaluation": {
            "Check if the problem involves boolean operators, order of operations, or simplification techniques": "The problem involves boolean operators (not, and) and requires evaluation of a logical expression."
        }
    },
    "Break down the complex logical expression": {
        "Divide the expression into smaller, more manageable parts": "The expression can be broken down into two parts: not ( True ) and ( True )."
    },
    "Simplify the logical expression": {
        "Apply simplification techniques to make the expression easier to evaluate": "Simplify not ( True ) to False."
    },
    "Design a logical experiment": {
        "Plan how to evaluate the truth value of the expression": "Evaluate the expression step by step, starting with the innermost operation."
    },
    "Critical Thinking": {
        "Analyze the logical expression from different perspectives": "C

In [23]:
# Accuracy on this subset: matched predictions divided by the number of rows.
# NOTE(review): the `+ 1` manually adds one to the automated count. Presumably
# it credits the last logged mismatch (`True, 'True'."`), where the prediction
# differs from the target only by quoting -- confirm, and consider handling
# that case in the matcher instead of adjusting the count by hand.
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"]) + 1) / dataset.num_rows

Calculating...: 250it [00:00, 125143.33it/s]

True, False.

True, False.

True, False.

True, False.

True, False.

True, False.

True, False.

True, False.

False, True.

True, False.

True, False.

True, False.

True, False.

True, False.

True, False.

False, True.

True, 'True'."






0.936

# date_understanding

In [7]:
subset = 'date_understanding'

In [8]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-date_understanding/bbh-date_understanding_eval')

In [9]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [10]:
print(dataset[0]["reasoning"])

```json
{
    "Core Issue Identification": {
        "Identify the key date or time interval that needs to be determined": "The date tomorrow after Christmas Eve of 1937."
    },
    "Underlying Factors": {
        "Identify relevant calendar rules, time units, or date formats that contribute to the problem": "Christmas Eve is December 24th. The next day is December 25th."
    },
    "Data and Information": {
        "Gather relevant calendar data or date formats that can provide insights": "Christmas Eve is on December 24th, 1937."
    },
    "Problem Type": {
        "Determine if the problem is about calculating a specific date or understanding calendar rules and formats": "This problem is about calculating a specific date."
    },
    "Problem Simplification": {
        "Break down the date calculation into simpler, more manageable steps": "Identify the date of Christmas Eve and add one day to it."
    },
    "Problem Decomposition": {
        "Divide the date problem into smaller 

In [73]:
def bbh(y_i, y_pred_i):
    """Return True when the first character of the prediction is the target.

    Parentheses are removed from the target; ``.``, ``(``, ``)`` and ``"`` are
    removed from the prediction, and only the first remaining character is
    compared (so ``"(C) Ambiguous."`` matches the target ``"(C)"``).

    A missing or empty prediction, or one made only of stripped punctuation
    (e.g. ``"."``), is scored as a miss instead of raising ``IndexError``.

    NOTE(review): this matcher is re-declared identically in several cells of
    this notebook; consider defining it once.
    """
    if not y_pred_i:
        return False
    cleaned_pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not cleaned_pred:
        return False
    return y_i.translate(str.maketrans("", "", "()")) == cleaned_pred[:1]

In [74]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 83306.27it/s]

(B), C.

(E), D.

(A), C.

(E), A.

(B), F.

(C), A.

(E), B.

(B), D.

(E), D.

(C), B.

(A), B.

(C), D.

(C), F.

(D), B.

(B), D.

(B), F.

(E), B.

(A), B.

(D), C.

(C), B.

(B), (D) 09/06/2020.

(F), B.

(B), C.

(D), (E) 08/28/2021.

(D), B.

(B), that none of the given options match the correct date one year ago from today.

(F), A.

(F), E.

(B), (E) 12/11/1929, as it is the closest option to the correct date a month ago from 12/31/1929.

(A), B.

(A), B.

(F), B.

(B), (A) 11/25/1933.

(E), C.

(C), (A) 09/02/2021.

(D), F.

(A), B.

(C), D.

(E), A.






0.844

# disambiguation_qa

In [11]:
subset = 'disambiguation_qa'

In [12]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-disambiguation_qa/bbh-disambiguation_qa_eval')

In [13]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [14]:
print(dataset[0]["reasoning"])

```json
{
    "Identify the core ambiguity or pronoun reference issue that needs to be resolved in the sentence": {
        "Step": "Identify the pronoun in the sentence and determine its ambiguity.",
        "Value": "The pronoun 'he' in the sentence 'The patient was referred to the specialist because he had a rare skin condition' could be ambiguous."
    },
    "Determine the potential antecedents or subjects that the pronoun could be referring to": {
        "Step": "List all possible nouns or subjects in the sentence that the pronoun could refer to.",
        "Value": "The potential antecedents for 'he' are 'the patient' and 'the specialist'."
    },
    "Assess if there are any linguistic cues, contextual information, or grammatical rules that can help clarify the pronoun reference": {
        "Step": "Analyze the sentence for any linguistic cues, contextual information, or grammatical rules that can help identify the correct antecedent.",
        "Value": "The context suggests th

In [35]:
def bbh(y_i, y_pred_i):
    """Return True when the first character of the prediction is the target.

    Parentheses are removed from the target; ``.``, ``(``, ``)`` and ``"`` are
    removed from the prediction, and only the first remaining character is
    compared (so ``"(C) Ambiguous."`` matches the target ``"(C)"``).

    A missing or empty prediction, or one made only of stripped punctuation
    (e.g. ``"."``), is scored as a miss instead of raising ``IndexError``.

    NOTE(review): this matcher is re-declared identically in several cells of
    this notebook; consider defining it once.
    """
    if not y_pred_i:
        return False
    cleaned_pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not cleaned_pred:
        return False
    return y_i.translate(str.maketrans("", "", "()")) == cleaned_pred[:1]

In [36]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 49986.94it/s]

(B), C.

(A), C.

(B), (C) Ambiguous.

(C), B.

(A), C.

(A), C.

(A), C.

(A), C.

(A), C.

(B), C.

(B), C.

(B), C.

(A), C.

(A), C.

(A), C.

(A), C.

(A), (C) Ambiguous.

(C), (A) The nurse was busy.

(B), (C) Ambiguous.

(A), C.

(A), (C) Ambiguous.

(B), C.

(A), C.

(A), C.

(A), C.

(A), C.

(A), C.

(A), C.

(B), C.

(B), C.

(A), C.

(B), C.

(B), C.

(B), C.

(B), C.

(A), C.

(B), (C) Ambiguous.

(B), C.

(A), C.

(A), (C) Ambiguous.

(A), C.

(B), C.

(B), C.

(C), (A) The surgeon needed more time.

(A), C.

(C), A.

(A), (C) Ambiguous.

(B), (C) Ambiguous.

(B), C.

(A), C.

(B), C.

(A), C.

(A), (C) Ambiguous.

(A), C.

(B), C.

(B), C.

(B), C.

(B), C.

(B), C.

(A), C.

(B), C.

(A), C.

(B), (C) Ambiguous.

(A), C.

(A), C.

(A), (C) Ambiguous.

(A), C.

(A), (C) Ambiguous.

(A), C.

(B), (C) Ambiguous.

(B), (C) Ambiguous.

(A), C.

(A), C.

(A), C.

(A), C.

(A), C.

(A), C.

(A), (C) Ambiguous.

(C), B.

(A), C.

(B), C.

(B), (C) Ambiguous.

(A), C.

(B), C.






0.632

# dyck_languages

In [18]:
subset = 'dyck_languages'

In [19]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-dyck_languages/bbh-dyck_languages_eval')

In [20]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [21]:
print(dataset[0]["reasoning"])

```json
{
    "Simplify the Problem": {
        "Focus on one type of bracket at a time": "We will focus on closing the square brackets first, then the parentheses."
    },
    "Break Down the Problem": {
        "Divide the sequence into smaller segments": "The sequence is '[ ['. We can break it down into two segments: the outer square brackets and the inner square brackets.",
        "Ensure each segment is properly closed before moving to the next": "We will close the inner square brackets first, then the outer square brackets."
    },
    "Critical Thinking": {
        "Analyze the sequence from different perspectives": "We need to ensure that each opening bracket has a corresponding closing bracket.",
        "Consider the hierarchy of nested brackets": "The inner square brackets should be closed before the outer square brackets.",
        "Evaluate the evidence of open and closed brackets at each point": "Currently, we have two open square brackets and no closed brackets."
    },

In [None]:
# Accuracy on this subset: matched predictions divided by the number of rows.
# NOTE(review): this cell has no execution count, and the saved output beneath
# it lists lettered date options (e.g. "(C) 06/18/2016"), which resembles the
# date_understanding subset rather than dyck_languages. The output looks
# stale -- re-run this cell before relying on the reported score.
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 124889.95it/s]

(B), C.

(C), (C) 06/18/2016.

(E), D.

(A), C.

(E), A.

(B), F.

(C), A.

(E), B.

(B), D.

(E), D.

(A), (A) 02/29/2008.

(C), B.

(A), (A) 04/27/2004.

(A), (A) 12/22/1929.

(A), B.

(C), D.

(C), F.

(A), (A) 01/02/1930.

(A), (A) 04/29/2002.

(A), (A) 01/16/2010.

(A), (A) 02/23/1973.

(D), B.

(B), D.

(B), F.

(E), B.

(A), (A) 09/08/2003.

(A), (A) 11/29/2002.

(A), (A) 09/09/1909.

(A), B.

(D), C.

(C), B.

(B), (D) 09/06/2020.

(F), B.

(B), C.

(D), (E) 08/28/2021.

(A), (A) 12/02/2007.

(A), (A) 03/07/2016.

(A), (A) 06/11/2019.

(D), B.

(B), that none of the given options match the correct date one year ago from today.

(F), A.

(F), E.

(B), (E) 12/11/1929, as it is the closest option to the correct date a month ago from 12/31/1929.

(A), B.

(A), B.

(A), (A) 02/28/2015.

(F), B.

(B), (A) 11/25/1933.

(E), C.

(C), (A) 09/02/2021.

(F), (F) 10/22/2002.

(D), F.

(A), (A) 11/01/2019.

(A), B.

(A), (A) 06/20/2019.

(C), D.

(E), A.






0.84

# formal_fallacies

In [22]:
subset = 'formal_fallacies'

In [23]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-formal_fallacies/bbh-formal_fallacies_eval')

In [24]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [27]:
print(dataset[0]["reasoning"])

```json
{
    "Identify Key Assumptions": {
        "Identify the key premises or given statements that the argument relies on": {
            "Premise 1": "Whoever is a schoolmate of Sondra is not a stepsister of Pricilla.",
            "Premise 2": "Whoever is not a stepsister of Pricilla is a schoolmate of Sondra."
        }
    },
    "Break Down the Argument": {
        "Deconstruct the argument into smaller logical steps or components": {
            "Step 1": "If someone is a schoolmate of Sondra, then they are not a stepsister of Pricilla.",
            "Step 2": "If someone is not a stepsister of Pricilla, then they are a schoolmate of Sondra."
        }
    },
    "Critical Analysis": {
        "Analyze the argument from different logical perspectives, question the assumptions, and evaluate the inferential steps for validity. Focus on identifying any logical fallacies or gaps": {
            "Analysis": "The argument presents a logical equivalence between being a schoolmate o

In [58]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 249779.90it/s]

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

valid, invalid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

valid, invalid.

invalid, valid.

invalid, valid.

invalid, valid.

valid, invalid.

valid, invalid.

invalid, valid.

valid, invalid.

valid, invalid.

valid, invalid.

invalid, valid.

valid, invalid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

valid, invalid.

valid, invalid.

invalid, valid.

valid, invalid.

invalid, valid.

valid, invalid.

invalid, valid.

invalid, valid.

valid, invalid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

invalid, valid.

valid, invalid.

invalid, valid.

valid, invalid.

valid, invalid.

invalid, valid.

invalid, valid.

valid, invalid.

valid, invalid.

invalid, valid




0.756

# geometric_shapes

In [28]:
subset = 'geometric_shapes'

In [29]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-geometric_shapes/bbh-geometric_shapes_eval')

In [30]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [31]:
print(dataset[0]["reasoning"])

```json
{
    "Determine the core issue or problem that needs to be addressed": {
        "Identify the shape represented by the given SVG path element": "The task is to identify the shape drawn by the SVG path element."
    },
    "Identify the underlying causes or factors contributing to the problem": {
        "Extract key coordinates and commands from the SVG path data": "The SVG path data is 'M 55.57,80.69 L 57.38,65.80 M 57.38,65.80 L 48.90,57.46 M 48.90,57.46 L 45.58,47.78 M 45.58,47.78 L 53.25,36.07 L 66.29,48.90 L 78.69,61.09 L 55.57,80.69'."
    },
    "Identify potential obstacles or challenges": {
        "Check for complex SVG commands or transformations": "The path data uses simple 'M' (move to) and 'L' (line to) commands.",
        "Identify any ambiguities in the path data": "There are no ambiguities in the path data."
    },
    "Gather relevant data or information": {
        "Understand available SVG path commands and their meanings": "The commands 'M' and 'L' are us

In [62]:
def bbh(y_i, y_pred_i):
    """Return True when the first character of the prediction is the target.

    Parentheses are removed from the target; ``.``, ``(``, ``)`` and ``"`` are
    removed from the prediction, and only the first remaining character is
    compared (so ``"(J) triangle."`` matches the target ``"(J)"``).

    A missing or empty prediction, or one made only of stripped punctuation
    (e.g. ``"."``), is scored as a miss instead of raising ``IndexError``.

    NOTE(review): this matcher is re-declared identically in several cells of
    this notebook; consider defining it once.
    """
    if not y_pred_i:
        return False
    cleaned_pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not cleaned_pred:
        return False
    return y_i.translate(str.maketrans("", "", "()")) == cleaned_pred[:1]

In [63]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 39845.57it/s]

(B), D.

(K), J.

(K), J.

(C), D.

(C), G.

(K), D.

(F), B.

(C), G.

(B), C.

(K), I.

(C), J.

(B), D.

(C), D.

(K), D.

(B), (J) triangle.

(F), C.

(K), J.

(B), D.

(K), H.

(B), G.

(B), J.

(F), B (heptagon).

(K), D.

(G), J.

(C), J.

(B), (J) triangle.

(K), I.

(C), D.

(F), (E) line.

(K), D.

(I), J.

(F), B.

(D), J.

(K), I.

(B), G.

(B), C.

(D), J.

(C), B (heptagon), as the path data describes a shape with seven vertices.

(I), J.

(B), D.

(C), B.

(G), J.

(D), J.

(C), J.

(F), G.

(F), B.

(K), J.

(K), I.

(C), B.

(F), B (heptagon).

(I), J.

(F), B (heptagon).

(C), G.

(F), G.

(D), J.

(D), J.

(K), D.

(C), J.

(F), D.

(F), D.

(C), G.

(K), J.

(F), D.

(K), D.

(G), D.

(B), D.

(B), J.

(G), D.

(C), G.

(F), (G) pentagon.

(G), D.

(F), D (kite).

(D), J.

(B), G.

(C), (J) triangle.

(K), H.

(K), J.

(F), (G) pentagon.

(G), D.

(D), J.

(K), D.

(K), H.

(C), (J) triangle.

(K), D.

(B), (J) triangle.

(C), (J) triangle.

(G), D.

(F), G.

(K), H




0.472

# logical_deduction_five_objects

In [32]:
subset = 'logical_deduction_five_objects'

In [33]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-logical_deduction_five_objects/bbh-logical_deduction_five_objects_eval')

In [34]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [35]:
print(dataset[0]["reasoning"])

```json
{
    "Break down the problem into smaller parts": {
        "Identify each object and the relationships or comparisons given": {
            "List these relationships separately": "1. The owl is the leftmost. 2. The robin is to the left of the raven. 3. The quail is the rightmost. 4. The raven is the third from the left."
        }
    },
    "Critical Thinking": {
        "Analyze the problem from different perspectives by focusing on each object's position or rank based on the given comparisons": {
            "Question assumptions about their order and evaluate the information available to form a logical sequence": "Start with the definitive positions: the owl is leftmost and the quail is rightmost. Then use the other comparisons to fill in the middle positions."
        }
    },
    "Identify the core issue": {
        "Determine what the main question is asking": "The main question is asking which bird is the rightmost."
    },
    "Underlying causes/factors": {
        "

In [68]:
def bbh(y_i, y_pred_i):
    """Return True when the first character of the prediction is the target.

    Parentheses are removed from the target; ``.``, ``(``, ``)`` and ``"`` are
    removed from the prediction, and only the first remaining character is
    compared (so ``"(A) The truck is the second-oldest."`` matches ``"(A)"``).

    A missing or empty prediction, or one made only of stripped punctuation
    (e.g. ``"."``), is scored as a miss instead of raising ``IndexError``.

    NOTE(review): this matcher is re-declared identically in several cells of
    this notebook; consider defining it once.
    """
    if not y_pred_i:
        return False
    cleaned_pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not cleaned_pred:
        return False
    return y_i.translate(str.maketrans("", "", "()")) == cleaned_pred[:1]

In [69]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 249660.95it/s]

(E), B.

(C), D.

(D), (A) The truck is the second-oldest.

(A), B.

(E), B.

(E), C.

(D), C.

(E), B.

(E), D.

(D), B.

(E), D.

(E), B.

(A), D.

(B), E.

(C), D.

(C), D.

(C), D.

(E), C.

(E), B.

(B), C.

(D), B.

(D), E.

(A), C.

(B), (A) The gray book is the third from the left.

(E), C.

(D), B.

(E), (A) The blue jay is the second from the right.

(C), B.

(B), C.

(B), D.

(C), A.

(E), B.

(C), E.

(D), B.

(D), B.

(C), B.

(B), A.

(E), (A) The tractor is the second-oldest.

(E), C.

(E), B.

(D), C.

(A), C.

(C), E.






0.828

# logical_deduction_seven_objects

In [36]:
subset = 'logical_deduction_seven_objects'

In [37]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-logical_deduction_seven_objects/bbh-logical_deduction_seven_objects_eval')

In [38]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [39]:
print(dataset[0]["reasoning"])

```json
{
    "Break down the problem into smaller parts": {
        "Identify each object and the statements that describe their order": {
            "List these statements and objects separately": {
                "Objects": ["Ana", "Eve", "Ada", "Dan", "Rob", "Amy", "Joe"],
                "Statements": [
                    "Dan finished third.",
                    "Ana finished above Ada.",
                    "Amy finished last.",
                    "Dan finished below Rob.",
                    "Eve finished below Ada.",
                    "Rob finished below Joe."
                ]
            }
        }
    },
    "Critical Thinking": {
        "Analyze the problem by considering each perspective": {
            "Question assumptions about the order and evaluate the information given to ensure logical consistency": {
                "Dan is third, so there are two golfers above him and four below him."
            },
            "Identify any biases or flaws in the reaso

In [78]:
def bbh(y_i, y_pred_i):
    """Return True when the first character of the prediction is the target.

    Parentheses are removed from the target; ``.``, ``(``, ``)`` and ``"`` are
    removed from the prediction, and only the first remaining character is
    compared (so ``"B. The yellow book ..."`` matches the target ``"(B)"``).

    A missing or empty prediction, or one made only of stripped punctuation
    (e.g. ``"."``), is scored as a miss instead of raising ``IndexError``.

    NOTE(review): this matcher is re-declared identically in several cells of
    this notebook; consider defining it once.
    """
    if not y_pred_i:
        return False
    cleaned_pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not cleaned_pred:
        return False
    return y_i.translate(str.maketrans("", "", "()")) == cleaned_pred[:1]

In [79]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 249839.41it/s]

(B), C.

(A), F.

(C), B. The yellow book is the second from the left.

(F), D.

(E), B.

(B), D.

(G), B.

(E), D.

(B), D.

(B), D.

(C), D.

(E), B.

(F), E.

(E), G.

(C), B.

(E), A.

(A), B.

(A), B. The blue jay is the third from the left.

(F), D.

(F), C.

(A), G.

(B), D.

(B), D.

(C), B.

(E), D.

(A), D.

(E), B.

(F), E.

(F), B.

(G), D.

(F), B.

(E), B.

(F), B.

(D), B.

(G), C.

(F), A.

(G), B.

(F), D.

(D), B.

(G), D.

(G), C.

(C), B.

(A), C.

(G), B.

(F), C.

(F), G.

(E), B.






0.812

# logical_deduction_three_objects

In [40]:
subset = 'logical_deduction_three_objects'

In [41]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-logical_deduction_three_objects/bbh-logical_deduction_three_objects_eval')

In [42]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [43]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1 - Core issue identification": {
        "Identify the primary ordering or ranking that needs to be determined": "Determine the order of the birds on the branch: blue jay, quail, and falcon."
    },
    "Step 2 - Underlying factors": {
        "Identify the given comparisons or relationships between the objects": "The falcon is to the right of the blue jay. The blue jay is to the right of the quail."
    },
    "Step 3 - Analytical problem check": {
        "Confirm if the problem is a logical ordering one that requires comparing and sequencing techniques": "Yes, the problem requires determining the sequence of the birds based on the given relationships."
    },
    "Step 4 - Problem decomposition": {
        "Break down the ordering problem into smaller, more manageable comparisons between two objects at a time": "1. Falcon is to the right of the blue jay. 2. Blue jay is to the right of the quail."
    },
    "Step 5 - Systematic solution testing": {
        "List

In [83]:
def bbh(y_i, y_pred_i):
    """Return True when the first character of the prediction is the target.

    Parentheses are removed from the target; ``.``, ``(``, ``)`` and ``"`` are
    removed from the prediction, and only the first remaining character is
    compared (so ``"(A) The hawk ..."`` matches the target ``"(A)"``).

    A missing or empty prediction, or one made only of stripped punctuation
    (e.g. ``"."``), is scored as a miss instead of raising ``IndexError``.

    NOTE(review): this matcher is re-declared identically in several cells of
    this notebook; consider defining it once.
    """
    if not y_pred_i:
        return False
    cleaned_pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not cleaned_pred:
        return False
    return y_i.translate(str.maketrans("", "", "()")) == cleaned_pred[:1]

In [84]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 249660.95it/s]

(A), B.

(C), B.

(A), B.

(A), B.

(C), B.

(B), C.

(C), B.

(C), B.

(A), B.

(C), B.

(C), B.

(A), C.

(A), B.

(C), B.

(C), B.

(C), B.

(C), B.

(C), B.

(A), B.

(C), (A) The hawk is the second from the left.

(C), B.

(A), B.

(A), B.

(A), B.

(A), B.

(A), B.






0.896

# movie_recommendation

In [44]:
subset = 'movie_recommendation'

In [45]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-movie_recommendation/bbh-movie_recommendation_eval')

In [46]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [47]:
print(dataset[0]["reasoning"])

```json
{
    "Design an experiment": {
        "Create a comparison method to identify similarities between movies and the given options": {
            "Method": "Compare movies based on genre, themes, and main characters."
        }
    },
    "Systematic ideation": {
        "List strategies to compare movies (e.g., genre, themes, actors) and apply them sequentially to find matches": {
            "Strategies": [
                "Compare genres",
                "Compare themes",
                "Compare main characters and their development"
            ]
        }
    },
    "Problem decomposition": {
        "Break down the movie comparison process into smaller tasks, like analyzing genre, themes, or critical acclaim": {
            "Tasks": [
                "Analyze the genre of each movie",
                "Identify the main themes of each movie",
                "Evaluate the main characters and their development"
            ]
        }
    },
    "Critical movie analysis":

In [88]:
def bbh(y_i, y_pred_i):
    """Return True when the first character of the prediction is the target.

    Parentheses are removed from the target; ``.``, ``(``, ``)`` and ``"`` are
    removed from the prediction, and only the first remaining character is
    compared (so ``"(D) Mystery."`` matches the target ``"(D)"``). A target
    that is not a single option letter can therefore never match.

    A missing or empty prediction, or one made only of stripped punctuation
    (e.g. ``"."``), is scored as a miss instead of raising ``IndexError``.

    NOTE(review): this matcher is re-declared identically in several cells of
    this notebook; consider defining it once.
    """
    if not y_pred_i:
        return False
    cleaned_pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not cleaned_pred:
        return False
    return y_i.translate(str.maketrans("", "", "()")) == cleaned_pred[:1]

In [89]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 124253.58it/s]

(C), B.

(C), B.

(A), B.

(A), B.

(A), (D) Mystery.

(A), B.

(C), B.

(C), A.

(C), B.

(D), C.

(C), B.

(D), E.

(D), (A) The Impostors.

(C), B.

(A), B.

(A), D.

(A), D.

(D), E.

(D), (C) Futurama Bender's Game.

(A), B.

(D), B.

(D), C.

(A), B.

(A), E.

Monsters, Inc, B.

(D), B.

(C), B.

(E), (A) The Firm.

(A), (C) Cleanskin.

(D), B.

(C), A.

(A), B.

(B), C.

(A), B.

(A), D.

(B), A.

(A), B.

(A), B.

(A), B.

(C), B.

(D), B.

(A), B.

(D), B.

(A), D.

(D), B.

(C), B.

(D), (A) The Village.

(A), D.

(A), B.

(C), B.

(A), B.

(A), B.

(A), B.

(C), B.

(C), B.

(A), B.

(D), B.

(A), B.

(C), B.

(D), B.

(C), B.

(A), C.

(A), B.

(D), B.

(D), B.

(A), B.

(A), B.

(A), B.

(D), C.

(B), C.

(A), B (Everlasting Piece).

(D), B.

(A), B.

(A), B.

(A), B.

(C), D.

(A), B.

(A), (D) Thunderbirds, as it is the closest match in terms of genre (action, sci-fi) and themes (adventure, thriller) to the given movies.

(A), B.

(A), B.






0.68

# penguins_in_a_table

In [48]:
subset = 'penguins_in_a_table'

In [49]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-penguins_in_a_table/bbh-penguins_in_a_table_eval')

In [50]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 146
})

In [51]:
print(dataset[0]["reasoning"])

```json
{
    "Identify Core Issue": {
        "Understand the key question that needs to be answered using the data in the tables.": "The key question is to identify the oldest penguin from the given table."
    },
    "Understand Underlying Factors": {
        "Identify the relevant columns or data points in the tables that will influence the solution to the problem.": "The relevant column is the 'age' column in the penguin table."
    },
    "Break Down the Problem": {
        "Divide the task into smaller steps, such as filtering, sorting, or aggregating the table data.": "1. Extract the age of each penguin. 2. Compare the ages to find the oldest penguin."
    },
    "Design an Experiment": {
        "Structure a query or test to extract the necessary information from the given tables to solve the problem.": "Extract the 'age' column and compare the values to find the maximum age."
    },
    "Gather Relevant Data": {
        "Extract specific columns or rows in the tables that pro

In [93]:
def bbh(y_i, y_pred_i):
    """Return True when the first character of the prediction is the target.

    Parentheses are removed from the target; ``.``, ``(``, ``)`` and ``"`` are
    removed from the prediction, and only the first remaining character is
    compared (so a free-text answer such as ``"2 penguins."`` is compared via
    its first character ``"2"``).

    A missing or empty prediction, or one made only of stripped punctuation
    (e.g. ``"."``), is scored as a miss instead of raising ``IndexError``.

    NOTE(review): this matcher is re-declared identically in several cells of
    this notebook; consider defining it once.
    """
    if not y_pred_i:
        return False
    cleaned_pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not cleaned_pred:
        return False
    return y_i.translate(str.maketrans("", "", "()")) == cleaned_pred[:1]

In [94]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 146it [00:00, 48658.59it/s]

(A), B.

(B), 2 penguins.

(A), B.

(A), B.

(A), B.

(A), B.

(A), B.

(A), B.

(B), E.

(A), D.

(A), B.

(B), C.

(D), B.

(A), B.

(A), B.






0.8972602739726028

# reasoning_about_colored_objects

In [52]:
subset = 'reasoning_about_colored_objects'

In [53]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-reasoning_about_colored_objects/bbh-reasoning_about_colored_objects_eval')

In [54]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [55]:
print(dataset[0]["reasoning"])

```json
{
    "Identify the given items and their properties": {
        "List the items mentioned in the problem": {
            "cat toys": {
                "mauve": 1,
                "purple": 2,
                "grey": 3,
                "burgundy": 3
            },
            "notebooks": {
                "mauve": 2,
                "grey": 3,
                "purple": 1
            }
        },
        "Identify the color of each item": {
            "cat toys": ["mauve", "purple", "grey", "burgundy"],
            "notebooks": ["mauve", "grey", "purple"]
        },
        "Identify the arrangement of the items": {
            "cat toys": "mixed on the floor",
            "notebooks": "mixed on the floor"
        }
    },
    "Determine the key piece of information or relationship": {
        "Identify the specific item or position in question": "grey objects",
        "Identify the criteria for the solution (e.g., furthest from, directly to the right of)": "remaining grey ob

In [9]:
def bbh(y_i, y_pred_i):
    """Return True when the first character of the prediction is the target.

    Parentheses are removed from the target; ``.``, ``(``, ``)`` and ``"`` are
    removed from the prediction, and only the first remaining character is
    compared (so ``"D."`` matches the target ``"(D)"``).

    A missing or empty prediction, or one made only of stripped punctuation
    (e.g. ``"."``), is scored as a miss instead of raising ``IndexError``.

    NOTE(review): this matcher is re-declared identically in several cells of
    this notebook; consider defining it once.
    """
    if not y_pred_i:
        return False
    cleaned_pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not cleaned_pred:
        return False
    return y_i.translate(str.maketrans("", "", "()")) == cleaned_pred[:1]

In [10]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 26975.10it/s]

(E), D.

(C), B.

(A), B.

(G), F.

(A), N.

(F), G.

(D), C.

(C), B.






0.968

# ruin_names

In [56]:
subset = 'ruin_names'

In [57]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-ruin_names/bbh-ruin_names_eval')

In [58]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [59]:
print(dataset[0]["reasoning"])

```json
{
    "Break Down the Problem": {
        "Identify the original name": "The original name is 'rain man'.",
        "Understand the types of edits made in each option": {
            "(A) ruin man": "Changes 'rain' to 'ruin'.",
            "(B) rains man": "Adds an 's' to 'rain'.",
            "(C) rain men": "Changes 'man' to 'men'.",
            "(D) rainmman": "Adds an extra 'm' to 'man'."
        },
        "Evaluate the humorous effect of each edit": {
            "(A) ruin man": "Creates a play on words with a negative connotation.",
            "(B) rains man": "Suggests multiple rains, which is grammatically incorrect and slightly humorous.",
            "(C) rain men": "Pluralizes 'man', which is straightforward and not particularly humorous.",
            "(D) rainmman": "Adds an extra 'm' to 'man', creating a humorous misspelling."
        }
    },
    "Generate and Test Ideas": {
        "Brainstorm various criteria for humor": "Criteria could include puns, wordplay

In [14]:
def bbh(y_i, y_pred_i):
    """Return True when the first character of the prediction is the target.

    Parentheses are removed from the target; ``.``, ``(``, ``)`` and ``"`` are
    removed from the prediction, and only the first remaining character is
    compared (so ``"(A) job division."`` matches the target ``"(A)"``). A
    target that is not a single option letter can therefore never match.

    A missing or empty prediction (``None`` / ``""``), or one made only of
    stripped punctuation (e.g. ``"."``), is scored as a miss instead of
    raising ``IndexError``.

    NOTE(review): this matcher is re-declared identically in several cells of
    this notebook; consider defining it once.
    """
    if not y_pred_i:
        return False
    cleaned_pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not cleaned_pred:
        return False
    return y_i.translate(str.maketrans("", "", "()")) == cleaned_pred[:1]

In [15]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 128000.00it/s]

(A), D.

(D), B.

(B), A.

(D), C.

(A), C.

(C), A.

(D), B.

(C), A.

(C), (A) job division.

(A), D.

(C), A.

(A), B.

(C), A.

(D), C.

(A), B.

(C), A.

(D), B.

(C), B.

(C), B.

(B), D.

rita, sue and bob poo, D.

(A), None

(C), A.

(B), D.

(A), C.

(B), D.

(A), B.

(B), D.

(D), B.

(C), None

(C), None

(A), None

(D), None

(B), C.

(B), A.

(B), C.

(C), B.

(B), C.

(C), A.

(A), B.

(D), B.

(B), D.

(D), A.

(A), B.

(B), C.

(B), D.

(D), C.

(D), C.

(B), C.

(A), B.

(C), A.

(C), A.

(D), C.

(C), B.

(B), D.

(A), D.

(D), C.

(B), C.

(A), C.

(D), B.

(B), C.

(A), D.

(C), D.

dearth, wind, & fire, G.






0.744

# salient_translation_error_detection

In [60]:
subset = 'salient_translation_error_detection'

In [61]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-salient_translation_error_detection/bbh-salient_translation_error_detection_eval')

In [62]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [63]:
print(dataset[0]["reasoning"])

```json
{
    "Identify key assumptions": {
        "Description": "The assumed rules and patterns in translation include maintaining the accuracy of names, titles, and roles. The translation should preserve the original meaning and context of the source text."
    },
    "Break down the problem": {
        "Description": "Segment the sentences into smaller components: names (Karl Borromäus Joseph Fürst von Liechtenstein), titles (kaiserlicher Feldmarschall), and roles (field marshal)."
    },
    "Critical Thinking": {
        "Description": "Analyze the translation from semantic and contextual perspectives. Question the accuracy of the title 'judicial field marshal' compared to 'kaiserlicher Feldmarschall'."
    },
    "Identify the core issue": {
        "Description": "The primary discrepancy is the change in the title from 'kaiserlicher Feldmarschall' to 'judicial field marshal'."
    },
    "Identify underlying causes": {
        "Description": "The error likely stems from a misu

In [19]:
def bbh(y_i, y_pred_i):
    """Return True when prediction ``y_pred_i`` matches BBH target ``y_i``.

    Parentheses are removed from the target; periods, parentheses and double
    quotes are removed from the prediction. The prediction is accepted when
    the cleaned strings are equal, or when the cleaned target equals the first
    character of the cleaned prediction, so a prediction that repeats the
    option text after the letter still matches its option.

    A missing/empty prediction, or one that is empty after cleaning (such as
    a lone period), counts as incorrect instead of raising IndexError.
    """
    if not y_pred_i:
        return False
    target = y_i.translate(str.maketrans("", "", "()"))
    pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not pred:
        return False
    # The full-string comparison keeps yes/no and free-text targets scoring
    # correctly even when this definition is the one in scope for later cells.
    return target == pred or target == pred[0]

In [20]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 78316.23it/s]

(A), D.

(F), D.

(F), D.

(A), D.

(F), D.

(C), (A) Modifiers or Adjectives.

(C), (A) Modifiers or Adjectives.

(F), D.

(F), D.

(B), F.

(D), E.

(C), (A) Modifiers or Adjectives.

(A), E.

(F), D.

(D), F.

(C), F.

(F), D.

(C), (A) Modifiers or Adjectives.

(F), (A) Modifiers or Adjectives.

(A), E.

(A), D.

(C), F.

(C), F.

(A), D.

(E), D.

(F), (A) Modifiers or Adjectives.

(C), B.

(A), F.

(A), D.

(C), (A) Modifiers or Adjectives.

(D), E.

(F), D.

(F), D.

(C), E.

(C), D.

(A), D.

(E), D.

(F), D.

(C), (A) Modifiers or Adjectives.

(A), E.

(F), (A) Modifiers or Adjectives.

(F), (A) Modifiers or Adjectives.

(C), F.

(E), B.

(A), E.

(C), D.

(C), D.

(E), D.

(F), D.

(D), (E) Dropped Content.

(E), B.

(F), C.

(F), D.

(A), D.

(A), E.

(A), E.

(B), D.

(C), (A) Modifiers or Adjectives.

(B), D.

(A), E.

(F), D.

(F), (A) Modifiers or Adjectives.

(A), E.

(E), F.

(A), D.

(C), (A) Modifiers or Adjectives.

(C), F.

(A), B.

(F), B.

(E), D.

(F), B.

(A), 




0.664

# sports_understanding

In [64]:
subset = 'sports_understanding'

In [65]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-sports_understanding/bbh-sports_understanding_eval')

In [66]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [67]:
print(dataset[0]["reasoning"])

```json
{
    "Identify the core issue or problem that needs to be addressed": {
        "Identify the key elements of the sentence that need to be evaluated for plausibility": {
            "The sentence involves a player named Elias Lindholm and the action of beating the buzzer."
        }
    },
    "Determine if there are relevant data or information that can provide insights into the problem": {
        "Identify specific sports rules, player positions, or common terminology that can help assess the sentence's plausibility": {
            "In sports like basketball and hockey, 'beating the buzzer' refers to scoring just before the end of a period."
        },
        "Identify available sports databases, glossaries, or expert analyses": {
            "Elias Lindholm is a known hockey player, and the term 'beating the buzzer' is commonly used in hockey."
        }
    },
    "Ascertain if the problem requires specific expertise or skill set": {
        "Determine if the plausibilit

In [25]:
set(dataset["answer_pred"]), set(dataset["target"])

({'False.',
  'The sentence "Mikal Bridges scored a windmill dunk" is plausible.',
  'The sentence is not plausible.',
  'The sentence is plausible but highly unlikely.',
  'The sentence is plausible but highly unusual.',
  'The sentence is plausible but not typical for Ramires.',
  'The sentence is plausible if Blake Snell was playing in a game where pitchers hit.',
  'The sentence is plausible if Darius Slayton played in the AFC Championship.',
  'The sentence is plausible.',
  'True.'},
 {'no', 'yes'})

In [26]:
# Raw predictions that should be read as "the sentence is plausible" -> "yes"
plausible_yes = [
    'True.',
    'The sentence "Mikal Bridges scored a windmill dunk" is plausible.',
    'The sentence is plausible but highly unlikely.',
    'The sentence is plausible but highly unusual.',
    'The sentence is plausible but not typical for Ramires.',
    'The sentence is plausible if Blake Snell was playing in a game where pitchers hit.',
    'The sentence is plausible if Darius Slayton played in the AFC Championship.',
    'The sentence is plausible.'
]

# Raw predictions that should be read as "the sentence is not plausible" -> "no"
implausible_no = [
    'False.',
    'The sentence is not plausible.'
]


def map_fn(ins):
    """Normalise one row's free-text prediction to the target labels "yes"/"no".

    A prediction listed in ``plausible_yes`` becomes "yes", one listed in
    ``implausible_no`` becomes "no"; anything else is returned unchanged.
    """
    prediction = ins["answer_pred"]

    if prediction in plausible_yes:
        label = "yes"
    elif prediction in implausible_no:
        label = "no"
    else:
        # Unknown phrasing: leave it untouched so it stays visible downstream.
        label = prediction

    return {"answer_pred": label}

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'no', 'yes'}

In [28]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 64086.05it/s]

no, yes

yes, no

yes, no

no, yes

yes, no

no, yes

no, yes

yes, no

yes, no

no, yes

yes, no

no, yes

yes, no

yes, no

yes, no

yes, no

yes, no

no, yes

yes, no

yes, no

no, yes

yes, no

no, yes

yes, no

no, yes

yes, no

yes, no

yes, no

no, yes

no, yes

no, yes






0.876

# temporal_sequences

In [68]:
subset = 'temporal_sequences'

In [69]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-temporal_sequences/bbh-temporal_sequences_eval')

In [70]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [71]:
print(dataset[0]["reasoning"])

```json
{
    "Identify the Time Window": {
        "Determine the primary time frame in question that needs to be determined": "The primary time frame in question is the time Susan could have gone to the coffee shop."
    },
    "Break Down the Day": {
        "Segment the day into smaller, manageable time blocks based on the given information": "The day can be segmented into the following time blocks: 7am to 11am, 11am to 12pm, 12pm to 1pm, 1pm to 2pm, 2pm to 6pm, 6pm to 9pm."
    },
    "Sequential Analysis": {
        "Analyze the given time blocks step by step": "Analyzing each time block: 7am to 11am (Susan was driving to the water park), 11am to 12pm (Susan was buying clothes at the mall), 12pm to 1pm (Susan was taking photos near the Eiffel Tower), 1pm to 2pm (Susan was buying lunch at the deli), 2pm to 6pm (Susan was reading at the library), 6pm to 9pm (Susan's whereabouts are not accounted for)."
    },
    "Relevant Observations": {
        "Identify the relevant sightings o

In [32]:
def bbh(y_i, y_pred_i):
    """Return True when prediction ``y_pred_i`` matches BBH target ``y_i``.

    Parentheses are removed from the target; periods, parentheses and double
    quotes are removed from the prediction. The prediction is accepted when
    the cleaned strings are equal, or when the cleaned target equals the first
    character of the cleaned prediction, so a prediction that repeats the
    option text after the letter still matches its option.

    A missing/empty prediction, or one that is empty after cleaning (such as
    a lone period), counts as incorrect instead of raising IndexError.
    """
    if not y_pred_i:
        return False
    target = y_i.translate(str.maketrans("", "", "()"))
    pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not pred:
        return False
    # The full-string comparison keeps yes/no and free-text targets scoring
    # correctly even when this definition is the one in scope for later cells.
    return target == pred or target == pred[0]

In [33]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 257762.05it/s]

(C), None of the options.






0.996

# tracking_shuffled_objects_five_objects

In [72]:
subset = 'tracking_shuffled_objects_five_objects'

In [73]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-tracking_shuffled_objects_five_objects/bbh-tracking_shuffled_objects_five_objects_eval')

In [74]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [75]:
print(dataset[0]["reasoning"])

```json
{
    "Break down the problem into sequential swaps": {
        "Identify each swap of items between pairs and track these exchanges step by step": {
            "1. Dave and Eve switch partners: Dave gets Melissa, Eve gets Lola",
            "2. Dave and Alice switch partners: Dave gets Patrick, Alice gets Melissa",
            "3. Eve and Alice switch partners: Eve gets Patrick, Alice gets Lola",
            "4. Claire and Bob switch partners: Claire gets Sam, Bob gets Jamie",
            "5. Dave and Alice switch partners: Dave gets Lola, Alice gets Patrick"
        }
    },
    "Critical Thinking for Tracking": {
        "Analyze the sequence of swaps from the perspective of each participant involved": {
            "Alice: Patrick -> Melissa -> Lola -> Patrick",
            "Bob: Sam -> Jamie",
            "Claire: Jamie -> Sam",
            "Dave: Lola -> Melissa -> Patrick -> Lola",
            "Eve: Melissa -> Lola -> Patrick"
        },
        "Question and verify the

In [37]:
def bbh(y_i, y_pred_i):
    """Return True when prediction ``y_pred_i`` matches BBH target ``y_i``.

    Parentheses are removed from the target; periods, parentheses and double
    quotes are removed from the prediction. The prediction is accepted when
    the cleaned strings are equal, or when the cleaned target equals the first
    character of the cleaned prediction, so a prediction that repeats the
    option text after the letter still matches its option.

    A missing/empty prediction, or one that is empty after cleaning (such as
    a lone period), counts as incorrect instead of raising IndexError.
    """
    if not y_pred_i:
        return False
    target = y_i.translate(str.maketrans("", "", "()"))
    pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not pred:
        return False
    # The full-string comparison keeps yes/no and free-text targets scoring
    # correctly even when this definition is the one in scope for later cells.
    return target == pred or target == pred[0]

In [38]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 255937.52it/s]

(C), B.

(B), E.

(B), (A) red ball.

(C), D.

(E), (D) orange ball.

(D), A.

(D), (A) brown present.

(E), B.

(A), C.

(D), B.

(E), A.

(D), E.

(E), (C) white ball.

(B), C.

(A), C.

(D), C.

(C), (A) brown present.

(C), D.

(B), C.

(A), B.

(C), E.

(E), D.

(D), C.

(E), C.

(A), C.

(C), A.

(C), D.

(E), D.

(B), E.

(B), E.

(C), D.

(B), D.

(E), (D) Patrick.

(E), (B) red present.

(E), D.

(E), D.

(A), B.

(E), (A) benchwarmer.

(B), D.

(A), C.

(A), B.

(E), D.

(A), D.

(B), C.

(D), (A) orange ball.

(D), C.

(A), C.

(A), (E) striker.

(B), E.

(D), B.

(E), C.

(D), (A) green present.

(C), B.

(A), B.

(E), D.

(D), (A) striker.

(D), C.

(A), B.

(C), B.

(D), B.

(D), (C) green present.






0.756

# tracking_shuffled_objects_seven_objects

In [76]:
subset = 'tracking_shuffled_objects_seven_objects'

In [77]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-tracking_shuffled_objects_seven_objects/bbh-tracking_shuffled_objects_seven_objects_eval')

In [78]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [79]:
print(dataset[0]["reasoning"])

```json
{
    "Break down the problem": {
        "Identify the initial state of each person and their respective partner or item.": {
            "Alice": "striker",
            "Bob": "right winger",
            "Claire": "left winger",
            "Dave": "benchwarmer",
            "Eve": "goalkeeper",
            "Fred": "center midfielder",
            "Gertrude": "cheerleader"
        },
        "List each swap or trade chronologically.": [
            {"Eve": "goalkeeper", "Claire": "left winger"},
            {"Gertrude": "cheerleader", "Alice": "striker"},
            {"Fred": "center midfielder", "Bob": "right winger"},
            {"Dave": "benchwarmer", "Fred": "center midfielder"},
            {"Fred": "center midfielder", "Bob": "right winger"},
            {"Bob": "right winger", "Eve": "goalkeeper"},
            {"Claire": "left winger", "Alice": "striker"}
        ]
    },
    "Identify the core question": {
        "Determine who or what is the focus at the end of the

In [42]:
def bbh(y_i, y_pred_i):
    """Return True when prediction ``y_pred_i`` matches BBH target ``y_i``.

    Parentheses are removed from the target; periods, parentheses and double
    quotes are removed from the prediction. The prediction is accepted when
    the cleaned strings are equal, or when the cleaned target equals the first
    character of the cleaned prediction, so a prediction that repeats the
    option text after the letter still matches its option.

    A missing/empty prediction, or one that is empty after cleaning (such as
    a lone period), counts as incorrect instead of raising IndexError.
    """
    if not y_pred_i:
        return False
    target = y_i.translate(str.maketrans("", "", "()"))
    pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not pred:
        return False
    # The full-string comparison keeps yes/no and free-text targets scoring
    # correctly even when this definition is the one in scope for later cells.
    return target == pred or target == pred[0]

In [43]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 42652.78it/s]

(F), (G) Izzi.

(E), D.

(A), E.

(G), B.

(D), (F) right winger.

(A), E.

(G), F.

(F), D.

(C), D.

(F), B.

(A), C.

(C), B.

(A), B.

(E), D.

(C), B.

(B), D.

(G), C.

(A), B.

(E), F.

(D), B.

(A), C.

(F), C.

(C), G.

(C), D.

(G), F.

(E), B.

(F), D.

(F), G.

(A), F.

(A), B.

(G), B.

(E), D.

(F), D.

(F), E.

(G), B.

(A), C.

(A), B.

(C), B.

(E), (G) Frankenstein.

(G), D.

(E), D.

(C), B.

(D), G.

(F), (A) benchwarmer.

(G), B.

(E), D.

(F), B.

(B), D.

(C), D.

(A), C.

(A), B.

(E), B.

(B), G.

(D), B.

(C), D.

(C), D.

(G), D.

(F), B.

(A), D.

(A), G.






0.76

# tracking_shuffled_objects_three_objects

In [80]:
subset = 'tracking_shuffled_objects_three_objects'

In [81]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-tracking_shuffled_objects_three_objects/bbh-tracking_shuffled_objects_three_objects_eval')

In [82]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [83]:
print(dataset[0]["reasoning"])

```json
{
    "Step 1: Initial Setup": {
        "Identify the initial state of each participant (e.g., Alice, Bob, Claire) and their respective partners or items.": "Alice has Ulysses, Bob has Frankenstein, Claire has Lolita."
    },
    "Step 2: Process Deconstruction": {
        "Break down the partner swapping or item exchange process into individual steps to analyze the problem more closely.": "1. Claire and Bob swap books. 2. Bob and Alice swap books. 3. Claire and Bob swap books."
    },
    "Step 3: Step-by-Step Tracking": {
        "Track the partners or items step by step through each swap or exchange.": [
            "After the first swap (Claire and Bob): Alice has Ulysses, Bob has Lolita, Claire has Frankenstein.",
            "After the second swap (Bob and Alice): Alice has Lolita, Bob has Ulysses, Claire has Frankenstein.",
            "After the third swap (Claire and Bob): Alice has Lolita, Bob has Frankenstein, Claire has Ulysses."
        ]
    },
    "Step 4: Persp

In [47]:
def bbh(y_i, y_pred_i):
    """Return True when prediction ``y_pred_i`` matches BBH target ``y_i``.

    Parentheses are removed from the target; periods, parentheses and double
    quotes are removed from the prediction. The prediction is accepted when
    the cleaned strings are equal, or when the cleaned target equals the first
    character of the cleaned prediction, so a prediction that repeats the
    option text after the letter still matches its option.

    A missing/empty prediction, or one that is empty after cleaning (such as
    a lone period), counts as incorrect instead of raising IndexError.
    """
    if not y_pred_i:
        return False
    target = y_i.translate(str.maketrans("", "", "()"))
    pred = y_pred_i.translate(str.maketrans("", "", '.()"'))
    if not pred:
        return False
    # The full-string comparison keeps yes/no and free-text targets scoring
    # correctly even when this definition is the one in scope for later cells
    # (web_of_lies "Yes"/"No" and the space-joined word_sorting answers).
    return target == pred or target == pred[0]

In [48]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 128250.49it/s]

(A), B.

(C), B.

(C), B.

(C), B.

(A), C.

(C), B.

(C), (A) Jamie."

(C), A.

(B), A.

(C), B.






0.96

# web_of_lies

In [84]:
subset = 'web_of_lies'

In [85]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-web_of_lies/bbh-web_of_lies_eval')

In [86]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [87]:
print(dataset[0]["reasoning"])

```json
{
    "Break down the sequence of statements": {
        "Identify each person's statement and their implications one at a time.": [
            "Sherrie tells the truth.",
            "Vernell says Sherrie tells the truth.",
            "Alexis says Vernell lies.",
            "Michaela says Alexis tells the truth.",
            "Elanor says Michaela tells the truth."
        ]
    },
    "Evaluate each statement's truth value": {
        "Make a list of possibilities (true or false) for each person's statement, starting with the initial person mentioned, and propagate the implications down the chain.": [
            "If Sherrie tells the truth, then Vernell's statement that Sherrie tells the truth is also true.",
            "If Vernell tells the truth, then Alexis's statement that Vernell lies is false.",
            "If Alexis's statement is false, then Michaela's statement that Alexis tells the truth is also false.",
            "If Michaela's statement is false, then Elan

In [53]:
set(dataset["answer_pred"]), set(dataset["target"])

({'Amberly tells the truth.',
  'Andree tells the truth.',
  'Christie tells the truth.',
  'Conception tells the truth.',
  'Delbert tells the truth.',
  'Delfina tells the truth.',
  'False.',
  'Maybelle tells the truth.',
  'Michaela does not tell the truth.',
  'Millie tells the truth.',
  'Shalonda tells the truth.',
  'Sima tells the truth.',
  'True.'},
 {'No', 'Yes'})

In [55]:
# Raw predictions meaning "the person tells the truth" -> "Yes"
truth_yes = [
    'True.',
    'Amberly tells the truth.',
    'Andree tells the truth.',
    'Christie tells the truth.',
    'Conception tells the truth.',
    'Delbert tells the truth.',
    'Delfina tells the truth.',
    'Maybelle tells the truth.',
    'Millie tells the truth.',
    'Shalonda tells the truth.',
    'Sima tells the truth.'
]


# Raw predictions meaning "the person does not tell the truth" -> "No"
false_no = [
    'False.',
    'Michaela does not tell the truth.'
]


def map_fn(ins):
    """Normalise one row's free-text prediction to the target labels "Yes"/"No".

    A prediction listed in ``truth_yes`` becomes "Yes", one listed in
    ``false_no`` becomes "No"; any other value is passed through unchanged.
    """
    raw = ins["answer_pred"]

    # Checked in the same order as before: affirmative phrases first.
    for label, phrases in (("Yes", truth_yes), ("No", false_no)):
        if raw in phrases:
            return {"answer_pred": label}

    return {"answer_pred": raw}

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'No', 'Yes'}

In [56]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 12544.73it/s]

No, Yes

No, Yes






0.992

# word_sorting

In [88]:
subset = 'word_sorting'

In [89]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

WindowsPath('d:/Surge/self-discover/evals/logs/mistral/phaseII/bbh/bbh-word_sorting/bbh-word_sorting_eval')

In [90]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [91]:
print(dataset[0]["reasoning"])

```json
{
    "Identify the core task": {
        "Identify the core task: sorting words alphabetically": "The core task is to sort the words 'syndrome' and 'therefrom' alphabetically."
    },
    "Analyze the list from different perspectives": {
        "Analyze the list from different perspectives (e.g., word length, prefixes, suffixes) to identify the best sorting approach": "The words are 'syndrome' and 'therefrom'. Analyzing word length, 'syndrome' has 8 letters and 'therefrom' has 9 letters. Analyzing initial letters, 'syndrome' starts with 's' and 'therefrom' starts with 't'."
    },
    "Identify relevant data": {
        "Identify relevant data (e.g., word length, initial letters) and how it can be used to optimize sorting": "The relevant data is the initial letters of the words. 'syndrome' starts with 's' and 'therefrom' starts with 't'."
    },
    "Generate a list of sorting algorithms": {
        "Generate a list of sorting algorithms and apply each one to the list to see 

In [116]:
answer_pred_list = [x.translate(str.maketrans("", "", ".'")) for x in dataset["answer_pred"] if x and '[' in x]
len(answer_pred_list)

18

In [117]:
answer_pred_list[0].translate(str.maketrans("", "", "[]")).split(", ")

['barn',
 'delmarva',
 'damp',
 'dot',
 'drumhead',
 'embezzle',
 'entirety',
 'guru',
 'greene',
 'it&t',
 'malton',
 'obstetric',
 'onus',
 'panicking',
 'prod',
 'same',
 'scorch',
 'splutter',
 'subsist',
 'thrill']

In [118]:
set(dataset["answer_pred"])

{'"bengal", "fettle", "yeager".',
 '"bootlegging", "indifferent", "trainman".',
 '"novelty", "rectitude", "splashy".',
 None,
 '["abutted", "agamemnon", "aquatic", "capacity", "casualty", "essex", "guinea", "hitachi", "hondo", "islamic", "loosen", "loquacious", "niece", "planet", "roadway", "solstice", "steed", "suspicion", "tibet"].',
 '["advent", "anger", "convoy", "deliver", "filly", "gneiss", "grocer", "hessian", "hotbox", "landau", "marlborough", "ninebark", "plat", "platelet", "pyrotechnic", "siemens", "stapleton", "treadle", "transitive", "uncle"].',
 '["anarchic", "bstj", "elution", "exhumation", "furl", "geld", "gradual", "j", "liniment", "locomote", "midshipman", "pantheist", "profess", "riddance", "rowley", "saline"].',
 '["animism", "awash", "beau", "bessie", "cream", "exricable", "helical", "indoeuropean", "pendulum", "sanhedrin", "scratchy", "venezuela", "vice"].',
 '["auerbach", "deoxyribose", "decor", "devisee", "dianne", "hodges", "incommensurable", "motorcade", "strat

In [120]:
# Print the raw trajectory of every row that has no parsed answer, to inspect
# how the final answer is phrased in those responses.
for instance in dataset.filter(lambda x: x["answer_pred"] is None):
    print(instance["trajectory"])

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

```json
{
    "Identify the core task": {
        "Identify the core task: sorting words alphabetically": {}
    },
    "Analyze the list from different perspectives": {
        "Analyze the list from different perspectives (e.g., word length, prefixes, suffixes) to identify the best sorting approach": {
            "word length": "Varies",
            "prefixes": "Various",
            "suffixes": "Various"
        }
    },
    "Identify relevant data": {
        "Identify relevant data (e.g., word length, initial letters) and how it can be used to optimize sorting": {
            "initial letters": "b, s, g, b, i, e, r, f, d, j, d, g, c, p"
        }
    },
    "Generate a list of sorting algorithms": {
        "Generate a list of sorting algorithms and apply each one to the list to see which is most effective": {
            "algorithms": ["Bubble Sort", "Quick Sort", "Merge Sort"]
        }
    },
    "Explore creative sorting methods": {
        "Explore creative sorting methods, 

In [121]:
import re

def map_fn(ins):
    """Recover a missing ``answer_pred`` from the row's ``trajectory`` text.

    Rows that already have an answer are returned unchanged. Otherwise the
    trajectory is split at the first "The final answer is:" marker line:
    everything after it (backticks removed, whitespace stripped) becomes the
    answer, and everything before it (stripped) becomes the new trajectory.
    If the trajectory is not a string or has no marker, the answer stays
    ``None`` and the trajectory is left as it was.

    Returns:
        dict with the keys "trajectory" and "answer_pred".
    """
    if ins["answer_pred"] is not None:
        return {
            "trajectory": ins["trajectory"],
            "answer_pred": ins["answer_pred"]
        }

    text = "The final answer is:\n"
    response = ins["trajectory"]

    # Explicit checks replace the former bare ``except:`` so unrelated errors
    # are no longer silently swallowed.
    if not isinstance(response, str) or text not in response:
        return {
            "trajectory": response,
            "answer_pred": None
        }

    # Split once at the first marker so a multi-line answer is removed from
    # the trajectory in full, not just its first line.
    reasoning, _, tail = response.partition(text)
    answer = tail.translate(str.maketrans("", "", "`")).strip()

    return {
        "trajectory": reasoning.strip(),
        "answer_pred": answer
    }

dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [122]:
# Sanity check: after the recovery step no row should be left without an answer.
dataset.filter(lambda x: x["answer_pred"] is None)

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 0
})

In [131]:
def _strip_list_marker(line):
    """Return the word following a leading list marker ("3 apple", "- apple")."""
    tokens = line.split()
    # "<marker> <word>" yields the word; a lone token is kept as it is instead
    # of raising IndexError.
    return tokens[1] if len(tokens) > 1 else line.strip()


def map_fn(ins):
    """Normalise a word_sorting prediction to a single space-separated string.

    Steps:
      1. Decode backslash escape sequences written out literally in the text
         and drop every period.
      2. Bracketed lists and comma-separated lists: remove brackets and double
         quotes, split on ", ", strip a leading/trailing single quote from
         each item, and join with spaces.
      3. Otherwise, if the text contains "9" or "-" (treated here as a sign of
         a numbered or dashed one-item-per-line list): keep the second token
         of each non-blank line and join with spaces.
      4. Anything else is returned as cleaned in step 1.

    A ``None`` prediction is passed through unchanged.
    """
    raw = ins["answer_pred"]
    if raw is None:
        return {
            "answer_pred": None
        }

    try:
        # latin-1 + backslashreplace round-trips non-ASCII characters, which a
        # plain UTF-8 encode followed by unicode_escape would turn into mojibake.
        unescaped = raw.encode("latin-1", "backslashreplace").decode("unicode_escape")
    except UnicodeDecodeError:
        # Malformed escape (e.g. a trailing backslash): keep the text as-is.
        unescaped = raw

    answer_pred = unescaped.replace('.', '')
    refined_answer = answer_pred

    if "[" in answer_pred:
        refined_answer = " ".join([re.sub(r"^'|'$", "", word) for word in answer_pred.translate(str.maketrans("", "", "[]")).replace('"', "").split(", ")])
    elif "," in answer_pred:
        refined_answer = " ".join([re.sub(r"^'|'$", "", word) for word in answer_pred.replace('"', "").split(", ")])
    elif "9" in answer_pred or "-" in answer_pred:
        refined_answer = " ".join(
            _strip_list_marker(line)
            for line in answer_pred.split("\n")
            if line.strip()  # blank lines carry no word
        )

    return {
        "answer_pred": refined_answer
    }


dataset = dataset.map(map_fn)

In [133]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 250it [00:00, 42313.71it/s]

barn damp delmarva dot drumhead embezzle entirety greene guru it&t malton obstetric onus panicking prod same scorch splutter subsist thrill, barn delmarva damp dot drumhead embezzle entirety guru greene it&t malton obstetric onus panicking prod same scorch splutter subsist thrill

citrus cloudy euclidean fight hobby invite majestic scene stonehenge surge thrifty winsome, cloudy citrus euclidean fight hobby invite majestic scene stonehenge surge thrifty winsome

administer aeneid coachman decadent delhi dey gradate grim jacky littleneck phosphorescent pristine shrunk sinh systemwide tasting thrown torpedo verdict, aeneid administer coachman decadent dey delhi gradate grim jacky littleneck phosphorescent pristine shrunk sinh systemwide tasting thrown torpedo verdict

bivalve mainstream malformed mortify o'connell paunchy sleuth twelvefold umbilical vinegar, bivalve malformed mainstream mortify o'connell paunchy sleuth twelvefold umbilical vinegar

allotted fate figural gorky grapple hydr




0.788

In [136]:
dataset.save_to_disk(os.path.join(os.path.dirname(path), "bbh-word_sorting_eval_refined"))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]