In [1]:
import os
import re
from pyprojroot import here
from datasets import Dataset

In [2]:
base_path = os.path.join("evals", "logs", "self_discover", "llama", "phaseII", "bbh")

In [3]:
from tqdm.auto import tqdm


t4d = (
    lambda y_i, y_pred_i: y_pred_i
    and y_i in y_pred_i
    and y_i == str(y_pred_i.translate(str.maketrans("", "", ".'"))[2:])
)
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))

In [4]:
def calculate_correct_prediction_count(benchmark, y: list[str], y_pred: list[str]):
    correct_preds = 0
    for y_i, y_pred_i in tqdm(zip(y, y_pred), desc="Calculating..."):
        if benchmark == "t4d":
            eval_fn = t4d
        elif benchmark == "bbh":
            eval_fn = bbh

        if eval_fn(y_i, y_pred_i):
            correct_preds += 1
        else:
            print(f"{'<' * 20}START{'>' * 20}")
            print(f"{y_i}\n{'-' * 100}\n{y_pred_i}")
            print(f"{'<' * 20}END{'>' * 20}", end="\n\n")
    return correct_preds

# dyck_languages

In [8]:
subset = 'dyck_languages'

In [10]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/self_discover/llama/phaseII/bbh/bbh-dyck_languages/bbh-dyck_languages_eval')

In [11]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [12]:
print(dataset[0]["reasoning"])

Here is the filled JSON for the given reasoning structure:

```
{
  "Step 1: Break Down the Brackets": {
    "Divide the sequence into smaller parts": "The given sequence is { ( < [ < > ]. We can isolate individual parentheses and brackets as follows: {, (, <, [, <, >, ]"
  },
  "Step 2: Simplify the Sequence": {
    "Eliminate unnecessary elements": "Upon examination, we see that the sequence contains alternating opening and closing brackets. We can simplify the sequence by focusing on the essential structure: {, (, <, [, <, >, ]"
  },
  "Step 3: Critical Sequence Analysis": {
    "Analyze the sequence from different perspectives": "Looking at the sequence, we notice that the opening brackets are in the order {, (, <, [. We should question whether this order is necessary for the sequence. We also notice that the closing brackets are in the order >, ]. This might indicate that the sequence requires a specific order for the closing brackets."
  },
  "Step 4: Systems Thinking for Sequenc

In [13]:
def map_fn(ins):
    find = "Input: "
    index = ins["input"].find(find)
    
    return {
        "target": ins["input"][index + len(find):] + " " + ins["target"]
    }

dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [14]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()\"")
).replace(" ", "") == y_pred_i.translate(str.maketrans("", "", '.(),"')).replace(" ", "")

In [15]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

<<<<<<<<<<<<<<<<<<<<START>>>>>>>>>>>>>>>>>>>>
{ ( < [ < > ] > ) }
----------------------------------------------------------------------------------------------------
{ ( < [ < > ] ) }.
<<<<<<<<<<<<<<<<<<<<END>>>>>>>>>>>>>>>>>>>>

<<<<<<<<<<<<<<<<<<<<START>>>>>>>>>>>>>>>>>>>>
{ ( < > ) } ( ( [ ] ) < [ ( [ [ ] ] [ { } ] { } [ < { [ ] } > ] ( ) ) ] > )
----------------------------------------------------------------------------------------------------
{ } ( ( ) ) < ( ) ( ).
<<<<<<<<<<<<<<<<<<<<END>>>>>>>>>>>>>>>>>>>>

<<<<<<<<<<<<<<<<<<<<START>>>>>>>>>>>>>>>>>>>>
< ( < { [ { } < ( { ( < < < { [ ( [ ( [ { { < [ { } < ( ) > ] > } } ] ) ] ) ] } > < > > ( ( < { } > ) ) > ) } ) > ] } > ) >
----------------------------------------------------------------------------------------------------
< ( < { [ { } < ( { ( < { [ ( [ { { < [ { } < ( ) > ] > } } ] ) ] } > < > > ( ( < { } > ) ) > ) } ) > ] } > ).
<<<<<<<<<<<<<<<<<<<<END>>>>>>>>>>>>>>>>>>>>

<<<<<<<<<<<<<<<<<<<<START>>>>>>>>>>>>>>>>>>>>
< { [

0.436

# sports_understanding

In [5]:
subset = 'sports_understanding'

In [11]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/self_discover/llama/phaseII/bbh/bbh-sports_understanding/bbh-sports_understanding_eval')

In [12]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [13]:
print(dataset[0]["reasoning"])

```
{
    Identify key assumptions: {
        Identify the sport being referred to in the sentence: Football (American),
        Identify the specific action being described in the sentence: Catching a screen pass,
        Identify the key elements such as the person, action, and sport: Tyreek Hill, catching, screen pass, football,
    },
    Simplify the sentence: {
        Remove any unnecessary words or phrases from the sentence: None,
        Identify the main verb and subject of the sentence: caught (verb), Tyreek Hill (subject),
    },
    Analyze from different perspectives: {
        Evaluate the sentence based on the rules of the sport: In football, a screen pass is a type of play where the quarterback throws a short pass to a receiver who is being blocked by linemen. This is a common and allowed play in football.,
        Evaluate the sentence based on the abilities of the person mentioned: Tyreek Hill is a professional football player known for his speed and agility, making 

In [14]:
set(dataset["answer_pred"]), set(dataset["target"])

({'False.',
  'No.',
  'Not very plausible.',
  'Plausible but uncertain.',
  'Plausible.',
  'Somewhat plausible, but unlikely.',
  'True.',
  'Unlikely.'},
 {'no', 'yes'})

In [17]:
# Plausible (Yes)
plausible_yes = [
    'Plausible.',
    'True.',
]

# Implausible (No)
implausible_no = [
    'No.',
    'Not very plausible.',
    'False.',
    'Unlikely.'
]

indeterminate = [
    'Plausible but uncertain.',
    'Somewhat plausible, but unlikely.',
]


def map_fn(ins):
    for yes in plausible_yes:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "yes"
            }

    for no in implausible_no:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "no"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'Plausible but uncertain.', 'Somewhat plausible, but unlikely.', 'no', 'yes'}

In [18]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

no, yes

yes, no

no, yes

no, yes

yes, no

no, yes

no, Somewhat plausible, but unlikely.

no, yes

no, yes

no, yes

no, yes

yes, no

no, yes

no, yes

no, yes

no, yes

no, yes

no, yes

no, yes

yes, no

no, yes

no, yes

no, yes

yes, no

no, yes

yes, Plausible but uncertain.

no, yes

no, yes

no, yes

yes, no

yes, no

yes, no

no, yes



0.868

# web_of_lies

In [19]:
subset = 'web_of_lies'

In [20]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/self_discover/llama/phaseII/bbh/bbh-web_of_lies/bbh-web_of_lies_eval')

In [21]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [22]:
print(dataset[0]["reasoning"])

```
{
    "Step 1: Identify the Target Person": {
        "Identify the person whose truthfulness needs to be determined": "Elanor"
    },
    "Step 2: Analyze the Chain of Statements": {
        "Break down the chain of statements into smaller parts": [
            "Raymond tells the truth",
            "Sal says Raymond lies",
            "Alexis says Sal lies",
            "Helene says Alexis lies",
            "Elanor says Helene lies"
        ],
        "Identify the relationships between each pair of people": {
            "Raymond and Sal": "opposite",
            "Sal and Alexis": "opposite",
            "Alexis and Helene": "opposite",
            "Helene and Elanor": "opposite"
        }
    },
    "Step 3: Simplify the Problem": {
        "Identify the key statements and relationships between people": [
            "Raymond tells the truth",
            "Sal and Alexis have opposite statements",
            "Alexis and Helene have opposite statements",
            "Helene an

In [23]:
set(dataset["answer_pred"]), set(dataset["target"])

({'Alexis tells the truth.',
  'False.',
  'Lorine lies.',
  'No.',
  'True.',
  'Vina tells the truth.',
  'Yes.'},
 {'No', 'Yes'})

In [24]:
# Truth (Yes)
truth_yes = [
    'Alexis tells the truth.',
    'True.',
    'Vina tells the truth.',
    'Yes.'
]


# False (No)
false_no = [
    'False.',
    'Lorine lies.',
    'No.',
]


def map_fn(ins):
    for yes in truth_yes:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "Yes"
            }

    for no in false_no:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "No"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'No', 'Yes'}

In [25]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

Yes, No

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes



0.912

# word_sorting

In [5]:
subset = 'word_sorting'

In [6]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/self_discover/llama/phaseII/bbh/bbh-word_sorting/bbh-word_sorting_eval')

In [7]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [8]:
print(dataset[0]["reasoning"])

```
{
    Identify the list of words to be sorted: {
        Extract the list of words from the task description: The list of words to be sorted is "slurp", "raytheon", and "gloucester".
    },
    Determine the primary goal or objective: {
        Identify the sorting criterion, which is alphabetical order: The primary goal is to arrange the words in alphabetical order.
    },
    Simplify the list of words: {
        Identify common characteristics, such as prefixes or suffixes: There are no common prefixes or suffixes among the words.
    },
    Break down the list of words into smaller groups or categories: {
        Group words starting with the same letter: The words can be grouped as follows: 
            - "g": gloucester
            - "r": raytheon
            - "s": slurp
    },
    Compare each word to the ones that come before and after it: {
        Iterate through the list of words step by step to ensure correct order: 
            1. gloucester comes before raytheon
    

In [9]:
answer_pred_list = [x.translate(str.maketrans("", "", "-.'\"")) for x in dataset["answer_pred"] if x and '[' in x]
len(answer_pred_list)

36

In [10]:
answer_pred_list[0].translate(str.maketrans("", "", "[]")).split(", ")

['asset',
 'bona',
 'cicero',
 'coastal',
 'dusky',
 'exonerate',
 'gaussian',
 'handlebar',
 'inhabitation',
 'portfolio',
 'purport',
 'rastus',
 'responsible',
 'ruanda',
 'silver',
 'zig']

In [11]:
set(dataset["answer_pred"])

{'',
 '"clytemnestra", "crag", "cutover", "diocletian", "dickson", "electrolytic", "inhuman", "lipton", "marginal", "scrawny", "stalk", "thereupon", "took", "wife", "wireman", "workplace".',
 "- accelerate, bauer, county, nail, nominee, o'connell, phony, poole, putnam, quantify, raisin, venice.",
 '["abc", "ada", "austere", "blend", "cankerworm", "falcon", "flamboyant", "gag", "grecian", "hanukkah", "indicate", "kruger", "lobster", "militia", "nobody", "pierson", "quad", "right", "ron", "wildcat"].',
 '["abdominal", "address", "berry", "bounty", "effusive", "fomalhaut", "hanoverian", "involve", "islamabad", "jordan", "optimal", "pay", "stearic", "stigmata", "swathe", "tattoo", "them", "tornado", "yang"].',
 '["abramson", "bangui", "carlisle", "cavalier", "contextual", "dustbin", "emacs", "implementor", "islamabad", "magistrate", "nudge", "picnicking", "railway", "refractory", "silvery", "waite"].',
 '["absorption", "aristocratic", "bermuda", "cesium", "cheerful", "congo", "diagram", "e

In [12]:
dataset.filter(lambda x: x["answer_pred"] == '')

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 7
})

In [13]:
import re 

def map_fn(ins):
    answer_pred = ins.get("answer_pred")
    reasoning = ins.get("reasoning", "")

    if reasoning is None:
        reasoning = ""

    if answer_pred is None or answer_pred == '':
        marker = "The final answer is:"

        marker_index = reasoning.find(marker)

        if marker_index != -1:
            trajectory = reasoning[:marker_index]
            answer = reasoning[marker_index + len(marker):]
   
            cleaned_answer = answer.replace("`", "").strip()
            
            cleaned_trajectory = trajectory.strip()

            if not cleaned_answer:
                 cleaned_answer = None

            return {
                "trajectory": cleaned_trajectory,
                "answer_pred": cleaned_answer
            }
        else:
            return {
                "trajectory": reasoning.strip(),
                "answer_pred": None
            }
    else:
        return ins

dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [14]:
dataset.filter(lambda x: x["answer_pred"] == '')

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 0
})

In [15]:
def map_fn(ins):
    if ins["answer_pred"] == None:
        return {
            "answer_pred": ins["answer_pred"]
        }
        
    answer_pred = ins["answer_pred"].encode().decode('unicode_escape').replace('.', '')
    refined_answer = answer_pred
    
    try:
        if "[" in answer_pred:
            refined_answer = " ".join([re.sub(r"^'|'$", "", word) for word in answer_pred.translate(str.maketrans("", "", "[]")).replace('"', "").split(", ")])
        elif "," in answer_pred:
            refined_answer = " ".join([re.sub(r"^'|'$", "", word) for word in answer_pred.replace('"', "").split(", ")])
        elif "1" in answer_pred:
            refined_answer = " ".join(pair.split(" ")[1] for pair in answer_pred.split("\n"))
        elif "-" in answer_pred:
            refined_answer = " ".join(pair.split(" ")[1] for pair in answer_pred.split("\n"))
        else:
            refined_answer = " ".join(answer_pred.split("\n"))
    except Exception:
        refined_answer = answer_pred
        
    return {
        "answer_pred": refined_answer.lower()
    }


dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [17]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"]) + 1) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

<<<<<<<<<<<<<<<<<<<<START>>>>>>>>>>>>>>>>>>>>
confess croupier daffy dockyard duty household hypothesis info loam mandate mantic minstrelsy nepotism peccary sawtimber serenade silver summate triode
----------------------------------------------------------------------------------------------------
confess croupier daffy dockyard duty household hypothesis info loam mantic mandate minstrelsy nepotism peccary sawtimber serenade silver summate triode
<<<<<<<<<<<<<<<<<<<<END>>>>>>>>>>>>>>>>>>>>

<<<<<<<<<<<<<<<<<<<<START>>>>>>>>>>>>>>>>>>>>
clytemnestra crag cutover dickson diocletian electrolytic inhuman lipton marginal scrawny stalk thereupon took wife wireman workplace
----------------------------------------------------------------------------------------------------
clytemnestra crag cutover diocletian dickson electrolytic inhuman lipton marginal scrawny stalk thereupon took wife wireman workplace
<<<<<<<<<<<<<<<<<<<<END>>>>>>>>>>>>>>>>>>>>

<<<<<<<<<<<<<<<<<<<<START>>>>>>>>>>>>>>>>>>>

0.912

In [18]:
dataset.save_to_disk(os.path.join(os.path.dirname(path), "bbh-word_sorting_eval_refined"))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]