In [35]:
import os
import re
from pyprojroot import here
from datasets import Dataset

In [10]:
base_path = os.path.join("evals", "logs", "self_discover", "llama", "phaseII", "bbh")

In [3]:
from tqdm.auto import tqdm


t4d = (
    lambda y_i, y_pred_i: y_pred_i
    and y_i in y_pred_i
    and y_i == str(y_pred_i.translate(str.maketrans("", "", ".'"))[2:])
)
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))

In [54]:
def calculate_correct_prediction_count(benchmark, y: list[str], y_pred: list[str]):
    correct_preds = 0
    for y_i, y_pred_i in tqdm(zip(y, y_pred), desc="Calculating..."):
        if benchmark == "t4d":
            eval_fn = t4d
        elif benchmark == "bbh":
            eval_fn = bbh

        if eval_fn(y_i, y_pred_i):
            correct_preds += 1
        else:
            print(f"{'<' * 20}START{'>' * 20}")
            print(f"{y_i}\n{'-' * 100}\n{y_pred_i}")
            print(f"{'<' * 20}END{'>' * 20}", end="\n\n")
    return correct_preds

# sports_understanding

In [5]:
subset = 'sports_understanding'

In [11]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/self_discover/llama/phaseII/bbh/bbh-sports_understanding/bbh-sports_understanding_eval')

In [12]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [13]:
print(dataset[0]["reasoning"])

```
{
    Identify key assumptions: {
        Identify the sport being referred to in the sentence: Football (American),
        Identify the specific action being described in the sentence: Catching a screen pass,
        Identify the key elements such as the person, action, and sport: Tyreek Hill, catching, screen pass, football,
    },
    Simplify the sentence: {
        Remove any unnecessary words or phrases from the sentence: None,
        Identify the main verb and subject of the sentence: caught (verb), Tyreek Hill (subject),
    },
    Analyze from different perspectives: {
        Evaluate the sentence based on the rules of the sport: In football, a screen pass is a type of play where the quarterback throws a short pass to a receiver who is being blocked by linemen. This is a common and allowed play in football.,
        Evaluate the sentence based on the abilities of the person mentioned: Tyreek Hill is a professional football player known for his speed and agility, making 

In [14]:
set(dataset["answer_pred"]), set(dataset["target"])

({'False.',
  'No.',
  'Not very plausible.',
  'Plausible but uncertain.',
  'Plausible.',
  'Somewhat plausible, but unlikely.',
  'True.',
  'Unlikely.'},
 {'no', 'yes'})

In [17]:
# Plausible (Yes)
plausible_yes = [
    'Plausible.',
    'True.',
]

# Implausible (No)
implausible_no = [
    'No.',
    'Not very plausible.',
    'False.',
    'Unlikely.'
]

indeterminate = [
    'Plausible but uncertain.',
    'Somewhat plausible, but unlikely.',
]


def map_fn(ins):
    for yes in plausible_yes:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "yes"
            }

    for no in implausible_no:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "no"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'Plausible but uncertain.', 'Somewhat plausible, but unlikely.', 'no', 'yes'}

In [18]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

no, yes

yes, no

no, yes

no, yes

yes, no

no, yes

no, Somewhat plausible, but unlikely.

no, yes

no, yes

no, yes

no, yes

yes, no

no, yes

no, yes

no, yes

no, yes

no, yes

no, yes

no, yes

yes, no

no, yes

no, yes

no, yes

yes, no

no, yes

yes, Plausible but uncertain.

no, yes

no, yes

no, yes

yes, no

yes, no

yes, no

no, yes



0.868

# web_of_lies

In [19]:
subset = 'web_of_lies'

In [20]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/self_discover/llama/phaseII/bbh/bbh-web_of_lies/bbh-web_of_lies_eval')

In [21]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [22]:
print(dataset[0]["reasoning"])

```
{
    "Step 1: Identify the Target Person": {
        "Identify the person whose truthfulness needs to be determined": "Elanor"
    },
    "Step 2: Analyze the Chain of Statements": {
        "Break down the chain of statements into smaller parts": [
            "Raymond tells the truth",
            "Sal says Raymond lies",
            "Alexis says Sal lies",
            "Helene says Alexis lies",
            "Elanor says Helene lies"
        ],
        "Identify the relationships between each pair of people": {
            "Raymond and Sal": "opposite",
            "Sal and Alexis": "opposite",
            "Alexis and Helene": "opposite",
            "Helene and Elanor": "opposite"
        }
    },
    "Step 3: Simplify the Problem": {
        "Identify the key statements and relationships between people": [
            "Raymond tells the truth",
            "Sal and Alexis have opposite statements",
            "Alexis and Helene have opposite statements",
            "Helene an

In [23]:
set(dataset["answer_pred"]), set(dataset["target"])

({'Alexis tells the truth.',
  'False.',
  'Lorine lies.',
  'No.',
  'True.',
  'Vina tells the truth.',
  'Yes.'},
 {'No', 'Yes'})

In [24]:
# Truth (Yes)
truth_yes = [
    'Alexis tells the truth.',
    'True.',
    'Vina tells the truth.',
    'Yes.'
]


# False (No)
false_no = [
    'False.',
    'Lorine lies.',
    'No.',
]


def map_fn(ins):
    for yes in truth_yes:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "Yes"
            }

    for no in false_no:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "No"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'No', 'Yes'}

In [25]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

Yes, No

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes



0.912

# word_sorting

In [26]:
subset = 'word_sorting'

In [27]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/self_discover/llama/phaseII/bbh/bbh-word_sorting/bbh-word_sorting_eval')

In [106]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [58]:
print(dataset[0]["reasoning"])

```
{
    Identify the list of words to be sorted: {
        Extract the list of words from the task description: The list of words to be sorted is "slurp", "raytheon", and "gloucester".
    },
    Determine the primary goal or objective: {
        Identify the sorting criterion, which is alphabetical order: The primary goal is to arrange the words in alphabetical order.
    },
    Simplify the list of words: {
        Identify common characteristics, such as prefixes or suffixes: There are no common prefixes or suffixes among the words.
    },
    Break down the list of words into smaller groups or categories: {
        Group words starting with the same letter: The words can be grouped as follows: 
            - "g": gloucester
            - "r": raytheon
            - "s": slurp
    },
    Compare each word to the ones that come before and after it: {
        Iterate through the list of words step by step to ensure correct order: 
            1. gloucester comes before raytheon
    

In [59]:
answer_pred_list = [x.translate(str.maketrans("", "", "-.'\"")) for x in dataset["answer_pred"] if x and '[' in x]
len(answer_pred_list)

36

In [60]:
answer_pred_list[0].translate(str.maketrans("", "", "[]")).split(", ")

['asset',
 'bona',
 'cicero',
 'coastal',
 'dusky',
 'exonerate',
 'gaussian',
 'handlebar',
 'inhabitation',
 'portfolio',
 'purport',
 'rastus',
 'responsible',
 'ruanda',
 'silver',
 'zig']

In [61]:
set(dataset["answer_pred"])

{'',
 '"clytemnestra", "crag", "cutover", "diocletian", "dickson", "electrolytic", "inhuman", "lipton", "marginal", "scrawny", "stalk", "thereupon", "took", "wife", "wireman", "workplace".',
 "- accelerate, bauer, county, nail, nominee, o'connell, phony, poole, putnam, quantify, raisin, venice.",
 '["abc", "ada", "austere", "blend", "cankerworm", "falcon", "flamboyant", "gag", "grecian", "hanukkah", "indicate", "kruger", "lobster", "militia", "nobody", "pierson", "quad", "right", "ron", "wildcat"].',
 '["abdominal", "address", "berry", "bounty", "effusive", "fomalhaut", "hanoverian", "involve", "islamabad", "jordan", "optimal", "pay", "stearic", "stigmata", "swathe", "tattoo", "them", "tornado", "yang"].',
 '["abramson", "bangui", "carlisle", "cavalier", "contextual", "dustbin", "emacs", "implementor", "islamabad", "magistrate", "nudge", "picnicking", "railway", "refractory", "silvery", "waite"].',
 '["absorption", "aristocratic", "bermuda", "cesium", "cheerful", "congo", "diagram", "e

In [77]:
dataset.filter(lambda x: x["answer_pred"] == '')

Dataset({
    features: ['input', 'target', 'self_discover_input', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 7
})

In [72]:
print(dataset.filter(lambda x: x["answer_pred"] == '')["reasoning"][6])

```
{
    Identify the list of words to be sorted: {
        Extract the list of words from the task description: The list of words to be sorted is "extempore" and "gotten",
    },
    Determine the primary goal or objective: {
        Identify the sorting criterion, which is alphabetical order: The primary goal is to sort the words alphabetically,
    },
    Simplify the list of words: {
        Identify common characteristics, such as prefixes or suffixes: There are no common prefixes or suffixes in the given words,
    },
    Break down the list of words into smaller groups or categories: {
        Group words starting with the same letter: The words can be grouped as follows: 
            - E: extempore
            - G: gotten
    },
    Compare each word to the ones that come before and after it: {
        Iterate through the list of words step by step to ensure correct order: 
            - extempore comes before gotten in alphabetical order,
    },
    Implement a step-by-step p

In [112]:
print(content)

```
{
    Identify the list of words to be sorted: {
        Extract the list of words from the task description: 
            The list of words is: corpulent, diagnose, code, scalp, bombproof, fluorine, blythe, swipe, honeybee, damn, pore, maharaja, cytolysis, solicit
    },
    Determine the primary goal or objective: {
        Identify the sorting criterion, which is alphabetical order: 
            The goal is to arrange the words in alphabetical order from A to Z.
    },
    Simplify the list of words: {
        Identify common characteristics, such as prefixes or suffixes: 
            There are no common prefixes, but some words have suffixes like -proof, -lysis, and -bee.
    },
    Break down the list of words into smaller groups or categories: {
        Group words starting with the same letter: 
            B: bombproof, blythe
            C: code, corpulent, cytolysis
            D: damn, diagnose
            F: fluorine
            H: honeybee
            M: maharaja
     

In [127]:
pattern = r"The final answer is:?"
  
index = 6
# Use re.search to find the first occurrence of the pattern in the text
match = re.search(pattern, t["reasoning"][index], re.IGNORECASE)

if match:
    start_index = match.end()
    content = t["reasoning"][index][start_index:]

print(content)

 
1. extempore
2. gotten


In [146]:
def map_fn(ins):
    if ins["answer_pred"] == '':
        pattern = r"The final answer is:?"
    
        response = ins["reasoning"]
    
        try:
            match = re.search(pattern, response, re.IGNORECASE)

            if match:
                start_index = match.end()
                answer = response[start_index:]
            
                trajectory = re.sub(pattern, "", response[:start_index], flags=re.IGNORECASE)
        except:
            answer, trajectory = None, response

        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": ins["trajectory"],
        "answer_pred": ins["answer_pred"]
    }

dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [147]:
set(dataset["answer_pred"])

{' \n- amphibious\n- assist\n- baseplate\n- benchmark\n- ell\n- hatchet\n- homecoming\n- loess\n- machine\n- percentage\n- pilot\n- prorate\n- redcoat\n- reverie\n- sank\n- stallion\n- thoughtful\n- wehr\n- wince',
 ' \n- anarchic\n- bstj\n- elution\n- exhumation\n- furl\n- geld\n- gradual\n- j\n- liniment\n- locomote\n- midshipman\n- pantheist\n- profess\n- riddance\n- rowley\n- saline',
 " \n1. acuity\n2. anticonvulsant\n3. carrageen\n4. discovery\n5. disseminate\n6. drafty\n7. embolden\n8. glamour\n9. hangout\n10. hasty\n11. magnificent\n12. pewee\n13. proscenium\n14. registrar\n15. scrub\n16. supposable\n17. sushi\n18. you'd",
 ' \n1. behind\n2. hornpipe\n3. iniquity\n4. inmate\n5. mcconnell\n6. mollie\n7. sandy\n8. scorn\n9. toroidal\n10. volcanism\n11. wellwisher\n12. yoghurt\n13. zip',
 ' \n1. bombproof\n2. blythe\n3. code\n4. corpulent\n5. cytolysis\n6. damn\n7. diagnose\n8. fluorine\n9. honeybee\n10. maharaja\n11. pore\n12. scalp\n13. solicit\n14. swipe',
 ' \n1. brainy,\n2. c

In [45]:
def map_fn(ins):
    answer_pred = ins["answer_pred"].encode().decode('unicode_escape').replace('.', '')
    refined_answer = answer_pred
    
    if "[" in answer_pred:
        refined_answer = " ".join([re.sub(r"^'|'$", "", word) for word in answer_pred.translate(str.maketrans("", "", "[]\"-")).replace('"', "").split(", ")])
    elif "," in answer_pred:
        refined_answer = " ".join([re.sub(r"^'|'$", "", word) for word in answer_pred.replace('"', "").split(", ")])
    elif "9" in answer_pred:
        refined_answer = " ".join(pair.split(" ")[1] for pair in answer_pred.split("\n"))
    elif "-" in answer_pred:
        refined_answer = " ".join(pair.split(" ")[1] for pair in answer_pred.split("\n"))

    return {
        "answer_pred": refined_answer
    }


dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [55]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

<<<<<<<<<<<<<<<<<<<<START>>>>>>>>>>>>>>>>>>>>
confess croupier daffy dockyard duty household hypothesis info loam mandate mantic minstrelsy nepotism peccary sawtimber serenade silver summate triode
----------------------------------------------------------------------------------------------------
confess croupier daffy dockyard duty household hypothesis info loam mantic mandate minstrelsy nepotism peccary sawtimber serenade silver summate triode
<<<<<<<<<<<<<<<<<<<<END>>>>>>>>>>>>>>>>>>>>

<<<<<<<<<<<<<<<<<<<<START>>>>>>>>>>>>>>>>>>>>
clytemnestra crag cutover dickson diocletian electrolytic inhuman lipton marginal scrawny stalk thereupon took wife wireman workplace
----------------------------------------------------------------------------------------------------
clytemnestra crag cutover diocletian dickson electrolytic inhuman lipton marginal scrawny stalk thereupon took wife wireman workplace
<<<<<<<<<<<<<<<<<<<<END>>>>>>>>>>>>>>>>>>>>

<<<<<<<<<<<<<<<<<<<<START>>>>>>>>>>>>>>>>>>>

0.892

In [None]:
dataset.save_to_disk(os.path.join(os.path.dirname(path), "bbh-word_sorting_eval_refined"))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]