In [11]:
import os
from pyprojroot import here
from datasets import Dataset

In [12]:
base_path = os.path.join("evals/logs/phased_self_discover/llama/unstructured/few_shot_0/bbh")

In [13]:
from tqdm.notebook import tqdm


t4d = (
    lambda y_i, y_pred_i: y_pred_i
    and y_i in y_pred_i
    and y_i == str(y_pred_i.translate(str.maketrans("", "", ".'"))[2:])
)
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()\"")
) == y_pred_i.translate(str.maketrans("", "", '.(),"'))

In [14]:
def calculate_correct_prediction_count(benchmark, y: list[str], y_pred: list[str]):
    correct_preds = 0
    for y_i, y_pred_i in tqdm(zip(y, y_pred), desc="Calculating..."):
        if benchmark == "t4d":
            eval_fn = t4d
        elif benchmark == "bbh":
            eval_fn = bbh

        if eval_fn(y_i, y_pred_i):
            correct_preds += 1
        else:
            print(y_i)
            print("-" * 100)
            print(y_pred_i)
            print("-+=" * 100)
    return correct_preds

# dyck_languages

In [9]:
subset = 'dyck_languages'

In [10]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/unstructured/few_shot_0/bbh/bbh-dyck_languages/bbh-dyck_languages_eval')

In [11]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [12]:
print(dataset[0]["reasoning"])

**Step 1: Analyze the Input Sequence**
The given sequence is: { ( < [ < > ]
The sequence contains curly brackets {}, round brackets (), angle brackets <>, and square brackets [].

**Step 2: Identify Patterns and Relationships**
The sequence appears to be symmetric, with the innermost parentheses being the angle brackets <>, followed by the square brackets [], then the round brackets (), and finally the curly brackets {}.

**Step 3: Break Down the Sequence**
Divide the sequence into smaller parts, focusing on one type of parenthesis at a time:
- Innermost: < > (already closed)
- Next: [ < > ] (already closed)
- Next: ( [ < > ] ) (missing closing parenthesis)
- Outermost: { ( [ < > ] ) (missing closing parentheses)

**Step 4: Create a List of Possible Next Steps**
Possible next steps:
- Add a closing parenthesis to match the opening round bracket.
- Add a closing parenthesis to match the opening curly bracket.

**Step 5: Apply Possible Next Steps**
Apply each possible next step to the se

In [13]:
def map_fn(ins):
    find = "Input: "
    index = ins["input"].find(find)
    
    return {
        "target": ins["input"][index + len(find):] + " " + ins["target"]
    }

dataset = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [14]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()\"")
).replace(" ", "") == y_pred_i.translate(str.maketrans("", "", '.(),"')).replace(" ", "")

In [15]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

{ ( < [ < > ] > ) }, { ( [ < > ] ) }.

{ ( < > ) } ( ( [ ] ) < [ ( [ [ ] ] [ { } ] { } [ < { [ ] } > ] ( ) ) ] > ), { ( < > ) } ( ( [ ] ) < [ ( [ [ ] ] [ { } ] { } [ < { [ ] } > ] ( ) ) ] ) ) ) )

< { < [ [ ( { } ) ] ] > } >, < { < [ [ ( { } ) ] ] } >.

< ( < { [ { } < ( { ( < < < { [ ( [ ( [ { { < [ { } < ( ) > ] > } } ] ) ] ) ] } > < > > ( ( < { } > ) ) > ) } ) > ] } > ) >, < ( < { [ { } ] } < ( { ( < < < { [ ( [ ( [ { { < [ { } < ( ) > ] > } } ] ) ] ) ] } > < > > ( ( < { } > ) ) > ) } ) > ] } > )

[ < < [ [ ] ( ) { < > ( [ { } { < > } { } ] ) } [ [ [ ( [ ( ) [ [ { < [ { { } } < { { < ( ) > } } > ] > } ] ] ] ) ] < < [ [ ( < < ( ) > > ) ] ] > > [ ] ] ] ] < ( [ ] ) > { ( ( < { } > ) ) } > > ], [ < < [ [ ] ( ) { < > ( [ { } { < > } { } ] ) } [ [ [ ( [ ( ) [ [ { < [ { { } } < { { < ( ) > } } > ] > } ] ] ] ) ] < < [ [ ( < < ( ) > > ) ] ] > > [ ] ] ] ] < ( [ ] ) > { ( ( < { } > ) ) } > >.

< [ { ( ( < ( ( ) ) > ) ) } ] >, [ < ( () ) > ].

{ { [ ( [ { ( { ( [ ( [ ] ) { ( < < [ ] > [ [ ] ] >

0.5

# reasoning_about_colored_objects

In [16]:
subset = 'reasoning_about_colored_objects'

In [17]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/unstructured/few_shot_0/bbh/bbh-reasoning_about_colored_objects/bbh-reasoning_about_colored_objects_eval')

In [18]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [19]:
print(dataset[0]["reasoning"])

1. Identify the objects on the nightstand.
   - The objects are: black necklace, green fidget spinner, blue keychain, yellow sheet of paper, and red stress ball.

2. Determine the colors of the objects.
   - The colors are: 
     - black necklace (black)
     - green fidget spinner (green)
     - blue keychain (blue)
     - yellow sheet of paper (yellow)
     - red stress ball (red)

3. Identify the colors to exclude (yellow and green).
   - The colors to exclude are: yellow and green.

4. Categorize the objects by color.
   - The yellow and green objects are: green fidget spinner and yellow sheet of paper.
   - The objects that are neither yellow nor green are: black necklace, blue keychain, and red stress ball.

5. Count the objects that are neither yellow nor green.
   - Counting the objects that are neither yellow nor green: 
     - black necklace (1)
     - blue keychain (2)
     - red stress ball (3)
   - There are 3 objects that are neither yellow nor green.

6. Determine the fi

In [20]:
bbh = lambda y_i, y_pred_i: y_pred_i and y_i.translate(
    str.maketrans("", "", "()")
) == y_pred_i.translate(str.maketrans("", "", '.()"'))[0]

In [21]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

(N), D.

(A), B.

(E), (D).

(E), F.

(B), A.

(B), A.

(E), B.

(H), G.

(F), E.

(L), (D).

(D), C.

(C), E.

(D), J.

(C), B.

(A), N.



0.94

# sports_understanding

In [5]:
subset = 'sports_understanding'

In [6]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/unstructured/few_shot_0/bbh/bbh-sports_understanding/bbh-sports_understanding_eval')

In [7]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [8]:
print(dataset[0]["reasoning"])

**Step 1: Identify Key Assumptions**
The key assumptions underlying the claim are that Tyreek Hill is a football player, he was participating in a game or practice, and a screen pass was thrown to him.

**Step 2: Determine Required Evidence or Context**
To evaluate the claim, we would need evidence or context such as game stats, play-by-play commentary, or eyewitness accounts. Information about the specific game, the teams involved, and the play in question would be relevant.

**Step 3: Assess the Need for Sports Expertise**
Assessing the claim does not require specialized sports expertise, as catching a screen pass is a common occurrence in football. However, knowledge of football basics and strategies can be helpful.

**Step 4: Analyze Underlying Factors**
Tyreek Hill is a skilled wide receiver known for his speed and agility, making it more likely for him to catch a screen pass. However, the opposing team's defense and the game situation could affect the likelihood of a successful s

In [9]:
set(dataset["answer_pred"]), set(dataset["target"])

({'False',
  'False.',
  'No.',
  'Plausible but unlikely.',
  'Plausible.',
  'Somewhat Plausible.',
  'True.',
  'Uncertain.',
  'Unclear',
  'Unlikely.'},
 {'no', 'yes'})

In [10]:
def map_fn(instance):
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '."')),
    }

dataset = dataset.map(map_fn)

set(dataset["answer_pred"])

{'False',
 'No',
 'Plausible',
 'Plausible but unlikely',
 'Somewhat Plausible',
 'True',
 'Uncertain',
 'Unclear',
 'Unlikely'}

In [35]:
# Plausible (Yes)
plausible_yes = [
    'Plausible',
    'True',
]

# Implausible (No)
implausible_no = [
    'False',
    'No',
    'Unlikely'
]

# Indeterminate
indeterminate = [
    'Plausible but unlikely',
    'Somewhat Plausible',
    'Uncertain',
    'Unclear',
]


def map_fn(ins):
    for yes in plausible_yes:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "yes"
            }

    for no in implausible_no:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "no"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

{'Plausible but unlikely',
 'Somewhat Plausible',
 'Uncertain',
 'Unclear',
 'no',
 'yes'}

In [36]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

yes, Uncertain

no, Somewhat Plausible

yes, no

yes, no

yes, Uncertain

yes, no

no, yes

yes, no

no, yes

yes, no

yes, no

yes, no

yes, no

yes, no

yes, no

no, yes

no, yes

no, yes

no, yes

yes, no

yes, no

yes, no

no, Uncertain

yes, no

yes, no

no, yes

yes, Uncertain

no, yes

no, yes

no, yes

no, yes

no, yes

no, yes

no, yes

yes, no

yes, no

yes, no

no, yes

no, yes

yes, Uncertain

yes, no

yes, no

yes, no

yes, Unclear

no, yes

yes, no

yes, no

yes, Plausible but unlikely

yes, no

no, yes

no, yes



0.796

# web_of_lies

In [47]:
subset = 'web_of_lies'

In [48]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/unstructured/few_shot_0/bbh/bbh-web_of_lies/bbh-web_of_lies_eval')

In [49]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [50]:
print(dataset[0]["reasoning"])

**Step 1: Break down the statements into smaller parts**
We have the following claims:
- Raymond: tells the truth (no claim about others)
- Sal: says Raymond lies
- Alexis: says Sal lies
- Helene: says Alexis lies
- Elanor: says Helene lies

**Step 2: Create a diagram or truth table to visualize the statements**
Let's create a table with columns for each speaker and their claim:

| Speaker | Claim |
| --- | --- |
| Raymond | Tells the truth |
| Sal | Raymond lies |
| Alexis | Sal lies |
| Helene | Alexis lies |
| Elanor | Helene lies |

**Step 3: Simplify the statements by identifying patterns or relationships**
We can see a cycle of accusations: Sal accuses Raymond, Alexis accuses Sal, Helene accuses Alexis, and Elanor accuses Helene.

**Step 4: Analyze the statements from different perspectives**
If Raymond tells the truth, then Sal lies. If Sal lies, then Alexis tells the truth. If Alexis tells the truth, then Helene lies. If Helene lies, then Elanor tells the truth.

**Step 5: Iden

In [51]:
set(dataset["answer_pred"]), set(dataset["target"])

({'False.', 'No.', 'True.', 'Yes.'}, {'No', 'Yes'})

In [52]:
def map_fn(instance):
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '."*')),
    }

dataset = dataset.map(map_fn)

set(dataset["answer_pred"])

{'False', 'No', 'True', 'Yes'}

In [53]:
# Truth (Yes)
false_no = [
    'False', 'No',
]

# False (No)
truth_yes = [
    'True', 'Yes'
]

indeterminate = [
]



def map_fn(ins):
    for yes in truth_yes:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "Yes"
            }

    for no in false_no:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "No"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

dataset = dataset.map(map_fn)
set(dataset["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'No', 'Yes'}

In [54]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"])) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

No, Yes

No, Yes

No, Yes

No, Yes

Yes, No

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

No, Yes

Yes, No

No, Yes

No, Yes

No, Yes

No, Yes



0.916

# word_sorting

In [15]:
subset = 'word_sorting'

In [16]:
path = here(os.path.join(base_path, f"bbh-{subset}", f"bbh-{subset}_eval"))
path

PosixPath('/home/ubuntu/dev/self-discover/evals/logs/phased_self_discover/llama/unstructured/few_shot_0/bbh/bbh-word_sorting/bbh-word_sorting_eval')

In [17]:
dataset = Dataset.load_from_disk(path)
dataset

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [18]:
print(dataset[0]["reasoning"])

To sort the given list of words alphabetically, let's follow the step-by-step reasoning plan:

1. Identify the list of words to be sorted: slurp, raytheon, gloucester.
2. Focus on the first letter of each word: s, r, g.
3. Compare the first letter of the first two words: s (slurp) and r (raytheon). 
   Determine which letter comes first alphabetically: r comes before s.
4. Compare the first letter of the result from step 3 with the first letter of the third word: 
   r (raytheon) and g (gloucester). Determine which letter comes first alphabetically: g comes before r.
5. Arrange the words in order based on the comparison of their first letters: 
   gloucester, raytheon, slurp.
6. Since the first letters of the words are different, there's no need to compare their second letters.
7. The list is sorted alphabetically.
8. Review the sorted list to ensure it is in alphabetical order: 
   gloucester, raytheon, slurp.

The final answer is gloucester, raytheon, slurp.


In [19]:
answer_pred_list = [x.translate(str.maketrans("", "", ".'")) for x in dataset["answer_pred"] if x and '[' in x]
len(answer_pred_list)

0

In [20]:
set(dataset["answer_pred"])

{'',
 None,
 'abbe, adposition, arragon, cast, danbury, emplace, falsetto, gavin, income, inhibit, onerous, palazzi, tabletop.',
 'abdominal, address, berry, bounty, effusive, fomalhaut, hanoverian, involve, islamabad, jordan, optimal, pay, stearic, stigmata, swathe, tattoo, them, tornado, yang',
 'aberdeen, analogue, deciduous, easel, sprightly, swaziland',
 'abner, abramson, amity, automate, exquisite, fruitful, gurgle, none, shampoo, shorten, waterproof',
 'abramson, bangui, carlisle, cavalier, contextual, dustbin, emacs, implementor, islamabad, magistrate, nudge, picnicking, railway, refractory, silvery, waite',
 'absorption, aristocratic, bermuda, cesium, cheerful, congo, diagram, eucre, ezra, fallen, juvenile, musty, nigeria, nod, quartile, screechy, slack, testicle',
 'abuilding, burgher, celebrity, chigger, ernie, exorcism, indiscoverable, medlar, newsboy, pow, snuff, synchronous, thimble, thirteenth, tinfoil, yap.',
 'abutted, agamemnon, aquatic, capacity, casualty, essex, gui

In [21]:
dataset.filter(lambda x: x["answer_pred"] == '')

Dataset({
    features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 117
})

In [22]:
print(dataset.filter(lambda x: x["answer_pred"] == None)[0]["trajectory"])

**Step 1: Break down the list into smaller groups based on alphabetical ranges**

The list can be divided into the following groups:
- B (bodyguard)
- C (commensal)
- F (flagellate, flotation)
- I (ineradicable, in-, involve)
- J (jocund)
- M (miff)
- P (postprocess)

**Step 2: Identify patterns or common prefixes in the words**

Within each group, the following patterns can be identified:
- The "in-" prefix is common to the words "ineradicable", "involve".
- The "fl-" pattern is common to the words "flagellate" and "flotation".

**Step 3: Compare and sort words within each group**

- B group: only one word, "bodyguard", so no comparison is needed.
- C group: only one word, "commensal", so no comparison is needed.
- F group: comparing "flagellate" and "flotation", "flagellate" comes first alphabetically.
- I group: comparing "ineradicable", "involve", and noting the "in-" prefix, "ineradicable" comes first alphabetically, followed by "involve", then the remaining word "in-" is actually

In [23]:
import re 

def map_fn(ins):
    answer_pred = ins.get("answer_pred")
    reasoning = ins.get("reasoning", "")

    if reasoning is None:
        reasoning = ""

    if answer_pred is None or answer_pred == '':
        marker = "The final answer is:"

        marker_index = reasoning.find(marker)

        if marker_index != -1:
            trajectory = reasoning[:marker_index]
            answer = reasoning[marker_index + len(marker):]
   
            cleaned_answer = answer.replace("`", "").strip()
            
            cleaned_trajectory = trajectory.strip()

            if not cleaned_answer:
                 cleaned_answer = None

            return {
                "trajectory": cleaned_trajectory,
                "answer_pred": cleaned_answer
            }
        else:
            return {
                "trajectory": reasoning.strip(),
                "answer_pred": None
            }
    else:
        return ins

dataset = dataset.map(map_fn) 

In [24]:
dataset.filter(lambda x: x["answer_pred"] == None), dataset.filter(lambda x: x["answer_pred"] == ''), dataset.filter(lambda x: x["answer_pred"] == '['), dataset.filter(lambda x: x["answer_pred"] == 'Yes.')

(Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 2
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }),
 Dataset({
     features: ['input', 'target', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
     num_rows: 0
 }))

In [25]:
import re 

def map_fn(ins):
    answer_pred = ins.get("answer_pred")
    reasoning = ins.get("reasoning", "")

    if reasoning is None:
        reasoning = ""

    if answer_pred is None or answer_pred == '':
        marker = "The final answer is"

        marker_index = reasoning.find(marker)

        if marker_index != -1:
            trajectory = reasoning[:marker_index]
            answer = reasoning[marker_index + len(marker):]
   
            cleaned_answer = answer.replace("`", "").strip()
            
            cleaned_trajectory = trajectory.strip()

            if not cleaned_answer:
                 cleaned_answer = None

            return {
                "trajectory": cleaned_trajectory,
                "answer_pred": cleaned_answer
            }
        else:
            return {
                "trajectory": reasoning.strip(),
                "answer_pred": None
            }
    else:
        return ins

dataset = dataset.map(map_fn) 

In [26]:
print(dataset.filter(lambda x: x["answer_pred"] == None)[0]['reasoning'])

**Step 1: Break down the list into smaller groups based on alphabetical ranges**

The list can be divided into the following groups:
- B (bodyguard)
- C (commensal)
- F (flagellate, flotation)
- I (ineradicable, in-, involve)
- J (jocund)
- M (miff)
- P (postprocess)

**Step 2: Identify patterns or common prefixes in the words**

Within each group, the following patterns can be identified:
- The "in-" prefix is common to the words "ineradicable", "involve".
- The "fl-" pattern is common to the words "flagellate" and "flotation".

**Step 3: Compare and sort words within each group**

- B group: only one word, "bodyguard", so no comparison is needed.
- C group: only one word, "commensal", so no comparison is needed.
- F group: comparing "flagellate" and "flotation", "flagellate" comes first alphabetically.
- I group: comparing "ineradicable", "involve", and noting the "in-" prefix, "ineradicable" comes first alphabetically, followed by "involve", then the remaining word "in-" is actually

In [27]:
def map_fn(ins):
    if ins["answer_pred"] == None:
        return {
            "answer_pred": ins["answer_pred"]
        }
        
    answer_pred = ins["answer_pred"].encode().decode('unicode_escape').replace('.', '')
    refined_answer = answer_pred
    
    if "[" in answer_pred:
        refined_answer = " ".join([re.sub(r"^'|'$", "", word) for word in answer_pred.translate(str.maketrans("", "", "[]")).replace('"', "").split(", ")])
    elif "," in answer_pred:
        refined_answer = " ".join([re.sub(r"^'|'$", "", word) for word in answer_pred.replace('"', "").split(", ")])
    elif "1" in answer_pred:
        refined_answer = " ".join(pair.split(" ")[1] for pair in answer_pred.split("\n"))
    elif "-" in answer_pred:
        refined_answer = " ".join(pair.split(" ")[1] for pair in answer_pred.split("\n"))
    else:
        refined_answer = " ".join(answer_pred.split("\n"))

    return {
        "answer_pred": refined_answer.lower()
    }


dataset = dataset.map(map_fn)

In [28]:
(calculate_correct_prediction_count("bbh", dataset["target"], dataset["answer_pred"]) + 3) / dataset.num_rows

Calculating...: 0it [00:00, ?it/s]

baronial checksum circumstance comment dartmouth dredge emittance eulogy felicia huckster monochromator neuroanatomic spotlight
----------------------------------------------------------------------------------------------------
baronial checksum comment circumstance dartmouth dredge emittance eulogy felicia huckster monochromator neuroanatomic spotlight
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=
afro blackbird blame calyx elgin emphases implacable jura mayapple perquisite vii whit
----------------------------------------------------------------------------------------------------
afro blame blackbird calyx elgin emphases implacable jura mayapple perquisite vii whit
-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-+=-

0.912

In [30]:
dataset.save_to_disk(os.path.join(os.path.dirname(path), "bbh-word_sorting_eval_refined"))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]