In [1]:
from datasets import Dataset
from pyprojroot import here
import os

In [2]:
par_dir = here("struct_vs_unstruct/data/mistral_large_2407/modified/non_self_synthesis/bbh/")
save_par_dir = here(os.path.join(par_dir, "refined"))
print(par_dir)
print(save_par_dir)

/home/ubuntu/jupyter/struct-vs-unstruct/struct_vs_unstruct/data/mistral_large_2407/modified/non_self_synthesis/bbh
/home/ubuntu/jupyter/struct-vs-unstruct/struct_vs_unstruct/data/mistral_large_2407/modified/non_self_synthesis/bbh/refined


# boolean_expressions

In [3]:
chk_dir = here(os.path.join(par_dir, "bbh-boolean_expressions/bbh_eval"))
chk_dir

PosixPath('/home/ubuntu/jupyter/struct-vs-unstruct/struct_vs_unstruct/data/mistral_large_2407/modified/non_self_synthesis/bbh/bbh-boolean_expressions/bbh_eval')

In [4]:
dataset = Dataset.load_from_disk(chk_dir)

In [5]:
set(dataset["answer_pred"])

{'False**.', 'False.', None, 'True.', '`False`.', '`True`.'}

In [10]:
none_ds = dataset.filter(lambda x: x["answer_pred"] == None)
print(none_ds[2]["trajectory"])

To evaluate the logical expression `( True ) and not True or True`, we follow these steps:

1. **Evaluate `not True`**:
   - `not True` evaluates to `False`.

2. **Substitute the result back into the expression**:
   - The expression becomes `( True ) and False or True`.

3. **Evaluate `( True ) and False`**:
   - `( True ) and False` evaluates to `False`.

4. **Substitute the result back into the expression**:
   - The expression becomes `False or True`.

5. **Evaluate `False or True`**:
   - `False or True` evaluates to `True`.

Therefore, the final answer is `True`.


In [11]:
def map_fn(ins):
    if ins["answer_pred"] == None:
        return {
            "answer_pred": "True"
        }

    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [12]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,*`"'))
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'False', 'True'}

In [13]:
new_ds.save_to_disk(os.path.join(save_par_dir, "boolean_expressions"))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [14]:
total = 0
for instance in new_ds:
    if instance["answer_pred"] == instance["target"]:
        total += 1

"Accuracy", total / new_ds.num_rows

('Accuracy', 0.972)

# causal_judgement

In [15]:
chk_dir = here(os.path.join(par_dir, "bbh-causal_judgement/bbh_eval"))

In [16]:
dataset = Dataset.load_from_disk(chk_dir)

In [17]:
set(dataset["answer_pred"])

{'No**.', 'No.', 'Yes.'}

In [18]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.*"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

{'No', 'Yes'}

In [19]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/187 [00:00<?, ? examples/s]

0.7165775401069518

In [20]:
new_ds.save_to_disk(os.path.join(save_par_dir, "causal_judgement"))

Saving the dataset (0/1 shards):   0%|          | 0/187 [00:00<?, ? examples/s]

# date_understanding

In [21]:
chk_dir = here(os.path.join(par_dir, "bbh-date_understanding/bbh_eval"))

In [22]:
dataset = Dataset.load_from_disk(chk_dir)

In [23]:
set(dataset["answer_pred"])

{'(A) 01/02/1930.',
 '(A) 01/09/2015.',
 '(A) 01/31/2012.',
 '(A) 02/28/2015.',
 '(A) 08/09/1909.',
 '(A) 11/29/2002, assuming a typographical error in the year.',
 '(A) 12/22/1929.',
 '(B) 01/02/1961.',
 '(C) 01/02/2008.',
 '(C) 06/18/2016.',
 '(C) 11/21/2002.',
 '(D) 01/28/2017.',
 '(D) 02/16/2010.',
 '(D) 06/17/2019.',
 '(D) 08/31/2003.',
 '(D) 09/09/2021.',
 '(E) 07/01/1972.',
 '(E) 07/18/2002.',
 '(F) 04/09/1969.',
 '(F) 10/22/2002.',
 '(F) 11/29/2014.',
 '(F) 11/30/2019.',
 '(F).',
 '11/22/2001.',
 '11/23/2001.',
 '12/24/2014.',
 'A.',
 'B.',
 'C.',
 'D.',
 'E.',
 'F.',
 'None of the above.',
 'None of the given options match the calculated date.',
 'not listed among the given options.',
 'not listed among the options provided.',
 'not listed among the options.',
 'not listed among the provided options.',
 'not listed in the given options.',
 'not listed in the options provided.',
 'not listed in the provided options.'}

In [24]:
def map_fn(instance):
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'11/22/2001',
 '11/23/2001',
 '12/24/2014',
 'A',
 'A 01/02/1930',
 'A 01/09/2015',
 'A 01/31/2012',
 'A 02/28/2015',
 'A 08/09/1909',
 'A 11/29/2002 assuming a typographical error in the year',
 'A 12/22/1929',
 'B',
 'B 01/02/1961',
 'C',
 'C 01/02/2008',
 'C 06/18/2016',
 'C 11/21/2002',
 'D',
 'D 01/28/2017',
 'D 02/16/2010',
 'D 06/17/2019',
 'D 08/31/2003',
 'D 09/09/2021',
 'E',
 'E 07/01/1972',
 'E 07/18/2002',
 'F',
 'F 04/09/1969',
 'F 10/22/2002',
 'F 11/29/2014',
 'F 11/30/2019',
 'None of the above',
 'None of the given options match the calculated date',
 'not listed among the given options',
 'not listed among the options',
 'not listed among the options provided',
 'not listed among the provided options',
 'not listed in the given options',
 'not listed in the options provided',
 'not listed in the provided options'}

In [27]:
def map_fn(ins):
    ls = ins["answer_pred"].split()

    if len(ls) == 2:
        return {
            "answer_pred": ls[0]
        }

    if ins["answer_pred"] == "A 11/29/2002 assuming a typographical error in the year":
        return {
            "answer_pred": "A"
        }

    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'11/22/2001',
 '11/23/2001',
 '12/24/2014',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'None of the above',
 'None of the given options match the calculated date',
 'not listed among the given options',
 'not listed among the options',
 'not listed among the options provided',
 'not listed among the provided options',
 'not listed in the given options',
 'not listed in the options provided',
 'not listed in the provided options'}

In [28]:
not_choice_ds = new_ds.filter(lambda x: len(x["answer_pred"]) == 10)
not_choice_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 3
})

In [29]:
for x in not_choice_ds:
    print(x["input"])
    print("-"*5)
    print(x["answer_pred"])
    print("+"*80)

2015 is coming in 36 hours. What is the date one week ago from today in MM/DD/YYYY?
Options:
(A) 12/22/2040
(B) 12/23/2014
(C) 12/22/2014
(D) 01/05/2015
(E) 12/22/2086
(F) 02/06/2015
-----
12/24/2014
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
In the US, Thanksgiving is on the fourth Thursday of November. Today is the US Thanksgiving of 2001. What is the date today in MM/DD/YYYY?
Options:
(A) 01/16/2003
(B) 11/21/2002
(C) 09/04/2002
(D) 11/22/2002
(E) 08/24/2002
(F) 11/23/2002
-----
11/22/2001
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
In the US, Thanksgiving is on the fourth Thursday of November. Today is the US Thanksgiving of 2001. What is the date tomorrow in MM/DD/YYYY?
Options:
(A) 11/09/2002
(B) 12/07/2002
(C) 11/23/2002
(D) 06/23/2002
(E) 10/15/2002
(F) 08/29/2002
-----
11/23/2001
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


In [30]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [31]:
total = 0

for instance in new_ds:
    if instance["answer_pred"] == instance["target"].translate(str.maketrans("", "", '()"')):
        total += 1

"Accuracy", total / new_ds.num_rows

('Accuracy', 0.8)

# disambiguation_qa

In [32]:
chk_dir = here(os.path.join(par_dir, "bbh-disambiguation_qa/bbh_eval"))

In [33]:
dataset = Dataset.load_from_disk(chk_dir)

In [34]:
set(dataset["answer_pred"])

{'(A) Alex could not meet.',
 '(A) Alex could not meet.**',
 '(A) Alex sent the letter.',
 "(A) It is Sam's office.",
 "(A) It was the educator's grading policy.",
 "(A) It was the pathologist's microscope.",
 "(A) It will be the producers' office.",
 '(A) The carpenter had been working on the house.',
 '(A) The cook likes to teach.',
 '(A) The developer focuses on code.',
 '(A) The developer understood the problem.',
 '(A) The developer uses big words.',
 '(A) The homeowner had purchased.',
 '(A) The investigator was too late.',
 '(A) The janitor would have to mop.',
 '(A) The lawyer needed to understand.',
 '(A) The mechanic was in a good mood.',
 '(A) The nurse was busy.',
 "(A) The office was the director's office.",
 '(A) The paralegal was fired.',
 '(A) The patient had a history.',
 '(A) The patient had a skin condition.',
 "(A) The secretary didn't reply yet.",
 '(A) The secretary understood the problem.',
 '(A) The sheriff upholds the peace.',
 '(A) The surgeon needed more time

In [35]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()*"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A Alex could not meet',
 'A Alex sent the letter',
 "A It is Sam's office",
 "A It was the educator's grading policy",
 "A It was the pathologist's microscope",
 "A It will be the producers' office",
 'A The carpenter had been working on the house',
 'A The cook likes to teach',
 'A The developer focuses on code',
 'A The developer understood the problem',
 'A The developer uses big words',
 'A The homeowner had purchased',
 'A The investigator was too late',
 'A The janitor would have to mop',
 'A The lawyer needed to understand',
 'A The mechanic was in a good mood',
 'A The nurse was busy',
 "A The office was the director's office",
 'A The paralegal was fired',
 'A The patient had a history',
 'A The patient had a skin condition',
 "A The secretary didn't reply yet",
 'A The secretary understood the problem',
 'A The sheriff upholds the peace',
 'A The surgeon needed more time',
 'A The technician completed the repair',
 'A The worker was repairing',
 "A They were my cousin

In [36]:
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
none_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 1
})

In [37]:
print(none_ds[0]["trajectory"])

To determine the antecedent of the pronoun "her" in the sentence "The pathologist obtained tissue samples from the victim to look at under her microscope," we follow the reasoning plan step-by-step:

1. **Identify the Pronoun**:
   - The pronoun in question is "her."

2. **List Potential Antecedents**:
   - The potential antecedents are "pathologist" and "victim."

3. **Analyze Sentence Structure**:
   - The sentence structure is: "The pathologist obtained tissue samples from the victim to look at under her microscope."
   - The phrase "under her microscope" suggests possession or use of the microscope.

4. **Consider Grammatical Rules**:
   - Typically, the pronoun "her" refers to the nearest preceding noun that makes sense in the context.
   - However, context and common sense also play a role.

5. **Evaluate Contextual Clues**:
   - A pathologist is more likely to use a microscope to examine tissue samples than a victim.
   - The action of "looking at tissue samples" is typically pe

In [38]:
def map_fn(ins):
    if not ins["answer_pred"]:
        return {
            "answer_pred": "A"
        }

    return {
        "answer_pred": ins["answer_pred"].split()[0]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C'}

In [39]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [40]:
total = 0

for instance in new_ds:
    if instance["answer_pred"] == instance["target"].translate(str.maketrans("", "", '()"')):
        total += 1

"Accuracy", total / new_ds.num_rows

('Accuracy', 0.728)

# dyck_languages

In [326]:
chk_dir = here(os.path.join(par_dir, "bbh-dyck_languages/bbh_eval"))

In [327]:
dataset = Dataset.load_from_disk(chk_dir)

In [328]:
none_ds = dataset.filter(lambda x: x["answer_pred"] == None)
none_ds

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 23
})

In [329]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = dataset.map(map_fn)
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
none_ds

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 6
})

In [52]:
print(none_ds[5]["trajectory"])

To complete the sequence with properly closed parentheses, we will follow the step-by-step reasoning plan provided.

### Step-by-Step Reasoning

1. **Identify and List All Opening Brackets:**
   - Scan through the input sequence and list all the opening brackets in the order they appear.
   - Opening brackets in the sequence: `(`, `[`, `[`, `{`, `<`, `{`, `{`, `<`, `<`, `<`, `<`, `{`, `(`, `<`, `(`, `(`, `[`, `(`, `[`, `{`, `<`, `(`, `{`, `<`, `<`, `>`, `[`, `<`, `(`, `)`.

2. **Identify and List All Closing Brackets:**
   - Scan through the input sequence and list all the closing brackets in the order they appear.
   - Closing brackets in the sequence: `>`, `}`, `}`, `>`, `>`, `>`, `}`, `)`, `)`, `]`, `)`, `]`, `>`, `}`, `}`, `]`, `>`, `>`, `)`, `>`, `]`, `]`.

3. **Match Opening and Closing Brackets:**
   - For each type of bracket, match the opening brackets with their corresponding closing brackets.
   - Ensure that each opening bracket has a corresponding closing bracket.

4. **Ch

In [53]:
set(new_ds["answer_pred"])

{'',
 '"( < [ < ( [ ( ) ] < > < ( { } ) > ) < [ ] > > ] ( ) < [ < > ] > )".',
 '( ( ( [ { } ] ) ) )',
 '( ( ( [ { } ] ) ))',
 '( ).',
 '( [ [ [ ( { ( ( < [ { < > } ] > { { [ ] } } ) ) } ( [ [ < > ] ] ) ) ] ] ] ) ( < > < [ ( ) ] > (',
 None,
 'The sequence is already balanced.',
 '[ < [ ( ( ) < ( ) > ( { { } } [ [ [ < ( [ ] ) ( ) > ] ] ] { { { { { } } } { { } { < [ [ ] ] > } } { } } } ) ) ] > ]',
 '\\[ [ [ < < { } > ] ] > > \\]',
 '`( ( ( ) ) )`.',
 '`( ( ) ) [ { ( < > ) } ]`.',
 '`( ( < < < ( ( [ { [ ] } ) ) > > ) )`.',
 '`( ( < > ) )`.',
 '`( ( < [ { { } } ] > { } ) ( < > ) [ ( < > ) ] < >`.',
 '`( ( < { [ { } ] } > ) ) < { } >`.',
 '`( ( [ < [ ] > { { [ [ ] ] } } ] ) )`.',
 '`( ( [ [ ] ] ) )`.',
 '`( ( { ( ( { } ) ) ( ( [ ] ) ) } ) )`.',
 '`( ( { < ( { } ) > } ) ) ) }`.',
 '`( ( { < > ( ) [ { < ( < > < ( ( ) ( [ { } ] ) ) > { { } { ( ) } } ) [ ] > } ] { { } } }`',
 '`( ( { } ) )`.',
 '`( ) ( ()`.',
 '`( ) ( < < { } ) > >`.',
 '`( ) [ ( [ < { { ( { } ) } } ] ] >`.',
 '`( ) { < > }`.',

In [60]:
index = 2
print(new_ds[index]["input"])
print("+"*25)
print(new_ds[index]["target"])
print("+"*25)
print(new_ds[index]["trajectory"])

Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: { < [ ] > ( { [ ] } <
+++++++++++++++++++++++++
> ) }
+++++++++++++++++++++++++
1. **Identify Open Parentheses:**
   - Input: `{ < [ ] > ( { [ ] } <`
   - Open parentheses: `{`, `<`, `[`, `(`, `{`, `[`, `<`

2. **Identify Closed Parentheses:**
   - Input: `{ < [ ] > ( { [ ] } <`
   - Closed parentheses: `]`, `>`, `}`, `]`

3. **Match Open and Closed Parentheses:**
   - Match `[` with `]`
   - Match `{` with `}`
   - Match `<` with `>`

4. **Determine Unmatched Open Parentheses:**
   - Unmatched open parentheses: `{`, `<`, `(`, `{`, `<`

5. **Close Unmatched Parentheses:**
   - Add `>`, `}`, `)`, `}`, `>` at the end of the sequence.

6. **Verify the Sequence:**
   - Final sequence: `{ < [ ] > ( { [ ] } < > } ) } >`
   - Ensure all parentheses are properly matched and closed.


In [None]:
def map_fn(ins):
    if ins["answer_pred"] and "sequence" in ins["answer_pred"]:
        print(ins["reasoning_plan"])
        print("+"*80)
        print(ins["trajectory"])
        print("+"*80)

new_ds.map(map_fn)

In [358]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '."'))
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [62]:
print(new_ds[1]["trajectory"])

```json
{
    "Step 1 - Simplify the Task": {
        "Description": "Identify the current state of the sequence and simplify it if possible.",
        "Action": "Note the current sequence: ( ) ( (",
        "Result": "Identify the current sequence as ( ) ( ("
    },
    "Step 2 - Break Down the Task": {
        "Description": "Break down the sequence completion task into smaller, more manageable steps.",
        "Action": "Identify the open and closed parentheses in the current sequence.",
        "Result": "Identify open parentheses: 2, closed parentheses: 1"
    },
    "Step 3 - Analyze the Task": {
        "Description": "Determine if the task is analytical and requires tracking and balancing parentheses.",
        "Action": "Check if the sequence is balanced.",
        "Result": "Determine if the sequence is balanced or needs more parentheses to be balanced."
    },
    "Step 4 - Plan to Complete the Sequence": {
        "Description": "Make a step-by-step plan to complete the seq

In [362]:
blank_ds = new_ds.filter(lambda x: x["answer_pred"] == "")
none_blank_ds = new_ds.filter(lambda x: x["answer_pred"] != "")

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

In [364]:
def map_fn(ins):
    return {
        "answer_pred": ins["answer_pred"].replace(ins["target"], "")
    }

none_blank_ds = none_blank_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/217 [00:00<?, ? examples/s]

{'',
 "'< < > >'",
 "'< [ [ ] ] >'",
 "'< [ ] >'",
 '( ( ( ( [ [ < [ { { [ ] } } ] > ] ] ( ) ) ) ) ) ) ) )',
 '( ( ( ) ) ))',
 '( ( < < < ( ( ) ) ( [ ] ) > > { [ ] } ) ) > > >',
 '( ( < > ) )',
 '( ( < { } > ) ) < { } >',
 '( ( { } ) )',
 '( ) ( () )',
 '( ) ( < < { } > >',
 '( ) [ ( [ < { { ( { } ) } } > ] ) ]',
 '( ) { < } >',
 '( < < > > < > [ ] [ )',
 '( < > ( [ ( ) ] ) )',
 '( < [ ( ) ] > )',
 '( < { } [ ] > )',
 '( [ ( ) ] )',
 '( [ < < { } > > ] )',
 '( [ [ [ ( { ( ( < [ { < > } ] > { { [ ] } } ) ) } ( [ [ < > ] ] ) ) ] ] ] ) ( < > < [ ( ) ] > ( ) > )',
 '( { ( ) } )',
 '( { < [ < > ] > } )',
 '( { < { ( ( { } ) ( ) ) } { } < { } > < > > } { } ( { ( { { } } ) [ ( ) ] } ) ) [ ( [ ] ) ]',
 '( { [ { } ] } )',
 '( { } )',
 '( { } ) < { < { } > } >',
 '( { } ) { ( [ { ( ) } ] ( [ ] ) ) }',
 '**',
 '< ( ( ( [ { } ] ) ) ) >',
 '< ( () )',
 '< ( ) ( { { [ ] } } ) >',
 '< ( ) >',
 '< ( < > ) >',
 '< ( [ { ( < > ) } ] ) > { ( [ ] } ) >',
 '< ( { ( < < > > ) } ) >',
 '< ( { [ { } ] } [ ] [

In [330]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
for instance in new_ds:
    print(instance["target"])
    print(instance["trajectory"])
    print("="*50)

# formal_fallacies

In [67]:
chk_dir = here(os.path.join(par_dir, "bbh-formal_fallacies/bbh_eval"))

In [68]:
dataset = Dataset.load_from_disk(chk_dir)

In [69]:
set(dataset["answer_pred"])

{'invalid**.', 'invalid.', 'valid**.', 'valid.'}

In [72]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"*')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'invalid', 'valid'}

In [73]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [74]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.824

# geometric_shapes

In [75]:
chk_dir = here(os.path.join(par_dir, "bbh-geometric_shapes/bbh_eval"))

In [76]:
dataset = Dataset.load_from_disk(chk_dir)

In [77]:
set(dataset["answer_pred"])

{'(A) circle.',
 '(B) heptagon.',
 '(C) hexagon.',
 '(D) kite.',
 '(G) pentagon.',
 '(I) sector.',
 '(J) triangle.',
 '(K) ellipse.',
 '(K) trapezoid.',
 'A.',
 'B.',
 'C.',
 'D.',
 'E.',
 'F.',
 'G.',
 'H.',
 'I.',
 'J.',
 'K.'}

In [78]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A circle',
 'B',
 'B heptagon',
 'C',
 'C hexagon',
 'D',
 'D kite',
 'E',
 'F',
 'G',
 'G pentagon',
 'H',
 'I',
 'I sector',
 'J',
 'J triangle',
 'K',
 'K ellipse',
 'K trapezoid'}

In [79]:
def map_fn(ins):   
    ls = ins["answer_pred"].split()
    
    return {
        "answer_pred": ls[0]
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K'}

In [80]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [81]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.712

# hyperbaton

In [82]:
chk_dir = here(os.path.join(par_dir, "bbh-hyperbaton/bbh_eval"))

In [83]:
dataset = Dataset.load_from_disk(chk_dir)

In [84]:
set(dataset["answer_pred"])

{'(A).', 'A.', 'A.**', 'B.', 'B.**', None}

In [85]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()*"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', None}

In [86]:
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
none_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 2
})

In [88]:
print(none_ds[1]["trajectory"])

To determine the correct adjective order, we need to follow the standard rules for adjective order in English: opinion, size, age, shape, color, origin, material, purpose, noun.

Let's break down the adjectives in each option and categorize them:

- enormous (size)
- circular (shape)
- Pakistani (origin)
- snorkeling (purpose)
- gold (material)
- yellow (color)

Now, let's apply the standard adjective order rules to each option:

Option (A): enormous circular Pakistani snorkeling gold yellow baby
- size, shape, origin, purpose, material, color

Option (B): enormous circular yellow Pakistani gold snorkeling baby
- size, shape, color, origin, material, purpose

Comparing the arranged adjectives in each option with the standard adjective order, we can see that Option (A) follows the standard order more closely.

Therefore, the final answer is (A).


In [89]:
def map_fn(ins):
    if not ins["answer_pred"]:
        return {
            "answer_pred": "A"
        }

    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B'}

In [90]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [91]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.98

# logical_deduction_five_objects

In [92]:
chk_dir = here(os.path.join(par_dir, "bbh-logical_deduction_five_objects/bbh_eval"))

In [93]:
dataset = Dataset.load_from_disk(chk_dir)

In [94]:
set(dataset["answer_pred"])

{'(A) Amy finished first.',
 '(A) Amy finished third.',
 '(A) Eve finished third.',
 '(A) Joe finished second.',
 '(A) Mel finished first.',
 '(A) Rob finished first.',
 '(A) Rob finished last.',
 '(A) The apples are the second-cheapest.',
 '(A) The black book is the leftmost.',
 '(A) The black book is the second from the left.',
 '(A) The blue book is the leftmost.',
 '(A) The brown book is the third from the left.',
 '(A) The bus is the oldest.',
 '(A) The crow is the second from the left.',
 '(A) The falcon is the leftmost.',
 '(A) The gray book is the second from the left.',
 '(A) The gray book is the third from the left.',
 '(A) The green book is the rightmost.',
 '(A) The loquats are the cheapest.',
 '(A) The mangoes are the cheapest.',
 '(A) The oranges are the cheapest.',
 '(A) The owl is the leftmost.',
 '(A) The owl is the second from the left.',
 '(A) The owl is the second from the right.',
 '(A) The sedan is the second-newest.',
 '(A) The tractor is the oldest.',
 '(A) The 

In [95]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A Amy finished first',
 'A Amy finished third',
 'A Eve finished third',
 'A Joe finished second',
 'A Mel finished first',
 'A Rob finished first',
 'A Rob finished last',
 'A The apples are the second-cheapest',
 'A The black book is the leftmost',
 'A The black book is the second from the left',
 'A The blue book is the leftmost',
 'A The brown book is the third from the left',
 'A The bus is the oldest',
 'A The crow is the second from the left',
 'A The falcon is the leftmost',
 'A The gray book is the second from the left',
 'A The gray book is the third from the left',
 'A The green book is the rightmost',
 'A The loquats are the cheapest',
 'A The mangoes are the cheapest',
 'A The oranges are the cheapest',
 'A The owl is the leftmost',
 'A The owl is the second from the left',
 'A The owl is the second from the right',
 'A The sedan is the second-newest',
 'A The tractor is the oldest',
 'A The tractor is the second-newest',
 'A The truck is the third-newest',
 'A The

In [96]:
def map_fn(ins):
    return {
        "answer_pred": ins["answer_pred"].split()[0]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C', 'D', 'E'}

In [97]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [98]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.94

# logical_deduction_seven_objects

In [99]:
chk_dir = here(os.path.join(par_dir, "bbh-logical_deduction_seven_objects/bbh_eval"))

In [100]:
dataset = Dataset.load_from_disk(chk_dir)

In [101]:
set(dataset["answer_pred"])

{'(A) Ana finished fourth.',
 '(A) Eve finished third-to-last.',
 '(A) Eve finished third.',
 '(A) Joe finished first.',
 '(A) Joe finished fourth.',
 '(A) Joe finished last.',
 '(A) Joe finished second-to-last.',
 '(A) The apples are the most expensive.',
 '(A) The black book is the third from the left.',
 '(A) The blue jay is the fourth from the left.',
 '(A) The bus is the newest.',
 '(A) The cardinal is the third from the left.',
 '(A) The falcon is the third from the left.',
 '(A) The hatchback is the second-oldest.',
 '(A) The hummingbird is the second from the right.',
 '(A) The limousine is the fourth-newest.',
 '(A) The limousine is the second-oldest.',
 '(A) The limousine is the third-oldest.',
 '(A) The motorcycle is the oldest.',
 '(A) The owl is the third from the left.',
 '(A) The purple book is the third from the left.',
 '(A) The raven is the second from the left.',
 '(A) The robin is the fourth from the left.',
 '(A) The sedan is the fourth-newest.',
 '(A) The station 

In [102]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A Ana finished fourth',
 'A Eve finished third',
 'A Eve finished third-to-last',
 'A Joe finished first',
 'A Joe finished fourth',
 'A Joe finished last',
 'A Joe finished second-to-last',
 'A The apples are the most expensive',
 'A The black book is the third from the left',
 'A The blue jay is the fourth from the left',
 'A The bus is the newest',
 'A The cardinal is the third from the left',
 'A The falcon is the third from the left',
 'A The hatchback is the second-oldest',
 'A The hummingbird is the second from the right',
 'A The limousine is the fourth-newest',
 'A The limousine is the second-oldest',
 'A The limousine is the third-oldest',
 'A The motorcycle is the oldest',
 'A The owl is the third from the left',
 'A The purple book is the third from the left',
 'A The raven is the second from the left',
 'A The robin is the fourth from the left',
 'A The sedan is the fourth-newest',
 'A The station wagon is the fourth-newest',
 'A The white book is the fourth from t

In [103]:
def map_fn(ins):
    return {
        "answer_pred": ins["answer_pred"].split()[0]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C', 'D', 'E', 'F', 'G'}

In [104]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [105]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.896

# logical_deduction_three_objects

In [106]:
chk_dir = here(os.path.join(par_dir, "bbh-logical_deduction_three_objects/bbh_eval"))

In [107]:
dataset = Dataset.load_from_disk(chk_dir)

In [108]:
set(dataset["answer_pred"])

{'(A) Amy finished first.',
 '(A) Amy finished last.**',
 '(A) Amy finished second.',
 '(A) Ana finished last.',
 '(A) Ana finished second.',
 '(A) Eli finished first.',
 '(A) Eli finished last.',
 '(A) Eve finished first.',
 '(A) Eve finished second.',
 '(A) Joe finished first.',
 '(A) The apples are the cheapest.',
 '(A) The black book is the leftmost.',
 '(A) The blue book is the rightmost.',
 '(A) The blue jay is the second from the left.',
 '(A) The convertible is the newest.',
 '(A) The crow is the leftmost.',
 '(A) The falcon is the leftmost.',
 '(A) The gray book is the leftmost.',
 '(A) The green book is the leftmost.',
 '(A) The hatchback is the second-newest.',
 '(A) The hawk is the second from the left.',
 '(A) The hummingbird is the leftmost.',
 '(A) The limousine is the second-newest.',
 '(A) The loquats are the most expensive.',
 '(A) The motorcycle is the oldest.',
 '(A) The orange book is the leftmost.',
 '(A) The owl is the second from the left.',
 '(A) The peaches ar

In [109]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A Amy finished first',
 'A Amy finished last**',
 'A Amy finished second',
 'A Ana finished last',
 'A Ana finished second',
 'A Eli finished first',
 'A Eli finished last',
 'A Eve finished first',
 'A Eve finished second',
 'A Joe finished first',
 'A The apples are the cheapest',
 'A The black book is the leftmost',
 'A The blue book is the rightmost',
 'A The blue jay is the second from the left',
 'A The convertible is the newest',
 'A The crow is the leftmost',
 'A The falcon is the leftmost',
 'A The gray book is the leftmost',
 'A The green book is the leftmost',
 'A The hatchback is the second-newest',
 'A The hawk is the second from the left',
 'A The hummingbird is the leftmost',
 'A The limousine is the second-newest',
 'A The loquats are the most expensive',
 'A The motorcycle is the oldest',
 'A The orange book is the leftmost',
 'A The owl is the second from the left',
 'A The peaches are the most expensive',
 'A The pears are the second-most expensive',
 'A The 

In [110]:
def map_fn(ins):
    return {
        "answer_pred": ins["answer_pred"].split()[0]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C'}

In [111]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [112]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.996

# movie_recommendation

In [113]:
chk_dir = here(os.path.join(par_dir, "bbh-movie_recommendation/bbh_eval"))

In [114]:
dataset = Dataset.load_from_disk(chk_dir)

In [115]:
set(dataset["answer_pred"])

{'(B) The House.',
 'A.',
 'B.',
 'C.',
 'D.',
 'E.',
 'None of the options are similar to the given movies.',
 'None of the options are similar.',
 'None of the options.',
 'None.',
 'that none of the options are a perfect match.',
 'that none of the options provided are similar to the given movies based on the identified criteria.'}

In [116]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'B',
 'B The House',
 'C',
 'D',
 'E',
 'None',
 'None of the options',
 'None of the options are similar',
 'None of the options are similar to the given movies',
 'that none of the options are a perfect match',
 'that none of the options provided are similar to the given movies based on the identified criteria'}

In [118]:
def map_fn(ins):
    ls = ins["answer_pred"].split()
    if ls[0] in ["A", "B", "C", "D", "E"]:
        return {
            "answer_pred": ls[0]
        }

    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'B',
 'C',
 'D',
 'E',
 'None',
 'None of the options',
 'None of the options are similar',
 'None of the options are similar to the given movies',
 'that none of the options are a perfect match',
 'that none of the options provided are similar to the given movies based on the identified criteria'}

In [119]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [120]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.688

# penguins_in_a_table

In [121]:
chk_dir = here(os.path.join(par_dir, "bbh-penguins_in_a_table/bbh_eval"))

In [122]:
dataset = Dataset.load_from_disk(chk_dir)

In [123]:
set(dataset["answer_pred"])

{'(A) Louis.',
 '(A).',
 '(C) Vincent.',
 '(D) Gwen.',
 '(E) James.',
 'A.',
 'B.',
 'C.',
 'D.',
 'E.'}

In [124]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

{'A', 'A Louis', 'B', 'C', 'C Vincent', 'D', 'D Gwen', 'E', 'E James'}

In [125]:
def map_fn(ins):
    return {
        "answer_pred": ins["answer_pred"].split()[0]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

{'A', 'B', 'C', 'D', 'E'}

In [126]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/146 [00:00<?, ? examples/s]

In [127]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/146 [00:00<?, ? examples/s]

0.9794520547945206

# reasoning_about_colored_objects

In [128]:
chk_dir = here(os.path.join(par_dir, "bbh-reasoning_about_colored_objects/bbh_eval"))

In [129]:
dataset = Dataset.load_from_disk(chk_dir)

In [130]:
set(dataset["answer_pred"])

{'(A) red.',
 '(A) yes.',
 '(A) zero.',
 '(A).',
 '(B) no.',
 '(C) two.',
 '(D).',
 '(F) five.',
 '(G) six.',
 '(I) mauve.',
 '(I).',
 '(J) teal.',
 '(K) ten.',
 '(L) burgundy.',
 '(M) silver.',
 '(O) black.',
 '(Q) purple.',
 '(Q).',
 '(R) pink.',
 '(R).',
 'A.',
 'B.',
 'C.',
 'D.',
 'E.',
 'F.',
 'G.',
 'H.',
 'I.',
 'J.',
 'K.',
 'L.',
 'M.',
 'N.',
 'O.',
 'P.',
 'Q.',
 'R.'}

In [131]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A red',
 'A yes',
 'A zero',
 'B',
 'B no',
 'C',
 'C two',
 'D',
 'E',
 'F',
 'F five',
 'G',
 'G six',
 'H',
 'I',
 'I mauve',
 'J',
 'J teal',
 'K',
 'K ten',
 'L',
 'L burgundy',
 'M',
 'M silver',
 'N',
 'O',
 'O black',
 'P',
 'Q',
 'Q purple',
 'R',
 'R pink'}

In [132]:
def map_fn(ins):
    return {
        "answer_pred": ins["answer_pred"].split()[0]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R'}

In [133]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [134]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.972

# ruin_names

In [135]:
chk_dir = here(os.path.join(par_dir, "bbh-ruin_names/bbh_eval"))

In [136]:
dataset = Dataset.load_from_disk(chk_dir)

In [137]:
set(dataset["answer_pred"])

{'(A) the colt and (C) the cut.', '(A).', 'A.', 'B.', 'C.', 'D.', 'E.'}

In [138]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'A the colt and C the cut', 'B', 'C', 'D', 'E'}

In [139]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [140]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.74

# salient_translation_error_detection

In [141]:
chk_dir = here(os.path.join(par_dir, "bbh-salient_translation_error_detection/bbh_eval"))

In [142]:
dataset = Dataset.load_from_disk(chk_dir)

In [143]:
set(dataset["answer_pred"])

{'(A) Modifiers or Adjectives.',
 '(B) Numerical Values.',
 '(C) Negation or Antonyms.',
 '(D) Named Entities.',
 '(E) Dropped Content.',
 '(F) Facts.',
 'A.',
 'B.',
 'C.',
 'D.',
 'E.',
 'F.'}

In [144]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A Modifiers or Adjectives',
 'B',
 'B Numerical Values',
 'C',
 'C Negation or Antonyms',
 'D',
 'D Named Entities',
 'E',
 'E Dropped Content',
 'F',
 'F Facts'}

In [147]:
def map_fn(ins):
    return {
        "answer_pred": ins["answer_pred"].split()[0]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C', 'D', 'E', 'F'}

In [148]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [149]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.68

# snarks

In [179]:
chk_dir = here(os.path.join(par_dir, "bbh-snarks/bbh_eval"))

In [180]:
dataset = Dataset.load_from_disk(chk_dir)

In [181]:
set(dataset["answer_pred"])

{'A or B.', 'A.', 'B**.', 'B.', 'B.**', 'both (A) and (B).', 'not sarcastic.'}

In [182]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()*"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

{'A', 'A or B', 'B', 'both A and B', 'not sarcastic'}

In [174]:
def map_fn(ins):
    if "B Statement B is sarcastic" in ins["answer_pred"]:
        return {
            "answer_pred": "B"
        }

    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

{'A',
 'B',
 'Both statements are sarcastic',
 'Neither statement is sarcastic',
 'both statements A and B are sarcastic**',
 'that the statement A The NB cannot be determined as sarcastic due to insufficient context'}

In [186]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/178 [00:00<?, ? examples/s]

In [187]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/178 [00:00<?, ? examples/s]

0.8707865168539326

# sports_understanding

In [188]:
chk_dir = here(os.path.join(par_dir, "bbh-sports_understanding/bbh_eval"))

In [189]:
dataset = Dataset.load_from_disk(chk_dir)

In [190]:
set(dataset["answer_pred"])

{'False.',
 'Implausible.',
 None,
 'True.',
 'implausible.',
 'not True or False. A: False',
 'not plausible.'}

In [192]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,:()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'False',
 'Implausible',
 None,
 'True',
 'implausible',
 'not True or False A False',
 'not plausible'}

In [181]:
none_ds = 

{'False',
 'No',
 None,
 'Plausible',
 'The sentence is not plausible',
 'The sentence is plausible',
 "The sentence is plausible if interpreted as Nerlens Noel being outside his house but within his home's premises",
 'True',
 'Yes',
 '[answer]',
 'not plausible',
 'that the sentence Marouane Fellaini scored in extra time is plausible based on the structured analysis and available data',
 "that the sentence is plausible with a correction: 'Michael Conforto committed a three second violation'"}

In [193]:
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
none_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 26
})

In [196]:
index = 0
print(none_ds[0]["trajectory"])

To determine the plausibility of the statement "Mikal Bridges scored a windmill dunk," we will follow the reasoning plan step-by-step:

1. **Identify Key Assumptions**
   - Mikal Bridges has the physical capabilities to perform a windmill dunk.
   - Mikal Bridges' playing style includes high-flying dunks.
   - There is evidence of Mikal Bridges performing similar dunks in the past.

2. **Gather Relevant Data**
   - Mikal Bridges is known for his athleticism and defensive prowess.
   - He has a vertical leap that is above average for NBA players.
   - Highlight reels show Mikal Bridges performing various types of dunks, including some that require significant athleticism.

3. **Analyze Data**
   - Mikal Bridges' physical attributes and athleticism suggest he has the capability to perform a windmill dunk.
   - His playing style includes a variety of dunks, indicating he is comfortable with high-flying maneuvers.

4. **Evaluate from Different Perspectives**
   - Physical capabilities: Mik

In [194]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'False',
 'Implausible',
 'Plausible.',
 'The plausibility of the sentence "Juan Soto did a double stepover" is uncertain without further evidence.',
 'The plausibility of the sentence cannot be determined with the information provided.',
 'The plausibility of the statement "Deshaun Watson was flagged on the play" is uncertain without specific data.',
 'The plausibility of the statement "Jonas Valanciunas beat the buzzer" depends on the analysis of relevant data and critical thinking as outlined above.',
 'The sentence "Dani Alves took the snap" is not plausible in a sports context.',
 'The sentence "Kendrick Nunn took a charge" is plausible.',
 'The sentence "Kyle Tucker took a left footed shot" is plausible, assuming Kyle Tucker is a soccer player and considering the general possibility of players using their non-dominant foot.',
 'The sentence "Toni Kroos performed a give and go" is plausible.',
 'The sentence is not plausible.',
 'The sentence is plausible but not definitively tru

In [195]:
new_ds.filter(lambda x: x["answer_pred"] == None)

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 0
})

In [197]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'False',
 'Implausible',
 'Plausible',
 'The plausibility of the sentence Juan Soto did a double stepover is uncertain without further evidence',
 'The plausibility of the sentence cannot be determined with the information provided',
 'The plausibility of the statement Deshaun Watson was flagged on the play is uncertain without specific data',
 'The plausibility of the statement Jonas Valanciunas beat the buzzer depends on the analysis of relevant data and critical thinking as outlined above',
 'The sentence Dani Alves took the snap is not plausible in a sports context',
 'The sentence Kendrick Nunn took a charge is plausible',
 'The sentence Kyle Tucker took a left footed shot is plausible assuming Kyle Tucker is a soccer player and considering the general possibility of players using their non-dominant foot',
 'The sentence Toni Kroos performed a give and go is plausible',
 'The sentence is not plausible',
 'The sentence is plausible',
 'The sentence is plausible but not definitivel

{'False',
 'No',
 'Plausible',
 'The plausibility of the sentence James Karinchak crossed the blue line remains uncertain without specific context or evidence',
 'The plausibility of the sentence is uncertain without concrete data',
 'The plausibility of the statement Malcolm Brogdon drove into the restricted area in the Eastern Conference Finals can be determined by following the outlined steps to verify the relevant data and context',
 "The sentence 'Patrice Bergeron took a backhand shot' is plausible",
 'The sentence Allen Robinson gained five yards is plausible',
 'The sentence Andres Iniesta performed a give and go is plausible',
 'The sentence Angel Di Maria scored in extra time is plausible if verified by match records',
 'The sentence Anthony Davis beat the buzzer is plausible',
 "The sentence Bastian Schweinsteiger scored in added time is plausible based on the historical context of his career the definition of 'added time' in soccer and the likelihood of him scoring in such s

In [198]:
# Plausible (Yes)
plausible_yes = [
    'Plausible',
    'The sentence Kendrick Nunn took a charge is plausible',
    'The sentence Kyle Tucker took a left footed shot is plausible assuming Kyle Tucker is a soccer player and considering the general possibility of players using their non-dominant foot',
    'The sentence Toni Kroos performed a give and go is plausible',
    'The sentence is plausible',
    'The sentence is plausible but not definitively true without additional context or information',
    'The sentence is plausible if Brandon Lowe is a soccer player but not if he is a baseball player',
    'The sentence is plausible if Elias Lindholm is a hockey player and the context is a hockey game',
    'The sentence is plausible in a sports context',
    'The statement Luis Robert was out at second is plausible',
    'The statement is plausible',
    'The statement is plausible if supported by relevant evidence',
    'True'
]

# Implausible (No)
implausible_no = [
    'False',
    'Implausible',
    'The sentence Dani Alves took the snap is not plausible in a sports context',
    'The sentence is not plausible',
    'implausible',
    'not plausible'
]

# Indeterminate
indeterminate = [
    'The plausibility of the sentence Juan Soto did a double stepover is uncertain without further evidence',
    'The plausibility of the sentence cannot be determined with the information provided',
    'The plausibility of the statement Deshaun Watson was flagged on the play is uncertain without specific data',
    'The plausibility of the statement Jonas Valanciunas beat the buzzer depends on the analysis of relevant data and critical thinking as outlined above',
    'The plausibility of the sentence is plausible if Brandon Lowe is a soccer player but not if he is a baseball player',
    'The plausibility of the sentence is plausible if Elias Lindholm is a hockey player and the context is a hockey game',
    'The plausibility of the sentence is plausible if supported by relevant evidence',
    'not True or False A False'
]


def map_fn(ins):
    for yes in plausible_yes:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "yes"
            }

    for no in implausible_no:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "no"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'The plausibility of the sentence Juan Soto did a double stepover is uncertain without further evidence',
 'The plausibility of the sentence cannot be determined with the information provided',
 'The plausibility of the statement Deshaun Watson was flagged on the play is uncertain without specific data',
 'The plausibility of the statement Jonas Valanciunas beat the buzzer depends on the analysis of relevant data and critical thinking as outlined above',
 'no',
 'not True or False A False',
 'yes'}

{'The plausibility of the sentence James Karinchak crossed the blue line remains uncertain without specific context or evidence',
 'The plausibility of the sentence is uncertain without concrete data',
 'The plausibility of the statement Malcolm Brogdon drove into the restricted area in the Eastern Conference Finals can be determined by following the outlined steps to verify the relevant data and context',
 "The sentence Paulinho earned an indirect kick in the FA Cup is plausible if all the assumptions and considerations align with the available data and football rules However without specific data on Paulinho's participation in the FA Cup and the exact context of the indirect kick the plausibility remains uncertain",
 'The sentence is plausible',
 'The sentence is plausible but highly unlikely',
 "The sentence is plausible if 'in the Stanley Cup' is interpreted as referring to the championship series rather than the physical trophy",
 "The sentence is plausible if 'the screen' refers 

In [199]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [200]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.784

# temporal_sequences

In [201]:
chk_dir = here(os.path.join(par_dir, "bbh-temporal_sequences/bbh_eval"))

In [202]:
dataset = Dataset.load_from_disk(chk_dir)

In [203]:
set(dataset["answer_pred"])

{'(A) 10am to 12pm.',
 '(A) 11am to 1pm.',
 '(A) 12pm to 1pm.',
 '(A) 12pm to 2pm.',
 '(A) 1pm to 2pm and (B) 5pm to 6pm.',
 '(A) 2pm to 7pm.',
 '(A) 3pm to 4pm.',
 '(A) 3pm to 6pm.',
 '(A) 5am to 6am.',
 '(A) 5am to 7am.',
 '(A) 5pm to 9pm.',
 '(A) 6am to 10am.',
 '(A) 6am to 11am.',
 '(A) 6am to 7am.',
 '(A) 6pm to 7pm.',
 '(A) 7am to 8am.',
 '(A) 8am to 10am.',
 '(A) 8am to 9am.',
 '(A) 8pm to 9pm.',
 '(A) 9am to 11am.',
 '(A) 9am to 2pm.',
 '(A) 9am to 4pm.',
 '(A) 9am to 5pm.',
 '(A).',
 '(B) 11am to 12pm.',
 '(B) 2pm to 9pm.',
 '(B) 4pm to 5pm and (C) 2pm to 4pm.',
 '(B) 5am to 11am.',
 '(B) 5pm to 8pm.',
 '(B) 7am to 9am.',
 '(C) 10am to 12pm.',
 '(C) 1pm to 2pm.',
 '(C) 3pm to 4pm.',
 '(C) 4pm to 5pm.',
 '(C) 5am to 7am.',
 '(C) 6am to 7am.',
 '(C) 6am to 8am.',
 '(C) 7am to 10am.',
 '(C) 7am to 11am.',
 '(C) 8am to 10am.',
 '(C) 9am to 10am.',
 '(C) 9am to 11am.',
 '(D) 11am to 2pm.',
 '(D) 12pm to 4pm.',
 '(D) 1pm to 3pm.',
 '(D) 1pm to 6pm.',
 '(D) 2pm to 3pm.',
 '(D) 2pm to

In [204]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()}"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A 10am to 12pm',
 'A 11am to 1pm',
 'A 12pm to 1pm',
 'A 12pm to 2pm',
 'A 1pm to 2pm and B 5pm to 6pm',
 'A 2pm to 7pm',
 'A 3pm to 4pm',
 'A 3pm to 6pm',
 'A 5am to 6am',
 'A 5am to 7am',
 'A 5pm to 9pm',
 'A 6am to 10am',
 'A 6am to 11am',
 'A 6am to 7am',
 'A 6pm to 7pm',
 'A 7am to 8am',
 'A 8am to 10am',
 'A 8am to 9am',
 'A 8pm to 9pm',
 'A 9am to 11am',
 'A 9am to 2pm',
 'A 9am to 4pm',
 'A 9am to 5pm',
 'A and D',
 'A or D',
 'B',
 'B 11am to 12pm',
 'B 2pm to 9pm',
 'B 4pm to 5pm and C 2pm to 4pm',
 'B 5am to 11am',
 'B 5pm to 8pm',
 'B 7am to 9am',
 'C',
 'C 10am to 12pm',
 'C 1pm to 2pm',
 'C 3pm to 4pm',
 'C 4pm to 5pm',
 'C 5am to 7am',
 'C 6am to 7am',
 'C 6am to 8am',
 'C 7am to 10am',
 'C 7am to 11am',
 'C 8am to 10am',
 'C 9am to 10am',
 'C 9am to 11am',
 'D',
 'D 11am to 2pm',
 'D 12pm to 4pm',
 'D 1pm to 3pm',
 'D 1pm to 6pm',
 'D 2pm to 3pm',
 'D 2pm to 4pm',
 'D 3pm to 6pm',
 'D 4pm to 7pm',
 'D 5am to 11am',
 'D 5am to 6am',
 'D 5pm to 6pm',
 'D 6am to 2p

In [205]:
def map_fn(ins):
    sp = ins["answer_pred"].split()
    if len(sp) == 4 or len(sp) == 1:
        return {
            "answer_pred": sp[0]
        }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A 1pm to 2pm and B 5pm to 6pm',
 'A and D',
 'A or D',
 'B',
 'B 4pm to 5pm and C 2pm to 4pm',
 'C',
 'D'}

In [206]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [207]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.972

# tracking_shuffled_objects_five_objects

In [208]:
chk_dir = here(os.path.join(par_dir, "bbh-tracking_shuffled_objects_five_objects/bbh_eval"))
chk_dir

PosixPath('/home/ubuntu/jupyter/struct-vs-unstruct/struct_vs_unstruct/data/mistral_large_2407/modified/non_self_synthesis/bbh/bbh-tracking_shuffled_objects_five_objects/bbh_eval')

In [209]:
dataset = Dataset.load_from_disk(chk_dir)

In [210]:
set(dataset["answer_pred"])

{'(A) Catch-22.',
 '(A) Helga.',
 '(A) Izzi.',
 '(A) Jamie.',
 '(A) Moby Dick.',
 '(A) The Great Gatsby.',
 '(A) Ulysses.',
 '(A) benchwarmer.',
 '(A) black ball.',
 '(A) brown ball.',
 '(A) brown present.',
 '(A) fullback.',
 '(A) green ball.',
 '(A) green present.',
 '(A) orange ball.',
 '(A) red ball.',
 '(A) red present.',
 '(A) right winger.',
 '(B) Catch-22.',
 '(B) Izzi.',
 '(B) Jamie.',
 '(B) Karl.',
 '(B) Moby Dick.',
 '(B) Patrick.',
 '(B) Sam.',
 '(B) Ulysses.',
 '(B) black ball.',
 '(B) brown present.',
 '(B) center midfielder.',
 '(B) orange ball.',
 '(B) red present.',
 '(B) right winger.',
 '(B) striker.',
 '(B) white present.',
 '(C) Catch-22.',
 '(C) Frankenstein.',
 '(C) Helga.',
 '(C) Hound of the Baskervilles.',
 '(C) Lola.',
 '(C) Melissa.',
 '(C) Patrick.',
 '(C) benchwarmer.',
 '(C) blue present.',
 '(C) fullback.',
 '(C) green ball.',
 '(C) green present.',
 '(C) red ball.',
 '(C) right winger.',
 '(C) striker.',
 '(D)',
 '(D) Frankenstein.',
 '(D) Helga.',
 '(D

In [211]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A Catch-22',
 'A Helga',
 'A Izzi',
 'A Jamie',
 'A Moby Dick',
 'A The Great Gatsby',
 'A Ulysses',
 'A benchwarmer',
 'A black ball',
 'A brown ball',
 'A brown present',
 'A fullback',
 'A green ball',
 'A green present',
 'A orange ball',
 'A red ball',
 'A red present',
 'A right winger',
 'B',
 'B Catch-22',
 'B Izzi',
 'B Jamie',
 'B Karl',
 'B Moby Dick',
 'B Patrick',
 'B Sam',
 'B Ulysses',
 'B black ball',
 'B brown present',
 'B center midfielder',
 'B orange ball',
 'B red present',
 'B right winger',
 'B striker',
 'B white present',
 'C',
 'C Catch-22',
 'C Frankenstein',
 'C Helga',
 'C Hound of the Baskervilles',
 'C Lola',
 'C Melissa',
 'C Patrick',
 'C benchwarmer',
 'C blue present',
 'C fullback',
 'C green ball',
 'C green present',
 'C red ball',
 'C right winger',
 'C striker',
 'D',
 'D Frankenstein',
 'D Helga',
 'D Hound of the Baskervilles',
 'D Jamie',
 'D Karl',
 'D Ophelia',
 'D Patrick',
 'D The Fellowship of the Ring',
 'D Ulysses',
 'D benchwa

In [212]:
def map_fn(ins):    
    sp = ins["answer_pred"].split()
    return {
        "answer_pred": sp[0]
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C', 'D', 'E'}

In [213]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [214]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.984

# tracking_shuffled_objects_seven_objects

In [215]:
chk_dir = here(os.path.join(par_dir, "bbh-tracking_shuffled_objects_seven_objects/bbh_eval"))

In [216]:
dataset = Dataset.load_from_disk(chk_dir)

In [217]:
set(dataset["answer_pred"])

{'(A) Catch-22.',
 '(A) Frankenstein.',
 '(A) Lola.',
 '(A) Lolita.',
 '(A) Patrick.',
 '(A) Sam.',
 '(A) The Fellowship of the Ring.',
 '(A) The Pearl.',
 '(A) black ball.',
 '(A) blue present.',
 '(A) goalkeeper.',
 '(A) green ball.',
 '(A) green present.',
 '(A) left winger.',
 '(A) orange ball.',
 '(A) purple ball.',
 '(A) purple present.',
 '(A) red present.',
 '(A) striker.',
 '(A).',
 '(B) Izzi.',
 '(B) Lola.',
 '(B) The Odyssey.',
 '(B) blue ball.',
 '(B) goalkeeper.',
 '(B) right winger.',
 '(C) Jamie.',
 '(C) Lola.',
 '(C) Lolita.',
 '(C) Ophelia.',
 '(C) Patrick.',
 '(C) Ulysses.',
 '(C) blue ball.',
 '(C) brown ball.',
 '(C) brown present.',
 '(C) green ball.',
 '(C) purple ball.',
 '(C) red ball.',
 '(C) right midfielder.',
 '(C).',
 '(D) Izzi.',
 '(D) Lola.',
 '(D) Melissa.',
 '(D) Moby Dick.',
 '(D) Patrick.',
 '(D) Rodrigo.',
 '(D) Ulysses.',
 '(D) blue ball.',
 '(D) brown present.',
 '(D) green ball.',
 '(D) left midfielder.',
 '(D) pink ball.',
 '(D) purple ball.',
 '

In [218]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [219]:
set(new_ds["answer_pred"])

{'A',
 'A Catch-22',
 'A Frankenstein',
 'A Lola',
 'A Lolita',
 'A Patrick',
 'A Sam',
 'A The Fellowship of the Ring',
 'A The Pearl',
 'A black ball',
 'A blue present',
 'A goalkeeper',
 'A green ball',
 'A green present',
 'A left winger',
 'A orange ball',
 'A purple ball',
 'A purple present',
 'A red present',
 'A striker',
 'B',
 'B Izzi',
 'B Lola',
 'B The Odyssey',
 'B blue ball',
 'B goalkeeper',
 'B right winger',
 'C',
 'C Jamie',
 'C Lola',
 'C Lolita',
 'C Ophelia',
 'C Patrick',
 'C Ulysses',
 'C blue ball',
 'C brown ball',
 'C brown present',
 'C green ball',
 'C purple ball',
 'C red ball',
 'C right midfielder',
 'D',
 'D Izzi',
 'D Lola',
 'D Melissa',
 'D Moby Dick',
 'D Patrick',
 'D Rodrigo',
 'D Ulysses',
 'D blue ball',
 'D brown present',
 'D green ball',
 'D left midfielder',
 'D pink ball',
 'D purple ball',
 'D purple present',
 'D red',
 'D red ball',
 'D right midfielder',
 'D white ball',
 'E',
 'E Hound of the Baskervilles',
 'E Melissa',
 'E Patrick

In [220]:
def map_fn(ins):    
    sp = ins["answer_pred"].split()
    return {
        "answer_pred": sp[0]
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C', 'D', 'E', 'F', 'G'}

In [221]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [222]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.944

# tracking_shuffled_objects_three_objects

In [223]:
chk_dir = here(os.path.join(par_dir, "bbh-tracking_shuffled_objects_three_objects/bbh_eval"))

In [224]:
dataset = Dataset.load_from_disk(chk_dir)

In [225]:
set(dataset["answer_pred"])

{'(A) Catch-22.',
 '(A) Frankenstein.',
 '(A) Helga.',
 '(A) Hound of the Baskervilles.',
 '(A) Izzi.',
 '(A) Lola.',
 '(A) Lolita.',
 '(A) Melissa.',
 '(A) Ophelia.',
 '(A) Patrick.',
 '(A) Rodrigo.',
 '(A) Sam.',
 '(A) The Fellowship of the Ring.',
 '(A) The Great Gatsby.',
 '(A) Ulysses.',
 '(A) black ball.',
 '(A) blue present.',
 '(A) brown ball.',
 '(A) cheerleader.',
 '(A) goalkeeper.',
 '(A) green ball.',
 '(A) orange ball.',
 '(A) pink ball.',
 '(A) red ball.',
 '(A) red present.',
 '(A) right midfielder.',
 '(A) right winger.',
 '(A) striker.',
 '(A) white present.',
 '(A) yellow ball.',
 '(A) yellow present.',
 '(B) Frankenstein.',
 '(B) Helga.',
 '(B) Izzi.',
 '(B) Jamie.',
 '(B) Karl.',
 '(B) Moby Dick.',
 '(B) Patrick.',
 '(B) Rodrigo.',
 '(B) The Odyssey.',
 '(B) benchwarmer.',
 '(B) black ball.',
 '(B) blue present.',
 '(B) center midfielder.',
 '(B) left midfielder.',
 '(B) orange ball.',
 '(B) pink ball.',
 '(B) purple ball.',
 '(B) purple present',
 '(B) purple prese

In [226]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A Catch-22',
 'A Frankenstein',
 'A Helga',
 'A Hound of the Baskervilles',
 'A Izzi',
 'A Lola',
 'A Lolita',
 'A Melissa',
 'A Ophelia',
 'A Patrick',
 'A Rodrigo',
 'A Sam',
 'A The Fellowship of the Ring',
 'A The Great Gatsby',
 'A Ulysses',
 'A black ball',
 'A blue present',
 'A brown ball',
 'A cheerleader',
 'A goalkeeper',
 'A green ball',
 'A orange ball',
 'A pink ball',
 'A red ball',
 'A red present',
 'A right midfielder',
 'A right winger',
 'A striker',
 'A white present',
 'A yellow ball',
 'A yellow present',
 'B',
 'B Frankenstein',
 'B Helga',
 'B Izzi',
 'B Jamie',
 'B Karl',
 'B Moby Dick',
 'B Patrick',
 'B Rodrigo',
 'B The Odyssey',
 'B benchwarmer',
 'B black ball',
 'B blue present',
 'B center midfielder',
 'B left midfielder',
 'B orange ball',
 'B pink ball',
 'B purple ball',
 'B purple present',
 'B red present',
 'B right winger',
 'B white present',
 'C',
 'C Frankenstein',
 'C Hound of the Baskervilles',
 'C Izzi',
 'C Jamie',
 'C Karl',
 'C 

In [227]:
def map_fn(ins):    
    sp = ins["answer_pred"].split()
    return {
        "answer_pred": sp[0]
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C'}

In [228]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [229]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.992

# web_of_lies

In [230]:
chk_dir = here(os.path.join(par_dir, "bbh-web_of_lies/bbh_eval"))

In [231]:
dataset = Dataset.load_from_disk(chk_dir)

In [232]:
set(dataset["answer_pred"])

{'False.', None, 'True.', 'Yes.'}

In [233]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'False', None, 'True', 'Yes'}

In [234]:
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
none_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 11
})

In [238]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "the final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'Amberly tells the truth if Conception lies.',
 'Crista tells the truth.',
 'False',
 'It cannot be determined if Gwenn tells the truth.',
 'It cannot be determined whether Elanor tells the truth based on the given information.',
 'It cannot be determined whether Jaymie tells the truth or lies.',
 'It cannot be determined with the given information.',
 'Kandi does not tell the truth.',
 'Millicent tells the truth.',
 'Sima tells the truth.',
 'The problem is not solvable with the given information.',
 'True',
 'We cannot determine if Fletcher tells the truth.',
 'Yes'}

In [239]:
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
none_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 0
})

In [240]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'Amberly tells the truth if Conception lies',
 'Crista tells the truth',
 'False',
 'It cannot be determined if Gwenn tells the truth',
 'It cannot be determined whether Elanor tells the truth based on the given information',
 'It cannot be determined whether Jaymie tells the truth or lies',
 'It cannot be determined with the given information',
 'Kandi does not tell the truth',
 'Millicent tells the truth',
 'Sima tells the truth',
 'The problem is not solvable with the given information',
 'True',
 'We cannot determine if Fletcher tells the truth',
 'Yes'}

In [242]:
# Yes list
yes_ls = [
    'Crista tells the truth',
    'Millicent tells the truth',
    'Sima tells the truth',
    'True',
    'Yes'
]

# No list
no_ls = [
    'False',
    'Kandi does not tell the truth'
]

# Indeterminate list
indeterminate_ls = [
    'Amberly tells the truth if Conception lies',
    'It cannot be determined if Gwenn tells the truth',
    'It cannot be determined whether Elanor tells the truth based on the given information',
    'It cannot be determined whether Jaymie tells the truth or lies',
    'It cannot be determined with the given information',
    'The problem is not solvable with the given information',
    'We cannot determine if Fletcher tells the truth'
]

def map_fn(ins):
    for yes in yes_ls:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "Yes"
            }

    for no in no_ls:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "No"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'Amberly tells the truth if Conception lies',
 'It cannot be determined if Gwenn tells the truth',
 'It cannot be determined whether Elanor tells the truth based on the given information',
 'It cannot be determined whether Jaymie tells the truth or lies',
 'It cannot be determined with the given information',
 'No',
 'The problem is not solvable with the given information',
 'We cannot determine if Fletcher tells the truth',
 'Yes'}

In [243]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [244]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.892

# word_sorting

In [288]:
chk_dir = here(os.path.join(par_dir, "bbh-word_sorting/bbh_eval"))

In [289]:
dataset = Dataset.load_from_disk(chk_dir)

In [290]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.[]`"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn)

In [291]:
none_ds = new_ds.filter(lambda x: x["answer_pred"]==None)
none_ds

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 86
})

In [292]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

In [294]:
new_ds["target"][0]

'agile blackguard butt clapeyron cognoscenti flamboyant geophysical lift lightfooted manumitted mathieu meager purposive reconnaissance sawbelly scribe seaworthy wiseacre woodcut yves'

In [295]:
print(new_ds["trajectory"][3])

### Step-by-Step Reasoning Plan

1. **Understand the Task**:
   - The task is to sort a list of words alphabetically.
   - The list of words is: crossbill, bushland, capacitive, meager, ricochet, battery, enigma, wysiwyg, wallet, lipton, jane, wacke, contingent.

2. **Simplify the Sorting Task**:
   - Break down the list into smaller, manageable groups to make sorting easier.
   - For simplicity, divide the list into groups of 3-4 words each.

3. **Break Down the List**:
   - Divide the list into smaller groups:
     - Group 1: crossbill, bushland, capacitive
     - Group 2: meager, ricochet, battery
     - Group 3: enigma, wysiwyg, wallet
     - Group 4: lipton, jane, wacke
     - Group 5: contingent

4. **Sort Each Group Alphabetically**:
   - Sort Group 1: crossbill, bushland, capacitive
     - Sorted Group 1: bushland, capacitive, crossbill
   - Sort Group 2: meager, ricochet, battery
     - Sorted Group 2: battery, meager, ricochet
   - Sort Group 3: enigma, wysiwyg, wallet
     -

In [296]:
blank_ds = new_ds.filter(lambda x: x["answer_pred"] == "")
blank_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 36
})

In [297]:
print(blank_ds[4]["trajectory"])

### Step-by-Step Reasoning Plan

1. **Identify the Core Task**:
   - The core task is to sort the given list of words alphabetically.

2. **Simplify the Sorting Task**:
   - Break down the list of words into smaller, manageable groups to make the sorting process easier.

3. **Break Down the List**:
   - Divide the list into smaller groups. For example:
     - Group 1: sociolinguistic, nigeria, odysseus
     - Group 2: demystify, proton, voltaire
     - Group 3: penny, contaminate, bighorn
     - Group 4: stirrup

4. **Sort Each Group Alphabetically**:
   - Sort Group 1:
     - Compare and arrange: nigeria, odysseus, sociolinguistic
   - Sort Group 2:
     - Compare and arrange: demystify, proton, voltaire
   - Sort Group 3:
     - Compare and arrange: bighorn, contaminate, penny
   - Sort Group 4:
     - Since there is only one word, it remains as is: stirrup

5. **Merge Sorted Groups**:
   - Merge the sorted groups back into a single list while maintaining the alphabetical order:
    

In [298]:
non_blank_ds = new_ds.filter(lambda x: x["answer_pred"] != "")
non_blank_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 214
})

In [299]:
blank_ds.to_csv("./bbh_word_sorting_blank.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

332293

In [300]:
import json
file_name = "./project-6-at-2024-11-12-14-05-becaaf3a.json"
with open(file_name, "r") as f:
    blank_ds_ann = Dataset.from_list(json.load(f))

blank_ds_ann

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred', 'id', 'answer_pred_ann', 'annotator', 'annotation_id', 'created_at', 'updated_at', 'lead_time'],
    num_rows: 36
})

In [301]:
def map_fn(ins):
    return {
        "answer_pred": ins["answer_pred_ann"]
    }

blank_ds = blank_ds_ann.map(map_fn, remove_columns=['id', 'answer_pred_ann', 'annotator', 'annotation_id', 'created_at', 'updated_at', 'lead_time'])

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

In [302]:
blank_ds[5]["answer_pred"]

'caliber capricious eft faulkner fragile gastrointestinal headboard irishman kingsley lobby nary ouzo peaceable phillip phylum residue stamp sulfanilamide upholster'

In [303]:
from datasets import concatenate_datasets

new_ds = concatenate_datasets([non_blank_ds, blank_ds])
new_ds

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [305]:
none_ds = new_ds.filter(lambda x: x["answer_pred"]==None)
none_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 1
})

In [306]:
print(none_ds[0]["trajectory"])

Let's follow the reasoning plan step-by-step to sort the words alphabetically:

1. **Initial List**: condescend, rampant, percolate, coltish, date, rochester, placid, significant.

2. **First Pass**:
   - condescend vs. rampant: no swap
   - rampant vs. percolate: swap (percolate, rampant)
   - rampant vs. coltish: swap (coltish, rampant)
   - rampant vs. date: swap (date, rampant)
   - rampant vs. rochester: no swap
   - rochester vs. placid: swap (placid, rochester)
   - rochester vs. significant: no swap

   **List after First Pass**: condescend, percolate, coltish, date, rampant, placid, rochester, significant.

3. **Second Pass**:
   - condescend vs. percolate: no swap
   - percolate vs. coltish: swap (coltish, percolate)
   - percolate vs. date: swap (date, percolate)
   - percolate vs. rampant: no swap
   - rampant vs. placid: no swap
   - placid vs. rochester: no swap
   - rochester vs. significant: no swap

   **List after Second Pass**: condescend, coltish, date, percolate, r

In [307]:
def map_fn(ins):
    if ins["answer_pred"] == None:
        return {
            "answer_pred": "coltish date condescend percolate placid rampant rochester significant"
        }

    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
none_ds = new_ds.filter(lambda x: x["answer_pred"]==None)
none_ds

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 0
})

In [312]:
def map_fn(ins):
    return {
        "answer_pred": ins["answer_pred"].replace(",", "")
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [325]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [313]:
corr = new_ds.filter(lambda x: x["target"].lower() == x["answer_pred"].lower())
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.636

In [314]:
wro = new_ds.filter(lambda x: x["target"].lower() != x["answer_pred"].lower())
wro

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 91
})

In [324]:
index = 51

print(wro[index]["target"])
print("+"*80)
print(wro[index]["answer_pred"])

affable almost antic apache astute dandelion deadlock delphic execution fortunate horntail leverage levitate libertarian sanction scathe semitic storehouse sweeney unbeknownst
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
almost antic apache astute affable deadlock delphic dandelion execution fortunate horntail levitate leverage libertarian sanction scathe semitic storehouse sweeney unbeknownst


# multistep_arithmetic_two

In [150]:
chk_dir = here(os.path.join(par_dir, "bbh-multistep_arithmetic_two/bbh_eval"))

In [151]:
dataset = Dataset.load_from_disk(chk_dir)

In [155]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.[]*\()"`')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{' 108 ',
 '-1',
 '-10',
 '-11',
 '-110',
 '-113',
 '-114',
 '-11520',
 '-13',
 '-1300',
 '-1343',
 '-147',
 '-15',
 '-151',
 '-168',
 '-169',
 '-17',
 '-170',
 '-18',
 '-19',
 '-192',
 '-196',
 '-1960',
 '-20',
 '-21',
 '-2146',
 '-22',
 '-224',
 '-23',
 '-235',
 '-24',
 '-240',
 '-2448',
 '-25',
 '-26',
 '-262',
 '-264',
 '-3',
 '-30',
 '-3136',
 '-316',
 '-32',
 '-320',
 '-33',
 '-330',
 '-340',
 '-3400',
 '-343',
 '-35',
 '-37',
 '-38',
 '-3850',
 '-39',
 '-391',
 '-39960',
 '-4',
 '-43',
 '-46',
 '-48',
 '-5',
 '-50',
 '-51',
 '-52',
 '-5453',
 '-55',
 '-554',
 '-57',
 '-6',
 '-60',
 '-61',
 '-658',
 '-7',
 '-71',
 '-76',
 '-8',
 '-80',
 '-81',
 '-83',
 '-8304',
 '-84',
 '-8840',
 '-89',
 '-9',
 '-92',
 '-97',
 '-99',
 '0',
 '10',
 '101',
 '105',
 '107',
 '11',
 '11050',
 '1168',
 '12',
 '120',
 '128',
 '14',
 '140',
 '15',
 '150',
 '1608',
 '17',
 '18',
 '181',
 '198',
 '1980',
 '2',
 '20',
 '21',
 '211',
 '216',
 '220',
 '2240',
 '24',
 '25',
 '250992',
 '26',
 '2652',
 '27',
 '

In [157]:
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
none_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 2
})

In [160]:
print(none_ds[0]["trajectory"])
print(none_ds[1]["trajectory"])

Let's follow the reasoning plan step-by-step to solve the given arithmetic expression:

### Step 1: Simplify the Expression within the Parentheses

First, we simplify the expression inside the first set of parentheses:
\[ (-9 + -7 + -1 - -1) \]

- Combine like terms:
  \[ -9 + -7 + -1 - -1 = -9 - 7 - 1 + 1 \]

- Simplify `-1 - -1` to `0`:
  \[ -9 - 7 - 1 + 1 = -9 - 7 + 0 \]

- Combine the remaining terms:
  \[ -9 - 7 = -16 \]

So, the first part simplifies to:
\[ -16 \]

Next, we simplify the expression inside the second set of parentheses:
\[ (3 - -7 - -1 * 0) \]

- Simplify `-1 * 0` to `0`:
  \[ 3 - -7 - 0 = 3 - -7 \]

- Simplify `- -7` to `7`:
  \[ 3 + 7 \]

- Combine the terms:
  \[ 3 + 7 = 10 \]

So, the second part simplifies to:
\[ 10 \]

### Step 2: Break Down the Expression

Now the expression is simplified to:
\[ (-16) * (10) \]

### Step 3: Apply Order of Operations

Perform the multiplication:
\[ -16 * 10 = -160 \]

### Step 4: Create a Step-by-Step Plan

- Step 1: Simplify

In [162]:
def map_fn(ins):
    if "- Step 1: Simplify `(-9 + -7 + -1 - -1)` to `-16`." in ins["trajectory"]:
        return {
            "answer_pred": "-160"
        }

    if "\[ (-8 * 5 * -6 + 7) \]" in ins["trajectory"]:
        return {
            "answer_pred": "237"
        }

    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [163]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.92

In [164]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

# navigate

In [165]:
chk_dir = here(os.path.join(par_dir, "bbh-navigate/bbh_eval"))

In [166]:
dataset = Dataset.load_from_disk(chk_dir)

In [167]:
set(dataset["answer_pred"])

{'No.', 'Yes.', 'Yes.**'}

In [170]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.[]*"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'No', 'Yes'}

In [171]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.968

In [172]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

# object_counting

In [173]:
chk_dir = here(os.path.join(par_dir, "bbh-object_counting/bbh_eval"))

In [174]:
dataset = Dataset.load_from_disk(chk_dir)

In [175]:
set(dataset["answer_pred"])

{'10.',
 '11.',
 '12.',
 '13.',
 '14.',
 '15.',
 '16.',
 '17.',
 '18.',
 '2.',
 '3.',
 '4.',
 '5.',
 '6.',
 '7.',
 '8.',
 '9.'}

In [176]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.[]"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9'}

In [177]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.976

In [178]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]