In [1]:
from datasets import Dataset
from pyprojroot import here
import os

In [2]:
par_dir = here("struct_vs_unstruct/data/mistral_large_2407/original/non_self_synthesis/bbh/")
save_par_dir = here(os.path.join(par_dir, "refined"))
print(par_dir)
print(save_par_dir)

/home/ubuntu/jupyter/struct-vs-unstruct/struct_vs_unstruct/data/mistral_large_2407/original/non_self_synthesis/bbh
/home/ubuntu/jupyter/struct-vs-unstruct/struct_vs_unstruct/data/mistral_large_2407/original/non_self_synthesis/bbh/refined


# boolean_expressions

In [6]:
chk_dir = here(os.path.join(par_dir, "bbh-boolean_expressions/bbh_eval"))

In [5]:
dataset = Dataset.load_from_disk(chk_dir)

In [7]:
set(dataset["answer_pred"])

{'False"',
 'False.',
 'False."',
 'True"',
 'True.',
 'True."',
 'True.",',
 'not True or False."'}

In [10]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,"'))
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [11]:
set(new_ds["answer_pred"])

{'False', 'True', 'not True or False'}

In [12]:
new_ds.save_to_disk(os.path.join(save_par_dir, "boolean_expressions"))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [15]:
total = 0
for instance in new_ds:
    if instance["answer_pred"] == instance["target"]:
        total += 1

"Accuracy", total / new_ds.num_rows

('Accuracy', 0.956)

# causal_judgement

In [16]:
chk_dir = here(os.path.join(par_dir, "bbh-causal_judgement/bbh_eval"))

In [17]:
dataset = Dataset.load_from_disk(chk_dir)

In [18]:
set(dataset["answer_pred"])

{'No.', 'No."', 'Yes.', 'Yes."'}

In [19]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '."')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

In [20]:
set(new_ds["answer_pred"])

{'No', 'Yes'}

In [21]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/187 [00:00<?, ? examples/s]

0.7005347593582888

In [22]:
new_ds.save_to_disk(os.path.join(save_par_dir, "causal_judgement"))

Saving the dataset (0/1 shards):   0%|          | 0/187 [00:00<?, ? examples/s]

# date_understanding

In [23]:
chk_dir = here(os.path.join(par_dir, "bbh-date_understanding/bbh_eval"))

In [24]:
dataset = Dataset.load_from_disk(chk_dir)

In [25]:
set(dataset["answer_pred"])

{'(D)."',
 '(E).',
 '(E)."',
 '(F) 06/01/1943.',
 '(F) 11/30/2019."',
 '(F) 12/31/2014."',
 'A"',
 'A.',
 'A."',
 'B"',
 'B.',
 'B."',
 'C"',
 'C.',
 'C."',
 'D"',
 'D.',
 'D."',
 'E"',
 'E.',
 'E."',
 'F"',
 'F.',
 'F."',
 None,
 'None of the options match the calculated date.',
 'None of the options match the calculated date."',
 'None of the options match the new date.',
 'not listed among the given options.',
 'not listed among the provided options.',
 'not listed in the options."',
 'that none of the given options match the calculated date of 02/16/2010.',
 'that none of the given options match the calculated date."',
 'that none of the options match the calculated date 05/01/2021.',
 'that none of the options match the calculated date.',
 'that the calculated date 01/02/1963 does not match any of the provided options.'}

In [26]:
def map_fn(instance):
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [27]:
set(new_ds["answer_pred"])

{'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'F 06/01/1943',
 'F 11/30/2019',
 'F 12/31/2014',
 None,
 'None of the options match the calculated date',
 'None of the options match the new date',
 'not listed among the given options',
 'not listed among the provided options',
 'not listed in the options',
 'that none of the given options match the calculated date',
 'that none of the given options match the calculated date of 02/16/2010',
 'that none of the options match the calculated date',
 'that none of the options match the calculated date 05/01/2021',
 'that the calculated date 01/02/1963 does not match any of the provided options'}

In [30]:
def map_fn(ins):
    ls = ins["answer_pred"].split() if ins["answer_pred"] else [None]

    if ls[0] == "F":
        return {
            "answer_pred": "F"
        }

    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [31]:
set(new_ds["answer_pred"])

{'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 None,
 'None of the options match the calculated date',
 'None of the options match the new date',
 'not listed among the given options',
 'not listed among the provided options',
 'not listed in the options',
 'that none of the given options match the calculated date',
 'that none of the given options match the calculated date of 02/16/2010',
 'that none of the options match the calculated date',
 'that none of the options match the calculated date 05/01/2021',
 'that the calculated date 01/02/1963 does not match any of the provided options'}

In [32]:
ds_none = new_ds.filter(lambda x: x["answer_pred"] == None)
ds_none

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 3
})

In [35]:
ds_none[2]

{'input': 'Jane scheduled 3 apointments with 5 poeple for tomorrow (Tue, 7/9/1972). What is the date tomorrow in MM/DD/YYYY?\nOptions:\n(A) 07/09/1923\n(B) 08/05/1972\n(C) 09/01/1972\n(D) 07/09/1972\n(E) 07/09/2007\n(F) 09/04/1972',
 'target': '(D)',
 'reasoning_formats': '\n- If the answer is not multiple choice, [answer] should be the decided answer. (For eg: Q: not True or False. A: False)\n- If the answer is multiple choice,\n    - and the given choices are unlabelled options, [answer] should be the chosen option (For eg: Q: Where does the sun rise from? Options: - East, - West, - North. A: East)\n    - and the given choices are labelled options, [answer] should be the letter corresponding to the chosen option (For eg: Q: Where does the sun rise from? Options: - A. West, - B. East, - C. North. A: B)',
 'selected_modules': '16. What is the core issue or problem that needs to be addressed?\n20. Are there any relevant data or information that can provide insights into the problem? If 

In [37]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [38]:
total = 0

for instance in new_ds:
    if instance["answer_pred"] == instance["target"].translate(str.maketrans("", "", '()"')):
        total += 1

"Accuracy", total / new_ds.num_rows

('Accuracy', 0.816)

# disambiguation_qa

In [39]:
chk_dir = here(os.path.join(par_dir, "bbh-disambiguation_qa/bbh_eval"))

In [40]:
dataset = Dataset.load_from_disk(chk_dir)

In [41]:
set(dataset["answer_pred"])

{'(A) Alex sent the letter.',
 '(A) Alex sent the letter."',
 '(A) It is Sam\'s office"',
 '(A) It was the educator\'s grading policy."',
 '(A) The developer focuses on code.',
 '(A) The homeowner had purchased."',
 '(A) The lawyer needed to understand."',
 '(A) The mechanic was in a good mood.',
 '(A) The patient had a skin condition."',
 '(A) The scientist shares a story"',
 '(A) The surgeon needed more time.',
 '(A) The worker was repairing."',
 "(A) They were my parent's secretary.",
 '(A).',
 '(A)."',
 '(B)."',
 '(C) Ambiguous"',
 '(C) Ambiguous.',
 '(C) Ambiguous."',
 '(C).',
 'A"',
 'A.',
 'A."',
 'B"',
 'B.',
 'B. It is the chef\'s culinary training."',
 'B. The pronoun \'them\' refers to \'the editor\'."',
 'B."',
 'C"',
 'C.',
 'C."'}

In [42]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [43]:
set(new_ds["answer_pred"])

{'A',
 'A Alex sent the letter',
 "A It is Sam's office",
 "A It was the educator's grading policy",
 'A The developer focuses on code',
 'A The homeowner had purchased',
 'A The lawyer needed to understand',
 'A The mechanic was in a good mood',
 'A The patient had a skin condition',
 'A The scientist shares a story',
 'A The surgeon needed more time',
 'A The worker was repairing',
 "A They were my parent's secretary",
 'B',
 "B It is the chef's culinary training",
 "B The pronoun 'them' refers to 'the editor'",
 'C',
 'C Ambiguous'}

In [44]:
def map_fn(ins):
    ls = ins["answer_pred"].split()

    return {
        "answer_pred": ls[0]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C'}

In [45]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [46]:
total = 0

for instance in new_ds:
    if instance["answer_pred"] == instance["target"].translate(str.maketrans("", "", '()"')):
        total += 1

"Accuracy", total / new_ds.num_rows

('Accuracy', 0.692)

# dyck_languages

In [345]:
chk_dir = here(os.path.join(par_dir, "bbh-dyck_languages/bbh_eval"))

In [346]:
dataset = Dataset.load_from_disk(chk_dir)

In [347]:
none_ds = dataset.filter(lambda x: x["answer_pred"] == None)
none_ds

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 49
})

In [355]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = dataset.map(map_fn)

In [356]:
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
none_ds

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 0
})

In [357]:
set(new_ds["answer_pred"])

{'',
 '"< < [ ] >".',
 '"< [ ] { < ( ) > } [ ] ( { }".',
 '"[ < [ ] { { } { < ( { } ) > } } > ] ( { } )".',
 "'< < > >'.",
 "'< [ [ ] ] >'.",
 "'< [ ] >'.",
 '( ( ( ( [ [ < [ { { [ ] } } ] > ] ] ( ) ) ) ) ) ) ) )',
 '( ( ( ) ) )).',
 '( ( < < < ( ( ) ) ( [ ] ) > > { [ ] } ) ) > > >',
 '( ( < > ) ).',
 '( ( < { } > ) ) < { } >.',
 '( ( { } ) ).',
 '( ) ( () ).',
 '( ) ( < < { } > >.',
 '( ) [ ( [ < { { ( { } ) } } > ] ) ]',
 '( ) { < } >.',
 '( < < > > < > [ ] [ )',
 '( < > ( [ ( ) ] ) )',
 '( < [ ( ) ] > ).',
 '( < { } [ ] > ).',
 '( [ ( ) ] ).',
 '( [ < < { } > > ] ).',
 '( [ [ [ ( { ( ( < [ { < > } ] > { { [ ] } } ) ) } ( [ [ < > ] ] ) ) ] ] ] ) ( < > < [ ( ) ] > ( ) > )',
 '( { ( ) } ).',
 '( { < [ < > ] > } ).',
 '( { < { ( ( { } ) ( ) ) } { } < { } > < > > } { } ( { ( { { } } ) [ ( ) ] } ) ) [ ( [ ] ) ]',
 '( { [ { } ] } ).',
 '( { } )',
 '( { } ) < { < { } > } >',
 '( { } ) { ( [ { ( ) } ] ( [ ] ) ) }.',
 '**',
 '< ( ( ( [ { } ] ) ) ) >.',
 '< ( () )',
 '< ( ) ( { { [ ] } } ) >.'

In [358]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '."'))
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [62]:
print(new_ds[1]["trajectory"])

```json
{
    "Step 1 - Simplify the Task": {
        "Description": "Identify the current state of the sequence and simplify it if possible.",
        "Action": "Note the current sequence: ( ) ( (",
        "Result": "Identify the current sequence as ( ) ( ("
    },
    "Step 2 - Break Down the Task": {
        "Description": "Break down the sequence completion task into smaller, more manageable steps.",
        "Action": "Identify the open and closed parentheses in the current sequence.",
        "Result": "Identify open parentheses: 2, closed parentheses: 1"
    },
    "Step 3 - Analyze the Task": {
        "Description": "Determine if the task is analytical and requires tracking and balancing parentheses.",
        "Action": "Check if the sequence is balanced.",
        "Result": "Determine if the sequence is balanced or needs more parentheses to be balanced."
    },
    "Step 4 - Plan to Complete the Sequence": {
        "Description": "Make a step-by-step plan to complete the seq

In [362]:
blank_ds = new_ds.filter(lambda x: x["answer_pred"] == "")
none_blank_ds = new_ds.filter(lambda x: x["answer_pred"] != "")

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

In [364]:
def map_fn(ins):
    return {
        "answer_pred": ins["answer_pred"].replace(ins["target"], "")
    }

none_blank_ds = none_blank_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/217 [00:00<?, ? examples/s]

{'',
 "'< < > >'",
 "'< [ [ ] ] >'",
 "'< [ ] >'",
 '( ( ( ( [ [ < [ { { [ ] } } ] > ] ] ( ) ) ) ) ) ) ) )',
 '( ( ( ) ) ))',
 '( ( < < < ( ( ) ) ( [ ] ) > > { [ ] } ) ) > > >',
 '( ( < > ) )',
 '( ( < { } > ) ) < { } >',
 '( ( { } ) )',
 '( ) ( () )',
 '( ) ( < < { } > >',
 '( ) [ ( [ < { { ( { } ) } } > ] ) ]',
 '( ) { < } >',
 '( < < > > < > [ ] [ )',
 '( < > ( [ ( ) ] ) )',
 '( < [ ( ) ] > )',
 '( < { } [ ] > )',
 '( [ ( ) ] )',
 '( [ < < { } > > ] )',
 '( [ [ [ ( { ( ( < [ { < > } ] > { { [ ] } } ) ) } ( [ [ < > ] ] ) ) ] ] ] ) ( < > < [ ( ) ] > ( ) > )',
 '( { ( ) } )',
 '( { < [ < > ] > } )',
 '( { < { ( ( { } ) ( ) ) } { } < { } > < > > } { } ( { ( { { } } ) [ ( ) ] } ) ) [ ( [ ] ) ]',
 '( { [ { } ] } )',
 '( { } )',
 '( { } ) < { < { } > } >',
 '( { } ) { ( [ { ( ) } ] ( [ ] ) ) }',
 '**',
 '< ( ( ( [ { } ] ) ) ) >',
 '< ( () )',
 '< ( ) ( { { [ ] } } ) >',
 '< ( ) >',
 '< ( < > ) >',
 '< ( [ { ( < > ) } ] ) > { ( [ ] } ) >',
 '< ( { ( < < > > ) } ) >',
 '< ( { [ { } ] } [ ] [

In [365]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
for instance in new_ds:
    print(instance["target"])
    print(instance["trajectory"])
    print("="*50)

# formal_fallacies

In [63]:
chk_dir = here(os.path.join(par_dir, "bbh-formal_fallacies/bbh_eval"))

In [64]:
dataset = Dataset.load_from_disk(chk_dir)

In [65]:
set(dataset["answer_pred"])

{'invalid.', 'invalid."', 'valid.', 'valid."'}

In [66]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [67]:
set(new_ds["answer_pred"])

{'invalid', 'valid'}

In [68]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [69]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.724

# geometric_shapes

In [70]:
chk_dir = here(os.path.join(par_dir, "bbh-geometric_shapes/bbh_eval"))

In [71]:
dataset = Dataset.load_from_disk(chk_dir)

In [72]:
set(dataset["answer_pred"])

{'(I) sector."',
 'A."',
 'B"',
 'B.',
 'B. The shape is a heptagon, as it has 7 unique points connected in a sequence forming a closed shape. Other options were eliminated because they do not match the number of points and connections observed."',
 'B."',
 'C"',
 'C.',
 'C."',
 'D"',
 'D.',
 'D."',
 'E"',
 'E.',
 'E."',
 'F"',
 'F.',
 'G"',
 'G.',
 'G."',
 'H"',
 'H.',
 'I"',
 'I.',
 'I."',
 'J"',
 'J.',
 'J."',
 'K"',
 'K.',
 'K."'}

In [73]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [74]:
set(new_ds["answer_pred"])

{'A',
 'B',
 'B The shape is a heptagon as it has 7 unique points connected in a sequence forming a closed shape Other options were eliminated because they do not match the number of points and connections observed',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'I sector',
 'J',
 'K'}

In [75]:
def map_fn(ins):   
    ls = ins["answer_pred"].split()
    
    return {
        "answer_pred": ls[0]
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [76]:
set(new_ds["answer_pred"])

{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K'}

In [77]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [78]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.664

# hyperbaton

In [79]:
chk_dir = here(os.path.join(par_dir, "bbh-hyperbaton/bbh_eval"))

In [80]:
dataset = Dataset.load_from_disk(chk_dir)

In [81]:
set(dataset["answer_pred"])

{'(A).',
 '(A). Sentence A follows the correct adjective order rule."',
 'A"',
 'A.',
 'A. Sentence A follows the correct adjective order: new (age), white (color), lead (material), walking (purpose)."',
 'A. Sentence A follows the correct adjective order: size (enormous), shape (rectangular), color (blue), noun (dog)."',
 'A."',
 'B"',
 'B.',
 'B. Option B follows the correct adjective order: opinion (silly), age (old-fashioned), color (tan)."',
 'B. Sentence (B) follows the correct adjective order rules: opinion, size, origin, material, purpose."',
 'B. Sentence B follows the correct adjective order according to standard linguistic rules."',
 'B."',
 'B.**',
 'Option (A)."'}

In [82]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [83]:
set(new_ds["answer_pred"])

{'A',
 'A Sentence A follows the correct adjective order rule',
 'A Sentence A follows the correct adjective order: new age white color lead material walking purpose',
 'A Sentence A follows the correct adjective order: size enormous shape rectangular color blue noun dog',
 'B',
 'B Option B follows the correct adjective order: opinion silly age old-fashioned color tan',
 'B Sentence B follows the correct adjective order according to standard linguistic rules',
 'B Sentence B follows the correct adjective order rules: opinion size origin material purpose',
 'B**',
 'Option A'}

In [84]:
def map_fn(ins):
    ls = ins["answer_pred"].split()

    if len(ls[0]) == 1:
        return {
            "answer_pred": ls[0]
        }
    elif ls[0] == "B**":
        return {
            "answer_pred": ls[0][0]
        }
    elif ls[0] == "Option":
        return {
            "answer_pred": ls[1]
        }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B'}

In [85]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [86]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.944

# logical_deduction_five_objects

In [87]:
chk_dir = here(os.path.join(par_dir, "bbh-logical_deduction_five_objects/bbh_eval"))

In [88]:
dataset = Dataset.load_from_disk(chk_dir)

In [89]:
set(dataset["answer_pred"])

{'(A) The blue book is the rightmost."',
 '(A) The falcon is the leftmost.',
 '(A) The sedan is the second-newest."',
 '(A) The tractor is the oldest.',
 'A"',
 'A.',
 'A."',
 'B"',
 'B**',
 'B.',
 'B."',
 'C"',
 'C.',
 'C."',
 'D"',
 'D.',
 'D."',
 'E"',
 'E.',
 'E."'}

In [90]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [91]:
set(new_ds["answer_pred"])

{'A',
 'A The blue book is the rightmost',
 'A The falcon is the leftmost',
 'A The sedan is the second-newest',
 'A The tractor is the oldest',
 'B',
 'B**',
 'C',
 'D',
 'E'}

In [92]:
def map_fn(ins):
    ls = ins["answer_pred"].split()

    if len(ls[0]) == 1:
        return {
            "answer_pred": ls[0]
        }
    elif ls[0] == "B**":
        return {
            "answer_pred": ls[0][0]
        }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C', 'D', 'E'}

In [93]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [94]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.888

# logical_deduction_seven_objects

In [95]:
chk_dir = here(os.path.join(par_dir, "bbh-logical_deduction_seven_objects/bbh_eval"))

In [96]:
dataset = Dataset.load_from_disk(chk_dir)

In [97]:
set(dataset["answer_pred"])

{'(A) Ada finished third"}',
 '(A) Ana finished fourth.',
 '(A) The cardinal is the third from the left.',
 '(A) The oranges are the fourth-most expensive.',
 '(A) The white book is the fourth from the left.',
 '(B) The hummingbird is the third from the right."',
 '(D) The blue jay is the fourth from the left."',
 '(E) The crow is the second from the left.',
 '(G) The truck is the second-oldest."',
 'A"',
 'A.',
 'A."',
 'B"',
 'B.',
 'C"',
 'C.',
 'C."',
 'D"',
 'D.',
 'D."',
 'E"',
 'E.',
 'F"',
 'F.',
 'G"',
 'G.',
 'G."'}

In [98]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A Ada finished third}',
 'A Ana finished fourth',
 'A The cardinal is the third from the left',
 'A The oranges are the fourth-most expensive',
 'A The white book is the fourth from the left',
 'B',
 'B The hummingbird is the third from the right',
 'C',
 'D',
 'D The blue jay is the fourth from the left',
 'E',
 'E The crow is the second from the left',
 'F',
 'G',
 'G The truck is the second-oldest'}

In [99]:
def map_fn(ins):
    ls = ins["answer_pred"].split()

    if len(ls[0]) == 1:
        return {
            "answer_pred": ls[0]
        }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C', 'D', 'E', 'F', 'G'}

In [100]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [101]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.788

# logical_deduction_three_objects

In [102]:
chk_dir = here(os.path.join(par_dir, "bbh-logical_deduction_three_objects/bbh_eval"))

In [103]:
dataset = Dataset.load_from_disk(chk_dir)

In [104]:
set(dataset["answer_pred"])

{'(A) Amy finished second.',
 '(A) Mel finished last.',
 '(A) The apples are the cheapest.',
 '(A) The hatchback is the second-newest.',
 '(A) The hummingbird is the leftmost.',
 '(A) The pears are the second-most expensive.',
 'A"',
 'A.',
 'A."',
 'B"',
 'B.',
 'B."',
 'C"',
 'C.',
 'C."'}

In [105]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A Amy finished second',
 'A Mel finished last',
 'A The apples are the cheapest',
 'A The hatchback is the second-newest',
 'A The hummingbird is the leftmost',
 'A The pears are the second-most expensive',
 'B',
 'C'}

In [106]:
def map_fn(ins):
    ls = ins["answer_pred"].split()

    if len(ls[0]) == 1:
        return {
            "answer_pred": ls[0]
        }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C'}

In [107]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [108]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.984

# movie_recommendation

In [109]:
chk_dir = here(os.path.join(par_dir, "bbh-movie_recommendation/bbh_eval"))

In [110]:
dataset = Dataset.load_from_disk(chk_dir)

In [111]:
set(dataset["answer_pred"])

{'A"',
 'A.',
 'A."',
 'B"',
 'B.',
 'B. Up is the most similar to the given movies due to its adventure and fantasy genres, complex plot, epic scale, and intricate world-building."',
 'B."',
 'C and D.',
 'C"',
 'C.',
 'D"',
 'D.',
 'D. The Usual Suspects is the most similar to the given movies due to its complex narrative, unpredictable plot twists, strong character development, and dark and gritty atmosphere."',
 'D."',
 'E.',
 'None of the options are similar to the given movies.',
 'None of the options closely match the criteria of historical/dramatic content, emotional resonance, and epic narrative found in the given movies.',
 'None of the options match the criteria of the given movies in terms of genre, themes, narrative techniques, cinematic elements, director style, character development, or storyline complexity.',
 'that none of the options (A, B, C, D) closely match the criteria extracted from the given movies. Therefore, there is no suitable match among the provided option

In [115]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [116]:
set(new_ds["answer_pred"])

{'A',
 'B',
 'B Up is the most similar to the given movies due to its adventure and fantasy genres complex plot epic scale and intricate world-building',
 'C',
 'C and D',
 'D',
 'D The Usual Suspects is the most similar to the given movies due to its complex narrative unpredictable plot twists strong character development and dark and gritty atmosphere',
 'E',
 'None of the options are similar to the given movies',
 'None of the options closely match the criteria of historical/dramatic content emotional resonance and epic narrative found in the given movies',
 'None of the options match the criteria of the given movies in terms of genre themes narrative techniques cinematic elements director style character development or storyline complexity',
 'that none of the options A B C D closely match the criteria extracted from the given movies Therefore there is no suitable match among the provided options'}

In [117]:
def map_fn(ins):
    if ins["answer_pred"] and "B Up is the most similar to the given" in ins["answer_pred"]:
        return {
            "answer_pred": "B"
        }
    elif ins["answer_pred"] and "D The Usual Suspects is the most similar" in ins["answer_pred"]:
        return {
            "answer_pred": "D"
        }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'B',
 'C',
 'C and D',
 'D',
 'E',
 'None of the options are similar to the given movies',
 'None of the options closely match the criteria of historical/dramatic content emotional resonance and epic narrative found in the given movies',
 'None of the options match the criteria of the given movies in terms of genre themes narrative techniques cinematic elements director style character development or storyline complexity',
 'that none of the options A B C D closely match the criteria extracted from the given movies Therefore there is no suitable match among the provided options'}

In [118]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [119]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.716

# penguins_in_a_table

In [120]:
chk_dir = here(os.path.join(par_dir, "bbh-penguins_in_a_table/bbh_eval"))

In [121]:
dataset = Dataset.load_from_disk(chk_dir)

In [122]:
set(dataset["answer_pred"])

{'(A) 1."', 'A"', 'A.', 'A."', 'B"', 'B.', 'C"', 'C.', 'D"', 'D.', 'E"', 'E.'}

In [123]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

In [124]:
set(new_ds["answer_pred"])

{'A', 'A 1', 'B', 'C', 'D', 'E'}

In [125]:
def map_fn(ins):
    return {
        "answer_pred": ins["answer_pred"].split()[0]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

{'A', 'B', 'C', 'D', 'E'}

In [126]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/146 [00:00<?, ? examples/s]

In [127]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/146 [00:00<?, ? examples/s]

0.9794520547945206

# reasoning_about_colored_objects

In [128]:
chk_dir = here(os.path.join(par_dir, "bbh-reasoning_about_colored_objects/bbh_eval"))

In [129]:
dataset = Dataset.load_from_disk(chk_dir)

In [130]:
set(dataset["answer_pred"])

{'(B)"',
 '(D) green"',
 '(D) green."',
 '(D) three."',
 '(E) blue.',
 '(E) blue."',
 '(E)."',
 '(M)"',
 '6"',
 'A"',
 'A.',
 'A."',
 'B"',
 'B.',
 'B."',
 'C"',
 'C.',
 'C."',
 'D"',
 'D.',
 'D."',
 'E"',
 'E.',
 'E."',
 'F"',
 'F.',
 'G"',
 'G.',
 'G."',
 'H"',
 'H.',
 'H."',
 'I"',
 'I.',
 'I."',
 'J"',
 'J.',
 'J."',
 'K.',
 'K."',
 'L"',
 'L.',
 'L."',
 'M"',
 'M.',
 'M."',
 'N.',
 'O"',
 'O.',
 'O."',
 'P"',
 'P.',
 'P."',
 'Q.',
 'Q."',
 'R"',
 'R.',
 'R."',
 'the option selected in Step 7.",'}

In [131]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'6',
 'A',
 'B',
 'C',
 'D',
 'D green',
 'D three',
 'E',
 'E blue',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'the option selected in Step 7'}

In [132]:
def map_fn(instance):
    sp = instance["answer_pred"].split()
    if len(sp) == 2:
        return {"answer_pred": sp[0]}
    else:
        return {"answer_pred": instance["answer_pred"]}

new_ds = new_ds.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'6',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'the option selected in Step 7'}

In [134]:
print(new_ds.filter(lambda x: x["answer_pred"] == "6")[0]["trajectory"])
print(new_ds.filter(lambda x: x["answer_pred"] == "the option selected in Step 7")[0]["trajectory"])

```json
{
  "Step 1 - Identify relevant items": {
    "Description": "List all the puzzles mentioned in the task, regardless of their color.",
    "Action": "Extract puzzles from the item list.",
    "Result": "Three yellow puzzles, three teal puzzles"
  },
  "Step 2 - Identify items to be excluded": {
    "Description": "Identify the color of the items that need to be removed, as mentioned in the task.",
    "Action": "Note down the color magenta.",
    "Result": "Magenta"
  },
  "Step 3 - Filter out irrelevant items": {
    "Description": "Remove the items that are of the color identified in Step 2 from the list of relevant items obtained in Step 1.",
    "Action": "Remove magenta puzzles (if any) from the puzzle list.",
    "Result": "Three yellow puzzles, three teal puzzles (no magenta puzzles to remove)"
  },
  "Step 4 - Categorize remaining items": {
    "Description": "Categorize the remaining puzzles by their color to make counting easier.",
    "Action": "Group puzzles by thei

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

```json
{
  "Step 1: List all items": {
    "description": "Write down all the items seen on the floor.",
    "action": "List items",
    "items": [
      "three silver pencils",
      "one teal pencil",
      "one turquoise mug",
      "two yellow pencils",
      "three teal bracelets",
      "two silver mugs",
      "one turquoise jug",
      "three turquoise pencils",
      "two turquoise bracelets"
    ]
  },
  "Step 2: Identify silver items": {
    "description": "Identify which items are silver.",
    "action": "Mark silver items",
    "silver_items": [
      "three silver pencils",
      "two silver mugs"
    ]
  },
  "Step 3: Remove silver items": {
    "description": "Remove all the silver items from the list.",
    "action": "Remove silver items",
    "remaining_items": [
      "one teal pencil",
      "one turquoise mug",
      "two yellow pencils",
      "three teal bracelets",
      "one turquoise jug",
      "three turquoise pencils",
      "two turquoise bracelets"
    ]

In [137]:
def map_fn(ins):
    if "the option selected in Step 7" in ins["answer_pred"]:
        return {
            "answer_pred": "B"
        }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'6',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R'}

In [138]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [139]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.968

# ruin_names

In [140]:
chk_dir = here(os.path.join(par_dir, "bbh-ruin_names/bbh_eval"))

In [141]:
dataset = Dataset.load_from_disk(chk_dir)

In [142]:
set(dataset["answer_pred"])

{'(A) the girl with all the grifts.',
 '(A) toe abyss, as it is the most humorous edit of \'the abyss\'."',
 '(B) guns n\' ropes and (D) guns n\' hoses."',
 '(C) the why!',
 'A and C.',
 'A"',
 'A.',
 'A."',
 'B"',
 'B.',
 'B."',
 'C"',
 'C.',
 'C."',
 'C.**',
 'D"',
 'D.',
 'D. The option "the shawshark redemption" is the humorous edit because it plays on the word "shawshank" by changing it to "shawshark," which is a humorous and clever wordplay.',
 'D."',
 'I.',
 None,
 "None of the provided options are particularly humorous. A new humorous version could be 'run dmc and chill'."}

In [149]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A and C',
 'A the girl with all the grifts',
 "A toe abyss as it is the most humorous edit of 'the abyss'",
 'B',
 "B guns n' ropes and D guns n' hoses",
 'C',
 'C the why!',
 'C**',
 'D',
 'D The option the shawshark redemption is the humorous edit because it plays on the word shawshank by changing it to shawshark which is a humorous and clever wordplay',
 'I',
 None,
 "None of the provided options are particularly humorous A new humorous version could be 'run dmc and chill'"}

In [150]:
def map_fn(ins):
    corr_A = ["A the girl with all the grifts", "toe abyss as it is the most humorous"]
    corr_C = ["the why!", "C**"]
    corr_D = ["D The option the shawshark"]
    

    if ins["answer_pred"]:
        for a in corr_A:
            if a in ins["answer_pred"]:
                return {
                    "answer_pred": "A"
                }
        for c in corr_C:
            if c in ins["answer_pred"]:
                return {
                    "answer_pred": "C"
                }
        for d in corr_D:
            if d in ins["answer_pred"]:
                return {
                    "answer_pred": "D"
                }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

{'A',
 'A and C',
 'B',
 "B guns n' ropes and D guns n' hoses",
 'C',
 'D',
 'I',
 None,
 "None of the provided options are particularly humorous A new humorous version could be 'run dmc and chill'"}

In [151]:
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
print(none_ds[0]["trajectory"])

```json
{
    "Step 1 - Identify the Core Name": {
        "Task": "Identify the core name or phrase that needs to be edited to create a humorous effect.",
        "Output": "Core name identified: 'little shop of horrors'."
    },
    "Step 2 - Break Down the Name": {
        "Task": "Break down the name 'little shop of horrors' into smaller parts to understand potential areas for humorous edits.",
        "Output": "Name broken down into parts: 'little', 'shop', 'of', 'horrors'."
    },
    "Step 3 - Brainstorm Humorous Edits": {
        "Task": "Make a list of humorous edits for the name 'little shop of horrors' by changing letters or adding elements.",
        "Output": "List of humorous edits generated: 'little shoy of horrors', 'little hop of horrors', 'little wshop of horrors', 'ulittle shop of horrors'."
    },
    "Step 4 - Creative Name Generation": {
        "Task": "Use creative thinking to generate innovative and humorous edits of 'little shop of horrors'. Consider unconven

In [152]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A and C',
 'B',
 "B guns n' ropes and D guns n' hoses",
 'C',
 'D',
 'I',
 "None of the given options are particularly humorous. A new edit 'little shop of whorror' is more humorous.",
 "None of the provided options are particularly humorous A new humorous version could be 'run dmc and chill'"}

In [153]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [154]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.76

# salient_translation_error_detection

In [155]:
chk_dir = here(os.path.join(par_dir, "bbh-salient_translation_error_detection/bbh_eval"))

In [156]:
dataset = Dataset.load_from_disk(chk_dir)

In [157]:
set(dataset["answer_pred"])

{'(A) Modifiers or Adjectives.',
 'A.',
 'B"',
 'B.',
 'B."',
 'C"',
 'C.',
 'D"',
 'D.',
 'D."',
 'E"',
 'E.',
 'F"',
 'F.',
 'F."'}

In [166]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'A Modifiers or Adjectives', 'B', 'C', 'D', 'E', 'F'}

In [167]:
def map_fn(ins):
    if "A Modifiers or Adjectives" == ins["answer_pred"]:
        return {
            "answer_pred": "A"
        }

    return {"answer_pred": ins["answer_pred"]}

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C', 'D', 'E', 'F'}

In [168]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [169]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.656

# snarks

In [170]:
chk_dir = here(os.path.join(par_dir, "bbh-snarks/bbh_eval"))

In [171]:
dataset = Dataset.load_from_disk(chk_dir)

In [172]:
set(dataset["answer_pred"])

{'A"',
 'A.',
 'A."',
 'A.",',
 'B"',
 'B.',
 'B. Statement B is sarcastic due to its use of exaggeration, irony, and humorous tone, as well as its reference to \'trolls\' as an \'endangered species,\' which is incongruous and intended to ridicule."',
 'B."',
 'Both statements are sarcastic.',
 'Neither statement is sarcastic.',
 'both statements (A) and (B) are sarcastic.**',
 'that the statement "(A) The NB" cannot be determined as sarcastic due to insufficient context.'}

In [173]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

{'A',
 'B',
 "B Statement B is sarcastic due to its use of exaggeration irony and humorous tone as well as its reference to 'trolls' as an 'endangered species' which is incongruous and intended to ridicule",
 'Both statements are sarcastic',
 'Neither statement is sarcastic',
 'both statements A and B are sarcastic**',
 'that the statement A The NB cannot be determined as sarcastic due to insufficient context'}

In [174]:
def map_fn(ins):
    if "B Statement B is sarcastic" in ins["answer_pred"]:
        return {
            "answer_pred": "B"
        }

    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

{'A',
 'B',
 'Both statements are sarcastic',
 'Neither statement is sarcastic',
 'both statements A and B are sarcastic**',
 'that the statement A The NB cannot be determined as sarcastic due to insufficient context'}

In [175]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/178 [00:00<?, ? examples/s]

In [176]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/178 [00:00<?, ? examples/s]

0.8595505617977528

# sports_understanding

In [177]:
chk_dir = here(os.path.join(par_dir, "bbh-sports_understanding/bbh_eval"))

In [178]:
dataset = Dataset.load_from_disk(chk_dir)

In [179]:
set(dataset["answer_pred"])

{'"The sentence is not plausible."',
 '"The sentence is plausible if interpreted as Nerlens Noel being outside his house but within his home\'s premises."',
 '"The sentence is plausible."',
 'False"',
 'False.',
 'False."',
 'No.',
 None,
 'Plausible.',
 'True.',
 'True."',
 'Yes.',
 '[answer]."',
 'not plausible."',
 'that the sentence "Marouane Fellaini scored in extra time" is plausible based on the structured analysis and available data.',
 'that the sentence is plausible with a correction: \'Michael Conforto committed a three second violation.\'"'}

In [180]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [181]:
set(new_ds["answer_pred"])

{'False',
 'No',
 None,
 'Plausible',
 'The sentence is not plausible',
 'The sentence is plausible',
 "The sentence is plausible if interpreted as Nerlens Noel being outside his house but within his home's premises",
 'True',
 'Yes',
 '[answer]',
 'not plausible',
 'that the sentence Marouane Fellaini scored in extra time is plausible based on the structured analysis and available data',
 "that the sentence is plausible with a correction: 'Michael Conforto committed a three second violation'"}

In [182]:
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
none_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 163
})

In [183]:
index = 0
print(none_ds[0]["trajectory"])

```json
{
    "Step 1: Identify Key Assumptions": {
        "Description": "What are the key assumptions underlying the plausibility of the sentence?",
        "Action": "List the assumptions that need to be true for the sentence to be plausible.",
        "Value": [
            "Travis Konecny is a known figure in a context where 'doing a maradona' is relevant.",
            "The term 'maradona' refers to a specific action or move that can be performed on a defender.",
            "The context is likely related to sports, specifically soccer or hockey."
        ]
    },
    "Step 2: Explore Alternative Interpretations": {
        "Description": "What are the alternative interpretations or contexts for the sentence?",
        "Action": "Identify different ways the sentence could be interpreted based on context.",
        "Value": [
            "The sentence could be interpreted in the context of soccer, where 'maradona' might refer to a specific dribbling move.",
            "The sente

In [184]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [185]:
new_ds.filter(lambda x: x["answer_pred"] == None)

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 0
})

In [186]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [187]:
set(new_ds["answer_pred"])

{'False',
 'No',
 'Plausible',
 'The plausibility of the sentence James Karinchak crossed the blue line remains uncertain without specific context or evidence',
 'The plausibility of the sentence is uncertain without concrete data',
 'The plausibility of the statement Malcolm Brogdon drove into the restricted area in the Eastern Conference Finals can be determined by following the outlined steps to verify the relevant data and context',
 "The sentence 'Patrice Bergeron took a backhand shot' is plausible",
 'The sentence Allen Robinson gained five yards is plausible',
 'The sentence Andres Iniesta performed a give and go is plausible',
 'The sentence Angel Di Maria scored in extra time is plausible if verified by match records',
 'The sentence Anthony Davis beat the buzzer is plausible',
 "The sentence Bastian Schweinsteiger scored in added time is plausible based on the historical context of his career the definition of 'added time' in soccer and the likelihood of him scoring in such s

In [188]:
plausible_yes = [
    'Plausible',
    "The sentence 'Patrice Bergeron took a backhand shot' is plausible",
    'The sentence Allen Robinson gained five yards is plausible',
    'The sentence Andres Iniesta performed a give and go is plausible',
    'The sentence Angel Di Maria scored in extra time is plausible if verified by match records',
    'The sentence Anthony Davis beat the buzzer is plausible',
    "The sentence Bastian Schweinsteiger scored in added time is plausible based on the historical context of his career the definition of 'added time' in soccer and the likelihood of him scoring in such situations as evidenced by match records player statistics and official reports",
    'The sentence Blake Snell hit a single is plausible in the context of a National League game where pitchers are required to bat',
    'The sentence David Silva took a throw in is plausible',
    'The sentence Dejounte Murray took a side-step three is plausible',
    'The sentence Deshaun Watson was flagged on the play is plausible if supported by official game reports and expert commentary',
    'The sentence Drew Brees was flagged on the play is plausible',
    'The sentence Elias Lindholm beat the buzzer is plausible',
    'The sentence Gleyber Torres got a base hit is plausible',
    'The sentence Jakub Vrana skated backwards is plausible',
    'The sentence James Karinchak worked a full count is plausible',
    'The sentence Javier Mascherano took a left footed shot is plausible',
    'The sentence John Carlson scored in the third period is plausible if the key assumptions are validated through the analysis of relevant data sources and expertise',
    'The sentence Jonas Valanciunas beat the buzzer is plausible',
    'The sentence Josh Allen caught the screen pass is plausible',
    'The sentence Josh Allen hit the screen pass is plausible',
    'The sentence Julian Edelman fumbled the ball is plausible',
    'The sentence Kawhi Leonard took a turnaround jumper is plausible',
    'The sentence Kyle Tucker stepped on first base is plausible',
    'The sentence LaMelo Ball launched the half court shot in the Western Conference Finals is plausible if supported by the gathered data and analysis',
    'The sentence Luis Robert was out at second is plausible',
    'The sentence Luke Voit was out at first is plausible',
    'The sentence Malcolm Brogdon banked the shot in is plausible',
    'The sentence Marcell Ozuna hit into a double play is plausible',
    'The sentence Mark Stone spent time in the penalty box in the Stanley Cup is plausible if the gathered data and analysis confirm that Mark Stone has indeed been in the penalty box during a Stanley Cup game',
    'The sentence Marvin Jones lost control of the puck is plausible',
    'The sentence Max Scherzer was safe at first is plausible',
    'The sentence Mikal Bridges scored a windmill dunk is plausible',
    'The sentence Mike Trout hit a walkoff homer is plausible',
    'The sentence Mitchell Marner nutmegged the defender is plausible',
    'The sentence Mitchell Robinson airballed the shot is plausible',
    'The sentence Pedro struck out the side is plausible',
    'The sentence Pepe converted the first down is plausible',
    'The sentence Philip Rivers launched a hail mary is plausible',
    'The sentence Pierre-Luc Dubois skated backwards is plausible',
    'The sentence Ramires scored a header goal is plausible',
    'The sentence Ryan Fitzpatrick scored a touchdown is plausible but not a common occurrence',
    "The sentence Sergio Aguero maradona'd the defender is plausible",
    'The sentence Stephan El Shaarawy shot with the left foot is plausible',
    'The sentence Stephen Curry scored a reverse layup is plausible',
    'The sentence Teuvo Teravainen shot the puck is plausible',
    'The sentence Tom Brady converted the first down is plausible',
    'The sentence Tyreek Hill caught the screen pass is plausible',
    'The sentence Yaya Toure scored a freekick is plausible based on his career statistics and technical abilities',
    'The statement Collin Sexton hit the buzzer beater is plausible',
    'The statement David Pastrnak skated backwards is plausible',
    'The statement Drew Brees went for it on fourth down is plausible based on the analysis of his historical decisions the game context and strategic implications',
    'The statement Nicklas Backstrom earned a trip to the penalty box is plausible',
    'The statement Stefon Diggs hit the slant pass is plausible',
    'The statement is plausible if there is documented evidence of Jonathan Marchessault scoring a power play goal in a Stanley Cup game Further verification through official records and statistics is required to confirm this',
    'True',
    'Yes',
    'Yes the sentence is plausible',
    'that the sentence Marouane Fellaini scored in extra time is plausible based on the structured analysis and available data',
    "that the sentence is plausible with a correction: 'Michael Conforto committed a three second violation'"
]

implausible_no = [
    'False',
    'No',
    'The sentence Bryce Harper fumbled the ball is not plausible',
    'The sentence Carlos Tevez skated backwards is implausible',
    'The sentence Clint Capela got into the endzone is not plausible',
    'The sentence Corbin Burnes earned an indirect kick is not plausible',
    'The sentence Didier Drogba got into the endzone is not plausible without additional context',
    'The sentence Edinson Cavani caught the screen pass is not plausible without additional context as it mixes terminology from different sports',
    'The sentence Elias Lindholm took the snap is not plausible',
    'The sentence Fernando Tatis Jr earned a red card is not plausible',
    'The sentence Fred VanVleet passed the puck is not plausible',
    'The sentence Gerard Pique scored a corner kick is not plausible',
    'The sentence Gerrit Cole set the hard screen is not plausible',
    'The sentence Jakub Vrana hit a walkoff homer is not plausible',
    'The sentence Juan Soto did a double stepover is not plausible',
    'The sentence Mario Gomez scored a reverse layup is implausible',
    'The sentence Michael Thomas took the snap is not plausible',
    'The sentence Mike Williams fumbled the ball in the Superbowl is not plausible',
    'The sentence Mookie Betts took a side-step three is not plausible',
    'The sentence Nazem Kadri was out at home is not plausible',
    'The sentence Sterling Shepard hit a walkoff homer is not plausible',
    'The sentence Steven Stamkos hit the slant pass is not plausible',
    'The sentence Tristan Jarry dunked the ball is not plausible',
    'The sentence Tuukka Rask hit a double is not plausible',
    'The sentence is implausible',
    'The sentence is not plausible',
    "The sentence is not plausible without additional context as Jerry Jeudy is not typically associated with the term 'powerplay' in his sport",
    'The statement Nick Foles lost control of the puck is not plausible',
    'The statement Ryan Nugent-Hopkins killed the powerplay is not sufficiently supported by the available data and alternative explanations It is more plausible that the powerplay failure was due to a combination of factors rather than solely Ryan Nugent-Hopkins performance',
    'The statement is not plausible',
    'not plausible'
]

indeterminate = [
    'The plausibility of the sentence James Karinchak crossed the blue line remains uncertain without specific context or evidence',
    'The plausibility of the sentence is uncertain without concrete data',
    'The plausibility of the statement Malcolm Brogdon drove into the restricted area in the Eastern Conference Finals can be determined by following the outlined steps to verify the relevant data and context',
    'The sentence John Carlson scored in the third period is plausible if the key assumptions are validated through the analysis of relevant data sources and expertise',
    'The sentence LaMelo Ball launched the half court shot in the Western Conference Finals is plausible if supported by the gathered data and analysis',
    'The sentence Mark Stone spent time in the penalty box in the Stanley Cup is plausible if the gathered data and analysis confirm that Mark Stone has indeed been in the penalty box during a Stanley Cup game',
    'The sentence is plausible but highly unlikely',
    "The sentence is plausible if 'in the Stanley Cup' is interpreted as referring to the championship series rather than the physical trophy",
    "The sentence is plausible if 'the screen' refers to a sports term or a technological device but the exact meaning is unclear without additional context",
    "The sentence is plausible if Jesus Luzardo is involved in a sport like hockey where 'slashing' is a recognized penalty However if Jesus Luzardo is known to be a baseball player the sentence is not plausible as 'slashing' is not a term used in baseball",
    "The sentence is plausible if interpreted as Nerlens Noel being outside his house but within his home's premises",
    'The sentence is plausible if interpreted in the context of baseball terminology',
    'The sentence is plausible if interpreted metaphorically within the context of sports',
    'The sentence is plausible under certain conditions but additional context is needed for a definitive determination',
    'The statement is plausible if there is documented evidence of Jonathan Marchessault scoring a power play goal in a Stanley Cup game Further verification through official records and statistics is required to confirm this',
    '[answer]'
]

def map_fn(ins):
    for yes in plausible_yes:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "yes"
            }

    for no in implausible_no:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "no"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [189]:
set(new_ds["answer_pred"])

{'The plausibility of the sentence James Karinchak crossed the blue line remains uncertain without specific context or evidence',
 'The plausibility of the sentence is uncertain without concrete data',
 'The plausibility of the statement Malcolm Brogdon drove into the restricted area in the Eastern Conference Finals can be determined by following the outlined steps to verify the relevant data and context',
 "The sentence Paulinho earned an indirect kick in the FA Cup is plausible if all the assumptions and considerations align with the available data and football rules However without specific data on Paulinho's participation in the FA Cup and the exact context of the indirect kick the plausibility remains uncertain",
 'The sentence is plausible',
 'The sentence is plausible but highly unlikely',
 "The sentence is plausible if 'in the Stanley Cup' is interpreted as referring to the championship series rather than the physical trophy",
 "The sentence is plausible if 'the screen' refers 

In [190]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [191]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.688

# temporal_sequences

In [192]:
chk_dir = here(os.path.join(par_dir, "bbh-temporal_sequences/bbh_eval"))

In [193]:
dataset = Dataset.load_from_disk(chk_dir)

In [194]:
set(dataset["answer_pred"])

{'(A) 11am to 1pm.',
 '(A) 12pm to 2pm.',
 '(A) 3pm to 6pm.',
 '(A) 5am to 6am."',
 '(A) 6am to 7am"',
 '(A) 7am to 8am.',
 '(A) 9am to 4pm.',
 '(C) 9am to 11am.',
 '(D) 7am to 9am.',
 '(D) 8am to 10am.',
 '(D)."',
 'A or B.',
 'A"',
 'A.',
 'A."',
 'B"',
 'B"}',
 'B.',
 'B."',
 'C"',
 'C.',
 'C."',
 'D"',
 'D.',
 'D."'}

In [199]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()}"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A 11am to 1pm',
 'A 12pm to 2pm',
 'A 3pm to 6pm',
 'A 5am to 6am',
 'A 6am to 7am',
 'A 7am to 8am',
 'A 9am to 4pm',
 'A or B',
 'B',
 'C',
 'C 9am to 11am',
 'D',
 'D 7am to 9am',
 'D 8am to 10am'}

In [200]:
def map_fn(ins):
    sp = ins["answer_pred"].split()
    if len(sp) == 4:
        return {
            "answer_pred": sp[0]
        }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'A or B', 'B', 'C', 'D'}

In [201]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [202]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.992

# tracking_shuffled_objects_five_objects

In [204]:
chk_dir = here(os.path.join(par_dir, "bbh-tracking_shuffled_objects_five_objects/bbh_eval"))
chk_dir

PosixPath('/home/ubuntu/jupyter/struct-vs-unstruct/struct_vs_unstruct/data/mistral_large_2407/original/non_self_synthesis/bbh/bbh-tracking_shuffled_objects_five_objects/bbh_eval')

In [205]:
dataset = Dataset.load_from_disk(chk_dir)

In [206]:
set(dataset["answer_pred"])

{'(A) The Great Gatsby.',
 '(A) Ulysses.',
 '(A) brown present.',
 '(C) blue present.',
 '(C) pink ball.',
 '(D) Patrick.',
 'A.',
 'B"',
 'B.',
 'C.',
 'D.',
 'E.'}

In [207]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A',
 'A The Great Gatsby',
 'A Ulysses',
 'A brown present',
 'B',
 'C',
 'C blue present',
 'C pink ball',
 'D',
 'D Patrick',
 'E'}

In [208]:
def map_fn(ins):    
    sp = ins["answer_pred"].split()
    return {
        "answer_pred": sp[0]
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C', 'D', 'E'}

In [209]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [210]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.976

# tracking_shuffled_objects_seven_objects

In [211]:
chk_dir = here(os.path.join(par_dir, "bbh-tracking_shuffled_objects_seven_objects/bbh_eval"))

In [212]:
dataset = Dataset.load_from_disk(chk_dir)

In [213]:
set(dataset["answer_pred"])

{'(A) Patrick.', 'A.', 'B.', 'C.', 'D.', 'E.', 'F.', 'G.'}

In [214]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [215]:
set(new_ds["answer_pred"])

{'A', 'A Patrick', 'B', 'C', 'D', 'E', 'F', 'G'}

In [216]:
def map_fn(ins):    
    sp = ins["answer_pred"].split()
    return {
        "answer_pred": sp[0]
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C', 'D', 'E', 'F', 'G'}

In [217]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [218]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.968

# tracking_shuffled_objects_three_objects

In [219]:
chk_dir = here(os.path.join(par_dir, "bbh-tracking_shuffled_objects_three_objects/bbh_eval"))

In [220]:
dataset = Dataset.load_from_disk(chk_dir)

In [221]:
set(dataset["answer_pred"])

{'(A) The Fellowship of the Ring.',
 '(A) The Great Gatsby.',
 'A"',
 'A.',
 'B"',
 'B.',
 'C.'}

In [222]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'A The Fellowship of the Ring', 'A The Great Gatsby', 'B', 'C'}

In [223]:
def map_fn(ins):    
    sp = ins["answer_pred"].split()
    return {
        "answer_pred": sp[0]
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'A', 'B', 'C'}

In [224]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [225]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
(corr.num_rows) / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.964

# web_of_lies

In [226]:
chk_dir = here(os.path.join(par_dir, "bbh-web_of_lies/bbh_eval"))

In [227]:
dataset = Dataset.load_from_disk(chk_dir)

In [228]:
set(dataset["answer_pred"])

{'Alejandro tells the truth.',
 'Alexis tells the truth.',
 'Amberly tells the truth.',
 'Andree tells the truth.',
 'Bernita is telling the truth.',
 'Christie does not tell the truth.',
 'Christie is telling the truth.',
 'Christie tells the truth.',
 'Conception tells the truth.',
 'Crista tells the truth.',
 'Dallas tells the truth.',
 'Delbert tells the truth.',
 'Delfina tells the truth.',
 'Elanor tells the truth.',
 'False.',
 'False."',
 'Inga is a liar.',
 'Inga tells the truth.',
 'Jamey tells the truth.',
 'Jaymie tells the truth.',
 'Jim tells the truth.',
 'Ka tells the truth.',
 'Kandi is a liar.',
 'Leda tells the truth.',
 'Maybelle tells the truth.',
 'Michael tells the truth.',
 'Millicent tells the truth.',
 'No, Ryan does not tell the truth.',
 'No.',
 None,
 'Osvaldo tells the truth.',
 'Phoebe tells the truth.',
 'Sal does not tell the truth.',
 'Shaunda tells the truth.',
 'Shenna tells the truth.',
 'Sherrie tells the truth.',
 'Sima does not tell the truth.',


In [240]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [241]:
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
none_ds

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 70
})

In [242]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

{'Alejandro is telling the truth.',
 'Alejandro tells the truth',
 'Alejandro tells the truth if Inga is lying.',
 'Alejandro tells the truth.',
 'Alexis tells the truth',
 'Alexis tells the truth.',
 'Amberly does not tell the truth.',
 'Amberly tells the truth',
 'Amberly tells the truth.',
 'Andree tells the truth',
 'Andree tells the truth.',
 'Audrie tells the truth.',
 "Audrie's truthfulness depends on Willian's truthfulness. If Willian is truthful, then Audrie is truthful. If Willian is lying, then Audrie is lying.",
 'Bernita does not tell the truth.',
 'Bernita is telling the truth',
 'Christie does not tell the truth',
 'Christie does not tell the truth.',
 'Christie is telling the truth',
 'Christie tells the truth',
 'Christie tells the truth.',
 'Conception tells the truth',
 'Conception tells the truth.',
 'Crista tells the truth',
 'Dallas tells the truth',
 'Dallas tells the truth.',
 'Delbert does not tell the truth.',
 'Delbert tells the truth',
 'Delfina tells the tr

In [243]:
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
none_ds

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 0
})

In [244]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'Alejandro is telling the truth',
 'Alejandro tells the truth',
 'Alejandro tells the truth if Inga is lying',
 'Alexis tells the truth',
 'Amberly does not tell the truth',
 'Amberly tells the truth',
 'Andree tells the truth',
 'Audrie tells the truth',
 "Audrie's truthfulness depends on Willian's truthfulness If Willian is truthful then Audrie is truthful If Willian is lying then Audrie is lying",
 'Bernita does not tell the truth',
 'Bernita is telling the truth',
 'Christie does not tell the truth',
 'Christie is telling the truth',
 'Christie tells the truth',
 'Conception tells the truth',
 'Crista tells the truth',
 'Dallas tells the truth',
 'Delbert does not tell the truth',
 'Delbert tells the truth',
 'Delfina tells the truth',
 'Elanor tells the truth',
 'False',
 'Fidel tells the truth',
 'Fletcher does not tell the truth',
 'Fletcher tells the truth',
 'Gwenn does not tell the truth',
 'Gwenn tells the truth',
 'Helene tells the truth',
 'Helene tells the truth if and o

In [245]:
yes_ls = [
    'Alejandro is telling the truth',
    'Alejandro tells the truth',
    'Alexis tells the truth',
    'Amberly tells the truth',
    'Andree tells the truth',
    'Audrie tells the truth',
    'Bernita is telling the truth',
    'Christie is telling the truth',
    'Christie tells the truth',
    'Conception tells the truth',
    'Crista tells the truth',
    'Dallas tells the truth',
    'Delbert tells the truth',
    'Delfina tells the truth',
    'Elanor tells the truth',
    'Fidel tells the truth',
    'Fletcher tells the truth',
    'Gwenn tells the truth',
    'Helene tells the truth',
    'Inga tells the truth',
    'Jamey tells the truth',
    'Jaymie tells the truth',
    'Jerry tells the truth',
    'Jim tells the truth',
    'Ka is telling the truth',
    'Ka tells the truth',
    'Leda tells the truth',
    'Lorine tells the truth',
    'Maybelle tells the truth',
    'Michael tells the truth',
    'Michaela tells the truth',
    'Millicent tells the truth',
    'Millie tells the truth',
    'Osvaldo tells the truth',
    'Phoebe tells the truth',
    'Rashida tells the truth',
    'Sal tells the truth',
    'Shalonda tells the truth',
    'Shaunda tells the truth',
    'Shenna tells the truth',
    'Sherrie tells the truth',
    'Sima tells the truth',
    'Teressa tells the truth',
    'True',
    'Vina tells the truth',
    'Willian tells the truth',
    'Yes',
    'Yes Phoebe is telling the truth'
]

no_ls = [
    'Amberly does not tell the truth',
    'Bernita does not tell the truth',
    'Christie does not tell the truth',
    'Delbert does not tell the truth',
    'False',
    'Fletcher does not tell the truth',
    'Gwenn does not tell the truth',
    'Inga does not tell the truth',
    'Inga is a liar',
    'Jamey does not tell the truth',
    'Ka does not tell the truth',
    'Kandi does not tell the truth',
    'Kandi is a liar',
    'Leda does not tell the truth',
    'Michaela does not tell the truth',
    'No',
    'No Amberly does not tell the truth',
    'No Ryan does not tell the truth',
    'No Teressa does not tell the truth',
    'Raymond does not tell the truth',
    'Sal does not tell the truth',
    'Shalonda does not tell the truth',
    'Shaunda does not tell the truth',
    'Sima does not tell the truth',
    'Tamika does not tell the truth'
]

indeterminate_ls = [
    'Alejandro tells the truth if Inga is lying',
    "Audrie's truthfulness depends on Willian's truthfulness If Willian is truthful then Audrie is truthful If Willian is lying then Audrie is lying",
    'Helene tells the truth if and only if Alexis tells the truth',
    'Ka does not necessarily tell the truth',
    'Vina tells the truth if the entire chain of statements is consistent and true'
]


def map_fn(ins):
    for yes in yes_ls:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "Yes"
            }

    for no in no_ls:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "No"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'Alejandro tells the truth if Inga is lying',
 "Audrie's truthfulness depends on Willian's truthfulness If Willian is truthful then Audrie is truthful If Willian is lying then Audrie is lying",
 'Helene tells the truth if and only if Alexis tells the truth',
 'Ka does not necessarily tell the truth',
 'No',
 'Vina tells the truth if the entire chain of statements is consistent and true',
 'Yes'}

In [246]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [247]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.864

# word_sorting

In [248]:
chk_dir = here(os.path.join(par_dir, "bbh-word_sorting/bbh_eval"))

In [249]:
dataset = Dataset.load_from_disk(chk_dir)

In [250]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.[]"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [251]:
none_ds = new_ds.filter(lambda x: x["answer_pred"]==None)
none_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 159
})

In [252]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [253]:
none_ds = new_ds.filter(lambda x: x["answer_pred"]==None)
none_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 0
})

In [254]:
new_ds["target"][0]

'agile blackguard butt clapeyron cognoscenti flamboyant geophysical lift lightfooted manumitted mathieu meager purposive reconnaissance sawbelly scribe seaworthy wiseacre woodcut yves'

In [261]:
print(new_ds["trajectory"][3])

```json
{
    "Step 1: Understand the Task": {
        "Description": "Identify the task as sorting a list of words alphabetically.",
        "Action": "Read and comprehend the task requirements."
    },
    "Step 2: Simplify the Sorting Task": {
        "Description": "Determine if there are any simplifications that can be made to the sorting process.",
        "Action": "Check for any patterns or groupings that can simplify the task."
    },
    "Step 3: Break Down the List into Smaller Groups": {
        "Description": "Divide the list into smaller, manageable groups for easier sorting.",
        "Action": "Split the list into smaller sublists."
    },
    "Step 4: Identify Specific Techniques or Tools": {
        "Description": "Determine if any specific techniques or tools are required for alphabetical sorting.",
        "Action": "Identify and list any necessary techniques or tools."
    },
    "Step 5: Sort Each Group Alphabetically": {
        "Description": "Sort each smaller 

In [262]:
blank_ds = new_ds.filter(lambda x: x["answer_pred"] == "")
blank_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 44
})

In [267]:
print(blank_ds[4]["trajectory"])

```json
{
    "Step 1: List of Words": {
        "description": "Identify the list of words to be sorted.",
        "words": [
            "sideboard",
            "procrustean",
            "seepage",
            "buxton",
            "olympic",
            "tty",
            "typescript",
            "marque",
            "callus",
            "unitary",
            "cameron",
            "contribute",
            "precise",
            "verify",
            "shelf",
            "extensible",
            "methanol"
        ]
    },
    "Step 2: Generate List of Sorting Methods": {
        "description": "Generate a list of sorting methods that can be applied.",
        "sorting_methods": [
            "Bubble Sort",
            "Quick Sort",
            "Merge Sort",
            "Insertion Sort",
            "Selection Sort"
        ]
    },
    "Step 3: Simplify the Sorting Process": {
        "description": "Simplify the sorting process by choosing an effective sorting method.",
  

In [287]:
import re
def map_fn(ins):
    if ins["answer_pred"] == "":
        match_string = r"""```(.*?)
(.*?)
```"""
        matches = re.findall(match_string, ins["trajectory"], re.DOTALL)

        if matches:
            if "Step" not in matches[-1][1]:
                wsl = matches[-1][1].translate(str.maketrans("", "", '\n.[]"'))
                wsl_spl = wsl.split(",") if len(wsl.split(",")) > 0 else wsl.split(" ")

                wsl_str = " ".join([word.strip() for word in wsl_spl])
                return {
                    "answer_pred": (wsl_str.replace("Sorted List:", "").replace("{", "").replace("}", "").strip())
                }
                

    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [288]:
blank_ds = new_ds.filter(lambda x: x["answer_pred"] == "")
blank_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 23
})

In [290]:
non_blank_ds = new_ds.filter(lambda x: x["answer_pred"] != "")
non_blank_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 227
})

In [289]:
blank_ds.to_csv("./bbh_word_sorting_blank.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

270067

In [292]:
import json
file_name = "./project-4-at-2024-11-07-09-08-1040c6e0.json"
with open(file_name, "r") as f:
    blank_ds_ann = Dataset.from_list(json.load(f))

blank_ds_ann

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred', 'id', 'answer_pred_ann', 'annotator', 'annotation_id', 'created_at', 'updated_at', 'lead_time'],
    num_rows: 23
})

In [294]:
def map_fn(ins):
    return {
        "answer_pred": ins["answer_pred_ann"]
    }

blank_ds = blank_ds_ann.map(map_fn, remove_columns=['id', 'answer_pred_ann', 'annotator', 'annotation_id', 'created_at', 'updated_at', 'lead_time'])

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

In [296]:
blank_ds[5]["answer_pred"]

'allot chauncey clergymen coddington coachmen companion embark fatten gazpacho granular hobble muslim murk niggle pvc pristine singlet threefold too yeats'

In [298]:
from datasets import concatenate_datasets

new_ds = concatenate_datasets([non_blank_ds, blank_ds])
new_ds

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [315]:
def map_fn(ins):
    return {
        "answer_pred": ins["answer_pred"].replace(",", "").replace(".", "").replace("[", "").replace("]", "")
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [321]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [316]:
corr = new_ds.filter(lambda x: x["target"].lower() == x["answer_pred"].lower())
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.728

In [317]:
wro = new_ds.filter(lambda x: x["target"].lower() != x["answer_pred"].lower())
wro

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 68
})

In [320]:
index = 15
print(wro[index]["target"])
print(wro[index]["answer_pred"])

allotted fate figural gorky grapple hydroxyl knives neapolitan nerve plainfield rampage saxon scottish scrumptious seventeen sidereal siena stooge thermal yakima
allotted fate figural gorky grapple hydroxyl knives neapolitan nerve plainfield rampage saxon scottish scrumptious siena sidereal seventeen stooge thermal yakima


# multistep_arithmetic_two

In [3]:
chk_dir = here(os.path.join(par_dir, "bbh-multistep_arithmetic_two/bbh_eval"))

In [4]:
dataset = Dataset.load_from_disk(chk_dir)

In [6]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.[]`"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn)
set(new_ds["answer_pred"])

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'-1',
 '-10',
 '-1008',
 '-11',
 '-110',
 '-113',
 '-114',
 '-11520',
 '-13',
 '-1300',
 '-1343',
 '-14',
 '-144',
 '-147',
 '-15',
 '-151',
 '-16',
 '-160',
 '-168',
 '-169',
 '-17',
 '-170',
 '-18',
 '-19',
 '-192',
 '-196',
 '-1960',
 '-1968',
 '-2',
 '-20',
 '-21',
 '-2146',
 '-217',
 '-22',
 '-224',
 '-23',
 '-24',
 '-25',
 '-26',
 '-262',
 '-264',
 '-3',
 '-30',
 '-3136',
 '-316',
 '-32',
 '-320',
 '-3225',
 '-33',
 '-330',
 '-340',
 '-3400',
 '-35',
 '-37',
 '-38',
 '-3850',
 '-39',
 '-391',
 '-4',
 '-43',
 '-45',
 '-46',
 '-47',
 '-48',
 '-5',
 '-50',
 '-51',
 '-52',
 '-5453',
 '-55',
 '-554',
 '-57',
 '-6',
 '-60',
 '-61',
 '-658',
 '-7',
 '-76',
 '-8',
 '-80',
 '-80080',
 '-81',
 '-83',
 '-8304',
 '-84',
 '-9',
 '-90',
 '-92',
 '-97',
 '-99',
 '0',
 '107',
 '11',
 '11050',
 '1168',
 '12',
 '120',
 '123',
 '137',
 '14',
 '15',
 '151',
 '16',
 '1608',
 '17',
 '176',
 '18',
 '181',
 '198',
 '1980',
 '2',
 '20',
 '21',
 '211',
 '216',
 '220',
 '23',
 '237',
 '24',
 '242',
 '25',

In [325]:
non_num_ds = new_ds.filter(lambda x: x["answer_pred"] == "correct")
non_num_ds

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 1
})

In [327]:
print(non_num_ds[0]["trajectory"])

```json
{
    "Step 1: Simplify the innermost parentheses": {
        "Expression": "((5 - 7 - 8 - 6) * (-2 * -8 * -9 + 4))",
        "Focus": "Simplify (5 - 7 - 8 - 6)",
        "Result": "-16"
    },
    "Step 2: Simplify the next set of parentheses": {
        "Expression": "(-2 * -8 * -9 + 4)",
        "Focus": "Simplify (-2 * -8 * -9 + 4)",
        "Result": "-140"
    },
    "Step 3: Multiply the simplified values": {
        "Expression": "-16 * -140",
        "Focus": "Multiply the two simplified values",
        "Result": "2240"
    },
    "Step 4: Verify the order of operations (PEMDAS/BODMAS)": {
        "Expression": "((5 - 7 - 8 - 6) * (-2 * -8 * -9 + 4))",
        "Focus": "Ensure the order of operations is followed correctly",
        "Result": "Order of operations is correct"
    },
    "Step 5: Final calculation": {
        "Expression": "2240",
        "Focus": "Perform the final calculation",
        "Result": "2240"
    },
    "Is the final answer correct": {
      

In [7]:
def map_fn(ins):
    if ins["answer_pred"] == "correct":
        return {
            "answer_pred": "2240"
        }

    return {
        "answer_pred": ins["answer_pred"]
    }
    

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [8]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.848

In [9]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [11]:
new_ds.push_to_hub("sachithgunasekara/self-discover-mistral-original-bbh-eval", cat)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/17.4k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sachithgunasekara/self-discover-mistral-original-bbh-eval/commit/a5766e422e3ebdcb788d4548d15de742b2faeb45', commit_message='Upload dataset', commit_description='', oid='a5766e422e3ebdcb788d4548d15de742b2faeb45', pr_url=None, pr_revision=None, pr_num=None)

# navigate

In [331]:
chk_dir = here(os.path.join(par_dir, "bbh-navigate/bbh_eval"))

In [332]:
dataset = Dataset.load_from_disk(chk_dir)

In [333]:
set(dataset["answer_pred"])

{'No"', 'No.', 'No."', 'Yes"', 'Yes.'}

In [334]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.[]"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [335]:
set(new_ds["answer_pred"])

{'No', 'Yes'}

In [336]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.936

In [337]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

# object_counting

In [338]:
chk_dir = here(os.path.join(par_dir, "bbh-object_counting/bbh_eval"))

In [339]:
dataset = Dataset.load_from_disk(chk_dir)

In [340]:
set(dataset["answer_pred"])

{'10.',
 '10."',
 '11"',
 '11.',
 '11."',
 '12"',
 '12.',
 '12."',
 '13"',
 '13.',
 '14"',
 '14.',
 '14."',
 '15"',
 '15.',
 '15."',
 '16"',
 '16.',
 '17"',
 '17.',
 '17."',
 '18"',
 '18.',
 '2"',
 '2.',
 '2."',
 '3"',
 '3.',
 '3."',
 '4"',
 '4.',
 '4."',
 '5"',
 '5.',
 '5."',
 '6"',
 '6.',
 '6."',
 '7"',
 '7.',
 '7."',
 '8"',
 '8.',
 '8."',
 '9"',
 '9.',
 '9."'}

In [341]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.[]"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [342]:
set(new_ds["answer_pred"])

{'10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9'}

In [343]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.848

In [344]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(save_par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]