In [472]:
from datasets import Dataset
from pyprojroot import here
import os

In [774]:
par_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/refined/")
par_dir

PosixPath('/home/ubuntu/jupyter/struct-vs-unstruct/struct_vs_unstruct/data/original/non_self_synthesis/bbh/refined')

# boolean_expressions

In [456]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-boolean_expressions/bbh_eval")

In [457]:
dataset = Dataset.load_from_disk(chk_dir)

In [460]:
def map_fn(instance):
    if "False," in instance["answer_pred"]:
        return {"answer_pred": "False"}
    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.""'))
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [471]:
set(new_ds["answer_pred"])

{'False',
 'Not True and Not False',
 'True',
 'justified based on the correct application of logical operations in the given order'}

In [467]:
new_ds.filter(lambda x: "True because both" in x["answer_pred"])["target"]

['True']

In [470]:
def map_fn(ins):
    corr_ls = ["True because both", "based on the application ", "justified by evaluating the expression"]
    for string in corr_ls:
        if string in ins["answer_pred"]:
            return {
                "answer_pred": "True"
            }

    return {
        "answer_pred": ins["answer_pred"]
    }
new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [475]:
new_ds.save_to_disk(os.path.join(par_dir, "boolean_expressions"))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [58]:
total = 3
for instance in new_ds:
    if instance["answer_pred"] == instance["target"]:
        total += 1

"Accuracy", total / new_ds.num_rows

('Accuracy', 0.976)

In [57]:
for instance in new_ds:
    if instance["answer_pred"] != instance["target"]:
        print(instance["target"])
        print(instance["answer_pred"])
        print(instance["trajectory"])
        print("=====================")

True
based on the application of the 'or' and 'not' operators, as well as the simplification of the inner parentheses,
```
{
    "Step 1: Analyze the logical expression": {
        "Identify the logical operators used": "The logical operators used are 'or' and 'not'.",
        "Identify the truth values involved": "The truth values involved are True and False.",
        "Initial interpretation of the expression": "The expression involves a combination of 'or' and 'not' operators with True and False values."
    },
    "Step 2: Simplify the logical expression": {
        "Apply De Morgan's laws or distributive property if applicable": "De Morgan's laws are not applicable in this case, but we can start by evaluating the inner parentheses.",
        "Simplify any inner parentheses or nested expressions": "The inner parentheses contain 'True or False', which simplifies to True.",
        "Expression after simplification": "( True ) or not True"
    },
    "Step 3: Evaluate the inner parent

In [27]:
new_ds.save_to_disk(here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-boolean_expressions/bbh_eval_refined"))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

# causal_judgement

In [476]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-causal_judgement/bbh_eval")

In [477]:
dataset = Dataset.load_from_disk(chk_dir)

In [478]:
set(dataset["answer_pred"])

{'No.', 'No."', None, 'Yes.'}

In [485]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '."')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

In [486]:
set(new_ds["answer_pred"])

{'No', None, 'Yes'}

In [487]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

In [488]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '."')) if instance["answer_pred"] else None
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

In [489]:
set(new_ds["answer_pred"])

{'No', 'Yes'}

In [490]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])

Filter:   0%|          | 0/187 [00:00<?, ? examples/s]

In [491]:
corr.num_rows / new_ds.num_rows

0.7058823529411765

In [492]:
new_ds.save_to_disk(os.path.join(par_dir, "causal_judgement"))

Saving the dataset (0/1 shards):   0%|          | 0/187 [00:00<?, ? examples/s]

# date_understanding

In [493]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-date_understanding/bbh_eval")

In [494]:
dataset = Dataset.load_from_disk(chk_dir)

In [495]:
set(dataset["answer_pred"])

{'(D).',
 '(D).",',
 '01/11/2011.',
 '07/10/1972.',
 '10/22/2001',
 '11/29/2001.',
 '12/02/1962.',
 'A.',
 'B',
 'B.',
 'C.',
 'D.',
 'E (Though the date provided does not exactly match the calculated date of 12/30/2013).',
 'E.',
 'F.',
 None,
 'already in the US format: 01/09/1987."'}

In [496]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [513]:
set(new_ds["answer_pred"])

{'01/11/2011',
 '07/10/1972',
 '10/22/2001',
 '11/29/2001',
 '12/02/1962',
 'A',
 'B',
 'C',
 'D',
 'E',
 'E Though the date provided does not exactly match the calculated date of 12/30/2013',
 'F',
 None,
 'already in the US format: 01/09/1987'}

In [503]:
ds_none = new_ds.filter(lambda x: x["answer_pred"] == None)
ds_none

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 3
})

In [514]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [94]:
total = 1

for instance in new_ds:
    if instance["answer_pred"] == instance["target"].translate(str.maketrans("", "", '()"')):
        total += 1

"Accuracy", total / new_ds.num_rows

('Accuracy', 0.888)

In [92]:
for inst in new_ds:
    if inst["answer_pred"] and len(inst["answer_pred"]) != 1:
        print(inst["target"])
        print(inst["answer_pred"])
        print(inst["input"])
        print("=" * 50)

(D)
12/02/1962
Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. What is the date a month ago in MM/DD/YYYY?
Options:
(A) 12/02/2000
(B) 12/02/2055
(C) 12/01/1960
(D) 12/02/1960
(E) 05/02/1961
(F) 11/18/1960
(A)
11/29/2001
In the US, Thanksgiving is on the fourth Thursday of November. Today is the US Thanksgiving of 2001. What is the date one week from today in MM/DD/YYYY?
Options:
(A) 11/29/2002
(B) 11/15/2002
(C) 11/30/2002
(D) 12/27/2002
(E) 12/11/2002
(F) 11/29/2078
(F)
10/22/2001
In the US, Thanksgiving is on the fourth Thursday of November. Today is the US Thanksgiving of 2001. What is the date a month ago in MM/DD/YYYY?
Options:
(A) 11/22/2001
(B) 10/21/2002
(C) 07/21/2002
(D) 10/22/1923
(E) 10/15/2002
(F) 10/22/2002
(B)
01/11/2011
Yesterday, Jan 21, 2011, Jane ate 2 pizzas and 5 wings. What is the date 10 days ago in MM/DD/YYYY?
Options:
(A) 01/18/2011
(B) 01/12/2011
(C) 01/12/2069
(D) 01/13/2011
(E) 05/12/2010
(F) 08/12/2010
(F)
already in the US form

In [90]:
for ins in ds_none:
    print(ins["target"])
    print(ins["trajectory"])
    print("============================")

(E)
To solve the task, we will follow the step-by-step reasoning plan.

```
{
    "Step 1: Identify the day, month, and year of the current date": {
        "Day": "Since tomorrow is Tuesday, 7/9/1972's Monday is 7/8/1972's and today is 7/8/1972, but today is actually 7/8/1972, so today is Sunday 7/8/1972's day before which is Saturday 7/7/1972's day before which is Friday 7/6/1972's day before which is Thursday 7/5/1972's day before which is Wednesday 7/4/1972's day before which is Tuesday 7/3/1972's day before which is Monday 7/2/1972's day before which is Sunday 7/1/1972's day before which is Saturday 6/30/1972's day before which is Friday 6/29/1972's day before which is Thursday 6/28/1972's day before which is Wednesday 6/27/1972's day before which is Tuesday 6/26/1972's day before which is Monday 6/25/1972's day before which is Sunday 6/24/1972's day before which is Saturday 6/23/1972's day before which is Friday 6/22/1972's day before which is Thursday 6/21/1972's day before whic

# disambiguation_qa

In [515]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-disambiguation_qa/bbh_eval")

In [516]:
dataset = Dataset.load_from_disk(chk_dir)

In [517]:
set(dataset["answer_pred"])

{'(B).', '(C).', 'A.', 'B.', 'C.'}

In [518]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [99]:
set(new_ds["answer_pred"])

{'A', 'B', 'C'}

In [519]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [100]:
total = 0

for instance in new_ds:
    if instance["answer_pred"] == instance["target"].translate(str.maketrans("", "", '()"')):
        total += 1

"Accuracy", total / new_ds.num_rows

('Accuracy', 0.704)

# dyck_languages

In [520]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-dyck_languages/bbh_eval")

In [521]:
dataset = Dataset.load_from_disk(chk_dir)

In [524]:
new_ds = dataset.filter(lambda x: x["answer_pred"] == None)
new_ds

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 45
})

In [523]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

In [525]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/45 [00:00<?, ? examples/s]

In [None]:
for instance in new_ds:
    print(instance["target"])
    print(instance["trajectory"])
    print("="*50)

# formal_fallacies

In [526]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-formal_fallacies/bbh_eval")

In [527]:
dataset = Dataset.load_from_disk(chk_dir)

In [528]:
set(dataset["answer_pred"])

{None, 'invalid.', 'valid.'}

In [529]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [112]:
set(new_ds["answer_pred"])

{None, 'invalid', 'valid'}

In [530]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [531]:
set(new_ds["answer_pred"])

{'invalid', 'valid'}

In [534]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [532]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

In [533]:
(corr.num_rows) / new_ds.num_rows

0.728

# geometric_shapes

In [535]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-geometric_shapes/bbh_eval")

In [536]:
dataset = Dataset.load_from_disk(chk_dir)

In [546]:
set(dataset["answer_pred"])

{'(D).',
 '(D)."',
 '(K) ellipse."',
 '(K)."',
 'A.',
 'A."',
 'B.',
 'C.',
 'D.',
 'D."',
 'E.',
 'E.",',
 'F.',
 'G.',
 'H.',
 'I.',
 'I."',
 'J.',
 'K.',
 'K."'}

In [547]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [548]:
set(new_ds["answer_pred"])

{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'K ellipse'}

In [549]:
def map_fn(instance):    
    if "K ellipse" == instance["answer_pred"]:
        return {
            "answer_pred": "K"
        }
    return {
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [550]:
set(new_ds["answer_pred"])

{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K'}

In [551]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [125]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 167
})

In [126]:
(corr.num_rows + 1) / new_ds.num_rows

0.672

# hyperbaton

In [552]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-hyperbaton/bbh_eval")

In [553]:
dataset = Dataset.load_from_disk(chk_dir)

In [554]:
set(dataset["answer_pred"])

{'A',
 'A"',
 'A.',
 'B is incorrect and the correct answer is not among the options, it should be: Russian old-fashioned rectangular brown baby.',
 'B.',
 'Neither (A) nor (B) is correct; the correct order is: old brown circular Turkish wool car',
 'Neither.'}

In [555]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [556]:
set(new_ds["answer_pred"])

{'A',
 'B',
 'B is incorrect and the correct answer is not among the options it should be: Russian old-fashioned rectangular brown baby',
 'Neither',
 'Neither A nor B is correct; the correct order is: old brown circular Turkish wool car'}

In [557]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [125]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 167
})

In [126]:
(corr.num_rows + 1) / new_ds.num_rows

0.672

# logical_deduction_five_objects

In [558]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-logical_deduction_five_objects/bbh_eval")

In [559]:
dataset = Dataset.load_from_disk(chk_dir)

In [560]:
set(dataset["answer_pred"])

{'(D)."', 'A.', 'B.', 'C.', 'D', 'D.', 'D."', 'E.'}

In [561]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [562]:
set(new_ds["answer_pred"])

{'A', 'B', 'C', 'D', 'E'}

In [563]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [139]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 229
})

In [140]:
(corr.num_rows) / new_ds.num_rows

0.916

# logical_deduction_seven_objects

# logical_deduction_three_objects

In [575]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-logical_deduction_three_objects/bbh_eval")

In [576]:
dataset = Dataset.load_from_disk(chk_dir)

In [577]:
set(dataset["answer_pred"])

{'A"', 'A.', 'B.', 'C.', 'C."'}

In [578]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [579]:
set(new_ds["answer_pred"])

{'A', 'B', 'C'}

In [580]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [581]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [582]:
(corr.num_rows) / new_ds.num_rows

1.0

# movie_recommendation

In [583]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-movie_recommendation/bbh_eval")

In [584]:
dataset = Dataset.load_from_disk(chk_dir)

In [585]:
set(dataset["answer_pred"])

{'A.',
 'B is not the answer, it is actually A.',
 'B.',
 'C.',
 'D.',
 'E.',
 None,
 'None of the above.'}

In [586]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [587]:
set(new_ds["answer_pred"])

{'A',
 'B',
 'B is not the answer it is actually A',
 'C',
 'D',
 'E',
 None,
 'None of the above'}

In [595]:
def map_fn(ins):
    if ins["answer_pred"] and "B is not the answer" in ins["answer_pred"]:
        return {
            "answer_pred": "A"
        }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

In [596]:
set(new_ds["answer_pred"])

{'A', 'B', 'C', 'D', 'E', None, 'None of the above'}

In [588]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [589]:
set(new_ds["answer_pred"])

{'A',
 'B',
 'B is not the answer it is actually A',
 'C',
 'D',
 'E',
 None,
 'None of the above'}

In [597]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [156]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 166
})

In [590]:
none = new_ds.filter(lambda x: x["answer_pred"] == None)
none

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 2
})

In [159]:
for n in none:
    print(n["target"])
    print(n["trajectory"])
    print("="*80)

(C)
Here is the filled JSON for the above reasoning structure:

```
{
  "Step 1: Identify genres, themes, and elements of given movies":
  {
    "Get Shorty genres": ["Comedy", "Crime"],
    "Get Shorty themes": ["Hollywood satire", "Redemption"],
    "Get Shorty elements": ["Mobsters", "Screenwriting"],
    "Mr Holland's Opus genres": ["Drama", "Music"],
    "Mr Holland's Opus themes": ["Perseverance", "Inspiration"],
    "Mr Holland's Opus elements": ["Music teacher", "High school"],
    "Stargate genres": ["Science Fiction", "Adventure"],
    "Stargate themes": ["Exploration", "Ancient civilizations"],
    "Stargate elements": ["Time travel", "Aliens"],
    "Dances with Wolves genres": ["Western", "Drama"],
    "Dances with Wolves themes": ["Cultural clash", "Self-discovery"],
    "Dances with Wolves elements": ["Native Americans", "Frontier life"]
  },
  
  "Step 2: Analyze movies from different perspectives":
  {
    "Genre analysis": "The movies span multiple genres, including co

In [163]:
new_ds.filter(lambda x: len(x["target"]) != 3)[0]

{'input': 'Find a movie similar to Minority Report, Shrek, Catch Me If You Can, Aladdin:\nOptions:\n(A) Monsters\n(B) Inc\n(C) Children of the Night\n(D) The Incredible Shrinking Man\n(E) Town & Country',
 'target': 'Monsters, Inc',
 'reasoning_formats': '\n- If the answer is not multiple choice, [answer] should be the decided answer. (For eg: Q: not True or False. A: False)\n- If the answer is multiple choice,\n    - and the given choices are unlabelled options, [answer] should be the chosen option (For eg: Q: Where does the sun rise from? Options: - East, - West, - North. A: East)\n    - and the given choices are labelled options, [answer] should be the letter corresponding to the chosen option (For eg: Q: Where does the sun rise from? Options: - A. West, - B. East, - C. North. A: B)',
 'selected_modules': '1. How could I devise an experiment to help solve that problem?\n2. Make a list of ideas for solving this problem, and apply them one by one to the problem to see if any progress 

In [164]:
(corr.num_rows + 1) / new_ds.num_rows

0.668

In [165]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-penguins_in_a_table/bbh_eval")

In [166]:
dataset = Dataset.load_from_disk(chk_dir)

In [167]:
set(dataset["answer_pred"])

{'(A) 37.',
 '(A).',
 '(B).',
 '(C).',
 '(D).',
 '(E).',
 'A because there is only 1 penguin that meets both conditions (age < 8 years old and weight > 12 kg)."',
 'A.',
 'A."',
 'B.',
 'C.',
 'D.',
 'E.',
 None,
 'the option that matches the calculated cumulative weight, which is option (C)."'}

In [168]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

In [169]:
set(new_ds["answer_pred"])

{'A',
 'A 37',
 'A because there is only 1 penguin that meets both conditions age < 8 years old and weight > 12 kg',
 'B',
 'C',
 'D',
 'E',
 None,
 'the option that matches the calculated cumulative weight which is option C'}

In [170]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/146 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 138
})

In [171]:
none = new_ds.filter(lambda x: x["answer_pred"] == None)
none

Filter:   0%|          | 0/146 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 1
})

In [None]:
for n in none:
    print(n["target"])
    print(n["trajectory"])
    print("="*80)

In [173]:
(corr.num_rows + 3) / new_ds.num_rows

0.9657534246575342

# penguins_in_a_table

In [598]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-penguins_in_a_table/bbh_eval")

In [599]:
dataset = Dataset.load_from_disk(chk_dir)

In [600]:
set(dataset["answer_pred"])

{'(A) 37.',
 '(A).',
 '(B).',
 '(C).',
 '(D).',
 '(E).',
 'A because there is only 1 penguin that meets both conditions (age < 8 years old and weight > 12 kg)."',
 'A.',
 'A."',
 'B.',
 'C.',
 'D.',
 'E.',
 None,
 'the option that matches the calculated cumulative weight, which is option (C)."'}

In [601]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

In [602]:
set(new_ds["answer_pred"])

{'A',
 'A 37',
 'A because there is only 1 penguin that meets both conditions age < 8 years old and weight > 12 kg',
 'B',
 'C',
 'D',
 'E',
 None,
 'the option that matches the calculated cumulative weight which is option C'}

In [605]:
def map_fn(ins):
    corr_A = ["A 37", "A because there is only 1 penguin"]
    corr_B = ["the option that matches the calculated"]

    if ins["answer_pred"]:
        for a in corr_A:
            if a in ins["answer_pred"]:
                return {
                    "answer_pred": "A"
                }
        for c in corr_B:
            if c in ins["answer_pred"]:
                return {
                    "answer_pred": "C"
                }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

In [606]:
set(new_ds["answer_pred"])

{'A', 'B', 'C', 'D', 'E', None}

In [609]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/146 [00:00<?, ? examples/s]

In [607]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/146 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 141
})

In [171]:
none = new_ds.filter(lambda x: x["answer_pred"] == None)
none

Filter:   0%|          | 0/146 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 1
})

In [None]:
for n in none:
    print(n["target"])
    print(n["trajectory"])
    print("="*80)

In [608]:
(corr.num_rows) / new_ds.num_rows

0.9657534246575342

# reasoning_about_colored_objects

In [610]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-reasoning_about_colored_objects/bbh_eval")

In [611]:
dataset = Dataset.load_from_disk(chk_dir)

In [612]:
set(dataset["answer_pred"])

{'(A) yes.',
 '(A) zero.',
 '(A).',
 '(B).',
 '(D) three.',
 '(D).',
 '(E) four.',
 '(E).',
 '(F) five."',
 '(H).',
 '(I).',
 '(M).',
 '(O) black.',
 '(P) grey.',
 '(P).',
 'A.',
 'B.',
 'C.',
 'D.',
 'E.',
 'F.',
 'G.',
 'H.',
 'I.',
 'J.',
 'K.',
 'L.',
 'M.',
 'N or A.',
 'N.',
 'O.',
 'P.',
 'Q.',
 'R.',
 'indeed one of the available options."',
 'verified to be correct.",'}

In [613]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [614]:
def map_fn(instance):
    sp = instance["answer_pred"].split()
    if len(sp) == 2:
        return {"answer_pred": sp[0]}
    else:
        return {"answer_pred": instance["answer_pred"]}

new_ds = new_ds.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [615]:
set(new_ds["answer_pred"])

{'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'N or A',
 'O',
 'P',
 'Q',
 'R',
 'indeed one of the available options',
 'verified to be correct'}

In [616]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [187]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 230
})

In [188]:
corr.num_rows / new_ds.num_rows

0.92

# ruin_names

In [617]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-ruin_names/bbh_eval")

In [618]:
dataset = Dataset.load_from_disk(chk_dir)

In [619]:
set(dataset["answer_pred"])

{'A.',
 'A. This option is the most likely to be a humorous edit, as it takes the original phrase and gives it a completely different meaning, implying that the panic is now related to technology or networking."',
 'A."',
 'B.',
 'C, as it is the option that best matches the key features of a humorous edit, including an unexpected twist, use of wordplay, and subversion of the original meaning."',
 'C.',
 'C."',
 'D.',
 'D."',
 'I.',
 None}

In [620]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [621]:
set(new_ds["answer_pred"])

{'A',
 'A This option is the most likely to be a humorous edit as it takes the original phrase and gives it a completely different meaning implying that the panic is now related to technology or networking',
 'B',
 'C',
 'C as it is the option that best matches the key features of a humorous edit including an unexpected twist use of wordplay and subversion of the original meaning',
 'D',
 'I',
 None}

In [622]:
def map_fn(ins):
    corr_A = ["A This option is the most"]
    corr_B = ["C as it is the option that best"]

    if ins["answer_pred"]:
        for a in corr_A:
            if a in ins["answer_pred"]:
                return {
                    "answer_pred": "A"
                }
        for c in corr_B:
            if c in ins["answer_pred"]:
                return {
                    "answer_pred": "C"
                }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [623]:
set(new_ds["answer_pred"])

{'A', 'B', 'C', 'D', 'I', None}

In [624]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [198]:
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
print(none_ds[0]["target"])
print(none_ds[0]["trajectory"])

(D)
Here is the filled JSON for the above reasoning structure:

```
{
    "Step 1: Analyze the original phrase":
        "Description": "Break down the original phrase 'black sabbath' into its linguistic components (e.g. phonetics, semantics, syntax)",
        "Analysis": "The original phrase 'black sabbath' is a proper noun, referring to a British heavy metal band. The words 'black' and 'sabbath' have individual meanings, with 'black' referring to the color or darkness, and 'sabbath' referring to a day of rest or a meeting of witches.",
        "Questions to consider": [
            "What are the individual words and their meanings?",
            "How do the words interact with each other?",
            "Are there any idiomatic expressions or colloquialisms?"
        ],
        "Answers": [
            "The individual words are 'black' and 'sabbath'.",
            "The words interact with each other to form a proper noun, referring to a specific entity.",
            "There are no idi

In [184]:
set(new_ds["answer_pred"])

{'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'N or A',
 'O',
 'P',
 'Q',
 'R',
 'indeed one of the available options',
 'verified to be correct'}

In [199]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 216
})

In [200]:
corr.num_rows / new_ds.num_rows

0.864

# salient_translation_error_detection

In [625]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-salient_translation_error_detection/bbh_eval")

In [626]:
dataset = Dataset.load_from_disk(chk_dir)

In [627]:
set(dataset["answer_pred"])

{'(A) Modifiers or Adjectives or (E) Dropped Content. However, since the question asks for a single option and (A) is listed first among the correct options, the answer would be A."',
 '(D) Named Entities.",',
 '(D).',
 'A.',
 'B.',
 'C.',
 'D.',
 'E.',
 'F.'}

In [628]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [629]:
set(new_ds["answer_pred"])

{'A',
 'A Modifiers or Adjectives or E Dropped Content However since the question asks for a single option and A is listed first among the correct options the answer would be A',
 'B',
 'C',
 'D',
 'D Named Entities',
 'E',
 'F'}

In [630]:
def map_fn(instance):
    ls = instance["answer_pred"].split()

    if len(ls) >= 3:
        return {"answer_pred": ls[0]}

    return {"answer_pred": instance["answer_pred"]}

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [631]:
set(new_ds["answer_pred"])

{'A', 'B', 'C', 'D', 'E', 'F'}

In [632]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [633]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 179
})

In [634]:
corr.num_rows / new_ds.num_rows

0.716

# snarks

In [635]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-snarks/bbh_eval")

In [636]:
dataset = Dataset.load_from_disk(chk_dir)

In [637]:
set(dataset["answer_pred"])

{'A.',
 'B (or A, as both statements are identical and sarcastic).',
 'B.',
 'No.'}

In [638]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

In [639]:
set(new_ds["answer_pred"])

{'A', 'B', 'B or A as both statements are identical and sarcastic', 'No'}

In [226]:
no_ds = new_ds.filter(lambda x: x["answer_pred"] == "No")
print(no_ds[0]["input"])
print(no_ds[0]["answer_pred"])
print(no_ds[0]["target"])

Which statement is sarcastic?
Options:
(A) The NB
No
(A)


In [640]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/178 [00:00<?, ? examples/s]

In [227]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/178 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 153
})

In [228]:
corr.num_rows / new_ds.num_rows

0.8595505617977528

# sports_understanding

In [641]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-sports_understanding/bbh_eval")

In [642]:
dataset = Dataset.load_from_disk(chk_dir)

In [643]:
set(dataset["answer_pred"])

{'False.',
 'False."',
 'High.',
 'Implausible.',
 'No.',
 None,
 'Plausible but not confirmed.',
 'Plausible but uncertain.',
 'Plausible, but unconfirmed without concrete evidence or records.',
 'Plausible.',
 'The claim is plausible but unverifiable.',
 'True.',
 'True."',
 'Uncertain.',
 'Unlikely.',
 'Yes.',
 'yes.'}

In [644]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [645]:
set(new_ds["answer_pred"])

{'False',
 'High',
 'Implausible',
 'No',
 None,
 'Plausible',
 'Plausible but not confirmed',
 'Plausible but uncertain',
 'Plausible but unconfirmed without concrete evidence or records',
 'The claim is plausible but unverifiable',
 'True',
 'Uncertain',
 'Unlikely',
 'Yes',
 'yes'}

In [245]:
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
none_ds

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 59
})

In [246]:
index = 0
print(none_ds[0]["trajectory"])

```
{
    "Understanding the context and terminology": {
        "Key assumptions about the context": "The sentence is referring to a sports game, likely hockey or soccer, given the mention of a defender.",
        "Key assumptions about the terminology": "The term 'Maradona' is likely being used to describe a move or action, possibly a reference to the famous Argentine soccer player Diego Maradona.",
        "Potential impact on plausibility": "If the context is hockey, the sentence may be less plausible, as Maradona is a soccer legend. However, if the context is soccer, the sentence becomes more plausible."
    },
    "Interpretation by different groups": {
        "Hockey fans' interpretation": "Hockey fans may interpret the sentence as Travis Konecny performing a skillful move, but may be unfamiliar with the reference to Maradona.",
        "Soccer fans' interpretation": "Soccer fans will likely recognize the reference to Maradona and interpret the sentence as Travis Konecny perfor

In [646]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [647]:
new_ds.filter(lambda x: x["answer_pred"] == None)

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 0
})

In [648]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = new_ds.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [649]:
set(new_ds["answer_pred"])

{'False',
 'High',
 'Implausible',
 'Impossible to determine',
 'No',
 'No the sentence Emmanuel Sanders got a base hit is not plausible',
 'Plausible',
 'Plausible but not confirmed',
 'Plausible but not necessarily probable',
 'Plausible but potentially exaggerated or metaphorical',
 'Plausible but uncertain',
 'Plausible but unconfirmed without concrete evidence or records',
 'Somewhat plausible',
 "The claim is likely implausible as a literal baseball play but without context it's difficult to say for certain",
 'The claim is plausible but unverifiable',
 'The claim is plausible but unverified',
 "The claim is uncertain and lacks specific evidence to support it but it is not entirely implausible given Curry's skills and abilities",
 "The sentence 'Jesus Luzardo was called for slashing' is plausible but there is limited information available to confirm its accuracy",
 'The sentence is partially plausible',
 'The sentence is plausible',
 'The sentence is plausible but lacks concrete 

In [652]:
plausible_yes = [
    'Plausible',
    'The sentence is plausible',
    'Yes',
    'Yes the sentence is plausible',
    'True',
    'yes'
]
implausible_no = [
    'False',
    'Implausible',
    'No',
    'No the sentence Emmanuel Sanders got a base hit is not plausible',
    'Unlikely',
    "The statement is implausible in the context of Fernando Tatis Jr's profession as a baseball player but it becomes more plausible when considering alternative perspectives and sports contexts"
]
indeterminate = [
    'High',
    'Impossible to determine',
    'Plausible but not confirmed',
    'Plausible but not necessarily probable',
    'Plausible but potentially exaggerated or metaphorical',
    'Plausible but uncertain',
    'Plausible but unconfirmed without concrete evidence or records',
    'Somewhat plausible',
    "The claim is likely implausible as a literal baseball play but without context it's difficult to say for certain",
    'The claim is plausible but unverifiable',
    'The claim is plausible but unverified',
    "The claim is uncertain and lacks specific evidence to support it but it is not entirely implausible given Curry's skills and abilities",
    "The sentence 'Jesus Luzardo was called for slashing' is plausible but there is limited information available to confirm its accuracy",
    'The sentence is partially plausible',
    'The sentence is plausible but lacks concrete evidence to confirm its accuracy',
    'The sentence is plausible if interpreted metaphorically or as an analogy but less plausible if taken literally',
    "The sentence is plausible in the context of ice hockey but it's unclear without more context",
    'The sentence is somewhat plausible',
    'The sentence is somewhat plausible but unlikely',
    'Uncertain',
    'Uncertain due to lack of information',
    "Without further evidence or context it's difficult to determine the plausibility of the claim",
    'uncertain'
]

def map_fn(ins):
    for yes in plausible_yes:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "yes"
            }

    for no in implausible_no:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "no"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [653]:
set(new_ds["answer_pred"])

{'High',
 'Impossible to determine',
 'Plausible but not confirmed',
 'Plausible but not necessarily probable',
 'Plausible but potentially exaggerated or metaphorical',
 'Plausible but uncertain',
 'Plausible but unconfirmed without concrete evidence or records',
 'Somewhat plausible',
 "The claim is likely implausible as a literal baseball play but without context it's difficult to say for certain",
 'The claim is plausible but unverifiable',
 'The claim is plausible but unverified',
 "The claim is uncertain and lacks specific evidence to support it but it is not entirely implausible given Curry's skills and abilities",
 "The sentence 'Jesus Luzardo was called for slashing' is plausible but there is limited information available to confirm its accuracy",
 'The sentence is partially plausible',
 'The sentence is plausible but lacks concrete evidence to confirm its accuracy',
 'The sentence is plausible if interpreted metaphorically or as an analogy but less plausible if taken literall

In [655]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [654]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.732

# temporal_sequences

In [656]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-temporal_sequences/bbh_eval")

In [657]:
dataset = Dataset.load_from_disk(chk_dir)

In [658]:
set(dataset["answer_pred"])

{'(B)."',
 '(D) 4pm to 5pm.',
 'A or C, but since the format requires a single answer, we will choose one of the correct options.',
 'A.',
 'B.',
 'Both (A) and (D).',
 'C.',
 'C."',
 'D.'}

In [659]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [660]:
set(new_ds["answer_pred"])

{'A',
 'A or C but since the format requires a single answer we will choose one of the correct options',
 'B',
 'Both A and D',
 'C',
 'D',
 'D 4pm to 5pm'}

In [662]:
def map_fn(ins):
    if "D 4pm to 5pm" == ins["answer_pred"]:
        return {
            "answer_pred": "D"
        }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [663]:
set(new_ds["answer_pred"])

{'A',
 'A or C but since the format requires a single answer we will choose one of the correct options',
 'B',
 'Both A and D',
 'C',
 'D'}

In [666]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [664]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 248
})

In [665]:
(corr.num_rows) / new_ds.num_rows

0.992

# tracking_shuffled_objects_five_objects

In [667]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-tracking_shuffled_objects_five_objects/bbh_eval")

In [668]:
dataset = Dataset.load_from_disk(chk_dir)

In [669]:
set(dataset["answer_pred"])

{'A.', 'A."', 'B.', 'C.', 'D.', 'E.'}

In [670]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [671]:
set(new_ds["answer_pred"])

{'A', 'B', 'C', 'D', 'E'}

In [672]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [284]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 248
})

In [285]:
(corr.num_rows) / new_ds.num_rows

0.992

# tracking_shuffled_objects_seven_objects

In [673]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-tracking_shuffled_objects_seven_objects/bbh_eval")

In [674]:
dataset = Dataset.load_from_disk(chk_dir)

In [675]:
set(dataset["answer_pred"])

{'A.', 'B.', 'C.', 'D.', 'E.', 'F.', 'G) Izzi.', 'G.'}

In [676]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [677]:
set(new_ds["answer_pred"])

{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'G Izzi'}

In [678]:
def map_fn(ins):
    if "G Izzi" == ins["answer_pred"]:
        return {
            "answer_pred": "G"
        }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [679]:
set(new_ds["answer_pred"])

{'A', 'B', 'C', 'D', 'E', 'F', 'G'}

In [682]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [680]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 249
})

In [681]:
(corr.num_rows) / new_ds.num_rows

0.996

# tracking_shuffled_objects_three_objects

In [683]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-tracking_shuffled_objects_three_objects/bbh_eval")

In [684]:
dataset = Dataset.load_from_disk(chk_dir)

In [685]:
set(dataset["answer_pred"])

{'A"', 'A.', 'A."', 'B.', 'B."', 'C.', 'C."'}

In [686]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [687]:
set(new_ds["answer_pred"])

{'A', 'B', 'C'}

In [688]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [301]:
corr = new_ds.filter(lambda x: x["target"].translate(str.maketrans("", "", '()"')) == x["answer_pred"])
corr

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 250
})

In [303]:
(corr.num_rows) / new_ds.num_rows

1.0

# web_of_lies

In [689]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-web_of_lies/bbh_eval")

In [690]:
dataset = Dataset.load_from_disk(chk_dir)

In [691]:
set(dataset["answer_pred"])

{'F.', 'False.', 'No.', None, 'T.', 'True.', 'True."', 'Yes.', 'true.', 'yes.'}

In [692]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.,()"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn, load_from_cache_file=False)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [693]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

In [694]:
set(new_ds["answer_pred"])

{'F', 'False', 'No', None, 'T', 'True', 'Yes', 'true', 'yes'}

In [312]:
set(new_ds["target"])

{'No', 'Yes'}

In [695]:
yes_ls = ["True", "Yes", "true", "yes", "T"]
no_ls = ["False", "No", "F"]

def map_fn(ins):
    for yes in yes_ls:
        if yes == ins["answer_pred"]:
            return {
                "answer_pred": "Yes"
            }

    for no in no_ls:
        if no == ins["answer_pred"]:
            return {
                "answer_pred": "No"
            }
    return {
        "answer_pred": ins["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [696]:
set(new_ds["answer_pred"])

{'No', None, 'Yes'}

In [698]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [697]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.876

In [321]:
yes_corr_ds = yes_ds.filter(lambda x: x["target"] == "Yes")
no_corr_ds = no_ds.filter(lambda x: x["target"] == "No")
none_ds = new_ds.filter(lambda x: x["answer_pred"] == None)
none_ds

Filter:   0%|          | 0/142 [00:00<?, ? examples/s]

Filter:   0%|          | 0/107 [00:00<?, ? examples/s]

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 1
})

In [322]:
print(none_ds[0]["input"])
print(none_ds[0]["target"])
print(none_ds[0]["trajectory"])

Question: Elanor lies. Millicent says Elanor tells the truth. Millie says Millicent lies. Michaela says Millie tells the truth. Kandi says Michaela lies. Does Kandi tell the truth?
No
```
{
    "Step 1: Analyze Elanor's statement": {
        "Assumption": "Elanor lies",
        "Implication": "Elanor's statement is false, so the opposite of what Elanor says is true."
    },
    "Step 2: Analyze Millicent's statement": {
        "Statement": "Millicent says Elanor tells the truth",
        "Assumption": "Millicent lies",
        "Implication": "If Millicent says Elanor tells the truth, but Elanor actually lies, then Millicent's statement is false."
    },
    "Step 3: Analyze Millie's statement": {
        "Statement": "Millie says Millicent lies",
        "Assumption": "Millie tells the truth",
        "Implication": "If Millie says Millicent lies, and Millicent's statement is actually false, then Millie's statement is true."
    },
    "Step 4: Analyze Michaela's statement": {
       

# word_sorting

In [765]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-word_sorting/bbh_eval")

In [766]:
dataset = Dataset.load_from_disk(chk_dir)

In [767]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.[]"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn)

In [768]:
t = new_ds.filter(lambda x: x["answer_pred"]==None)
t

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 124
})

In [769]:
import re
def map_fn(instance):
    if instance["answer_pred"] == None:
        text = "The final answer is:"
        pattern = fr"(?<={text}).*"
    
        response = instance["trajectory"]
    
        try:
            answer, trajectory = re.search(pattern, response).group(0).strip(), re.sub(pattern, "", response).replace(text, "").strip()
        except:
            answer, trajectory = None, response
    
        return {
            "trajectory": trajectory,
            "answer_pred": answer
        }

    return {
        "trajectory": instance["trajectory"],
        "answer_pred": instance["answer_pred"]
    }

new_ds = new_ds.map(map_fn)

In [770]:
t = new_ds.filter(lambda x: x["answer_pred"]==None)
t

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 0
})

In [771]:
new_ds["target"][0]

'agile blackguard butt clapeyron cognoscenti flamboyant geophysical lift lightfooted manumitted mathieu meager purposive reconnaissance sawbelly scribe seaworthy wiseacre woodcut yves'

In [772]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.[]"'))
    }

new_ds = new_ds.map(map_fn)

In [None]:
set(new_ds["answer_pred"])

In [752]:
'[skimpy, zoroaster]'.strip("[]").split(", ")

['skimpy', 'zoroaster']

In [753]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [754]:
corr = new_ds.filter(lambda x: x["target"].lower() == x["answer_pred"].lower())
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.02

In [755]:
wro = new_ds.filter(lambda x: x["target"].lower() != x["answer_pred"].lower())
wro

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'target', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_structure', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 245
})

In [453]:
index = 11
print(wro[index]["target"])
print(wro[index]["answer_pred"])

abbas average bridesmaid catsup charm coddle dogfish hypothalamus inconvertible inequity integral invocable memorandum multiplet phloem region scherzo shutout therewith trumpery
abbas average bridesmaid catsup charm coddle dogfish hypothalamus inequity inconvertible integral invocable memorandum multiplet phloem region scherzo shutout therewith trumpery


In [455]:
(corr.num_rows) / dataset.num_rows

0.72

# multistep_arithmetic_two

In [776]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-multistep_arithmetic_two/bbh_eval")

In [777]:
dataset = Dataset.load_from_disk(chk_dir)

In [778]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.[]"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [779]:
set(new_ds["answer_pred"])

{'-1',
 '-10',
 '-1008',
 '-11',
 '-110',
 '-113',
 '-114',
 '-11520',
 '-13',
 '-1343',
 '-144',
 '-147',
 '-15',
 '-151',
 '-160',
 '-168',
 '-169',
 '-17',
 '-170',
 '-17238',
 '-18',
 '-19',
 '-192',
 '-196',
 '-1960',
 '-1968',
 '-2',
 '-20',
 '-21',
 '-2146',
 '-217',
 '-22',
 '-23',
 '-24',
 '-240',
 '-25',
 '-26',
 '-262',
 '-264',
 '-2646',
 '-284',
 '-3',
 '-30',
 '-32',
 '-320',
 '-33',
 '-330',
 '-340',
 '-3400',
 '-343',
 '-35',
 '-36',
 '-37',
 '-38',
 '-3850',
 '-39',
 '-391',
 '-39960',
 '-4',
 '-43',
 '-46',
 '-48',
 '-5',
 '-50',
 '-51',
 '-52',
 '-5463',
 '-55',
 '-554',
 '-57',
 '-59',
 '-6',
 '-60',
 '-61',
 '-658',
 '-7',
 '-73',
 '-76',
 '-8',
 '-80',
 '-81',
 '-83',
 '-8304',
 '-84',
 '-8840',
 '-89',
 '-9',
 '-90',
 '-97',
 '-99',
 '0',
 '10',
 '101',
 '107',
 '11',
 '1168',
 '12',
 '120',
 '123',
 '14',
 '140',
 '15',
 '150',
 '16',
 '1608',
 '17',
 '176',
 '18',
 '181',
 '1980',
 '2',
 '20',
 '211',
 '216',
 '220',
 '2240',
 '237',
 '24',
 '25',
 '251472',
 '

In [784]:
def map_fn(ins):
    corr_ls = ["accurate", "reasonable and accurate, as the calculations were performed correctly", "reasonable and correct based on the given expression"]

    for x in corr_ls:
        if x == ins["answer_pred"]:
            print(ins["answer_pred"])
            print(ins["trajectory"])
            print("="*80)
    if ins["answer_pred"] == corr_ls[0]:
        return {
            "answer_pred": "10"
        }

    if ins["answer_pred"] == corr_ls[2]:
        return {
            "answer_pred": "21"
        }

    if ins["answer_pred"] == corr_ls[1]:
        return {
            "answer_pred": "57"
        }

    return {
        "answer_pred": ins["answer_pred"]
    }
    

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

accurate
Here's the filled JSON for the above reasoning structure:

```
{
    "Step 1: Identify the core mathematical operations":
        "The main operations are addition, subtraction, and multiplication inside the parentheses, followed by subtraction of the results of the two parentheses.",

    "Step 2: Break down the expression into smaller parts":
        "The expression can be broken down into two parts inside the parentheses: (1 + 2 * -1 - -8) and (-8 - 2 - -1 + 6).",

    "Step 3: Evaluate expressions inside parentheses":
        {
            "Parenthesis 1": "1 + 2 * -1 - -8",
            "Parenthesis 2": "-8 - 2 - -1 + 6"
        },

    "Step 4: Simplify expressions inside parentheses":
        {
            "Parenthesis 1": "Apply order of operations (PEMDAS): 1 + 2 * -1 - -8 = 1 + (-2) + 8",
            "Parenthesis 2": "Apply order of operations (PEMDAS): -8 - 2 - -1 + 6 = -8 - 2 + 1 + 6"
        },

    "Step 5: Combine like terms inside parentheses":
        {
       

In [786]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

0.94

In [788]:
hub_name = "sachithgunasekara/self-discover-original-bbh-eval"

In [789]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

new_ds.push_to_hub(hub_name, cat)

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/15.5k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sachithgunasekara/self-discover-original-bbh-eval/commit/22d78ce471624cea56e4f2866f837d9c08adf9c2', commit_message='Upload dataset', commit_description='', oid='22d78ce471624cea56e4f2866f837d9c08adf9c2', pr_url=None, pr_revision=None, pr_num=None)

# navigate

In [790]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-navigate/bbh_eval")

In [791]:
dataset = Dataset.load_from_disk(chk_dir)

In [792]:
set(dataset["answer_pred"])

{'No.', 'Yes.'}

In [793]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.[]"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [794]:
set(new_ds["answer_pred"])

{'No', 'Yes'}

In [795]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.972

In [788]:
hub_name = "sachithgunasekara/self-discover-original-bbh-eval"

In [796]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

new_ds.push_to_hub(hub_name, cat)

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/16.2k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sachithgunasekara/self-discover-original-bbh-eval/commit/a863fabda7b72c58c74921dfe556d0750f9fc558', commit_message='Upload dataset', commit_description='', oid='a863fabda7b72c58c74921dfe556d0750f9fc558', pr_url=None, pr_revision=None, pr_num=None)

# object_counting

In [797]:
chk_dir = here("struct_vs_unstruct/data/original/non_self_synthesis/bbh/bbh-object_counting/bbh_eval")

In [798]:
dataset = Dataset.load_from_disk(chk_dir)

In [799]:
set(dataset["answer_pred"])

{'10.',
 '11.',
 '11."',
 '12.',
 '12."',
 '13.',
 '14.',
 '14."',
 '15.',
 '16.',
 '16."',
 '17.',
 '18."',
 '19.',
 '2.',
 '2."',
 '3.',
 '3."',
 '4.',
 '4."',
 '5.',
 '5."',
 '6.',
 '6."',
 '7.',
 '7."',
 '8.',
 '8."',
 '9.',
 '9."',
 'based on the accurate count of the number of animals in the given list, using the defined definition of \'animal\' and categorization."',
 'determined based on the step-by-step analysis."'}

In [800]:
def map_fn(instance):    
    return {
        "answer_pred": instance["answer_pred"].translate(str.maketrans("", "", '.[]"')) if instance["answer_pred"] else None
    }

new_ds = dataset.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [801]:
set(new_ds["answer_pred"])

{'10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 "based on the accurate count of the number of animals in the given list, using the defined definition of 'animal' and categorization",
 'determined based on the step-by-step analysis'}

In [804]:
def map_fn(ins):
    corr_ls = ["based on the accurate count of the number", "determined based on the step-by-step"]

    for x in corr_ls:
        if x in ins["answer_pred"]:
            print(ins["answer_pred"])
            print(ins["trajectory"])
            print("="*80)
    if corr_ls[0] in ins["answer_pred"]:
        return {
            "answer_pred": "9"
        }

    if corr_ls[1] in ins["answer_pred"]:
        return {
            "answer_pred": "10"
        }
        
    return {
        "answer_pred": ins["answer_pred"]
    }
    

new_ds = new_ds.map(map_fn)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

based on the accurate count of the number of animals in the given list, using the defined definition of 'animal' and categorization
```
{
    "Step 1: Define the definition of 'animal' to be used for counting":
        {
            "Definition": "A living organism that feeds on organic matter, typically having specialized sense organs and nervous system and able to respond rapidly to stimuli.",
            "Justification": "This definition encompasses a wide range of living creatures, including mammals, birds, fish, and invertebrates, which will be used to identify and count the animals in the given list."
        },
    "Step 2: Identify the objects in the list that are living creatures":
        {
            "Living creatures": ["snail", "duck", "cat", "fish", "chicken", "cow", "mice", "rabbit"],
            "Non-living objects": ["chair", "toaster", "ovens"]
        },
    "Step 3: Exclude non-living objects from the list":
        {
            "Updated list": ["snail", "duck", "

In [805]:
corr = new_ds.filter(lambda x: x["target"] == x["answer_pred"])
corr.num_rows / new_ds.num_rows

Filter:   0%|          | 0/250 [00:00<?, ? examples/s]

0.884

In [788]:
hub_name = "sachithgunasekara/self-discover-original-bbh-eval"

In [806]:
start_string, end_string = "non_self_synthesis/bbh/", "/bbh_eval"
cat = str(chk_dir)[str(chk_dir).find(start_string) + len(start_string) + 4:str(chk_dir).find(end_string)]
new_ds.save_to_disk(os.path.join(par_dir, cat))

new_ds.push_to_hub(hub_name, cat)

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/16.8k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sachithgunasekara/self-discover-original-bbh-eval/commit/7ab1ac8f4be95cddd9c19c028b45494ca802c675', commit_message='Upload dataset', commit_description='', oid='7ab1ac8f4be95cddd9c19c028b45494ca802c675', pr_url=None, pr_revision=None, pr_num=None)