In [102]:
import random

from tqdm import tqdm
from datasets import load_dataset, get_dataset_config_names
from langchain_core.prompts import PromptTemplate
from transformers import AutoTokenizer

In [13]:
reasoning_modules = [
    "1. How could I devise an experiment to help solve that problem?",
    "2. Make a list of ideas for solving this problem, and apply them one by one to the problem to see if any progress can be made.",
    # "3. How could I measure progress on this problem?",
    "4. How can I simplify the problem so that it is easier to solve?",
    "5. What are the key assumptions underlying this problem?",
    "6. What are the potential risks and drawbacks of each solution?",
    "7. What are the alternative perspectives or viewpoints on this problem?",
    "8. What are the long-term implications of this problem and its solutions?",
    "9. How can I break down this problem into smaller, more manageable parts?",
    "10. Critical Thinking: This style involves analyzing the problem from different perspectives, questioning assumptions, and evaluating the evidence or information available. It focuses on logical reasoning, evidence-based decision-making, and identifying potential biases or flaws in thinking.",
    "11. Try creative thinking, generate innovative and out-of-the-box ideas to solve the problem. Explore unconventional solutions, thinking beyond traditional boundaries, and encouraging imagination and originality.",
    # "12. Seek input and collaboration from others to solve the problem. Emphasize teamwork, open communication, and leveraging the diverse perspectives and expertise of a group to come up with effective solutions.",
    "13. Use systems thinking: Consider the problem as part of a larger system and understanding the interconnectedness of various elements. Focuses on identifying the underlying causes, feedback loops, and interdependencies that influence the problem, and developing holistic solutions that address the system as a whole.",
    "14. Use Risk Analysis: Evaluate potential risks, uncertainties, and tradeoffs associated with different solutions or approaches to a problem. Emphasize assessing the potential consequences and likelihood of success or failure, and making informed decisions based on a balanced analysis of risks and benefits.",
    # "15. Use Reflective Thinking: Step back from the problem, take the time for introspection and self-reflection. Examine personal biases, assumptions, and mental models that may influence problem-solving, and being open to learning from past experiences to improve future approaches.",
    "16. What is the core issue or problem that needs to be addressed?",
    "17. What are the underlying causes or factors contributing to the problem?",
    "18. Are there any potential solutions or strategies that have been tried before? If yes, what were the outcomes and lessons learned?",
    "19. What are the potential obstacles or challenges that might arise in solving this problem?",
    "20. Are there any relevant data or information that can provide insights into the problem? If yes, what data sources are available, and how can they be analyzed?",
    "21. Are there any stakeholders or individuals who are directly affected by the problem? What are their perspectives and needs?",
    "22. What resources (financial, human, technological, etc.) are needed to tackle the problem effectively?",
    "23. How can progress or success in solving the problem be measured or evaluated?",
    "24. What indicators or metrics can be used?",
    "25. Is the problem a technical or practical one that requires a specific expertise or skill set? Or is it more of a conceptual or theoretical problem?",
    "26. Does the problem involve a physical constraint, such as limited resources, infrastructure, or space?",
    "27. Is the problem related to human behavior, such as a social, cultural, or psychological issue?",
    "28. Does the problem involve decision-making or planning, where choices need to be made under uncertainty or with competing objectives?",
    "29. Is the problem an analytical one that requires data analysis, modeling, or optimization techniques?",
    "30. Is the problem a design challenge that requires creative solutions and innovation?",
    "31. Does the problem require addressing systemic or structural issues rather than just individual instances?",
    "32. Is the problem time-sensitive or urgent, requiring immediate attention and action?",
    "33. What kinds of solution typically are produced for this kind of problem specification?",
    "34. Given the problem specification and the current best solution, have a guess about other possible solutions."
    "35. Let’s imagine the current best solution is totally wrong, what other ways are there to think about the problem specification?"
    "36. What is the best way to modify this current best solution, given what you know about these kinds of problem specification?"
    "37. Ignoring the current best solution, create an entirely new solution to the problem."
    # "38. Let’s think step by step."
    "39. Let’s make a step by step plan and implement it with good notation and explanation.",
]

REASONING_MODULES = "\n".join(reasoning_modules)

In [9]:
### SELECT PROMPT ###

SELECT_PROMPT = """Select several reasoning modules that are crucial to utilize in order to solve the given task.

All reasoning module descriptions:
{reasoning_modules}

Task: 
{task_description}

Select several modules are crucial for solving the task above. 
Your response should only be the list of select modules(module number and description), no explanations or solutions are required."""


### ADAPT PROMPT ###

ADAPT_PROMPT = """Rephrase and specify each reasoning module so that it better helps solving the task.

SELECTED module descriptions:
{selected_modules}

Task:
{task_description}

Adapt each reasoning module description to better solve the task.
Your response should only be the list of adapted modules, no explanations or solutions are required."""


### STRUCTURING PROMPT ###

STRUCTURING_PROMPT = """Operationalize the reasoning modules into a step-by-step reasoning plan in JSON format.

Here's an example:

Example task:

If you follow these instructions, do you return to the starting point? Always face forward. Take 1 step backward. Take 9 steps left. Take 2 steps backward. Take 6 steps forward. Take 4 steps forward. Take 4 steps backward. Take 3 steps right.

Example reasoning structure:

{{
    "Position after instruction 1":
    "Position after instruction 2":
    "Position after instruction n":
    "Is final position the same as starting position":
}}

Adapted module description:
{adapted_modules}

Task:
{task_description}

Implement a reasoning structure for solvers to follow step-by-step and arrive at correct answer.

Note: do NOT actually arrive at a conclusion in this pass. Your job is to generate a PLAN so that in the future you can fill it out and arrive at the correct conclusion for tasks like this"""


### REASONING PLAN PROMPT ###

NL_REASONING_PLAN_PROMPT = """Operationalize the reasoning modules into a step-by-step reasoning plan in plain English to solve the given task.
Make sure the plan is concrete, intuitive, and unambigous.
The reasoning plan should help an AI agent follow it and be able to derive a solution to the given task.

Here's an example:
Example task:
If you follow these instructions, do you return to the starting point? Always face forward. Take 1 step backward. Take 9 steps left. Take 2 steps backward. Take 6 steps forward. Take 4 steps forward. Take 4 steps backward. Take 3 steps right.

Example reasoning structure:
Find position after instruction 1.
FInd position after instruction 2.
Find position after instruction n.
Is final position the same as starting position?

Reasoning Module description:
{adapted_modules}

Task:
{task_description}

Note: do NOT actually arrive at a conclusion in this pass. Your job is to generate a PLAN that can be followed to arrive at the correct answer the given task."""


### FOLLOW PLAN PROMPT ###

REASONING_PROMPT = """Follow the step-by-step reasoning plan in JSON to correctly solve the task filling in the values for the corresponding keys.
Do not simply rephrase the keys.
Phrase your final answer always as "The final answer is [answer]".

[answer] should be in one of the following formats:
{reasoning_formats}
    
Reasoning Structure:
{reasoning_structure}

Correctly follow the above JSON reasoning structure to solve the given task below. Your response should be the filled JSON for the above reasoning structure.

Task:
{task_description}"""

FOLLOW_REASONING_PLAN_PROMPT = """Follow the reasoning plan step-by-step to arrive at the correct answer
Your response should only contain the reasoning process for the given task.
Phrase your final answer always as "The final answer is [answer]".

[answer] should be in one of the following formats:
{reasoning_formats}

Reasoning Plan:
{reasoning_plan}

Task:
{task_description}"""

In [23]:
select_prompt = PromptTemplate.from_template(SELECT_PROMPT)
adapt_prompt = PromptTemplate.from_template(ADAPT_PROMPT)

structuring_prompt = PromptTemplate.from_template(STRUCTURING_PROMPT)
nl_reasoning_plan_prompt = PromptTemplate.from_template(NL_REASONING_PLAN_PROMPT)

reasoning_prompt = PromptTemplate.from_template(REASONING_PROMPT)
follow_reasoning_plan_prompt = PromptTemplate.from_template(FOLLOW_REASONING_PLAN_PROMPT)

In [12]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-405B-Instruct")

In [100]:
input_token_count = 0
output_token_count = 0

def calculate_token_count(text):
    return len(tokenizer.encode(text))

def map_fn(instance, benchmark, modified):
    global input_token_count, output_token_count

    if benchmark == "t4d":
        task_description_ins = f"""Observation:
{instance["story"]}

Question:
{instance["question"]}"""
        ANSWER_FORMATS = """- should be complete with the letter and correct answer from the list of given choices (Example answer:  K. Ananda))"""
    
    elif benchmark == "bbh":
        task_description_ins = instance["input"]
        ANSWER_FORMATS = """
- If the answer is not multiple choice, [answer] should be the decided answer. (For eg: Q: not True or False. A: False)
- If the answer is multiple choice,
    - and the given choices are unlabelled options, [answer] should be the chosen option (For eg: Q: Where does the sun rise from? Options: - East, - West, - North. A: East)
    - and the given choices are labelled options, [answer] should be the letter corresponding to the chosen option (For eg: Q: Where does the sun rise from? Options: - A. West, - B. East, - C. North. A: B)"""
    
    elif benchmark == "math":
        one_shot_example = random.sample(list(load_dataset("qwedsacf/competition_math", split="train").filter(lambda x: x["level"] == instance["level"] and x["type"] == instance["type"])), 1)[0]

        task_description_ins = f"""Problem: {instance["problem"]}

<<<BEGIN: An example problem and solution>>>
Problem: {one_shot_example["problem"]}
Solution: {one_shot_example["solution"]}
<<<END: An example problem and solution>>>"""
        ANSWER_FORMATS = """
- should be the final answer based on calculations formatted in Latex style"""
    

    input_token_count += calculate_token_count(select_prompt.format(reasoning_modules=REASONING_MODULES, task_description=task_description_ins))
    output_token_count += calculate_token_count(instance["selected_modules"])
    
    input_token_count += calculate_token_count(adapt_prompt.format(selected_modules=instance["selected_modules"], task_description=task_description_ins))
    output_token_count += calculate_token_count(instance["adapted_modules"])

    if not modified:
        input_token_count += calculate_token_count(structuring_prompt.format(adapted_modules=instance["adapted_modules"], task_description=task_description_ins))
        output_token_count += calculate_token_count(instance["reasoning_structure"])
    else:
        input_token_count += calculate_token_count(nl_reasoning_plan_prompt.format(adapted_modules=instance["adapted_modules"], task_description=task_description_ins))
        output_token_count += calculate_token_count(instance["reasoning_plan"])

    if not modified:
        input_token_count += calculate_token_count(reasoning_prompt.format(reasoning_formats=ANSWER_FORMATS, reasoning_structure=instance["reasoning_structure"], task_description=task_description_ins))
    else:
        input_token_count += calculate_token_count(follow_reasoning_plan_prompt.format(reasoning_formats=ANSWER_FORMATS, reasoning_plan=instance["reasoning_plan"], task_description=task_description_ins))
    output_token_count += calculate_token_count(instance["reasoning"])

#     input_token_count += 5 * calculate_token_count(f"""Here are some other examples like the above:

# {task_description_ins}""")

# t4d

## Phased Self-Discover (struct)

In [86]:
input_token_count = 0
output_token_count = 0

dataset = load_dataset("sachithgunasekara/self-discover-llama-original-t4d-eval", split="train", streaming=True)

for instance in dataset:
    map_fn(instance, "t4d", False)

print(f"Total input token count: {input_token_count}")
print(f"Total output token count: {output_token_count}")

Total input token count: 1635830
Total output token count: 778058


## Phased Self-Discover (struct)

In [88]:
input_token_count = 0
output_token_count = 0

dataset = load_dataset("sachithgunasekara/self-discover-llama-modified-t4d-eval", split="train", streaming=True)

for instance in dataset:
    map_fn(instance, "t4d", True)

print(f"Total input token count: {input_token_count}")
print(f"Total output token count: {output_token_count}")

Total input token count: 1624844
Total output token count: 703373


# BBH

# Phased Self-Discover (struct)

In [90]:
input_token_count = 0
output_token_count = 0

for subset in get_dataset_config_names("sachithgunasekara/self-discover-llama-original-bbh-eval"):
    if subset:
        print(subset)
        dataset = load_dataset("sachithgunasekara/self-discover-llama-original-bbh-eval", subset, split="train", streaming=True)

        for instance in dataset:
            map_fn(instance, "bbh", False)

print(f"Total input token count: {input_token_count}")
print(f"Total output token count: {output_token_count}")

boolean_expressions
causal_judgement
date_understanding
disambiguation_qa
dyck_languages
formal_fallacies
geometric_shapes
hyperbaton
logical_deduction_five_objects
logical_deduction_seven_objects
logical_deduction_three_objects
movie_recommendation
multistep_arithmetic_two
navigate
object_counting
penguins_in_a_table
reasoning_about_colored_objects
ruin_names
salient_translation_error_detection
snarks
sports_understanding
temporal_sequences
tracking_shuffled_objects_five_objects
tracking_shuffled_objects_seven_objects
tracking_shuffled_objects_three_objects
web_of_lies
word_sorting
Total input token count: 19323480
Total output token count: 9029033


# Phased Self-Discover (Unstruct)

In [91]:
input_token_count = 0
output_token_count = 0

for subset in get_dataset_config_names("sachithgunasekara/self-discover-llama-modified-bbh-eval"):
    if subset:
        print(subset)
        dataset = load_dataset("sachithgunasekara/self-discover-llama-modified-bbh-eval", subset, split="train", streaming=True)

        for instance in dataset:
            map_fn(instance, "bbh", True)

print(f"Total input token count: {input_token_count}")
print(f"Total output token count: {output_token_count}")

README.md:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

boolean_expressions
causal_judgement
date_understanding
disambiguation_qa
dyck_languages
formal_fallacies
geometric_shapes
hyperbaton
logical_deduction_five_objects
logical_deduction_seven_objects
logical_deduction_three_objects
movie_recommendation
multistep_arithmetic_two
navigate
object_counting
penguins_in_a_table
ruin_names
salient_translation_error_detection
snarks
sports_understanding
temporal_sequences
tracking_shuffled_objects_five_objects
tracking_shuffled_objects_seven_objects
tracking_shuffled_objects_three_objects
web_of_lies
word_sorting
Total input token count: 18645184
Total output token count: 7704112


# MATH

## Phase Self-Discover (Struct)

In [98]:
input_token_count = 0
output_token_count = 0

dataset = load_dataset("sachithgunasekara/self-discover-llama-original-MATH-eval", split="train", streaming=True)

for instance in dataset:
    map_fn(instance, "math", False)

print(f"Total input token count: {input_token_count}")
print(f"Total output token count: {output_token_count}")

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Total input token count: 1070274
Total output token count: 353905


## Phase Self-Discover (Unstruct)

In [99]:
input_token_count = 0
output_token_count = 0

dataset = load_dataset("sachithgunasekara/self-discover-llama-modified-MATH-eval", split="train", streaming=True)

for instance in dataset:
    map_fn(instance, "math", True)

print(f"Total input token count: {input_token_count}")
print(f"Total output token count: {output_token_count}")

README.md:   0%|          | 0.00/674 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12500 [00:00<?, ? examples/s]

Total input token count: 1057664
Total output token count: 289796


# Rerunning Past Experiments

In [104]:
datasets = [
    {
        "dataset": "sachithgunasekara/self-discover-llama-original-t4d-eval",
        "benchmark": "t4d",
        "modified": False
    },
    {
        "dataset": "sachithgunasekara/self-discover-llama-modified-t4d-eval",
        "benchmark": "t4d",
        "modified": True
    },
    {
        "dataset": "sachithgunasekara/self-discover-llama-original-bbh-eval",
        "benchmark": "bbh",
        "modified": False
    },
    {
        "dataset": "sachithgunasekara/self-discover-llama-modified-bbh-eval",
        "benchmark": "bbh",
        "modified": True
    },
    {
        "dataset": "sachithgunasekara/self-discover-llama-original-MATH-eval",
        "benchmark": "math",
        "modified": False
    },
    {
        "dataset": "sachithgunasekara/self-discover-llama-modified-MATH-eval",
        "benchmark": "math",
        "modified": True
    }
]

for ds in tqdm(datasets, desc="Datasets"):
    input_token_count = 0
    output_token_count = 0

    if ds["benchmark"] == "bbh":
        for subset in tqdm(get_dataset_config_names(ds["dataset"]), desc="Subsets"):
            if subset:
                print(subset)
                dataset = load_dataset(ds["dataset"], subset, split="train", streaming=True)

                for instance in tqdm(dataset, desc="Instances"):
                    map_fn(instance, ds["benchmark"], ds["modified"])

    else:
        dataset = load_dataset(ds["dataset"], split="train", streaming=True)

        for instance in tqdm(dataset, desc="Instances"):
            map_fn(instance, ds["benchmark"], ds["modified"])
    
    print(f"Token report for {ds['dataset']}")
    print("-" * 50)
    print(f"Total input token count: {input_token_count}")
    print(f"Total output token count: {output_token_count}")

Instances: 564it [00:19, 28.95it/s]00<?, ?it/s]
Datasets:  17%|█▋        | 1/6 [00:24<02:03, 24.77s/it]

Token report for sachithgunasekara/self-discover-llama-original-t4d-eval
--------------------------------------------------
Total input token count: 1346705
Total output token count: 778058


Instances: 564it [00:35, 15.99it/s]
Datasets:  33%|███▎      | 2/6 [01:04<02:15, 33.76s/it]

Token report for sachithgunasekara/self-discover-llama-modified-t4d-eval
--------------------------------------------------
Total input token count: 1335719
Total output token count: 703373




boolean_expressions



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Instances: 250it [00:30,  8.08it/s]


causal_judgement



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Instances: 187it [00:19,  9.73it/s]


date_understanding



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Instances: 250it [00:14, 17.34it/s]


disambiguation_qa



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Instances: 250it [00:22, 11.02it/s]


dyck_languages



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Instances: 45it [00:04,  9.03it/s]


formal_fallacies



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Instances: 250it [00:14, 16.68it/s]


geometric_shapes



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Instances: 250it [00:12, 20.81it/s]


hyperbaton



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Instances: 250it [00:16, 15.22it/s]


logical_deduction_five_objects



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Instances: 250it [00:22, 11.07it/s]


logical_deduction_seven_objects



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Instances: 250it [00:21, 11.63it/s]


logical_deduction_three_objects



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Instances: 72it [00:05, 13.74it/s]
Subsets:  37%|███▋      | 10/27 [04:07<07:01, 24.78s/it]
Datasets:  33%|███▎      | 2/6 [05:19<10:39, 159.85s/it]


KeyboardInterrupt: 