In [14]:
import os
import json
from datasets import Dataset
from pyprojroot import here
from transformers import AutoTokenizer
from langchain_core.prompts import PromptTemplate

In [3]:
# Hugging Face model id whose tokenizer the token-count cell below uses.
TOKENIZER_MODEL_ID = "meta-llama/Llama-3.1-405B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL_ID)

In [12]:
ANSWER_FORMATS = """- If the answer is not multiple choice, [answer] should be the decided answer. (For eg: Q: not True or False. A: False)
- If the answer is multiple choice,
    - and the given choices are unlabelled options, [answer] should be the chosen option (For eg: Q: Where does the sun rise from? Options: - East, - West, - North. A: East)
    - and the given choices are labelled options, [answer] should be the letter corresponding to the chosen option (For eg: Q: Where does the sun rise from? Options: - A. West, - B. East, - C. North. A: B)"""

In [15]:
# Prompt template text with three placeholders: {answer_formats},
# {reasoning_structure} and {task_description}.
# The token-count cell measures the formatted result of this exact text, so the
# string is kept verbatim -- including the whitespace-only line that follows
# {answer_formats}. Editing the wording or whitespace changes the input totals.
REASONING_PROMPT = """Solve the given task by following the step-by-step reasoning plan in JSON filling in the values for the corresponding keys.
Phrase your final answer always as "The final answer is [answer]".

[answer] should be in one of the following formats:
{answer_formats}
    
Reasoning Structure:
{reasoning_structure}

Task:
{task_description}

Correctly follow the above JSON reasoning structure to solve the given task. Your response should be the filled JSON for the above reasoning structure."""


# Wrap the text in a PromptTemplate; the input variables are inferred from the
# placeholders in the string (see the repr displayed as this cell's output).
prompt = PromptTemplate.from_template(REASONING_PROMPT)
# Bare last expression so the notebook displays the template's repr.
prompt

PromptTemplate(input_variables=['answer_formats', 'reasoning_structure', 'task_description'], input_types={}, partial_variables={}, template='Solve the given task by following the step-by-step reasoning plan in JSON filling in the values for the corresponding keys.\nPhrase your final answer always as "The final answer is [answer]".\n\n[answer] should be in one of the following formats:\n{answer_formats}\n    \nReasoning Structure:\n{reasoning_structure}\n\nTask:\n{task_description}\n\nCorrectly follow the above JSON reasoning structure to solve the given task. Your response should be the filled JSON for the above reasoning structure.')

In [4]:
# Root directory of the saved evaluation logs, resolved by pyprojroot's `here`.
# NOTE(review): this directory is named "mistral", while the tokenizer and the
# Hub repo used elsewhere in this notebook refer to Llama -- confirm that this
# is the intended log directory.
base_path = str(here("evals/logs/mistral"))

# phase1 is read for the per-category "<category>.txt" reasoning structures and
# phase2 for the per-category "<category>_eval" datasets (see the cells below).
# Each path component is passed separately so os.path.join inserts the platform
# separator; embedding "/" inside a component yields mixed separators on
# Windows (e.g. "...\\mistral\\phaseI/bbh").
phase1 = os.path.join(base_path, "phaseI", "bbh")
phase2 = os.path.join(base_path, "phaseII", "bbh")

# Bare last expression so the notebook displays the three resolved paths.
base_path, phase1, phase2

('d:\\Surge\\self-discover\\evals\\logs\\mistral',
 'd:\\Surge\\self-discover\\evals\\logs\\mistral\\phaseI/bbh',
 'd:\\Surge\\self-discover\\evals\\logs\\mistral\\phaseII/bbh')

In [17]:
def _n_tokens(text):
    """Return how many token ids `tokenizer.encode` produces for `text`.

    `encode` is called with its default arguments.
    NOTE(review): depending on the tokenizer's defaults this may include
    special tokens (e.g. a BOS token) in every count -- confirm that this is
    the intended accounting, especially for the output side.
    """
    return len(tokenizer.encode(text))


# Running totals across every category and every instance.
input_token_count = 0
output_token_count = 0

# Each entry of the phase2 directory is treated as a category name.
for category in os.listdir(phase2):
    # The category's reasoning structure is stored in phase1 as "<category>.txt".
    structure_file = os.path.join(phase1, f"{category}.txt")
    with open(structure_file, "r") as handle:
        reasoning_structure = handle.read()

    dataset = Dataset.load_from_disk(os.path.join(phase2, category, f"{category}_eval"))

    for instance in dataset:
        # Input side: the prompt template filled in for this instance.
        filled_prompt = prompt.format(
            answer_formats=ANSWER_FORMATS,
            reasoning_structure=reasoning_structure,
            task_description=instance["self_discover_input"],
        )
        input_token_count += _n_tokens(filled_prompt)

        # Output side: the text stored in the instance's "reasoning" field.
        output_token_count += _n_tokens(instance["reasoning"])

print(f"Input Token Count: {input_token_count}")
print(f"Output Token Count: {output_token_count}")

Input Token Count: 4207004
Output Token Count: 4502629


In [None]:
# Hub dataset repository that receives one upload per category.
HUB_REPO_ID = "sachithgunasekara/self-discover-llama-bbh-eval"

for category in os.listdir(phase2):
    eval_dir = os.path.join(phase2, category, f"{category}_eval")
    dataset = Dataset.load_from_disk(eval_dir)

    # The directory name minus its first four characters is passed as the
    # second positional argument of push_to_hub.
    # NOTE(review): presumably this strips a fixed 4-character prefix from the
    # directory name and the argument is the Hub config name -- confirm both
    # against the directory layout and the installed `datasets` version.
    config_name = category[4:]
    dataset.push_to_hub(HUB_REPO_ID, config_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/583 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.76k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.30k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.84k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/4.37k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/4.95k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/5.54k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/6.13k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/6.68k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/7.25k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/7.77k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/8.31k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/8.86k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/9.45k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/9.97k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/12.8k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

In [11]:
# Display the first record of `dataset`. The variable is not defined in this
# cell: it is whatever an earlier loop cell last bound, so the category shown
# depends on which cell ran last and on the order returned by os.listdir.
# NOTE(review): load a specific category explicitly here if this preview must
# be reproducible under Restart & Run All.
dataset[0]

{'input': 'Sort the following words alphabetically: List: syndrome therefrom',
 'target': 'syndrome therefrom',
 'self_discover_input': 'Sort the following words alphabetically: List: syndrome therefrom',
 'reasoning_structure': '`',
 'reasoning': '```json\n{\n    "Identify the core task": {\n        "Identify the core task: sorting words alphabetically": "The core task is to sort the words \'syndrome\' and \'therefrom\' alphabetically."\n    },\n    "Analyze the list from different perspectives": {\n        "Analyze the list from different perspectives (e.g., word length, prefixes, suffixes) to identify the best sorting approach": "The words are \'syndrome\' and \'therefrom\'. Analyzing word length, \'syndrome\' has 8 letters and \'therefrom\' has 9 letters. Analyzing initial letters, \'syndrome\' starts with \'s\' and \'therefrom\' starts with \'t\'."\n    },\n    "Identify relevant data": {\n        "Identify relevant data (e.g., word length, initial letters) and how it can be used 