*Copyright (c) Meta Platforms, Inc. and affiliates. This source code is licensed under the license found in the LICENSE file in the root directory of this source tree.*

## TL;DR

In [None]:
from data_gen.pretrain.id_gen import IdGen
from tools.tools import tokenizer, fix_seed
from typing import Literal

def get_prob_sol_ans_triple(tpy: Literal["easy", "med", "hard"]):
    """Generate one random problem at the requested difficulty.

    Despite the name, this returns the ``IdGen`` instance itself rather than
    a tuple; callers read ``prob_token``, ``sol_token`` and ``ans_token``
    from the returned object.

    Args:
        tpy: Difficulty preset, one of ``"easy"``, ``"med"`` or ``"hard"``.
            It selects the ``max_op`` / ``max_edge`` limits passed to
            ``IdGen``.

    Returns:
        The ``IdGen`` instance after ``gen_prob`` has been called on it.

    Raises:
        ValueError: If ``tpy`` is not one of the supported presets.
    """
    # (max_op, max_edge) for each difficulty preset.
    limits = {
        "easy": (10, 15),
        "med": (15, 20),
        "hard": (21, 28),
    }
    # Raise explicitly instead of asserting: asserts are stripped under -O.
    if not isinstance(tpy, str) or tpy not in limits:
        raise ValueError(
            f"Invalid type: {tpy!r}. Choose 'easy', 'med' or 'hard'"
        )
    max_op, max_edge = limits[tpy]

    id_gen = IdGen(
        max_op=max_op,        # Maximum # of operations
        max_edge=max_edge,    # Maximum # of edges (instance parameters) in the structure graph
        perm_level=4,         # Random shuffle level for problem description
        detail_level=0        # Most detailed solution format
    )

    # NOTE(review): the first argument is presumably the set of allowed
    # values (the integers 0..22, i.e. arithmetic modulo 23) - confirm
    # against IdGen.gen_prob.
    id_gen.gen_prob(list(range(23)), p_format="pq")

    return id_gen

# Smoke test: generate 10 easy problems and print each one's problem text,
# solution, answer and op_ value.
fix_seed(41)
for i in range(10):
    try:
        id_gen = get_prob_sol_ans_triple("easy")
        print(f"\n\nProblem {i}:")
        print(tokenizer.decode(id_gen.prob_token))
        print("Solution:")
        print(tokenizer.decode(id_gen.sol_token))
        print("Answer:")
        print(tokenizer.decode(id_gen.ans_token))
        print("OP: ", id_gen.op_)
    except Exception as e:
        # Deliberately best-effort: report the failure and move on to the
        # next sample so one bad generation does not abort the whole preview.
        print(f"Error generating problem {i}: {e}")

In [None]:
# Build the test pool: generate 10 000 easy problems and bucket them by
# their op_ value (presumably the number of operations in the solution).
from tqdm import tqdm

# One bucket per op_ value 0..14; an op_ outside that range raises KeyError.
test_problems = {op_count: [] for op_count in range(15)}
fix_seed(41)
for i in tqdm(range(10_000)):
    id_gen = get_prob_sol_ans_triple("easy")
    test_problems[id_gen.op_].append(id_gen)

In [None]:
# Few-shot pool: draw 10 000 easy problems with seed 42 and keep only those
# whose op_ is at most 5, bucketed by op_.
few_shot_problems = {op_count: [] for op_count in range(15)}
fix_seed(42)
for i in tqdm(range(10_000)):
    id_gen = get_prob_sol_ans_triple("easy")
    # id_gen is rebound on every draw, including draws that are not kept.
    if id_gen.op_ <= 5:
        few_shot_problems[id_gen.op_].append(id_gen)

In [None]:
# Training pool: draw 10 000 easy problems with seed 42 and keep only those
# whose op_ is at most 5, bucketed by op_.
# NOTE(review): this uses the same seed and filter as the few-shot pool
# cell, so both pools presumably hold the same problems - confirm intended.
train_problems = {op_count: [] for op_count in range(15)}
fix_seed(42)
for i in tqdm(range(10_000)):
    id_gen = get_prob_sol_ans_triple("easy")
    # id_gen is rebound on every draw, including draws that are not kept.
    if id_gen.op_ <= 5:
        train_problems[id_gen.op_].append(id_gen)

In [None]:
# Format a single training example from whatever id_gen currently refers to.
# When the cells run in order that is the last problem drawn by the
# preceding loop, which may be one the op_ filter did not keep.
train_text = "\n".join(
    [
        "Consider the following high school-level math problem where all numbers are integers with arithmetic modulo 23.",
        # The two trailing spaces after the problem text are part of the template.
        f"Problem: {tokenizer.decode(id_gen.prob_token)}  ",
        f"Solution: {tokenizer.decode(id_gen.sol_token)}",
        f"Answer: {tokenizer.decode(id_gen.ans_token).strip()}",
        "",  # yields the final newline of the template
    ]
)

print(train_text)

In [None]:
import numpy as np

id_gen = test_problems[1][0]  # Example problem for testing

op = id_gen.op_
num_few_shot = 5
# Sample into a separate name: rebinding few_shot_problems itself would
# replace the op_-keyed pool with the sampled array, so re-running this cell
# (or sampling for another op) would fail.
# With replace=False, np.random.choice raises ValueError when the bucket
# holds fewer than num_few_shot problems.
few_shot_examples = np.random.choice(few_shot_problems[op], num_few_shot, replace=False)

# Prompt layout: a one-line header, num_few_shot worked examples, then the
# test problem with an open "Solution:" for the model to complete.
test_text = f"""Consider the following high school-level math problems where all numbers are integers with arithmetic modulo 23.
"""

for i, few_shot in enumerate(few_shot_examples):
    test_text += f"""
Problem {i + 1}: {tokenizer.decode(few_shot.prob_token)}
Solution: {tokenizer.decode(few_shot.sol_token)}
Answer: {tokenizer.decode(few_shot.ans_token).strip()}
"""

test_text += f"""
Problem {num_few_shot+1}: {tokenizer.decode(id_gen.prob_token)}
Solution:"""

print(test_text)

In [None]:
# Scratch arithmetic: 3 * 23 * 23 evaluates to 1587.
3 * 23 * 23

In [None]:
from mod23benchmark import Mod23BenchmarkGenerator

# Create the benchmark generator with a fixed seed (42).
generator = Mod23BenchmarkGenerator(seed=42)

# Write two JSONL files, each requested with count=500, one for n=2 and one
# for n=3.
# NOTE(review): the meaning of n is defined by Mod23BenchmarkGenerator and
# is not visible here - confirm.
generator.save_jsonl_file("mod23bench_n=2.jsonl", count=500, n=2)
generator.save_jsonl_file("mod23bench_n=3.jsonl", count=500, n=3)

In [None]:
# Scratch arithmetic: 500 / 23 is roughly 21.74.
# NOTE(review): presumably relates the count=500 benchmark size to the
# modulus 23 - confirm.
500 / 23

In [None]:
# Consider the following high school-level math problem where all numbers are integers with arithmetic modulo 23.




In [None]:
from math_gen.problem_gen import auto_easy

# Call the project's auto_easy helper; its behaviour is defined in
# math_gen.problem_gen and is not visible in this notebook.
auto_easy()

## Introduction

In [None]:
from data_gen.pretrain.id_gen import IdGen
from tools.tools import fix_seed

# Fix the random seed first so the generated problem is reproducible.
fix_seed(42)

# Generator settings:
#   max_op=15       maximum operations
#   max_edge=20     maximum edges in the Structure Graph
#   perm_level=5    randomization level of the problem description
#                   (5 represents random shuffle)
#   detail_level=0  solution detail level (0 is the most detailed format)
id_gen = IdGen(max_op=15, max_edge=20, perm_level=5, detail_level=0)

# Generate one problem over the integers 0..22 and format it with
# p_format="pq".
id_gen.gen_prob(list(range(23)), p_format="pq")

In [None]:
# Short aliases for the three token-ID sequences held by the generator.
prob_id, sol_id, ans_id = (
    id_gen.prob_token,  # problem
    id_gen.sol_token,   # solution
    id_gen.ans_token,   # one-number answer
)

In [None]:
from tools.tools import tokenizer
# Decode each token-ID sequence back to text and print it under its heading.
for label, token_ids in (
    ("Problem:", prob_id),
    ("Solution:", sol_id),
    ("Answer:", ans_id),
):
    print(label)
    print(tokenizer.decode(token_ids))

In [None]:
# The full token sequence is the problem, solution and answer IDs, each
# preceded by a fixed ID (222, 223, 224) and closed by 50256.
# NOTE(review): these look like section-delimiter and end-of-text token
# IDs - confirm against the tokenizer.
assert id_gen.token_id == [222, *prob_id, 223, *sol_id, 224, *ans_id, 50256]

In [None]:
from tools.tools_test import true_correct

# A hand-written candidate solution, one "Define ..." step per line below.
wrong_sol = (
    " Define Penguin Beach's Giraffe as t; so t = 6."
    " Define Octopus Den's Leopard as r; so r = t = 6."
    " Define Penguin Beach's Animal as J; so J = t = 6."
)

# Check the candidate against the generated problem. true_correct returns a
# triple: the verdict, an object with a display() method, and a parser.
correct, my_print, parser = true_correct(wrong_sol, id_gen.problem)
print(f"Correct or not: {correct}")
my_print.display()

In [None]:
# Render the generated problem via its draw() method.
# NOTE(review): presumably visualises the problem's structure graph - confirm.
id_gen.problem.draw()