In [22]:
from itertools import product
import json
import os

from openai import OpenAI
from pydantic import BaseModel
# Read the OpenAI API key from the first line of the local `openai_api` file,
# so the key itself never appears in the notebook.
with open('openai_api', 'r') as f:
    api_key = f.readline().strip()

# Fail fast on an empty key file: an empty string is passed to the client as-is
# and would otherwise only surface as an authentication error on the first request.
if not api_key:
    raise ValueError("The 'openai_api' file is empty; put your OpenAI API key on its first line.")

## Alphabetization

Number of examples in each Alphabetization split, for reference:

In [2]:
# Load every Alphabetization split (train/test/dev x sup/ppo_sampled) from its
# JSONL file and report how many examples each split holds.
alphabetization = {}
for subset, stage in product(['train', 'test', 'dev'], ['sup', 'ppo_sampled']):
    split_name = f'{subset}_{stage}'
    records = []
    with open(f'./Alphabetization/{split_name}.jsonl', 'r') as f:
        # Each line of the file is one JSON object.
        for line in f:
            records.append(json.loads(line))
    alphabetization[split_name] = records
    print(f'{split_name}: {len(records)}')

train_sup: 40000
train_ppo_sampled: 10000
test_sup: 2000
test_ppo_sampled: 1000
dev_sup: 2000
dev_ppo_sampled: 1000


## MATH RL4F

In [74]:
# Load the raw MATH problems: MATH[subject][subset] is a list of problem dicts,
# one per JSON file found in MATH/{subset}/{subject}/.
MATH = {}
for subject in ['algebra', 'prealgebra', 'intermediate_algebra']:
    MATH[subject] = {}
    for subset in ['train', 'test']:
        directory = f'MATH/{subset}/{subject}/'
        problems = []
        # os.listdir returns entries in arbitrary order; sort them so that the
        # index-based train/dev/test split below is reproducible across runs
        # and machines.
        for filename in sorted(os.listdir(directory)):
            # Use a context manager so each file handle is closed promptly
            # instead of being left to the garbage collector.
            with open(os.path.join(directory, filename), 'r') as f:
                problems.append(json.load(f))
        MATH[subject][subset] = problems
        print(f'{subject}_{subset}: {len(MATH[subject][subset])}')

algebra_train: 1744
algebra_test: 1187
prealgebra_train: 1205
prealgebra_test: 871
intermediate_algebra_train: 1295
intermediate_algebra_test: 903


In [None]:
# Count every loaded problem across all subjects and both official subsets.
total_size = sum(
    len(problems)
    for subsets in MATH.values()
    for problems in subsets.values()
)
print(f"Total data size: {total_size}")

Total data size: 7205


Format of sup:

```
{"text": "problem statement ||| attempt", "summary": "critique"}
```

Format of ppo:

```
{"text": "problem statement ||| attempt", "summary": "correct answer"}
```

Current format of a raw MATH problem (one JSON file per problem):

```
{"problem": "problem statement", "level": "level", "type": "subject", "solution": "solution"}
```

In [68]:
# Split each loaded MATH subject into train / dev / test, and each of those
# into a `sup` half and a `ppo_sampled` half.
#
# Sizes: the new train split takes as many problems from the official train
# set as the official test set contains; whatever is left of the official
# train set (len(train) - len(test) problems) becomes the dev split.
MATH_SPLIT = {}
# Iterate over the subjects that were actually loaded into MATH; a separate
# hard-coded subject list can drift from the loading cell and raise KeyError.
for subject in MATH:
    train_pool = MATH[subject]['train']
    test_pool = MATH[subject]['test']
    n_train = n_test = len(test_pool)

    train_part = train_pool[:n_train]
    # Dev must start where train ends. Slicing dev from index 0 of the train
    # pool would reuse training problems (train/dev leakage).
    dev_part = train_pool[n_train:]
    n_dev = len(dev_part)
    assert len(train_part) + n_dev == len(train_pool), "train/dev must partition the official train set"

    MATH_SPLIT[subject] = {
        'train_sup': train_part[:n_train // 2],
        'train_ppo_sampled': train_part[n_train // 2:],
        'test_sup': test_pool[:n_test // 2],
        'test_ppo_sampled': test_pool[n_test // 2:],
        'dev_sup': dev_part[:n_dev // 2],
        'dev_ppo_sampled': dev_part[n_dev // 2:],
    }

In [51]:
# Structured-output schema passed as `response_format` for the 'sup' requests:
# two generated wrong solutions, each paired with its critique.
# NOTE: documented with comments rather than a docstring on purpose -- pydantic
# copies a model's docstring into the generated JSON schema, which would change
# what is sent to the API.
class SupResponse(BaseModel):
    solution_1: str  # first generated wrong solution
    critique_1: str  # critique of solution_1
    solution_2: str  # second generated wrong solution
    critique_2: str  # critique of solution_2


# Structured-output schema passed as `response_format` for the 'ppo_sampled'
# requests: two generated wrong solutions, without critiques.
# NOTE: documented with comments rather than a docstring on purpose -- pydantic
# copies a model's docstring into the generated JSON schema, which would change
# what is sent to the API.
class PPOResponse(BaseModel):
    solution_1: str  # first generated wrong solution
    solution_2: str  # second generated wrong solution

In [55]:
def get_augmented_data(problem, mode):
    """Ask the model for two deliberately wrong solutions to a MATH problem.

    :param dict problem: the source problem, a dictionary with keys 'problem', 'level', 'type', 'solution'
    :param str mode: which kind of data to generate, either 'sup' or 'ppo_sampled'
    :return: a tuple of two dictionaries. Both always contain 'problem', 'level' and 'type' (copied from
        ``problem``) and 'solution' (a generated wrong solution). In 'sup' mode each also contains
        'critique' (the generated critique of that wrong solution); in 'ppo_sampled' mode each also
        contains 'correct_solution' (the reference solution taken from ``problem``).
    :raises ValueError: if ``mode`` is neither 'sup' nor 'ppo_sampled'
    :raises RuntimeError: if the completion carries no parsed structured response
    """
    # Validate before spending an API call; without this an unknown mode would
    # fall through both branches and silently return None.
    if mode not in ('sup', 'ppo_sampled'):
        raise ValueError(f"Unsupported mode {mode!r}; expected 'sup' or 'ppo_sampled'.")

    # The two modes differ only in the instructions and the response schema.
    if mode == 'sup':
        user_prompt = "Modify the reference solution into two wrong ones; remember to enclose your final answer in $\\boxed{}$. Make it seem as if a student wrote those wrong solutions. Finally, generate a critique to each generated wrong solution."
        closing_prompt = "Limit the critiques to 1 sentence. Write the answer in LaTeX format."
        response_format = SupResponse
    else:
        user_prompt = "Modify the reference solution into two wrong ones; remember to enclose your final answer in $\\boxed{}$. Make it seem as if a student wrote those wrong solutions."
        closing_prompt = "Ensure that the generated solutions are wrong; you do not have to explain why they were wrong. Write the answer in LaTeX format."
        response_format = PPOResponse

    client = OpenAI(api_key=api_key)
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {"role": "system", "content": f"You are given the following mathematical problem:\n {problem['problem']} \n\n A reference solution is as follows:\n {problem['solution']}\n\n"},
            {"role": "user", "content": user_prompt},
            {"role": "system", "content": closing_prompt},
        ],
        response_format=response_format,
    )

    message = completion.choices[0].message
    parsed = message.parsed
    # `parsed` can be None (e.g. when the model refuses); report that clearly
    # instead of failing later with an AttributeError on None.
    if parsed is None:
        raise RuntimeError(
            f"No structured response was returned (refusal: {getattr(message, 'refusal', None)!r})."
        )

    # Fields shared by both samples, copied from the source problem.
    shared = {
        'problem': problem['problem'],
        'level': problem['level'],
        'type': problem['type'],
    }
    if mode == 'sup':
        extras = ({'critique': parsed.critique_1}, {'critique': parsed.critique_2})
    else:
        extras = ({'correct_solution': problem['solution']}, {'correct_solution': problem['solution']})

    sample_1 = {**shared, 'solution': parsed.solution_1, **extras[0]}
    sample_2 = {**shared, 'solution': parsed.solution_2, **extras[1]}
    return sample_1, sample_2

In [62]:
def format_sample(sample, mode):
    """Convert an augmented sample into the {"text", "summary", ...} record written to JSONL.

    :param dict sample: a sample with keys 'problem', 'solution', 'level', 'type', plus 'critique'
        (mode 'sup') or 'correct_solution' (mode 'ppo_sampled')
    :param str mode: either 'sup' or 'ppo_sampled'
    :return: a dictionary with "text" ("<problem> ||| <solution>"), "summary" (the critique in 'sup'
        mode, the correct solution in 'ppo_sampled' mode), "level" and "type"
    :raises ValueError: if ``mode`` is neither 'sup' nor 'ppo_sampled'
    """
    # The only per-mode difference is which field becomes the "summary".
    summary_key = {'sup': 'critique', 'ppo_sampled': 'correct_solution'}.get(mode)
    if summary_key is None:
        # Returning None here would end up as a literal `null` row in the JSONL output.
        raise ValueError(f"Unsupported mode {mode!r}; expected 'sup' or 'ppo_sampled'.")
    return {
        "text": " ||| ".join([sample['problem'], sample['solution']]),
        "summary": sample[summary_key],
        "level": sample['level'],
        "type": sample['type'],
    }

In [69]:
# Each (subject, subset, mode) combination gets its own JSONL file under
# MATH_RL4F/{subject}/; the augmented samples are written there, two per problem.

# Cap on how many problems are augmented per file (keeps this a cheap smoke
# test of the pipeline). Set to None to augment every problem in the split.
MAX_PROBLEMS_PER_SPLIT = 3

# Iterate over the subjects actually present in MATH_SPLIT so this cell cannot
# disagree with the split cell about which subjects exist.
for subject, mode, subset in product(MATH_SPLIT, ['sup', 'ppo_sampled'], ['train', 'test', 'dev']):
    output_dir = f'MATH_RL4F/{subject}'
    # open(..., 'w') does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)
    # Slicing with None as the stop keeps the whole list.
    problems = MATH_SPLIT[subject][f'{subset}_{mode}'][:MAX_PROBLEMS_PER_SPLIT]
    # Open once in 'w' mode: this truncates any previous run's output and
    # avoids reopening the file for every problem.
    with open(f'{output_dir}/{subset}_{mode}.jsonl', 'w') as f:
        for problem in problems:
            for sample in get_augmented_data(problem, mode):
                f.write(json.dumps(format_sample(sample, mode)) + '\n')
            # Flush after each problem so finished work is on disk even if a
            # later API call fails.
            f.flush()

## Examples for data augmentation

In [72]:
# Worked example: generate both kinds of augmented data for one prealgebra
# training problem.
problem = MATH['prealgebra']['train'][0]

client = OpenAI(api_key=api_key)

example_model = "gpt-4o-2024-11-20"

# Both requests open with the same system message carrying the problem
# statement and its reference solution.
reference_message = {
    "role": "system",
    "content": f"You are given the following mathematical problem:\n {problem['problem']} \n\n A reference solution is as follows:\n {problem['solution']}\n\n",
}

# Supervised data: two wrong solutions plus a one-sentence critique of each.
sup_messages = [
    reference_message,
    {"role": "user", "content": "Modify the reference solution into two wrong ones; remember to enclose your final answer in $\\boxed{}$. Make it seem as if a student wrote those wrong solutions. Finally, generate a critique to each generated wrong solution."},
    {"role": "system", "content": "Limit the critiques to 1 sentence. Write the answer in LaTeX format."},
]
sup_completion = client.beta.chat.completions.parse(
    model=example_model,
    messages=sup_messages,
    response_format=SupResponse,
)

# PPO data: two wrong solutions only, no critiques.
ppo_messages = [
    reference_message,
    {"role": "user", "content": "Modify the reference solution into two wrong ones; remember to enclose your final answer in $\\boxed{}$. Make it seem as if a student wrote those wrong solutions."},
    {"role": "system", "content": "Ensure that the generated solutions are wrong; you do not have to explain why they were wrong. Write the answer in LaTeX format."},
]
ppo_completion = client.beta.chat.completions.parse(
    model=example_model,
    messages=ppo_messages,
    response_format=PPOResponse,
)

# Keep only the parsed structured responses for display in the next cell.
sup_response = sup_completion.choices[0].message.parsed
ppo_response = ppo_completion.choices[0].message.parsed

In [73]:
# Show the original problem, then the generated supervised and PPO examples,
# with a ruler between the three sections.
ruler = "=" * 80

print(f"The problem statement:\n{problem['problem']}")
print(f"The solution:\n{problem['solution']}")

print(ruler)
print("Supervised learning data:")
for idx in (1, 2):
    wrong_solution = getattr(sup_response, f"solution_{idx}")
    critique = getattr(sup_response, f"critique_{idx}")
    print(f"The generated wrong solution {idx}:\n{wrong_solution}")
    print(f"The critique:\n{critique}")

print(ruler)
print("PPO data:")
for idx in (1, 2):
    wrong_solution = getattr(ppo_response, f"solution_{idx}")
    print(f"The generated wrong solution {idx}:\n{wrong_solution}")

The problem statement:
Simplify $((5p+1)-2p\cdot4)(3)+(4-1\div3)(6p-9)$ to a much simpler expression of the form $ap-b$ , where $a$ and $b$ are positive integers.
The solution:
We need to simplify the given expression. Let's start by simplifying the inside of the parenthesis on the left part of the expression. \begin{align*}
((5p+1)&-2p\cdot4)(3)+(4-1\div3)(6p-9)\\
&=(5p+1-8p)(3)+(4-1\div3)(6p-9)\\
&=(-3p+1)(3)+(4-1\div3)(6p-9)
\end{align*} Next we can distribute the 3 to get $$3\cdot(-3p)+3\cdot1+(4-1\div3)(6p-9),$$ which equals $-9p+3+(4-1\div3)(6p-9)$ . The left part looks simplified so now we can focus on the right part. Let's subtract what's in the left parentheses then distribute. \begin{align*}
-9p+3+(4-1\div3)(6p-9)&=-9p+3+(\frac{4\cdot3}{3}-\frac{1}{3})(6p-9)\\
&=-9p+3+\frac{11}{3}(6p-9)\\
&=-9p+3+\frac{11}{3}\cdot6p-\frac{11}{3}\cdot9\\
&=-9p+3+\frac{11\cdot6p}{3}-\frac{11\cdot9}{3}\\
&=-9p+3+\frac{66p}{3}-\frac{99}{3}\\
&=-9p+3+\frac{3\cdot22p}{3}-\frac{3\cdot33}{3}\\
&=-9p+