In [2]:
from transformers import AutoTokenizer
import torch
from datasets import load_dataset
from vllm import LLM, SamplingParams

In [3]:
# Hugging Face checkpoint id; the same id is passed to the tokenizer here and to the vLLM engine.
model_id = "meta-llama/Llama-3.2-1B-Instruct"
# In the cells visible in this notebook the tokenizer is only used to render
# chat-template prompts as plain strings (tokenize=False).
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [4]:
# Build the vLLM engine for `model_id`; no other engine arguments are passed,
# so every remaining setting is whatever vLLM picks by default.
llm = LLM(model=model_id)

INFO 10-08 23:32:01 config.py:1010] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 10-08 23:32:01 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='meta-llama/Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_mod

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 10-08 23:32:07 model_runner.py:1025] Loading model weights took 2.3185 GB
INFO 10-08 23:32:08 gpu_executor.py:122] # GPU blocks: 138591, # CPU blocks: 8192
INFO 10-08 23:32:09 model_runner.py:1329] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 10-08 23:32:09 model_runner.py:1333] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 10-08 23:32:21 model_runner.py:1456] Graph capturing finished in 12 secs.


In [36]:
# Evaluation subset: the MATH test split, shuffled with a fixed seed and
# truncated to its first 1000 rows so every prompt variant sees the same problems.
ds = (
    load_dataset("lighteval/MATH", split="test")
    .shuffle(seed=42)
    .select(range(1000))
)

## Greedy

In [None]:
# Greedy baseline: temperature 0, at most 2048 generated tokens per completion, fixed seed.
sampling_params = SamplingParams(temperature=0., max_tokens=2048, seed=42)

In [20]:
# Condition: instructions go in the system message, the problem in the user message.
# NOTE: the "accuracy" printed below is the fraction of completions that contain
# the substring "boxed{"; answers are not compared against reference solutions.
system_prompt = "Solve the following math problem step-by-step.\nSimplify your answer as much as possible. Present your final answer as \\boxed{Your Answer}."
tmp_ds = ds.map(
    lambda example: {
        "prompt": tokenizer.apply_chat_template(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": example["problem"]},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
    }
)
# Show one fully rendered prompt as a sanity check on the chat template.
print(tmp_ds[0]["prompt"])
outputs = llm.generate(list(tmp_ds["prompt"]), sampling_params)
# Keep only the first completion of each request.
solutions = [request_output.outputs[0].text for request_output in outputs]
is_boxed = ["boxed{" in completion for completion in solutions]
acc = sum(is_boxed) / len(is_boxed)
print(f"Accuracy for system prompt: {acc}")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 Oct 2024

Solve the following math problem step-by-step.
Simplify your answer as much as possible. Present your final answer as \boxed{Your Answer}.<|eot_id|><|start_header_id|>user<|end_header_id|>

Bin $A$ has one white ball and four black balls.  Bin $B$ has three balls labeled $\$1$ and one ball labeled $\$7$.  Bin $W$ has five balls labeled $\$8$ and one ball labeled $\$500$.  A game is played as follows: a ball is randomly selected from bin $A$.  If it is black, then a ball is randomly selected from bin $B$; otherwise, if the original ball is white, then a ball is randomly selected from bin $W$.  You win the amount printed on the second ball selected.  What is your expected win?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




Processed prompts: 100%|█| 1000/1000 [00:58<00:00, 17.01it/s, est. speed input: 2284.08 toks/s, outpu

Accuracy for system prompt: 0.811





In [23]:
# Inspect failures among the first 12 samples only: print every prompt/solution
# pair in that window whose completion lacks the "boxed{" marker.
for prompt_text, solution_text, has_box in zip(tmp_ds["prompt"], solutions[:12], is_boxed[:12]):
    if has_box:
        continue
    print(f"Prompt: {prompt_text}")
    print(f"Solution: {solution_text}")
    print()

Prompt: <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 Oct 2024

Solve the following math problem step-by-step.
Simplify your answer as much as possible. Present your final answer as \boxed{Your Answer}.<|eot_id|><|start_header_id|>user<|end_header_id|>

Find the positive base $b$ in which the equation $4 \cdot 12 = 103$ is valid.<|eot_id|><|start_header_id|>assistant<|end_header_id|>


Solution: ## Step 1: Understand the problem
We need to find a base $b$ in which the equation $4 \cdot 12 = 103$ is valid. This means we are looking for a base $b$ where the product of $4$ and $12$ equals $103$.

## Step 2: Convert the numbers to the same base
To solve this problem, we need to convert both $4$ and $12$ to the same base. The number $12$ can be written as $10$ in base $b$. Therefore, we can rewrite the equation as $4 \cdot 10_b = 103$.

## Step 3: Rewrite the equation
We can rewrite the equation as $40_b = 103$.

## Step 4:

In [18]:
# Condition: no system message is passed; the instruction text is placed before
# the problem inside the single user message.
# NOTE: the "accuracy" printed below is the fraction of completions that contain
# the substring "boxed{"; answers are not compared against reference solutions.
system_prompt = "Solve the following math problem step-by-step.\nSimplify your answer as much as possible. Present your final answer as \\boxed{Your Answer}."
tmp_ds = ds.map(
    lambda example: {
        "prompt": tokenizer.apply_chat_template(
            [{"role": "user", "content": f"{system_prompt}\n{example['problem']}"}],
            tokenize=False,
            add_generation_prompt=True,
        )
    }
)
# Show one fully rendered prompt as a sanity check on the chat template.
print(tmp_ds[0]["prompt"])
outputs = llm.generate(list(tmp_ds["prompt"]), sampling_params)
# Keep only the first completion of each request.
solutions = [request_output.outputs[0].text for request_output in outputs]
is_boxed = ["boxed{" in completion for completion in solutions]
acc = sum(is_boxed) / len(is_boxed)
print(f"Accuracy for prefix prompt: {acc}")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 Oct 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Solve the following math problem step-by-step.
Simplify your answer as much as possible. Present your final answer as \boxed{Your Answer}.
Bin $A$ has one white ball and four black balls.  Bin $B$ has three balls labeled $\$1$ and one ball labeled $\$7$.  Bin $W$ has five balls labeled $\$8$ and one ball labeled $\$500$.  A game is played as follows: a ball is randomly selected from bin $A$.  If it is black, then a ball is randomly selected from bin $B$; otherwise, if the original ball is white, then a ball is randomly selected from bin $W$.  You win the amount printed on the second ball selected.  What is your expected win?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




Processed prompts: 100%|█| 1000/1000 [00:56<00:00, 17.57it/s, est. speed input: 2359.39 toks/s, outpu

Accuracy for prefix prompt: 0.745





In [19]:
# Condition: no system message is passed; the instruction text is placed after
# the problem inside the single user message.
# NOTE: the "accuracy" printed below is the fraction of completions that contain
# the substring "boxed{"; answers are not compared against reference solutions.
system_prompt = "Solve the above math problem step-by-step.\nSimplify your answer as much as possible. Present your final answer as \\boxed{Your Answer}."
tmp_ds = ds.map(
    lambda example: {
        "prompt": tokenizer.apply_chat_template(
            [{"role": "user", "content": f"{example['problem']}\n{system_prompt}"}],
            tokenize=False,
            add_generation_prompt=True,
        )
    }
)
# Show one fully rendered prompt as a sanity check on the chat template.
print(tmp_ds[0]["prompt"])
outputs = llm.generate(list(tmp_ds["prompt"]), sampling_params)
# Keep only the first completion of each request.
solutions = [request_output.outputs[0].text for request_output in outputs]
is_boxed = ["boxed{" in completion for completion in solutions]
acc = sum(is_boxed) / len(is_boxed)
print(f"Accuracy for suffix prompt: {acc}")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 Oct 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Bin $A$ has one white ball and four black balls.  Bin $B$ has three balls labeled $\$1$ and one ball labeled $\$7$.  Bin $W$ has five balls labeled $\$8$ and one ball labeled $\$500$.  A game is played as follows: a ball is randomly selected from bin $A$.  If it is black, then a ball is randomly selected from bin $B$; otherwise, if the original ball is white, then a ball is randomly selected from bin $W$.  You win the amount printed on the second ball selected.  What is your expected win?
Solve the above math problem step-by-step.
Simplify your answer as much as possible. Present your final answer as \boxed{Your Answer}.<|eot_id|><|start_header_id|>assistant<|end_header_id|>




Processed prompts: 100%|█| 1000/1000 [00:56<00:00, 17.59it/s, est. speed input: 2365.26 toks/s, outpu

Accuracy for suffix prompt: 0.622





## Sampling

It seems that the samples which don't emit a boxed answer are just getting stuck. Let's try adding some sampling.

In [24]:
# Sampling run at temperature 0.1; token cap (2048) and seed (42) are kept fixed.
sampling_params = SamplingParams(temperature=0.1, max_tokens=2048, seed=42)

In [25]:
# System-message condition, re-run with whatever `sampling_params` is currently set.
# NOTE: the "accuracy" printed below is the fraction of completions that contain
# the substring "boxed{"; answers are not compared against reference solutions.
system_prompt = "Solve the following math problem step-by-step.\nSimplify your answer as much as possible. Present your final answer as \\boxed{Your Answer}."
tmp_ds = ds.map(
    lambda example: {
        "prompt": tokenizer.apply_chat_template(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": example["problem"]},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
    }
)
# Show one fully rendered prompt as a sanity check on the chat template.
print(tmp_ds[0]["prompt"])
outputs = llm.generate(list(tmp_ds["prompt"]), sampling_params)
# Keep only the first completion of each request.
solutions = [request_output.outputs[0].text for request_output in outputs]
is_boxed = ["boxed{" in completion for completion in solutions]
acc = sum(is_boxed) / len(is_boxed)
print(f"Accuracy for system prompt: {acc}")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 Oct 2024

Solve the following math problem step-by-step.
Simplify your answer as much as possible. Present your final answer as \boxed{Your Answer}.<|eot_id|><|start_header_id|>user<|end_header_id|>

Bin $A$ has one white ball and four black balls.  Bin $B$ has three balls labeled $\$1$ and one ball labeled $\$7$.  Bin $W$ has five balls labeled $\$8$ and one ball labeled $\$500$.  A game is played as follows: a ball is randomly selected from bin $A$.  If it is black, then a ball is randomly selected from bin $B$; otherwise, if the original ball is white, then a ball is randomly selected from bin $W$.  You win the amount printed on the second ball selected.  What is your expected win?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




Processed prompts: 100%|█| 1000/1000 [01:01<00:00, 16.37it/s, est. speed input: 2198.53 toks/s, outpu

Accuracy for system prompt: 0.826





In [26]:
# Sampling run at temperature 0.2; token cap (2048) and seed (42) are kept fixed.
sampling_params = SamplingParams(temperature=0.2, max_tokens=2048, seed=42)

In [27]:
# System-message condition, re-run with whatever `sampling_params` is currently set.
# NOTE: the "accuracy" printed below is the fraction of completions that contain
# the substring "boxed{"; answers are not compared against reference solutions.
system_prompt = "Solve the following math problem step-by-step.\nSimplify your answer as much as possible. Present your final answer as \\boxed{Your Answer}."
tmp_ds = ds.map(
    lambda example: {
        "prompt": tokenizer.apply_chat_template(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": example["problem"]},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
    }
)
# Show one fully rendered prompt as a sanity check on the chat template.
print(tmp_ds[0]["prompt"])
outputs = llm.generate(list(tmp_ds["prompt"]), sampling_params)
# Keep only the first completion of each request.
solutions = [request_output.outputs[0].text for request_output in outputs]
is_boxed = ["boxed{" in completion for completion in solutions]
acc = sum(is_boxed) / len(is_boxed)
print(f"Accuracy for system prompt: {acc}")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 Oct 2024

Solve the following math problem step-by-step.
Simplify your answer as much as possible. Present your final answer as \boxed{Your Answer}.<|eot_id|><|start_header_id|>user<|end_header_id|>

Bin $A$ has one white ball and four black balls.  Bin $B$ has three balls labeled $\$1$ and one ball labeled $\$7$.  Bin $W$ has five balls labeled $\$8$ and one ball labeled $\$500$.  A game is played as follows: a ball is randomly selected from bin $A$.  If it is black, then a ball is randomly selected from bin $B$; otherwise, if the original ball is white, then a ball is randomly selected from bin $W$.  You win the amount printed on the second ball selected.  What is your expected win?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




Processed prompts: 100%|█| 1000/1000 [01:01<00:00, 16.37it/s, est. speed input: 2198.63 toks/s, outpu

Accuracy for system prompt: 0.819





In [28]:
# Sampling run at temperature 0.4; token cap (2048) and seed (42) are kept fixed.
sampling_params = SamplingParams(temperature=0.4, max_tokens=2048, seed=42)

In [29]:
# System-message condition, re-run with whatever `sampling_params` is currently set.
# NOTE: the "accuracy" printed below is the fraction of completions that contain
# the substring "boxed{"; answers are not compared against reference solutions.
system_prompt = "Solve the following math problem step-by-step.\nSimplify your answer as much as possible. Present your final answer as \\boxed{Your Answer}."
tmp_ds = ds.map(
    lambda example: {
        "prompt": tokenizer.apply_chat_template(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": example["problem"]},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
    }
)
# Show one fully rendered prompt as a sanity check on the chat template.
print(tmp_ds[0]["prompt"])
outputs = llm.generate(list(tmp_ds["prompt"]), sampling_params)
# Keep only the first completion of each request.
solutions = [request_output.outputs[0].text for request_output in outputs]
is_boxed = ["boxed{" in completion for completion in solutions]
acc = sum(is_boxed) / len(is_boxed)
print(f"Accuracy for system prompt: {acc}")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 Oct 2024

Solve the following math problem step-by-step.
Simplify your answer as much as possible. Present your final answer as \boxed{Your Answer}.<|eot_id|><|start_header_id|>user<|end_header_id|>

Bin $A$ has one white ball and four black balls.  Bin $B$ has three balls labeled $\$1$ and one ball labeled $\$7$.  Bin $W$ has five balls labeled $\$8$ and one ball labeled $\$500$.  A game is played as follows: a ball is randomly selected from bin $A$.  If it is black, then a ball is randomly selected from bin $B$; otherwise, if the original ball is white, then a ball is randomly selected from bin $W$.  You win the amount printed on the second ball selected.  What is your expected win?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




Processed prompts: 100%|█| 1000/1000 [00:57<00:00, 17.48it/s, est. speed input: 2346.94 toks/s, outpu

Accuracy for system prompt: 0.862





In [30]:
# Print the earliest prompt/solution pair whose completion contains "boxed{".
# Nothing is printed when no completion qualifies.
first_boxed = next(
    (
        (prompt_text, solution_text)
        for prompt_text, solution_text, has_box in zip(tmp_ds["prompt"], solutions, is_boxed)
        if has_box
    ),
    None,
)
if first_boxed is not None:
    prompt_text, solution_text = first_boxed
    print(f"Prompt: {prompt_text}")
    print(f"Solution: {solution_text}")
    print()

Prompt: <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 Oct 2024

Solve the following math problem step-by-step.
Simplify your answer as much as possible. Present your final answer as \boxed{Your Answer}.<|eot_id|><|start_header_id|>user<|end_header_id|>

Bin $A$ has one white ball and four black balls.  Bin $B$ has three balls labeled $\$1$ and one ball labeled $\$7$.  Bin $W$ has five balls labeled $\$8$ and one ball labeled $\$500$.  A game is played as follows: a ball is randomly selected from bin $A$.  If it is black, then a ball is randomly selected from bin $B$; otherwise, if the original ball is white, then a ball is randomly selected from bin $W$.  You win the amount printed on the second ball selected.  What is your expected win?<|eot_id|><|start_header_id|>assistant<|end_header_id|>


Solution: ## Step 1: Define the possible outcomes and their probabilities for each bin
There are three possible outcomes for th

In [31]:
# Print the earliest prompt/solution pair whose completion lacks "boxed{".
# Nothing is printed when every completion contains the marker.
first_unboxed = next(
    (
        (prompt_text, solution_text)
        for prompt_text, solution_text, has_box in zip(tmp_ds["prompt"], solutions, is_boxed)
        if not has_box
    ),
    None,
)
if first_unboxed is not None:
    prompt_text, solution_text = first_unboxed
    print(f"Prompt: {prompt_text}")
    print(f"Solution: {solution_text}")
    print()

Prompt: <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 Oct 2024

Solve the following math problem step-by-step.
Simplify your answer as much as possible. Present your final answer as \boxed{Your Answer}.<|eot_id|><|start_header_id|>user<|end_header_id|>

Find the positive base $b$ in which the equation $4 \cdot 12 = 103$ is valid.<|eot_id|><|start_header_id|>assistant<|end_header_id|>


Solution: ## Step 1: Understand the problem
We need to find a base $b$ for which the equation $4 \cdot 12 = 103$ is valid.

## Step 2: Write the equation with the base
The equation is $4 \cdot 12 = 103$.

## Step 3: Convert the numbers to the same base
We need to convert both sides of the equation to the same base.

## Step 4: Find the base of 12
The prime factorization of 12 is $2^2 \cdot 3$.

## Step 5: Find the base of 103
The prime factorization of 103 is 103.

## Step 6: Determine the base
Since 12 can be expressed as $2^2 \cdot 3$ 

In [38]:
# System-message condition at temperature 0.8 (token cap and seed unchanged).
# NOTE: the "accuracy" printed below is the fraction of completions that contain
# the substring "boxed{"; answers are not compared against reference solutions.
sampling_params = SamplingParams(temperature=0.8, max_tokens=2048, seed=42)
system_prompt = "Solve the following math problem step-by-step.\nSimplify your answer as much as possible. Present your final answer as \\boxed{Your Answer}."
tmp_ds = ds.map(
    lambda example: {
        "prompt": tokenizer.apply_chat_template(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": example["problem"]},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
    }
)
# Show one fully rendered prompt as a sanity check on the chat template.
print(tmp_ds[0]["prompt"])
outputs = llm.generate(list(tmp_ds["prompt"]), sampling_params)
# Keep only the first completion of each request.
solutions = [request_output.outputs[0].text for request_output in outputs]
is_boxed = ["boxed{" in completion for completion in solutions]
acc = sum(is_boxed) / len(is_boxed)
print(f"Accuracy for system prompt: {acc}")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 Oct 2024

Solve the following math problem step-by-step.
Simplify your answer as much as possible. Present your final answer as \boxed{Your Answer}.<|eot_id|><|start_header_id|>user<|end_header_id|>

Bin $A$ has one white ball and four black balls.  Bin $B$ has three balls labeled $\$1$ and one ball labeled $\$7$.  Bin $W$ has five balls labeled $\$8$ and one ball labeled $\$500$.  A game is played as follows: a ball is randomly selected from bin $A$.  If it is black, then a ball is randomly selected from bin $B$; otherwise, if the original ball is white, then a ball is randomly selected from bin $W$.  You win the amount printed on the second ball selected.  What is your expected win?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




Processed prompts: 100%|█| 1000/1000 [00:53<00:00, 18.76it/s, est. speed input: 2518.79 toks/s, outpu

Accuracy for system prompt: 0.877





In [37]:
# System-message condition at temperature 1.0 (token cap and seed unchanged).
# NOTE: the "accuracy" printed below is the fraction of completions that contain
# the substring "boxed{"; answers are not compared against reference solutions.
sampling_params = SamplingParams(temperature=1.0, max_tokens=2048, seed=42)
system_prompt = "Solve the following math problem step-by-step.\nSimplify your answer as much as possible. Present your final answer as \\boxed{Your Answer}."
tmp_ds = ds.map(
    lambda example: {
        "prompt": tokenizer.apply_chat_template(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": example["problem"]},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
    }
)
# Show one fully rendered prompt as a sanity check on the chat template.
print(tmp_ds[0]["prompt"])
outputs = llm.generate(list(tmp_ds["prompt"]), sampling_params)
# Keep only the first completion of each request.
solutions = [request_output.outputs[0].text for request_output in outputs]
is_boxed = ["boxed{" in completion for completion in solutions]
acc = sum(is_boxed) / len(is_boxed)
print(f"Accuracy for system prompt: {acc}")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 Oct 2024

Solve the following math problem step-by-step.
Simplify your answer as much as possible. Present your final answer as \boxed{Your Answer}.<|eot_id|><|start_header_id|>user<|end_header_id|>

Bin $A$ has one white ball and four black balls.  Bin $B$ has three balls labeled $\$1$ and one ball labeled $\$7$.  Bin $W$ has five balls labeled $\$8$ and one ball labeled $\$500$.  A game is played as follows: a ball is randomly selected from bin $A$.  If it is black, then a ball is randomly selected from bin $B$; otherwise, if the original ball is white, then a ball is randomly selected from bin $W$.  You win the amount printed on the second ball selected.  What is your expected win?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




Processed prompts: 100%|█| 1000/1000 [01:04<00:00, 15.57it/s, est. speed input: 2090.92 toks/s, outpu

Accuracy for system prompt: 0.693





## Generate on full dataset

In [34]:
# Rebind `ds` to the complete test split; this line applies no shuffle or subsampling.
ds = load_dataset("lighteval/MATH", split="test")