# Prepare S1 dataset for training / calibration

This notebook generates prompts to be run with `run_{model}.py` scripts. Goals include:
- Budget forcing
- Verify model results (produce labels for probes)

In [1]:
import os
import json
from collections import Counter

import numpy as np
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()

from sklearn.linear_model import LinearRegression

from transformers import AutoTokenizer
from datasets import load_dataset

Reminder to authenticate!

In [2]:
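# NOTE: os.system("export ...") runs in a child shell, so it cannot change this
# kernel's environment. Export HF_TOKEN / HF_HOME in the shell before launching
# Jupyter (or assign os.environ[...] before the Hugging Face imports) instead.
# os.system("export HF_TOKEN=''")
# os.system("export HF_HOME=''")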

File paths

In [3]:
# Directory that receives every generated prompt file (customize as needed).
PROMPT_DIR = "../inputs"

# Per-model directory holding the generation outputs; update these to wherever
# your runs were saved. Insertion order is the iteration order used below.
model_to_folder = dict([
    ("qwen2.5", "../outputs"),
    ("qwq", "../outputs-qwq"),
    ("llama3.3", "../outputs-llama"),
])

## Truncate and embed s1K thoughts

In [4]:
# Keep a local JSON-lines copy of the dataset: it is easier to inspect by hand
# and avoids re-downloading on every run.
fp_s1 = "cache/s1.json"
if not os.path.exists(fp_s1):
    # Ensure the cache folder exists so the first write cannot fail on a
    # missing directory.
    os.makedirs(os.path.dirname(fp_s1), exist_ok=True)
    hf_ds = load_dataset("simplescaling/s1K-1.1", split="train")
    hf_ds.to_json(fp_s1)

# The cache is read back as one JSON object per line -> `ds` is a list of dicts.
# An explicit encoding keeps the read independent of the platform default.
with open(fp_s1, encoding="utf-8") as f:
    ds = [json.loads(line) for line in f]

In [5]:
# Hugging Face checkpoint that supplies the tokenizer / chat template for each model key.
model_to_hf = {
    "qwen2.5": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    "llama3.3": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
    "qwq": "Qwen/QwQ-32B",
}

# model key -> loaded tokenizer
tokenizers = {}
for model in model_to_hf:
    path = model_to_hf[model]
    tokenizers[model] = AutoTokenizer.from_pretrained(path)

Prepare per-step examples. Add chat template BEFORE prefilling `<think> ...`

In [6]:
def generate_truncated_prompts(item, model, batch_size=10):
    """
    Build progressively longer prefilled prompts for one dataset element.

    item         (dict) single element; reads "question" and
                 "deepseek_thinking_trajectory"
    model        specifies chat template / tokenizer
    batch_size   number of additional steps included in each successive prompt

    Returns a list of prompt strings. Each prompt contains batch_size more
    steps than the previous one; the last prompt contains every step.
    """
    question = item["question"]
    steps = separate_steps(item["deepseek_thinking_trajectory"])
    think_open = "<think>\n\n"

    truncated = []
    for start in range(0, len(steps), batch_size):
        prefix = "\n\n".join(steps[: start + batch_size])
        text = format_prompt(question, prefix, model)
        # If the opening think marker shows up more than once (presumably the
        # chat template already emitted one), drop the first occurrence.
        if text.count(think_open) > 1:
            text = text.replace(think_open, "", 1)
        truncated.append(text)
    return truncated


def separate_steps(thoughts, delims=["wait", "Wait", "but", "But"]):
    """
    Split thoughts into steps.

    Empty lines are dropped. A non-empty line that contains any of `delims`
    (plain substring match) opens a new step; every other line is appended to
    the current step. Each step is whitespace-stripped before returning.

    thoughts  S1 thoughts (single string)
    delims    substrings that mark the first line of a new step (not mutated)

    Returns a list of step strings. The list always has at least one element;
    the first element is "" when the very first non-empty line contains a
    delimiter.
    """
    chunks = [""]
    for raw in thoughts.split("\n"):
        if not raw:
            continue
        text = raw + "\n"
        if any(marker in text for marker in delims):
            chunks.append(text)
        else:
            chunks[-1] += text
    return [chunk.strip() for chunk in chunks]


def format_prompt(question, thoughts, model):
    """
    Wrap a question in the model's chat template and prefill a think block.

    question  full question
    thoughts  S1 thoughts to place inside <think> ... </think>
    model     specifies chat template / tokenizer

    Returns the chat-formatted question, followed by the think block holding
    `thoughts`, followed by a "Final Answer:" line.
    """
    instruction = f"{question} Please reason step by step, and put your final answer within \\boxed{{}}."
    header = convert(instruction, tokenizers[model])
    return f"{header}\n\n<think>\n\n{thoughts}\n\n</think>\n\nFinal Answer:\n"


def convert(messages, tokenizer):
    """
    Render messages with the tokenizer's chat template.

    messages   either a plain string (treated as a single user turn) or an
               already-built list of {"role", "content"} dicts
    tokenizer  object exposing apply_chat_template

    Returns whatever apply_chat_template returns for tokenize=False and
    add_generation_prompt=True.
    """
    if type(messages) is str:
        chat = [{"role": "user", "content": messages}]
    else:
        chat = messages
    return tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

In [7]:
def get_step_limits(item, model):
    """
    Token boundaries of every step inside the prefilled prompt.
    This is used to align embeddings with steps.

    item         (dict) single element; reads "question" and
                 "deepseek_thinking_trajectory"
    model        specifies chat template / tokenizer

    Returns a list with one (left, right) pair per step: `left` is the token
    count of the prompt holding the steps before it, `right` the token count
    once that step is included.
    """
    question = item["question"]
    steps = separate_steps(item["deepseek_thinking_trajectory"])
    # Text that format_prompt appends after the thoughts. It must be removed
    # before counting tokens. The previous pattern had a stray '"' after
    # "Final Answer:", so it never matched and nothing was trimmed.
    suffix = "\n</think>\n\nFinal Answer:\n"
    # NOTE(review): generate_truncated_prompts additionally drops a duplicated
    # "<think>\n\n"; that is not mirrored here -- confirm the offsets still line
    # up with the prompts that were actually embedded.
    limits = []
    left = None
    # split into progressively longer prompts (0 steps, 1 step, ..., all steps)
    for i in range(len(steps) + 1):
        thoughts = "\n\n".join(steps[:i])
        full_prompt = format_prompt(question, thoughts, model)
        # trim only the trailing suffix, never text inside the thoughts
        if full_prompt.endswith(suffix):
            full_prompt = full_prompt[: -len(suffix)]
        tokens = tokenizers[model](full_prompt)["input_ids"]
        right = len(tokens)
        if left is not None:
            limits.append((left, right))
        left = right
    return limits

Generate prompts for budget forcing.

In [8]:
# open(..., "w") does not create missing directories, so make sure both
# output locations exist before writing into them.
os.makedirs(PROMPT_DIR, exist_ok=True)
os.makedirs("cache", exist_ok=True)

for model in model_to_hf:
    all_prompts = []    # every truncated prompt, flattened over the dataset
    batch_idx = []      # batch_idx[k] = dataset row that all_prompts[k] came from
    idx_to_index = []   # idx_to_index[row] = positions in all_prompts for that row
    for i, item in enumerate(ds):
        prompts = generate_truncated_prompts(item, model)
        first = len(all_prompts)
        all_prompts.extend(prompts)
        batch_idx.extend([i] * len(prompts))
        # explicit range instead of a comprehension that reused the name `i`
        idx_to_index.append(list(range(first, first + len(prompts))))
    # save prompts first
    with open(os.path.join(PROMPT_DIR, f"s1_truncated_{model}.json"), "w") as f:
        json.dump(all_prompts, f)
    # this takes longer, so it is skipped when the metadata file already exists
    fp_metadata = f"cache/s1_metadata_{model}.json"
    if not os.path.exists(fp_metadata):
        limits = []
        for item in tqdm(ds):
            limits.append(get_step_limits(item, model))
        with open(fp_metadata, "w") as f:
            json.dump({
                "batch_idx": batch_idx,
                "idx_to_index": idx_to_index,
                "step_limits": limits,
            }, f)

Generate prompts for embedding thoughts (only need to run once, through the full thought trajectory, since we truncate).

In [9]:
for model in model_to_hf:
    # batch_size=1000 is expected to exceed every item's step count, so each
    # item contributes exactly one prompt (verified by the assert below).
    prompts_embed = [
        prompt
        for item in ds
        for prompt in generate_truncated_prompts(item, model, batch_size=1000)
    ]
    assert len(prompts_embed) == len(ds)
    fp_embed = os.path.join(PROMPT_DIR, f"s1_embed_{model}.json")
    with open(fp_embed, "w") as f:
        json.dump(prompts_embed, f)

## Verifier prompts

These prompts provide labels for *training* probes

### Supervised probe

In [10]:
# System prompt for the supervised (correct-answer) grader. It has its own name
# because `grading_prompt` is rebound further down the notebook; with a shared
# global, re-running the verification cell afterwards would silently use the
# wrong system prompt.
GRADING_PROMPT_SUPERVISED = """You are an AI assistant for grading a science problem. The user will provide you with the question itself, the correct answer, and the student's attempt. Your job is to judge whether the attempt is correct by comparing it with the correct answer. If the correct answer is a number or choice, there should be no ambiguity, and you should directly compare the answer and the final result. If the attempt is incomplete, you should mark it as wrong. If the correct answer involves going through the entire reasoning process, you should judge the result based on whether the reasoning process is correct, compared to correct answer.

Do NOT try to solve the problem yourself. Only grade the attempt based on the correct answer.

The user will provide the attempt and the correct answer in the following format:

# Problem
{problem}

## Correct answer
{solution}

## Student attempt
{attempt}

Explain your reasoning concisely, and end your response on a new line with only "Yes" or "No" (without quotes).
"""
# kept for backward compatibility with code that reads the old name
grading_prompt = GRADING_PROMPT_SUPERVISED


def get_prompt_supervised(question, attempt, solution):
    """
    Build the chat messages asking a grader to compare an attempt with the
    reference solution.

    question  problem statement
    attempt   model output to grade
    solution  reference answer

    Returns a two-element list: the system message (grading instructions) and
    the user message (problem, correct answer, student attempt).
    """
    # NOTE(review): the system prompt advertises "# Problem" while the user
    # message uses "## Problem"; left unchanged so prompts stay byte-identical.
    user_prompt = f"## Problem\n{question}\n\n## Correct answer\n{solution}\n\n## Student attempt\n{attempt}\n"
    messages = [
        {"role": "system", "content": GRADING_PROMPT_SUPERVISED},
        {"role": "user", "content": user_prompt}
    ]
    return messages

In [11]:
for model, folder in model_to_folder.items():
    fp_outputs = f"{folder}/s1_truncated_{model}.json"
    # models without generated outputs are skipped
    if not os.path.exists(fp_outputs):
        continue
    with open(fp_outputs) as f:
        outputs = json.load(f)

    # batch_idx[k] is the dataset row that truncated prompt k (and therefore
    # outputs[k]) came from; it is built together with the truncated prompts.
    # The previous code referenced an undefined `batch_index`.
    questions = [ds[i]["question"] for i in batch_idx]
    # the grader needs the reference solution here, not the question again
    answers = [ds[i]["solution"] for i in batch_idx]

    all_prompts_verify = [
        get_prompt_supervised(question, attempt, answer)
        for question, attempt, answer in zip(questions, outputs, answers)
    ]

    with open(os.path.join(PROMPT_DIR, f"s1_verify_{model}.json"), "w") as f:
        json.dump(all_prompts_verify, f)

    print(len(all_prompts_verify))

### Consistency probe

In [12]:
# System prompt for the consistency (same-answer) grader. It has its own name
# so that get_prompt_consistency does not depend on whichever cell last
# assigned the shared `grading_prompt` global.
GRADING_PROMPT_CONSISTENCY = """You are an AI assistant for grading a science problem. The user will provide you with the question itself and two student attempts. Your job is to judge whether the two students arrive at the same answer. If question asks for a single numerical answer, there should be no ambiguity, and you should directly compare the two answers. If the question asks for multiple parts, the two attempts are identical if only if all of the parts arrive at the same conclusion.

Do NOT try to solve the problem yourself. Only grade whether the two attempts are the same.

The user will provide the problem and two attempts in the following format:

# Problem

{problem}

## Attempt 1

{attempt1}

## Attempt 2

{attempt2}

Explain your reasoning concisely, and end your response on a new line with only "Yes" or "No" (without quotes).
"""
# kept for backward compatibility with code that reads the old name
grading_prompt = GRADING_PROMPT_CONSISTENCY


def get_prompt_consistency(question, attempt1, attempt2):
    """
    Build the chat messages asking a grader whether two attempts reach the
    same answer.

    question  problem statement
    attempt1  first attempt
    attempt2  second attempt

    Returns a two-element list: the system message (grading instructions) and
    the user message (problem followed by both attempts).
    """
    # NOTE(review): the system prompt advertises "# Problem" while the user
    # message uses "## Problem"; left unchanged so prompts stay byte-identical.
    user_prompt = f"## Problem\n{question}\n\n## Attempt 1\n\n{attempt1}\n\n## Attempt 2\n\n{attempt2}\n"
    messages = [
        {"role": "system", "content": GRADING_PROMPT_CONSISTENCY},
        {"role": "user", "content": user_prompt}
    ]
    return messages

In [13]:
for model, folder in model_to_folder.items():
    fp_outputs = f"{folder}/s1_truncated_{model}.json"
    # models without generated outputs are skipped
    if not os.path.exists(fp_outputs):
        continue
    with open(fp_outputs) as f:
        outputs = json.load(f)

    # For every dataset row, the reference is the output of its last (longest)
    # truncated prompt; idx_to_index[row] lists that row's positions in outputs.
    reference_attempts = [outputs[indices[-1]] for indices in idx_to_index]

    # batch_idx[k] is the dataset row that outputs[k] belongs to. The previous
    # code referenced an undefined `batch_index` and zipped over an `answers`
    # list that is only defined if the supervised cell ran first.
    all_prompts_consistent = [
        get_prompt_consistency(ds[row]["question"], reference_attempts[row], attempt)
        for row, attempt in zip(batch_idx, outputs)
    ]

    with open(os.path.join(PROMPT_DIR, f"s1_consistent_{model}.json"), "w") as f:
        json.dump(all_prompts_consistent, f)

    print(len(all_prompts_consistent))

### Novelty probe

In [14]:
def generate_novelty_prompts(item, batch_size=1):
    """
    Build grader prompts asking whether the newest reasoning step adds information.

    item         (dict) single element; reads "question", "solution" and
                 "deepseek_thinking_trajectory"
    batch_size   stride between successive prompts

    Returns a list of [system, user] message pairs. Each prompt shows a
    progressively longer prefix of the numbered steps; the first prompt shows
    the first 1 + batch_size steps.
    """
    question = item["question"]
    solution = item["solution"]  # not used below; the lookup is kept as in the original
    numbered = [
        f"## step {pos+1}\n{text}"
        for pos, text in enumerate(separate_steps(item["deepseek_thinking_trajectory"]))
    ]

    # The system prompt does not depend on the loop, so it is built once.
    # NOTE(review): it describes a final "## current step" heading, but every
    # step in the user message is labelled "## step k" -- confirm this is intended.
    system_prompt = """You are an AI assistant for assessing the quality of logical reasoning. The user will provide you with the question and an incomplete attempt, consisting of a series of reasoning steps. Your job is to judge whether current step appears to provide additional information, compared to the previous steps. If the current step is correct and novel, it is useful. If the current step is wrong or redundant, then it is not useful.

Do NOT try to solve the problem yourself. It does not matter if the attempt is not complete. Only comment on whether the current step is useful.

The user will provide the problem and reasoning steps in the following format:

# Problem
{ problem }

# Reasoning
## step 1
{ reasoning step 1 }

## step 2
{ reasoning step 2 }

...

## step k
{ reasoning step k }

...

## current step
{ current reasoning step }
"""

    conversations = []
    # technically 1-2 is deterministic, but this is for quality control
    for start in range(1, len(numbered), batch_size):
        shown = "\n\n".join(numbered[: start + batch_size])
        user_prompt = (
            f"# Problem\n{question}\n\n"
            f"# Reasoning\n{shown}\n\n"
            "Explain your reasoning, and end your response on a new line with only "
            "\"Yes\" if the current step provides new information or \"No\" otherwise (without quotes).\n"
        )
        conversations.append([
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ])
    return conversations

In [15]:
prompts_novel = []      # every novelty prompt, flattened over the dataset
index_novel = []        # index_novel[k] = dataset row that prompts_novel[k] came from
# idx_to_index_novel[row] = positions in prompts_novel for that row
# (sized from the dataset instead of a hard-coded 1000)
idx_to_index_novel = [[] for _ in range(len(ds))]

for i, item in enumerate(ds):
    prompts = generate_novelty_prompts(item, batch_size=1)
    # Offsets must be taken from prompts_novel, the list being extended here.
    # The previous code used len(all_prompts), a list left over from the
    # budget-forcing cell, so the stored indices did not point into
    # prompts_novel. A later cell asserts this map equals the leaf-probe map,
    # which has to follow the same offset convention.
    start = len(prompts_novel)
    prompts_novel.extend(prompts)
    index_novel.extend([i] * len(prompts))
    idx_to_index_novel[i] = list(range(start, start + len(prompts)))

In [16]:
# with open(os.path.join(PROMPT_DIR, "s1_novel.json"), "w") as f:
#     json.dump(prompts_novel, f)

# fp_metadata_novel = "cache/s1_metadata_step.json"
# with open(fp_metadata_novel, "w") as f:
#     json.dump({
#         "batch_index": index_novel,
#         "idx_to_index": idx_to_index_novel,
#     }, f)

### Leaf probe

In [17]:
def generate_leaf_prompts(item, batch_size=10):
    """
    Build grader prompts asking whether a single reasoning step attempts a final answer.

    item         (dict) single element; reads "question" and
                 "deepseek_thinking_trajectory"
    batch_size   stride between the steps that get a prompt

    Returns a list of [system, user] message pairs, one for each step at
    positions 1, 1 + batch_size, ... (the step at position 0 never gets one).
    """
    question = item["question"]
    steps = separate_steps(item["deepseek_thinking_trajectory"])

    # The system prompt does not depend on the loop, so it is built once.
    system_prompt = """You are an AI assistant for parsing LLM outputs. The user will provide you with the question and an intermediate reasoning step. Your job is to judge whether the given step contains an attempt at a final answer.

Do NOT attempt to solve the problem yourself. It does not matter if the answer is correct. Only comment on whether an attempt has been made.

The user will provide the problem and reasoning steps in the following format:

# Problem

{ problem }

# Reasoning step

{ reasoning step }
"""

    conversations = []
    # technically 1-2 is deterministic, but this is for quality control
    for pos in range(1, len(steps), batch_size):
        current = steps[pos].strip()
        user_prompt = (
            f"# Problem\n\n{question}\n\n"
            f"# Reasoning step\n\n{current}\n\n"
            "Explain your reasoning, and end your response on a new line with only "
            "\"Yes\" or \"No\" indicating whether or the given step makes an attempt at providing the final answer.\n"
        )
        conversations.append([
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ])
    return conversations

In [18]:
prompts_leaf = []       # every leaf prompt, flattened over the dataset
index_leaf = []         # index_leaf[k] = dataset row that prompts_leaf[k] came from
# idx_to_index_leaf[row] = positions in prompts_leaf for that row
# (sized from the dataset instead of a hard-coded 1000)
idx_to_index_leaf = [[] for _ in range(len(ds))]

for i, item in enumerate(ds):
    prompts = generate_leaf_prompts(item, batch_size=1)
    # Offsets must be taken from prompts_leaf, the list being extended here.
    # The previous code used len(all_prompts), a list left over from the
    # budget-forcing cell, so the stored indices did not point into
    # prompts_leaf. A later cell asserts this map equals the novelty-probe map,
    # which has to follow the same offset convention.
    start = len(prompts_leaf)
    prompts_leaf.extend(prompts)
    index_leaf.extend([i] * len(prompts))
    idx_to_index_leaf[i] = list(range(start, start + len(prompts)))

In [19]:
# with open(os.path.join(PROMPT_DIR, "s1_leaf.json"), "w") as f:
#     json.dump(prompts_leaf, f)

In [20]:
# Novelty and leaf prompts are both generated with batch_size=1 over the same
# step range, so their bookkeeping must agree entry for entry.
assert index_novel == index_leaf
assert idx_to_index_novel == idx_to_index_leaf