In [3]:
# !pip install openai==0.28

In [4]:
#!/usr/bin/env python3
"""
Script: label_gsm8k_thoughts_parallel.py

Ask a chat model to label each step of every GSM8K chain of thought as
required or redundant, using a ThreadPoolExecutor to label multiple chains
in parallel.
"""

import os
import json
import pandas as pd
import openai
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# ─── Configuration ───────────────────────────────────────────────────────────
# CSV read by main(); worker() reads its `question` and `chain_of_thought` columns.
INPUT_DF_PATH   = "reasoning_traces.csv"
# CSV written by main() with the label columns appended.
OUTPUT_CSV      = "gsm8k_chains_labeled.csv"
# Chat model name passed to openai.ChatCompletion.create.
MODEL           = "gpt-4"
# System message sent with every request. It asks for a JSON object with the
# keys `required` and `redundant`; classify_thoughts() parses the reply as JSON
# and reads exactly those two keys.
SYSTEM_PROMPT   = (
    "You will be given a math question and the chain of thought (broken into numbered steps).  "
    "Your job is to tell me which steps are truly necessary to arrive at the final answer "
    "(i.e. non‐redundant reasoning), and which are fluff or repetition.  "
    "Return your answer as a JSON object with two arrays: "
    "`required` (list of step indices) and `redundant` (list of step indices)."
)
MAX_WORKERS     = 8   # number of concurrent threads in the ThreadPoolExecutor
TIMEOUT_SECONDS = 60  # per-request timeout, passed as request_timeout

# Read the key from the environment instead of hardcoding it; raises KeyError
# if OPENAI_API_KEY is not set.
# NOTE(review): `openai.api_key` / `openai.ChatCompletion` look like the pre-1.0
# client interface (the commented install line pins openai==0.28) — confirm the
# installed version before running.
openai.api_key = os.environ["OPENAI_API_KEY"]

def _extract_json_object(text: str):
    """Return the JSON object embedded in ``text`` as a dict, or ``None``.

    Chat models often wrap the requested JSON in a Markdown code fence or a
    sentence of prose, so the outermost ``{...}`` span is parsed rather than
    the whole reply.
    """
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        return None
    try:
        parsed = json.loads(text[start:end + 1])
    except json.JSONDecodeError:
        return None
    # A reply that is valid JSON but not an object carries no usable labels.
    return parsed if isinstance(parsed, dict) else None


def _coerce_step_indices(value) -> list[int]:
    """Turn one label array from the model into a clean list of ints.

    Integers are kept, integral floats and decimal strings (``"3"``) are
    converted, and anything else (including booleans and ``None``) is dropped.
    A value that is not a list yields ``[]``.
    """
    if not isinstance(value, list):
        return []
    indices = []
    for item in value:
        if isinstance(item, bool):
            # bool is a subclass of int; true/false are not step numbers.
            continue
        if isinstance(item, int):
            indices.append(item)
        elif isinstance(item, float) and item.is_integer():
            indices.append(int(item))
        elif isinstance(item, str) and item.strip().isdecimal():
            indices.append(int(item.strip()))
    return indices


def classify_thoughts(question: str, thoughts: list[str]) -> tuple[list[int], list[int]]:
    """Ask the chat model which steps of a chain of thought are required.

    Args:
        question: The math question the chain of thought answers.
        thoughts: The reasoning steps, in order. They are shown to the model
            numbered from 1.

    Returns:
        ``(required, redundant)``: two lists of 1-based step numbers as
        reported by the model. Both lists are empty when the reply contains
        no parseable JSON object.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises (e.g. on timeout);
        request errors are not caught here.
    """
    numbered = "\n".join(f"{i+1}. {t}" for i, t in enumerate(thoughts))
    user = (
        f"Question:\n{question}\n\n"
        f"Chain of thought (each step numbered below):\n{numbered}\n\n"
        "Which step numbers are REQUIRED vs REDUNDANT?"
    )
    resp = openai.ChatCompletion.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": user}
        ],
        temperature=0.0,
        request_timeout=TIMEOUT_SECONDS
    )
    content = (resp.choices[0].message.content or "").strip()
    labels = _extract_json_object(content)
    if labels is None:
        return [], []
    return (
        _coerce_step_indices(labels.get("required")),
        _coerce_step_indices(labels.get("redundant")),
    )

def worker(row):
    """Label one row: split its chain of thought into steps and classify them.

    ``row`` must support ``row["question"]`` and ``row["chain_of_thought"]``.
    The chain is cut on blank lines (``"\\n\\n"``); each piece is stripped and
    empty pieces are discarded before being sent to ``classify_thoughts``.

    Returns the ``(required, redundant)`` pair from ``classify_thoughts``.
    """
    question = row["question"]
    pieces = (piece.strip() for piece in row["chain_of_thought"].split("\n\n"))
    steps = [piece for piece in pieces if piece]
    required, redundant = classify_thoughts(question, steps)
    return required, redundant

def main():
    """Label every chain in INPUT_DF_PATH and write the result to OUTPUT_CSV.

    Each row is submitted to ``worker`` on a thread pool of MAX_WORKERS
    threads. Four columns are added to the frame before saving:

    * ``required_indices`` / ``redundant_indices``: the step numbers returned
      by ``worker`` (``[]`` for a row whose request raised; the error is
      printed with the row position).
    * ``required_thoughts`` / ``redundant_thoughts``: the text of those steps,
      joined with blank lines.
    """
    df = pd.read_csv(INPUT_DF_PATH)
    n_rows = len(df)
    required_idxs  = [None] * n_rows
    redundant_idxs = [None] * n_rows

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as exe:
        # Submit all jobs up front; map each future back to its row position
        # because as_completed yields them in completion order.
        futures = {exe.submit(worker, df.iloc[i]): i for i in range(n_rows)}
        for fut in tqdm(as_completed(futures), total=len(futures), desc="Labeling chains"):
            idx = futures[fut]
            try:
                req, red = fut.result()
            except Exception as e:
                # Best effort: one failed request must not abort the whole run.
                req, red = [], []
                print(f"[{idx}] ERROR: {e}")
            required_idxs[idx]  = req
            redundant_idxs[idx] = red

    df["required_indices"]  = required_idxs
    df["redundant_indices"] = redundant_idxs

    def split_steps(chain):
        # Must mirror the segmentation used when the steps were numbered for
        # the model (strip each chunk, drop empty ones); otherwise a blank
        # chunk shifts every later index onto the wrong text.
        if not isinstance(chain, str):
            return []
        return [t.strip() for t in chain.split("\n\n") if t.strip()]

    def pick(inds, thoughts):
        # Ignore anything that is not a valid 1-based int index so malformed
        # model output cannot crash the run after all requests have been paid for.
        if not isinstance(inds, (list, tuple)):
            return ""
        return "\n\n".join(
            thoughts[i - 1]
            for i in inds
            if isinstance(i, int) and not isinstance(i, bool) and 1 <= i <= len(thoughts)
        )

    # Extract the text for required/redundant steps (split each chain once).
    steps_per_row = [split_steps(chain) for chain in df["chain_of_thought"]]
    df["required_thoughts"] = [
        pick(inds, steps) for inds, steps in zip(required_idxs, steps_per_row)
    ]
    df["redundant_thoughts"] = [
        pick(inds, steps) for inds, steps in zip(redundant_idxs, steps_per_row)
    ]

    df.to_csv(OUTPUT_CSV, index=False)
    print(f"✅ Saved labeled chains to {OUTPUT_CSV}")

# Run the labeling job when this code executes as the top-level module
# (which includes executing this cell in a notebook kernel).
if __name__ == "__main__":
    main()

Labeling chains:   0%|          | 0/1200 [00:00<?, ?it/s]

[167] ERROR: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=60)
✅ Saved labeled chains to gsm8k_chains_labeled.csv


In [9]:
# Reload the labeled chains that main() wrote to disk.
# NOTE: the index-list columns went through to_csv as text, so they come back
# as strings (e.g. "[1, 2]"), not Python lists.
df = pd.read_csv('gsm8k_chains_labeled.csv')

In [10]:
# Sanity check: column dtypes and non-null counts of the reloaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          1200 non-null   int64 
 1   idx                 1200 non-null   int64 
 2   question            1200 non-null   object
 3   reference_answer    1200 non-null   object
 4   chain_of_thought    1200 non-null   object
 5   token_lengths       1200 non-null   int64 
 6   num_thoughts        1200 non-null   int64 
 7   required_indices    1200 non-null   object
 8   redundant_indices   1200 non-null   object
 9   required_thoughts   1199 non-null   object
 10  redundant_thoughts  1198 non-null   object
dtypes: int64(4), object(7)
memory usage: 103.2+ KB


In [12]:
#!/usr/bin/env python3
import pandas as pd
from transformers import AutoTokenizer

# ─── Config ────────────────────────────────────────────────────────────────
# Hugging Face model id whose tokenizer is used for counting.
MODEL_NAME    = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Destination for the token-augmented frame.
# NOTE(review): this rebinds OUTPUT_CSV, which the labeling cell also defines
# with a different path — cell execution order matters for anything reading it.
OUTPUT_CSV    = "gsm8k_chains_labeled_with_tokens.csv"

# ─── Load the labeled chains DataFrame ────────────────────────────────────
df = pd.read_csv('gsm8k_chains_labeled.csv')

# ─── Initialize tokenizer ─────────────────────────────────────────────────
# NOTE(review): trust_remote_code=True permits running Python code shipped in
# the model repository while loading — confirm this tokenizer actually needs it.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# ─── Make sure empty entries are strings ───────────────────────────────────
# Missing cells in the text columns are replaced with "" (and an absent column
# is created as all-empty) so every value mapped through the tokenizer is a str.
for c in ["chain_of_thought", "required_thoughts", "redundant_thoughts"]:
    if c not in df.columns:
        df[c] = ""
    else:
        df[c] = df[c].fillna("")

# ─── Helper to count tokens (no special tokens) ────────────────────────────
def count_tokens(text: str) -> int:
    """Return how many token ids ``tokenizer`` produces for ``text``.

    Special tokens are not added, so only the text itself is counted.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return len(token_ids)

# ─── Compute and append the new columns ────────────────────────────────────
# Token counts for the full chain and for each labeled subset of it.
for text_col, count_col in (
    ("chain_of_thought",   "total_tokens"),
    ("required_thoughts",  "required_tokens"),
    ("redundant_thoughts", "redundant_tokens"),
):
    df[count_col] = df[text_col].map(count_tokens)

# ─── Percent of tokens that are redundant (could be saved) ───────────────
# A zero total is swapped for 1 so the division can never be by zero.
safe_total = df["total_tokens"].replace(0, 1)
redundant_share = df["redundant_tokens"] / safe_total
df["token_saving_pct"] = redundant_share * 100

# ─── Save the augmented DataFrame ─────────────────────────────────────────
df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Augmented DataFrame saved to {OUTPUT_CSV}")


✅ Augmented DataFrame saved to gsm8k_chains_labeled_with_tokens.csv


In [16]:
df['token_saving_pct'].median() # median per-chain share of tokens labeled redundant (~63% in this run); not the share of all tokens overall

63.367159478610795