## Generate data

In [1]:
import pickle
import pandas as pd
import os

def read_traces_to_dataframe(pickle_path):
    """
    Load a pickled trace file and turn its "data" entry into a DataFrame.

    Args:
        pickle_path: Location of the pickle file on disk

    Returns:
        pandas.DataFrame: Built by handing the object stored under the
        "data" key directly to the DataFrame constructor

    Raises:
        ValueError: If the unpickled object has no "data" key
    """
    # NOTE: unpickling can execute arbitrary code; only open trusted files.
    with open(pickle_path, "rb") as handle:
        payload = pickle.load(handle)

    # Guard clause: bail out early when the expected key is absent
    if "data" not in payload:
        raise ValueError("No 'data' key found in pickle file")

    return pd.DataFrame(payload["data"])


In [None]:
# Locate the project root.
# The root is not auto-discovered: it defaults to a fixed, user-specific
# checkout and can be redirected by setting LEGIBLE_TRACES_ROOT before the
# kernel starts (e.g. to "/data/user_data/ssridha4/legible-traces-RL").
# NOTE(review): the "Analyze data" section defines its own root_dir; make sure
# both sections point at the same checkout.
import os
import pathlib

root_dir = pathlib.Path(
    os.environ.get(
        "LEGIBLE_TRACES_ROOT",
        "/user_data/mdhawan/projects/categorization/code/external/legible-traces-RL",
    )
)

model_name = "Qwen_Qwen3-8B"

# <root>/outputs/traces/gsm8k/<model_name>/test
traces_dir = root_dir / "outputs" / "traces" / "gsm8k" / model_name / "test"

# Variants to process; one pickle file is read per entry
variants = ["default", "numbered", "self_check", "structured"]

# Maps variant name -> DataFrame loaded from that variant's pickle
dataframes = {}

# Read each file and create a dataframe
for variant in variants:
    filename = f"traces_{model_name}_{variant}.pkl"
    filepath = traces_dir / filename

    print(f"Reading {filename}...")
    df = read_traces_to_dataframe(str(filepath))
    dataframes[variant] = df
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    print()

Reading traces_Qwen_Qwen3-8B_default.pkl...
  Shape: (7473, 4)
  Columns: ['questions', 'completions', 'ground_truth_answers', 'finished']

Reading traces_Qwen_Qwen3-8B_numbered.pkl...
  Shape: (7473, 4)
  Columns: ['questions', 'completions', 'ground_truth_answers', 'finished']

Reading traces_Qwen_Qwen3-8B_self_check.pkl...
  Shape: (7473, 4)
  Columns: ['questions', 'completions', 'ground_truth_answers', 'finished']

Reading traces_Qwen_Qwen3-8B_structured.pkl...
  Shape: (7473, 4)
  Columns: ['questions', 'completions', 'ground_truth_answers', 'finished']



In [None]:
# Print a short overview (shape, column names, head) of every loaded dataframe
for variant, df in dataframes.items():
    # One print call; sep="\n" puts each piece on its own line, and the
    # trailing "\n" argument reproduces the blank lines between variants.
    print(
        f"=== {variant.upper()} ===",
        f"Shape: {df.shape}",
        f"Columns: {list(df.columns)}",
        "First few rows:",
        df.head(),
        "\n",
        sep="\n",
    )




In [11]:
# Show the raw completion stored at index label 0 of the "default" variant
print(dataframes["default"].loc[0, "completions"])

<|im_start|>system
You are a helpful assistant that solves grade school math problems step by step. Read the question and formulate a response. Please reason step by step.

Formatting instructions:

Return your final answer followed by #### at the end of your response.
<|im_end|>
<|im_start|>user
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>
<|im_start|>assistant
<think>
Okay, let's see. Natalia sold clips to 48 friends in April. Then in May, she sold half as many as she did in April. The question is asking how many clips she sold altogether in April and May.

First, I need to figure out how many clips she sold in May. Since it's half as many as April, I should take the number from April, which is 48, and divide it by 2. Let me do that calculation: 48 divided by 2 equals 24. So, in May, she sold 24 clips.

Now, to find the total number of clips sold in both months, I 

In [29]:
import re

def extract_reasoning_trace(completion):
    """
    Extract content between <think> and </think> tags.

    If the opening tag is present but the closing tag is missing, everything
    after <think> up to the end of the string is returned instead.

    Args:
        completion: String containing the completion text

    Returns:
        str: Extracted reasoning trace with surrounding whitespace stripped,
        or None if the input is not a string or contains no <think> tag
    """
    # A type check alone covers None/NaN (neither is a str). Calling pd.isna
    # first would raise on list/array inputs, because its element-wise result
    # has no single truth value.
    if not isinstance(completion, str):
        return None

    # Case 1: <think> ... </think> (normal); non-greedy, so the first
    # closing tag ends the trace
    closed = re.search(r'<think>(.*?)</think>', completion, re.DOTALL)
    if closed:
        return closed.group(1).strip()

    # Case 2: <think> present but </think> missing
    unclosed = re.search(r'<think>(.*)$', completion, re.DOTALL)
    if unclosed:
        return unclosed.group(1).strip()

    return None

# First run of digits / dots / commas, optionally preceded by a minus sign
_ANSWER_NUMBER_RE = re.compile(r"(\-?[0-9\.\,]+)")


def _parse_answer_number(token):
    """Return the first numeric run in ``token`` as a float, or None if absent or malformed."""
    found = _ANSWER_NUMBER_RE.search(token)
    if not found:
        return None
    try:
        # Thousands separators are dropped before conversion
        return float(found.group(1).replace(",", ""))
    except ValueError:
        # e.g. "." or "1.2.3": matched by the pattern but not a valid float
        return None


def extract_predicted_answer(completion):
    """
    Extract the final answer from an LLM completion.

    Rules:
    1. Only look after the </think> tag. If </think> is missing, return None.
    2. Normal case: final answer comes after ####.
       - Extract the token immediately after ####
       - Stop at whitespace, newline, or <|im_end|>
    3. Edge case: if #### is immediately followed by <|im_end|>,
       - Extract the token immediately before ####

    Args:
        completion: Full completion text; non-string values yield None

    Returns:
        float: The parsed number, or None when no answer can be extracted
    """
    if not isinstance(completion, str):
        return None

    # Find </think>
    end_think = re.search(r'</think>', completion)
    if not end_think:
        return None

    text_after = completion[end_think.end():].lstrip()

    # Edge case: #### immediately followed by <|im_end|>
    # Checked first, so it takes precedence over the main case.
    edge_case_match = re.search(r'####\s*<\|im_end\|>', text_after)
    if edge_case_match:
        # Take the last whitespace-separated token before the marker
        tokens = text_after[:edge_case_match.start()].split()
        return _parse_answer_number(tokens[-1]) if tokens else None

    # Main case: #### ANSWER ...
    main_match = re.search(r'####\s*(\S+)', text_after)
    if main_match:
        # Remove <|im_end|> if it is glued to the answer token
        answer = main_match.group(1).split('<|im_end|>')[0]
        return _parse_answer_number(answer)

    return None


def extract_ground_truth_answer(completion):
    """
    Extract the final numeric answer from a ground-truth response.

    The answer is the run of digits, dots and commas (optionally preceded by
    a minus sign) that follows the first "#### " marker; commas are removed
    before conversion.

    Args:
        completion: Ground-truth solution text; non-string values yield None

    Returns:
        float: The parsed answer, or None if the marker is missing or the
        matched text is not a valid number
    """
    # Mirror the other extractors: missing values (None/NaN) are not an error
    if not isinstance(completion, str):
        return None

    match = re.search(r"#### (\-?[0-9\.\,]+)", completion)
    if match is None:
        return None

    try:
        return float(match.group(1).replace(",", ""))
    except ValueError:
        # Pattern matched something like "." or "1.2.3"
        return None

In [30]:

# Derive the reasoning_trace and predicted_answers columns for every variant.
# The new columns are written onto the frames held in `dataframes`.
for variant, df in dataframes.items():
    print(f"Processing {variant}...")

    completions = df['completions']
    df['reasoning_trace'] = completions.apply(extract_reasoning_trace)
    df['predicted_answers'] = completions.apply(extract_predicted_answer)

    # Report how many rows yielded a non-null value for each new column
    n_rows = len(df)
    n_traces = df['reasoning_trace'].notna().sum()
    n_answers = df['predicted_answers'].notna().sum()
    print(f"  Extracted reasoning_trace: {n_traces} / {n_rows}")
    print(f"  Extracted predicted_answers: {n_answers} / {n_rows}")
    print()

Processing default...
  Extracted reasoning_trace: 7473 / 7473
  Extracted predicted_answers: 6023 / 7473

Processing numbered...
  Extracted reasoning_trace: 7473 / 7473
  Extracted predicted_answers: 7146 / 7473

Processing self_check...
  Extracted reasoning_trace: 7473 / 7473
  Extracted predicted_answers: 6592 / 7473

Processing structured...
  Extracted reasoning_trace: 7473 / 7473
  Extracted predicted_answers: 6648 / 7473



In [31]:

# Show one sample per variant to sanity-check the extraction.
# The raw completion gets its own header, and the reasoning trace is cut to
# its first 200 characters so the output matches the printed label.
for variant, df in dataframes.items():
    print(f"=== {variant.upper()} - Sample Extractions ===")
    sample_idx = 0
    sample = df.iloc[sample_idx]
    print(f"\nSample {sample_idx}:")

    print("Completion:")
    print(sample['completions'])

    print("\nReasoning trace (first 200 chars):")
    trace = sample['reasoning_trace']
    print(trace[:200] if pd.notna(trace) else "None")

    print("\nPredicted answer:")
    print(sample['predicted_answers'])
    print("\n" + "="*80 + "\n")

=== DEFAULT - Sample Extractions ===

Sample 0:
Reasoning trace (first 200 chars):
<|im_start|>system
You are a helpful assistant that solves grade school math problems step by step. Read the question and formulate a response. Please reason step by step.

Formatting instructions:

Return your final answer followed by #### at the end of your response.
<|im_end|>
<|im_start|>user
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>
<|im_start|>assistant
<think>
Okay, let's see. Natalia sold clips to 48 friends in April. Then in May, she sold half as many as she did in April. The question is asking how many clips she sold altogether in April and May.

First, I need to figure out how many clips she sold in May. Since it's half as many as April, I should take the number from April, which is 48, and divide it by 2. Let me do that calculation: 48 divided by 2 equals 24. So, in May,

In [32]:
# Preview the first five rows of the "default" variant, new columns included
print(dataframes["default"].head(5))

                                           questions  \
0  Natalia sold clips to 48 of her friends in Apr...   
1  Weng earns $12 an hour for babysitting. Yester...   
2  Betty is saving money for a new wallet which c...   
3  Julie is reading a 120-page book. Yesterday, s...   
4  James writes a 3-page letter to 2 different fr...   

                                         completions  \
0  <|im_start|>system\nYou are a helpful assistan...   
1  <|im_start|>system\nYou are a helpful assistan...   
2  <|im_start|>system\nYou are a helpful assistan...   
3  <|im_start|>system\nYou are a helpful assistan...   
4  <|im_start|>system\nYou are a helpful assistan...   

                                ground_truth_answers  finished  \
0  Natalia sold 48/2 = <<48/2=24>>24 clips in May...      True   
1  Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...      True   
2  In the beginning, Betty has only 100 / 2 = $<<...      True   
3  Maila read 12 x 2 = <<12*2=24>>24 pages today....      True

In [34]:
# Persist every variant's dataframe as dataframes_<variant>.csv under
# <root_dir>/outputs/data, creating the directory if it does not exist yet
output_dir = root_dir.joinpath("outputs", "data")
output_dir.mkdir(parents=True, exist_ok=True)

for variant, df in dataframes.items():
    output_path = output_dir.joinpath(f"dataframes_{variant}.csv")
    # index=False: the row index is not written to the file
    df.to_csv(output_path, index=False)
    print(f"Saved {variant} to {output_path}")


Saved default to /data/user_data/ssridha4/legible-traces-RL/outputs/data/dataframes_default.csv
Saved numbered to /data/user_data/ssridha4/legible-traces-RL/outputs/data/dataframes_numbered.csv
Saved self_check to /data/user_data/ssridha4/legible-traces-RL/outputs/data/dataframes_self_check.csv
Saved structured to /data/user_data/ssridha4/legible-traces-RL/outputs/data/dataframes_structured.csv


## Analyze data

In [3]:
import pickle
import pandas as pd
import os

# Locate the project root.
# The root is not auto-discovered: it defaults to a fixed, user-specific
# checkout and can be redirected by setting LEGIBLE_TRACES_ROOT before the
# kernel starts.
# NOTE(review): the "Generate data" section defines its own root_dir; make
# sure this root is the checkout that actually holds the CSVs to analyze.
import pathlib

root_dir = pathlib.Path(
    os.environ.get("LEGIBLE_TRACES_ROOT", "/data/user_data/ssridha4/legible-traces-RL")
)

# Directory the per-variant CSV files are read from
data_dir = root_dir / "outputs" / "data"

# Variants to process; one CSV file is read per entry
variants = ["default", "numbered", "self_check", "structured"]

# Maps variant name -> DataFrame loaded from that variant's CSV.
# This rebinds `dataframes`, replacing any frames from earlier cells.
dataframes = {}

# Read each file and create a dataframe
for variant in variants:
    filename = f"dataframes_{variant}.csv"
    filepath = data_dir / filename

    print(f"Reading {filename}...")
    df = pd.read_csv(str(filepath))
    dataframes[variant] = df
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    print()

Reading dataframes_default.csv...
  Shape: (7473, 6)
  Columns: ['questions', 'completions', 'ground_truth_answers', 'finished', 'reasoning_trace', 'predicted_answers']

Reading dataframes_numbered.csv...
  Shape: (7473, 6)
  Columns: ['questions', 'completions', 'ground_truth_answers', 'finished', 'reasoning_trace', 'predicted_answers']

Reading dataframes_self_check.csv...
  Shape: (7473, 6)
  Columns: ['questions', 'completions', 'ground_truth_answers', 'finished', 'reasoning_trace', 'predicted_answers']

Reading dataframes_structured.csv...
  Shape: (7473, 6)
  Columns: ['questions', 'completions', 'ground_truth_answers', 'finished', 'reasoning_trace', 'predicted_answers']



In [35]:
# Mean reasoning-trace length (in characters) per variant.
# .str.len() yields NaN for missing traces instead of raising the way
# len(NaN) does, and mean() then skips those rows. Missing traces can appear
# once the frames have been reloaded from CSV, where empty fields are
# presumably read back as NaN (verify against the read_csv defaults in use).
for variant, df in dataframes.items():
    mean_trace_length = df['reasoning_trace'].str.len().mean()
    print(f"Average length of traces for {variant}: {mean_trace_length}")


Average length of traces for default: 2578.8067710424193
Average length of traces for numbered: 1776.3611668673893
Average length of traces for self_check: 3494.4274053258396
Average length of traces for structured: 2284.9035193362774


In [33]:
# Score every variant: parse the ground-truth answer, mark a row correct when
# the predicted answer equals it exactly, and report the fraction correct.
# Relies on extract_ground_truth_answer from the "Generate data" section
# having been defined in the running kernel.
for variant, df in dataframes.items():
    df['gt_answers'] = df['ground_truth_answers'].apply(extract_ground_truth_answer)
    # Element-wise equality between prediction and ground truth
    df['correct'] = df['predicted_answers'].eq(df['gt_answers'])
    accuracy = df['correct'].mean()
    print(f"Accuracy of predicted answers for {variant}: {accuracy}")

Accuracy of predicted answers for default: 0.7850930014719657
Accuracy of predicted answers for numbered: 0.9369731031714171
Accuracy of predicted answers for self_check: 0.8647129666800482
Accuracy of predicted answers for structured: 0.8731433159373746
