In [2]:
import pickle
import pandas as pd
import os

def read_traces_to_dataframe(pickle_path):
    """
    Read a pickle file containing trace data and create a pandas DataFrame.
    
    Args:
        pickle_path: Path to the pickle file
        
    Returns:
        pandas.DataFrame: DataFrame with columns for each key in data["data"]
    """
    # Load the pickle file
    with open(pickle_path, "rb") as f:
        data = pickle.load(f)
    
    # Extract the data dictionary
    if "data" in data:
        data_dict = data["data"]
        # Convert to DataFrame
        df = pd.DataFrame(data_dict)
        return df
    else:
        raise ValueError("No 'data' key found in pickle file")


In [3]:
# Get the root directory
# Try to find root by going up from current directory until we find outputs/ directory
import pathlib

root_dir = pathlib.Path("/data/user_data/ssridha4/legible-traces-RL")

traces_dir = root_dir / "outputs" / "traces" / "gsm8k" / "train"

# List of file variants to process
variants = ["default", "numbered", "self_check", "structured"]
model_name = "Qwen_Qwen3-8B"

# Dictionary to store dataframes
dataframes = {}

# Read each file and create a dataframe
for variant in variants:
    filename = f"traces_{model_name}_{variant}.pkl"
    filepath = traces_dir / filename
    
    print(f"Reading {filename}...")
    df = read_traces_to_dataframe(str(filepath))
    dataframes[variant] = df
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    print()


Reading traces_Qwen_Qwen3-8B_default.pkl...
  Shape: (7473, 4)
  Columns: ['questions', 'completions', 'ground_truth_answers', 'finished']

Reading traces_Qwen_Qwen3-8B_numbered.pkl...
  Shape: (7473, 4)
  Columns: ['questions', 'completions', 'ground_truth_answers', 'finished']

Reading traces_Qwen_Qwen3-8B_self_check.pkl...
  Shape: (7473, 4)
  Columns: ['questions', 'completions', 'ground_truth_answers', 'finished']

Reading traces_Qwen_Qwen3-8B_structured.pkl...
  Shape: (7473, 4)
  Columns: ['questions', 'completions', 'ground_truth_answers', 'finished']



In [None]:
# Display summary information about each dataframe
for variant, df in dataframes.items():
    print(f"=== {variant.upper()} ===")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"First few rows:")
    print(df.head())
    print("\n")




In [5]:
print(dataframes["default"]["completions"][0])

<|im_start|>system
You are a helpful assistant that solves grade school math problems step by step. Read the question and formulate a response. Please reason step by step.

Formatting instructions:

Return your final answer followed by #### at the end of your response.
<|im_end|>
<|im_start|>user
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>
<|im_start|>assistant
<think>
Okay, let's see. Natalia sold clips to 48 friends in April. Then in May, she sold half as many as she did in April. The question is asking how many clips she sold altogether in April and May.

First, I need to figure out how many clips she sold in May. Since it's half as many as April, I should take the number from April, which is 48, and divide it by 2. Let me do that calculation: 48 divided by 2 equals 24. So, in May, she sold 24 clips.

Now, to find the total number of clips sold in both months, I 

In [11]:
import re

def extract_reasoning_trace(completion):
    """
    Extract content between <think> and </think> tags.
    
    Args:
        completion: String containing the completion text
        
    Returns:
        str: Extracted reasoning trace, or None if not found
    """
    if pd.isna(completion) or not isinstance(completion, str):
        return None

    # Case 1: <think> ... </think> (normal)
    pattern_full = r'<think>(.*?)</think>'
    match = re.search(pattern_full, completion, re.DOTALL)
    if match:
        return match.group(1).strip()

    # Case 2: <think> present but </think> missing
    pattern_open = r'<think>(.*)$'
    match = re.search(pattern_open, completion, re.DOTALL)
    if match:
        return match.group(1).strip()

    return None

def extract_predicted_answer(completion):
    """
    Extract the predicted answer following ####.
    Handles multiple edge cases, including:
    1. '#### <answer><|im_end|>' → stop at <|im_end|>.
    2. '<answer> ####<|im_end|>' → answer appears BEFORE ####.
    3. Fallbacks for malformed or missing <think> tags.
    """
    if pd.isna(completion) or not isinstance(completion, str):
        return None

    # === 1. Handle case: #### immediately followed by <|im_end|> ===
    # Example: "72\n####<|im_end|>"
    m = re.search(r'####\s*<\|im_end\|>', completion)
    if m:
        # Return the token immediately before ####
        before = completion[:m.start()]
        before = completion[:m.start()].rstrip()
        last_token = before.split()[-1]  # splits by whitespace
        return last_token

    # === 2. Normal case: #### ANSWER <|im_end|> ===
    # Stop at <|im_end|> if present
    m = re.search(r'####\s*(.*?)(?:<\|im_end\|>|$)', completion, flags=re.DOTALL)
    if m and m.group(1).strip():
        return m.group(1).strip()

    # === 3. Handle think-block priority ===
    # Prefer #### after </think>
    end_think = re.search(r'</think>', completion)
    if end_think:
        after = completion[end_think.end():]
        m = re.search(r'####\s*(.*)', after)
        if m:
            return m.group(1).strip()

    # # === 4. Fallback: #### after <think> ===
    # start_think = re.search(r'<think>', completion)
    # if start_think:
    #     after = completion[start_think.end():]
    #     m = re.search(r'####\s*(.*)', after)
    #     if m:
    #         return m.group(1).strip()

    # # === 5. Final fallback: #### anywhere ===
    # m = re.search(r'####\s*(.*)', completion)
    # if m:
    #     return m.group(1).strip()

    return None

In [12]:

# Extract reasoning_trace and predicted_answers for each dataframe
for variant, df in dataframes.items():
    print(f"Processing {variant}...")
    
    # Extract reasoning_trace
    df['reasoning_trace'] = df['completions'].apply(extract_reasoning_trace)
    
    # Extract predicted_answers
    df['predicted_answers'] = df['completions'].apply(extract_predicted_answer)
    
    print(f"  Extracted reasoning_trace: {df['reasoning_trace'].notna().sum()} / {len(df)}")
    print(f"  Extracted predicted_answers: {df['predicted_answers'].notna().sum()} / {len(df)}")
    print()

Processing default...
  Extracted reasoning_trace: 7473 / 7473
  Extracted predicted_answers: 7473 / 7473

Processing numbered...
  Extracted reasoning_trace: 7473 / 7473
  Extracted predicted_answers: 7473 / 7473

Processing self_check...
  Extracted reasoning_trace: 7473 / 7473
  Extracted predicted_answers: 7473 / 7473

Processing structured...
  Extracted reasoning_trace: 7473 / 7473
  Extracted predicted_answers: 7473 / 7473



In [13]:

# Display sample extractions for verification
for variant, df in dataframes.items():
    print(f"=== {variant.upper()} - Sample Extractions ===")
    sample_idx = 0
    print(f"\nSample {sample_idx}:")
    print(f"Reasoning trace (first 200 chars):")
    print(df.iloc[sample_idx]['completions'])
    if pd.notna(df.iloc[sample_idx]['reasoning_trace']):
        print(df.iloc[sample_idx]['reasoning_trace'])
    else:
        print("None")
    print(f"\nPredicted answer:")
    print(df.iloc[sample_idx]['predicted_answers'])
    print("\n" + "="*80 + "\n")

=== DEFAULT - Sample Extractions ===

Sample 0:
Reasoning trace (first 200 chars):
<|im_start|>system
You are a helpful assistant that solves grade school math problems step by step. Read the question and formulate a response. Please reason step by step.

Formatting instructions:

Return your final answer followed by #### at the end of your response.
<|im_end|>
<|im_start|>user
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>
<|im_start|>assistant
<think>
Okay, let's see. Natalia sold clips to 48 friends in April. Then in May, she sold half as many as she did in April. The question is asking how many clips she sold altogether in April and May.

First, I need to figure out how many clips she sold in May. Since it's half as many as April, I should take the number from April, which is 48, and divide it by 2. Let me do that calculation: 48 divided by 2 equals 24. So, in May,