In [2]:
import polars as pl

# The dataset is read from the Hugging Face Hub, so authenticate first
# (e.g. with `huggingface-cli login`).
OPEN_R1_EXTENDED_PARQUET = 'hf://datasets/open-r1/OpenR1-Math-220k/extended/train-*.parquet'
df = pl.read_parquet(OPEN_R1_EXTENDED_PARQUET)

In [3]:
# Convert the Polars frame to pandas; the following cells use the pandas API
# (Series.apply, boolean-mask filtering, DataFrame.to_csv).
open_r1_df = df.to_pandas()

In [17]:
import re
def extract_boxed_text(text):
    """Return the last integer-only ``boxed{...}`` answer in ``text``.

    Only boxes whose content is made up entirely of digits are considered,
    so a trailing non-integer box (e.g. a fraction) is skipped and an earlier
    integer box may be returned instead.

    Args:
        text: String to search.

    Returns:
        The value of the last all-digit box as an ``int`` when it lies in
        ``[0, 1000]``; otherwise ``None`` (no such box, or value too large).
    """
    matches = re.findall(r'boxed\{(\d+)\}', text)
    if not matches:
        return None
    try:
        value = int(matches[-1])
    except ValueError:
        # CPython >= 3.11 refuses to convert extremely long digit strings;
        # such a number is far above the accepted range anyway.
        return None
    # The pattern only captures digits, so the value is never negative.
    return value if value <= 1000 else None


def get_has_valid_answer(generations, min_length=500, max_length=15872):
    """Report whether any generation is usable.

    A generation is usable when its length lies in
    ``[min_length, max_length]`` (both inclusive) and ``extract_boxed_text``
    returns a value for it.

    Args:
        generations: Iterable of generation strings for one problem.
        min_length: Shortest accepted generation length.
        max_length: Longest accepted generation length.

    Returns:
        ``True`` if at least one generation passes both checks, else ``False``.
    """
    return any(
        min_length <= len(generation) <= max_length
        and extract_boxed_text(generation) is not None
        for generation in generations
    )

def get_has_valid_answer_index(generations, min_length=500, max_length=15872):
    """Return the position of the first usable generation.

    A generation is usable when its length lies in
    ``[min_length, max_length]`` (both inclusive) and ``extract_boxed_text``
    returns a value for it.

    Args:
        generations: Sequence of generation strings for one problem.
        min_length: Shortest accepted generation length.
        max_length: Longest accepted generation length.

    Returns:
        The zero-based index of the first generation passing both checks,
        or ``-1`` when none does.
    """
    for position, generation in enumerate(generations):
        if not (min_length <= len(generation) <= max_length):
            continue
        if extract_boxed_text(generation) is not None:
            return position
    return -1

# Row-wise flag: True when at least one generation of the problem is usable.
mask = open_r1_df['generations'].apply(get_has_valid_answer)

In [18]:
# Number of problems that passed the usability filter.
int(mask.sum())

29833

In [19]:
# Keep only the problems flagged by `mask`.
open_r1_df = open_r1_df.loc[mask]

In [20]:
# Inspect the current columns. The frame exported in the later cells keeps
# only "problem" and adds the new "reasoning" and "id" columns.

open_r1_df.columns

Index(['problem', 'solution', 'answer', 'problem_type', 'question_type',
       'source', 'uuid', 'is_reasoning_complete', 'generations',
       'correctness_math_verify', 'correctness_llama', 'finish_reasons',
       'correctness_count', 'messages'],
      dtype='object')

In [27]:
def _has_correct_generation(flags):
    """Return True when at least one entry of ``flags`` counts as correct."""
    return sum(flags) > 0


# Keep only problems with at least one math-verify-correct generation.
corr_mask = open_r1_df['correctness_math_verify'].apply(_has_correct_generation)
open_r1_df = open_r1_df.loc[corr_mask]

In [41]:
import numpy as np


def _select_reasoning(row):
    """Pick the generation exported as this row's reasoning trace.

    Prefers the first math-verify-correct generation that also passes
    ``get_has_valid_answer`` (length bounds plus an extractable boxed answer),
    so the exported trace satisfies the same criteria the rows were filtered
    on. When no correct generation passes, falls back to the first correct one.
    """
    generations = row['generations']
    correct_positions = np.where(row['correctness_math_verify'])[0]
    for position in correct_positions:
        candidate = generations[position]
        if get_has_valid_answer([candidate]):
            return candidate
    return generations[correct_positions[0]]


# Build a new frame with `assign` rather than setting columns on a filtered
# frame, which triggers pandas' SettingWithCopyWarning.
open_r1_df = open_r1_df.assign(
    reasoning=open_r1_df.apply(_select_reasoning, axis=1),
    # id has the form "<uuid>--<problem_type>-<source>".
    id=open_r1_df['uuid'] + '--' + open_r1_df['problem_type'] + '-' + open_r1_df['source'],
)

In [42]:
# Keep only the export columns. Selecting them (instead of dropping everything
# else in place) leaves the cell safe to re-run and avoids mutating a filtered
# frame; the column order matches what the in-place drop produced.
open_r1_df = open_r1_df[['problem', 'reasoning', 'id']]

In [43]:
# Write the exported frame to CSV without the index column.
TRACES_CSV_PATH = 'open_r1_traces.csv'
open_r1_df.to_csv(TRACES_CSV_PATH, index=False)