In [None]:
# Download data from the llm_evals table at this psql link
import psycopg2
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process
# environment, then read the PostgreSQL connection string from PG_STR.
load_dotenv()
conn_str = os.getenv('PG_STR')
if not conn_str:
    # Fail here with a clear message instead of later inside psycopg2.connect.
    raise RuntimeError("PG_STR is not set; define it in the environment or in a .env file")


In [None]:

# Fetch every row of the llm_evals table into memory.
conn = psycopg2.connect(conn_str)

try:
    cursor = conn.cursor()

    cursor.execute("SELECT * FROM llm_evals")

    results = cursor.fetchall()
except BaseException:
    # The connection is normally closed by a later cell; if the query fails
    # (or is interrupted) that cell never runs, so release it here.
    conn.close()
    raise


In [None]:
# Release the database connection; the rows were already materialised in
# `results` by `cursor.fetchall()`.
conn.close()

In [None]:
import pandas as pd
# The first field of each cursor.description entry is the column name.
columns = [column_info[0] for column_info in cursor.description]

# Build the working frame from the fetched rows, labelled with those names.
df = pd.DataFrame(data=results, columns=columns)



In [None]:
# Fraction of each row's extracted answers that equal its label.
# `len(...) or 1` keeps rows with no extracted answers at 0.0 instead of
# dividing by zero. Iterating the two columns directly avoids building a
# Series per row (as `df.apply(axis=1)` does) and also works on an empty frame.
df["prop_correct"] = [
    answers.count(label) / (len(answers) or 1)
    for answers, label in zip(df["extracted_answers"], df["label"])
]

In [None]:
# List the distinct run names present in the table.
df.loc[:, "run_name"].unique()

In [None]:
# Restrict the table to the rows of a single evaluation run.
is_selected_run = df["run_name"].eq('DeepSeek-R1-Distill-Qwen-14B-AWQ-5origprompt-H100-16384-t0.5')
aime_run = df[is_selected_run]

In [None]:
# Distribution of prop_correct over the rows of the selected run.
# Series.hist returns the matplotlib Axes, which lets us label the figure so
# it can be read on its own.
ax = aime_run["prop_correct"].hist(bins=20)
ax.set(
    title="Distribution of prop_correct for the selected run",
    xlabel="prop_correct (fraction of extracted answers equal to the label)",
    ylabel="Number of rows",
);

In [None]:
# Rows answered correctly some of the time, but in fewer than 85% of the
# extracted answers (0 < prop_correct < 0.85).
# Vectorised comparison instead of a per-element lambda; `.copy()` makes the
# subset an independent frame, since later cells add columns to it.
aime_run_easy_medium = aime_run[
    (aime_run["prop_correct"] > 0) & (aime_run["prop_correct"] < 0.85)
].copy()

In [None]:
# Rows where none of the extracted answers matched the label (prop_correct == 0).
# Vectorised comparison instead of a per-element lambda; `.copy()` makes the
# subset an independent frame, since later cells add columns to it.
aime_run_hard = aime_run[aime_run["prop_correct"] == 0].copy()

In [None]:
from ast import literal_eval
def my_func(x):
    """Return the content of the user message stored in a `reasoning` value.

    Args:
        x: Either the Python-literal string form of the value or the
            already-parsed value itself. Element ``[0][1]`` must be a mapping
            with ``"role"`` and ``"content"`` keys (presumably the second
            message of the first conversation; confirm against the table).

    Returns:
        The ``"content"`` of element ``[0][1]``.

    Raises:
        TypeError: If the ``"role"`` of element ``[0][1]`` is not ``"user"``.
    """
    # Strings are parsed with literal_eval; anything else is assumed to be
    # the parsed structure already.
    parsed = literal_eval(x) if isinstance(x, str) else x
    message = parsed[0][1]
    if message["role"] != "user":
        raise TypeError(
            f"expected element [0][1] to have role 'user', got {message['role']!r}"
        )
    return message["content"]

# Extract the user prompt of every row into a new "problem" column.
# `assign` returns a new frame instead of writing into a filtered slice of
# `aime_run`, which avoids pandas' SettingWithCopyWarning. The deep copy of
# the "reasoning" column was unnecessary because `apply` does not modify it.
aime_run_easy_medium = aime_run_easy_medium.assign(
    problem=aime_run_easy_medium["reasoning"].apply(my_func)
)
aime_run_hard = aime_run_hard.assign(
    problem=aime_run_hard["reasoning"].apply(my_func)
)


In [None]:
# Render each label as an integer wrapped in LaTeX \boxed{...}.
# `assign` returns a new frame instead of writing into a filtered slice of
# `aime_run`, which avoids pandas' SettingWithCopyWarning.
aime_run_easy_medium = aime_run_easy_medium.assign(
    solution=aime_run_easy_medium["label"].apply(lambda x: f"\\boxed{{{int(x)}}}")
)
aime_run_hard = aime_run_hard.assign(
    solution=aime_run_hard["label"].apply(lambda x: f"\\boxed{{{int(x)}}}")
)


In [None]:
# Count, then remove, the problems whose text contains an "[asy]" tag.
# The tag is matched as a literal substring (regex=False), so no escaping is
# needed; the previous "\[asy\]" pattern relied on invalid escape sequences
# in a non-raw string. `na=False` treats missing problem text as "no tag".
# Each mask is computed once and reused for both the count and the filter.
has_asy_easy_medium = aime_run_easy_medium["problem"].str.contains("[asy]", regex=False, na=False)
has_asy_hard = aime_run_hard["problem"].str.contains("[asy]", regex=False, na=False)

asy_count_easy_medium = has_asy_easy_medium.sum()
asy_count_hard = has_asy_hard.sum()

print(f"Problems with [asy] tags in easy/medium set: {asy_count_easy_medium}")
print(f"Problems with [asy] tags in hard set: {asy_count_hard}")

# Remove these problems
aime_run_easy_medium = aime_run_easy_medium[~has_asy_easy_medium]
aime_run_hard = aime_run_hard[~has_asy_hard]



In [None]:
# Columns removed from both subsets before export.
dropped_columns = [
    'uuid', 'exec_time', 'runtime_s', 'p_id', 'run_name', 'prediction',
    'label', 'extracted_answers', 'reasoning', 'prop_correct',
]
aime_run_easy_medium = aime_run_easy_medium.drop(columns=dropped_columns)
aime_run_hard = aime_run_hard.drop(columns=dropped_columns)

In [None]:
# Tokenize the problems and solutions and report their max / mean token counts
from transformers import AutoTokenizer

# Tokenizer of the DeepSeek-R1-Distill-Qwen-14B checkpoint
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")

# From here on `aime_run` holds the easy/medium rows followed by the hard rows
aime_run = pd.concat([aime_run_easy_medium, aime_run_hard])


def _token_count(text):
    """Return the number of ids produced by `tokenizer.encode(text)`."""
    return len(tokenizer.encode(text))


problem_tokens = aime_run["problem"].apply(_token_count)
solution_tokens = aime_run["solution"].apply(_token_count)

print(f"Max problem tokens: {problem_tokens.max()}")
print(f"Mean problem tokens: {problem_tokens.mean():.1f}")
print(f"Max solution tokens: {solution_tokens.max()}")
print(f"Mean solution tokens: {solution_tokens.mean():.1f}")


In [None]:
# Show the first problem text as a sanity check.
aime_run["problem"].iat[0]

In [None]:
# Count the problems whose text ends with "modulo 1000. "
# (the compared suffix includes a trailing space).
sum(text.endswith("modulo 1000. ") for text in aime_run["problem"])

In [None]:
# Convert the easy/medium and hard subsets to Hugging Face datasets and push
# them to the Hub as private repositories.
from datasets import Dataset

# Hub access token, read from the HF_TOK environment variable.
hf_tok = os.getenv("HF_TOK")

# NOTE(review): only the first 10 rows of each subset are uploaded
# (`.iloc[:10]`) — confirm this truncation is intended.
for repo_id, subset in (
    ("samitizerxu/math-easy-medium-t", aime_run_easy_medium),
    ("samitizerxu/math-hard-t", aime_run_hard),
):
    # preserve_index=False: the frames were filtered, so their index is no
    # longer a plain RangeIndex and would otherwise be exported as an extra
    # "__index_level_0__" column.
    hf_dataset = Dataset.from_pandas(subset.iloc[:10], split="train", preserve_index=False)

    # Push to hub
    hf_dataset.push_to_hub(
        repo_id,
        private=True,
        token=hf_tok,
    )


In [None]:
# Push the combined frame to the Hub as a private repository.
# NOTE(review): only the first 10 rows are uploaded (`.iloc[:10]`) — confirm
# this truncation is intended.
# preserve_index=False: `aime_run` does not have a plain RangeIndex at this
# point, so its index would otherwise be exported as an extra
# "__index_level_0__" column.
hf_dataset = Dataset.from_pandas(aime_run.iloc[:10], split="train", preserve_index=False)

# Push to hub
hf_dataset.push_to_hub(
    "samitizerxu/math-all-t",
    private=True,
    token=hf_tok,  # read from the HF_TOK environment variable
)