In [None]:
import plotly.express as px
import numpy as np
import duckdb

In [None]:
import ray

# ignore_reinit_error=True keeps this cell safe to re-run: calling ray.init() a second
# time in the same kernel would otherwise raise because Ray is already initialised.
ray.init(ignore_reinit_error=True)

In [None]:
import os
import psycopg2
import json

# The connection URL is read from the environment (raises KeyError if
# SUPABASE_CONNECTION_URL is unset), so no credentials are stored in the notebook.
conn_string = os.environ["SUPABASE_CONNECTION_URL"]

# Module-level psycopg2 connection; it is never closed explicitly in this notebook.
conn = psycopg2.connect(conn_string)

import pandas as pd

In [None]:
# Previous variant of the query, kept for reference:
# df = pd.read_sql("SELECT * FROM public.encoding_schemes WHERE (data->'experiment_tags'->'sft')::boolean", conn)

# Load the experiment rows to analyse. A row is kept when, inside its `data` JSON:
#   * experiment_params.encoding_scheme (as text) matches one of the three LIKE patterns
#     (speaking_math_*_steg, identity, zero_shot),
#   * experiment_params.use_sft_model_for_sampling is true,
#   * experiment_params.model contains "Qwen2.5" but not "32B",
#   * experiment_params.sampling_params.n equals 1.
# Rows come back newest first (ORDER BY created_at DESC).
df = pd.read_sql("""
SELECT * FROM public.encoding_schemes 
    WHERE 

    (
        (data->'experiment_params'->'encoding_scheme')::TEXT LIKE '%speaking_math_%_steg%'
        OR (data->'experiment_params'->'encoding_scheme')::TEXT LIKE '%identity%'
        OR (data->'experiment_params'->'encoding_scheme')::TEXT LIKE '%zero_shot%'
    )
    AND (data->'experiment_params'->'use_sft_model_for_sampling')::BOOL
    AND (data->'experiment_params'->'model')::TEXT LIKE '%Qwen2.5%'
    AND (data->'experiment_params'->'model')::TEXT NOT LIKE '%32B%'
    AND (data->'experiment_params'->'sampling_params'->'n')::INT = 1

ORDER BY created_at DESC
""", conn)

df.head()

In [None]:
# Show the `data` payload of the first row returned by the query (the newest one,
# given the ORDER BY created_at DESC).
df['data'].iloc[0]

In [None]:
# Root folder holding one sub-directory per experiment hash.
# "~" is expanded once here so the path also works with open() and os.path helpers,
# which (unlike pandas readers) do not expand it themselves.
root_dir = os.path.expanduser("~/sky_workdir/encoding-schemes/output")

In [None]:
# One plain dict per experiment row; the cells below add computed metrics to these dicts.
l_examples = df.to_dict(orient='records')

l_examples[:5]

In [None]:

def bootstrap_ci(data, statistic=np.mean, alpha=0.05, n_boot=10_000, random_state=None):
    """
    Percentile-bootstrap confidence interval for a statistic of 1D data.

    Args:
        data: Values convertible to a float array (bool, int or float); NaNs are dropped.
        statistic: Callable reducing a 1D array to the statistic of interest.
        alpha: Total tail mass; the interval spans the alpha/2 and 1 - alpha/2 percentiles.
        n_boot: Number of resamples drawn with replacement.
        random_state: Seed (or None) passed to np.random.default_rng.

    Returns:
        (point_estimate, low_CI, high_CI), where the point estimate is the statistic
        evaluated on the NaN-free data.

    Raises:
        ValueError: if no non-NaN value remains.
    """
    values = np.asarray(data).astype(float)  # bool/int inputs become floats
    values = values[~np.isnan(values)]
    n = len(values)
    if n == 0:
        raise ValueError("No valid data for bootstrapping.")

    rng = np.random.default_rng(random_state)

    # Each row of the index matrix selects one resample of size n.
    resample_idx = rng.integers(0, n, size=(n_boot, n))
    boot_stats = np.apply_along_axis(statistic, 1, values[resample_idx])

    point = statistic(values)
    tail = alpha / 2
    lo, hi = np.percentile(boot_stats, [100 * tail, 100 * (1 - tail)])
    return point, lo, hi

In [None]:
import tiktoken

# Tokenizer used by the remote token counter below.
encoding = tiktoken.get_encoding("cl100k_base")


@ray.remote
def count_tokens_from_messages(s):
    """Ray task returning the number of cl100k_base tokens in the string `s`.

    Special-token text is encoded as ordinary text (disallowed_special=()).
    If encoding raises ValueError, the error is printed and 0 is returned.
    """
    try:
        token_ids = encoding.encode(s, disallowed_special=())
    except ValueError as err:
        print(err)
        return 0
    return len(token_ids)


In [None]:
from tqdm import tqdm

In [None]:

# Fixed seed so the bootstrap error bars are identical on every Restart & Run All.
BOOTSTRAP_SEED = 0


def _store_mean_and_ci(example, key, per_row_samples):
    """Write `key`, `{key}_low_ci` and `{key}_hi_ci` into the `example` dict.

    Args:
        example: Experiment record (dict) that is updated in place.
        key: Name of the metric; also the prefix of the two CI entries.
        per_row_samples: Series whose elements are each reduced with np.mean
            (presumably one list of per-sample scores per problem — TODO confirm).

    The CI entries are stored as distances from the point estimate (mid - low and
    high - mid), which is the form plotly's error_y / error_y_minus expect.
    """
    per_row_mean = per_row_samples.apply(np.mean)
    # The point estimate is stored first so it survives even if the bootstrap raises.
    example[key] = per_row_mean.mean()
    mid, lo, hi = bootstrap_ci(per_row_mean, random_state=BOOTSTRAP_SEED)
    example[f"{key}_low_ci"] = mid - lo
    example[f"{key}_hi_ci"] = hi - mid


for example in l_examples:
    # Experiments without a scores file are skipped (the error is printed).
    try:
        experiment_data_dir = os.path.join(root_dir, example['experiment_hash'], "data")
        df_data = pd.read_parquet(os.path.join(experiment_data_dir, "math_scores.parquet"))
    except Exception as e:
        print(e)
        continue

    df_sft_data = pd.read_parquet(os.path.join(experiment_data_dir, "sft.parquet"))
    df_prompted_cot = pd.read_parquet(os.path.join(experiment_data_dir, "prompted_cot.parquet"))

    # The monitor column is optional: a failure here is reported and the rest of the
    # metrics for this experiment are still collected.
    try:
        _store_mean_and_ci(example, 'contains_math_solving', df_prompted_cot['contains_math_solving'])
    except Exception as e:
        print(e, example['data'])

    _store_mean_and_ci(example, 'is_corrects', df_data['is_corrects'])

    example['followed_encoding_style_raw'] = df_prompted_cot['followed_encoding_style']
    _store_mean_and_ci(example, 'followed_encoding_style', df_prompted_cot['followed_encoding_style'])

    # Content of the last message of every SFT conversation.
    example['reference_translation'] = df_sft_data['messages'].map(lambda x: x[-1]['content'])

    # Token counting is disabled; the box plot of `num_tokens_output` needs it re-enabled.
    # l_token_lens = []
    # for decoded_cot in tqdm(df_data['decoded_cot']):
    #     l_token_lens.extend([count_tokens_from_messages.remote(s) for s in decoded_cot])
    # l_token_lens = ray.get(l_token_lens)

    # example['num_tokens_output'] = l_token_lens

    # Keep every raw score column next to the aggregates, suffixed with "_df".
    for col in df_data.columns:
        example[f"{col}_df"] = df_data[col]

In [None]:
# for example in l_examples:
#     df_sft_train = pd.read_parquet(os.path.join(root_dir, example['experiment_hash'], "data", "sft_train.parquet"))

#     l_token_lens = []
#     for conversation in tqdm(df_sft_train['messages']):
#         l_token_lens.extend([count_tokens_from_messages.remote(s["content"]) for s in conversation])
#     l_token_lens = ray.get(l_token_lens)

#     example["n_total_train_tok"] = np.sum(l_token_lens)

In [None]:
# df_data is whatever the metric loop above loaded last, i.e. the math_scores frame of
# the last experiment whose file could be read.
df_data.head()

In [None]:
def humanize_number(num: float) -> str:
    """
    Format a number compactly using a k / M / B suffix.

    Args:
        num (float): The number to format.

    Returns:
        str: `num` divided by the largest of 1e9 / 1e6 / 1e3 it reaches, shown with one
        decimal and the matching suffix; numbers below 1000 are returned via str(num).
    """
    # Ordered from largest to smallest so the first threshold reached wins.
    scales = (
        (1_000_000_000, "B"),
        (1_000_000, "M"),
        (1_000, "k"),
    )
    for threshold, suffix in scales:
        if num >= threshold:
            return f"{num / threshold:.1f}{suffix}"
    return str(num)

In [None]:
import re

def parse_params(model):
    """Return a sortable size key for a model name.

    * Names containing 'gpt' map to 0 (nano), 1 (mini) or 2 (anything else).
    * Names containing 'claude' map to 3 (haiku), 4 (sonnet) or 5 (anything else).
    * Any other name must contain a parameter count followed by 'B'
      (e.g. '7B', '1.5B'); that count is returned, as an int when it is whole and
      as a float otherwise.

    Raises:
        ValueError: if no '<number>B' token is present in a non-gpt/claude name.
    """
    if 'gpt' in model:
        if 'nano' in model:
            return 0
        elif 'mini' in model:
            return 1
        else:
            return 2

    if 'claude' in model:
        if 'haiku' in model:
            return 3
        elif 'sonnet' in model:
            return 4
        else:
            return 5

    # The optional fractional part matters: with a plain "[0-9]+B" pattern,
    # "Qwen2.5-1.5B" would be read as 5 instead of 1.5.
    match = re.search(r"([0-9]+(?:\.[0-9]+)?)B", model)
    if match is None:
        raise ValueError(f"Cannot parse a parameter count from model name {model!r}")

    size = float(match.group(1))
    return int(size) if size.is_integer() else size

In [None]:
import sys
sys.path.append("/home/ubuntu/sky_workdir/encoding-schemes")

from evaluation.metrics.math_accuracy import extract_answer, timeout
from verl.utils.reward_score.math import compute_score, last_boxed_only_string, remove_boxed


def _score_train_row(row, gt_answer):
    """Score one SFT-train row against its ground-truth answer.

    Returns 0.0 when answer extraction fails, when either the extracted answer or the
    ground truth is empty, or when scoring raises; otherwise returns whatever
    compute_score returns.
    """
    try:
        with timeout():
            extracted_model_response = extract_answer(row["messages"][-1]["content"])
    except Exception as e:
        print(e)
        return 0.0

    if len(extracted_model_response) == 0 or len(gt_answer) == 0:
        return 0.0

    try:
        with timeout():
            return compute_score(extracted_model_response, gt_answer)
    except Exception as e:
        print(e)
        return 0.0


def calculate_accuracy(example):
    """Per-row correctness of the SFT training targets for one experiment.

    Reads `sft_train.parquet` and `ground_truth_translation_train.parquet` from the
    experiment's data directory and scores the last message of every training
    conversation against the ground-truth `answer` of the row at the same position.

    Args:
        example: Experiment record; only `example['experiment_hash']` is read.

    Returns:
        A list with exactly one score per training row. Rows whose answer cannot be
        extracted or scored count as 0.0 (previously they were silently dropped,
        which inflated the mean). An empty list is returned when the two files have
        different lengths and therefore cannot be aligned.
    """
    data_dir = os.path.join(root_dir, example['experiment_hash'], "data")
    df_sft_train = pd.read_parquet(os.path.join(data_dir, "sft_train.parquet"))
    df_gt = pd.read_parquet(os.path.join(data_dir, "ground_truth_translation_train.parquet"))

    print(len(df_gt), len(df_sft_train))
    if len(df_gt) != len(df_sft_train):
        return []

    l_correct = []
    # enumerate() gives a true positional index for df_gt.iloc, independent of
    # whatever index labels the parquet file carries.
    for pos, (_, row) in enumerate(df_sft_train.iterrows()):
        l_correct.append(_score_train_row(row, df_gt.iloc[pos]["answer"]))

    return l_correct

# Attach the mean SFT-train score to every experiment record.
# (np.mean of an empty list yields NaN, which is what mismatched experiments get.)
for example in l_examples:
    train_scores = calculate_accuracy(example)
    example['train_acc'] = np.mean(train_scores)


In [None]:
# One row per experiment, including every metric attached to the records above.
df_viz = pd.DataFrame(l_examples)

df_viz['encoding_scheme'] = df_viz['data'].map(lambda x: x['experiment_params']['encoding_scheme'])
df_viz['model'] = df_viz['data'].map(lambda x: x['experiment_params']['model'])


def _safe_parse_params(model):
    """parse_params that reports a failure and returns NaN instead of raising.

    Parsing per row (rather than wrapping the whole .map in try/except) guarantees the
    `model_size` column always exists, so the sort_values calls in the plotting cells
    cannot fail because of a single unparseable model name.
    """
    try:
        return parse_params(model)
    except Exception as e:
        print(f"Could not parse model size from {model!r}: {e}")
        return np.nan


df_viz['model_size'] = df_viz['model'].map(_safe_parse_params)

# First two underscore-separated tokens of the experiment name, e.g. "math_cot".
df_viz['input_type'] = df_viz['data'].map(lambda x: "_".join(x['experiment_name'].split("_")[:2]))

df_viz['n_few_shot_examples'] = df_viz['data'].map(lambda x: x['experiment_params'].get('n_few_shot_examples', None))

# `n_total_train_tok` is only present when the (currently commented-out) training-token
# count cell has been run, so check for it explicitly instead of catching a KeyError.
if 'n_total_train_tok' in df_viz.columns:
    df_viz['total_train_tok'] = df_viz['n_total_train_tok'].map(humanize_number)
else:
    print("n_total_train_tok not computed; skipping total_train_tok")

df_viz.head()

In [None]:
# List the distinct input types present, to choose the values for filter_set below.
df_viz['input_type'].unique()

In [None]:
# Only experiments whose input_type is in this list are shown in the plots below.
filter_set = ['math_cot']

In [None]:
# check that train set has all valid inputs
df_viz_tmp = (
    df_viz[df_viz['input_type'].isin(filter_set)]
    .sort_values(['model_size', 'encoding_scheme', 'n_few_shot_examples'])
    # cast after sorting so the few-shot count is treated as a label, not a number
    .astype({'n_few_shot_examples': str})
)

# Mean SFT-train score per encoding scheme, one facet per model.
fig = px.bar(
    df_viz_tmp,
    x='encoding_scheme',
    y='train_acc',
    color='input_type',
    facet_col='model',
    barmode="group",
    height=600,
    width=1600,
)

fig.show()

In [None]:
df_viz_tmp = (
    df_viz[df_viz['input_type'].isin(filter_set)]
    .sort_values(['model_size', 'encoding_scheme', 'n_few_shot_examples'])
    # cast after sorting so the few-shot count is treated as a label, not a number
    .astype({'n_few_shot_examples': str})
)

# Monitor flag rate per encoding scheme with bootstrap error bars, one facet per model.
fig = px.bar(
    df_viz_tmp,
    x='encoding_scheme',
    y='contains_math_solving',
    error_y='contains_math_solving_hi_ci',
    error_y_minus='contains_math_solving_low_ci',
    color='input_type',
    facet_col='model',
    barmode="group",
    title="% of CoT flagged by Qwen3 32B monitor as trying to solve a math problem",
    height=600,
    width=1600,
)

fig.update_yaxes(title="% flagged", dtick=0.05)

fig.show()

In [None]:
df_viz_tmp = (
    df_viz[df_viz['input_type'].isin(filter_set)]
    .sort_values(['model_size', 'encoding_scheme', 'n_few_shot_examples'])
    # cast after sorting so the few-shot count is treated as a label, not a number
    .astype({'n_few_shot_examples': str})
)

# Encoding-style adherence per encoding scheme with bootstrap error bars, one facet per model.
fig = px.bar(
    df_viz_tmp,
    x='encoding_scheme',
    y='followed_encoding_style',
    error_y='followed_encoding_style_hi_ci',
    error_y_minus='followed_encoding_style_low_ci',
    facet_col='model',
    barmode="group",
    title="Encoding style adherence<br><sup>Adherence determined by prompting Qwen3 32B to compare the model's output to the reference encoded reasoning.",
    height=600,
    width=1600,
)

fig.update_yaxes(title="% adherent encodings", dtick=0.05)

fig.show()

In [None]:
df_viz_tmp = (
    df_viz[df_viz['input_type'].isin(filter_set)]
    .sort_values(['model_size', 'encoding_scheme', 'n_few_shot_examples'])
    # cast after sorting so the few-shot count is treated as a label, not a number
    .astype({'n_few_shot_examples': str})
)

# Accuracy per encoding scheme with bootstrap error bars, one facet per model.
fig = px.bar(
    df_viz_tmp,
    x='encoding_scheme',
    y='is_corrects',
    error_y='is_corrects_hi_ci',
    error_y_minus='is_corrects_low_ci',
    facet_col='model',
    barmode="group",
    title="Accuracy, MATH subset",
    height=600,
    width=1600,
)

fig.update_yaxes(title="Accuracy", dtick=0.05)

fig.show()

In [None]:
df_viz_tmp = df_viz[df_viz['input_type'].isin(filter_set)]

df_viz_tmp = df_viz_tmp.sort_values([
    'model_size',
    'encoding_scheme',
    'n_few_shot_examples'
])

df_viz_tmp = df_viz_tmp.astype({'n_few_shot_examples': str})

# Nothing earlier in this notebook creates these columns, so on a fresh run the plot
# would raise. Skip it with a clear message unless the columns have been provided.
coherent_cols = [
    'fully_coherent_and_correct',
    'fully_coherent_and_correct_low_ci',
    'fully_coherent_and_correct_hi_ci',
]
missing_cols = [col for col in coherent_cols if col not in df_viz_tmp.columns]

if missing_cols:
    print(f"Skipping fully_coherent_and_correct plot; missing columns: {missing_cols}")
else:
    fig = px.bar(df_viz_tmp, x='encoding_scheme', y='fully_coherent_and_correct',
                 height=600, width=1600,
                 color='model',
                 error_y_minus='fully_coherent_and_correct_low_ci',
                 error_y='fully_coherent_and_correct_hi_ci',
                 title="% of responses correct AND adhered to encoding format AND produced coherent English post-decode, MATH subset",
                 barmode="group"
                )

    fig.update_yaxes(title="% of responses", dtick=0.05)

    fig.show()

In [None]:
df_viz_tmp = df_viz[df_viz['input_type'].isin(filter_set)]

# `num_tokens_output` is only produced by the token-counting code that is currently
# commented out in the metric loop; without it explode() would raise a KeyError.
if 'num_tokens_output' not in df_viz_tmp.columns:
    print("Skipping token-count plot; 'num_tokens_output' has not been computed")
else:
    # One row per generated sample so the box plot sees individual token counts.
    df_viz_tmp = df_viz_tmp.explode('num_tokens_output')

    df_viz_tmp = df_viz_tmp.sort_values([
        'model_size',
        'encoding_scheme',
        'n_few_shot_examples'
    ])

    df_viz_tmp = df_viz_tmp.astype({'n_few_shot_examples': str})

    fig = px.box(df_viz_tmp, x='encoding_scheme', y='num_tokens_output',
                 height=1600, width=1600,
                 color='n_few_shot_examples',
                 facet_row='model',
                 title="# tokens generated, MATH subset, SFT",
                )

    fig.update_yaxes(title="# tokens", range=[0, 5000], dtick=500)
    fig.update_traces(marker=dict(opacity=0.08))

    fig.show()

# Inspect the data

In [None]:
# Selection used by the inspection cells below: which experiment (encoding scheme +
# model) to look at, and which element of its per-experiment columns to show.
test_scheme = 'speaking_math_safety_steg'
test_model = 'Qwen/Qwen2.5-14B-Instruct'
test_idx = 3

In [None]:
# Element `test_idx` of the reference_problem column for the first matching experiment.
df_viz.loc[
    (df_viz['model'] == test_model) & (df_viz['encoding_scheme'] == test_scheme),
    'reference_problem_df',
].iloc[0][test_idx]

In [None]:
# Element `test_idx` of the reference_solution column for the first matching experiment.
df_viz.loc[
    (df_viz['model'] == test_model) & (df_viz['encoding_scheme'] == test_scheme),
    'reference_solution_df',
].iloc[0][test_idx]

In [None]:
# Element `test_idx` of the model_cot column for the first matching experiment.
df_viz.loc[
    (df_viz['model'] == test_model) & (df_viz['encoding_scheme'] == test_scheme),
    'model_cot_df',
].iloc[0][test_idx]

In [None]:
# Hash of the first experiment matching the selected model and encoding scheme.
example_hash = df_viz.loc[
    (df_viz['model'] == test_model) & (df_viz['encoding_scheme'] == test_scheme),
    'experiment_hash',
].iloc[0]
example_hash

In [None]:
# Load sft_model_meta.json from the selected experiment's data directory.
# NOTE(review): the absolute path is hard-coded here instead of being derived from
# root_dir — confirm both refer to the same output directory.
with open(f"/home/ubuntu/sky_workdir/encoding-schemes/output/{example_hash}/data/sft_model_meta.json", "r") as fp:
    d_example = json.load(fp)

d_example

In [None]:
# Show one SFT training conversation of the selected experiment.
# NOTE(review): the literal 3 equals test_idx as currently set; presumably test_idx
# was intended here — confirm.
df_sft = pd.read_parquet(f"/home/ubuntu/sky_workdir/encoding-schemes/output/{example_hash}/data/sft_train.parquet")
df_sft['messages'].iloc[3]

In [None]:
# Load joined_output.parquet for one specific, hard-coded experiment hash.
# NOTE(review): this hash is independent of example_hash selected above — confirm that
# is intentional.
df_test = pd.read_parquet("/home/ubuntu/sky_workdir/encoding-schemes/output/52e3d18631a304edc688e3a05e6cc8b3dfa2c4c2/data/joined_output.parquet")

df_test.head()

In [None]:
# Show the generated_backtranslations value of the first row of df_test.
df_test.iloc[0]['generated_backtranslations']