In [None]:
import plotly.express as px
import numpy as np

In [None]:
import os
import psycopg2
import json

# Read the Postgres connection string from the environment so that no
# credentials are stored in the notebook; a missing variable raises KeyError.
conn_string = os.environ["SUPABASE_CONNECTION_URL"]

# Open the database connection that the query cell below passes to pd.read_sql.
conn = psycopg2.connect(conn_string)

import pandas as pd

In [None]:
# Load every experiment row whose name contains "conversational_" and whose
# dataset parameter contains "lmsys_chat_1m_1_turn_english_subset", newest first.
#
# Notes on the filter:
#   * `->>` extracts the JSON value as plain text (no surrounding quotes),
#     instead of casting the JSON node itself to text.
#   * In a LIKE pattern a bare "_" is a single-character wildcard, so the
#     underscores are escaped (with "!" declared as the escape character) to
#     match literal underscores only.
#
# Alternative filter (all experiments tagged as SFT):
# df = pd.read_sql("SELECT * FROM public.encoding_schemes WHERE (data->'experiment_tags'->'sft')::boolean", conn)
df = pd.read_sql("""
SELECT * FROM public.encoding_schemes
    WHERE
    (data->>'experiment_name') LIKE '%conversational!_%' ESCAPE '!'
    AND (data->'experiment_params'->>'dataset') LIKE '%lmsys!_chat!_1m!_1!_turn!_english!_subset%' ESCAPE '!'
ORDER BY created_at DESC
""", conn)

df.head()

In [None]:
# Inspect the `data` payload of the first (most recently created) experiment row.
df['data'].iloc[0]

In [None]:
# Base output directory; the loading loop below joins it with each experiment's
# `experiment_hash` and a "data" sub-directory to locate the parquet files.
# NOTE(review): the "~" is not expanded here — this relies on the reader
# (pd.read_parquet) expanding the user directory; confirm if the path is reused elsewhere.
root_dir = "~/sky_workdir/encoding-schemes/output"

In [None]:
# One plain dict per experiment row; later cells attach computed metrics to
# these dicts in place before turning them back into a DataFrame.
l_examples = df.to_dict('records')

# Preview the first few records.
l_examples[:5]

In [None]:

def bootstrap_ci(data, statistic=np.mean, alpha=0.05, n_boot=10_000, random_state=None):
    """
    Percentile-bootstrap confidence interval for a statistic of 1D data.

    Parameters
    ----------
    data : array-like of bool, int, or float
        Observations. They are converted to float and NaN entries are dropped.
    statistic : callable, default ``np.mean``
        Maps a 1D array to a scalar. If it also accepts an ``axis`` keyword
        (as NumPy reductions do), all resamples are reduced in a single
        vectorized call; otherwise it is applied to each resample in turn.
    alpha : float, default 0.05
        Significance level; the interval spans the ``alpha / 2`` and
        ``1 - alpha / 2`` percentiles of the bootstrap distribution.
        Must lie in ``[0, 1]``.
    n_boot : int, default 10_000
        Number of bootstrap resamples. Must be at least 1.
    random_state : optional
        Seed (or anything else accepted by ``np.random.default_rng``).
        Pass a fixed value to get reproducible intervals.

    Returns
    -------
    tuple
        ``(point_estimate, low_CI, high_CI)`` where ``point_estimate`` is the
        statistic of the NaN-free data and the other two are the interval
        bounds (absolute values, not distances from the point estimate).

    Raises
    ------
    ValueError
        If no non-NaN observations remain, ``alpha`` is outside ``[0, 1]``,
        or ``n_boot`` is smaller than 1.
    """
    if not 0 <= alpha <= 1:
        raise ValueError(f"alpha must be between 0 and 1, got {alpha!r}.")
    if n_boot < 1:
        raise ValueError(f"n_boot must be at least 1, got {n_boot!r}.")

    x = np.asarray(data).astype(float).ravel()  # ensure numeric, flat
    x = x[~np.isnan(x)]
    if len(x) == 0:
        raise ValueError("No valid data for bootstrapping.")

    rng = np.random.default_rng(random_state)
    n = len(x)

    # Draw every resample at once: row i holds the indices of resample i.
    idx = rng.integers(0, n, size=(n_boot, n))
    samples = x[idx]

    # Fast path: reduce all rows in one call when the statistic understands
    # `axis`. Anything that rejects the keyword, or that does not return one
    # value per resample, falls back to the generic row-by-row application.
    try:
        stats = np.asarray(statistic(samples, axis=1))
    except TypeError:
        stats = None
    if stats is None or stats.shape != (n_boot,):
        stats = np.apply_along_axis(statistic, 1, samples)

    point = statistic(x)
    lo = np.percentile(stats, 100 * (alpha / 2))
    hi = np.percentile(stats, 100 * (1 - alpha / 2))
    return point, lo, hi

In [None]:

# Fixed seed for the bootstrap so the confidence intervals (and therefore the
# error bars in the figures below) are identical on every Restart & Run All.
BOOTSTRAP_SEED = 0

# Attach summary metrics, bootstrap CIs and the raw per-row columns to every
# experiment record (the dicts in `l_examples` are mutated in place).
for example in l_examples:
    # Each experiment's artefacts live under <root_dir>/<experiment_hash>/data.
    data_dir = os.path.join(root_dir, example['experiment_hash'], "data")
    df_sft_data = pd.read_parquet(os.path.join(data_dir, "sft.parquet"))
    df_prompted_cot = pd.read_parquet(os.path.join(data_dir, "prompted_chat.parquet"))

    # Coherence: each cell is array-like, so average within a row first and
    # reuse that per-row series for both the overall mean and its CI.
    # The CI is stored as distances from the point estimate, which is what
    # plotly's error_y / error_y_minus expect.
    coherence_per_row = df_prompted_cot['english_coherence_scores'].apply(np.mean)
    example['english_coherence_scores'] = coherence_per_row.mean()
    mid, lo, hi = bootstrap_ci(coherence_per_row, random_state=BOOTSTRAP_SEED)
    example['english_coherence_scores_low_ci'] = mid - lo
    example['english_coherence_scores_hi_ci'] = hi - mid

    # Despite the key name, this is the negated sum of the ground-truth
    # log-probabilities for each row (a total log loss), not a perplexity.
    ppls = [-np.sum(logprobs) for logprobs in df_prompted_cot['gt_logprobs']]
    example['ppls'] = np.array(ppls)

    # Encoding-style adherence, handled the same way as coherence.
    adherence_per_row = df_prompted_cot['followed_encoding_style'].apply(np.mean)
    example['followed_encoding_style'] = adherence_per_row.mean()
    mid, lo, hi = bootstrap_ci(adherence_per_row, random_state=BOOTSTRAP_SEED)
    example['followed_encoding_style_low_ci'] = mid - lo
    example['followed_encoding_style_hi_ci'] = hi - mid

    # Joint metric: element-wise AND of the two flag arrays within each row,
    # averaged per row.
    l_coherent_and_formatted = [
        (coherent.astype(bool) & formatted.astype(bool)).mean()
        for coherent, formatted in zip(
            df_prompted_cot['english_coherence_scores'],
            df_prompted_cot['followed_encoding_style'],
        )
    ]
    example['coherent_and_formatted'] = np.mean(l_coherent_and_formatted)
    mid, lo, hi = bootstrap_ci(l_coherent_and_formatted, random_state=BOOTSTRAP_SEED)
    example['coherent_and_formatted_low_ci'] = mid - lo
    example['coherent_and_formatted_hi_ci'] = hi - mid

    # Reference output: content of the last message of every SFT conversation.
    example['reference_translation'] = df_sft_data['messages'].map(lambda x: x[-1]['content'])

    # Keep every raw column around (as "<column>_df") for later inspection.
    for col in df_prompted_cot.columns:
        example[f"{col}_df"] = df_prompted_cot[col]

In [None]:
# `df_prompted_cot` is reassigned on every pass of the loading loop, so this
# previews the prompted-chat frame of the last experiment that was processed.
df_prompted_cot.head()

In [None]:
import re

def parse_params(model):
    """
    Extract the parameter count, in billions, from a model name.

    The first number that is immediately followed by an upper-case ``B`` is
    used, and a decimal part is honoured, so ``"Qwen2.5-32B-Instruct"`` gives
    ``32`` and ``"Qwen3-1.7B"`` gives ``1.7`` (rather than ``7``).

    Parameters
    ----------
    model : str
        Model identifier containing a size token such as ``"32B"``.

    Returns
    -------
    int or float
        ``int`` for whole-number sizes, ``float`` for fractional ones.

    Raises
    ------
    ValueError
        If the name contains no ``<number>B`` token.
    """
    match = re.search(r"([0-9]+(?:\.[0-9]+)?)B", model)
    if match is None:
        raise ValueError(f"Could not find a parameter count like '32B' in model name {model!r}.")

    size = match.group(1)
    # Whole sizes stay integers so existing comparisons and sorting are unchanged.
    return float(size) if "." in size else int(size)

In [None]:
# Build the plotting frame: one row per experiment record, plus columns pulled
# out of the nested `data` payload. `assign` evaluates its keywords in order,
# so `model_size` can be derived from the freshly added `model` column.
df_viz = pd.DataFrame(l_examples).assign(
    encoding_scheme=lambda frame: frame['data'].map(
        lambda record: record['experiment_params']['encoding_scheme']
    ),
    model=lambda frame: frame['data'].map(
        lambda record: record['experiment_params']['model']
    ),
    model_size=lambda frame: frame['model'].map(parse_params),
    # First two underscore-separated tokens of the experiment name.
    input_type=lambda frame: frame['data'].map(
        lambda record: "_".join(record['experiment_name'].split("_")[:2])
    ),
    # Optional parameter: None when the experiment did not set it.
    n_few_shot_examples=lambda frame: frame['data'].map(
        lambda record: record['experiment_params'].get('n_few_shot_examples')
    ),
)

df_viz.head()

In [None]:
# Spot check: for the first 32B experiment using the
# 'speaking_letter_to_word_with_dot' scheme, show the reference output
# (last SFT message content) stored under label 0.
df_viz[(df_viz['model_size'] == 32) & (df_viz['encoding_scheme'] == 'speaking_letter_to_word_with_dot')]['reference_translation'].iloc[0][0]

In [None]:
# Spot check for the same experiment: entry 0 of the `decoded_response` column
# stored under label 0 — presumably the first decoded sample for that prompt
# (TODO confirm the per-row layout of `decoded_response`).
df_viz[(df_viz['model_size'] == 32) & (df_viz['encoding_scheme'] == 'speaking_letter_to_word_with_dot')]['decoded_response_df'].iloc[0][0][0]

In [None]:
# List the distinct input types present; the plots below filter on
# 'conversational_sft'.
df_viz['input_type'].unique()

In [None]:
# Coherence of the decoded output for SFT runs, one bar per (scheme, model).
# Rows are sorted so bars and legend entries appear by model size, then scheme.
df_viz_tmp = (
    df_viz[df_viz['input_type'] == 'conversational_sft']
    .sort_values(['model_size', 'encoding_scheme', 'n_few_shot_examples'])
)

fig = px.bar(
    df_viz_tmp,
    x='encoding_scheme',
    y='english_coherence_scores',
    color='model',
    barmode="group",
    # Asymmetric bootstrap CI, given as distances above / below the bar height.
    error_y='english_coherence_scores_hi_ci',
    error_y_minus='english_coherence_scores_low_ci',
    height=600,
    width=1600,
    title="Coherence of decoded output, SFT<br><sup>Coherence determined by decoding encoded output to English and then prompting Qwen3 235B to judge if the result is coherent English.</sup>",
)

# The plotted values are means of per-row scores (fractions, given the 0.05
# tick step), so render the ticks as percentages to match the axis title.
fig.update_yaxes(title="% coherent English", dtick=0.05, tickformat=".0%")

fig.show()

In [None]:
# Encoding-style adherence for SFT runs, one bar per (scheme, model).
# Rows are sorted so bars and legend entries appear by model size, then scheme.
df_viz_tmp = (
    df_viz[df_viz['input_type'] == 'conversational_sft']
    .sort_values(['model_size', 'encoding_scheme', 'n_few_shot_examples'])
)

fig = px.bar(
    df_viz_tmp,
    x='encoding_scheme',
    y='followed_encoding_style',
    color='model',
    barmode="group",
    # Asymmetric bootstrap CI, given as distances above / below the bar height.
    error_y='followed_encoding_style_hi_ci',
    error_y_minus='followed_encoding_style_low_ci',
    height=600,
    width=1600,
    title="Encoding style adherence, SFT<br><sup>Adherence determined by prompting Qwen3 235B to compare the model's output to the reference encoded reasoning.</sup>",
)

# The plotted values are means of per-row flags (fractions, given the 0.05
# tick step), so render the ticks as percentages to match the axis title.
fig.update_yaxes(title="% adherent encodings", dtick=0.05, tickformat=".0%")

fig.show()

In [None]:
# Share of responses that both follow the encoding style and decode to
# coherent English, for SFT runs, one bar per (scheme, model).
# Rows are sorted so bars and legend entries appear by model size, then scheme.
df_viz_tmp = (
    df_viz[df_viz['input_type'] == 'conversational_sft']
    .sort_values(['model_size', 'encoding_scheme', 'n_few_shot_examples'])
)

fig = px.bar(
    df_viz_tmp,
    x='encoding_scheme',
    y='coherent_and_formatted',
    color='model',
    barmode="group",
    # Asymmetric bootstrap CI, given as distances above / below the bar height.
    error_y='coherent_and_formatted_hi_ci',
    error_y_minus='coherent_and_formatted_low_ci',
    height=600,
    width=1600,
    title="% of responses that adhere to encoding style AND produce coherent English after decoding<br><sup>Adherence determined by prompting Qwen3 235B to compare the model's output to the reference encoded reasoning.</sup>",
)

# The plotted values are means of boolean flags, i.e. fractions in [0, 1];
# render the ticks as percentages to match the axis title.
fig.update_yaxes(title="% adherent & coherent", dtick=0.05, tickformat=".0%")

fig.show()

In [None]:
# Box plot of the total log loss on the ground-truth output, SFT runs only.
# `explode` turns each experiment's 'ppls' array into one row per entry, so the
# box plot sees the full distribution rather than one array per experiment.
df_viz_tmp = df_viz[df_viz['input_type'] == 'conversational_sft']
df_viz_tmp = df_viz_tmp.explode('ppls')
df_viz_tmp = df_viz_tmp.sort_values(['model_size', 'encoding_scheme', 'n_few_shot_examples'])

fig = px.box(df_viz_tmp, 
             x='encoding_scheme', 
             y='ppls', 
             height=600, 
             width=1600,
             color='model',
             title="Total log loss on ground truth encoded chat output, SFT",
)

# Get unique models and encoding schemes (in the sorted order established above)
models = df_viz_tmp['model'].unique()
encoding_schemes = df_viz_tmp['encoding_scheme'].unique()

# Calculate a horizontal offset for each model so its median label can be
# nudged towards that model's box within a grouped category: offsets are
# evenly spaced, centred on 0, 0.2 apart.
n_models = len(models)
offsets = np.linspace(-(n_models-1)*0.2/2, (n_models-1)*0.2/2, n_models)
model_offset_map = dict(zip(models, offsets))

# Calculate medians and add as text annotations
for model in models:
    for encoding in encoding_schemes:
        # Filter data for this specific combination
        data_subset = df_viz_tmp[(df_viz_tmp['model'] == model) & 
                                  (df_viz_tmp['encoding_scheme'] == encoding)]['ppls']
        
        # Skip (model, scheme) pairs that have no data.
        if len(data_subset) > 0:
            median_val = data_subset.median()
            
            # Get the offset for this model
            x_offset = model_offset_map[model]
            
            # Add text annotation for median with offset; the label is anchored
            # at the category and shifted sideways in pixels via `xshift`.
            fig.add_annotation(
                x=encoding,
                y=median_val,
                text=f"{median_val:.0f}",
                showarrow=False,
                font=dict(size=10, color='black'),
                bgcolor='rgba(255, 255, 255, 0.8)',
                bordercolor='black',
                borderwidth=1,
                borderpad=2,
                xshift=x_offset * 500,  # Convert offset to pixels (adjust multiplier as needed)
                yshift=0
            )

# Fix the visible y-range to [0, 5000]; points outside it are not shown.
fig.update_yaxes(title="Total log loss", dtick=1000, range=[0, 5000])
# Make the individual markers (e.g. outlier points) nearly transparent.
fig.update_traces(marker=dict(opacity=0.08))
fig.show()

In [None]:
# Ad-hoc check: load the SFT training split of one specific experiment
# (hard-coded hash) and report how many rows it contains.
# NOTE(review): the path repeats the value of `root_dir` instead of reusing it.
df_test = pd.read_parquet("~/sky_workdir/encoding-schemes/output/45321663b9a0bf538b14734f76ac5f644f5eb505/data/sft_train.parquet")

len(df_test)