# Table - Best raw score vs. best LINE score

In [2]:
import pandas as pd
import numpy as np


# Long-format MT results. Columns used below: model, dataset, metric, method, prr_score.
df = pd.read_csv("results/mt_results.csv")

# Dataset key -> language-pair label shown in the table header.
PRETTY = {
    "wmt14_csen": "Cs-En",
    "wmt14_deen": "De-En",
    "wmt14_ruen": "Ru-En",
    "wmt14_fren": "Fr-En",
    "wmt19_deen": "De-En",
    "wmt19_fien": "Fi-En",
    "wmt19_lten": "Lt-En",
    "wmt19_ruen": "Ru-En",
}

# Dataset key -> top-level header group.
GROUP = {
    "wmt14_csen": "WMT14",
    "wmt14_deen": "WMT14",
    "wmt14_ruen": "WMT14",
    "wmt14_fren": "WMT14",
    "wmt19_deen": "WMT19",
    "wmt19_fien": "WMT19",
    "wmt19_lten": "WMT19",
    "wmt19_ruen": "WMT19",
}

# Left-to-right column order of the rendered table.
WMT14_ORDER = ["wmt14_csen", "wmt14_deen", "wmt14_ruen", "wmt14_fren"]
WMT19_ORDER = ["wmt19_deen", "wmt19_fien", "wmt19_lten", "wmt19_ruen"]
DATASET_ORDER = WMT14_ORDER + WMT19_ORDER

# === 3) Split Base (RAW) vs LINE (detrended) and take the best across methods ===
# na=False keeps the mask strictly boolean: a row with a missing method name is
# counted as a RAW row instead of putting a NaN into the mask, which would make
# `~is_line` and `df[is_line]` fail.
is_line = df["method"].str.endswith("-LINE", na=False)

key_cols = ["model", "dataset", "metric"]

# Best (max) PRR over all RAW methods for each (model, dataset, metric).
base = (
    df[~is_line]
    .groupby(key_cols, as_index=False)["prr_score"]
    .max()
    .rename(columns={"prr_score": "base"})
)

# Best (max) PRR over all "-LINE" methods for each (model, dataset, metric).
line = (
    df[is_line]
    .groupby(key_cols, as_index=False)["prr_score"]
    .max()
    .rename(columns={"prr_score": "line"})
)

# Outer merge: a key that only has RAW (or only LINE) rows is kept, with NaN
# in the other column.
best = pd.merge(base, line, on=key_cols, how="outer")

# === 4) Helper to build one table (block) per metric ===
def build_metric_block(best_df, metric_name, dataset_order=None, group=None, pretty=None):
    """Build the wide Base-vs-LINE display table for a single metric.

    Parameters
    ----------
    best_df : pd.DataFrame
        Long frame with the columns ``model``, ``dataset``, ``metric``,
        ``base`` and ``line``.
    metric_name
        Value matched against the ``metric`` column.
    dataset_order : sequence, optional
        Dataset keys to show, in column order. Defaults to the module-level
        ``DATASET_ORDER``.
    group, pretty : mapping, optional
        Dataset key -> header label for the first and second column level.
        Default to the module-level ``GROUP`` and ``PRETTY``.

    Returns
    -------
    pd.DataFrame or None
        Rows are models (sorted); columns are a 3-level MultiIndex
        ``(group, pretty, "Base" | "LINE")`` holding 2-decimal strings. A LINE
        cell gets a trailing "↑" when the unrounded LINE value is strictly
        greater than the unrounded base value. Returns ``None`` when no row
        matches both the metric and the known datasets.
    """
    if dataset_order is None:
        dataset_order = DATASET_ORDER
    if group is None:
        group = GROUP
    if pretty is None:
        pretty = PRETTY

    # Apply both filters in one step and copy afterwards, so the column
    # assignments below write to an independent frame (no SettingWithCopyWarning)
    # and the emptiness check sees the fully filtered result.
    keep = (best_df["metric"] == metric_name) & best_df["dataset"].isin(dataset_order)
    sub = best_df.loc[keep].copy()
    if sub.empty:
        return None

    def fmt(value):
        return f"{value:.2f}" if pd.notna(value) else ""

    sub["base_fmt"] = sub["base"].round(2).map(fmt)
    line_txt = sub["line"].round(2).map(fmt)

    # Arrow if LINE > BASE (strict, compared on the unrounded values; a NaN on
    # either side compares False and therefore gets no arrow).
    improved = sub["line"].notna() & sub["base"].notna() & (sub["line"] > sub["base"])
    sub["line_fmt"] = line_txt.where(~improved, line_txt + "↑")

    # Build wide columns: MultiIndex (Group, PrettyLang, Variant)
    frames = []
    tuples = []
    for ds in dataset_order:
        ds_sub = sub.loc[sub["dataset"] == ds].set_index("model")
        for variant, column in (("Base", "base_fmt"), ("LINE", "line_fmt")):
            frames.append(ds_sub[column])
            tuples.append((group[ds], pretty[ds], variant))

    # Column-wise concat aligns on the model index; a model missing for a
    # dataset ends up as NaN in that dataset's two columns.
    block = pd.concat(frames, axis=1)
    block.columns = pd.MultiIndex.from_tuples(tuples, names=["", "", ""])
    # ensure row order by model label
    return block.sort_index()

# === 5) Render all metric blocks (one after another) ===
# One titled table per metric, in order of first appearance in `best`.
metrics = best["metric"].unique()
for m in metrics:
    print(f"\n{m}")
    tbl = build_metric_block(best, m)
    if tbl is None:
        # Nothing to show for this metric; only its title is printed.
        continue
    display(tbl)



Comet WMT22


Unnamed: 0_level_0,WMT14,WMT14,WMT14,WMT14,WMT14,WMT14,WMT14,WMT14,WMT19,WMT19,WMT19,WMT19,WMT19,WMT19,WMT19,WMT19
Unnamed: 0_level_1,Cs-En,Cs-En,De-En,De-En,Ru-En,Ru-En,Fr-En,Fr-En,De-En,De-En,Fi-En,Fi-En,Lt-En,Lt-En,Ru-En,Ru-En
Unnamed: 0_level_2,Base,LINE,Base,LINE,Base,LINE,Base,LINE,Base,LINE,Base,LINE,Base,LINE,Base,LINE
model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
EuroLLM 9B,0.52,0.57↑,0.52,0.55↑,0.46,0.56↑,0.47,0.52↑,0.52,0.58↑,0.51,0.52↑,0.37,0.45↑,0.43,0.45↑
Gemma 2 9B,0.44,0.49↑,0.5,0.54↑,0.43,0.53↑,0.37,0.44↑,0.49,0.53↑,0.49,0.49↑,0.35,0.36↑,0.4,0.41↑
Llama 3.1 8B,0.48,0.58↑,0.48,0.56↑,0.45,0.59↑,0.37,0.48↑,0.46,0.55↑,0.54,0.56↑,0.52,0.56↑,0.43,0.53↑



MetricX XXL


Unnamed: 0_level_0,WMT14,WMT14,WMT14,WMT14,WMT14,WMT14,WMT14,WMT14,WMT19,WMT19,WMT19,WMT19,WMT19,WMT19,WMT19,WMT19
Unnamed: 0_level_1,Cs-En,Cs-En,De-En,De-En,Ru-En,Ru-En,Fr-En,Fr-En,De-En,De-En,Fi-En,Fi-En,Lt-En,Lt-En,Ru-En,Ru-En
Unnamed: 0_level_2,Base,LINE,Base,LINE,Base,LINE,Base,LINE,Base,LINE,Base,LINE,Base,LINE,Base,LINE
model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
EuroLLM 9B,0.54,0.55↑,0.54,0.55↑,0.48,0.47,0.46,0.46↑,0.5,0.51↑,0.49,0.47,0.42,0.47↑,0.36,0.42↑
Gemma 2 9B,0.45,0.46↑,0.47,0.49↑,0.42,0.46↑,0.36,0.37↑,0.44,0.47↑,0.45,0.45,0.34,0.37↑,0.38,0.41↑
Llama 3.1 8B,0.47,0.54↑,0.48,0.51↑,0.46,0.54↑,0.39,0.43↑,0.43,0.47↑,0.52,0.51,0.49,0.49↑,0.36,0.45↑



XComet XXL


Unnamed: 0_level_0,WMT14,WMT14,WMT14,WMT14,WMT14,WMT14,WMT14,WMT14,WMT19,WMT19,WMT19,WMT19,WMT19,WMT19,WMT19,WMT19
Unnamed: 0_level_1,Cs-En,Cs-En,De-En,De-En,Ru-En,Ru-En,Fr-En,Fr-En,De-En,De-En,Fi-En,Fi-En,Lt-En,Lt-En,Ru-En,Ru-En
Unnamed: 0_level_2,Base,LINE,Base,LINE,Base,LINE,Base,LINE,Base,LINE,Base,LINE,Base,LINE,Base,LINE
model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
EuroLLM 9B,0.43,0.46↑,0.42,0.46↑,0.39,0.51↑,0.35,0.42↑,0.43,0.47↑,0.45,0.42,0.39,0.38,0.36,0.44↑
Gemma 2 9B,0.35,0.37↑,0.35,0.38↑,0.39,0.48↑,0.27,0.34↑,0.34,0.38↑,0.42,0.4,0.29,0.31↑,0.35,0.37↑
Llama 3.1 8B,0.4,0.48↑,0.37,0.47↑,0.41,0.53↑,0.33,0.42↑,0.34,0.44↑,0.51,0.49,0.53,0.52,0.37,0.51↑


In [6]:
import pandas as pd
import numpy as np

df = pd.read_csv("results/sum_mr_results.csv")
# expects columns: model, dataset, metric, method, prr_score

# Pretty names (edit if needed)
TASK_LABEL = {"xsum": "XSUM", "gsm8k": "GSM8K"}
METRIC_PRETTY = {"Align Score": "Align Score", "Accuracy": "Accuracy"}

# Desired dataset (task) order
# NOTE(review): this rebinds DATASET_ORDER, which build_metric_block() in the MT
# table cell reads at call time. Re-run that cell before calling it again.
DATASET_ORDER = ["xsum", "gsm8k"]

# If you know the metric(s) per dataset, list them here (keeps order)
# If you prefer to infer dynamically, comment this out and see the dynamic block below.
METRICS_BY_DATASET = {
    "xsum":  ["Align Score"],
    "gsm8k": ["Accuracy"],
}

# --- Split Base (RAW) vs LINE (detrended) and take best across methods ---
# na=False keeps the mask strictly boolean: a row with a missing method name is
# counted as a RAW row instead of putting a NaN into the mask, which would make
# `~is_line` and `df[is_line]` fail.
is_line = df["method"].str.endswith("-LINE", na=False)

key_cols = ["model", "dataset", "metric"]
base = (
    df[~is_line]
    .groupby(key_cols, as_index=False)["prr_score"]
    .max()
    .rename(columns={"prr_score": "base"})
)
line = (
    df[is_line]
    .groupby(key_cols, as_index=False)["prr_score"]
    .max()
    .rename(columns={"prr_score": "line"})
)
# Outer merge keeps keys that only have RAW (or only LINE) rows.
best = pd.merge(base, line, on=key_cols, how="outer")

# --- Build one combined table: columns = (Task, Metric, Variant) ---
tuples = []
frames = []

for ds in DATASET_ORDER:
    # choose metric order for this dataset
    metric_order = METRICS_BY_DATASET.get(ds)
    if metric_order is None:
        # dynamic fallback: use whatever appears in the data, sorted
        metric_order = sorted(best.loc[best["dataset"] == ds, "metric"].dropna().unique())

    task = TASK_LABEL.get(ds, ds.upper())
    for m in metric_order:
        # The header labels are the same whether or not data exists, so they
        # are registered once, up front.
        met = METRIC_PRETTY.get(m, m)
        tuples.extend([(task, met, "Base"), (task, met, "LINE")])

        sub = best[(best["dataset"] == ds) & (best["metric"] == m)].copy()
        if sub.empty:
            # ensure empty columns still appear
            frames.extend([
                pd.Series(dtype=object, name="base_fmt"),
                pd.Series(dtype=object, name="line_fmt"),
            ])
            continue

        # format values & add arrow if LINE > BASE (strict, on unrounded values)
        sub["base_fmt"] = sub["base"].round(2).map(lambda x: f"{x:.2f}" if pd.notna(x) else "")
        line_txt = sub["line"].round(2).map(lambda x: f"{x:.2f}" if pd.notna(x) else "")
        improved = sub["line"].notna() & sub["base"].notna() & (sub["line"] > sub["base"])
        sub["line_fmt"] = line_txt.where(~improved, line_txt + "↑")

        sub = sub.set_index("model")
        frames.extend([sub["base_fmt"], sub["line_fmt"]])

# Column-wise concat aligns on the model index.
table = pd.concat(frames, axis=1)
table.columns = pd.MultiIndex.from_tuples(tuples, names=["", "", ""])
table = table.sort_index()  # sort models alphabetically; customize if needed

display(table)


Unnamed: 0_level_0,XSUM,XSUM,GSM8K,GSM8K
Unnamed: 0_level_1,Align Score,Align Score,Accuracy,Accuracy
Unnamed: 0_level_2,Base,LINE,Base,LINE
model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Gemma 2 9B,0.35,0.38↑,0.39,0.40↑
Llama 3.1 8B,0.37,0.37,0.36,0.40↑


# Average Improvements Table

In [None]:
import pandas as pd
import numpy as np

# --- file paths ---
# Both paths are passed to load_and_compute_deltas() further down in this cell.
WMT_CSV        = "results/mt_results.csv"         # translation runs (WMT)
XSUM_GSM8K_CSV = "results/sum_mr_results.csv"     # summarization + math runs (both in one file)

# Row order (adjust if needed)
# summarize_mean_sem() reindexes its rows to this list, so a method that is
# absent from the data still appears as an empty (NaN) row. Entries are base
# method names, i.e. without the "-LINE" suffix.
METHOD_ORDER = ["MSP", "PPL", "MTE", "MCSE", "MCNSE", "LSRL"]

def load_and_compute_deltas(csv_path: str) -> pd.DataFrame:
    """Read a results CSV and pair every "-LINE" method with its RAW twin.

    The CSV must provide the columns ``model``, ``dataset``, ``metric``,
    ``method`` and ``prr_score``. A method counts as LINE when its name ends
    in "-LINE"; stripping that suffix gives the base method name used for
    pairing. Only (model, dataset, metric, base method) keys present on both
    sides survive the inner merge.

    Returns a frame with the columns ``[model, dataset, metric, method, delta]``
    where ``method`` is the base method name and ``delta = LINE - RAW``.
    """
    scores = pd.read_csv(csv_path)

    # Non-numeric scores become NaN rather than raising.
    scores["prr_score"] = pd.to_numeric(scores["prr_score"], errors="coerce")

    # Metric names are used verbatim (no pretty-name mapping).
    scores["metric"] = scores["metric"].astype(str)

    method_names = scores["method"].astype(str)
    line_mask = method_names.str.endswith("-LINE")
    scores["method_base"] = method_names.str.replace(r"-LINE$", "", regex=True)

    pair_keys = ["model", "dataset", "metric", "method_base"]
    wanted = pair_keys + ["prr_score"]
    raw_part = scores.loc[~line_mask, wanted].rename(columns={"prr_score": "raw"})
    line_part = scores.loc[line_mask, wanted].rename(columns={"prr_score": "line"})

    paired = raw_part.merge(line_part, on=pair_keys, how="inner")
    for side in ("raw", "line"):
        paired[side] = pd.to_numeric(paired[side], errors="coerce")

    paired["method"] = paired["method_base"]
    paired["delta"] = paired["line"] - paired["raw"]
    return paired[["model", "dataset", "metric", "method", "delta"]]

def summarize_mean_sem(df: pd.DataFrame, task_name: str, metric_order_pretty: list, method_order=None) -> pd.DataFrame:
    """Return a wide table of 'mean ± sem' strings for the per-method deltas.

    Parameters
    ----------
    df : pd.DataFrame
        Long frame with the columns ``method``, ``metric`` and ``delta``.
    task_name : str
        Label used as the top level of the column MultiIndex.
    metric_order_pretty : list
        Metric names to keep, in column order. Other metrics are dropped.
    method_order : list, optional
        Row order. Defaults to the module-level ``METHOD_ORDER``.

    Returns
    -------
    pd.DataFrame
        Rows are methods (index named "method"); columns are
        ``(task_name, metric)``. Each cell is ``"<mean:+.2f> ± <sem:.2f>"``, or
        NaN when the (method, metric) pair has no valid delta. The SEM uses the
        sample standard deviation (ddof=1) and is reported as 0.00 for a
        single observation.
    """
    if method_order is None:
        method_order = METHOD_ORDER

    # Keep only the requested pretty metric names
    df = df[df["metric"].isin(metric_order_pretty)].copy()

    # Coerce to numeric and drop unusable deltas BEFORE grouping. Otherwise the
    # group size would count NaN rows that std() silently skips, understating
    # the SEM (or yielding "nan" when only one valid value remains).
    df["delta"] = pd.to_numeric(df["delta"], errors="coerce")
    df = df.dropna(subset=["delta"])

    # If nothing is left, return an all-NaN block with the desired shape and
    # the same axis names as a populated block, so that concatenating blocks
    # keeps the header labels.
    if df.empty:
        empty = pd.DataFrame(
            index=pd.Index(method_order, name="method"),
            columns=metric_order_pretty,
        )
        empty.columns = pd.MultiIndex.from_product(
            [[task_name], metric_order_pretty], names=[None, "metric"]
        )
        return empty

    g = df.groupby(["method", "metric"])["delta"]
    mean = g.mean()

    def sem_safe(x):
        # x holds no NaN here, so len(x) is the number of valid observations.
        n = len(x)
        return 0.0 if n <= 1 else float(x.std(ddof=1)) / np.sqrt(n)

    sem = g.apply(sem_safe).reindex(mean.index)

    # Build strings only (no numeric + string mixing)
    mean_str = mean.map(lambda x: f"{x:+.2f}")
    sem_str = sem.map(lambda x: f"{x:.2f}")
    fmt = mean_str.str.cat(sem_str, sep=" ± ")

    wide = fmt.unstack("metric")

    # Enforce method & metric order, tolerate missing
    wide = wide.reindex(index=method_order)
    wide = wide.reindex(columns=metric_order_pretty)

    # Add task level
    wide.columns = pd.MultiIndex.from_product([[task_name], wide.columns])
    return wide

# --- compute deltas ---
wmt_df = load_and_compute_deltas(WMT_CSV)
xsum_gsm_df = load_and_compute_deltas(XSUM_GSM8K_CSV)

# Split combined file by dataset name (case-insensitive)
dataset_lower = xsum_gsm_df["dataset"].astype(str).str.lower()
xsum_df = xsum_gsm_df[dataset_lower == "xsum"]
gsm8k_df = xsum_gsm_df[dataset_lower == "gsm8k"]

# Metric names EXACTLY as they appear in your CSV (already pretty)
WMT_METRICS_PRETTY = ["Comet WMT22", "XComet XXL", "MetricX XXL"]
XSUM_METRICS_PRETTY = ["Align Score"]
GSM8K_METRICS_PRETTY = ["Accuracy"]

# --- build blocks ---
# One block per task; each block's columns are (task label, metric).
wmt_block = summarize_mean_sem(wmt_df, "WMT", WMT_METRICS_PRETTY)
xsum_block = summarize_mean_sem(xsum_df, "XSum", XSUM_METRICS_PRETTY)
gsm8k_block = summarize_mean_sem(gsm8k_df, "GSM8K", GSM8K_METRICS_PRETTY)

# --- final table ---
# Blocks are placed side by side in the order WMT, XSum, GSM8K.
table = pd.concat([wmt_block, xsum_block, gsm8k_block], axis=1)
display(table)


Unnamed: 0_level_0,WMT,WMT,WMT,XSum,GSM8K
metric,Comet WMT22,XComet XXL,MetricX XXL,Align Score,Accuracy
method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
MSP,+0.09 ± 0.02,+0.09 ± 0.02,+0.18 ± 0.01,+0.03 ± 0.00,-0.00 ± 0.01
PPL,+0.05 ± 0.01,+0.05 ± 0.01,+0.02 ± 0.00,+0.01 ± 0.01,+0.09 ± 0.02
MTE,+0.08 ± 0.01,+0.07 ± 0.01,+0.03 ± 0.01,+0.01 ± 0.01,+0.08 ± 0.02
MCSE,+0.07 ± 0.02,+0.07 ± 0.02,+0.16 ± 0.01,+0.02 ± 0.01,+0.00 ± 0.00
MCNSE,+0.02 ± 0.01,+0.02 ± 0.01,+0.00 ± 0.00,+0.01 ± 0.00,+0.02 ± 0.00
LSRL,-0.00 ± 0.01,+0.01 ± 0.01,+0.00 ± 0.00,+0.03 ± 0.02,-0.00 ± 0.00
