# Collect results (Machine Translation)

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
from utils import detrend_ue
from pathlib import Path
import pathlib


# Maps the full uncertainty-estimator name to the short label used in the
# result tables. main() below builds the lookup keys "<short>_raw" and
# "<short>_detr" from the short labels.
methods_dict = {
    'MaximumSequenceProbability': 'MSP',
    'Perplexity': 'PPL',
    'MeanTokenEntropy': 'MTE',
    'MonteCarloSequenceEntropy': 'MCSE',
    'MonteCarloNormalizedSequenceEntropy': 'MCNSE',
    'LexicalSimilarity_rougeL': 'LSRL',
    'TokenSAR':'TokenSAR'
}

# Maps the model key passed to detrend_ue() to the display name written to
# the "model" column of the CSV.
MODELS = {
    'llama': 'Llama 3.1 8B',
    'gemma': 'Gemma 2 9B',
    'eurollm': 'EuroLLM 9B',
}

# Maps each quality-metric key to the list of datasets evaluated with it.
# The list order is the order in which main() pairs datasets with scores.
DATASETS = {
    'metricx-metricx-24-hybrid-xxl-v2p6': [
        'wmt14_csen', 'wmt14_deen', 'wmt14_ruen', 'wmt14_fren',
        'wmt19_deen', 'wmt19_fien', 'wmt19_lten', 'wmt19_ruen',
    ],
    'XComet-XCOMET-XXL': [
        'wmt14_csen', 'wmt14_deen', 'wmt14_ruen', 'wmt14_fren',
        'wmt19_deen', 'wmt19_fien', 'wmt19_lten', 'wmt19_ruen',
    ],
    'Comet-wmt22-comet-da': [
        'wmt14_csen', 'wmt14_deen', 'wmt14_ruen', 'wmt14_fren',
        'wmt19_deen', 'wmt19_fien', 'wmt19_lten', 'wmt19_ruen',
    ]
}

# Maps each quality-metric key to the display name written to the "metric"
# column of the CSV. Its keys must also be keys of DATASETS.
METRICS = {
    'metricx-metricx-24-hybrid-xxl-v2p6': 'MetricX XXL',
    'XComet-XCOMET-XXL': 'XComet XXL',
    'Comet-wmt22-comet-da': 'Comet WMT22',
}


def main():
    """Collect raw and detrended PRR scores for machine translation into one CSV.

    For every metric in ``METRICS`` and every model in ``MODELS`` this calls
    ``detrend_ue`` once and reads, for each short method label in
    ``methods_dict``, the entries ``"<label>_raw"`` and ``"<label>_detr"`` of
    the first returned value. Those sequences are paired positionally with the
    metric's dataset list (assumed to be aligned with it -- confirm against
    ``detrend_ue``).

    Two rows are emitted per (model, dataset, metric, method): one for the raw
    score (method label unchanged) and one for the detrended score (label
    suffixed with ``-LINE``). The result is written to
    ``results/mt_results.csv``; the ``results`` directory is created if it
    does not exist yet.
    """
    rows = []  # dicts with keys: model, dataset, metric, method, prr_score

    for metric_key, metric_name in METRICS.items():
        datasets = DATASETS[metric_key]

        for model_key, model_name in MODELS.items():
            ue_methods = list(methods_dict.values())

            ue_scores, _, _ = detrend_ue(
                datasets,
                model_key,
                [metric_key],
                ue_methods,
                methods_dict
            )

            for method_short in methods_dict.values():
                raw_scores = ue_scores[f"{method_short}_raw"]
                detr_scores = ue_scores[f"{method_short}_detr"]

                for dataset_name, raw, detr in zip(datasets, raw_scores, detr_scores):
                    # raw row (method name unchanged)
                    rows.append({
                        "model": model_name,
                        "dataset": dataset_name,
                        "metric": metric_name,
                        "method": method_short,
                        "prr_score": float(raw),
                    })
                    # detrended row (method-LINE)
                    rows.append({
                        "model": model_name,
                        "dataset": dataset_name,
                        "metric": metric_name,
                        "method": f"{method_short}-LINE",
                        "prr_score": float(detr),
                    })

    # write single CSV with all metrics/models/datasets/methods
    df = pd.DataFrame(rows, columns=["model", "dataset", "metric", "method", "prr_score"])
    out_path = Path("results") / "mt_results.csv"
    # to_csv does not create missing parent directories, so make sure it exists.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False)
    print(f"Wrote {out_path.resolve()} with {len(df)} rows.")

if __name__ == "__main__":
    main()


In [None]:
import pandas as pd


df = pd.read_csv("results/mt_results.csv")


import numpy as np
from IPython.display import display, Markdown


methods_dict = {
    'MaximumSequenceProbability': 'MSP',
    'Perplexity': 'PPL',
    'MeanTokenEntropy': 'MTE',
    'MonteCarloSequenceEntropy': 'MCSE',
    'MonteCarloNormalizedSequenceEntropy': 'MCNSE',
    'LexicalSimilarity_rougeL': 'LSRL',
    'TokenSAR':'TokenSAR'
}
method_order = [v for v in methods_dict.values()]
method_display_order = [m for pair in zip(method_order, [m + "-LINE" for m in method_order]) for m in pair]

MODELS = {
    'llama': 'Llama 3.1 8B',
    'gemma': 'Gemma 2 9B',
    'eurollm': 'EuroLLM 9B',
}
model_display_order = list(MODELS.values())

DATASETS = {
    'metricx-metricx-24-hybrid-xxl-v2p6': [
        'wmt14_csen','wmt14_deen','wmt14_ruen','wmt14_fren',
        'wmt19_deen','wmt19_fien','wmt19_lten','wmt19_ruen',
    ],
    'XComet-XCOMET-XXL': [
        'wmt14_csen','wmt14_deen','wmt14_ruen','wmt14_fren',
        'wmt19_deen','wmt19_fien','wmt19_lten','wmt19_ruen',
    ],
    'Comet-wmt22-comet-da': [
        'wmt14_csen','wmt14_deen','wmt14_ruen','wmt14_fren',
        'wmt19_deen','wmt19_fien','wmt19_lten','wmt19_ruen',
    ]
}

# ---- highlighter: best = bold, second = underline (ties handled) ----
def highlight_best_second(col: pd.Series):
    """Return one CSS string per cell: bold for the largest value, underline for the second largest.

    Non-numeric columns (dtype kind other than float/int) and NaN cells get an
    empty style. Ranking uses the distinct non-NaN values, so every cell tied
    for a rank receives that rank's style.
    """
    if col.dtype.kind not in ("f", "i"):
        return ['' for _ in range(len(col))]

    numeric = col.astype(float)
    ranked = np.unique(numeric[~numeric.isna()])[::-1]  # distinct values, largest first
    top = ranked[0] if len(ranked) > 0 else np.nan
    runner_up = ranked[1] if len(ranked) > 1 else np.nan

    def _style_for(value):
        if pd.isna(value):
            return ''
        if np.isclose(value, top, rtol=1e-9, atol=1e-12):
            return 'font-weight: bold;'
        if not np.isnan(runner_up) and np.isclose(value, runner_up, rtol=1e-9, atol=1e-12):
            return 'text-decoration: underline;'
        return ''

    return [_style_for(value) for value in numeric]

def show_metric_tables(df_in: pd.DataFrame, metric_name: str):
    """Display one styled method-by-dataset table per model for a single metric.

    Rows of ``df_in`` whose "metric" equals ``metric_name`` are pivoted
    (mean of "prr_score") per model in ``model_display_order``. Rows follow
    ``method_display_order``; columns follow the first dataset list in
    ``DATASETS`` that is fully present in the data, or the order of appearance
    when no list matches. Models without rows are skipped.
    """
    display(Markdown(f"## {metric_name}"))
    metric_df = df_in[df_in["metric"] == metric_name].copy()

    # Pick the first configured dataset list that the data fully covers.
    available = set(metric_df["dataset"].unique())
    candidate_order = next(
        (ds_list for ds_list in DATASETS.values() if set(ds_list).issubset(available)),
        None,
    )
    if candidate_order is None:
        candidate_order = list(metric_df["dataset"].unique())

    for model_name in model_display_order:
        model_rows = metric_df[metric_df["model"] == model_name]
        if model_rows.empty:
            continue

        table = model_rows.pivot_table(
            index="method", columns="dataset", values="prr_score", aggfunc="mean"
        )
        table = table.reindex(index=method_display_order)
        kept_columns = [name for name in candidate_order if name in table.columns]
        table = table.reindex(columns=kept_columns)

        # Round first so highlighting is decided on the displayed precision.
        styler = table.round(3).style
        styler = styler.apply(highlight_best_second, axis=0)
        styler = styler.set_caption(model_name)
        display(styler)

for metric_name in df["metric"].unique():
    show_metric_tables(df, metric_name)


## MetricX XXL

dataset,wmt14_csen,wmt14_deen,wmt14_ruen,wmt14_fren,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MSP,0.215,0.225,0.313,0.194,0.225,0.084,0.119,0.264
MSP-LINE,0.396,0.422,0.436,0.31,0.403,0.412,0.382,0.355
PPL,0.429,0.45,0.414,0.319,0.39,0.482,0.434,0.311
PPL-LINE,0.484,0.473,0.485,0.357,0.404,0.482,0.436,0.396
MTE,0.475,0.477,0.459,0.393,0.427,0.52,0.487,0.356
MTE-LINE,0.537,0.514,0.538,0.435,0.471,0.509,0.489,0.454
MCSE,0.157,0.14,0.202,0.163,0.143,-0.002,0.065,0.213
MCSE-LINE,0.292,0.291,0.304,0.246,0.318,0.295,0.288,0.275
MCNSE,0.423,0.378,0.404,0.324,0.383,0.393,0.438,0.324
MCNSE-LINE,0.425,0.38,0.432,0.342,0.385,0.393,0.438,0.361


dataset,wmt14_csen,wmt14_deen,wmt14_ruen,wmt14_fren,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MSP,0.189,0.219,0.288,0.133,0.279,0.058,0.241,0.265
MSP-LINE,0.392,0.446,0.412,0.29,0.446,0.352,0.349,0.386
PPL,0.425,0.471,0.405,0.326,0.419,0.408,0.331,0.34
PPL-LINE,0.429,0.479,0.427,0.33,0.427,0.407,0.342,0.371
MTE,0.45,0.475,0.419,0.359,0.44,0.454,0.335,0.35
MTE-LINE,0.463,0.495,0.46,0.371,0.467,0.446,0.373,0.414
MCSE,0.11,0.155,0.213,0.12,0.201,-0.027,0.172,0.229
MCSE-LINE,0.308,0.359,0.328,0.276,0.401,0.266,0.245,0.352
MCNSE,0.383,0.431,0.402,0.33,0.429,0.363,0.319,0.381
MCNSE-LINE,0.383,0.43,0.413,0.333,0.436,0.364,0.319,0.397


dataset,wmt14_csen,wmt14_deen,wmt14_ruen,wmt14_fren,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MSP,0.127,0.226,0.297,0.111,0.23,0.041,0.155,0.284
MSP-LINE,0.324,0.444,0.445,0.265,0.428,0.339,0.312,0.379
PPL,0.512,0.521,0.454,0.415,0.481,0.445,0.403,0.334
PPL-LINE,0.515,0.529,0.458,0.419,0.483,0.436,0.423,0.371
MTE,0.541,0.537,0.477,0.456,0.498,0.486,0.424,0.361
MTE-LINE,0.548,0.547,0.47,0.465,0.512,0.47,0.467,0.42
MCSE,0.2,0.237,0.283,0.164,0.247,0.098,0.254,0.273
MCSE-LINE,0.417,0.421,0.389,0.36,0.402,0.357,0.356,0.343
MCNSE,0.23,0.33,0.295,0.223,0.276,0.324,0.264,0.214
MCNSE-LINE,0.21,0.327,0.283,0.214,0.256,0.304,0.251,0.234


## XComet XXL

dataset,wmt14_csen,wmt14_deen,wmt14_ruen,wmt14_fren,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MSP,0.249,0.349,0.408,0.329,0.306,0.041,0.147,0.371
MSP-LINE,0.33,0.38,0.408,0.334,0.37,0.389,0.38,0.336
PPL,0.362,0.351,0.297,0.238,0.33,0.489,0.489,0.272
PPL-LINE,0.424,0.426,0.475,0.35,0.368,0.486,0.483,0.417
MTE,0.399,0.371,0.331,0.303,0.344,0.513,0.532,0.317
MTE-LINE,0.477,0.475,0.533,0.423,0.438,0.49,0.52,0.512
MCSE,0.201,0.286,0.328,0.287,0.244,-0.042,0.063,0.305
MCSE-LINE,0.263,0.274,0.275,0.239,0.29,0.266,0.276,0.278
MCNSE,0.36,0.345,0.32,0.272,0.335,0.384,0.425,0.322
MCNSE-LINE,0.362,0.353,0.395,0.313,0.349,0.383,0.425,0.401


dataset,wmt14_csen,wmt14_deen,wmt14_ruen,wmt14_fren,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MSP,0.203,0.354,0.391,0.274,0.344,0.001,0.147,0.349
MSP-LINE,0.293,0.383,0.382,0.292,0.377,0.287,0.217,0.335
PPL,0.318,0.338,0.292,0.251,0.331,0.373,0.281,0.273
PPL-LINE,0.33,0.367,0.448,0.304,0.347,0.372,0.284,0.328
MTE,0.346,0.327,0.286,0.255,0.321,0.415,0.294,0.271
MTE-LINE,0.37,0.383,0.476,0.343,0.37,0.403,0.31,0.374
MCSE,0.147,0.298,0.338,0.264,0.275,-0.07,0.088,0.323
MCSE-LINE,0.23,0.323,0.341,0.258,0.333,0.198,0.117,0.325
MCNSE,0.278,0.336,0.326,0.247,0.318,0.288,0.186,0.329
MCNSE-LINE,0.275,0.333,0.401,0.272,0.331,0.289,0.186,0.361


dataset,wmt14_csen,wmt14_deen,wmt14_ruen,wmt14_fren,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MSP,0.133,0.308,0.379,0.205,0.268,-0.027,0.058,0.355
MSP-LINE,0.237,0.384,0.425,0.248,0.383,0.261,0.189,0.365
PPL,0.394,0.405,0.375,0.328,0.425,0.397,0.331,0.314
PPL-LINE,0.412,0.435,0.499,0.373,0.437,0.384,0.336,0.39
MTE,0.434,0.417,0.389,0.345,0.43,0.448,0.386,0.322
MTE-LINE,0.462,0.462,0.511,0.417,0.473,0.421,0.381,0.44
MCSE,0.185,0.305,0.348,0.263,0.303,-0.009,0.08,0.337
MCSE-LINE,0.307,0.365,0.383,0.331,0.384,0.279,0.2,0.322
MCNSE,0.191,0.277,0.229,0.171,0.259,0.3,0.225,0.212
MCNSE-LINE,0.183,0.281,0.289,0.173,0.254,0.276,0.193,0.258


## Comet WMT22

dataset,wmt14_csen,wmt14_deen,wmt14_ruen,wmt14_fren,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MSP,0.424,0.394,0.453,0.349,0.458,0.186,0.288,0.431
MSP-LINE,0.469,0.487,0.481,0.398,0.511,0.467,0.473,0.414
PPL,0.417,0.455,0.371,0.314,0.407,0.516,0.475,0.315
PPL-LINE,0.521,0.515,0.526,0.412,0.459,0.521,0.495,0.464
MTE,0.445,0.479,0.406,0.374,0.418,0.54,0.518,0.331
MTE-LINE,0.576,0.563,0.589,0.484,0.546,0.561,0.555,0.526
MCSE,0.365,0.322,0.347,0.296,0.361,0.085,0.201,0.359
MCSE-LINE,0.376,0.356,0.328,0.28,0.385,0.318,0.355,0.32
MCNSE,0.481,0.435,0.401,0.358,0.432,0.455,0.483,0.353
MCNSE-LINE,0.486,0.441,0.473,0.395,0.45,0.457,0.485,0.439


dataset,wmt14_csen,wmt14_deen,wmt14_ruen,wmt14_fren,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MSP,0.398,0.373,0.425,0.286,0.49,0.185,0.35,0.401
MSP-LINE,0.482,0.501,0.473,0.383,0.533,0.422,0.36,0.412
PPL,0.436,0.484,0.382,0.361,0.445,0.456,0.304,0.305
PPL-LINE,0.457,0.506,0.496,0.4,0.466,0.456,0.321,0.355
MTE,0.443,0.489,0.384,0.371,0.436,0.487,0.302,0.304
MTE-LINE,0.492,0.537,0.529,0.436,0.506,0.493,0.36,0.401
MCSE,0.319,0.308,0.353,0.283,0.406,0.092,0.287,0.361
MCSE-LINE,0.394,0.431,0.375,0.352,0.472,0.307,0.294,0.387
MCNSE,0.442,0.502,0.417,0.37,0.473,0.413,0.352,0.373
MCNSE-LINE,0.438,0.499,0.48,0.389,0.491,0.414,0.351,0.405


dataset,wmt14_csen,wmt14_deen,wmt14_ruen,wmt14_fren,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MSP,0.293,0.332,0.423,0.238,0.401,0.162,0.277,0.43
MSP-LINE,0.374,0.461,0.5,0.342,0.507,0.394,0.341,0.436
PPL,0.505,0.505,0.434,0.44,0.516,0.484,0.357,0.325
PPL-LINE,0.531,0.531,0.537,0.473,0.536,0.486,0.403,0.394
MTE,0.525,0.517,0.456,0.466,0.511,0.512,0.372,0.341
MTE-LINE,0.572,0.553,0.564,0.52,0.575,0.521,0.45,0.453
MCSE,0.351,0.359,0.424,0.283,0.414,0.211,0.343,0.422
MCSE-LINE,0.462,0.466,0.456,0.396,0.487,0.4,0.375,0.389
MCNSE,0.225,0.357,0.282,0.221,0.338,0.355,0.277,0.245
MCNSE-LINE,0.22,0.359,0.334,0.22,0.337,0.346,0.279,0.294


# Write to LaTeX

In [None]:
import os
import numpy as np
import pandas as pd

methods_dict = {
    'MaximumSequenceProbability': 'MSP',
    'Perplexity': 'PPL',
    'MeanTokenEntropy': 'MTE',
    'MonteCarloSequenceEntropy': 'MCSE',
    'MonteCarloNormalizedSequenceEntropy': 'MCNSE',
    'LexicalSimilarity_rougeL': 'LSRL',
    'TokenSAR':'TokenSAR'
}
method_order = [v for v in methods_dict.values()]
method_display_order = [m for pair in zip(method_order, [m + "-LINE" for m in method_order]) for m in pair]

MODELS = {
    'llama': 'Llama 3.1 8B',
    'gemma': 'Gemma 2 9B',
    'eurollm': 'EuroLLM 9B',
}
model_display_order = list(MODELS.values())

DATASETS = {
    'metricx-metricx-24-hybrid-xxl-v2p6': [
        'wmt14_csen','wmt14_deen','wmt14_ruen','wmt14_fren',
        'wmt19_deen','wmt19_fien','wmt19_lten','wmt19_ruen',
    ],
    'XComet-XCOMET-XXL': [
        'wmt14_csen','wmt14_deen','wmt14_ruen','wmt14_fren',
        'wmt19_deen','wmt19_fien','wmt19_lten','wmt19_ruen',
    ],
    'Comet-wmt22-comet-da': [
        'wmt14_csen','wmt14_deen','wmt14_ruen','wmt14_fren',
        'wmt19_deen','wmt19_fien','wmt19_lten','wmt19_ruen',
    ]
}

def _choose_dataset_order(metric_df: pd.DataFrame) -> list[str]:
    """Return the column order to use for the datasets found in ``metric_df``.

    The first list in ``DATASETS`` whose entries all occur in the "dataset"
    column wins; if none qualifies, the datasets are returned in order of
    first appearance.
    """
    available = set(metric_df["dataset"].unique())
    for configured in DATASETS.values():
        if set(configured).issubset(available):
            return configured
    return list(metric_df["dataset"].unique())

def _format_with_rankings(col: pd.Series, ndigits: int = 3) -> pd.Series:
    vals = pd.to_numeric(col, errors="coerce")
    uniq = np.unique(vals[~vals.isna()])
    if uniq.size == 0:
        best = second = np.nan
    else:
        uniq.sort()           # ascending
        uniq = uniq[::-1]     # descending
        best = uniq[0]
        second = uniq[1] if uniq.size > 1 else np.nan

    out = []
    for v in vals:
        if pd.isna(v):
            out.append("--")
            continue
        s = f"{v:.{ndigits}f}"
        if np.isfinite(best) and np.isclose(v, best, rtol=1e-9, atol=1e-12):
            out.append(r"\textbf{" + s + "}")
        elif np.isfinite(second) and np.isclose(v, second, rtol=1e-9, atol=1e-12):
            out.append(r"\underline{" + s + "}")
        else:
            out.append(s)
    return pd.Series(out, index=col.index)

def generate_latex_tables(
    df_in: pd.DataFrame,
    value_col: str = "prr_score",
    out_dir: str = "latex_tables",
    ndigits: int = 2,
    table_env: str = "table",
    use_booktabs: bool = True
) -> dict:
    """
    Generate LaTeX tables for each (metric, model) slice.
    Returns a dict: {(metric, model): filepath}.

    For every metric in ``df_in`` and every model in ``model_display_order``
    the rows are pivoted to method x dataset (mean of ``value_col``), ordered
    by ``method_display_order`` and the dataset order chosen by
    ``_choose_dataset_order``, formatted with ``_format_with_rankings`` and
    written to ``<out_dir>/<metric>__<model>.tex``. Models without rows for a
    metric are skipped.

    Args:
        df_in: Long-format results with columns "model", "dataset", "metric",
            "method" and ``value_col``.
        value_col: Name of the score column to tabulate.
        out_dir: Output directory; created if missing.
        ndigits: Number of decimals printed per cell.
        table_env: Name of the floating environment. The default "table"
            leaves the generated output untouched; any other value (e.g.
            "table*") replaces the ``table`` environment.
        use_booktabs: When False, the booktabs rules (``\\toprule``,
            ``\\midrule``, ``\\bottomrule``) are replaced by ``\\hline``.
    """
    def _escape_underscores(label) -> str:
        # Cells are emitted with escape=False (they carry \textbf/\underline),
        # so header/index labels must be escaped by hand: a bare "_" is an
        # error in LaTeX text mode. Normalise first to avoid double escaping.
        return str(label).replace(r"\_", "_").replace("_", r"\_")

    os.makedirs(out_dir, exist_ok=True)
    paths = {}

    for metric_name in df_in["metric"].unique():
        metric_df = df_in[df_in["metric"] == metric_name].copy()
        dataset_order = _choose_dataset_order(metric_df)

        for model_name in model_display_order:
            block = metric_df[metric_df["model"] == model_name]
            if block.empty:
                continue

            piv = block.pivot_table(
                index="method", columns="dataset", values=value_col, aggfunc="mean"
            )

            # enforce display order
            piv = piv.reindex(index=method_display_order)
            piv = piv.reindex(columns=[c for c in dataset_order if c in piv.columns])

            fmt = piv.apply(_format_with_rankings, axis=0, ndigits=ndigits)
            fmt = fmt.fillna("--")
            fmt = fmt.rename(index=_escape_underscores, columns=_escape_underscores)

            colfmt = "l" + "r" * fmt.shape[1]

            caption = f"{model_name} — {metric_name}"
            label = f"tab:{metric_name.replace(' ', '_')}__{model_name.replace(' ', '_')}"

            latex = fmt.to_latex(
                escape=False,
                index=True,
                caption=caption,
                label=label,
                na_rep="--",
                column_format=colfmt,
                bold_rows=False,
                longtable=False
            )

            # Put a "Method" heading into the first header cell when the line
            # right after \toprule has an empty leading cell.
            lines = latex.splitlines()
            top_idx = next((i for i, line in enumerate(lines) if r"\toprule" in line), None)
            if top_idx is not None:
                header_idx = top_idx + 1
                header_line = lines[header_idx]
                if header_line.startswith("{}"):
                    lines[header_idx] = header_line.replace("{}", r"\textbf{Method}", 1)
                elif header_line.startswith("&"):
                    lines[header_idx] = r"\textbf{Method} " + header_line
                latex = "\n".join(lines)

            if not use_booktabs:
                for rule in (r"\toprule", r"\midrule", r"\bottomrule"):
                    latex = latex.replace(rule, r"\hline")

            if table_env != "table":
                latex = latex.replace(r"\begin{table}", r"\begin{" + table_env + "}")
                latex = latex.replace(r"\end{table}", r"\end{" + table_env + "}")

            # write out
            safe_metric = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in metric_name)
            safe_model = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in model_name)
            fname = f"{safe_metric}__{safe_model}.tex"
            fpath = os.path.join(out_dir, fname)
            with open(fpath, "w", encoding="utf-8") as f:
                f.write(latex)

            paths[(metric_name, model_name)] = fpath

    return paths


df= pd.read_csv('results/mt_results.csv')
paths = generate_latex_tables(df)
for (metric, model), p in paths.items():
    print(f"{metric} | {model} -> {p}")


MetricX XXL | Llama 3.1 8B -> latex_tables/MetricX_XXL__Llama_3.1_8B.tex
MetricX XXL | Gemma 2 9B -> latex_tables/MetricX_XXL__Gemma_2_9B.tex
MetricX XXL | EuroLLM 9B -> latex_tables/MetricX_XXL__EuroLLM_9B.tex
XComet XXL | Llama 3.1 8B -> latex_tables/XComet_XXL__Llama_3.1_8B.tex
XComet XXL | Gemma 2 9B -> latex_tables/XComet_XXL__Gemma_2_9B.tex
XComet XXL | EuroLLM 9B -> latex_tables/XComet_XXL__EuroLLM_9B.tex
Comet WMT22 | Llama 3.1 8B -> latex_tables/Comet_WMT22__Llama_3.1_8B.tex
Comet WMT22 | Gemma 2 9B -> latex_tables/Comet_WMT22__Gemma_2_9B.tex
Comet WMT22 | EuroLLM 9B -> latex_tables/Comet_WMT22__EuroLLM_9B.tex


# Collect results (Summarization and Math Reasoning)

In [None]:
from utils import detrend_ue_w_quality
from pathlib import Path


MODELS = {
    'llama': 'Llama 3.1 8B',
    'gemma': 'Gemma 2 9B',
}

DATASETS = {
    'Accuracy': [
        'gsm8k'
    ],
    'AlignScoreInputOutput': [
        'xsum'
    ]
}

METRICS = {
    'Accuracy': 'Accuracy',
    'AlignScoreInputOutput': 'Align Score',
}


def main():
    """Collect raw and detrended PRR scores for summarization and math reasoning into one CSV.

    For every metric in ``METRICS`` and every model in ``MODELS`` this calls
    ``detrend_ue_w_quality`` once and reads, for each short method label in
    ``methods_dict``, the entries ``"<label>_raw"`` and ``"<label>_detr"`` of
    the first returned value, pairing them positionally with the metric's
    dataset list (assumed to be aligned with it -- confirm against
    ``detrend_ue_w_quality``).

    Two rows are emitted per (model, dataset, metric, method): the raw score
    and the detrended score (label suffixed with ``-LINE``). The result is
    written to ``results/sum_mr_results.csv``; the ``results`` directory is
    created if it does not exist yet.

    NOTE(review): this cell does not import ``pd`` or define ``methods_dict``
    itself; it relies on an earlier notebook cell having done so.
    """
    rows = []

    for metric_key, metric_name in METRICS.items():
        datasets = DATASETS[metric_key]
        ue_methods = list(methods_dict.values())

        for model_key, model_name in MODELS.items():
            ue_scores, _, _ = detrend_ue_w_quality(
                datasets,
                model_key,
                [metric_key],
                ue_methods,
                methods_dict
            )

            for method_short in methods_dict.values():
                raw_scores = ue_scores[f"{method_short}_raw"]
                detr_scores = ue_scores[f"{method_short}_detr"]

                for dataset_name, raw, detr in zip(datasets, raw_scores, detr_scores):
                    # raw row (method name unchanged)
                    rows.append({
                        "model": model_name,
                        "dataset": dataset_name,
                        "metric": metric_name,
                        "method": method_short,
                        "prr_score": float(raw),
                    })
                    # detrended row (method-LINE)
                    rows.append({
                        "model": model_name,
                        "dataset": dataset_name,
                        "metric": metric_name,
                        "method": f"{method_short}-LINE",
                        "prr_score": float(detr),
                    })

    df = pd.DataFrame(rows, columns=["model", "dataset", "metric", "method", "prr_score"])
    out_path = Path("results") / "sum_mr_results.csv"
    # to_csv does not create missing parent directories, so make sure it exists.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False)
    print(f"Wrote {out_path.resolve()} with {len(df)} rows.")

if __name__ == "__main__":
    main()


In [None]:


df = pd.read_csv("results/sum_mr_results.csv")


import numpy as np
from IPython.display import display, Markdown


methods_dict = {
    'MaximumSequenceProbability': 'MSP',
    'Perplexity': 'PPL',
    'MeanTokenEntropy': 'MTE',
    'MonteCarloSequenceEntropy': 'MCSE',
    'MonteCarloNormalizedSequenceEntropy': 'MCNSE',
    'LexicalSimilarity_rougeL': 'LSRL',
    'TokenSAR':'TokenSAR'
}
method_order = [v for v in methods_dict.values()]
method_display_order = [m for pair in zip(method_order, [m + "-LINE" for m in method_order]) for m in pair]

MODELS = {
    'llama': 'Llama 3.1 8B',
    'gemma': 'Gemma 2 9B',
}
model_display_order = list(MODELS.values())


DATASETS = {
    'Accuracy': [
        'gsm8k'
    ],
    'AlignScoreInputOutput': [
        'xsum'
    ]
}

METRICS = {
    'Accuracy': 'Accuracy',
    'AlignScoreInputOutput': 'Align Score',
}

def highlight_best_second(col: pd.Series):
    """Return one CSS string per cell: bold for the largest value, underline for the second largest.

    Non-numeric columns (dtype kind other than float/int) and NaN cells get an
    empty style. Ranking uses the distinct non-NaN values, so every cell tied
    for a rank receives that rank's style.
    """
    if col.dtype.kind not in ("f", "i"):
        return ['' for _ in range(len(col))]

    numeric = col.astype(float)
    ranked = np.unique(numeric[~numeric.isna()])[::-1]  # distinct values, largest first
    top = ranked[0] if len(ranked) > 0 else np.nan
    runner_up = ranked[1] if len(ranked) > 1 else np.nan

    def _style_for(value):
        if pd.isna(value):
            return ''
        if np.isclose(value, top, rtol=1e-9, atol=1e-12):
            return 'font-weight: bold;'
        if not np.isnan(runner_up) and np.isclose(value, runner_up, rtol=1e-9, atol=1e-12):
            return 'text-decoration: underline;'
        return ''

    return [_style_for(value) for value in numeric]

def show_metric_tables(df_in: pd.DataFrame, metric_name: str):
    """Display one styled method-by-dataset table per model for a single metric.

    Rows of ``df_in`` whose "metric" equals ``metric_name`` are pivoted
    (mean of "prr_score") per model in ``model_display_order``. Rows follow
    ``method_display_order``; columns follow the first dataset list in
    ``DATASETS`` that is fully present in the data, or the order of appearance
    when no list matches. Models without rows are skipped.
    """
    display(Markdown(f"## {metric_name}"))
    metric_df = df_in[df_in["metric"] == metric_name].copy()

    # Pick the first configured dataset list that the data fully covers.
    available = set(metric_df["dataset"].unique())
    candidate_order = next(
        (ds_list for ds_list in DATASETS.values() if set(ds_list).issubset(available)),
        None,
    )
    if candidate_order is None:
        candidate_order = list(metric_df["dataset"].unique())

    for model_name in model_display_order:
        model_rows = metric_df[metric_df["model"] == model_name]
        if model_rows.empty:
            continue

        table = model_rows.pivot_table(
            index="method", columns="dataset", values="prr_score", aggfunc="mean"
        )
        table = table.reindex(index=method_display_order)
        kept_columns = [name for name in candidate_order if name in table.columns]
        table = table.reindex(columns=kept_columns)

        # Round first so highlighting is decided on the displayed precision.
        styler = table.round(3).style
        styler = styler.apply(highlight_best_second, axis=0)
        styler = styler.set_caption(model_name)
        display(styler)

for metric_name in df["metric"].unique():
    show_metric_tables(df, metric_name)


## Accuracy

dataset,gsm8k
method,Unnamed: 1_level_1
MSP,0.324
MSP-LINE,0.329
PPL,0.303
PPL-LINE,0.377
MTE,0.339
MTE-LINE,0.4
MCSE,0.351
MCSE-LINE,0.352
MCNSE,0.343
MCNSE-LINE,0.361


dataset,gsm8k
method,Unnamed: 1_level_1
MSP,0.303
MSP-LINE,0.296
PPL,0.248
PPL-LINE,0.358
MTE,0.292
MTE-LINE,0.399
MCSE,0.393
MCSE-LINE,0.399
MCNSE,0.356
MCNSE-LINE,0.371


## Align Score

dataset,xsum
method,Unnamed: 1_level_1
MSP,0.328
MSP-LINE,0.357
PPL,0.369
PPL-LINE,0.366
MTE,0.357
MTE-LINE,0.35
MCSE,0.033
MCSE-LINE,0.043
MCNSE,0.024
MCNSE-LINE,0.029


dataset,xsum
method,Unnamed: 1_level_1
MSP,0.351
MSP-LINE,0.378
PPL,0.354
PPL-LINE,0.373
MTE,0.333
MTE-LINE,0.356
MCSE,0.003
MCSE-LINE,0.032
MCNSE,0.016
MCNSE-LINE,0.032


In [None]:
import os
import numpy as np
import pandas as pd
df = pd.read_csv("results/sum_mr_results.csv")

import os
import numpy as np
import pandas as pd

methods_dict = {
    'MaximumSequenceProbability': 'MSP',
    'Perplexity': 'PPL',
    'MeanTokenEntropy': 'MTE',
    'MonteCarloSequenceEntropy': 'MCSE',
    'MonteCarloNormalizedSequenceEntropy': 'MCNSE',
    'LexicalSimilarity_rougeL': 'LSRL',
    'TokenSAR':'TokenSAR'
}
method_order = [v for v in methods_dict.values()]
method_display_order = [m for pair in zip(method_order, [m + "-LINE" for m in method_order]) for m in pair]

MODELS = {
    'llama': 'Llama 3.1 8B',
    'gemma': 'Gemma 2 9B',
}
model_display_order = list(MODELS.values())

COLUMN_SPECS = [
        {"col": "xsum",  "metric": "Align Score", "pretty": "Align Score"},

    {"col": "gsm8k", "metric": "Accuracy", "pretty": "Accuracy"},
]

def _format_with_rankings(col: pd.Series, ndigits: int = 3) -> pd.Series:
    """Return a string-formatted column with best bolded and second underlined (ties respected)."""
    vals = pd.to_numeric(col, errors="coerce")
    uniq = np.unique(vals[~vals.isna()])
    if uniq.size == 0:
        best = second = np.nan
    else:
        uniq.sort()
        uniq = uniq[::-1]  # desc
        best = uniq[0]
        second = uniq[1] if uniq.size > 1 else np.nan

    out = []
    for v in vals:
        if pd.isna(v):
            out.append("--")
            continue
        s = f"{v:.{ndigits}f}"
        if np.isfinite(best) and np.isclose(v, best, rtol=1e-9, atol=1e-12):
            out.append(r"\textbf{" + s + "}")
        elif np.isfinite(second) and np.isclose(v, second, rtol=1e-9, atol=1e-12):
            out.append(r"\underline{" + s + "}")
        else:
            out.append(s)
    return pd.Series(out, index=col.index)

def generate_latex_tables_mixed_metrics(
    df_in: pd.DataFrame,
    value_col: str = "prr_score",
    out_dir: str = "latex_tables",
    ndigits: int = 2,
) -> dict:
    """Write one LaTeX table per model whose columns mix datasets scored by different metrics.

    Each entry of ``COLUMN_SPECS`` contributes one column: the per-method mean
    of ``value_col`` over the rows matching the model, the spec's dataset
    ("col") and the spec's metric. Rows follow ``method_display_order`` and
    cells are formatted by ``_format_with_rankings``. Files are written to
    ``<out_dir>/<model>__gsm8k_xsum.tex``.

    Returns a dict mapping model display name to the written file path.
    """
    os.makedirs(out_dir, exist_ok=True)
    written = {}
    column_names = [spec["col"] for spec in COLUMN_SPECS]

    for model_name in model_display_order:
        # One Series (indexed by method) per configured column.
        series_by_column = {}
        for spec in COLUMN_SPECS:
            mask = (
                (df_in["model"] == model_name)
                & (df_in["dataset"] == spec["col"])
                & (df_in["metric"] == spec["metric"])
            )
            subset = df_in[mask]
            if subset.empty:
                column_values = pd.Series(dtype=float)
            else:
                pivoted = subset.pivot_table(
                    index="method", values=value_col, aggfunc="mean"
                )
                column_values = pivoted[value_col]
            series_by_column[spec["col"]] = column_values

        table = pd.DataFrame(series_by_column)
        table = table.reindex(index=method_display_order)
        table = table.reindex(columns=column_names)

        formatted = table.apply(_format_with_rankings, axis=0, ndigits=ndigits)
        formatted = formatted.fillna("--")

        column_format = "l" + "r" * formatted.shape[1]

        described = [f"{spec['col']} ({spec['pretty']})" for spec in COLUMN_SPECS]
        caption = f"{model_name} — " + " vs. ".join(described)
        label = f"tab:{model_name.replace(' ', '_')}__gsm8k_xsum"

        latex = formatted.to_latex(
            escape=False,
            index=True,
            caption=caption,
            label=label,
            na_rep="--",
            column_format=column_format,
            bold_rows=False,
            longtable=False
        )

        # Put a "Method" heading into the first header cell when the line
        # right after \toprule has an empty leading cell.
        text_lines = latex.splitlines()
        rule_at = next(
            (pos for pos, text in enumerate(text_lines) if r"\toprule" in text),
            None,
        )
        if rule_at is not None:
            header_at = rule_at + 1
            header = text_lines[header_at]
            if header.startswith("{}"):
                text_lines[header_at] = header.replace("{}", r"\textbf{Method}", 1)
            elif header.startswith("&"):
                text_lines[header_at] = r"\textbf{Method} " + header
            latex = "\n".join(text_lines)

        safe_chars = [ch if ch.isalnum() or ch in "-_." else "_" for ch in model_name]
        safe_model = "".join(safe_chars)
        target = os.path.join(out_dir, f"{safe_model}__gsm8k_xsum.tex")
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(latex)

        written[model_name] = target

    return written

paths = generate_latex_tables_mixed_metrics(df)
for model, p in paths.items():
    print(model, "->", p)


Llama 3.1 8B -> latex_tables/Llama_3.1_8B__gsm8k_xsum.tex
Gemma 2 9B -> latex_tables/Gemma_2_9B__gsm8k_xsum.tex


# Results with smaller sample size

In [None]:
from utils import detrend_ue_w_quality

import numpy as np
import pandas as pd
from collections import defaultdict
from pathlib import Path


methods_dict = {
    'MaximumSequenceProbability': 'MSP',
    'Perplexity': 'PPL',
    'MeanTokenEntropy': 'MTE',
    'MonteCarloSequenceEntropy': 'MCSE',
    'MonteCarloNormalizedSequenceEntropy': 'MCNSE',
    'LexicalSimilarity_rougeL': 'LSRL',
    'TokenSAR':'TokenSAR'
}

MODELS = {
    'llama': 'Llama 3.1 8B',
    'gemma': 'Gemma 2 9B',
}

DATASETS = {
    'Accuracy': [
        'gsm8k'
    ],
    'AlignScoreInputOutput': [
        'xsum'
    ]
}

METRICS = {
    'Accuracy': 'Accuracy',
    'AlignScoreInputOutput': 'Align Score',
}


def main():
    """Collect raw and detrended PRR scores using a 500-sample quality fit and write them to one CSV.

    For every metric in ``METRICS`` and every model in ``MODELS`` this calls
    ``detrend_ue_w_quality`` with ``quality_fit_sample_size=500`` and reads,
    for each short method label in ``methods_dict``, the entries
    ``"<label>_raw"`` and ``"<label>_detr"`` of the first returned value,
    pairing them positionally with the metric's dataset list (assumed to be
    aligned with it -- confirm against ``detrend_ue_w_quality``).

    Two rows are emitted per (model, dataset, metric, method): the raw score
    and the detrended score (label suffixed with ``-LINE``). The result is
    written to ``results/sum_mr_results_500.csv``; the ``results`` directory
    is created if it does not exist yet.
    """
    rows = []
    for metric_key, metric_name in METRICS.items():
        datasets = DATASETS[metric_key]
        ue_methods = list(methods_dict.values())

        for model_key, model_name in MODELS.items():
            ue_scores, _, _ = detrend_ue_w_quality(
                datasets,
                model_key,
                [metric_key],
                ue_methods,
                methods_dict,
                quality_fit_sample_size=500
            )

            for method_short in methods_dict.values():
                raw_scores = ue_scores[f"{method_short}_raw"]
                detr_scores = ue_scores[f"{method_short}_detr"]

                for dataset_name, raw, detr in zip(datasets, raw_scores, detr_scores):
                    # raw row (method name unchanged)
                    rows.append({
                        "model": model_name,
                        "dataset": dataset_name,
                        "metric": metric_name,
                        "method": method_short,
                        "prr_score": float(raw),
                    })
                    # detrended row (method-LINE)
                    rows.append({
                        "model": model_name,
                        "dataset": dataset_name,
                        "metric": metric_name,
                        "method": f"{method_short}-LINE",
                        "prr_score": float(detr),
                    })

    # write single CSV with all metrics/models/datasets/methods
    df = pd.DataFrame(rows, columns=["model", "dataset", "metric", "method", "prr_score"])
    out_path = Path("results") / "sum_mr_results_500.csv"
    # to_csv does not create missing parent directories, so make sure it exists.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False)
    print(f"Wrote {out_path.resolve()} with {len(df)} rows.")

if __name__ == "__main__":
    main()


In [None]:
import pandas as pd


# Results written with quality_fit_sample_size=500 (results/sum_mr_results_500.csv).
sample_500 = pd.read_csv("results/sum_mr_results_500.csv")

# Companion results file; the tables below label its -LINE rows "Full sample".
sample_full = pd.read_csv("results/sum_mr_results.csv")



import pandas as pd
import numpy as np
from IPython.display import display, Markdown

# expects these to already exist:
# sample_full, sample_500  with columns: model, dataset, metric, method, prr_score

# ---- split out the three variants ----
# Rows whose method ends in "-LINE" are the detrended scores; all other rows
# are raw scores. Raw rows are taken from `sample_full` only.
cols = ["model", "dataset", "metric", "method", "prr_score"]
full_is_line = sample_full["method"].str.endswith("-LINE")
s500_is_line = sample_500["method"].str.endswith("-LINE")
full_raw = sample_full.loc[~full_is_line, cols].copy()
full_detr = sample_full.loc[full_is_line, cols].copy()
s500_detr = sample_500.loc[s500_is_line, cols].copy()

# normalize to base method name (no suffix)
for frame in (full_detr, s500_detr):
    frame["method_base"] = frame["method"].str.replace("-LINE", "", regex=False)
full_raw["method_base"] = full_raw["method"]

# Per-variant tag, display label and within-method sort position.
full_raw["variant"] = "raw"
s500_detr["variant"] = "500"
full_detr["variant"] = "full"

full_raw["method_display"] = full_raw["method_base"]
s500_detr["method_display"] = s500_detr["method_base"] + "-LINE (500 sample)"
full_detr["method_display"] = full_detr["method_base"] + "-LINE (Full sample)"

full_raw["variant_order"] = 0
s500_detr["variant_order"] = 1
full_detr["variant_order"] = 2

combined = pd.concat([full_raw, s500_detr, full_detr], ignore_index=True)

# Preferred row order for the known methods.
method_order = ["MSP", "PPL", "MTE", "MCSE", "MCNSE", "LSRL"]
observed_methods = list(combined["method_base"].dropna().unique())
present_methods = [m for m in method_order if m in observed_methods]
# Methods missing from `method_order` are appended (in order of appearance)
# instead of being left out: `present_methods` is used as the category list of
# a pd.Categorical, and any value outside the categories becomes NaN and
# silently disappears from the pivoted tables.
present_methods += [m for m in observed_methods if m not in method_order]

def highlight_best_second(col: pd.Series):
    """Return one CSS string per cell: bold for the best, underline for the runner-up.

    Columns whose dtype kind is neither float nor signed integer get an
    all-empty style list. "Best" is the largest distinct non-NaN value and
    "second" the next largest distinct one; cells are matched with
    ``np.isclose`` so values equal up to floating-point noise share a style.
    NaN cells are never styled.
    """
    if col.dtype.kind not in "fi":
        return ['' for _ in range(len(col))]

    numeric = col.astype(float)
    # Distinct non-NaN values, largest first.
    ranked = np.unique(numeric[numeric.notna()])[::-1]
    top = ranked[0] if ranked.size > 0 else np.nan
    runner_up = ranked[1] if ranked.size > 1 else np.nan
    has_runner_up = not np.isnan(runner_up)

    def css_for(value):
        if pd.isna(value):
            return ''
        if np.isclose(value, top):
            return 'font-weight: bold;'
        if has_runner_up and np.isclose(value, runner_up):
            return 'text-decoration: underline;'
        return ''

    return [css_for(value) for value in numeric]

# Render one styled table per model: rows are the method variants, columns are
# the datasets, cells are the mean prr_score of the matching rows.
for model in combined["model"].unique():
    block = combined[combined["model"] == model].copy()

    dataset_order = sorted(block["dataset"].unique())

    # Ordered categorical so rows follow `present_methods` rather than
    # alphabetical order; `variant_order` then sorts the variants of a method.
    block["method_base"] = pd.Categorical(block["method_base"], categories=present_methods, ordered=True)
    block = block.sort_values(["method_base", "variant_order"])

    # observed=True: `method_base` is categorical, and leaving `observed`
    # unset makes recent pandas versions emit a FutureWarning about the
    # changing default. Unobserved category combinations were already dropped
    # (dropna=True), so the resulting table is the same.
    piv = (block
           .pivot_table(index=["method_base", "variant_order", "method_display"],
                        columns="dataset", values="prr_score", aggfunc="mean",
                        observed=True)
           .reindex(columns=dataset_order)
           .sort_index(level=[0, 1])
          )

    # Keep only the human-readable label as the row index.
    piv = piv.reset_index().set_index("method_display").drop(columns=["method_base", "variant_order"])
    styled = (piv.round(4)
                 .style
                 .apply(highlight_best_second, axis=0)
                 .set_caption(model))

    display(Markdown(f"### {model}"))
    display(styled)



  piv = (block


### Llama 3.1 8B

dataset,gsm8k,xsum
method_display,Unnamed: 1_level_1,Unnamed: 2_level_1
MSP,0.3244,0.3276
MSP-LINE (500 sample),0.3318,0.3574
MSP-LINE (Full sample),0.3289,0.3569
PPL,0.3026,0.3691
PPL-LINE (500 sample),0.3772,0.3661
PPL-LINE (Full sample),0.3765,0.3656
MTE,0.3393,0.3569
MTE-LINE (500 sample),0.3914,0.3537
MTE-LINE (Full sample),0.3998,0.3504
MCSE,0.3512,0.0326


  piv = (block


### Gemma 2 9B

dataset,gsm8k,xsum
method_display,Unnamed: 1_level_1,Unnamed: 2_level_1
MSP,0.3028,0.3507
MSP-LINE (500 sample),0.2947,0.3782
MSP-LINE (Full sample),0.2964,0.3777
PPL,0.2483,0.3541
PPL-LINE (500 sample),0.357,0.372
PPL-LINE (Full sample),0.3576,0.3726
MTE,0.2915,0.3332
MTE-LINE (500 sample),0.3992,0.3556
MTE-LINE (Full sample),0.3986,0.356
MCSE,0.3933,0.0031


# Results with polynomial fits of degree 1-3

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
from utils import detrend_ue_degreed 
from pathlib import Path

# Full uncertainty-estimation method name -> short label used in score keys
# and result tables.
methods_dict = {
    'MaximumSequenceProbability': 'MSP',
    'Perplexity': 'PPL',
    'MeanTokenEntropy': 'MTE',
    'MonteCarloSequenceEntropy': 'MCSE',
    'MonteCarloNormalizedSequenceEntropy': 'MCNSE',
    'LexicalSimilarity_rougeL': 'LSRL',
}

# Model key -> display name.
MODELS = {
    'llama': 'Llama 3.1 8B',
    'gemma': 'Gemma 2 9B',
    'eurollm': 'EuroLLM 9B',
}

# Quality-metric key -> display name.
METRICS = {
    'metricx-metricx-24-hybrid-xxl-v2p6': 'MetricX XXL',
    'XComet-XCOMET-XXL': 'XComet XXL',
    'Comet-wmt22-comet-da': 'Comet WMT22',
}

# Every metric is evaluated on the same eight WMT datasets.
_WMT_DATASETS = (
    'wmt14_csen', 'wmt14_deen', 'wmt14_ruen', 'wmt14_fren',
    'wmt19_deen', 'wmt19_fien', 'wmt19_lten', 'wmt19_ruen',
)

# Quality-metric key -> datasets scored with it (an independent list per metric).
DATASETS = {metric_key: list(_WMT_DATASETS) for metric_key in METRICS}

def main():
    """Flatten per-degree scores and fit coefficients into long-format CSVs.

    For every metric in ``METRICS`` and every model in ``MODELS`` this calls
    ``detrend_ue_degreed`` and collects, per dataset and per method:

    * the scores stored under ``<short>_raw`` and ``<short>_deg1..3``, tagged
      with correction ``RAW`` / ``DEG1`` / ``DEG2`` / ``DEG3``;
    * the flattened coefficient arrays stored under ``<short>_deg1..3``.

    Keys missing from the returned dicts are skipped. Two files are written
    under ``results/`` (created if needed):

    * ``mt_polynomials_results.csv`` -- model, dataset, metric, method,
      correction, prr_score
    * ``mt_polynomials_coefs.csv`` -- model, dataset, metric, method, degree,
      coef_idx, coef_val

    NOTE(review): assumes every list in ``ue_scores`` / ``ue_coefs`` is aligned
    with ``datasets``; ``zip`` silently truncates otherwise -- confirm in utils.
    """
    rows = []  # columns: model, dataset, metric, method, correction, prr_score
    coef_rows = []  # columns: model, dataset, metric, method, degree, coef_idx, coef_val

    # (score-key suffix, correction label written to the CSV)
    corrections = (("raw", "RAW"), ("deg1", "DEG1"), ("deg2", "DEG2"), ("deg3", "DEG3"))

    for metric_key, metric_name in METRICS.items():
        datasets = DATASETS[metric_key]
        ue_methods = list(methods_dict.values())

        for model_key, model_name in MODELS.items():
            ue_scores, ue_coefs, _ = detrend_ue_degreed(
                datasets,
                model_key,
                [metric_key],
                ue_methods,
                methods_dict,
            )

            for method_short in methods_dict.values():
                # Flatten scores: each key maps to a list aligned with `datasets`.
                for suffix, corr in corrections:
                    scores = ue_scores.get(f"{method_short}_{suffix}", [])
                    for dataset_name, score in zip(datasets, scores):
                        rows.append({
                            "model": model_name,
                            "dataset": dataset_name,
                            "metric": metric_name,
                            "method": method_short,
                            "correction": corr,
                            "prr_score": float(score),
                        })

                # Flatten the regression coefficients per degree.
                for deg in (1, 2, 3):
                    coef_list = ue_coefs.get(f"{method_short}_deg{deg}", [])
                    # coef_list has one coef array per dataset (matching order in `datasets`)
                    for dataset_name, coef_arr in zip(datasets, coef_list):
                        # Arrays may hold one or several coefficients; ravel handles both.
                        for i, c in enumerate(np.ravel(coef_arr)):
                            coef_rows.append({
                                "model": model_name,
                                "dataset": dataset_name,
                                "metric": metric_name,
                                "method": method_short,
                                "degree": deg,
                                "coef_idx": i,
                                "coef_val": float(c),
                            })

    out_dir = Path("results")
    out_dir.mkdir(parents=True, exist_ok=True)

    # Write the main experimental results
    df = pd.DataFrame(rows, columns=["model", "dataset", "metric", "method", "correction", "prr_score"])
    out_path = out_dir / "mt_polynomials_results.csv"
    df.to_csv(out_path, index=False)
    print(f"Wrote {out_path.resolve()} with {len(df)} rows.")

    # Write the fit coefficients too; previously they were collected and then dropped.
    coef_df = pd.DataFrame(
        coef_rows,
        columns=["model", "dataset", "metric", "method", "degree", "coef_idx", "coef_val"],
    )
    coef_path = out_dir / "mt_polynomials_coefs.csv"
    coef_df.to_csv(coef_path, index=False)
    print(f"Wrote {coef_path.resolve()} with {len(coef_df)} rows.")

if __name__ == "__main__":
    main()


In [None]:
import pandas as pd

# Per-degree results table (model, dataset, metric, method, correction, prr_score).
df = pd.read_csv("results/mt_polynomials_results.csv")


CORR_ORDER = ["RAW", "DEG1", "DEG2", "DEG3"]
CORR_ORDER_MAP = {c: i for i, c in enumerate(CORR_ORDER)}

def format_table_for_model_metric(df, model_name, metric_name):
    sub = df[(df["model"] == model_name) & (df["metric"] == metric_name)].copy()
    if sub.empty:
        return None

    sub["corr_order"] = sub["correction"].map(CORR_ORDER_MAP).fillna(999).astype(int)
    sub = sub.sort_values(["method", "corr_order"])
    sub["method_corr"] = sub["method"] + "-" + sub["correction"]

    pivot = sub.pivot_table(
        index="method_corr",
        columns="dataset",
        values="prr_score",
        aggfunc="mean"
    ).sort_index(key=lambda s: s.map(
        lambda x: (x.split("-", 1)[0], CORR_ORDER_MAP.get(x.split("-", 1)[1], 999))
    ))

    def style_top2(col: pd.Series):
        styles = [''] * len(col)
        valid = col.dropna()
        if valid.empty:
            return styles

        top_val = valid.max()
        second_candidates = valid[valid < top_val]
        second_val = second_candidates.max() if not second_candidates.empty else None

        for i, idx in enumerate(col.index):
            v = col.loc[idx]
            if pd.isna(v):
                continue
            if v == top_val:
                styles[i] = 'font-weight: bold;'
            elif second_val is not None and v == second_val:
                styles[i] = 'text-decoration: underline;'
        return styles

    styled = pivot.style.format("{:.2f}").apply(style_top2, axis=0)
    return styled

# Distinct, non-null model and metric names in order of first appearance.
models = df["model"].dropna().unique()
metrics = df["metric"].dropna().unique()

# One table per (model, metric) pair; model is the outer loop.
for model, metric in ((mo, me) for mo in models for me in metrics):
    print(f"\n=== {model} | {metric} ===")
    st = format_table_for_model_metric(df, model, metric)
    if st is None:
        print("(no data)")
        continue
    display(st)



=== Llama 3.1 8B | MetricX XXL ===


dataset,wmt14_csen,wmt14_deen,wmt14_fren,wmt14_ruen,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method_corr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LSRL-RAW,0.39,0.35,0.3,0.37,0.36,0.32,0.42,0.31
LSRL-DEG1,0.38,0.35,0.29,0.38,0.35,0.33,0.4,0.33
LSRL-DEG2,0.4,0.36,0.27,0.28,0.35,0.35,0.42,0.28
LSRL-DEG3,0.4,0.34,0.31,0.38,0.35,0.34,0.42,0.32
MCNSE-RAW,0.42,0.38,0.32,0.4,0.38,0.39,0.44,0.32
MCNSE-DEG1,0.42,0.38,0.34,0.43,0.39,0.39,0.44,0.36
MCNSE-DEG2,0.38,0.39,0.28,0.35,0.38,0.39,0.43,0.31
MCNSE-DEG3,0.45,0.39,0.34,0.43,0.39,0.4,0.46,0.35
MCSE-RAW,0.16,0.14,0.16,0.2,0.14,-0.0,0.07,0.21
MCSE-DEG1,0.29,0.29,0.25,0.3,0.32,0.29,0.29,0.27



=== Llama 3.1 8B | XComet XXL ===


dataset,wmt14_csen,wmt14_deen,wmt14_fren,wmt14_ruen,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method_corr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LSRL-RAW,0.34,0.35,0.27,0.29,0.3,0.3,0.32,0.3
LSRL-DEG1,0.33,0.31,0.24,0.38,0.25,0.33,0.34,0.33
LSRL-DEG2,0.35,0.35,0.21,0.2,0.27,0.35,0.35,0.25
LSRL-DEG3,0.35,0.27,0.27,0.32,0.28,0.34,0.35,0.32
MCNSE-RAW,0.36,0.34,0.27,0.32,0.33,0.38,0.43,0.32
MCNSE-DEG1,0.36,0.35,0.31,0.39,0.35,0.38,0.42,0.4
MCNSE-DEG2,0.31,0.39,0.23,0.27,0.31,0.38,0.41,0.31
MCNSE-DEG3,0.39,0.39,0.32,0.36,0.36,0.38,0.42,0.38
MCSE-RAW,0.2,0.29,0.29,0.33,0.24,-0.04,0.06,0.31
MCSE-DEG1,0.26,0.27,0.24,0.28,0.29,0.27,0.28,0.28



=== Llama 3.1 8B | Comet WMT22 ===


dataset,wmt14_csen,wmt14_deen,wmt14_fren,wmt14_ruen,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method_corr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LSRL-RAW,0.45,0.44,0.35,0.38,0.46,0.37,0.42,0.35
LSRL-DEG1,0.41,0.41,0.32,0.44,0.4,0.37,0.4,0.38
LSRL-DEG2,0.45,0.44,0.29,0.31,0.42,0.39,0.41,0.31
LSRL-DEG3,0.46,0.38,0.35,0.43,0.44,0.39,0.41,0.37
MCNSE-RAW,0.48,0.44,0.36,0.4,0.43,0.46,0.48,0.35
MCNSE-DEG1,0.49,0.44,0.39,0.47,0.45,0.46,0.48,0.44
MCNSE-DEG2,0.42,0.46,0.3,0.37,0.42,0.45,0.47,0.37
MCNSE-DEG3,0.53,0.46,0.4,0.46,0.47,0.47,0.49,0.43
MCSE-RAW,0.36,0.32,0.3,0.35,0.36,0.08,0.2,0.36
MCSE-DEG1,0.38,0.36,0.28,0.33,0.38,0.32,0.36,0.32



=== Gemma 2 9B | MetricX XXL ===


dataset,wmt14_csen,wmt14_deen,wmt14_fren,wmt14_ruen,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method_corr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LSRL-RAW,0.35,0.38,0.26,0.36,0.36,0.34,0.34,0.34
LSRL-DEG1,0.35,0.38,0.26,0.37,0.36,0.36,0.31,0.34
LSRL-DEG2,0.34,0.38,0.26,0.33,0.36,0.36,0.31,0.33
LSRL-DEG3,0.36,0.39,0.26,0.36,0.36,0.35,0.3,0.35
MCNSE-RAW,0.38,0.43,0.33,0.4,0.43,0.36,0.32,0.38
MCNSE-DEG1,0.38,0.43,0.33,0.41,0.44,0.36,0.32,0.4
MCNSE-DEG2,0.38,0.43,0.33,0.38,0.43,0.36,0.29,0.38
MCNSE-DEG3,0.38,0.43,0.33,0.4,0.44,0.35,0.33,0.4
MCSE-RAW,0.11,0.15,0.12,0.21,0.2,-0.03,0.17,0.23
MCSE-DEG1,0.31,0.36,0.28,0.33,0.4,0.27,0.25,0.35



=== Gemma 2 9B | XComet XXL ===


dataset,wmt14_csen,wmt14_deen,wmt14_fren,wmt14_ruen,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method_corr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LSRL-RAW,0.26,0.3,0.2,0.3,0.28,0.29,0.19,0.27
LSRL-DEG1,0.25,0.28,0.19,0.38,0.26,0.32,0.16,0.29
LSRL-DEG2,0.24,0.28,0.19,0.26,0.26,0.32,0.16,0.25
LSRL-DEG3,0.26,0.29,0.19,0.31,0.27,0.31,0.16,0.28
MCNSE-RAW,0.28,0.34,0.25,0.33,0.32,0.29,0.19,0.33
MCNSE-DEG1,0.28,0.33,0.27,0.4,0.33,0.29,0.19,0.36
MCNSE-DEG2,0.27,0.33,0.27,0.31,0.31,0.27,0.14,0.33
MCNSE-DEG3,0.27,0.34,0.27,0.34,0.33,0.27,0.21,0.35
MCSE-RAW,0.15,0.3,0.26,0.34,0.27,-0.07,0.09,0.32
MCSE-DEG1,0.23,0.32,0.26,0.34,0.33,0.2,0.12,0.32



=== Gemma 2 9B | Comet WMT22 ===


dataset,wmt14_csen,wmt14_deen,wmt14_fren,wmt14_ruen,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method_corr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LSRL-RAW,0.4,0.47,0.33,0.4,0.43,0.4,0.34,0.34
LSRL-DEG1,0.38,0.46,0.32,0.45,0.41,0.4,0.28,0.35
LSRL-DEG2,0.38,0.46,0.32,0.39,0.41,0.4,0.28,0.34
LSRL-DEG3,0.4,0.47,0.32,0.42,0.42,0.4,0.28,0.35
MCNSE-RAW,0.44,0.5,0.37,0.42,0.47,0.41,0.35,0.37
MCNSE-DEG1,0.44,0.5,0.39,0.48,0.49,0.41,0.35,0.41
MCNSE-DEG2,0.43,0.5,0.39,0.44,0.48,0.4,0.32,0.39
MCNSE-DEG3,0.43,0.5,0.39,0.44,0.5,0.4,0.36,0.4
MCSE-RAW,0.32,0.31,0.28,0.35,0.41,0.09,0.29,0.36
MCSE-DEG1,0.39,0.43,0.35,0.38,0.47,0.31,0.29,0.39



=== EuroLLM 9B | MetricX XXL ===


dataset,wmt14_csen,wmt14_deen,wmt14_fren,wmt14_ruen,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method_corr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LSRL-RAW,0.29,0.34,0.27,0.3,0.31,0.3,0.31,0.2
LSRL-DEG1,0.29,0.34,0.27,0.3,0.3,0.32,0.31,0.21
LSRL-DEG2,0.26,0.34,0.27,0.27,0.3,0.31,0.28,0.2
LSRL-DEG3,0.27,0.33,0.27,0.29,0.3,0.31,0.31,0.22
MCNSE-RAW,0.23,0.33,0.22,0.29,0.28,0.32,0.26,0.21
MCNSE-DEG1,0.21,0.33,0.21,0.28,0.26,0.3,0.25,0.23
MCNSE-DEG2,0.19,0.32,0.2,0.26,0.24,0.28,0.21,0.22
MCNSE-DEG3,0.19,0.32,0.19,0.29,0.23,0.28,0.24,0.24
MCSE-RAW,0.2,0.24,0.16,0.28,0.25,0.1,0.25,0.27
MCSE-DEG1,0.42,0.42,0.36,0.39,0.4,0.36,0.36,0.34



=== EuroLLM 9B | XComet XXL ===


dataset,wmt14_csen,wmt14_deen,wmt14_fren,wmt14_ruen,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method_corr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LSRL-RAW,0.23,0.28,0.23,0.22,0.28,0.24,0.18,0.18
LSRL-DEG1,0.23,0.28,0.23,0.26,0.28,0.26,0.18,0.22
LSRL-DEG2,0.19,0.27,0.23,0.17,0.28,0.25,0.16,0.19
LSRL-DEG3,0.2,0.27,0.23,0.22,0.27,0.25,0.19,0.22
MCNSE-RAW,0.19,0.28,0.17,0.23,0.26,0.3,0.22,0.21
MCNSE-DEG1,0.18,0.28,0.17,0.29,0.25,0.28,0.19,0.26
MCNSE-DEG2,0.15,0.27,0.16,0.19,0.22,0.25,0.14,0.21
MCNSE-DEG3,0.15,0.27,0.15,0.25,0.22,0.23,0.18,0.24
MCSE-RAW,0.19,0.31,0.26,0.35,0.3,-0.01,0.08,0.34
MCSE-DEG1,0.31,0.36,0.33,0.38,0.38,0.28,0.2,0.32



=== EuroLLM 9B | Comet WMT22 ===


dataset,wmt14_csen,wmt14_deen,wmt14_fren,wmt14_ruen,wmt19_deen,wmt19_fien,wmt19_lten,wmt19_ruen
method_corr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LSRL-RAW,0.32,0.38,0.29,0.31,0.4,0.35,0.32,0.26
LSRL-DEG1,0.32,0.37,0.29,0.36,0.41,0.35,0.31,0.29
LSRL-DEG2,0.29,0.37,0.29,0.31,0.4,0.35,0.3,0.28
LSRL-DEG3,0.3,0.37,0.29,0.32,0.4,0.35,0.31,0.3
MCNSE-RAW,0.23,0.36,0.22,0.28,0.34,0.36,0.28,0.24
MCNSE-DEG1,0.22,0.36,0.22,0.33,0.34,0.35,0.28,0.29
MCNSE-DEG2,0.2,0.35,0.21,0.28,0.31,0.33,0.25,0.28
MCNSE-DEG3,0.2,0.35,0.19,0.31,0.31,0.32,0.27,0.29
MCSE-RAW,0.35,0.36,0.28,0.42,0.41,0.21,0.34,0.42
MCSE-DEG1,0.46,0.47,0.4,0.46,0.49,0.4,0.37,0.39
