In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict


In [2]:
methods_dict = {
    'MaximumSequenceProbability': 'MSP',
    'Perplexity': 'PPL',
    'MeanTokenEntropy': 'MTE',
    'MonteCarloSequenceEntropy': 'MCSE',
    'MonteCarloNormalizedSequenceEntropy': 'MCNSE',
        'LexicalSimilarity_rougeL': 'LSRL',
        'TokenSAR':'TokenSAR'

}

DATASETS_MT = [
    'wmt14_deen',
    'wmt14_fren',
    'wmt14_csen',
    'wmt14_ruen',
    'wmt19_ruen',
    'wmt19_fien',
    'wmt19_deen',
    'wmt19_lten'
]

all_metrics_mt = ['Comet-wmt22-comet-da', 'XComet-XCOMET-XXL', 'metricx-metricx-24-hybrid-large-v2p6']
all_methods =['MSP', 'PPL', 'MTE', 'MCSE', 'MCNSE', 'LSRL','TokenSAR']


metrics_dict ={
  'Comet':'Comet', 'XComet-XCOMET-XXL' :'XComet-XXL', 'metricx-metricx-24-hybrid-large-v2p6' :'MetricX-Large' ,
  'AlignScoreInputOutput':'Align Score', 'Accuracy':'Acc', 'AlignScoreInputOutput':'Align Score','Rouge_rougeL':'Rouge L', 'Comet-wmt22-comet-da':'Comet',
    'MSP':'MSP', 'PPL' :'PPL', 'MTE' :'MTE',  'MCSE':'MCSE', 'MCNSE':'MCNSE', 'LSRL': 'LSRL','TokenSAR':'TokenSAR'}

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import linregress
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

sns.set(style="whitegrid", font_scale=1.4, rc={"font.family": "serif"})


def format_dataset_name(raw_name):
    try:
        prefix, lang_pair = raw_name.split("_")
        prefix = prefix.upper()

        if len(lang_pair) == 4:  # e.g., fren → Fr-En
            src = lang_pair[:2].capitalize()
            tgt = lang_pair[2:].capitalize()
            lang_fmt = f"{src}-{tgt}"
        else:
            lang_fmt = lang_pair.upper()

        return f"{prefix} {lang_fmt}"
    except Exception:
        return raw_name.upper()


def plot_metric_vs_length(
    gen_lengths, metric_values,
    metric_name, dataset_name, save_path='plot.pdf', model='llama', task ='nmt'
):

    # Trim outliers
    upper_q, lower_q = np.quantile(gen_lengths, [0.95, 0.05])
    mask = (gen_lengths > lower_q) & (gen_lengths < upper_q)
    gen_lengths = gen_lengths[mask]
    metric_values = metric_values[mask]

    # Normalize
    scaler_len = MinMaxScaler()
    scaler_val = MinMaxScaler()

    norm_len = scaler_len.fit_transform(gen_lengths[:, None]).squeeze()
    norm_val = scaler_val.fit_transform(metric_values[:, None]).squeeze()

    # Bin and smooth
    df = pd.DataFrame({"length": norm_len, "metric": norm_val})
    grouped = df.groupby("length").agg(['mean', 'sem'])
    x_vals = grouped.index.values
    y_vals = grouped['metric']['mean'].values
    y_errs = grouped['metric']['sem'].values

    # Fit regression (on raw normalized data)
    linreg = LinearRegression().fit(norm_len[:, None], norm_val)
    slope = linreg.coef_[0]

    # Compute p-value
    slope_, intercept_, r_val, p_val, std_err = linregress(norm_len, norm_val)

    x_line = np.linspace(0, 1, 100)
    y_line = linreg.predict(x_line[:, None])

    # Plot
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(x_vals, y_vals, label='AVG metric value', color="navy")
    ax.fill_between(x_vals, y_vals - y_errs, y_vals + y_errs, alpha=0.2, color="navy")
    ax.plot(x_line, y_line, linestyle='--', color='crimson', label='Regression Line')


    if task=='nmt':
        pretty_dataset = format_dataset_name(dataset_name)
    else:
        pretty_dataset = dataset_name.capitalize()
    ax.set_title(f"{metrics_dict[metric_name]} vs. Length ({pretty_dataset})", fontsize=14)
    ax.set_xlabel("Generated sequence length (normalized)")
    ax.set_ylabel(f"{metrics_dict[metric_name]} (normalized)")
    plt.tight_layout()
    plt.savefig(save_path, dpi=300)


# Metric trends plots (Translation)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from utils import extract_and_prepare_data
import numpy as np
import os 

models =['llama','gemma','eurollm']
for model in models:
    for dataset in DATASETS_MT:
        train_ue_values, test_ue_values, train_metric_values, test_metric_values, train_gen_lengths, gen_lengths = extract_and_prepare_data(dataset, methods_dict, all_metrics_mt, model=model)

        for metric in all_metrics_mt:
            os.makedirs(f'plots', exist_ok=True)

            plot_metric_vs_length(
                gen_lengths=np.array(train_gen_lengths),
                metric_values=np.array(train_metric_values[metric]),
                metric_name=metric,
                dataset_name=dataset,
                save_path=f'plots/{model}_{dataset}_{metric}_train.pdf',
            )


# Metric trends plots (Summarization)

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import os 

DATASETS_SUM =['xsum']
all_metrics_sum = ['AlignScoreInputOutput']
models_sum =['llama','gemma']

for model in models_sum:
    for dataset in DATASETS_SUM:
        train_ue_values, test_ue_values, train_metric_values, test_metric_values, train_gen_lengths, gen_lengths = extract_and_prepare_data(dataset, methods_dict, all_metrics_sum, model=model)

        for metric in all_metrics_sum:
            os.makedirs(f'plots', exist_ok=True)

            plot_metric_vs_length(
                gen_lengths=np.array(train_gen_lengths),
                metric_values=np.array(train_metric_values[metric]),
                metric_name=metric,
                dataset_name=dataset,
                save_path=f'plots/{dataset}_{metric}_{model}_train.pdf',
                task='sum'
            )


# Metric trends plots (Mathematical Reasoning)

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import os 
DATASETS_MR =['gsm8k']
all_metrics_mr=['Accuracy']
models_mr =['llama','gemma']
for model in models_mr:
    for dataset in DATASETS_MR:
        train_ue_values, test_ue_values, train_metric_values, test_metric_values, train_gen_lengths, gen_lengths = extract_and_prepare_data(dataset, methods_dict, all_metrics_mr, model=model)

        for metric in all_metrics_mr:
            os.makedirs(f'plots', exist_ok=True)

            plot_metric_vs_length(
                gen_lengths=np.array(train_gen_lengths),
                metric_values=np.array(train_metric_values[metric]),
                metric_name=metric,
                dataset_name=dataset,
                save_path=f'plots/{dataset}_{metric}_{model}_train.pdf',
                task='mr'
            )


# UE metrics trends (Translation)

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import os

methods_dict = {
    'MaximumSequenceProbability': 'MSP',
    'Perplexity': 'PPL',
    'MeanTokenEntropy': 'MTE',
    'MonteCarloSequenceEntropy': 'MCSE',
    'MonteCarloNormalizedSequenceEntropy': 'MCNSE',
    'LexicalSimilarity_rougeL': 'LSRL',
    'TokenSAR':'TokenSAR'
}

for model in models:
    for dataset in DATASETS_MT:
        train_ue_values, test_ue_values, train_metric_values, test_metric_values, train_gen_lengths, gen_lengths = extract_and_prepare_data(dataset, methods_dict, all_metrics_mt, model=model)
        
        for metric, metric_short  in methods_dict.items():
            os.makedirs(f'plots', exist_ok=True)
            plot_metric_vs_length(
                gen_lengths=np.array(train_gen_lengths),
                metric_values=np.array(train_ue_values[metric_short]),
                metric_name=metric_short,
                dataset_name=dataset,
                save_path=f'plots/{dataset}_{metric}_{model}_train.pdf',
            )

# UE metrics trends (Summarization)

In [None]:

for model in models_sum:
    for dataset in DATASETS_SUM:
        train_ue_values, test_ue_values, train_metric_values, test_metric_values, train_gen_lengths, gen_lengths = extract_and_prepare_data(dataset, methods_dict, all_metrics_sum, model=model)

        for metric, metric_short  in methods_dict.items():
            os.makedirs(f'plots', exist_ok=True)
            plot_metric_vs_length(
                gen_lengths=np.array(train_gen_lengths),
                metric_values=np.array(train_ue_values[metric_short]),
                metric_name=metric_short,
                dataset_name=dataset,
                save_path=f'plots/{dataset}_{metric}_{model}_train.pdf',
                task='sum'
            )

# UE metrics trends (Mathematical Reasoning)

In [None]:

for model in models_mr:
    for dataset in DATASETS_MR:
        train_ue_values, test_ue_values, train_metric_values, test_metric_values, train_gen_lengths, gen_lengths = extract_and_prepare_data(dataset, methods_dict, all_metrics_mr, model=model)

        for metric, metric_short  in methods_dict.items():
            os.makedirs(f'plots', exist_ok=True)
            plot_metric_vs_length(
                gen_lengths=np.array(train_gen_lengths),
                metric_values=np.array(train_ue_values[metric_short]),
                metric_name=metric_short,
                dataset_name=dataset,
                save_path=f'plots/{dataset}_{metric}_{model}_train.pdf',
                task='mr'
            )

# Tables with slopes and coefficients

In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict


In [14]:
methods_dict = {
    'MaximumSequenceProbability': 'MSP',
    'Perplexity': 'PPL',
    'MeanTokenEntropy': 'MTE',
    'MonteCarloSequenceEntropy': 'MCSE',
    'MonteCarloNormalizedSequenceEntropy': 'MCNSE',
    'LexicalSimilarity_rougeL': 'LSRL',
    'TokenSAR':'TokenSAR'
}

DATASETS_MT = [
    'wmt14_deen',
    'wmt14_fren',
    'wmt14_csen',
    'wmt14_ruen',
    'wmt19_ruen',
    'wmt19_fien',
    'wmt19_deen',
    'wmt19_lten'
]

all_metrics_mt = ['Comet-wmt22-comet-da', 'XComet-XCOMET-XXL', 'metricx-metricx-24-hybrid-large-v2p6']
all_methods =['MSP', 'PPL', 'MTE', 'MCSE', 'MCNSE', 'LSRL', 'TokenSAR']


metrics_dict ={
  'Comet':'Comet', 'XComet-XCOMET-XXL' :'XComet-XXL', 'metricx-metricx-24-hybrid-large-v2p6' :'MetricX-Large' ,
  'AlignScoreInputOutput':'Align Score', 'Accuracy':'Acc', 'AlignScoreInputOutput':'Align Score','Rouge_rougeL':'Rouge L', 'Comet-wmt22-comet-da':'Comet',
    'MSP':'MSP', 'PPL' :'PPL', 'MTE' :'MTE',  'MCSE':'MCSE', 'MCNSE':'MCNSE', 'LSRL': 'LSRL'}


In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from scipy.stats import linregress

def compute_length_metric_correlation(gen_lengths, metric_values, trim_quantiles=(0.05, 0.95)):
    
    gen_lengths = np.asarray(gen_lengths)
    metric_values = np.asarray(metric_values)

    # --- Trim outliers ---
    lower_q, upper_q = np.quantile(gen_lengths, [trim_quantiles[0], trim_quantiles[1]])
    mask = (gen_lengths > lower_q) & (gen_lengths < upper_q)
    gen_lengths = gen_lengths[mask]
    metric_values = metric_values[mask]

    # --- Normalize ---
    scaler_len = MinMaxScaler()
    scaler_val = MinMaxScaler()
    norm_len = scaler_len.fit_transform(gen_lengths[:, None]).squeeze()
    norm_val = scaler_val.fit_transform(metric_values[:, None]).squeeze()

    # --- Linear regression ---
    linreg = LinearRegression().fit(norm_len[:, None], norm_val)
    slope = float(linreg.coef_[0])

    # SciPy linregress for p-value and correlation
    slope_, intercept_, r_val, p_val, std_err = linregress(norm_len, norm_val)

    return slope, p_val, r_val




In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import os

methods_dict = {
    'MaximumSequenceProbability': 'MSP',
    'Perplexity': 'PPL',
    'MeanTokenEntropy': 'MTE',
    'MonteCarloSequenceEntropy': 'MCSE',
    'MonteCarloNormalizedSequenceEntropy': 'MCNSE',
    'LexicalSimilarity_rougeL': 'LSRL',
    'TokenSAR':'TokenSAR'
}
import numpy as np
import pandas as pd


rows = []  

for model in models:
    for dataset in DATASETS_MT:
        (train_ue_values, test_ue_values,
         train_metric_values, test_metric_values,
         train_gen_lengths, gen_lengths) = extract_and_prepare_data(
            dataset, methods_dict, all_metrics_mt, model=model
        )

        for long_name, short_name in methods_dict.items():
            slope, p_val, r_val = compute_length_metric_correlation(
                gen_lengths=np.array(train_gen_lengths),
                metric_values=np.array(train_ue_values[short_name]),
            )
            rows.append({
                "model": model,
                "dataset": dataset,
                "category": "ue",         
                "name": short_name,        
                "slope": float(slope),
                "p_value": float(p_val),
                "r_value": float(r_val),
            })

        for metric in all_metrics_mt:
            slope, p_val, r_val = compute_length_metric_correlation(
                gen_lengths=np.array(train_gen_lengths),
                metric_values=np.array(train_metric_values[metric]),
            )
            rows.append({
                "model": model,
                "dataset": dataset,
                "category": "metric",      # e.g., MetricX, COMET...
                "name": metric,
                "slope": float(slope),
                "p_value": float(p_val),
                "r_value": float(r_val),
            })

# Save one CSV
df = pd.DataFrame(rows)
out_path = "results/length_vs_ue_corr.csv"
df.to_csv(out_path, index=False)
print(f"Wrote {out_path} with {len(df)} rows.")




Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4e76c6da0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f98151e0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4ebdc4be0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4eb7c8a00>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f175d780>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4e74df520>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4ea5cf6a0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4eb6f82b0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4eba766b0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f06852d0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4e6cbf970>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4e79d8c10>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4ebb3b520>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4e79dba60>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f100bd00>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4e79d9540>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4eba77550>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4eb208190>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f8b0f520>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f0452710>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4ea565060>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4e74b8700>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4eba75780>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4eb79d3c0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f04530a0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4ebb3b040>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4e74bb610>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f8b0f1f0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f8b0d0f0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f0451f60>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f06878e0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f0453130>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f0686560>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4e79d8310>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f8b0feb0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4eba75510>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4e6cbe620>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f1008cd0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4eb79c790>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4eb5d34c0>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4eb5d3790>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4ea565030>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4f0684430>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4e6cbd750>]


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7ff4e74eca60>]


In [None]:
import pandas as pd

# Load the CSV
corr = pd.read_csv("results/length_vs_ue_corr.csv")

method_order = list(methods_dict.values())  
corr_ue = corr[corr["category"] == "ue"].copy()

piv_slope = corr_ue.pivot_table(index=["model", "dataset"], columns="name", values="slope", aggfunc="first")
piv_pval  = corr_ue.pivot_table(index=["model", "dataset"], columns="name", values="p_value", aggfunc="first")

piv_slope = piv_slope.reindex(columns=method_order)
piv_pval  = piv_pval.reindex(columns=method_order)

cols = []
parts = []
for m in method_order:
    cols.extend([(m, "slope"), (m, "p-val")])
    parts.append(piv_slope[m])
    parts.append(piv_pval[m])

table = pd.concat(parts, axis=1)
table.columns = pd.MultiIndex.from_tuples(cols, names=["Method", ""])

table = table.round({col: 3 for col in table.columns})  

for model_name in table.index.get_level_values("model").unique():
    print(f"\n=== {model_name} ===")
    display(table.loc[model_name])  



=== eurollm ===


Method,MSP,MSP,PPL,PPL,MTE,MTE,MCSE,MCSE,MCNSE,MCNSE,LSRL,LSRL
Unnamed: 0_level_1,slope,p-val,slope,p-val,slope,p-val,slope,p-val,slope,p-val,slope,p-val
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
wmt14_csen,0.299,0.0,-0.056,0.0,-0.091,0.0,0.424,0.0,-0.057,0.0,-0.014,0.426
wmt14_deen,0.359,0.0,-0.083,0.0,-0.117,0.0,0.34,0.0,-0.025,0.109,0.011,0.553
wmt14_fren,0.368,0.0,-0.121,0.0,-0.141,0.0,0.169,0.0,-0.042,0.002,0.0,0.988
wmt14_ruen,0.266,0.0,-0.136,0.0,-0.195,0.0,0.227,0.0,-0.171,0.0,-0.164,0.0
wmt19_deen,0.203,0.0,-0.043,0.002,-0.106,0.0,0.265,0.0,-0.071,0.0,-0.015,0.433
wmt19_fien,0.462,0.0,-0.037,0.001,-0.075,0.0,0.312,0.0,-0.037,0.003,0.048,0.009
wmt19_lten,0.318,0.0,-0.057,0.0,-0.118,0.0,0.282,0.0,-0.07,0.0,0.023,0.232
wmt19_ruen,0.451,0.0,-0.083,0.0,-0.137,0.0,0.363,0.0,-0.115,0.0,-0.092,0.0



=== gemma ===


Method,MSP,MSP,PPL,PPL,MTE,MTE,MCSE,MCSE,MCNSE,MCNSE,LSRL,LSRL
Unnamed: 0_level_1,slope,p-val,slope,p-val,slope,p-val,slope,p-val,slope,p-val,slope,p-val
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
wmt14_csen,0.551,0.0,-0.049,0.0,-0.1,0.0,0.587,0.0,0.017,0.226,0.045,0.001
wmt14_deen,0.395,0.0,-0.076,0.0,-0.118,0.0,0.486,0.0,0.009,0.526,0.046,0.001
wmt14_fren,0.485,0.0,-0.088,0.0,-0.138,0.0,0.567,0.0,-0.052,0.0,0.028,0.035
wmt14_ruen,0.377,0.0,-0.175,0.0,-0.236,0.0,0.485,0.0,-0.137,0.0,-0.167,0.0
wmt19_deen,0.376,0.0,-0.029,0.008,-0.092,0.0,0.322,0.0,-0.028,0.029,0.048,0.001
wmt19_fien,0.506,0.0,-0.004,0.79,-0.057,0.0,0.507,0.0,0.003,0.836,0.085,0.0
wmt19_lten,0.567,0.0,-0.019,0.029,-0.065,0.0,0.398,0.0,0.001,0.962,0.09,0.0
wmt19_ruen,0.631,0.0,-0.055,0.0,-0.106,0.0,0.591,0.0,-0.066,0.0,-0.025,0.084



=== llama ===


Method,MSP,MSP,PPL,PPL,MTE,MTE,MCSE,MCSE,MCNSE,MCNSE,LSRL,LSRL
Unnamed: 0_level_1,slope,p-val,slope,p-val,slope,p-val,slope,p-val,slope,p-val,slope,p-val
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
wmt14_csen,0.467,0.0,-0.07,0.0,-0.083,0.0,0.449,0.0,-0.004,0.757,0.046,0.002
wmt14_deen,0.445,0.0,-0.076,0.0,-0.113,0.0,0.428,0.0,-0.006,0.666,0.059,0.0
wmt14_fren,0.519,0.0,-0.079,0.0,-0.102,0.0,0.477,0.0,-0.048,0.0,0.047,0.001
wmt14_ruen,0.329,0.0,-0.069,0.0,-0.118,0.0,0.387,0.0,-0.088,0.0,-0.153,0.0
wmt19_deen,0.447,0.0,-0.046,0.0,-0.13,0.0,0.315,0.0,-0.025,0.064,0.087,0.0
wmt19_fien,0.478,0.0,-0.014,0.15,-0.077,0.0,0.385,0.0,-0.004,0.736,0.081,0.0
wmt19_lten,0.426,0.0,-0.046,0.0,-0.081,0.0,0.453,0.0,-0.005,0.734,0.086,0.0
wmt19_ruen,0.455,0.0,-0.09,0.0,-0.142,0.0,0.357,0.0,-0.098,0.0,-0.062,0.0


In [None]:
import pandas as pd

corr = pd.read_csv("results/length_vs_ue_corr.csv")

METRICS_PRETTY = {
    "metricx-metricx-24-hybrid-large-v2p6": "MetricX XXL",
    "XComet-XCOMET-XXL": "XComet XXL",
    "Comet-wmt22-comet-da": "Comet WMT22",
}
corr["metric_pretty"] = corr["name"].map(METRICS_PRETTY).fillna(corr["name"])

corr_metrics = corr[corr["category"] == "metric"].copy()

if METRICS_PRETTY:
    metric_order = [METRICS_PRETTY[k] for k in METRICS_PRETTY if k in corr["name"].unique()]
else:
    metric_order = sorted(corr_metrics["metric_pretty"].unique())

piv_slope = corr_metrics.pivot_table(
    index=["model", "dataset"], columns="metric_pretty", values="slope", aggfunc="first"
).reindex(columns=metric_order)

piv_pval = corr_metrics.pivot_table(
    index=["model", "dataset"], columns="metric_pretty", values="p_value", aggfunc="first"
).reindex(columns=metric_order)

cols = []
parts = []
for m in metric_order:
    cols.extend([(m, "slope"), (m, "p-val")])
    parts.append(piv_slope[m])
    parts.append(piv_pval[m])

metric_table = pd.concat(parts, axis=1)
metric_table.columns = pd.MultiIndex.from_tuples(cols, names=["Metric", ""])
metric_table = metric_table.round(3)

# Show one block per model (rows = datasets)
for model_name in metric_table.index.get_level_values("model").unique():
    print(f"\n=== {model_name} ===")
    display(metric_table.loc[model_name])



=== eurollm ===


Metric,MetricX XXL,MetricX XXL,XComet XXL,XComet XXL,Comet WMT22,Comet WMT22
Unnamed: 0_level_1,slope,p-val,slope,p-val,slope,p-val
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
wmt14_csen,0.027,0.04,-0.011,0.572,-0.047,0.001
wmt14_deen,0.002,0.929,-0.035,0.157,-0.016,0.413
wmt14_fren,0.007,0.666,-0.034,0.152,-0.029,0.042
wmt14_ruen,-0.058,0.001,-0.031,0.25,-0.101,0.0
wmt19_deen,-0.05,0.0,-0.096,0.0,-0.156,0.0
wmt19_fien,0.104,0.0,0.17,0.0,0.007,0.648
wmt19_lten,0.026,0.078,0.156,0.0,-0.027,0.053
wmt19_ruen,0.049,0.0,0.135,0.0,-0.034,0.003



=== gemma ===


Metric,MetricX XXL,MetricX XXL,XComet XXL,XComet XXL,Comet WMT22,Comet WMT22
Unnamed: 0_level_1,slope,p-val,slope,p-val,slope,p-val
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
wmt14_csen,0.035,0.013,-0.047,0.018,-0.033,0.018
wmt14_deen,0.001,0.936,-0.053,0.027,-0.023,0.251
wmt14_fren,0.016,0.302,-0.051,0.035,-0.028,0.048
wmt14_ruen,-0.046,0.01,-0.021,0.404,-0.103,0.0
wmt19_deen,-0.046,0.0,-0.095,0.0,-0.154,0.0
wmt19_fien,0.074,0.0,0.121,0.0,-0.006,0.647
wmt19_lten,0.022,0.178,0.11,0.0,-0.022,0.108
wmt19_ruen,0.049,0.0,0.12,0.0,-0.034,0.004



=== llama ===


Metric,MetricX XXL,MetricX XXL,XComet XXL,XComet XXL,Comet WMT22,Comet WMT22
Unnamed: 0_level_1,slope,p-val,slope,p-val,slope,p-val
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
wmt14_csen,0.026,0.071,-0.033,0.092,-0.032,0.019
wmt14_deen,-0.001,0.977,-0.073,0.004,-0.027,0.176
wmt14_fren,0.01,0.534,-0.077,0.002,-0.043,0.002
wmt14_ruen,-0.028,0.13,-0.029,0.272,-0.1,0.0
wmt19_deen,-0.057,0.0,-0.118,0.0,-0.176,0.0
wmt19_fien,0.054,0.001,0.067,0.005,-0.016,0.288
wmt19_lten,0.03,0.085,0.093,0.0,-0.018,0.223
wmt19_ruen,0.044,0.001,0.09,0.0,-0.03,0.011
