# Detailed tables

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from collections import defaultdict
from utils import extract_and_prepare_data, detrend_ue, detrend_ue_w_quality
from pathlib import Path
import pathlib
import argparse

# Flag kept from the original cell; it is not referenced by the visible code.
normalize = True

# Full uncertainty-estimation method name -> short label used in tables.
methods_dict = {
    "MaximumSequenceProbability": "MSP",
    "Perplexity": "PPL",
    "MeanTokenEntropy": "MTE",
    "MonteCarloSequenceEntropy": "MCSE",
    "MonteCarloNormalizedSequenceEntropy": "MCNSE",
    "LexicalSimilarity_rougeL": "LSRL",
}

# Model key passed to the detrending helper -> display name.
MODELS = {
    "llama": "Llama 3.1 8B",
    "gemma": "Gemma 2 9B",
    "eurollm": "EuroLLM 9B",
}

# Every quality metric is evaluated on the same eight WMT language pairs;
# the list literal is re-evaluated per key so each metric owns its own list.
DATASETS = {
    metric_key: [
        "wmt14_csen",
        "wmt14_deen",
        "wmt14_ruen",
        "wmt14_fren",
        "wmt19_deen",
        "wmt19_fien",
        "wmt19_lten",
        "wmt19_ruen",
    ]
    for metric_key in (
        "metricx-metricx-24-hybrid-xxl-v2p6",
        "XComet-XCOMET-XXL",
        "Comet-wmt22-comet-da",
        "bleu_proper",
    )
}

# Metrics that actually get a table, with their display names.
METRICS = {
    "metricx-metricx-24-hybrid-xxl-v2p6": "MetricX XXL",
    "XComet-XCOMET-XXL": "XComet XXL",
    "Comet-wmt22-comet-da": "Comet WMT22",
    # 'bleu_proper': 'BLEU',  (disabled)
}

# Output directories must exist before any table or chart is written.
pathlib.Path("tables").mkdir(parents=True, exist_ok=True)
pathlib.Path("charts").mkdir(parents=True, exist_ok=True)

def get_header(caption):
    """Return the opening LaTeX of the detailed results table.

    The header opens a ``table*`` float, sets the caption, starts a
    nine-column ``tabular`` (one label column plus eight score columns) and
    adds the WMT14/WMT19 group row with its two ``\\cmidrule`` lines.

    Args:
        caption: Text inserted verbatim into ``\\caption{...}``.

    Returns:
        The header as a string terminated by a newline.
    """
    # Raw strings keep LaTeX backslashes literal; the previous "\c", "\m"
    # spellings relied on invalid escape sequences, which Python deprecates.
    lines = [
        r"\begin{table*}",
        r"\footnotesize",
        r"\centering",
        rf"\caption{{{caption}}}",
        r"\begin{tabular}{lcccccccc}",
        r"\toprule",
        r"&\multicolumn{4}{c}{\textbf{WMT14}}&\multicolumn{4}{c}{\textbf{WMT19}}\\",
        r"\cmidrule(lr){2-5}",
        r"\cmidrule(lr){6-9}",
    ]
    return "\n".join(lines) + "\n"

def footer():
    """Return the closing LaTeX of the table: a rule, then the environment ends."""
    # Raw strings avoid the invalid "\m" / "\e" escape sequences used before.
    lines = [
        r"\midrule",
        r"\end{tabular}",
        r"\end{table*}",
    ]
    return "\n".join(lines) + "\n"

def colname(dataset):
    """Turn a dataset id such as ``wmt14_csen`` into a column title ``Cs-En``.

    When the id contains an underscore, only the second ``_``-separated part
    is used. The first two characters and the remainder are capitalised
    separately and joined with a hyphen.
    """
    pair = dataset.split('_')[1] if '_' in dataset else dataset
    source, target = pair[:2], pair[2:]
    return f"{source.capitalize()}-{target.capitalize()}"


def _format_cell(score, best, second):
    """Format one score with two decimals: bold if best, underlined if second-best."""
    text = f"{score:.2f}"
    if np.isclose(score, best):
        return f"\\textbf{{{text}}}"
    if np.isclose(score, second):
        return f"\\underline{{{text}}}"
    return text


def main(type_):
    """Write one detailed LaTeX table per metric in ``METRICS``.

    For each metric and each model in ``MODELS`` the scores returned by
    ``detrend_ue`` are laid out as two rows per method ("Raw" and "Detr"),
    one column per dataset. Within every model block, the highest value of a
    column is bolded and the second-highest underlined (raw and detrended
    scores are ranked together).

    Args:
        type_: Unused; kept so existing callers keep working.

    Side effects:
        Writes ``tables/{metric}_detailed_results_per_dataset_ranking_poly_3rd.tex``.
    """
    ue_methods = list(methods_dict.values())

    for metric, metric_name in METRICS.items():
        datasets = DATASETS[metric]
        n_cols = len(datasets)

        caption = f"Detailed PRR scores (raw and detrended) for all methods. Metric: {metric_name}."
        column_titles = "&".join(colname(dataset).replace('_', '\\_') for dataset in datasets)
        chunks = [get_header(caption), "&" + column_titles + "\\\\\n"]

        for model_type in ['base']:
            for model, model_name in MODELS.items():
                model_title = model_name if model_type == 'base' else f"{model_name} Instruct"
                chunks.append("\\midrule\n")
                # Span exactly the score columns instead of a hard-coded 8.
                chunks.append(f"& \\multicolumn{{{n_cols}}}{{c}}{{{model_title}}}\\\\\n")
                chunks.append("\\midrule\n")

                # ue_scores is indexed as ue_scores[f"{method}_raw" | f"{method}_detr"][column].
                ue_scores, _, _ = detrend_ue(datasets, model, model_type, [metric], ue_methods, methods_dict, return_unprocessed=True)

                # Best and second-best value per dataset column.
                best_map = {}
                second_map = {}
                for col_idx in range(n_cols):
                    ranked = sorted(
                        (
                            ue_scores[f"{method_short}_{kind}"][col_idx]
                            for method_short in ue_methods
                            for kind in ("raw", "detr")
                        ),
                        reverse=True,
                    )
                    best_map[col_idx] = ranked[0]
                    second_map[col_idx] = ranked[1]

                for method_short in ue_methods:
                    raw_row = [f"Raw {method_short}"]
                    detr_row = [f"Detr {method_short}"]
                    score_pairs = zip(ue_scores[f"{method_short}_raw"], ue_scores[f"{method_short}_detr"])

                    for col_idx, (raw, detr) in enumerate(score_pairs):
                        best, second = best_map[col_idx], second_map[col_idx]
                        raw_row.append(_format_cell(raw, best, second))
                        detr_row.append(_format_cell(detr, best, second))

                    chunks.append(" & ".join(raw_row) + " \\\\\n")
                    chunks.append(" & ".join(detr_row) + " \\\\\n")
                    chunks.append("\\midrule\n")

        chunks.append(footer())
        name = f'tables/{metric}_detailed_results_per_dataset_ranking_poly_3rd.tex'
        with open(name, 'w', encoding='utf-8') as f:
            f.write("".join(chunks))


if __name__ == "__main__":
    type_ = 'all'
    main(type_)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from collections import defaultdict
from utils import extract_and_prepare_data, detrend_ue, detrend_ue_w_quality
from pathlib import Path
import pathlib
import argparse

# Flag kept from the original cell; it is not referenced by the visible code.
normalize = True

# Full uncertainty-estimation method name -> short label used in tables.
methods_dict = {
    "MaximumSequenceProbability": "MSP",
    "Perplexity": "PPL",
    "MeanTokenEntropy": "MTE",
    "MonteCarloSequenceEntropy": "MCSE",
    "MonteCarloNormalizedSequenceEntropy": "MCNSE",
    "LexicalSimilarity_rougeL": "LSRL",
}

# Model key passed to the detrending helper -> display name.
MODELS = {
    "llama": "Llama 3.1 8B",
    "gemma": "Gemma 2 9B",
}

# The translation metrics share the same eight WMT language pairs; the list
# literal is re-evaluated per key so each metric owns its own list.
DATASETS = {
    metric_key: [
        "wmt14_csen",
        "wmt14_deen",
        "wmt14_ruen",
        "wmt14_fren",
        "wmt19_deen",
        "wmt19_fien",
        "wmt19_lten",
        "wmt19_ruen",
    ]
    for metric_key in (
        "metricx-metricx-24-hybrid-xxl-v2p6",
        "XComet-XCOMET-XXL",
        "Comet-wmt22-comet-da",
        "bleu_proper",
    )
}
# Single-dataset metrics, appended in the original key order.
DATASETS["Accuracy"] = ["gsm8k"]
DATASETS["AlignScoreInputOutput"] = ["xsum"]

# Metrics that actually get a table in this cell, with their display names.
METRICS = {
    "Accuracy": "Accuracy",
    "AlignScoreInputOutput": "Align Score",
    # 'Comet-wmt22-comet-da': 'Comet WMT22',  (disabled)
    # 'bleu_proper': 'BLEU',  (disabled)
}

# Output directories must exist before any table or chart is written.
pathlib.Path("tables").mkdir(parents=True, exist_ok=True)
pathlib.Path("charts").mkdir(parents=True, exist_ok=True)

def get_header(caption):
    """Return the opening LaTeX of the single-score-column results table.

    Opens a ``table*`` float, sets the caption and starts a two-column
    ``tabular`` (label column plus one score column) with a top rule.

    Args:
        caption: Text inserted verbatim into ``\\caption{...}``.

    Returns:
        The header as a string terminated by a newline.
    """
    # Raw strings keep LaTeX backslashes literal; "\caption" in a normal
    # string relied on an invalid escape sequence, which Python deprecates.
    lines = [
        r"\begin{table*}",
        r"\footnotesize",
        r"\centering",
        rf"\caption{{{caption}}}",
        r"\begin{tabular}{lc}",
        r"\toprule",
    ]
    return "\n".join(lines) + "\n"

def footer():
    """Return the closing LaTeX of the table: a rule, then the environment ends."""
    # Raw strings avoid the invalid "\m" / "\e" escape sequences used before.
    closing = (r"\midrule", r"\end{tabular}", r"\end{table*}")
    return "".join(line + "\n" for line in closing)

# Datasets that are not language pairs and therefore must not be split into
# "Xx-Yy"; mapped to the spelling shown in the table header.
_NON_PAIR_DATASET_TITLES = {
    'gsm8k': 'GSM8K',
    'xsum': 'XSum',
}


def colname(dataset):
    """Return the column title for a dataset id.

    Language-pair ids such as ``wmt14_csen`` (or a bare ``csen``) become
    ``Cs-En``: when an underscore is present only the second ``_``-separated
    part is used, then the first two characters and the remainder are
    capitalised and joined with a hyphen.

    The non-translation datasets used in this cell (``gsm8k``, ``xsum``) were
    previously mangled into ``Gs-M8k`` / ``Xs-Um``; they now map to a fixed
    display name instead.
    """
    if dataset in _NON_PAIR_DATASET_TITLES:
        return _NON_PAIR_DATASET_TITLES[dataset]

    if '_' in dataset:
        dataset = dataset.split('_')[1]

    return dataset[:2].capitalize() + '-' + dataset[2:].capitalize()


def _format_cell(score, best, second):
    """Format one score with two decimals: bold if best, underlined if second-best."""
    text = f"{score:.2f}"
    if np.isclose(score, best):
        return f"\\textbf{{{text}}}"
    if np.isclose(score, second):
        return f"\\underline{{{text}}}"
    return text


def main(type_):
    """Write one detailed LaTeX table per metric in ``METRICS``.

    For each metric and each model in ``MODELS`` the scores returned by
    ``detrend_ue_w_quality`` are laid out as two rows per method ("Raw" and
    "Detr"), one column per dataset. Within every model block, the highest
    value of a column is bolded and the second-highest underlined (raw and
    detrended scores are ranked together).

    Args:
        type_: Unused; kept so existing callers keep working.

    Side effects:
        Writes
        ``tables/{metric}_detailed_results_per_dataset_ranking_w_quality_poly_3d.tex``.
    """
    ue_methods = list(methods_dict.values())

    for metric, metric_name in METRICS.items():
        datasets = DATASETS[metric]
        n_cols = len(datasets)

        caption = f"Detailed PRR scores (raw and detrended) for all methods. Metric: {metric_name}."
        column_titles = "&".join(colname(dataset).replace('_', '\\_') for dataset in datasets)
        chunks = [get_header(caption), "&" + column_titles + "\\\\\n"]

        for model_type in ['base']:
            for model, model_name in MODELS.items():
                model_title = model_name if model_type == 'base' else f"{model_name} Instruct"
                chunks.append("\\midrule\n")
                # The model title must span only the score columns that exist;
                # a fixed \multicolumn{8} overflows the narrow tabular that
                # this cell's header opens and makes LaTeX fail.
                chunks.append(f"& \\multicolumn{{{n_cols}}}{{c}}{{{model_title}}}\\\\\n")
                chunks.append("\\midrule\n")

                # Only the first of the five return values is used; it is indexed
                # as ue_scores[f"{method}_raw" | f"{method}_detr"][column].
                ue_scores, _, _, _, _ = detrend_ue_w_quality(datasets, model, model_type, [metric], ue_methods, methods_dict, return_unprocessed=True)

                # Best and second-best value per dataset column.
                best_map = {}
                second_map = {}
                for col_idx in range(n_cols):
                    ranked = sorted(
                        (
                            ue_scores[f"{method_short}_{kind}"][col_idx]
                            for method_short in ue_methods
                            for kind in ("raw", "detr")
                        ),
                        reverse=True,
                    )
                    best_map[col_idx] = ranked[0]
                    second_map[col_idx] = ranked[1]

                for method_short in ue_methods:
                    raw_row = [f"Raw {method_short}"]
                    detr_row = [f"Detr {method_short}"]
                    score_pairs = zip(ue_scores[f"{method_short}_raw"], ue_scores[f"{method_short}_detr"])

                    for col_idx, (raw, detr) in enumerate(score_pairs):
                        best, second = best_map[col_idx], second_map[col_idx]
                        raw_row.append(_format_cell(raw, best, second))
                        detr_row.append(_format_cell(detr, best, second))

                    chunks.append(" & ".join(raw_row) + " \\\\\n")
                    chunks.append(" & ".join(detr_row) + " \\\\\n")
                    chunks.append("\\midrule\n")

        chunks.append(footer())
        name = f'tables/{metric}_detailed_results_per_dataset_ranking_w_quality_poly_3d.tex'
        with open(name, 'w', encoding='utf-8') as f:
            f.write("".join(chunks))


if __name__ == "__main__":
    type_ = 'all'
    main(type_)

# Best scores

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from collections import defaultdict
from utils import extract_and_prepare_data, detrend_ue
from pathlib import Path
import pathlib
import argparse

# Flag kept from the original cell; it is not referenced by the visible code.
normalize = True

# Full uncertainty-estimation method name -> short label used in tables.
methods_dict = {
    "MaximumSequenceProbability": "MSP",
    "Perplexity": "PPL",
    "MeanTokenEntropy": "MTE",
    "MonteCarloSequenceEntropy": "MCSE",
    "MonteCarloNormalizedSequenceEntropy": "MCNSE",
    "LexicalSimilarity_rougeL": "LSRL",
}

# Model key passed to the detrending helper -> display name.
MODELS = {
    "llama": "Llama 3.1 8B",
    "gemma": "Gemma 2 9B",
    "eurollm": "EuroLLM 9B",
}

# Every quality metric is evaluated on the same eight WMT language pairs;
# the list literal is re-evaluated per key so each metric owns its own list.
DATASETS = {
    metric_key: [
        "wmt14_csen",
        "wmt14_deen",
        "wmt14_ruen",
        "wmt14_fren",
        "wmt19_deen",
        "wmt19_fien",
        "wmt19_lten",
        "wmt19_ruen",
    ]
    for metric_key in (
        "metricx-metricx-24-hybrid-xxl-v2p6",
        "XComet-XCOMET-XXL",
        "Comet-wmt22-comet-da",
        "bleu_proper",
    )
}

# Metrics that appear in the summary table, with their display names.
METRICS = {
    "metricx-metricx-24-hybrid-xxl-v2p6": "MetricX XXL",
    "XComet-XCOMET-XXL": "XComet XXL",
    "Comet-wmt22-comet-da": "Comet WMT22",
    # 'bleu_proper': 'BLEU',  (disabled)
}

# Output directories must exist before any table or chart is written.
pathlib.Path("tables").mkdir(parents=True, exist_ok=True)
pathlib.Path("charts").mkdir(parents=True, exist_ok=True)

def get_header(caption):
    """Return the opening LaTeX of an eight-score-column table.

    Opens a ``table*`` float, sets the caption, starts a nine-column
    ``tabular`` and adds the WMT14/WMT19 group row with its two
    ``\\cmidrule`` lines.

    Args:
        caption: Text inserted verbatim into ``\\caption{...}``.

    Returns:
        The header as a string terminated by a newline.
    """
    # Raw strings keep LaTeX backslashes literal; the previous "\c", "\m"
    # spellings relied on invalid escape sequences, which Python deprecates.
    lines = [
        r"\begin{table*}",
        r"\footnotesize",
        rf"\caption{{{caption}}}",
        r"\begin{tabular}{lcccccccc}",
        r"&\multicolumn{4}{c}{\textbf{WMT14}}&\multicolumn{4}{c}{\textbf{WMT19}}\\",
        r"\cmidrule(lr){2-5}",
        r"\cmidrule(lr){6-9}",
    ]
    return "\n".join(lines) + "\n"

def footer():
    """Return the closing LaTeX of the table: a rule, then the environment ends."""
    # Raw strings avoid the invalid "\m" / "\e" escape sequences used before.
    return "\n".join([r"\midrule", r"\end{tabular}", r"\end{table*}", ""])

def colname(dataset):
    """Turn a dataset id such as ``wmt14_csen`` into a column title ``Cs-En``.

    When the id contains an underscore, only the second ``_``-separated part
    is used. The first two characters and the remainder are capitalised
    separately and joined with a hyphen.
    """
    pair = dataset.split('_')[1] if '_' in dataset else dataset
    halves = (pair[:2], pair[2:])
    return '-'.join(half.capitalize() for half in halves)
def format_score_pair(raw, detr, threshold=0.00):
    """Format a (raw, detrended) score pair with two decimals each.

    The detrended text gets a trailing ``$\\uparrow$`` when the unrounded
    difference ``detr - raw`` is strictly greater than ``threshold``.

    Returns:
        A ``(raw_text, detr_text)`` tuple of strings.
    """
    raw_text = f"{raw:.2f}"
    detr_text = f"{detr:.2f}"
    if detr - raw > threshold:
        detr_text += "$\\uparrow$"
    return raw_text, detr_text

def main(type_):
    """Write the summary table of best raw / best detrended PRR scores.

    The table has one block per metric in ``METRICS`` and, inside each block,
    one row per model in ``MODELS``. For every dataset column the row shows
    the maximum raw score and the maximum detrended score over all methods,
    formatted by ``format_score_pair``.

    Args:
        type_: Unused; kept so existing callers keep working.

    Side effects:
        Writes ``tables/final_rowwise_metric_blocks_poly_3rd.tex``.
    """
    caption = "Best raw and detrended PRR scores across all metrics and models."
    total_cols = max(len(v) for v in DATASETS.values())
    col_spec = "l" + "cc" * total_cols

    chunks = [
        "\\begin{table*}[ht]\n\\centering\n\\small\n",
        f"\\caption{{{caption}}}\n",
        f"\\begin{{tabular}}{{{col_spec}}}\n\\toprule\n",
    ]

    # Dataset header (generic, since all blocks share the same layout): it is
    # taken from the first metric listed in METRICS.
    header_datasets = DATASETS[next(iter(METRICS))]
    dataset_line = "Model" + "".join(
        f" & \\multicolumn{{2}}{{c}}{{{colname(d)}}}" for d in header_datasets
    )
    subheader_line = " " + " & Raw & Detr" * len(header_datasets)
    chunks.append(dataset_line + " \\\\\n")
    chunks.append(subheader_line + " \\\\\n\\midrule\n")

    ue_methods = list(methods_dict.values())
    metric_blocks = []
    for metric, metric_name in METRICS.items():
        datasets = DATASETS[metric]

        # Section header for the metric, spanning the label and all score columns.
        n_cols = 1 + 2 * len(datasets)
        block = [f"\\multicolumn{{{n_cols}}}{{c}}{{\\textbf{{{metric_name}}}}} \\\\\n"]

        for model_key, model_name in MODELS.items():
            # ue_scores is indexed as ue_scores[f"{method}_raw" | f"{method}_detr"][column].
            ue_scores, _, _ = detrend_ue(
                datasets, model_key, "base", [metric], ue_methods, methods_dict, return_unprocessed=True
            )

            row = [model_name]
            for i in range(len(datasets)):
                best_raw = -float("inf")
                best_detr = -float("inf")
                for method in ue_methods:
                    best_raw = max(best_raw, ue_scores[f"{method}_raw"][i])
                    best_detr = max(best_detr, ue_scores[f"{method}_detr"][i])
                row.extend(format_score_pair(best_raw, best_detr))

            block.append(" & ".join(row) + " \\\\\n")

        metric_blocks.append("".join(block))

    # Put \midrule only BETWEEN metric blocks: a rule after the last block
    # would sit directly on top of \bottomrule and render as a double line.
    chunks.append("\\midrule\n".join(metric_blocks))
    chunks.append("\\bottomrule\n\\end{tabular}\n\\end{table*}\n")

    with open("tables/final_rowwise_metric_blocks_poly_3rd.tex", "w", encoding="utf-8") as f:
        f.write("".join(chunks))


if __name__ == "__main__":
    type_ = 'all'
    main(type_)

# Avg Improvements

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from collections import defaultdict
from utils import extract_and_prepare_data, detrend_ue, detrend_ue_w_quality, detrend_ue_w_quality_only
from pathlib import Path
import pathlib
import argparse
from sklearn.linear_model import LinearRegression

# Flag kept from the original cell; it is not referenced by the visible code.
normalize = True

# Full uncertainty-estimation method name -> short label used in the CSV.
methods_dict = {
    "MaximumSequenceProbability": "MSP",
    "Perplexity": "PPL",
    "MeanTokenEntropy": "MTE",
    "MonteCarloSequenceEntropy": "MCSE",
    "MonteCarloNormalizedSequenceEntropy": "MCNSE",
    "LexicalSimilarity_rougeL": "LSRL",
}

# Model key passed to the detrending helper -> display name.
MODELS = {
    "llama": "Llama 3.1 8B",
    "gemma": "Gemma 2 9B",
    "eurollm": "EuroLLM 9B",
}

# The translation metrics share the same eight WMT language pairs; the list
# literal is re-evaluated per key so each metric owns its own list.
DATASETS = {
    metric_key: [
        "wmt14_csen",
        "wmt14_deen",
        "wmt14_ruen",
        "wmt14_fren",
        "wmt19_deen",
        "wmt19_fien",
        "wmt19_lten",
        "wmt19_ruen",
    ]
    for metric_key in (
        "metricx-metricx-24-hybrid-xxl-v2p6",
        "XComet-XCOMET-XXL",
        "Comet-wmt22-comet-da",
        "bleu_proper",
    )
}
# Single-dataset metrics, appended in the original key order.
DATASETS["AlignScoreInputOutput"] = ["xsum"]
DATASETS["Accuracy"] = ["gsm8k"]
# 'Rouge_rougeL' : ['xsum']  (disabled)

# Metrics exported by this cell (translation metrics only), with display names.
METRICS = {
    # 'Rouge_rougeL' : 'Rouge L',  (disabled)
    # 'AlignScoreInputOutput': 'Align Score',  (disabled)
    "XComet-XCOMET-XXL": "XComet XXL",
    # 'Accuracy':'Acc',  (disabled)
    "metricx-metricx-24-hybrid-xxl-v2p6": "MetricX-XXL",
    "Comet-wmt22-comet-da": "Comet WMT22",
    # 'bleu_proper': 'BLEU',  (disabled)
}

# Output directories must exist before any table or chart is written.
pathlib.Path("tables").mkdir(parents=True, exist_ok=True)
pathlib.Path("charts").mkdir(parents=True, exist_ok=True)

def get_header(caption):
    """Return the opening LaTeX of an eight-score-column table.

    Opens a ``table*`` float, sets the caption, starts a nine-column
    ``tabular`` with a top rule and adds the WMT14/WMT19 group row with its
    two ``\\cmidrule`` lines.

    Args:
        caption: Text inserted verbatim into ``\\caption{...}``.

    Returns:
        The header as a string terminated by a newline.
    """
    # \toprule is a tabular rule, so it has to come AFTER \begin{tabular};
    # it used to be emitted before \caption, outside any tabular.
    # Raw strings also replace the invalid "\c" / "\m" escape sequences.
    lines = [
        r"\begin{table*}",
        r"\footnotesize",
        r"\centering",
        rf"\caption{{{caption}}}",
        r"\begin{tabular}{lcccccccc}",
        r"\toprule",
        r"&\multicolumn{4}{c}{\textbf{WMT14}}&\multicolumn{4}{c}{\textbf{WMT19}}\\",
        r"\cmidrule(lr){2-5}",
        r"\cmidrule(lr){6-9}",
    ]
    return "\n".join(lines) + "\n"

def footer():
    """Return the closing LaTeX of the table: a rule, then the environment ends."""
    # Raw strings avoid the invalid "\m" / "\e" escape sequences used before.
    lines = [
        r"\midrule",
        r"\end{tabular}",
        r"\end{table*}",
    ]
    return "\n".join(lines) + "\n"

def colname(dataset):
    """Turn a dataset id such as ``wmt14_csen`` into a column title ``Cs-En``.

    When the id contains an underscore, only the second ``_``-separated part
    is used. The first two characters and the remainder are capitalised
    separately and joined with a hyphen.
    """
    pair = dataset.split('_')[1] if '_' in dataset else dataset
    source, target = pair[:2], pair[2:]
    return f"{source.capitalize()}-{target.capitalize()}"

def plot_raw_vs_detrended(lengths, raw_scores, detr_scores, method_name, save_path=None):
    """Scatter raw and detrended scores against length, each with a linear fit.

    A separate ``LinearRegression`` is fitted to the raw and to the detrended
    scores; both fitted lines are drawn dashed and their slopes are shown in
    the legend.

    Args:
        lengths: One length per sample; reshaped to a single feature column.
        raw_scores: Raw scores, one per sample.
        detr_scores: Detrended scores, one per sample.
        method_name: Name shown in the plot title.
        save_path: If truthy, the figure is saved there and then closed;
            otherwise it is shown with ``plt.show()``.
    """
    lengths = np.array(lengths).reshape(-1, 1)
    raw_scores = np.array(raw_scores)
    detr_scores = np.array(detr_scores)

    # Fit linear regression models
    reg_raw = LinearRegression().fit(lengths, raw_scores)
    reg_detr = LinearRegression().fit(lengths, detr_scores)

    # Predict
    raw_fit = reg_raw.predict(lengths)
    detr_fit = reg_detr.predict(lengths)

    fig = plt.figure(figsize=(8, 6))
    plt.scatter(lengths, raw_scores, label='Raw Scores', alpha=0.6, color='blue')
    plt.plot(lengths, raw_fit, color='blue', linestyle='--', label=f'Raw Fit (slope={reg_raw.coef_[0]:.3f})')

    plt.scatter(lengths, detr_scores, label='Detrended Scores', alpha=0.6, color='green')
    plt.plot(lengths, detr_fit, color='green', linestyle='--', label=f'Detrended Fit (slope={reg_detr.coef_[0]:.3f})')

    plt.title(f'Length Effect for {method_name}')
    plt.xlabel('Sequence Length')
    plt.ylabel('Uncertainty Score')
    plt.legend()
    plt.grid(True)

    if save_path:
        plt.savefig(save_path, bbox_inches='tight')
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated calls (one per method/dataset) do not pile up figures.
        plt.close(fig)
    else:
        plt.show()


def main(type_):
    """Export raw and detrended scores for every metric/model/dataset/method to CSV.

    For each metric in ``METRICS`` and each model in ``MODELS`` the scores
    returned by ``detrend_ue`` are flattened into long format: one row per
    (metric, dataset, model, method, score type), where the score type is
    ``"Raw"`` or ``"Detrended"``.

    Args:
        type_: Unused; kept so existing callers keep working.

    Side effects:
        Writes ``ue_scores_results_wmt3_poly_3rd.csv`` in the working
        directory and prints its path.
    """
    results = []
    ue_methods = list(methods_dict.values())

    for metric, metric_name in METRICS.items():
        datasets = DATASETS[metric]

        for model_type in ['base']:
            for model, model_name in MODELS.items():
                model_title = model_name if model_type == 'base' else f"{model_name} Instruct"

                # ue_scores is indexed as ue_scores[f"{method}_raw" | f"{method}_detr"][column].
                ue_scores, _, _ = detrend_ue(datasets, model, model_type, [metric], ue_methods, methods_dict, return_unprocessed=True)

                for col_idx, dataset in enumerate(datasets):
                    for method_short in ue_methods:
                        raw = ue_scores[f"{method_short}_raw"][col_idx]
                        detr = ue_scores[f"{method_short}_detr"][col_idx]
                        # Raw row first, then the detrended row for the same key.
                        for score_type, score in (("Raw", raw), ("Detrended", detr)):
                            results.append({
                                "Metric": metric_name,
                                "Dataset": dataset,
                                "Model": model_title,
                                "Method": method_short,
                                "ScoreType": score_type,
                                "Score": score,
                            })

    df = pd.DataFrame(results)
    csv_path = "ue_scores_results_wmt3_poly_3rd.csv"
    df.to_csv(csv_path, index=False)
    print(f"Results saved to {csv_path}")


if __name__ == "__main__":
    type_ = 'all'
    main(type_)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from collections import defaultdict
from utils import extract_and_prepare_data, detrend_ue, detrend_ue_w_quality, detrend_ue_w_quality_only
from pathlib import Path
import pathlib
import argparse
from sklearn.linear_model import LinearRegression

# Flag kept from the original cell; it is not referenced by the visible code.
normalize = True

# Full uncertainty-estimation method name -> short label used in the CSV.
methods_dict = {
    "MaximumSequenceProbability": "MSP",
    "Perplexity": "PPL",
    "MeanTokenEntropy": "MTE",
    "MonteCarloSequenceEntropy": "MCSE",
    "MonteCarloNormalizedSequenceEntropy": "MCNSE",
    "LexicalSimilarity_rougeL": "LSRL",
}

# Model key passed to the detrending helper -> display name.
MODELS = {
    "llama": "Llama 3.1 8B",
    "gemma": "Gemma 2 9B",
    # 'eurollm': 'EuroLLM 9B',  (disabled)
}

# The translation metrics share the same eight WMT language pairs; the list
# literal is re-evaluated per key so each metric owns its own list.
DATASETS = {
    metric_key: [
        "wmt14_csen",
        "wmt14_deen",
        "wmt14_ruen",
        "wmt14_fren",
        "wmt19_deen",
        "wmt19_fien",
        "wmt19_lten",
        "wmt19_ruen",
    ]
    for metric_key in (
        "metricx-metricx-24-hybrid-xxl-v2p6",
        "XComet-XCOMET-XXL",
        "Comet-wmt22-comet-da",
        "bleu_proper",
    )
}
# Single-dataset metrics, appended in the original key order.
DATASETS["AlignScoreInputOutput"] = ["xsum"]
DATASETS["Accuracy"] = ["gsm8k"]
# 'Rouge_rougeL' : ['xsum']  (disabled)

# Metrics exported by this cell (non-translation tasks only), with display names.
METRICS = {
    # 'Rouge_rougeL' : 'Rouge L',  (disabled)
    "AlignScoreInputOutput": "Align Score",
    # 'XComet-XCOMET-XXL': 'XComet XXL',  (disabled)
    "Accuracy": "Acc",
    # 'metricx-metricx-24-hybrid-xxl-v2p6': 'MetricX-XXL',  (disabled)
    # 'Comet-wmt22-comet-da' :'Comet WMT22',  (disabled)
    # 'bleu_proper': 'BLEU',  (disabled)
}

# Output directories must exist before any table or chart is written.
pathlib.Path("tables").mkdir(parents=True, exist_ok=True)
pathlib.Path("charts").mkdir(parents=True, exist_ok=True)

def get_header(caption):
    """Return the opening LaTeX of an eight-score-column table.

    Opens a ``table*`` float, sets the caption, starts a nine-column
    ``tabular`` with a top rule and adds the WMT14/WMT19 group row with its
    two ``\\cmidrule`` lines.

    Args:
        caption: Text inserted verbatim into ``\\caption{...}``.

    Returns:
        The header as a string terminated by a newline.
    """
    # \toprule is a tabular rule, so it has to come AFTER \begin{tabular};
    # it used to be emitted before \caption, outside any tabular.
    # Raw strings also replace the invalid "\c" / "\m" escape sequences.
    header_lines = (
        r"\begin{table*}",
        r"\footnotesize",
        r"\centering",
        rf"\caption{{{caption}}}",
        r"\begin{tabular}{lcccccccc}",
        r"\toprule",
        r"&\multicolumn{4}{c}{\textbf{WMT14}}&\multicolumn{4}{c}{\textbf{WMT19}}\\",
        r"\cmidrule(lr){2-5}",
        r"\cmidrule(lr){6-9}",
    )
    return "".join(line + "\n" for line in header_lines)

def footer():
    """Return the closing LaTeX of the table: a rule, then the environment ends."""
    # Raw strings avoid the invalid "\m" / "\e" escape sequences used before.
    closing = (r"\midrule", r"\end{tabular}", r"\end{table*}")
    return "".join(line + "\n" for line in closing)

def colname(dataset):
    """Turn a dataset id such as ``wmt14_csen`` into a column title ``Cs-En``.

    When the id contains an underscore, only the second ``_``-separated part
    is used. The first two characters and the remainder are capitalised
    separately and joined with a hyphen.
    """
    pair = dataset.split('_')[1] if '_' in dataset else dataset
    halves = (pair[:2], pair[2:])
    return '-'.join(half.capitalize() for half in halves)

def plot_raw_vs_detrended(lengths, raw_scores, detr_scores, method_name, save_path=None):
    """Scatter raw and detrended scores against length, each with a linear fit.

    A separate ``LinearRegression`` is fitted to the raw and to the detrended
    scores; both fitted lines are drawn dashed and their slopes are shown in
    the legend.

    Args:
        lengths: One length per sample; reshaped to a single feature column.
        raw_scores: Raw scores, one per sample.
        detr_scores: Detrended scores, one per sample.
        method_name: Name shown in the plot title.
        save_path: If truthy, the figure is saved there and then closed;
            otherwise it is shown with ``plt.show()``.
    """
    lengths = np.array(lengths).reshape(-1, 1)
    raw_scores = np.array(raw_scores)
    detr_scores = np.array(detr_scores)

    # Fit linear regression models
    reg_raw = LinearRegression().fit(lengths, raw_scores)
    reg_detr = LinearRegression().fit(lengths, detr_scores)

    # Predict
    raw_fit = reg_raw.predict(lengths)
    detr_fit = reg_detr.predict(lengths)

    fig = plt.figure(figsize=(8, 6))
    plt.scatter(lengths, raw_scores, label='Raw Scores', alpha=0.6, color='blue')
    plt.plot(lengths, raw_fit, color='blue', linestyle='--', label=f'Raw Fit (slope={reg_raw.coef_[0]:.3f})')

    plt.scatter(lengths, detr_scores, label='Detrended Scores', alpha=0.6, color='green')
    plt.plot(lengths, detr_fit, color='green', linestyle='--', label=f'Detrended Fit (slope={reg_detr.coef_[0]:.3f})')

    plt.title(f'Length Effect for {method_name}')
    plt.xlabel('Sequence Length')
    plt.ylabel('Uncertainty Score')
    plt.legend()
    plt.grid(True)

    if save_path:
        plt.savefig(save_path, bbox_inches='tight')
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated calls (one per method/dataset) do not pile up figures.
        plt.close(fig)
    else:
        plt.show()


def main(type_):
    """Export raw and detrended scores for every metric/model/dataset/method to CSV.

    For each metric in ``METRICS`` and each model in ``MODELS`` the scores
    returned by ``detrend_ue_w_quality`` are flattened into long format: one
    row per (metric, dataset, model, method, score type), where the score
    type is ``"Raw"`` or ``"Detrended"``.

    Args:
        type_: Unused; kept so existing callers keep working.

    Side effects:
        Writes ``ue_scores_results_gsm8k_xsum_poly_3rd.csv`` in the working
        directory and prints its path.
    """
    results = []
    ue_methods = list(methods_dict.values())

    for metric, metric_name in METRICS.items():
        datasets = DATASETS[metric]

        for model_type in ['base']:
            for model, model_name in MODELS.items():
                model_title = model_name if model_type == 'base' else f"{model_name} Instruct"

                # Only the first of the five return values is used; it is indexed
                # as ue_scores[f"{method}_raw" | f"{method}_detr"][column].
                ue_scores, _, _, _, _ = detrend_ue_w_quality(datasets, model, model_type, [metric], ue_methods, methods_dict, return_unprocessed=True)

                for col_idx, dataset in enumerate(datasets):
                    for method_short in ue_methods:
                        raw = ue_scores[f"{method_short}_raw"][col_idx]
                        detr = ue_scores[f"{method_short}_detr"][col_idx]
                        # Raw row first, then the detrended row for the same key.
                        for score_type, score in (("Raw", raw), ("Detrended", detr)):
                            results.append({
                                "Metric": metric_name,
                                "Dataset": dataset,
                                "Model": model_title,
                                "Method": method_short,
                                "ScoreType": score_type,
                                "Score": score,
                            })

    df = pd.DataFrame(results)
    csv_path = "ue_scores_results_gsm8k_xsum_poly_3rd.csv"
    df.to_csv(csv_path, index=False)
    print(f"Results saved to {csv_path}")


if __name__ == "__main__":
    type_ = 'all'
    main(type_)

In [None]:
import pandas as pd

# Load the two long-format score files written by the export cells of this
# "Avg Improvements" section. Those cells save to "*_poly_3rd.csv"; reading
# the un-suffixed names would silently aggregate files from an older run.
df = pd.read_csv("ue_scores_results_wmt3_poly_3rd.csv")
df_2 = pd.read_csv("ue_scores_results_gsm8k_xsum_poly_3rd.csv")

df = pd.concat([df, df_2]).reset_index(drop=True)

# df = df[df['Metric']!='BLEU'].reset_index(drop=True)
# Improvement (Detrended - Raw) per Method, Metric, Dataset and Model.
pivot = df.pivot_table(index=["Method", "Metric", "Dataset", "Model"], columns="ScoreType", values="Score").reset_index()
pivot["Improvement"] = pivot["Detrended"] - pivot["Raw"]

# Mean improvement and its standard error (sem) per Method and Metric.
summary = pivot.groupby(["Method", "Metric"])["Improvement"].agg(['mean', 'sem']).reset_index()
summary["formatted"] = summary.apply(lambda x: f"{x['mean']:.2f} $\\pm$ {x['sem']:.2f}", axis=1)

# One row per method, one column per metric; missing combinations show a dash.
latex_table = summary.pivot(index="Method", columns="Metric", values="formatted").fillna("—")

method_order = ["MSP", "PPL", "MTE", "MCSE", "MCNSE", "LSRL"]

# Fix the row order before converting to LaTeX.
latex_table = latex_table.reindex(method_order)

metric_order = ["Comet WMT22", "XComet XXL", "MetricX-XXL", "Align Score", "Acc"]

# Fix the column order after the pivot.
latex_table = latex_table.reindex(columns=metric_order)

# escape=False keeps the "$\pm$" markup intact in the generated LaTeX.
latex_code = latex_table.to_latex(escape=False)
print(latex_code)
