In [None]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from lm_polygraph.ue_metrics.pred_rej_area import PredictionRejectionArea
from lm_polygraph.ue_metrics.ue_metric import (
    get_random_scores,
    normalize_metric,
)
from sacrebleu import CHRF, BLEU
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

from utils import extract_and_prepare_data

In [None]:
# Quality metrics handed to extract_and_prepare_data() by the analysis cell.
all_metrics = ["Comet"]

# Uncertainty-estimation methods to analyse. The value is the short label that
# the analysis cell uses to index the extracted UE values (e.g. "MTE");
# presumably the key is the full estimator name -- confirm against utils.
methods_dict = {"MeanTokenEntropy": "MTE"}

# Datasets iterated by the analysis cell. Every known dataset name is listed;
# uncomment an entry to include it in the run.
DATASETS = [
    "wmt14_csen",
    # "wmt14_deen",
    "wmt14_ruen",
    # "wmt14_fren",
    # "wmt19_deen",
    # "wmt19_fien",
    # "wmt19_lten",
    # "wmt19_ruen",
]

In [None]:
# For every dataset: fit a linear trend of the min-max-normalised UE score
# against the min-max-normalised generation length on the train split, subtract
# that trend from the test-split UE score, and plot both splits side by side.
#
# NOTE(review): `coefs` is initialised here but nothing in this cell writes to
# it; presumably the fitted slope `coef` was meant to be collected -- confirm.
coefs = defaultdict(list)

for dataset in DATASETS:
    (
        train_ue_values,
        test_ue_values,
        train_metric_values,
        test_metric_values,
        train_gen_lengths,
        gen_lengths,
    ) = extract_and_prepare_data(dataset, methods_dict, all_metrics, model='llama1b')

    # Trim length outliers: keep train samples strictly between the 5% and 95%
    # generation-length quantiles. The same mask is applied to the train UE
    # values below so the two arrays stay aligned.
    upper_q = np.quantile(train_gen_lengths, 0.95)
    lower_q = np.quantile(train_gen_lengths, 0.05)
    below_q_ids = (train_gen_lengths < upper_q) & (train_gen_lengths > lower_q)
    if not below_q_ids.any():
        # The comparison is strict, so a narrow length distribution can leave
        # nothing to fit on. Fail with an explicit message instead of the
        # opaque "0 sample(s)" error the scaler would raise.
        raise ValueError(
            f"{dataset}: no train samples lie strictly between the 5% and 95% "
            "generation-length quantiles; cannot fit the length trend"
        )
    train_gen_lengths = train_gen_lengths[below_q_ids]

    #for i, method in enumerate(ue_methods):
    method = 'MTE'

    # Scalers are fitted on the (trimmed) train split only and then applied to
    # the test split, so test values may fall outside [0, 1].
    # ravel() rather than squeeze(): squeeze() collapses a single-sample column
    # to a 0-d array, which breaks the `[:, np.newaxis]` indexing further down.
    gen_length_scaler = MinMaxScaler()
    train_gen_lengths_normalized = gen_length_scaler.fit_transform(train_gen_lengths[:, np.newaxis]).ravel()
    test_gen_lengths_normalized = gen_length_scaler.transform(gen_lengths[:, np.newaxis]).ravel()

    train_ue = train_ue_values[method][below_q_ids]
    test_ue = test_ue_values[method]

    scaler = MinMaxScaler()
    train_normalized_ue = scaler.fit_transform(train_ue[:, np.newaxis]).ravel()
    test_normalized_ue = scaler.transform(test_ue[:, np.newaxis]).ravel()

    # Linear trend of normalised UE as a function of normalised length.
    train_linreg = LinearRegression()
    train_linreg.fit(train_gen_lengths_normalized[:, np.newaxis], train_normalized_ue)
    coef = train_linreg.coef_[0]

    # Despite its name, `test_residuals` holds the trend *predicted* from the
    # test lengths; the detrended score (the actual residual) is `test_detr_ue`.
    test_residuals = train_linreg.predict(test_gen_lengths_normalized[:, np.newaxis])
    test_detr_ue = test_normalized_ue - test_residuals

    # Sort by length so the line plots run left to right.
    train_sort_order = np.argsort(train_gen_lengths_normalized)
    test_sort_order = np.argsort(test_gen_lengths_normalized)
    train_trend = train_linreg.predict(train_gen_lengths_normalized[:, np.newaxis])

    # The figure is created only after the computation succeeded, so a failing
    # dataset does not leave an empty figure behind.
    fig, axs = plt.subplots(1, 2, figsize=(8 * 2, 7))

    axs[0].plot(train_gen_lengths_normalized[train_sort_order], train_normalized_ue[train_sort_order], label='train')
    axs[0].plot(train_gen_lengths_normalized[train_sort_order], train_trend[train_sort_order], label=f'train_pred: {round(coef,3)}')
    axs[0].legend()
    axs[0].set_title(f'{method} train')
    axs[0].set_xlabel('Generated sequence length (Normalized)')
    axs[0].set_ylabel('UE value (Normalized)')

    axs[1].plot(test_gen_lengths_normalized[test_sort_order], test_normalized_ue[test_sort_order], label='test')
    axs[1].plot(test_gen_lengths_normalized[test_sort_order], test_detr_ue[test_sort_order], label='test_detr')
    axs[1].legend()
    axs[1].set_title(f'{method} test')
    axs[1].set_xlabel('Generated sequence length (Normalized)')
    axs[1].set_ylabel('UE value (Normalized)')

    fig.suptitle(dataset)

    plt.tight_layout()
    plt.show()