In [None]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from lm_polygraph.ue_metrics.pred_rej_area import PredictionRejectionArea
from lm_polygraph.ue_metrics.ue_metric import (
    get_random_scores,
    normalize_metric,
)
from sacrebleu import CHRF, BLEU
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

from utils import extract_and_prepare_data

In [None]:
# Full uncertainty-estimation method name -> short label.
# The mapping is passed to `extract_and_prepare_data` together with `all_metrics`.
methods_dict = {
    "MaximumSequenceProbability": "MSP",
    "Perplexity": "PPL",
    "MeanTokenEntropy": "MTE",
    "MeanPointwiseMutualInformation": "MPMI",
    "MeanConditionalPointwiseMutualInformation": "MCPMI",
    "PTrue": "PTrue",
    "PTrueSampling": "PTrueS",
    "MonteCarloSequenceEntropy": "MCSE",
    "MonteCarloNormalizedSequenceEntropy": "MCNSE",
}

# Dataset identifiers, named "<corpus>_<language pair>"; iterated in this order.
DATASETS = [
    *(f"wmt14_{pair}" for pair in ("csen", "deen", "ruen", "fren")),
    *(f"wmt19_{pair}" for pair in ("deen", "fien", "lten", "ruen")),
]

# Quality metrics analysed per dataset (one subplot per metric).
all_metrics = ["Comet", "BLEU"]

In [None]:
# For every dataset: fit a linear regression of each min-max normalized quality
# metric on the min-max normalized generated sequence length (training split
# only) and plot the data together with the fitted line. The slope of the fit
# is shown in the legend.
# NOTE(review): `coefs` is initialized here but never filled by this cell.
coefs = defaultdict(list)

for dataset in DATASETS:
    (
        train_ue_values,
        test_ue_values,
        train_metric_values,
        test_metric_values,
        train_gen_lengths,
        gen_lengths,
    ) = extract_and_prepare_data(dataset, methods_dict, all_metrics)

    # Keep only samples whose generated length lies strictly between the 5%
    # and 95% quantiles (presumably to stop length outliers from dominating
    # the fit -- confirm). Samples equal to a quantile value are dropped too.
    upper_q = np.quantile(train_gen_lengths, 0.95)
    lower_q = np.quantile(train_gen_lengths, 0.05)
    below_q_ids = (train_gen_lengths < upper_q) & (train_gen_lengths > lower_q)
    train_gen_lengths = train_gen_lengths[below_q_ids]

    # The length feature and its sort order are the same for every metric, so
    # they are computed once per dataset rather than once per subplot.
    gen_length_scaler = MinMaxScaler()
    # ravel() (unlike squeeze()) still returns a 1-D array for a single sample.
    train_gen_lengths_normalized = gen_length_scaler.fit_transform(
        train_gen_lengths[:, np.newaxis]
    ).ravel()
    train_sort_order = np.argsort(train_gen_lengths_normalized)
    sorted_lengths = train_gen_lengths_normalized[train_sort_order]

    train_normalized_metric_values = {}

    # squeeze=False always returns a 2-D array of axes, so indexing also works
    # when `all_metrics` holds a single metric.
    fig, axs = plt.subplots(
        1,
        len(all_metrics),
        figsize=(8 * len(all_metrics), 7),
        squeeze=False,
    )
    for i, metric in enumerate(all_metrics):
        train_metric = train_metric_values[metric][below_q_ids]

        # Min-max normalize the metric so slopes are comparable across metrics.
        scaler = MinMaxScaler()
        train_normalized_metric_values[metric] = scaler.fit_transform(
            train_metric[:, np.newaxis]
        ).ravel()

        train_linreg = LinearRegression()
        train_linreg.fit(
            train_gen_lengths_normalized[:, np.newaxis],
            train_normalized_metric_values[metric],
        )
        coef = train_linreg.coef_[0]

        ax = axs[0, i]
        ax.plot(
            sorted_lengths,
            train_normalized_metric_values[metric][train_sort_order],
            label='train',
        )
        ax.plot(
            sorted_lengths,
            train_linreg.predict(sorted_lengths[:, np.newaxis]),
            label=f'train_pred: {round(coef,3)}',
        )

        ax.legend()
        ax.set_title(metric)
        ax.set_xlabel('Generated sequence length (Normalized)')
        ax.set_ylabel('Metric value (Normalized)')

    fig.suptitle(dataset)

    plt.tight_layout()
    plt.show()
    # Release the figure; otherwise one figure per dataset stays in memory.
    plt.close(fig)

In [None]:
# For every dataset: fit a linear regression of each quality metric on the
# generated sequence length, both in their raw (un-normalized) units, using the
# training split only, and plot the data together with the fitted line. The
# slope shown in the legend is the metric change per unit of generated length.
# NOTE(review): `coefs` is initialized here but never filled by this cell.
coefs = defaultdict(list)

for dataset in DATASETS:
    (
        train_ue_values,
        test_ue_values,
        train_metric_values,
        test_metric_values,
        train_gen_lengths,
        gen_lengths,
    ) = extract_and_prepare_data(dataset, methods_dict, all_metrics)

    # Keep only samples whose generated length lies strictly between the 5%
    # and 95% quantiles (presumably to stop length outliers from dominating
    # the fit -- confirm). Samples equal to a quantile value are dropped too.
    upper_q = np.quantile(train_gen_lengths, 0.95)
    lower_q = np.quantile(train_gen_lengths, 0.05)
    below_q_ids = (train_gen_lengths < upper_q) & (train_gen_lengths > lower_q)
    train_gen_lengths = train_gen_lengths[below_q_ids]

    # The sort order depends only on the lengths, so compute it once per dataset.
    train_sort_order = np.argsort(train_gen_lengths)
    sorted_lengths = train_gen_lengths[train_sort_order]

    # squeeze=False always returns a 2-D array of axes, so indexing also works
    # when `all_metrics` holds a single metric.
    fig, axs = plt.subplots(
        1,
        len(all_metrics),
        figsize=(8 * len(all_metrics), 7),
        squeeze=False,
    )
    for i, metric in enumerate(all_metrics):
        train_metric = train_metric_values[metric][below_q_ids]

        train_linreg = LinearRegression()
        train_linreg.fit(train_gen_lengths[:, np.newaxis], train_metric)
        coef = train_linreg.coef_[0]

        ax = axs[0, i]
        ax.plot(sorted_lengths, train_metric[train_sort_order], label='train')
        ax.plot(
            sorted_lengths,
            train_linreg.predict(sorted_lengths[:, np.newaxis]),
            label=f'train_pred: {round(coef,3)}',
        )

        ax.legend()
        ax.set_title(metric)
        # Values in this cell are raw, so the labels must not say "Normalized".
        ax.set_xlabel('Generated sequence length')
        ax.set_ylabel('Metric value')

    fig.suptitle(dataset)

    plt.tight_layout()
    plt.show()
    # Release the figure; otherwise one figure per dataset stays in memory.
    plt.close(fig)