In [40]:
import pandas as pd
from itertools import combinations
import numpy as np
rng = np.random.default_rng()

In [43]:
data = dict()

data['base_deberta'] = pd.read_pickle('results/base_deberta.pkl')
data['visattn'] = pd.read_pickle('results/visattn.pkl')
data['vlscore_all'] = pd.read_pickle('results/vlscore_all.pkl')
data['vlscore_bef_4'] = pd.read_pickle('results/vlscore_bef-4.pkl')
data['vlscore_bef'] = pd.read_pickle('results/vlscore_bef.pkl')
data['vlscore_patch_visattn'] = pd.read_pickle('results/vlscore_patch_visattn.pkl')
data['vlscore_visattn'] = pd.read_pickle('results/vlscore_visattn.pkl')
data['vlscore'] = pd.read_pickle('results/vlscore.pkl')

# The labels are all equal
true_labels = data['base_deberta']['labels']

In [34]:
def eval_with_paired_bootstraptrue(gold, sys1, sys2, num_samples=10000, sample_ratio=0.5):
    """Evaluate with paired boostrap
    This compares two systems, performing a significance tests with
    paired bootstrap resampling to compare the accuracy of the two systems.

    Parameters
    ----------
    gold
      The correct labels
    sys1
      The output of system 1
    sys2
      The output of system 2
    num_samples
      The number of bootstrap samples to take
    sample_ratio
      The ratio of samples to take every time

    """
    assert len(gold) == len(sys1)
    assert len(gold) == len(sys2)

    gold = np.array(gold)
    sys1 = np.array(sys1)
    sys2 = np.array(sys2)

    sys1_scores = []
    sys2_scores = []
    wins = [0, 0, 0]
    n = len(gold)

    for _ in range(num_samples):
        # Subsample the gold and system outputs
        subset_idxs = rng.choice(n, int(n * sample_ratio), replace=True)
        sys1_score = (sys1[subset_idxs] == gold[subset_idxs]).mean()
        sys2_score = (sys2[subset_idxs] == gold[subset_idxs]).mean()

        if sys1_score > sys2_score:
            wins[0] += 1
        elif sys1_score < sys2_score:
            wins[1] += 1
        else:
            wins[2] += 1

        sys1_scores.append(sys1_score)
        sys2_scores.append(sys2_score)

    # Print win stats
    wins = [x / float(num_samples) for x in wins]
    print("Win ratio: sys1=%.3f, sys2=%.3f, tie=%.3f" % (wins[0], wins[1], wins[2]))
    if wins[0] > wins[1]:
        print("(sys1 is superior with p value p=%.3f)\n" % (1 - wins[0]))
    elif wins[1] > wins[0]:
        print("(sys2 is superior with p value p=%.3f)\n" % (1 - wins[1]))

    # Print system stats
    sys1_scores.sort()
    sys2_scores.sort()
    
    return np.mean(sys1_scores), np.median(sys1_scores), sys1_scores[int(num_samples * 0.025)], sys1_scores[int(num_samples * 0.975)], np.mean(sys2_scores), np.median(sys2_scores), sys2_scores[int(num_samples * 0.025)], sys2_scores[int(num_samples * 0.975)] 

def report(sys1mean, sys1median, sys1_95int_start, sys1_95int_end, sys2mean, sys2median, sys2_95int_start, sys2_95int_end):
    print(
        "sys1 mean=%.3f, median=%.3f, 95%% confidence interval=[%.3f, %.3f]"
        % (
            sys1mean,
            sys1median,
            sys1_95int_start,
            sys1_95int_end,
        )
    )
    print(
        "sys2 mean=%.3f, median=%.3f, 95%% confidence interval=[%.3f, %.3f]"
        % (
            sys2mean,
            sys2median,
            sys2_95int_start,
            sys2_95int_end,
        )
    )

In [41]:
for sys1, sys2 in combinations(data, 2):
    print(f'Comparing {sys1} and {sys2}.')
    eval_with_paired_bootstrap(true_labels, data[sys1]['preds'], data[sys2]['preds'])
    print('-'*64)

Comparing base_deberta and visattn.
Win ratio: sys1=0.476, sys2=0.512, tie=0.012
(sys2 is superior with p value p=0.488)

----------------------------------------------------------------
Comparing base_deberta and vlscore_all.
Win ratio: sys1=0.000, sys2=1.000, tie=0.000
(sys2 is superior with p value p=0.000)

----------------------------------------------------------------
Comparing base_deberta and vlscore_bef_4.
Win ratio: sys1=0.000, sys2=1.000, tie=0.000
(sys2 is superior with p value p=0.000)

----------------------------------------------------------------
Comparing base_deberta and vlscore_bef.
Win ratio: sys1=0.000, sys2=1.000, tie=0.000
(sys2 is superior with p value p=0.000)

----------------------------------------------------------------
Comparing base_deberta and vlscore_patch_visattn.
Win ratio: sys1=0.674, sys2=0.315, tie=0.012
(sys1 is superior with p value p=0.326)

----------------------------------------------------------------
Comparing base_deberta and vlscore_vi

In [44]:
vlscore_all

{'labels': [1,
  2,
  1,
  2,
  1,
  2,
  1,
  2,
  1,
  1,
  2,
  2,
  1,
  2,
  1,
  1,
  2,
  1,
  2,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  2,
  1,
  2,
  1,
  1,
  2,
  2,
  2,
  2,
  1,
  1,
  1,
  2,
  1,
  1,
  2,
  2,
  1,
  1,
  2,
  1,
  1,
  2,
  2,
  1,
  2,
  1,
  2,
  1,
  1,
  2,
  1,
  1,
  2,
  1,
  1,
  2,
  2,
  1,
  1,
  2,
  1,
  1,
  1,
  2,
  2,
  1,
  1,
  2,
  1,
  2,
  1,
  1,
  1,
  2,
  1,
  1,
  2,
  1,
  2,
  1,
  2,
  1,
  1,
  1,
  1,
  2,
  2,
  1,
  2,
  1,
  1,
  2,
  1,
  1,
  2,
  2,
  1,
  1,
  1,
  1,
  2,
  2,
  1,
  1,
  2,
  1,
  2,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  2,
  2,
  1,
  2,
  1,
  2,
  2,
  1,
  1,
  2,
  1,
  2,
  1,
  1,
  1,
  2,
  2,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  2,
  1,
  2,
  1,
  2,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  2,
  2,
  1,
  1,
  2,
  1,
  2,
  1,
  2,
  1,
  1,
  1,
  2,
  1,
  1,
  2,
  2,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  2,
  1,
  2,
  1,
  2,
  1,
  1,
  1,
  1,
  2,
  2,
  1,
  2,
