In [None]:
import sys; sys.path.append("..")
from importlib import reload
import persist_to_disk as ptd
import os
ptd.config.set_project_path(os.path.abspath("../"))
import tqdm
import pandas as pd
import numpy as np
import re
import torch

import utils

import matplotlib.pyplot as plt
%matplotlib inline

## Read the toy data

In [None]:
# Display the installed pandas version. The demo data in the next cell is written and
# read back with pd.to_pickle / pd.read_pickle, so the version is worth recording
# alongside the results.
pd.__version__

In [None]:
# Location of the cached demo subset, relative to the notebook's working directory.
data_path = './demo_data.pkl'

# Rebuild the cache only when it is missing. The project-specific imports live inside
# this branch, so they are only executed when the cache actually has to be rebuilt.
if not os.path.isfile(data_path):
    import models
    import dataeval.load as dload
    import _settings

    tokenizer = models.load_tokenizer('llama-13b-hf')
    res = dload.read_cleaned_outputs_new(_settings.GEN_PATHS['trivia']['llama-13b'])

    # Draw 500 distinct record positions with a fixed seed (42) so that every rebuild
    # of the cache selects the same records.
    idx = np.random.RandomState(42).choice(len(res), 500, replace=False)

    demo_data = []
    for _idx in idx:
        record = res[_idx]
        curr = {
            # The stored prompt is decoded back to text with the tokenizer.
            'prompt': tokenizer.decode(record['prompt']),
            'id': record['id'],
            'question': record['question'],
            'answer': record['answer'],
            # Keep only the generation entries whose key starts with "text".
            'generations': {
                name: value
                for name, value in record['generations'].items()
                if name.startswith("text")
            },
        }
        demo_data.append(curr)
    pd.to_pickle(demo_data, data_path)

# Always read back from disk and keep just the first 10 records for the demo.
# NOTE(review): read_pickle unpickles the file; only use a demo_data.pkl you trust.
demo_data = pd.read_pickle(data_path)[:10]

## UQ object

In [None]:
import pipeline.uq_bb as uq_bb
# Reload so that edits made to pipeline/uq_bb.py since the last import take effect.
reload(uq_bb)

# Question / reference / answer triples handed to UQ_summ as `gpteval_examples`.
# NOTE(review): presumably few-shot examples for the GPT-based correctness check
# (the first answer matches its reference, the second does not) - confirm in uq_bb.
gpteval_examples = [
    {
        'question': 'In Scotland a bothy/bothie is a?',
        'reference': 'House',
        'answer': 'House',
    },
    {
        'question': 'Where in England was Dame Judi Dench born?',
        'reference': 'York',
        'answer': 'London',
    },
]

# When split is not set, uses the default hyperparameters.
# Optional arguments left at their defaults here: split='test', cal_size=5, seed=1
obj = uq_bb.UQ_summ(
    demo_data,
    clean=True,
    gpteval_examples=gpteval_examples,
)

In [None]:
# get_acc returns a pair for the requested accuracy name; it is unpacked into
# _ea (shown below as the expected accuracy) and _ia (shown in the next cell as the
# individual accuracy).
_ea, _ia = obj.get_acc('generations|gpt|acc')
print("Expected Accuracy (mean over all generations)")
# Bare last expression so the notebook renders the value.
_ea

In [None]:
# _ia is the second value returned by obj.get_acc('generations|gpt|acc') above.
print("Individual Accuracy")
_ia

In [None]:
# get_uq returns a pair for the requested measure; it is unpacked into _u (shown below
# as the uncertainty) and _c (shown in the next cell as the negative confidence).
# NOTE(review): temperature and eigv_threshold are passed through unchanged; their
# exact effect is defined in pipeline.uq_bb - confirm there before tuning.
_u, _c = obj.get_uq('generations|eccentricity|agreement_w', temperature=3., eigv_threshold=0.9)
print("Uncertainty (higher=more uncertain)")
# Bare last expression so the notebook renders the value.
_u

In [None]:
# _c is the second value returned by obj.get_uq(...) above.
print("(Negative) Confidence (higher=less confident)")
_c

## Summary

In [None]:
# Number of generations passed to obj.summ; also used as the divisor when the
# per-generation metrics are averaged in later cells.
num_gens = 10

# Keyword presets for obj.summ, keyed by evaluation setting:
#   'u+ea' - uncertainty used to predict expected accuracy
#   'u+ia' - uncertainty used to predict individual accuracy
#   'c+ia' - confidence used to predict individual accuracy
summ_presets = {
    'u+ea': {'overall': True, 'use_conf': False},
    'u+ia': {'overall': False, 'use_conf': False},
    'c+ia': {'overall': False, 'use_conf': True},
}
# Preset actually used in this notebook run.
summ_kwargs = summ_presets['c+ia']

# Measures to summarise, in the order they are handed to obj.summ.
uq_methods = [
    # baselines
    'generations|numsets',
    'lexical_sim',
    # disagreement_w similarity
    'generations|spectral_eigv_clip|disagreement_w',
    'generations|eccentricity|disagreement_w',
    'generations|degree|disagreement_w',
    # agreement_w similarity
    'generations|spectral_eigv_clip|agreement_w',
    'generations|eccentricity|agreement_w',
    'generations|degree|agreement_w',
    # jaccard similarity
    'generations|spectral_eigv_clip|jaccard',
    'generations|eccentricity|jaccard',
    'generations|degree|jaccard',
]

summ_obj = obj.summ(
    uq_methods,
    acc_name='generations|gpt|acc',
    num_gens=num_gens,
    **summ_kwargs
)

In [None]:
# U + EA (using uncertainty to predict expected accuracy)
# NOTE(review): summ_obj above was built with the 'c+ia' preset (overall=False,
# use_conf=True); confirm that summ_overall is still meaningful with that setting.
summ_obj.summ_overall('auarc')

In [None]:
# C + IA (using confidence to predict individual accuracy), metric: AUARC.
# The values returned by summ_individual are summed and divided by num_gens.
# NOTE(review): assumes summ_individual yields one entry per generation, so this is
# the mean over generations - confirm against pipeline.uq_bb.
sum(summ_obj.summ_individual('auarc', use_conf=True)) / num_gens

In [None]:
# C + IA (using confidence to predict individual accuracy), metric: AUROC.
# The values returned by summ_individual are summed and divided by num_gens.
# NOTE(review): assumes summ_individual yields one entry per generation, so this is
# the mean over generations - confirm against pipeline.uq_bb.
sum(summ_obj.summ_individual('auroc', use_conf=True)) / num_gens

## Plots

In [None]:
# Re-import uq_bb so that edits made to the module on disk are picked up.
reload(uq_bb)
# Open a new 6 x 3.5 inch figure for the ROC plot.
# NOTE(review): presumably summ_obj.plot draws onto the current figure - confirm.
plt.figure(figsize=(6, 3.5))
def name_map(v):
    """Translate an internal method identifier into a short plot-legend label.

    Args:
        v: Method identifier string, e.g. ``'eccentricity|agreement_w'``,
            ``'numsets'`` or ``'self_prob'``.

    Returns:
        The display label. ``'self_prob'`` maps to ``"P(true)"``; otherwise the
        similarity suffix and the measure prefix are abbreviated, and the result
        is looked up in a small alias table. Identifiers with no alias are
        returned as they stand after the substitutions.
    """
    if v == 'self_prob':
        return "P(true)"
    # Similarity-measure suffixes collapse to a one-letter tag.
    v = v.replace("|disagreement_w", "|(C)")
    v = v.replace("|agreement_w", "|(E)")
    v = v.replace("|jaccard", "|(J)")
    # Measure prefixes are shortened; the trailing "|" is consumed so the result
    # reads e.g. "Ecc(E)".
    v = v.replace("spectral_eigv_clip|", "EigV")
    v = v.replace("eccentricity|", "Ecc")
    v = v.replace("degree|", "Deg")
    # Whole-name aliases; anything not listed falls through unchanged.
    aliases = {
        'numsets': 'NumSet',
        'semanticEntropy|unnorm': 'SE',
        'oracle': "Oracle",
        'blind': 'Base Accuracy',
    }
    return aliases.get(v, v)
# Methods whose curves are requested for the ROC plot.
roc_methods = [
    'generations|numsets',
    'generations|eccentricity|agreement_w',
    'generations|spectral_eigv_clip|agreement_w',
    'generations|degree|agreement_w',
    'self_prob',
    'semanticEntropy|unnorm',
]

# name_map supplies the legend labels.
# NOTE(review): the meaning of cutoff=1 and iloc=1 is defined by summ_obj.plot -
# confirm in pipeline.uq_bb before changing them.
summ_obj.plot(
    'roc',
    name_map=name_map,
    methods=roc_methods,
    cutoff=1,
    iloc=1,
)

# Label the axes and title the figure.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC")


In [None]:
# Re-import uq_bb so that edits made to the module on disk are picked up.
reload(uq_bb)
# Open a new 6 x 3.5 inch figure for the ARC plot.
# NOTE(review): presumably summ_obj.plot draws onto the current figure - confirm.
plt.figure(figsize=(6,3.5))
def name_map(v):
    """Translate an internal method identifier into a short plot-legend label.

    Args:
        v: Method identifier string, e.g. ``'degree|agreement_w'``, ``'oracle'``
            or ``'self_prob'``.

    Returns:
        The display label. ``'self_prob'`` maps to ``"P(true)"``; otherwise the
        similarity suffix and the measure prefix are abbreviated, and the result
        is looked up in a small alias table. Identifiers with no alias are
        returned as they stand after the substitutions.
    """
    if v == 'self_prob':
        return "P(true)"
    # Applied in order: similarity suffixes first, then measure prefixes (whose
    # trailing "|" is consumed so the result reads e.g. "Deg(E)").
    substitutions = (
        ("|disagreement_w", "|(C)"),
        ("|agreement_w", "|(E)"),
        ("|jaccard", "|(J)"),
        ("spectral_eigv_clip|", "EigV"),
        ("eccentricity|", "Ecc"),
        ("degree|", "Deg"),
    )
    for old, new in substitutions:
        v = v.replace(old, new)
    # Whole-name aliases; anything not listed falls through unchanged.
    aliases = {
        'numsets': 'NumSet',
        'semanticEntropy|unnorm': 'SE',
        'oracle': "Oracle",
        'blind': 'Base Accuracy',
    }
    return aliases.get(v, v)
# Methods whose curves are requested for the ARC plot; unlike the ROC plot, this
# list also includes the 'oracle' and 'blind' entries.
arc_methods = [
    'generations|numsets',
    'generations|eccentricity|agreement_w',
    'generations|spectral_eigv_clip|agreement_w',
    'generations|degree|agreement_w',
    'oracle',
    'blind',
    'self_prob',
    'semanticEntropy|unnorm',
]

# name_map supplies the legend labels.
# NOTE(review): the meaning of cutoff=1 is defined by summ_obj.plot - confirm in
# pipeline.uq_bb before changing it.
summ_obj.plot(
    'arc',
    name_map=name_map,
    methods=arc_methods,
    cutoff=1,
)

# Label the axes and title the figure.
plt.xlabel("Rejection Rate")
plt.ylabel("Average Accuracy")
plt.title("ARC")
