## Import necessary packages

In [1]:
import os
# Restrict the process to these GPUs; set before torch is imported below.
os.environ["CUDA_VISIBLE_DEVICES"]="4,5,6,7"
from kilt import retrieval
from kilt import kilt_utils as utils
import tasks
# from kilt.retrievers import DPR_connector
# NOTE: this rebinds `utils`, replacing the kilt_utils alias bound above.
import utils
from rouge_score import rouge_scorer
import random
import numpy as np
import torch
# Turn autograd off globally for everything that runs after this cell.
torch.set_grad_enabled(False)
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from datasets import load_dataset
import json
from tqdm import tqdm
import opensource

# Seed both global RNGs imported above. Seeding only NumPy leaves every
# stdlib `random` call (e.g. random.shuffle) non-reproducible across runs.
np.random.seed(42)
random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pickle
def write_list(a_list, file_name):
    """Pickle ``a_list`` into ``file_name`` and print a confirmation line.

    Args:
        a_list: Object to serialize (passed straight to ``pickle.dump``).
        file_name: Destination path; it is opened in binary write mode.
    """
    with open(file_name, mode='wb') as sink:  # pickle requires a binary stream
        pickle.dump(a_list, sink)
        print('Done writing list into a binary file')
def read_list(file_name):
    """Load and return the object pickled in ``file_name``.

    The file is opened in binary read mode, as pickle requires.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file, so
    only call this on files from a trusted source.
    """
    with open(file_name, mode='rb') as source:
        return pickle.load(source)

def read_results(task):
    """Load the cached per-task result lists from their pickle files.

    Each artifact is read with ``read_list`` from a file in the current
    working directory whose name embeds ``task``.

    The ``uncertain_opensource_semantics_{task}.p`` file is intentionally not
    loaded: its contents were never returned or used, so reading it only cost
    I/O and made the call fail when that file was absent.

    Args:
        task: Task key interpolated into every pickle filename.

    Returns:
        A 9-tuple, in this order: ``retrieved_scores``,
        ``retrieved_true_scores``, ``queries``, ``answers``,
        ``opensource_true_scores``, ``opensource_answers``,
        ``opensource_occurances``, ``opensource_semantic_ids``,
        ``opensource_probs``.
    """
    retrieved_scores = read_list(f'uncertain_retrieved_scores_{task}.p')
    retrieved_true_scores = read_list(f'uncertain_retrieved_true_scores_{task}.p')
    queries = read_list(f'uncertain_queries_{task}.p')
    answers = read_list(f'uncertain_answers_{task}.p')
    opensource_true_scores = read_list(f'uncertain_opensource_true_scores_{task}.p')
    opensource_answers = read_list(f'uncertain_opensource_answers_{task}.p')
    opensource_occurances = read_list(f'uncertain_occurances_{task}.p')
    opensource_semantic_ids = read_list(f'uncertain_semantic_ids_{task}.p')
    opensource_probs = read_list(f'uncertain_probs_{task}.p')

    return (
        retrieved_scores, retrieved_true_scores,
        queries, answers,
        opensource_true_scores, opensource_answers,
        opensource_occurances, opensource_semantic_ids,
        opensource_probs,
    )

In [3]:
# Task key interpolated into the pickle filenames that read_results() loads.
task = 'trivia'

In [4]:
# Toggle for loading the MNLI classifier below; it is skipped when False.
semantic = False

# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL"], use_stemmer=True
)

if semantic:
    # Imported lazily so the classes are only pulled in when requested.
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    # Tokenizer and sequence-classification model share one checkpoint;
    # the model is moved to the GPU.
    semantic_tokenizer = AutoTokenizer.from_pretrained(
        "microsoft/deberta-large-mnli"
    )
    semantic_model = AutoModelForSequenceClassification.from_pretrained(
        "microsoft/deberta-large-mnli"
    ).cuda()

In [5]:
# Unpack the nine cached result lists for the selected task.
(
    retrieved_scores,
    retrieved_true_scores,
    queries,
    answers,
    opensource_true_scores,
    opensource_answers,
    opensource_occurances,
    opensource_semantic_ids,
    opensource_probs,
) = read_results(task)

In [6]:
# Display how many entries were loaded into retrieved_scores.
len(retrieved_scores)

688

In [7]:
# Shuffle example indices and cut them 30% / 30% / 40% into two calibration
# splits and a test split.
# NOTE(review): the `- 1` leaves the last example out of every split; confirm
# this is intentional rather than an off-by-one.
indices = np.arange(len(retrieved_true_scores)-1)
# Shuffle with NumPy's global RNG, which is seeded by np.random.seed(42) in
# the import cell, so the split is reproducible on a fresh kernel.
np.random.shuffle(indices)
first_cut = int(len(indices) * 0.3)
second_cut = int(len(indices) * 0.6)
cal_first_indices = indices[:first_cut]
cal_second_indices = indices[first_cut:second_cut]
test_indices = indices[second_cut:]


def _split_three_ways(values):
    """Return the (cal_first, cal_second, test) selections of ``values``.

    Each selection is produced by ``utils.split`` with the matching index
    array defined above.
    """
    return (
        utils.split(values, cal_first_indices),
        utils.split(values, cal_second_indices),
        utils.split(values, test_indices),
    )


(cal_first_retrieved_true_scores,
 cal_second_retrieved_true_scores,
 test_retrieved_true_scores) = _split_three_ways(retrieved_true_scores)
(cal_first_opensource_true_scores,
 cal_second_opensource_true_scores,
 test_opensource_true_scores) = _split_three_ways(opensource_true_scores)
(cal_first_retrieved_scores,
 cal_second_retrieved_scores,
 test_retrieved_scores) = _split_three_ways(retrieved_scores)
(cal_first_opensource_occurances,
 cal_second_opensource_occurances,
 test_opensource_occurances) = _split_three_ways(opensource_occurances)
(cal_first_opensource_semantic_ids,
 cal_second_opensource_semantic_ids,
 test_opensource_semantic_ids) = _split_three_ways(opensource_semantic_ids)
(cal_first_queries,
 cal_second_queries,
 test_queries) = _split_three_ways(queries)
(cal_first_opensource_answers,
 cal_second_opensource_answers,
 test_opensource_answers) = _split_three_ways(opensource_answers)
(cal_first_opensource_probs,
 cal_second_opensource_probs,
 test_opensource_probs) = _split_three_ways(opensource_probs)

In [8]:
# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL"], use_stemmer=True
)

## Compute coverage rate

In [9]:
def coverage(
        retrieved_true_scores_list, opensource_true_scores_list,
        retrieved_thr, qa_thr):
    """Flag, per example, whether both scores clear their thresholds.

    The two input sequences are paired element-wise with ``zip``. For each
    pair, the entry of ``opensource_true_scores_list`` is first reduced with
    ``np.max``.

    Args:
        retrieved_true_scores_list: One retrieval score per example.
        opensource_true_scores_list: Per example, the score(s) to reduce
            with ``np.max``.
        retrieved_thr: Minimum retrieval score (inclusive).
        qa_thr: Minimum reduced QA score (inclusive).

    Returns:
        A list of Python bools, one per zipped pair; True when the retrieval
        score is >= ``retrieved_thr`` and the reduced QA score is >= ``qa_thr``.
    """
    # Reduce each example's QA scores to its maximum as the pairs are consumed.
    best_qa_scores = map(np.max, opensource_true_scores_list)
    return [
        bool(retrieved >= retrieved_thr and best_qa >= qa_thr)
        for retrieved, best_qa in zip(retrieved_true_scores_list, best_qa_scores)
    ]

In [13]:
alpha = 0.6

# Score each example by the largest value in its probs mapping, then derive
# the QA threshold from the first calibration split.
cal_first_scores = [
    np.max(list(example_probs.values()))
    for example_probs in cal_first_opensource_probs
]
opensource_qa_thr = utils.compute_threshold(cal_first_scores, alpha=1-alpha)
print('QA (I am not certain) threshold', opensource_qa_thr)

# Fraction of second-calibration scores at or below the threshold.
cal_second_scores = [
    np.max(list(example_probs.values()))
    for example_probs in cal_second_opensource_probs
]
qa_coverage = np.mean(np.array(cal_second_scores) <= opensource_qa_thr)
print('validation qa coverage', qa_coverage)

# Same fraction on the held-out test split.
test_scores = [
    np.max(list(example_probs.values()))
    for example_probs in test_opensource_probs
]
qa_coverage = np.mean(np.array(test_scores) <= opensource_qa_thr)
print('test qa coverage', qa_coverage)

QA (I am not certain) threshold 0.7666666666666667
validation qa coverage 0.44660194174757284
test qa coverage 0.4109090909090909
