## Import necessary packages

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="4,5,6,7"
from kilt import retrieval
from kilt import kilt_utils as utils
import tasks
# from kilt.retrievers import DPR_connector
import utils
from rouge_score import rouge_scorer
import random
import numpy as np
import torch
torch.set_grad_enabled(False)
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from datasets import load_dataset
import json
from tqdm import tqdm
import opensource

np.random.seed(21)

In [3]:
import pickle
def write_list(a_list, file_name):
    # store list in binary file so 'wb' mode
    with open(file_name, 'wb') as fp:
        pickle.dump(a_list, fp)
        print('Done writing list into a binary file')
def read_list(file_name):
    # for reading also binary mode is important
    with open(file_name, 'rb') as fp:
        n_list = pickle.load(fp)
        return n_list

def save_results(task):
    # save retrieved_scores to a pickle file
    write_list(retrieved_scores, f'retrieved_scores_{task}.p')
    # save retrieved_true_scores to a pickle file
    write_list(retrieved_true_scores, f'retrieved_true_scores_{task}.p')
    # save queries to a pickle file
    write_list(queries, f'queries_{task}.p')
    # save answers to a pickle file
    write_list(answers, f'answers_{task}.p')
    # save passages to a pickle file
    write_list(passages, f'passages_{task}.p')
    # save opensource_true_scores to a pickle file
    write_list(opensource_true_scores, f'opensource_true_scores_{task}.p')
    # save opensource_texts to a ickle file
#     write_list(opensource_texts, f'opensource_texts_{task}.p')
    # save opensource_answers to a pickle file
    write_list(opensource_answers, f'opensource_answers_{task}.p')
    # save opensource_semantics to a picle file
    write_list(opensource_semantics, f'opensource_semantics_{task}.p')
    # save feasibilities to a pickle file
    write_list(feasibilities, f'feasibilities_{task}.p')
    # save occurances to a pickle file
    write_list(occurances, f'occurances_{task}.p')
    # save semantic_ids to a pickle file
    write_list(semantic_ids, f'semantic_ids_{task}.p')

def read_results(task):
    retrieved_scores = read_list(f'retrieved_scores_{task}.p')
    retrieved_true_scores = read_list(f'retrieved_true_scores_{task}.p')
    queries = read_list(f'queries_{task}.p')
    answers = read_list(f'answers_{task}.p')
    opensource_true_scores = read_list(f'opensource_true_scores_{task}.p')
    opensource_answers = read_list(f'opensource_answers_{task}.p')
    opensource_semantics = read_list(f'opensource_semantics_{task}.p')
    opensource_occurances = read_list(f'occurances_{task}.p')
    opensource_semantic_ids = read_list(f'semantic_ids_{task}.p')
    opensource_probs = read_list(f'probs_{task}.p')
    
    return retrieved_scores, retrieved_true_scores, \
           queries, answers, \
           opensource_true_scores, opensource_answers, \
           opensource_occurances, opensource_semantic_ids, opensource_probs

In [43]:
task = 'nq'

In [44]:
semantic = False
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"],
                                        use_stemmer=True)
if semantic:
    from transformers import AutoModelForSequenceClassification, AutoTokenizer
    # setup semantic model
    semantic_tokenizer = \
        AutoTokenizer.from_pretrained("microsoft/deberta-large-mnli")
    semantic_model = \
        AutoModelForSequenceClassification.from_pretrained(
            "microsoft/deberta-large-mnli"
        ).cuda()

In [45]:
retrieved_scores, retrieved_true_scores, queries, answers, opensource_true_scores, opensource_answers, opensource_occurances, opensource_semantic_ids, opensource_probs = \
        read_results(task)

In [46]:
# indices = np.arange(1000)
indices = np.arange(len(queries))
random.shuffle(indices)
cal_first_indices = indices[:int(len(indices) * 0.3)]
cal_second_indices = indices[int(len(indices) * 0.3) : int(len(indices) * 0.6)]
test_indices = indices[int(len(indices) * 0.6):]
# test_indices = indices[int(len(indices) * 0.3):]

cal_first_retrieved_true_scores = utils.split(retrieved_true_scores, cal_first_indices)
cal_second_retrieved_true_scores = utils.split(retrieved_true_scores, cal_second_indices)
test_retrieved_true_scores = utils.split(retrieved_true_scores, test_indices)
cal_first_opensource_true_scores = utils.split(opensource_true_scores, cal_first_indices)
cal_second_opensource_true_scores = utils.split(opensource_true_scores, cal_second_indices)
test_opensource_true_scores = utils.split(opensource_true_scores, test_indices)
cal_first_retrieved_scores = utils.split(retrieved_scores, cal_first_indices)
cal_second_retrieved_scores = utils.split(retrieved_scores, cal_second_indices)
test_retrieved_scores = utils.split(retrieved_scores, test_indices)
cal_first_opensource_occurances = utils.split(opensource_occurances, cal_first_indices)
cal_second_opensource_occurances = utils.split(opensource_occurances, cal_second_indices)
test_opensource_occurances = utils.split(opensource_occurances, test_indices)
cal_first_opensource_semantic_ids = utils.split(opensource_semantic_ids, cal_first_indices)
cal_second_opensource_semantic_ids = utils.split(opensource_semantic_ids, cal_second_indices)
test_opensource_semantic_ids = utils.split(opensource_semantic_ids, test_indices)
cal_first_queries = utils.split(queries, cal_first_indices)
cal_second_queries = utils.split(queries, cal_second_indices)
test_queries = utils.split(queries, test_indices)
cal_first_opensource_answers = utils.split(opensource_answers, cal_first_indices)
cal_second_opensource_answers = utils.split(opensource_answers, cal_second_indices)
test_opensource_answers = utils.split(opensource_answers, test_indices)
cal_first_answers = utils.split(answers, cal_first_indices)
cal_second_answers = utils.split(answers, cal_second_indices)
test_answers = utils.split(answers, test_indices)
cal_first_opensource_probs = utils.split(opensource_probs, cal_first_indices)
cal_second_opensource_probs = utils.split(opensource_probs, cal_second_indices)
test_opensource_probs = utils.split(opensource_probs, test_indices)


In [47]:
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"],
                                        use_stemmer=True)

## Compute coverage rate

In [48]:
def coverage(
        retrieved_true_scores_list, opensource_true_scores_list,
        retrieved_thr, qa_thr):

    includes = []
    for idx, (retrieved_true_score, opensource_true_score) in enumerate(zip(retrieved_true_scores_list, opensource_true_scores_list)):
#         if idx > 20:
        opensource_true_score = np.max(opensource_true_score)
        include = True if (retrieved_true_score >= retrieved_thr and 
                           opensource_true_score >= qa_thr) \
                       else False
        includes.append(include)
    return includes

In [49]:
alpha = 0.1
retrieved_thr = utils.compute_threshold(cal_first_retrieved_true_scores, alpha=alpha/2)
cal_first_scores = []
for scores in cal_first_opensource_true_scores:
    cal_first_scores.append(np.max(scores))
opensource_qa_thr = utils.compute_threshold(cal_first_scores, alpha=alpha/2)

retrieved_coverage = np.mean(np.array(cal_second_retrieved_true_scores) >= retrieved_thr)
cal_second_scores = []
for scores in cal_second_opensource_true_scores:
    cal_second_scores.append(np.max(scores))
qa_coverage = np.mean(np.array(cal_second_scores) >= opensource_qa_thr)
print('retrieval coverage', retrieved_coverage)
print('qa coverage', qa_coverage)

retrieved_coverage = np.mean(np.array(test_retrieved_true_scores) >= retrieved_thr)
test_scores = []
for scores in test_opensource_true_scores:
    test_scores.append(np.max(scores))
qa_coverage = np.mean(np.array(test_scores) >= opensource_qa_thr)
print('test retrieval coverage', retrieved_coverage)
print('test qa coverage', qa_coverage)

retrieval coverage 0.9422222222222222
qa coverage 0.9733333333333334
test retrieval coverage 0.93
test qa coverage 0.96


In [50]:
coverages = coverage(test_retrieved_true_scores, 
                    test_opensource_true_scores,
                    retrieved_thr,
                    opensource_qa_thr
                   )
print('End-to-end coverage', np.mean(coverages))

End-to-end coverage 0.895


## PAC-Bonf

In [51]:
from pac_utils import find_maximum_train_error_allow

In [52]:
delta = 0.1
retrieve_alpha = find_maximum_train_error_allow(alpha/2.0, delta/2.0, len(cal_first_indices))
qa_alpha = find_maximum_train_error_allow(alpha/2.0, delta/2.0, len(cal_first_indices))
retrieved_thr = utils.compute_threshold(cal_first_retrieved_true_scores, alpha=retrieve_alpha)
cal_first_scores = []
for scores in cal_first_opensource_true_scores:
    cal_first_scores.append(np.max(scores))
opensource_qa_thr = utils.compute_threshold(cal_first_scores, alpha=qa_alpha)

retrieved_coverage = np.mean(np.array(cal_second_retrieved_true_scores) >= retrieved_thr)
cal_second_scores = []
for scores in cal_second_opensource_true_scores:
    cal_second_scores.append(np.max(scores))
qa_coverage = np.mean(np.array(cal_second_scores) >= opensource_qa_thr)
print('retrieval coverage', retrieved_coverage)
print('qa coverage', qa_coverage)

retrieved_coverage = np.mean(np.array(test_retrieved_true_scores) >= retrieved_thr)
test_scores = []
for scores in test_opensource_true_scores:
    test_scores.append(np.max(scores))
qa_coverage = np.mean(np.array(test_scores) >= opensource_qa_thr)
print('test retrieval coverage', retrieved_coverage)
print('test qa coverage', qa_coverage)

retrieval coverage 0.9644444444444444
qa coverage 0.9755555555555555
test retrieval coverage 0.9483333333333334
test qa coverage 0.9733333333333334


In [53]:
coverages = coverage(test_retrieved_true_scores, 
                    test_opensource_true_scores,
                    retrieved_thr,
                    opensource_qa_thr
                   )
print('End-to-end coverage', np.mean(coverages))

End-to-end coverage 0.9216666666666666


## Bayesian optimization

In [54]:
from skopt.space import Real
from skopt import gp_minimize
from skopt.utils import use_named_args
import multiprocessing
from multiprocessing import Value

In [74]:
def evaluate(
        test_retrieved_scores,
        test_queries, test_answers, test_chatgpt_answers, 
        test_chatgpt_semantic_ids, test_chatgpt_probs,
        retrieved_thr, chatgpt_qa_thr,
        cluster=True, kernel=40, verbose=True):
    
    length = len(test_retrieved_scores)
    lens = np.linspace(0, length, kernel+1)
    test_retrieved_scores_list = [test_retrieved_scores[int(lens[i]):int(lens[i+1])] for i in range(kernel)]
    test_chatgpt_semantic_ids_list = [test_chatgpt_semantic_ids[int(lens[i]):int(lens[i+1])] for i in range(kernel)]
    test_chatgpt_probs_list = [test_chatgpt_probs[int(lens[i]):int(lens[i+1])] for i in range(kernel)]
    test_answers_list = [test_answers[int(lens[i]):int(lens[i+1])] for i in range(kernel)]
    
    def run(i, shared_includes, shared_semantic_counts):
        includes = []
        semantics_total = []
        coverages = []
        semantic_counts = []
        for idx, (retrieved_scores_tmp, answers_tmp,\
                chatgpt_semantic_ids_tmp, chatgpt_probs_tmp) \
            in tqdm(enumerate(zip(
                test_retrieved_scores_list[i], test_answers_list[i], \
                test_chatgpt_semantic_ids_list[i], test_chatgpt_probs_list[i])), total=len(test_retrieved_scores_list[i])):

            include = False
    #         coverage_tmp = coverage(retrieved_true_scores_tmp, 
    #                         [chatgpt_true_scores_tmp],
    #                         retrieved_thr,
    #                         chatgpt_qa_thr
    #                        )[0]
    #         coverages.append(coverage_tmp)
            retrieved_count = 0
            semantics = []
            for retrieved_score, semantic_set_ids, probs in zip(retrieved_scores_tmp, chatgpt_semantic_ids_tmp, chatgpt_probs_tmp):
                    if retrieved_score < retrieved_thr:
                        continue
                    else:
                        retrieved_count += 1
                        for predicted_answer in semantic_set_ids.keys():
                            concept_id = semantic_set_ids[predicted_answer]
                            prob = probs[concept_id]
                            if prob >= chatgpt_qa_thr:
                                semantics.append(predicted_answer)

                                # TODO: check if the semantic is consistent with true answer
                                if include is False:
                                    for answer_tmp in answers_tmp:
                                        scores = scorer.score(answer_tmp,
                                                              predicted_answer)
                                        scores = scores['rouge1'][2]
                                        if scores >= 0.3:
                                            include = True
                                            break
            if cluster:
                semantic_set_ids, semantic_probs, item_occurance = \
                            utils.clustering(semantics, "", scorer=scorer)
                semantic_counts.append(len(semantic_probs.keys()))
            else:
                semantic_counts.append(len(semantics))
            semantics_total.append(semantic_set_ids)
    #         answer_counts.append(retrieved_count)
            includes.append(include)
        shared_includes.value += np.sum(includes)
        shared_semantic_counts.value += np.sum(semantic_counts)
    
    processes = []
    shared_includes = Value('f', 0.0)
    shared_semantic_counts = Value('f', 0.0)

    for i in range(0, kernel):
        p = multiprocessing.Process(target=run, args=(i, shared_includes, shared_semantic_counts))
        processes.append(p)
        p.start()

    for process in processes:
        process.join()
        
    return shared_includes.value/length, 0.0, shared_semantic_counts.value/length

## Bonf method

In [75]:
def softmax(vec):
    nom = np.exp(vec - np.mean(vec))
    return nom / np.sum(nom)

In [70]:
"""
Weight HMP module
"""
w1 = Real(name='w1', low=0.0, high=1.0)
w2 = Real(name='w2', low=0.0, high=1.0)

# Gather the search-space dimensions in a list.
dimensions = [w1, w2]

@use_named_args(dimensions=dimensions)
def objective(w1, w2):
    weights = softmax(np.array([w1, w2])).reshape(-1, 1)
    alpha_retrieve = alpha * weights[0]
    alpha_qa = alpha * weights[1]
    retrieved_thr = utils.compute_threshold(cal_first_retrieved_true_scores, alpha=alpha_retrieve)
    cal_first_scores = []
    for scores in cal_first_opensource_true_scores:
        cal_first_scores.append(np.max(scores))
    opensource_qa_thr = utils.compute_threshold(cal_first_scores, alpha=alpha_qa)
    results = evaluate(cal_second_retrieved_scores, cal_second_queries, \
                       cal_second_answers, cal_second_opensource_answers, \
                       cal_second_opensource_semantic_ids, cal_second_opensource_probs, \
                       retrieved_thr, opensource_qa_thr,
                       cluster=True
                      )
    coverage = np.mean(results[0])
    average_answer = np.mean(results[1])
    average_semantic = np.mean(results[2])
    return average_semantic

result = gp_minimize(func=objective,
                     dimensions=dimensions,
                     acq_func="EI",      # the acquisition function
                     n_calls=10,
                     random_state=42,
                     verbose=True)

print("Best fitness:", result.fun)
print("Best parameters:", softmax(result.x))

SyntaxError: unexpected character after line continuation character (2107705822.py, line 22)

In [71]:
weights = softmax(result.x).reshape(-1, 1)
alpha_retrieve = alpha * weights[0]
alpha_qa = alpha * weights[1]
retrieved_thr = utils.compute_threshold(cal_first_retrieved_true_scores, alpha=alpha_retrieve)
cal_first_scores = []
for scores in cal_first_opensource_true_scores:
    cal_first_scores.append(np.max(scores))
opensource_qa_thr = utils.compute_threshold(cal_first_scores, alpha=alpha_qa)
results = evaluate(
    test_retrieved_scores, test_queries,
    test_answers, test_opensource_answers, 
    test_opensource_semantic_ids, test_opensource_probs,
    retrieved_thr, opensource_qa_thr,
    cluster=True
)

NameError: name 'result' is not defined

In [72]:
print('Desired Coverage', 1-alpha)
print('Coverage', np.mean(results[0]))
# print('Average answer', np.mean(results[1]))
print('Average semantic', np.mean(results[2]))

Desired Coverage 0.9
Coverage 0.8027156549520766
Average semantic 10.73961661341853


In [76]:
alpha_retrieve = alpha * (1/2.0)
alpha_qa = alpha * (1/2.0)
retrieved_thr = utils.compute_threshold(cal_first_retrieved_true_scores, alpha=alpha_retrieve)
cal_first_scores = []
for scores in cal_first_opensource_true_scores:
    cal_first_scores.append(np.max(scores))
opensource_qa_thr = utils.compute_threshold(cal_first_scores, alpha=alpha_qa)
results = evaluate(
    test_retrieved_scores, test_queries,
    test_answers, test_opensource_answers, 
    test_opensource_semantic_ids, test_opensource_probs,
    retrieved_thr, opensource_qa_thr,
    cluster=True
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 55.13it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 52.62it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 39.10it/s]
 67%|████████████████████████████████████████████████████████████████████████████████

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 29.49it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 31.43it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 25.30it/s]
100%|████████████████████████████████████████████████████████████████████████████████

In [33]:
print('Desired Coverage', 1-alpha)
print('Coverage', np.mean(results[0]))
# print('Average answer', np.mean(results[1]))
print('Average semantic', np.mean(results[2]))

Desired Coverage 0.9
Coverage 0.9704472843450479
Average semantic 30.206869009584665


In [40]:
def evaluate_vanila(
        test_retrieved_scores,
        test_queries, test_answers, test_chatgpt_answers, 
        test_chatgpt_semantic_ids, test_chatgpt_probs,
        cluster=True, kernel=40, verbose=True):
    
    length = len(test_retrieved_scores)
    lens = np.linspace(0, length, kernel+1)
    test_retrieved_scores_list = [test_retrieved_scores[int(lens[i]):int(lens[i+1])] for i in range(kernel)]
    test_chatgpt_semantic_ids_list = [test_chatgpt_semantic_ids[int(lens[i]):int(lens[i+1])] for i in range(kernel)]
    test_chatgpt_probs_list = [test_chatgpt_probs[int(lens[i]):int(lens[i+1])] for i in range(kernel)]
    test_answers_list = [test_answers[int(lens[i]):int(lens[i+1])] for i in range(kernel)]
    
    def run(i, shared_includes, shared_semantic_counts):
        includes = []
        semantics_total = []
        coverages = []
        semantic_counts = []
        for idx, (retrieved_scores_tmp, answers_tmp,\
                chatgpt_semantic_ids_tmp, chatgpt_probs_tmp) \
            in tqdm(enumerate(zip(
                test_retrieved_scores_list[i], test_answers_list[i], \
                test_chatgpt_semantic_ids_list[i], test_chatgpt_probs_list[i])), total=len(test_retrieved_scores_list[i])):

            include = False
    #         coverage_tmp = coverage(retrieved_true_scores_tmp, 
    #                         [chatgpt_true_scores_tmp],
    #                         retrieved_thr,
    #                         chatgpt_qa_thr
    #                        )[0]
    #         coverages.append(coverage_tmp)
            retrieved_count = 0
            semantics = []
            for retrieved_score, semantic_set_ids, probs in zip(retrieved_scores_tmp, chatgpt_semantic_ids_tmp, chatgpt_probs_tmp):
                retrieved_count += 1
                for predicted_answer in semantic_set_ids.keys():
                    concept_id = semantic_set_ids[predicted_answer]
                    prob = probs[concept_id]
                    semantics.append(predicted_answer)

                    # TODO: check if the semantic is consistent with true answer
                    if include is False:
                        for answer_tmp in answers_tmp:
                            scores = scorer.score(answer_tmp,
                                                  predicted_answer)
                            scores = scores['rouge1'][2]
                            if scores >= 0.3:
                                include = True
                                break
                break
            if cluster:
                semantic_set_ids, semantic_probs, item_occurance = \
                            utils.clustering(semantics, "", scorer=scorer)
                semantic_counts.append(len(semantic_probs.keys()))
            else:
                semantic_counts.append(len(semantics))
            semantics_total.append(semantic_set_ids)
    #         answer_counts.append(retrieved_count)
            includes.append(include)
        shared_includes.value += np.sum(includes)
        shared_semantic_counts.value += np.sum(semantic_counts)
    
    processes = []
    shared_includes = Value('f', 0.0)
    shared_semantic_counts = Value('f', 0.0)

    for i in range(0, kernel):
        p = multiprocessing.Process(target=run, args=(i, shared_includes, shared_semantic_counts))
        processes.append(p)
        p.start()

    for process in processes:
        process.join()
        
    return shared_includes.value/length, 0.0, shared_semantic_counts.value/length

In [41]:
%%time
results = evaluate_vanila(
    test_retrieved_scores, test_queries,
    test_answers, test_opensource_answers, 
    test_opensource_semantic_ids, test_opensource_probs, 
    cluster=True)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 72.88it/s]


  0%|                                                                                                                                                                                                                                                                                    | 0/32 [00:00<?, ?it/s]
 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 30/31 [00:00<00:00, 88.97it/s]
 45%|██████████████████████████████████████████████████████████████████████████████

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 85.39it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 64.89it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 52.68it/s]
100%|████████████████████████████████████████████████████████████████████████████████

CPU times: user 95.1 ms, sys: 1.09 s, total: 1.19 s
Wall time: 1.31 s


In [42]:
print('Desired coverage rate', 1-alpha)
print('Coverage', np.mean(results[0]))
print('Average answer', np.mean(results[1]))
print('Average semantic', np.mean(results[2]))

Desired coverage rate 0.9
Coverage 0.8027156549520766
Average answer 0.0
Average semantic 10.73961661341853


## PAC-Bonf method

In [34]:
alpha_retrieve = alpha * (1/2.0)
alpha_qa = alpha * (1/2.0)

delta = 0.1
alpha_retrieve_pac = find_maximum_train_error_allow(alpha_retrieve, delta/2.0, len(cal_first_indices))
alpha_qa_pac = find_maximum_train_error_allow(alpha_qa, delta/2.0, len(cal_first_indices))

retrieved_thr = utils.compute_threshold(cal_first_retrieved_true_scores, alpha=alpha_retrieve_pac)
cal_first_scores = []
for scores in cal_first_opensource_true_scores:
    cal_first_scores.append(np.max(scores))
opensource_qa_thr = utils.compute_threshold(cal_first_scores, alpha=alpha_qa_pac)
results = evaluate(
    test_retrieved_scores, test_queries,
    test_answers, test_opensource_answers, 
    test_opensource_occurances, test_opensource_semantic_ids,
    retrieved_thr, opensource_qa_thr)

In [35]:
print('Desired Coverage', 1-alpha)
print('Coverage', np.mean(results[0]))
# print('Average answer', np.mean(results[1]))
print('Average semantic', np.mean(results[2]))

Desired Coverage 0.9
Coverage 0.9779951100244498
Average semantic 23.882640586797066
