## Import necessary packages

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="4,5,6,7"
from kilt import retrieval
from kilt import kilt_utils as utils
import tasks
# from kilt.retrievers import DPR_connector
import utils
from rouge_score import rouge_scorer
import random
import numpy as np
import torch
torch.set_grad_enabled(False)
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from datasets import load_dataset
import json
from tqdm import tqdm
import opensource

np.random.seed(21)

In [3]:
import pickle
def write_list(a_list, file_name):
    """Pickle ``a_list`` into ``file_name`` and print a confirmation line."""
    # Pickle data is binary, hence the 'wb' mode.
    with open(file_name, mode='wb') as sink:
        pickle.dump(a_list, sink)
        print('Done writing list into a binary file')
def read_list(file_name):
    """Return the object unpickled from ``file_name``."""
    # Binary mode is required for pickle data.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(file_name, mode='rb') as source:
        return pickle.load(source)

def save_results(task):
    """Pickle each result list to its own ``<name>_<task>.p`` file.

    The lists are not passed in: every one of them is looked up in the
    notebook's global namespace at the moment it is written.
    """
    def _dump(stem, values):
        # One pickle file per list, named after the list and the task.
        write_list(values, f'{stem}_{task}.p')

    _dump('retrieved_scores', retrieved_scores)
    _dump('retrieved_true_scores', retrieved_true_scores)
    _dump('queries', queries)
    _dump('answers', answers)
    _dump('passages', passages)
    _dump('opensource_true_scores', opensource_true_scores)
    # opensource_texts is intentionally not saved (the call is disabled):
    #     write_list(opensource_texts, f'opensource_texts_{task}.p')
    _dump('opensource_answers', opensource_answers)
    _dump('opensource_semantics', opensource_semantics)
    _dump('feasibilities', feasibilities)
    _dump('occurances', occurances)
    _dump('semantic_ids', semantic_ids)

def read_results(task, end=None):
    """Load the pickled result lists for ``task``, each truncated to ``[:end]``.

    Returns an 8-tuple:
    ``(retrieved_scores, retrieved_true_scores, queries, answers,
    opensource_true_scores, opensource_answers, opensource_occurances,
    opensource_semantic_ids)``.
    """
    def _load(stem):
        # Files follow the ``<name>_<task>.p`` naming used by save_results.
        return read_list(f'{stem}_{task}.p')[:end]

    retrieved_scores = _load('retrieved_scores')
    retrieved_true_scores = _load('retrieved_true_scores')
    queries = _load('queries')
    answers = _load('answers')
    opensource_true_scores = _load('opensource_true_scores')
    opensource_answers = _load('opensource_answers')
    # NOTE(review): this file is loaded (so it must exist) but the value is
    # not part of the returned tuple.
    opensource_semantics = _load('opensource_semantics')
    opensource_occurances = _load('occurances')
    opensource_semantic_ids = _load('semantic_ids')

    return (
        retrieved_scores, retrieved_true_scores,
        queries, answers,
        opensource_true_scores, opensource_answers,
        opensource_occurances, opensource_semantic_ids,
    )

In [4]:
# Task tag; it is interpolated into the pickle file names read and written in this notebook.
task = 'nq'

In [5]:
semantic = True
# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL"], use_stemmer=True
)
if semantic:
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    # Tokenizer and sequence classifier from the same MNLI checkpoint;
    # the model is moved to the GPU.
    semantic_tokenizer = AutoTokenizer.from_pretrained(
        "microsoft/deberta-large-mnli"
    )
    semantic_model = AutoModelForSequenceClassification.from_pretrained(
        "microsoft/deberta-large-mnli"
    ).cuda()

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Load the first 200 saved examples for the selected task.
(
    retrieved_scores,
    retrieved_true_scores,
    queries,
    answers,
    opensource_true_scores,
    opensource_answers,
    opensource_occurances,
    opensource_semantic_ids,
) = read_results(task, end=200)

In [7]:
# For every example, take the entry of opensource_answers that sits at the
# position where the retrieved scores equal the recorded true score.
answers_semantic = [
    returned_answers[list(scores).index(true_score)]
    for true_score, scores, returned_answers
    in zip(retrieved_true_scores, retrieved_scores, opensource_answers)
]

In [79]:
opensource_true_scores_semantic = []
skip = []
for idx, (query_tmp, true_answer, generated_answer) in tqdm(
        enumerate(zip(queries, answers, answers_semantic)),
        total=len(queries)):

    # Cluster the generated answers for this query.
    prompt = utils.get_prompt_template(query_tmp, "", task="Natural Questions")
    semantic_set_ids, semantic_probs, item_occurance = utils.clustering(
        sequences=generated_answer,
        prompt=prompt,
        semantic_model=semantic_model,
        semantic_tokenizer=semantic_tokenizer,
        scorer=scorer,
        semantic=True,
    )
    # Score the clusters against the reference answer.
    true_scores, matched_answer, semantics = utils.processing_answers(
        semantic_set_ids, semantic_probs,
        item_occurance, true_answer, scorer,
        threshold=0.3,
    )
    if len(true_scores) > 0:
        opensource_true_scores_semantic.append(true_scores)
    else:
        # No score came back for this query: print and remember its index.
        print(idx)
        skip.append(idx)

 16%|██████████████████████████████████████████▎                                                                                                                                                                                                                               | 72/452 [00:28<02:16,  2.79it/s]

70


 58%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                              | 263/452 [01:22<01:09,  2.74it/s]

258


 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                              | 318/452 [01:40<00:27,  4.86it/s]

312


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 452/452 [02:33<00:00,  2.95it/s]


In [8]:
# Hard-coded copy of the indices printed by the true-score loop's recorded run (70, 258, 312);
# the clustering loop below skips these positions.
skip = [70, 258, 312]

In [16]:
def compute_semantic_clusterring(model, tokenizer,
                                 question, answers):
    """Group the distinct answers to ``question`` into clusters.

    Each distinct answer is compared with the first member of every existing
    cluster. The pair ``question + answer`` is classified by ``model`` in both
    orders, and the answer joins the cluster when both predictions contain
    class index 2 (presumably "entailment" for the MNLI checkpoint loaded in
    this notebook; confirm against the model config). Otherwise it starts a
    new cluster.

    Inputs are moved to the ``'cuda'`` device before the forward pass.

    Returns:
        ``(semantic_set_ids, semantic_probs, item_occurance)`` where
        ``semantic_set_ids`` maps each distinct answer to its cluster id,
        ``item_occurance`` comes from ``utils.unique_items`` and
        ``semantic_probs`` from ``utils.get_semantic_prob``.
    """
    # Drop repeated answers; the helper also reports how often each occurred.
    unique_generated_texts, item_occurance = utils.unique_items(answers)

    # Initially every distinct answer is its own cluster.
    semantic_set_ids = {
        text: position
        for position, text in enumerate(unique_generated_texts)
    }

    def _predicted_label(pair_text):
        # Encode one concatenated pair and return the argmax class per row.
        token_ids = tokenizer.encode(pair_text, padding=True)
        logits = model(torch.tensor([token_ids], device='cuda'))['logits']
        return torch.argmax(logits, dim=1)

    with torch.no_grad():
        # Indices of the answers that currently represent a cluster.
        representatives = [0]
        for i in range(1, len(unique_generated_texts)):
            for rep in representatives:
                rep_qa = question + ' ' + unique_generated_texts[rep]
                candidate_qa = question + ' ' + unique_generated_texts[i]

                forward_label = _predicted_label(rep_qa + ' [SEP] ' + candidate_qa)
                backward_label = _predicted_label(candidate_qa + ' [SEP] ' + rep_qa)

                if 2 in forward_label and 2 in backward_label:
                    # Both directions agree: merge into the representative's cluster.
                    semantic_set_ids[unique_generated_texts[i]] = \
                        semantic_set_ids[unique_generated_texts[rep]]
                    break
            else:
                # No existing cluster matched; this answer represents a new one.
                representatives.append(i)

    semantic_probs = utils.get_semantic_prob(semantic_set_ids, item_occurance)
    return semantic_set_ids, semantic_probs, item_occurance

In [None]:
# Re-cluster the generated answers of every passage of every query with the
# NLI model. Queries whose index is in `skip` are left out, so the three
# result lists have len(queries) - (number of skipped indices) entries.
occurances_semantic = []
semantic_ids_semantic = []
probs_semantic = []
for idx, (query_tmp, generated_answers) in tqdm(
        enumerate(zip(queries, opensource_answers)), total=len(queries)):
    if idx in skip:
        continue
    probs_tmp = []
    semantic_id_tmp = []
    occurance_tmp = []
    # One clustering per entry of this query's generated answers.
    # (The previously computed prompt and the answers_tmp / semantic_tmp
    # lists were never used and have been removed.)
    for generated_answer in generated_answers:
        semantic_set_ids, semantic_probs, item_occurance = \
            compute_semantic_clusterring(
                model=semantic_model,
                tokenizer=semantic_tokenizer,
                question=query_tmp,
                answers=generated_answer,
            )
        probs_tmp.append(semantic_probs)
        occurance_tmp.append(item_occurance)
        semantic_id_tmp.append(semantic_set_ids)
    occurances_semantic.append(occurance_tmp)
    semantic_ids_semantic.append(semantic_id_tmp)
    probs_semantic.append(probs_tmp)

 45%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                  | 90/200 [20:15<11:06,  6.06s/it]

In [None]:
# Persist the semantic-clustering results, one pickle file per list.
write_list(opensource_true_scores_semantic, f'opensource_true_scores_{task}_semantic.p')
write_list(occurances_semantic, f'occurances_{task}_semantic.p')
write_list(semantic_ids_semantic, f'semantic_ids_{task}_semantic.p')
# The probabilities get their own file; writing them to the semantic_ids
# file (as before) overwrote the ids saved on the previous line.
write_list(probs_semantic, f'probs_{task}_semantic.p')

In [None]:
# `end` exists only as a parameter of read_results, so it must be defined here
# or the slices below raise NameError. 200 mirrors read_results(task, end=200).
# NOTE(review): these lists omit skipped queries, so position k need not
# correspond to queries[k]; confirm that truncating by `end` is what is wanted.
end = 200
opensource_true_scores_semantic = read_list(f'opensource_true_scores_{task}_semantic.p')[:end]
opensource_occurances_semantic = read_list(f'occurances_{task}_semantic.p')[:end]
opensource_semantic_ids_semantic = read_list(f'semantic_ids_{task}_semantic.p')[:end]

In [None]:
# Random 30% / 30% / 40% split into first calibration, second calibration and test.
indices = np.arange(len(queries))
# Shuffle with NumPy's global RNG so that the np.random.seed(21) call in the
# import cell makes the split reproducible. random.shuffle draws from Python's
# own RNG, which this notebook never seeds.
np.random.shuffle(indices)
first_cut = int(len(indices) * 0.3)
second_cut = int(len(indices) * 0.6)
cal_first_indices = indices[:first_cut]
cal_second_indices = indices[first_cut:second_cut]
test_indices = indices[second_cut:]
# The three parts must partition the data exactly.
assert len(cal_first_indices) + len(cal_second_indices) + len(test_indices) == len(queries)

cal_first_queries = utils.split(queries, cal_first_indices)
cal_second_queries = utils.split(queries, cal_second_indices)
test_queries = utils.split(queries, test_indices)

cal_first_answers = utils.split(answers, cal_first_indices)
cal_second_answers = utils.split(answers, cal_second_indices)
test_answers = utils.split(answers, test_indices)

cal_first_answers_semantic = utils.split(answers_semantic, cal_first_indices)
cal_second_answers_semantic = utils.split(answers_semantic, cal_second_indices)
test_answers_semantic = utils.split(answers_semantic, test_indices)

cal_first_retrieved_true_scores = utils.split(retrieved_true_scores, cal_first_indices)
cal_second_retrieved_true_scores = utils.split(retrieved_true_scores, cal_second_indices)
test_retrieved_true_scores = utils.split(retrieved_true_scores, test_indices)

cal_first_opensource_true_scores = utils.split(opensource_true_scores, cal_first_indices)
cal_second_opensource_true_scores = utils.split(opensource_true_scores, cal_second_indices)
test_opensource_true_scores = utils.split(opensource_true_scores, test_indices)

cal_first_retrieved_scores = utils.split(retrieved_scores, cal_first_indices)
cal_second_retrieved_scores = utils.split(retrieved_scores, cal_second_indices)
test_retrieved_scores = utils.split(retrieved_scores, test_indices)

cal_first_opensource_occurances = utils.split(opensource_occurances, cal_first_indices)
cal_second_opensource_occurances = utils.split(opensource_occurances, cal_second_indices)
test_opensource_occurances = utils.split(opensource_occurances, test_indices)

cal_first_opensource_semantic_ids = utils.split(opensource_semantic_ids, cal_first_indices)
cal_second_opensource_semantic_ids = utils.split(opensource_semantic_ids, cal_second_indices)
test_opensource_semantic_ids = utils.split(opensource_semantic_ids, test_indices)

cal_first_opensource_answers = utils.split(opensource_answers, cal_first_indices)
cal_second_opensource_answers = utils.split(opensource_answers, cal_second_indices)
test_opensource_answers = utils.split(opensource_answers, test_indices)

In [None]:
# (Re)create the stemming ROUGE scorer used by the evaluation functions below.
scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL"], use_stemmer=True
)

## Compute coverage rate

In [None]:
def coverage(
        retrieved_true_scores_list, opensource_true_scores_list,
        retrieved_thr, qa_thr):
    """Return one bool per example telling whether both thresholds are met.

    An example counts as covered when its retrieved true score is at least
    ``retrieved_thr`` and the maximum of its open-source true scores is at
    least ``qa_thr``.
    """
    def _covered(retrieved_true_score, opensource_true_score):
        # The best QA score is taken first, exactly once per example.
        best_qa_score = np.max(opensource_true_score)
        return bool(retrieved_true_score >= retrieved_thr
                    and best_qa_score >= qa_thr)

    return [
        _covered(retrieved_true_score, opensource_true_score)
        for retrieved_true_score, opensource_true_score
        in zip(retrieved_true_scores_list, opensource_true_scores_list)
    ]

In [None]:
alpha = 0.1
# Each stage (retrieval, QA) gets half of the error budget; thresholds are
# computed on the first calibration split.
retrieved_thr = utils.compute_threshold(cal_first_retrieved_true_scores, alpha=alpha/2)
cal_first_scores = [np.max(scores) for scores in cal_first_opensource_true_scores]
opensource_qa_thr = utils.compute_threshold(cal_first_scores, alpha=alpha/2)

# Per-stage coverage on the second calibration split.
retrieved_coverage = np.mean(np.array(cal_second_retrieved_true_scores) >= retrieved_thr)
cal_second_scores = [np.max(scores) for scores in cal_second_opensource_true_scores]
qa_coverage = np.mean(np.array(cal_second_scores) >= opensource_qa_thr)
print('retrieval coverage', retrieved_coverage)
print('qa coverage', qa_coverage)

# Per-stage coverage on the test split.
retrieved_coverage = np.mean(np.array(test_retrieved_true_scores) >= retrieved_thr)
test_scores = [np.max(scores) for scores in test_opensource_true_scores]
qa_coverage = np.mean(np.array(test_scores) >= opensource_qa_thr)
print('test retrieval coverage', retrieved_coverage)
print('test qa coverage', qa_coverage)

In [None]:
# Fraction of test examples that satisfy both thresholds at once.
coverages = coverage(
    test_retrieved_true_scores, test_opensource_true_scores,
    retrieved_thr, opensource_qa_thr,
)
print('End-to-end coverage', np.mean(coverages))

## PAC-Bonf

In [None]:
from pac_utils import find_maximum_train_error_allow

In [None]:
delta = 0.1
# Both stages get alpha/2 and delta/2; the allowed training error is derived
# from the size of the first calibration split.
retrieve_alpha = find_maximum_train_error_allow(alpha/2.0, delta/2.0, len(cal_first_indices))
qa_alpha = find_maximum_train_error_allow(alpha/2.0, delta/2.0, len(cal_first_indices))
retrieved_thr = utils.compute_threshold(cal_first_retrieved_true_scores, alpha=retrieve_alpha)
cal_first_scores = [np.max(scores) for scores in cal_first_opensource_true_scores]
opensource_qa_thr = utils.compute_threshold(cal_first_scores, alpha=qa_alpha)

# Per-stage coverage on the second calibration split.
retrieved_coverage = np.mean(np.array(cal_second_retrieved_true_scores) >= retrieved_thr)
cal_second_scores = [np.max(scores) for scores in cal_second_opensource_true_scores]
qa_coverage = np.mean(np.array(cal_second_scores) >= opensource_qa_thr)
print('retrieval coverage', retrieved_coverage)
print('qa coverage', qa_coverage)

# Per-stage coverage on the test split.
retrieved_coverage = np.mean(np.array(test_retrieved_true_scores) >= retrieved_thr)
test_scores = [np.max(scores) for scores in test_opensource_true_scores]
qa_coverage = np.mean(np.array(test_scores) >= opensource_qa_thr)
print('test retrieval coverage', retrieved_coverage)
print('test qa coverage', qa_coverage)

In [None]:
# Fraction of test examples that satisfy both PAC thresholds at once.
coverages = coverage(
    test_retrieved_true_scores, test_opensource_true_scores,
    retrieved_thr, opensource_qa_thr,
)
print('End-to-end coverage', np.mean(coverages))

## Bayesian optimization

In [None]:
from skopt.space import Real
from skopt import gp_minimize
from skopt.utils import use_named_args

In [None]:
def evaluate(
        test_retrieved_scores,
        test_queries, test_answers, test_opensource_answers,
        test_opensource_occurances, test_opensource_semantic_ids,
        retrieved_thr, opensource_qa_thr,
        cluster=True):
    """Build a thresholded answer set per query and report coverage and sizes.

    For each query, passages whose retrieval score is below ``retrieved_thr``
    are ignored. From the remaining passages, every generated answer whose
    relative frequency (occurrence count divided by the number of generations
    for that passage) is at least ``opensource_qa_thr`` is kept. The query is
    covered when any kept answer reaches a ROUGE-1 F-measure of 0.3 against
    one of its reference answers.

    ``scorer``, ``semantic_model`` and ``semantic_tokenizer`` are read from
    the notebook's global namespace.

    Args:
        test_retrieved_scores: per query, the retrieval score of each passage.
        test_queries: the query of each example.
        test_answers: per query, the reference answers.
        test_opensource_answers: per query and passage, the generated answers.
        test_opensource_occurances: per query and passage, a mapping from
            answer to its occurrence count.
        test_opensource_semantic_ids: per query and passage, a mapping from
            answer to its cluster id.
        retrieved_thr: minimum retrieval score for a passage to be used.
        opensource_qa_thr: minimum relative frequency for an answer to be kept.
        cluster: when true, the kept answers are clustered with
            ``utils.clustering`` and the number of clusters is reported;
            otherwise the number of kept answers is reported.

    Returns:
        ``[includes, answer_counts, semantic_counts]`` with one entry per
        query: whether the query is covered, how many passages passed the
        retrieval threshold, and the size of the answer set.
    """
    includes = []
    answer_counts = []
    semantic_counts = []
    examples = zip(
        test_retrieved_scores,
        test_queries, test_answers, test_opensource_answers,
        test_opensource_occurances, test_opensource_semantic_ids)
    for (retrieved_scores_tmp,
         query_tmp, answers_tmp, opensource_answers_tmp,
         opensource_occurances_tmp, opensource_semantic_ids_tmp) \
            in tqdm(examples, total=len(test_retrieved_scores)):

        include = False
        retrieved_count = 0
        semantics = []
        per_passage = zip(retrieved_scores_tmp, opensource_answers_tmp,
                          opensource_occurances_tmp, opensource_semantic_ids_tmp)
        for retrieved_score, answer, item_occurance, semantic_set_ids in per_passage:
            if retrieved_score < retrieved_thr:
                continue
            retrieved_count += 1
            for predicted_answer in semantic_set_ids.keys():
                # Relative frequency of this answer among the generations.
                prob = item_occurance[predicted_answer] / len(answer)
                if prob < opensource_qa_thr:
                    continue
                semantics.append(predicted_answer)

                # Once the query is covered there is no need to score again.
                if not include:
                    # Index 2 of a rouge_score Score tuple is the F-measure.
                    include = any(
                        scorer.score(answer_tmp, predicted_answer)['rouge1'][2] >= 0.3
                        for answer_tmp in answers_tmp
                    )

        if cluster:
            prompt = utils.get_prompt_template(query_tmp, "", task="Natural Questions")
            semantic_set_ids, semantic_probs, item_occurance = \
                utils.clustering(
                    sequences=semantics,
                    prompt=prompt,
                    semantic_model=semantic_model,
                    semantic_tokenizer=semantic_tokenizer,
                    scorer=scorer,
                    semantic=True
                )
            semantic_counts.append(len(semantic_probs.keys()))
        else:
            semantic_counts.append(len(semantics))
        # Previously never filled in, which left results[1] empty (nan mean);
        # evaluate_vanila records the same per-query count.
        answer_counts.append(retrieved_count)
        includes.append(include)

    return [includes, answer_counts, semantic_counts]

## Bonferroni (Bonf) method

In [None]:
def softmax(vec):
    """Return the softmax of ``vec`` taken over all of its elements.

    The input is shifted by its maximum before exponentiation. Softmax is
    invariant to such a shift, and it keeps every exponent <= 0 so ``exp``
    cannot overflow. Shifting by the mean overflows (and yields nan) when the
    values are widely spread.

    Args:
        vec: array-like of numbers (a plain list is accepted).

    Returns:
        A float ndarray with the same shape as ``vec`` whose entries sum to 1.
        An empty input is returned as an empty float array.
    """
    vec = np.asarray(vec, dtype=float)
    if vec.size == 0:
        return vec
    nom = np.exp(vec - np.max(vec))
    return nom / np.sum(nom)

In [None]:
"""
Weight HMP module
"""
# Two raw weights, each searched over [0, 1].
w1 = Real(low=0.0, high=1.0, name='w1')
w2 = Real(low=0.0, high=1.0, name='w2')

# Search-space dimensions, in the order the objective receives them.
dimensions = [w1, w2]

@use_named_args(dimensions=dimensions)
def objective(w1, w2):
    """Mean answer-set size on the second calibration split for one weight pair.

    The raw weights are passed through ``softmax`` and used to divide the
    global ``alpha`` between the retrieval and QA stages. Thresholds are
    computed on the first calibration split and evaluated on the second one.

    Returns:
        The mean of the per-query semantic counts returned by ``evaluate``.
    """
    weights = softmax(np.array([w1, w2])).reshape(-1, 1)
    alpha_retrieve = alpha * weights[0]
    alpha_qa = alpha * weights[1]
    retrieved_thr = utils.compute_threshold(cal_first_retrieved_true_scores, alpha=alpha_retrieve)
    cal_first_scores = [np.max(scores) for scores in cal_first_opensource_true_scores]
    opensource_qa_thr = utils.compute_threshold(cal_first_scores, alpha=alpha_qa)
    results = evaluate(cal_second_retrieved_scores, cal_second_queries,
                       cal_second_answers, cal_second_opensource_answers,
                       cal_second_opensource_occurances, cal_second_opensource_semantic_ids,
                       retrieved_thr, opensource_qa_thr,
                       cluster=True
                      )
    # Only the semantic counts (results[2]) drive the optimisation. The unused
    # coverage / average-answer means were dropped: the former shadowed the
    # coverage() function name and the latter was a mean over results[1].
    return np.mean(results[2])

# Minimise the objective over the two weights.
result = gp_minimize(
    func=objective,
    dimensions=dimensions,
    acq_func="EI",      # the acquisition function
    n_calls=10,
    random_state=42,
    verbose=True,
)

print("Best fitness:", result.fun)
print("Best parameters:", softmax(result.x))

In [None]:
# Split alpha with the optimised weights, recompute both thresholds on the
# first calibration split and evaluate on the test split.
weights = softmax(result.x).reshape(-1, 1)
alpha_retrieve = alpha * weights[0]
alpha_qa = alpha * weights[1]
retrieved_thr = utils.compute_threshold(cal_first_retrieved_true_scores, alpha=alpha_retrieve)
cal_first_scores = [np.max(scores) for scores in cal_first_opensource_true_scores]
opensource_qa_thr = utils.compute_threshold(cal_first_scores, alpha=alpha_qa)
results = evaluate(
    test_retrieved_scores, test_queries,
    test_answers, test_opensource_answers,
    test_opensource_occurances, test_opensource_semantic_ids,
    retrieved_thr, opensource_qa_thr,
    cluster=True,
)

In [None]:
# Report the target coverage (1 - alpha) next to the means of results[0] and results[2].
print('Desired Coverage', 1-alpha)
print('Coverage', np.mean(results[0]))
# print('Average answer', np.mean(results[1]))
print('Average semantic', np.mean(results[2]))

In [None]:
# Even split: each stage gets half of alpha. Thresholds come from the first
# calibration split; evaluation runs on the test split.
alpha_retrieve = alpha * (1/2.0)
alpha_qa = alpha * (1/2.0)
retrieved_thr = utils.compute_threshold(cal_first_retrieved_true_scores, alpha=alpha_retrieve)
cal_first_scores = [np.max(scores) for scores in cal_first_opensource_true_scores]
opensource_qa_thr = utils.compute_threshold(cal_first_scores, alpha=alpha_qa)
results = evaluate(
    test_retrieved_scores, test_queries,
    test_answers, test_opensource_answers,
    test_opensource_occurances, test_opensource_semantic_ids,
    retrieved_thr, opensource_qa_thr,
    cluster=True,
)

In [None]:
# Report the target coverage (1 - alpha) next to the means of results[0] and results[2].
print('Desired Coverage', 1-alpha)
print('Coverage', np.mean(results[0]))
# print('Average answer', np.mean(results[1]))
print('Average semantic', np.mean(results[2]))

In [None]:
def evaluate_vanila(
        test_retrieved_scores,
        test_queries, test_answers, test_chatgpt_answers,
        test_chatgpt_occurances, test_chatgpt_semantic_ids,
        cluster=True):
    """Baseline without thresholds: use only the first passage of each query.

    Every generated answer of the first passage is kept. The query is covered
    when any of them reaches a ROUGE-1 F-measure of 0.3 against one of the
    reference answers.

    ``scorer``, ``semantic_model`` and ``semantic_tokenizer`` are read from
    the notebook's global namespace.

    Args:
        test_retrieved_scores: per query, the retrieval score of each passage.
        test_queries: the query of each example.
        test_answers: per query, the reference answers.
        test_chatgpt_answers: per query and passage, the generated answers.
        test_chatgpt_occurances: per query and passage, a mapping from answer
            to its occurrence count (not consulted here).
        test_chatgpt_semantic_ids: per query and passage, a mapping whose keys
            are the distinct generated answers.
        cluster: when true, the kept answers are clustered with
            ``utils.clustering`` and the number of clusters is reported;
            otherwise the number of kept answers is reported.

    Returns:
        ``[includes, answer_counts, semantic_counts]`` with one entry per
        query: whether the query is covered, how many passages were used
        (1, or 0 when the query has none), and the size of the answer set.
    """
    includes = []
    answer_counts = []
    semantic_counts = []
    examples = zip(
        test_retrieved_scores,
        test_queries, test_answers, test_chatgpt_answers,
        test_chatgpt_occurances, test_chatgpt_semantic_ids)
    for (retrieved_scores_tmp,
         query_tmp, answers_tmp, chatgpt_answers_tmp,
         chatgpt_occurances_tmp, chatgpt_semantic_ids_tmp) in examples:
        include = False
        retrieved_count = 0
        semantics = []
        per_passage = zip(retrieved_scores_tmp, chatgpt_answers_tmp,
                          chatgpt_occurances_tmp, chatgpt_semantic_ids_tmp)
        for retrieved_score, answer, item_occurance, semantic_set_ids in per_passage:
            retrieved_count += 1
            for predicted_answer in semantic_set_ids.keys():
                # Keep every answer. Without this the answer set stayed empty,
                # so clustering and semantic_counts only ever saw an empty list.
                semantics.append(predicted_answer)
                if not include:
                    # Index 2 of a rouge_score Score tuple is the F-measure.
                    include = any(
                        scorer.score(answer_tmp, predicted_answer)['rouge1'][2] >= 0.3
                        for answer_tmp in answers_tmp
                    )
            # Only the first passage is considered.
            break
        if cluster:
            prompt = utils.get_prompt_template(query_tmp, "", task="Natural Questions")
            semantic_set_ids, semantic_probs, item_occurance = \
                utils.clustering(
                    sequences=semantics,
                    prompt=prompt,
                    semantic_model=semantic_model,
                    semantic_tokenizer=semantic_tokenizer,
                    scorer=scorer,
                    semantic=True
                )
            semantic_counts.append(len(semantic_probs.keys()))
        else:
            semantic_counts.append(len(semantics))
        answer_counts.append(retrieved_count)
        includes.append(include)
    return [includes, answer_counts, semantic_counts]

In [None]:
%%time
# Threshold-free baseline on the test split.
results = evaluate_vanila(
    test_retrieved_scores,
    test_queries,
    test_answers,
    test_opensource_answers,
    test_opensource_occurances,
    test_opensource_semantic_ids,
    cluster=True,
)

In [None]:
# Report the target coverage (1 - alpha) next to the means of the three result lists.
print('Desired coverage rate', 1-alpha)
print('Coverage', np.mean(results[0]))
print('Average answer', np.mean(results[1]))
print('Average semantic', np.mean(results[2]))

## PAC-Bonf method

In [None]:
# Even split of alpha between the stages, then the PAC adjustment of each
# share with delta/2 and the size of the first calibration split.
alpha_retrieve = alpha * (1/2.0)
alpha_qa = alpha * (1/2.0)

delta = 0.1
alpha_retrieve_pac = find_maximum_train_error_allow(alpha_retrieve, delta/2.0, len(cal_first_indices))
alpha_qa_pac = find_maximum_train_error_allow(alpha_qa, delta/2.0, len(cal_first_indices))

# Thresholds from the first calibration split; evaluation on the test split.
retrieved_thr = utils.compute_threshold(cal_first_retrieved_true_scores, alpha=alpha_retrieve_pac)
cal_first_scores = [np.max(scores) for scores in cal_first_opensource_true_scores]
opensource_qa_thr = utils.compute_threshold(cal_first_scores, alpha=alpha_qa_pac)
results = evaluate(
    test_retrieved_scores, test_queries,
    test_answers, test_opensource_answers,
    test_opensource_occurances, test_opensource_semantic_ids,
    retrieved_thr, opensource_qa_thr,
)

In [None]:
# Report the target coverage (1 - alpha) next to the means of results[0] and results[2].
print('Desired Coverage', 1-alpha)
print('Coverage', np.mean(results[0]))
# print('Average answer', np.mean(results[1]))
print('Average semantic', np.mean(results[2]))