# Step 3
This notebook is used for finalizing the AGent dataset.
First, manually annotate 200 challenging unanswerable candidates. Some of these candidates turn out to be answerable; save the ids of those answerable questions under the key `ids` in the JSON file at `answerable_ids_path` (i.e. `{"ids": [...]}`).

Then get the predictions of the adversarial models on the 200 annotated challenging unanswerable candidates and save them in `annotated_pred_path`.

In [None]:
# Inputs for tuning the formula.
annotated_pred_path = "Prediction/Step3_Annotated"    # folder with the adversarial models' predictions on the annotated questions
answerable_ids_path = "answerable_ids.json"           # JSON file with the ids annotated as answerable

# Dev split.
# NOTE: the values of dev_path and full_pred_dev_path used to be swapped;
# extract_unans() opens dev_path as a JSON file and treats full_pred_dev_path
# as a folder with one sub-folder per model, mirroring the train pair below.
dev_path = "src/step1/retriever_component/unans_cdd/squad_dev_unans_cdd.json"    # path to all challenging unanswerable candidates (product of step 2)
full_pred_dev_path = "Prediction/Adversarial_Models/SQuAD_Dev"      # path to predictions of the 6 adversarial models
squadv1_dev_path = "src/step1/data/SQuAD/original/squad_dev.json"
save_dev_path = "AGent_data/SQuAD/dev.json"


# Train split.
tfidf_train_path = "src/step1/retriever_component/unans_cdd/squad_train_unans_cdd.json"
full_pred_train_path = "Prediction/Adversarial_Models/SQuAD_Train"
squadv1_train_path = "src/step1/data/SQuAD/original/squad_train.json"
save_train_path = "AGent_data/SQuAD/train.json"

In [None]:
import json
import os
import numpy as np
from itertools import product
import random

In [None]:
def load_predictions(pred_path):
    """
    Gather, for every question id, the first n-best entry of each model.

    Each model's file is read from
    <pred_path>/<model>/nbest_predictions_eval.json and only entry [0] of
    every question's list is kept.
    + The list of models is predefined below

    INPUT:
        pred_path: folder holding one sub-folder per model
    OUTPUT:
        pred_dictionary: {"question_id": [entry_model_1, ..., entry_model_6]},
            entries ordered like the model list below. The ids are taken
            from the first model; later models append to those lists.
    """
    model_names = ("bert-base-cased", "roberta-base", "SpanBERT/spanbert-base-cased",
                   "bert-large-cased", "roberta-large", "SpanBERT/spanbert-large-cased")
    pred_dictionary = {}
    for position, model_name in enumerate(model_names):
        nbest_file = os.path.join(pred_path, model_name, "nbest_predictions_eval.json")
        with open(nbest_file) as handle:
            nbest = json.load(handle)
        is_first_model = position == 0
        for question_id in nbest:
            first_entry = nbest[question_id][0]
            if is_first_model:
                # The first model creates the per-question lists.
                pred_dictionary[question_id] = [first_entry]
            else:
                pred_dictionary[question_id].append(first_entry)
    return pred_dictionary

def load_answerable_ids(id_path):
    """
    Read the ids that were annotated as answerable.

    INPUT:
        id_path: path to a JSON file whose top-level object has an 'ids' entry
    OUTPUT:
        the value stored under 'ids'
    """
    with open(id_path) as handle:
        return json.load(handle)['ids']

def tune_formula(all_predictions, answerable_ids, tuning_range, formula):
    """
    Grid-search the parameters of `formula` for the highest recall.

    Every combination of the per-parameter grids is scored with
    calculate_best_recall(); a combination replaces the current best only
    when its recall is strictly higher.

    INPUT:
        all_predictions: predictions on all ids by all models
        answerable_ids: ids annotated as answerable
        tuning_range: (list of list) one [start, end, step] per parameter,
            expanded with np.arange (end excluded)
        formula: the formula that we are tuning
    OUTPUT:
        best_recall: highest recall found (0 if nothing beat the initial value)
        best_threshold: threshold that goes with best_recall
        best_parameters: the winning combination; a list of -1 if nothing
            beat the initial value
    """
    grids = []
    for spec in tuning_range:
        grids.append(list(np.arange(spec[0], spec[1], spec[2])))

    top_recall = 0
    top_threshold = 0
    top_parameters = [-1] * len(tuning_range)
    for candidate in product(*grids):
        threshold, recall = calculate_best_recall(
            all_predictions, answerable_ids, candidate, formula)
        if recall > top_recall:
            top_recall, top_threshold, top_parameters = recall, threshold, candidate
    return top_recall, top_threshold, top_parameters

def calculate_best_recall(all_predictions, answerable_ids, parameters, formula):
    """
    Find the threshold that keeps every answerable question and the recall
    it gives on the remaining (unanswerable) questions.

    The threshold is the smallest formula value among the ids listed in
    answerable_ids. An unanswerable question counts as recalled when its
    value is strictly below that threshold.

    INPUT:
        all_predictions: predictions on all ids by all models
        answerable_ids: iterable of (hashable) ids annotated as answerable
        parameters: set of parameters that we are considering
        formula: the formula that we are tuning, called as
            formula(all_predictions[id], parameters)
    OUTPUT:
        threshold: minimum value over the answerable questions
        recall: fraction of unanswerable questions below the threshold;
            0.0 when there is no unanswerable question at all
    RAISES:
        ValueError: when no id of all_predictions is in answerable_ids, so
            no threshold can be derived
    """
    # Set lookup instead of scanning the id list once per question.
    answerable_lookup = set(answerable_ids)
    answerable_values = []
    unanswerable_values = []
    for question_id, predictions in all_predictions.items():
        value = formula(predictions, parameters)
        if question_id in answerable_lookup:
            answerable_values.append(value)
        else:
            unanswerable_values.append(value)

    if not answerable_values:
        raise ValueError(
            "cannot compute a threshold: none of the predicted ids is marked as answerable")
    threshold = min(answerable_values)

    if not unanswerable_values:
        # Nothing to recall; avoid dividing by zero.
        return threshold, 0.0

    recalled = sum(1 for value in unanswerable_values if value < threshold)
    return threshold, recalled / len(unanswerable_values)

def formula(predictions, parameters):
    """
    This is the formula described in the appendix of the paper.

    Predictions with a non-empty 'text' are treated as "answerable" votes and
    predictions with an empty 'text' as "unanswerable" votes. With S_ans /
    S_unans the summed 'probability' of each group and n_ans / n_unans the
    group sizes, the value is

        V(q) = S_ans * alpha**n_ans - S_unans * beta**n_unans

    INPUT:
        predictions: (list of dict) predictions on one id, each with the
            keys 'text' and 'probability'
        parameters: (list of float) [alpha, beta]
    OUTPUT:
        value: The value V(q) of the corresponding question
    """
    assert len(parameters) == 2
    alpha, beta = parameters
    unans_confidence, unans_count = 0, 0
    ans_confidence, ans_count = 0, 0
    for pred in predictions:
        if pred['text'] != "":
            ans_confidence += pred['probability']
            ans_count += 1
        else:
            unans_confidence += pred['probability']
            unans_count += 1
    # The per-group averages that used to be computed here were never used,
    # and the unanswerable one divided the confidence by itself, raising
    # ZeroDivisionError when every empty prediction had probability 0.
    value = ans_confidence * (alpha**ans_count) - unans_confidence * (beta**unans_count)
    return value

In [None]:
# We manually annotated 200 questions for tuning the formula.
# Get the predictions of the 6 adversarial models on these 200 questions.
# Use these predictions to tune the formula and then apply the formula on the rest of the data.
#
# NOTE(review): the two absolute paths below override the values set in the
# configuration cell; point them at your own copies.
annotated_pred_path = "/Volumes/Share/tran_s2/squad_devXtrain/Result/squad_tfidf/annotated_question"
# Was `pred_path`, a name that is never defined in this notebook.
all_predictions = load_predictions(annotated_pred_path)
print(len(all_predictions))
answerable_ids_path = "/Volumes/Share/tran_s2/squad_devXtrain/Data/answerable_id.json"
answerable_ids = load_answerable_ids(answerable_ids_path)
print(len(answerable_ids))
# Was `formula_2`, which is not defined anywhere; `formula` is the only
# scoring function in this notebook.
best_recall, best_threshold, best_parameters = tune_formula(all_predictions, answerable_ids, [[0,2,0.01], [0,2,0.01]], formula)
print("We can achieve recall {} with parameters {} and threshold {}".format(best_recall, best_parameters, best_threshold))

200
38
We can achieve recall 0.42592592592592593 with parameters (0.64, 0.6900000000000001) and threshold -0.3428283339143994


# Extract Using Tuned Formula

In [None]:
def difficulty(predictions):
    """Count the predictions whose 'text' is not the empty string."""
    return sum(1 for pred in predictions if pred['text'] != "")
def extract_unans(data_path, full_pred_path, formula, parameters, threshold):
    """
    Select the final unanswerable questions with the tuned formula.

    A candidate is kept when (V(q) < threshold) && (difficulty(q) >= 2).
    The number of loaded prediction ids and a per-difficulty count of the
    kept candidates are printed.

    INPUT:
        data_path: path to the JSON file with all challenging unanswerable
            candidates (read from its 'data' entry)
        full_pred_path: path to predictions of 6 adversarial models on challenging unanswerable candidates
        formula: tuned formula
        parameters: parameters for the formula
        threshold: threshold of the unanswerable questions
    OUTPUT:
        data_save: the unanswerable questions created by AGent.
    """
    with open(data_path) as handle:
        candidates = json.load(handle)['data']

    predictions_by_id = load_predictions(full_pred_path)
    print(len(predictions_by_id))

    data_save = []
    # Kept candidates per difficulty level (2..6).
    survey_dict = {level: 0 for level in range(2, 7)}
    for candidate in candidates:
        model_outputs = predictions_by_id[candidate['id']]
        if not formula(model_outputs, parameters) < threshold:
            continue
        level = difficulty(model_outputs)
        if level < 2:
            continue
        data_save.append(candidate)
        survey_dict[level] += 1
    print(survey_dict)
    return data_save

In [None]:
# Apply the formula on the dev set.
# NOTE(review): the two absolute paths below override the values set in the
# configuration cell; point them at your own copies.
full_pred_dev_path = "/Volumes/Share/tran_s2/squad_devXtrain/Result/squad_tfidf/for_extract_unans"      # path to predictions of the 6 adversarial models
dev_path = "/Volumes/Share/tran_s2/squad_devXtrain/Data/squad_tfidf_hard_unans/verMay23/2.json"    # path to all challenging unanswerable candidates (product of step 2)
# Was `formula_2`, which is not defined anywhere; `formula` is the only
# scoring function in this notebook.
dev_unans = extract_unans(dev_path, full_pred_dev_path, formula, best_parameters, best_threshold)
print(len(dev_unans))

7117
{2: 853, 3: 743, 4: 603, 5: 189, 6: 0}
2388


In [None]:
# Save Dev
# Merge the original SQuAD v1 dev data with the extracted unanswerable dev
# questions, shuffle, and write the result.
# squadv1_dev_path and save_dev_path come from the configuration cell. This
# cell used to override them with a SQuAD *train* input file and a
# "SQuAD_AGent_train.json" output file, which does not match a dev export.
with open(squadv1_dev_path) as dataset_file:
    dataset = json.load(dataset_file)['data']
dataset.extend(dev_unans)
random.shuffle(dataset)
to_save = {"version": "SQuAD AGent dev", "data": dataset}
# Create the output folder if needed so the write does not fail on a fresh checkout.
save_dev_dir = os.path.dirname(save_dev_path)
if save_dev_dir:
    os.makedirs(save_dev_dir, exist_ok=True)
with open(save_dev_path, "w") as outfile:
    json.dump(to_save, outfile, indent=4)

In [None]:
# Apply the formula on the train set.
# NOTE(review): the two absolute paths below override the values set in the
# configuration cell; point them at your own copies.
tfidf_train_path = "/Volumes/Share/tran_s2/squad_devXtrain/Data/raw_tfidf/raw_tfidf_of_squad1_train_top10/raw_tfidf_of_squad1_train_top10-new-format.json"
full_pred_train_path = "/Volumes/Share/tran_s2/squad_devXtrain/Result/squad_tfidf_model/eval_squad1_train_top10"
# Was `formula_2`, which is not defined anywhere; `formula` is the only
# scoring function in this notebook.
train_unans = extract_unans(tfidf_train_path, full_pred_train_path, formula, best_parameters, best_threshold)
print(len(train_unans))

870588
{2: 31951, 3: 9720, 4: 4809, 5: 1536, 6: 0}
48016


In [None]:
# Save Train
# Merge the original SQuAD v1 train data with the extracted unanswerable
# train questions, shuffle, and write the result.
with open(squadv1_train_path) as dataset_file:
    dataset = json.load(dataset_file)['data']
dataset.extend(train_unans)
random.shuffle(dataset)
to_save = {"version": "SQuAD AGent", "data": dataset}
# Create the output folder if needed so the write does not fail on a fresh checkout.
save_train_dir = os.path.dirname(save_train_path)
if save_train_dir:
    os.makedirs(save_train_dir, exist_ok=True)
# Stream straight to the file instead of building the whole JSON string first.
with open(save_train_path, "w") as outfile:
    json.dump(to_save, outfile, indent=4)