In [None]:
import json
import os

In [None]:
def load_predictions(pred_path, models=None, chunks=None):
    """
    Load the first n-best entry of every question, for every model, into one
    dictionary with structure {"model_name": {"question_id": first_entry}}.

    For each model the file
    <pred_path>/<model>/<chunk>/nbest_predictions_eval.json is read for every
    chunk; the chunks of one model are merged (a later chunk overwrites an
    earlier one on duplicate question ids) and only element 0 of each
    question's list is kept.

    INPUT:
        pred_path: all predictions of models are saved in the same folder,
                   one sub-folder per model
        models:    optional iterable of model names (used as sub-folder
                   names); defaults to the six predefined models below
        chunks:    optional iterable of chunk sub-folder names; defaults to
                   [""] (a single file directly in the model folder)
    OUTPUT:
        pred_dictionary: {"model_name": {"question_id": first_entry}}
                         NOTE(review): first_entry is presumably the
                         best-ranked candidate of the n-best list and a dict
                         with a 'text' field (extract_hard_unans reads
                         entry['text']) - confirm against the file format.
    RAISES:
        OSError (e.g. FileNotFoundError) if a prediction file cannot be
        opened, json.JSONDecodeError if it is not valid JSON, and IndexError
        if a question maps to an empty list.
    """
    if models is None:
        models = ["bert-base-cased", "roberta-base", "SpanBERT/spanbert-base-cased",
                  "bert-large-cased", "roberta-large", "SpanBERT/spanbert-large-cased"]
    if chunks is None:
        chunks = [""]

    pred_dictionary = {}
    for model in models:
        nbest_by_question = {}
        for chunk in chunks:
            # An empty chunk name is ignored by os.path.join, so the default
            # resolves to <pred_path>/<model>/nbest_predictions_eval.json.
            complete_path = os.path.join(
                pred_path, model, chunk, "nbest_predictions_eval.json")
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(complete_path, encoding="utf-8") as dataset_file:
                nbest_by_question.update(json.load(dataset_file))
        # Keep only the first candidate of every question.
        pred_dictionary[model] = {
            question_id: nbest[0]
            for question_id, nbest in nbest_by_question.items()
        }
    return pred_dictionary

def extract_hard_unans(tfidf, all_predictions):
    """
    Group questions by how many models returned a non-empty answer for them.

    For every question the number of models whose prediction 'text' is not
    the empty string is counted (the "difficulty level"). Questions with
    level 0 are dropped; all others are appended to the bucket of their
    level. A histogram {level: number_of_questions}, including level 0, is
    printed before returning.
    NOTE(review): the naming suggests the questions are unanswerable, so a
    non-empty answer means the model was fooled - confirm with the data.

    INPUT:
        tfidf:           iterable of question records; each record must
                         provide record['id']
        all_predictions: {"model_name": {"question_id": {"text": ...}}},
                         e.g. the output of load_predictions; every model
                         must contain every question id of `tfidf`
    OUTPUT:
        hard_unans: {"1": [records], "2": [records], ...} with one bucket
                    per level from 1 up to the number of models. At least
                    the six buckets "1".."6" are always present.
    RAISES:
        KeyError if a question id is missing for some model or a prediction
        has no 'text' field.
    """
    # Size the buckets from the number of models so that more than six
    # models no longer fail with a KeyError; never go below six so the output
    # layout for the usual six-model setup (and smaller ones) is unchanged.
    num_levels = max(len(all_predictions), 6)
    hard_unans = {str(level): [] for level in range(1, num_levels + 1)}
    difficulty_dict = {level: 0 for level in range(num_levels + 1)}

    for qas in tfidf:
        question_id = qas['id']
        # Number of models that produced a non-empty answer for this question.
        difficulty_level = sum(
            1
            for model_predictions in all_predictions.values()
            if model_predictions[question_id]['text'] != ""
        )
        difficulty_dict[difficulty_level] += 1
        if difficulty_level > 0:
            hard_unans[str(difficulty_level)].append(qas)

    print(difficulty_dict)
    return hard_unans

In [None]:
# Question records to classify: the 'data' entry of the TF-IDF dataset file.
# NOTE(review): absolute path on a mounted share - adjust for your machine.
tfidf_path = "/Volumes/Share/tran_s2/squad_devXtrain/Data/raw_tfidf/raw_tfidf_of_squad1_devXtrain/raw_tfidf_of_squad1_devXtrain-new-format.json"
# JSON is UTF-8 by specification; do not depend on the platform default.
with open(tfidf_path, encoding="utf-8") as dataset_file:
    tfidf = json.load(dataset_file)['data']

# Folder holding one sub-folder of predictions per model (see load_predictions).
pred_path = "/Volumes/Share/tran_s2/squad_devXtrain/Result/squad_tfidf_model/eval_squad1_devXtrain"
predictions_dictionary = load_predictions(pred_path)
# Bucket the questions by the number of models that returned a non-empty answer.
hard_unans = extract_hard_unans(tfidf, predictions_dictionary)

In [None]:
# Write one file per bucket of hard_unans: <folder_path>/<bucket key>.json,
# each containing {'data': [question records of that bucket]}.
folder_path = "/Volumes/Share/tran_s2/squad_devXtrain/Data/squad_tfidf_hard_unans/verMay21"
# Create the output folder on a fresh run instead of failing on the first open().
os.makedirs(folder_path, exist_ok=True)
# The loop variable is deliberately not called `type`: at notebook top level
# that would rebind the builtin for every cell executed afterwards.
for level, questions in hard_unans.items():
    type_file = level + ".json"
    save_path = os.path.join(folder_path, type_file)
    to_save = {'data': questions}
    with open(save_path, 'w', encoding="utf-8") as f:
        json.dump(to_save, f)