In [1]:
import argparse
import collections
import json
import numpy as np
import os
import re
import string
import sys

In [2]:
# Load the examples: the file is read line by line and every line is parsed as one
# JSON document. The `with` block closes the file handle as soon as reading finishes
# (a bare open() inside the `for` header never closes it explicitly).
data = []
with open('datasets/train_1k.json', 'r') as train_file:
    for line in train_file:
        data.append(json.loads(line))

In [3]:
# Load the predictions as a single JSON document. Later cells look answers up with
# preds[str(example_id)], so the top level is expected to be a mapping keyed by the
# example id as a string.
with open('results/predictions.json',"r") as f:
    preds = json.load(f)

In [4]:
def make_qid_to_has_ans(data):
    """Map each example id (as a string) to whether it has a long answer.

    Only the first annotation of every entry is inspected. An entry counts as having
    a long answer when both ``start_token`` and ``end_token`` of that annotation's
    ``long_answer`` are non-negative.

    Args:
        data: iterable of example dicts with ``example_id`` and ``annotations`` keys.

    Returns:
        dict of ``str(example_id) -> bool``.
    """
    def _has_long_answer(example):
        span = example['annotations'][0]['long_answer']
        return bool(span['start_token'] >= 0 and span['end_token'] >= 0)

    return {str(example['example_id']): _has_long_answer(example) for example in data}

In [5]:
def normalize_answer(s):
    """Canonicalise an answer string before comparing it with another one.

    The steps run in this order: lowercase the text, delete every character found in
    ``string.punctuation``, replace the standalone words "a", "an" and "the" with a
    space, then collapse all whitespace runs into single spaces (which also trims
    both ends).
    """
    lowered = s.lower()
    # Punctuation has to go before the article pass so that e.g. "the," is still
    # recognised as the word "the".
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct, flags=re.UNICODE)
    return ' '.join(without_articles.split())

In [6]:
def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized answer.

    Any falsy input (empty string, ``None``) yields an empty list without being
    normalized.
    """
    return normalize_answer(s).split() if s else []

In [7]:
def compute_exact(a_gold, a_pred):
    """Return 1 when gold and predicted answers are equal after normalization, else 0."""
    gold_norm = normalize_answer(a_gold)
    pred_norm = normalize_answer(a_pred)
    return 1 if gold_norm == pred_norm else 0

In [8]:
def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    Both strings are tokenized with ``get_tokens``. Overlap is counted as a multiset
    intersection, so a repeated token only matches as often as it occurs on both sides.

    Returns:
        1 or 0 (int) when at least one side has no tokens: 1 only if both are empty.
        0 (int) when the two sides share no token.
        Otherwise the harmonic mean of precision and recall as a float.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    if not gold_toks or not pred_toks:
        # A no-answer on either side: full credit only when both sides agree.
        return int(gold_toks == pred_toks)
    overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)

In [9]:
def _gold_short_answer(entry):
    """Return the text of the first annotated short answer, or '' when there is none."""
    short_answers = entry['annotations'][0]['short_answers']
    if len(short_answers) == 0:
        return ""
    first = short_answers[0]
    # Token offsets index into the document split on single spaces; tokens containing
    # '<' or '>' are dropped from the answer text.
    tokens = entry['document_text'].split(' ')
    return ' '.join(tok for tok in tokens[first['start_token']:first['end_token']]
                    if '<' not in tok and '>' not in tok)


def get_raw_scores(data, preds):
    """Compute per-example exact-match and F1 scores.

    Only the first annotation of each entry is used. The gold answer is the first
    short answer, or the empty string when the annotation has no short answers.

    NOTE(review): entries whose first annotation has no long answer (a negative
    ``start_token`` or ``end_token``) receive no score at all, so the returned dicts
    can hold fewer keys than ``data`` has entries. That matches the previous
    behavior of this function; confirm it is the intended evaluation protocol.

    Args:
        data: iterable of example dicts with ``example_id``, ``annotations`` and
            ``document_text`` keys.
        preds: mapping of ``str(example_id)`` to the predicted answer string.

    Returns:
        ``(exact_scores, f1_scores)``, two dicts keyed by ``str(example_id)``.

    Raises:
        KeyError: if a scored example has no entry in ``preds``.
    """
    exact_scores = {}
    f1_scores = {}
    for entry in data:
        long_answer = entry['annotations'][0]['long_answer']
        if long_answer['start_token'] < 0 or long_answer['end_token'] < 0:
            continue
        qid = str(entry['example_id'])
        a_gold = _gold_short_answer(entry)
        a_pred = preds[qid]
        exact_scores[qid] = compute_exact(a_gold, a_pred)
        f1_scores[qid] = compute_f1(a_gold, a_pred)
    return exact_scores, f1_scores

In [10]:
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
    """Re-score examples whose no-answer probability exceeds a threshold.

    When ``na_probs[qid]`` is strictly greater than ``na_prob_thresh`` the example is
    treated as predicted "no answer": its score becomes 1.0 if the example really
    has no answer and 0.0 otherwise. All other examples keep their original score.

    Returns:
        A new dict keyed by ``str(qid)``; ``scores`` itself is not modified.
    """
    thresholded = {}
    for qid, raw_score in scores.items():
        key = str(qid)
        predicted_no_answer = na_probs[key] > na_prob_thresh
        thresholded[key] = float(not qid_to_has_ans[key]) if predicted_no_answer else raw_score
    return thresholded

In [11]:
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    """Aggregate per-example scores into percentage metrics.

    Args:
        exact_scores: dict of per-example exact-match scores.
        f1_scores: dict of per-example F1 scores.
        qid_list: optional ids to restrict the average to (looked up as ``str(id)``).
            A falsy value, which includes an empty list, averages over every entry
            of ``exact_scores``.

    Returns:
        ``OrderedDict`` with keys ``'exact'``, ``'f1'`` (both scaled by 100) and
        ``'total'`` (the number of examples averaged over).
    """
    if qid_list:
        total = len(qid_list)
        exact_sum = sum(exact_scores[str(k)] for k in qid_list)
        f1_sum = sum(f1_scores[str(k)] for k in qid_list)
    else:
        total = len(exact_scores)
        exact_sum = sum(exact_scores.values())
        f1_sum = sum(f1_scores.values())

    summary = collections.OrderedDict()
    summary['exact'] = 100.0 * exact_sum / total
    summary['f1'] = 100.0 * f1_sum / total
    summary['total'] = total
    return summary

In [12]:
def merge_eval(main_eval, new_eval, prefix):
    """Copy every entry of ``new_eval`` into ``main_eval`` under ``'<prefix>_<key>'``.

    ``main_eval`` is modified in place; nothing is returned.
    """
    for metric in new_eval:
        metric_name = str(metric)
        main_eval['%s_%s' % (prefix, metric_name)] = new_eval[metric_name]

In [13]:
def plot_pr_curve(precisions, recalls, out_image, title):
    """Draw a precision-recall step plot and save it to ``out_image``.

    NOTE(review): ``plt`` is not imported in any visible cell. Presumably it is
    ``matplotlib.pyplot`` -- confirm and import it before this function is called,
    otherwise the first line raises NameError.

    Args:
        precisions: y values of the curve.
        recalls: x values of the curve, paired with ``precisions``.
        out_image: path passed to ``plt.savefig``.
        title: plot title.
    """
    # The curve is drawn as a step line with the area underneath shaded.
    plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
    plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    # Both axes run slightly past 1.0.
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.title(title)
    plt.savefig(out_image)
    # Clear the pyplot figure once the image has been written.
    plt.clf()

In [14]:
def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
                               out_image=None, title=None):
    """Compute average precision over examples ranked by no-answer probability.

    Examples are visited in ascending ``na_probs`` order. After each one, precision
    is the accumulated score divided by the number of examples seen, and recall is
    the accumulated score divided by ``num_true_pos``. Only examples flagged in
    ``qid_to_has_ans`` add their score. A curve point is recorded wherever the next
    example has a different probability (or at the very end), because that is where
    a threshold could be placed.

    Args:
        scores: per-example scores keyed by ``str(qid)``.
        na_probs: no-answer probability per qid; its keys define what is ranked.
        num_true_pos: recall denominator.
        qid_to_has_ans: ``str(qid) -> bool``.
        out_image: when truthy, the curve is plotted to this path via ``plot_pr_curve``.
        title: title forwarded to ``plot_pr_curve``.

    Returns:
        ``{'ap': average precision scaled by 100}``.
    """
    ranked_qids = sorted(na_probs, key=lambda qid: na_probs[qid])
    final_position = len(ranked_qids) - 1
    precisions, recalls = [1.0], [0.0]
    true_pos = 0.0
    avg_prec = 0.0
    for position, qid in enumerate(ranked_qids):
        key = str(qid)
        if qid_to_has_ans[key]:
            true_pos += scores[key]
        precision_here = true_pos / float(position + 1)
        recall_here = true_pos / float(num_true_pos)
        is_tied_with_next = (
            position != final_position
            and na_probs[key] == na_probs[ranked_qids[position + 1]]
        )
        if is_tied_with_next:
            # No threshold can separate tied examples, so no curve point here.
            continue
        avg_prec += precision_here * (recall_here - recalls[-1])
        precisions.append(precision_here)
        recalls.append(recall_here)
    if out_image:
        plot_pr_curve(precisions, recalls, out_image, title)
    return {'ap': 100.0 * avg_prec}

In [15]:
def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs,
                                  qid_to_has_ans, out_image_dir):
    """Add average-precision metrics for exact match, F1 and the oracle to ``main_eval``.

    Three rankings are evaluated with ``make_precision_recall_eval``: the raw exact
    scores, the raw F1 scores, and an "oracle" whose score is 1.0 for every example
    that has an answer and 0.0 otherwise. Results are merged into ``main_eval`` under
    the prefixes ``pr_exact``, ``pr_f1`` and ``pr_oracle``.

    Args:
        main_eval: dict updated in place.
        exact_raw: per-example exact-match scores.
        f1_raw: per-example F1 scores.
        na_probs: per-example no-answer probabilities.
        qid_to_has_ans: ``str(qid) -> bool``.
        out_image_dir: directory for the curve images; created when it is a
            non-empty path that does not exist yet. ``None`` disables plotting
            (previously ``None`` raised TypeError inside ``os.path.join``).

    Returns:
        None. Nothing is added to ``main_eval`` when no example has an answer.
    """
    if out_image_dir and not os.path.exists(out_image_dir):
        os.makedirs(out_image_dir)
    num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
    if num_true_pos == 0:
        return

    def image_path(file_name):
        # A missing directory means "do not plot": make_precision_recall_eval skips
        # plotting for a falsy out_image.
        if out_image_dir is None:
            return None
        return os.path.join(out_image_dir, file_name)

    pr_exact = make_precision_recall_eval(
      exact_raw, na_probs, num_true_pos, qid_to_has_ans,
      out_image=image_path('pr_exact.png'),
      title='Precision-Recall curve for Exact Match score')
    pr_f1 = make_precision_recall_eval(
      f1_raw, na_probs, num_true_pos, qid_to_has_ans,
      out_image=image_path('pr_f1.png'),
      title='Precision-Recall curve for F1 score')
    oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
    pr_oracle = make_precision_recall_eval(
      oracle_scores, na_probs, num_true_pos, qid_to_has_ans,
      out_image=image_path('pr_oracle.png'),
      title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')
    merge_eval(main_eval, pr_exact, 'pr_exact')
    merge_eval(main_eval, pr_f1, 'pr_f1')
    merge_eval(main_eval, pr_oracle, 'pr_oracle')

In [16]:
def histogram_na_prob(na_probs, qid_list, image_dir, name):
    """Save a histogram of the no-answer probabilities of the given examples.

    The image is written to ``<image_dir>/na_prob_hist_<name>.png``. Nothing happens
    when ``qid_list`` is empty.

    NOTE(review): ``plt`` is not imported in any visible cell. Presumably it is
    ``matplotlib.pyplot`` -- confirm and import it before this function is called.

    Args:
        na_probs: no-answer probability per example, keyed by ``str(qid)``.
        qid_list: ids of the examples to include.
        image_dir: directory the image is saved into.
        name: label used in both the plot title and the file name.
    """
    if not qid_list:
        return
    x = [na_probs[str(k)] for k in qid_list]
    # Every sample gets weight 1/len(x), so bar heights are fractions of the
    # selected examples rather than raw counts.
    weights = np.ones_like(x) / float(len(x))
    plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
    plt.xlabel('Model probability of no-answer')
    plt.ylabel('Proportion of dataset')
    plt.title('Histogram of no-answer probability: %s' % name)
    plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
    # Clear the pyplot figure once the image has been written.
    plt.clf()

In [17]:
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
    """Find the no-answer threshold that maximises the total score.

    The sweep starts from the state where every example is treated as "no answer",
    which earns one point per scored example that truly has no answer. Examples are
    then switched to "answered" one at a time in ascending ``na_probs`` order. An
    example with an answer adds its score; one without an answer costs a point if the
    model produced a non-empty prediction and is neutral otherwise.

    The starting total only counts examples present in ``scores``. The sweep and the
    denominator are both restricted to ``scores`` too, so counting unscored
    no-answer examples would inflate the result (it could exceed 100).

    Args:
        preds: predicted answer per ``str(qid)``.
        scores: per-example scores keyed by ``str(qid)``.
        na_probs: no-answer probability per qid.
        qid_to_has_ans: ``str(qid) -> bool``.

    Returns:
        ``(best_score, best_thresh)`` where ``best_score`` is a percentage of
        ``len(scores)``. ``(0.0, 0.0)`` is returned when ``scores`` is empty.
    """
    if not scores:
        # Nothing was scored, so there is nothing to normalise by.
        return 0.0, 0.0
    num_no_ans = sum(1 for qid in scores if not qid_to_has_ans[str(qid)])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    for qid in sorted(na_probs, key=lambda k: na_probs[str(k)]):
        if qid not in scores:
            continue
        key = str(qid)
        if qid_to_has_ans[key]:
            diff = scores[key]
        elif preds[key]:
            diff = -1
        else:
            diff = 0
        cur_score += diff
        if cur_score > best_score:
            best_score = cur_score
            best_thresh = na_probs[key]
    return 100.0 * best_score / len(scores), best_thresh

In [18]:
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    """Store the best exact-match and F1 scores, with their thresholds, in ``main_eval``.

    Runs ``find_best_thresh`` once per metric and writes the keys ``best_exact``,
    ``best_exact_thresh``, ``best_f1`` and ``best_f1_thresh``. ``main_eval`` is
    modified in place; nothing is returned.
    """
    exact_result = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
    f1_result = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
    main_eval['best_exact'], main_eval['best_exact_thresh'] = exact_result
    main_eval['best_f1'], main_eval['best_f1_thresh'] = f1_result

In [19]:
# Give every predicted question a no-answer probability of 0.0.
# NOTE(review): presumably the model does not output such probabilities -- confirm.
na_probs = {k: 0.0 for k in preds}

In [20]:
# Build the answerability lookup and the per-example raw scores from the loaded data.
qid_to_has_ans = make_qid_to_has_ans(data)  # maps qid to True/False
exact_raw, f1_raw = get_raw_scores(data, preds)

In [21]:
# Number of scored examples. get_raw_scores only scores entries that have a long
# answer, so this can be smaller than len(data).
len(f1_raw)

498

In [22]:
# Exact-match and F1 scores are written for the same examples, so the counts match.
len(exact_raw)

498

In [23]:
# Split the question ids by whether the example has a long answer.
# NOTE(review): neither list is used by the cells visible in this notebook.
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]

In [24]:
# The raw scores are recomputed here with the same inputs as in the earlier cell.
# Every na_probs value is 0.0 and the threshold is 1.0, so no example is treated as
# "predicted no answer": the thresholded scores equal the raw scores.
exact_raw, f1_raw = get_raw_scores(data, preds)
exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,1.0)
f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,1.0)

In [25]:
# Average over every scored example (no qid_list restriction is passed).
out_eval = make_eval_dict(exact_thresh, f1_thresh)

In [26]:
# 'exact' and 'f1' are percentages; 'total' is the number of examples averaged over.
out_eval

OrderedDict([('exact', 54.618473895582326),
             ('f1', 58.290175942010926),
             ('total', 498)])