In [100]:
import pandas as pd
import json
import collections
import math
import numpy as np
from tqdm import tqdm
import pickle
from bert_utils import AnswerType

In [101]:
# Record type holding the score-related fields named below; a Span carries one
# in its `summary` slot.
ScoreSummary = collections.namedtuple(
    "ScoreSummary",
    "short_span_score cls_token_score answer_type_logits answer_type "
    "start_logits end_logits unique_id start_idx_in_chunk end_idx_in_chunk",
)


def empty_score_summary():
    """Return a ScoreSummary in which every field is None."""
    return ScoreSummary(*(None for _ in ScoreSummary._fields))


# Candidate answer span: token bounds, its score and the associated summary.
Span = collections.namedtuple("Span", "start_token_idx end_token_idx score summary")

In [102]:
def safe_devide(a, b):
    """Return a / b, or 0 when the denominator b equals zero."""
    return 0 if b == 0 else a / b

def compute_f1(tp, fp, fn):
    """Compute the F1 score from true-positive, false-positive and false-negative counts.

    Every division goes through safe_devide, so an empty denominator
    contributes 0 instead of raising.
    """
    predicted_positive = tp + fp
    actual_positive = tp + fn
    precision = safe_devide(tp, predicted_positive)
    recall = safe_devide(tp, actual_positive)
    # F1 is the harmonic mean of precision and recall: 2 * p * r / (p + r).
    ratio = safe_devide(precision * recall, precision + recall)
    return 2 * ratio

# Gold examples keyed by their 'example_id', loaded once when the cell runs.
gold_e_by_id_static = {}
gold_nq_jsonl_path = '../output/1k/nq-train-part.jsonl'
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open(gold_nq_jsonl_path, encoding='utf-8') as gold_nq_jsonl:
    for line in gold_nq_jsonl:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing inside json.loads.
            continue
        # One JSON document per line; OrderedDict keeps the key order of the file.
        gold_e = json.loads(line, object_pairs_hook=collections.OrderedDict)
        gold_e_by_id_static[gold_e['example_id']] = gold_e

def _short_answer_outcome(short_answer, gold_answer):
    """Classify one predicted short answer as 'tp', 'fp', 'tn' or 'fn'."""
    short_answers_gold = gold_answer['short_answers']
    yes_no_answer_gold = gold_answer['yes_no_answer']

    if short_answer == 'YES' or short_answer == 'NO':
        # A yes/no prediction is only right when gold has the same yes/no
        # label and no short spans (otherwise gold expects a span answer).
        ok = short_answer == yes_no_answer_gold and len(short_answers_gold) == 0
        return 'tp' if ok else 'fp'

    if isinstance(short_answer, str):
        # Span prediction encoded as 'start:end'.
        assert ':' in short_answer
        start_token, end_token = [int(x) for x in short_answer.split(':')]
        # Compared against the start of the first and the end of the last
        # gold short answer.
        ok = (len(short_answers_gold) != 0
              and start_token == short_answers_gold[0]['start_token']
              and end_token == short_answers_gold[-1]['end_token'])
        return 'tp' if ok else 'fp'

    if isinstance(short_answer, float) and math.isnan(short_answer):
        # Blank prediction: correct only when gold has no short answer at all.
        ok = yes_no_answer_gold == 'NONE' and len(short_answers_gold) == 0
        return 'tn' if ok else 'fn'

    raise TypeError('wrong short_answer type, short_answer: {!r}'.format(short_answer))


def _long_answer_outcome(long_answer, gold_answer):
    """Classify one predicted long answer as 'tp', 'fp', 'tn' or 'fn'."""
    long_answer_gold = gold_answer['long_answer']
    long_answer_gold_str = (str(long_answer_gold['start_token']) + ':'
                            + str(long_answer_gold['end_token']))

    if isinstance(long_answer, str):
        return 'tp' if long_answer == long_answer_gold_str else 'fp'

    if isinstance(long_answer, float) and math.isnan(long_answer):
        # Blank prediction: correct only when gold is the '-1:-1' (no answer) span.
        return 'tn' if long_answer_gold_str == "-1:-1" else 'fn'

    # Raise instead of silently leaving the example out of every counter,
    # mirroring the handling of short answers.
    raise TypeError('wrong long_answer type, long_answer: {!r}'.format(long_answer))


def compute_score(gold_e_by_id, answers_csv_df, tqdm=None):
    """Score predicted short and long answers against gold annotations.

    A non-blank prediction counts as a true positive when it matches gold and
    as a false positive otherwise; a blank (NaN) prediction counts as a true
    negative when gold is blank too and as a false negative otherwise.

    Args:
        gold_e_by_id: mapping from example id to a gold example. Each example
            must hold exactly one entry in 'annotations', with the keys
            'short_answers', 'yes_no_answer' and 'long_answer'.
        answers_csv_df: DataFrame with the columns 'example_id',
            'short_answer' and 'long_answer'. An answer is a 'start:end'
            string, NaN for blank, or (short answers only) 'YES' / 'NO'.
        tqdm: unused; kept so existing callers keep working.

    Returns:
        Tuple (f1, f1_short, f1_long); f1 pools the short and long counts.

    Raises:
        ValueError: an example id is missing from gold_e_by_id.
        AssertionError: an example does not have exactly one annotation, or a
            short span string contains no ':'.
        TypeError: an answer is neither a string nor NaN.
    """
    short_counts = collections.Counter()
    long_counts = collections.Counter()

    for _, row in answers_csv_df.iterrows():
        example_id = row['example_id']
        if example_id not in gold_e_by_id:
            raise ValueError('example id not found in gold_nq_jsonl file: ' + str(example_id))
        annotations = gold_e_by_id[example_id]['annotations']
        assert len(annotations) == 1
        gold_answer = annotations[0]

        short_counts[_short_answer_outcome(row['short_answer'], gold_answer)] += 1
        long_counts[_long_answer_outcome(row['long_answer'], gold_answer)] += 1

    f1_short = compute_f1(tp=short_counts['tp'], fp=short_counts['fp'], fn=short_counts['fn'])
    f1_long = compute_f1(tp=long_counts['tp'], fp=long_counts['fp'], fn=long_counts['fn'])

    # Overall F1 is computed from the pooled counts, not by averaging the two scores.
    tp = short_counts['tp'] + long_counts['tp']
    fp = short_counts['fp'] + long_counts['fp']
    fn = short_counts['fn'] + long_counts['fn']
    f1 = compute_f1(tp=tp, fp=fp, fn=fn)

    return f1, f1_short, f1_long

In [106]:
def create_short_answer(entry, threshold, check_type):
    """Build the short-answer string for one prediction entry.

    Returns "" when answer_type is 0, when check_type is set and answer_type
    is 4, or when the short answer score is below threshold; 'YES' / 'NO' for
    answer_type 1 / 2; otherwise the space-joined 'start:end' spans whose
    start_token is greater than -1.
    """
    kind = entry['answer_type']

    if kind == 0:
        return ""
    if check_type and kind == 4:
        return ""
    if kind == 1:
        return 'YES'
    if kind == 2:
        return 'NO'
    if entry["short_answer_score"] < threshold:
        return ""

    spans = [
        str(candidate["start_token"]) + ":" + str(candidate["end_token"])
        for candidate in entry["short_answers"]
        if candidate["start_token"] > -1
    ]
    return " ".join(spans)

def create_long_answer(entry, threshold, check_type):
    """Build the long-answer string for one prediction entry.

    Args:
        entry: mapping with 'answer_type', 'long_answer_score' and
            'long_answer' (itself holding 'start_token' and 'end_token').
        threshold: minimum long answer score required to emit an answer.
        check_type: when truthy, answer_type 4 is also treated as "no answer".

    Returns:
        'start:end' for an accepted long answer, otherwise "". The empty
        string is returned on every "no answer" path, including a long answer
        whose start_token is not greater than -1 (this used to fall through
        and return None).
    """
    answer_type = entry['answer_type']
    if answer_type == 0:
        return ""
    if check_type and answer_type == 4:
        return ""
    if entry["long_answer_score"] < threshold:
        return ""

    long_answer = entry["long_answer"]
    if long_answer["start_token"] > -1:
        return str(long_answer["start_token"]) + ":" + str(long_answer["end_token"])
    return ""
    
def _select_spans(spans, threshold, excluded_types, max_answers):
    """Collect, in order, up to max_answers spans scoring strictly above threshold.

    Each span is indexed as (start, end, score, summary). Spans whose
    summary.answer_type equals one of excluded_types are skipped; the summary
    is only inspected when excluded_types is non-empty.
    """
    selected = []
    if max_answers < 1:
        return selected
    for span in spans:
        if excluded_types and any(span[3].answer_type == t for t in excluded_types):
            continue
        if span[2] > threshold:
            selected.append({'start': span[0], 'end': span[1], 'score': span[2]})
            if len(selected) == max_answers:
                break
    return selected


def df_long_index_score(df, threshold, check_type, max_answers=1):
    """Pick the long-answer span(s) of one prediction entry.

    Args:
        df: mapping with a 'long_spans' sequence of (start, end, score, summary).
        threshold: a span is kept only when its score is strictly greater.
        check_type: when truthy, spans typed AnswerType.UNKNOWN are skipped.
        max_answers: maximum number of spans to return (default 1).

    Returns:
        List of {'start', 'end', 'score'} dicts in the order the spans appear.
    """
    # AnswerType is only evaluated when type filtering is requested.
    excluded = (AnswerType.UNKNOWN,) if check_type else ()
    return _select_spans(df['long_spans'], threshold, excluded, max_answers)


def df_short_index_score(df, threshold, check_type, max_answers=1):
    """Pick the short-answer span(s) of one prediction entry.

    Args:
        df: mapping with a 'short_spans' sequence of (start, end, score, summary).
        threshold: a span is kept only when its score is strictly greater.
        check_type: when truthy, spans typed AnswerType.LONG or
            AnswerType.UNKNOWN are skipped.
        max_answers: maximum number of spans to return (default 1).

    Returns:
        List of {'start', 'end', 'score'} dicts in the order the spans appear.
    """
    excluded = (AnswerType.LONG, AnswerType.UNKNOWN) if check_type else ()
    return _select_spans(df['short_spans'], threshold, excluded, max_answers)

def df_example_id(df):
    """Return the value stored under the 'example_id' key of `df`."""
    example_id = df['example_id']
    return example_id

In [117]:
# predictions_json_static = pd.read_json("../output/1k/predictions.json")
# Predictions are read once when this cell runs and kept in a module-level name.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a predictions.pkl that comes from a trusted source.
with open("../output/1k/predictions.pkl", 'rb') as f:
    predictions_json_static = pickle.load(f)

def create_answer(entry):
    """Format a list of {'start', 'end'} dicts as 'start:end' items joined by ', '.

    An empty list yields the empty string.
    """
    parts = [":".join((str(item['start']), str(item['end']))) for item in entry]
    return ", ".join(parts) if parts else ""

def test_params(threshold_short, check_type_short, threshold0_long, check_type_long):
    """Score the loaded predictions for one combination of selection parameters.

    Args:
        threshold_short: score threshold passed to df_short_index_score.
        check_type_short: check_type flag passed to df_short_index_score.
        threshold0_long: score threshold passed to df_long_index_score.
        check_type_long: check_type flag passed to df_long_index_score.

    Returns:
        The result of compute_score for the answers selected with these
        parameters, evaluated against gold_e_by_id_static.
    """
    answers_df = pd.DataFrame.from_dict(predictions_json_static)
    predictions = answers_df['predictions']

    # Gold examples are looked up by integer id, so ids are normalised to int64.
    answers_df['example_id'] = predictions.apply(df_example_id).astype('int64')

    answers_df['long_indexes_and_scores'] = predictions.apply(
        lambda q: df_long_index_score(q, threshold=threshold0_long, check_type=check_type_long))
    answers_df['short_indexes_and_scores'] = predictions.apply(
        lambda q: df_short_index_score(q, threshold=threshold_short, check_type=check_type_short))

    answers_df['long_answer'] = answers_df['long_indexes_and_scores'].apply(create_answer)
    answers_df['short_answer'] = answers_df['short_indexes_and_scores'].apply(create_answer)

    # compute_score expects NaN for a blank answer. np.nan is used because the
    # np.NaN alias no longer exists in NumPy 2.0.
    answer_columns = ['long_answer', 'short_answer']
    answers_df[answer_columns] = answers_df[answer_columns].replace('', np.nan)
    return compute_score(gold_e_by_id_static, answers_df)

In [138]:
def find_best_from_range(min_0, max_0, min_1, max_1, steps=20):
    """Search the short and long score thresholds one after the other.

    The short threshold is searched first with the long threshold fixed at
    the first value of its grid; the long threshold is then searched with the
    best short threshold fixed. Candidates are ranked by the tuple returned
    by test_params (ties broken by the parameter tuple).

    Args:
        min_0, max_0: half-open range [min_0, max_0) of short thresholds.
        min_1, max_1: half-open range [min_1, max_1) of long thresholds.
        steps: number of evenly spaced thresholds tried per range (default 20).

    Returns:
        Tuple (scores, params) of the best candidate of the second search,
        where params is (threshold_short, check_type_short, threshold_long,
        check_type_long).
    """
    check_type_short = [True]
    check_type_long = [True]

    # linspace with endpoint=False always yields exactly `steps` values;
    # arange with a floating-point step may yield one element more or less.
    threshold_short_params = np.linspace(min_0, max_0, steps, endpoint=False).tolist()
    threshold_long_params = np.linspace(min_1, max_1, steps, endpoint=False).tolist()

    short_params = [(x, ct) for ct in check_type_short for x in threshold_short_params]
    long_params = [(x, ct) for ct in check_type_long for x in threshold_long_params]

    def best_of(candidates):
        """Evaluate every parameter tuple and return the best (scores, params)."""
        scored = [(test_params(*p), p) for p in tqdm(candidates)]
        return max(scored)

    # Phase 1: vary the short-answer parameters only.
    best_short_params = best_of([(*x, *long_params[0]) for x in short_params])[1][:2]

    # Phase 2: vary the long-answer parameters around the best short ones.
    return best_of([(*best_short_params, *x) for x in long_params])

In [139]:
# Coarse search: thresholds for both short and long answers taken from [1, 10).
params_0 = find_best_from_range(1, 10, 1, 10)
# The result is (scores, params); params is
# (threshold_short, check_type_short, threshold_long, check_type_long).
p0 = params_0[1][0]
p1 = params_0[1][2]
# First element of the score tuple returned by test_params.
s = params_0[0][0]
# Fine search: repeat in a window of +/-1 around the coarse optimum.
params_1 = find_best_from_range(p0 - 1, p0 + 1, p1 - 1, p1 + 1)
# The coarse result is printed here, i.e. only after the fine search has run.
print('first search:', 'score {0:.3f},'.format(s), 'params {0:.3f},'.format(p0), '{0:.3f}'.format(p1))
p0 = params_1[1][0]
p1 = params_1[1][2]
s = params_1[0][0]
print('final search:', 'score {0:.3f},'.format(s), 'params {0:.3f},'.format(p0), '{0:.3f}'.format(p1))

100%|██████████| 20/20 [00:02<00:00,  8.73it/s]
100%|██████████| 20/20 [00:02<00:00,  8.89it/s]
100%|██████████| 20/20 [00:02<00:00,  7.82it/s]
100%|██████████| 20/20 [00:02<00:00,  8.85it/s]

first search: score 0.600, params 8.650, 7.300
final search: score 0.602, params 8.350, 7.000





### Found best params

In [48]:
# Evaluate one fixed parameter combination with type checking disabled.
# NOTE(review): these thresholds do not match the values printed by the search
# cell above (8.350 / 7.000), and this cell's execution count (48) is lower than
# that of the cells defining test_params - presumably left over from an earlier
# run; confirm before relying on the numbers.
threshold_short = 0.98
check_type_short = False
threshold_long = 0.83
check_type_long = False
test_params(threshold_short, check_type_short, threshold_long, check_type_long)

short tp: 186 fp: 214 fn: 86 tn: 514 all: 1000
long tp: 318 fp: 341 fn: 65 tn: 276 all: 1000


(0.5535714285714286, 0.6103646833013435, 0.5880980163360561)