In [None]:
import pandas as pd
import json

In [None]:
def evaluate(df, label):
    """Compute recall, precision, and F1 for a single class.

    Args:
        df: DataFrame with a "label" column (ground truth) and a
            "predictions" column (model output).
        label: The class value to score; compared with ``==`` against both
            columns.

    Returns:
        Tuple ``(recall, precision, f1)``, each rounded to 2 decimal places.
        A metric whose denominator is zero (no rows carrying this label, no
        rows predicted as this label, or recall + precision == 0) is reported
        as 0.0 instead of raising ZeroDivisionError.
    """
    is_actual = df["label"] == label
    is_predicted = df["predictions"] == label

    # Convert to plain ints so the metrics stay built-in floats.
    true_positives = int((is_actual & is_predicted).sum())
    actual_count = int(is_actual.sum())
    predicted_count = int(is_predicted.sum())

    recall = true_positives / actual_count if actual_count else 0.0
    precision = true_positives / predicted_count if predicted_count else 0.0

    # F1 is derived from the unrounded recall and precision.
    denominator = recall + precision
    f1 = (2 * recall * precision) / denominator if denominator else 0.0

    return round(recall, 2), round(precision, 2), round(f1, 2)

In [None]:
def eval_entry(json_file, dataset):
    """Score a predictions file against a labelled dataset.

    Args:
        json_file: Path to a JSON object mapping example ids (string keys
            that parse as integers) to the predicted label.
        dataset: Path to a CSV file with at least "id" and "label" columns.

    Returns:
        Tuple ``(eval_records, f1_scores)`` of DataFrames. ``eval_records``
        has one row per label (True, then False) with columns "label",
        "precision", "recall" and "f1". ``f1_scores`` has the rows
        "macro-f1" and "weighted-f1" with columns "f1 type" and "score".

    Raises:
        ValueError: If the dataset contains no rows.
        KeyError: If any dataset id has no entry in the predictions file.
    """
    df = pd.read_csv(dataset)
    with open(json_file) as f:
        raw_preds = json.load(f)
    # JSON object keys are always strings; convert them so they can be looked
    # up by the dataset's ids (assumed to be integers -- confirm against the CSV).
    preds = {int(key): value for key, value in raw_preds.items()}

    n_rows = df.shape[0]
    if n_rows == 0:
        raise ValueError(f"dataset {dataset!r} contains no rows to evaluate")

    example_ids = df["id"].tolist()
    missing_ids = [example_id for example_id in example_ids if example_id not in preds]
    if missing_ids:
        raise KeyError(
            f"{len(missing_ids)} dataset id(s) have no prediction in {json_file!r}, "
            f"e.g. {missing_ids[:5]}"
        )
    df["predictions"] = [preds[example_id] for example_id in example_ids]

    true_recall, true_precision, true_f1 = evaluate(df, True)
    false_recall, false_precision, false_f1 = evaluate(df, False)

    eval_records = pd.DataFrame.from_records([
        {"label": True, "precision": true_precision, "recall": true_recall, "f1": true_f1},
        {"label": False, "precision": false_precision, "recall": false_recall, "f1": false_f1},
    ])

    # Weight each class's F1 by its share of the ground-truth labels.
    # Both aggregates are built from the already-rounded per-class F1 values.
    prop_true_examples = int(df["label"].eq(True).sum()) / n_rows
    prop_false_examples = int(df["label"].eq(False).sum()) / n_rows
    macro_f1 = round((true_f1 + false_f1) / 2, 2)
    weighted_f1 = round((true_f1 * prop_true_examples) + (false_f1 * prop_false_examples), 2)

    f1_scores = pd.DataFrame.from_records([
        {"f1 type": "macro-f1", "score": macro_f1},
        {"f1 type": "weighted-f1", "score": weighted_f1},
    ])
    return eval_records, f1_scores

In [None]:
# Score the predictions produced by cli.py against the ground-truth dataset,
# then print the per-label metrics followed by the aggregate F1 scores.
eval_records, f1_scores = eval_entry(
    json_file="predictions.json",  # output from cli.py
    dataset="datasets/bingcheck_sampled.csv",  # path to dataset with ground truth labels
)
# sep="\n" reproduces the output of three consecutive print calls.
print(eval_records, "\n", f1_scores, sep="\n")