In [1]:
import pandas as pd
import numpy as np

from collections import defaultdict
import csv
import json
import numpy as np
import os
import sys

from sklearn.metrics import roc_curve, auc
from tqdm import tqdm

from constants import *

In [2]:
# Load the saved results DataFrame from disk.
# NOTE(review): unpickling can execute arbitrary code — only load this file if it comes from a trusted source.
df = pd.read_pickle('dataframes/result.pkl')

In [77]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,label_set,labels,text,predictions,predictions_sigmoid,predictions_binary,predictions_raw
0,89109144591,"[244.9, 250.0, 272.4, 38.91, 38.93, 427.31, 42...","[0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",sex: f medicine allergies: levofloxacin ...,"[38.93, 401.9, 518.81, 96.6, 96.72]","[0.23890312016010284, 0.16363947093486786, 0.4...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-1.1587026119232178, -1.6313940286636353, -0...."
1,89112183380,"[305.1, 37.23, 401.9, 414.01, 427.31, 88.56]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, ...",sex: f medicine allergies: patient record...,"[37.22, 401.9, 414.01, 88.56]","[0.006338672246783972, 0.06435754150152206, 0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[-5.054727077484131, -2.676779270172119, -2.00..."
2,89119191630,"[244.9, 272.4, 38.93, 39.95, 428.0, 599.0]","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",sex: m medicine allergies: patient record...,"[39.95, 428.0]","[0.07126564532518387, 0.11346027255058289, 0.2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-2.567408323287964, -2.055873155593872, -1.32..."
3,89124123322,[272.4],"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",sex: f medicine allergies: tegaderm / tax...,[],"[0.05356749892234802, 0.16385111212730408, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-2.8717570304870605, -1.629848599433899, -2.4..."
4,89124189821,"[272.4, 38.93, 486, 599.0, 96.04, 96.6, 96.72]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",sex: f orthopaedics allergies: patient re...,[],"[0.009319636039435863, 0.08270774781703949, 0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-4.666268348693848, -2.4061129093170166, -2.0..."


In [81]:
# Convert the per-row list columns of `df` into numpy arrays:
#   yhat     <- 'predictions_binary'
#   yhat_raw <- 'predictions_raw'
#   y        <- 'labels'
# NOTE(review): assumes every row holds a list of the same length, so each result is 2-D — confirm.
yhat, yhat_raw, y = (
    np.array(list(df[column]))
    for column in ('predictions_binary', 'predictions_raw', 'labels')
)

In [16]:
def all_metrics(yhat, y, k=8, yhat_raw=None, calc_auc=True):
    """Compute macro, micro, @k and AUC metrics for multi-label predictions.

    Args:
        yhat: binary prediction matrix, one row per instance and one column per label.
        y: binary ground-truth matrix with the same layout as ``yhat``.
        k: a single cutoff, or a list/tuple of cutoffs, for the @k metrics.
        yhat_raw: raw prediction scores with the same layout as ``yhat``. When
            ``None``, the @k and AUC metrics are skipped.
        calc_auc: when false, the @k and AUC metrics are skipped (the flag
            gates both groups, not only AUC).

    Returns:
        dict mapping a metric name (``'acc_macro'``, ``'f1_micro'``,
        ``'prec_at_8'``, ``'auc_micro'``, ...) to its value.
    """
    names = ['acc', 'prec', 'rec', 'f1']

    # macro: computed per label, then averaged over labels
    macro = all_macro(yhat, y)

    # micro: every (instance, label) cell is treated as one prediction
    ymic = y.ravel()
    yhatmic = yhat.ravel()
    micro = all_micro(yhatmic, ymic)

    metrics = {name + '_macro': value for name, value in zip(names, macro)}
    metrics.update({name + '_micro': value for name, value in zip(names, micro)})

    # AUC and @k both need the raw scores
    if yhat_raw is not None and calc_auc:
        cutoffs = k if isinstance(k, (list, tuple)) else [k]
        for k_i in cutoffs:
            rec_at_k = recall_at_k(yhat_raw, y, k_i)
            metrics['rec_at_%d' % k_i] = rec_at_k
            prec_at_k = precision_at_k(yhat_raw, y, k_i)
            metrics['prec_at_%d' % k_i] = prec_at_k
            # Guard the harmonic mean: a zero (or NaN) sum would otherwise yield NaN.
            total = prec_at_k + rec_at_k
            if total > 0:
                metrics['f1_at_%d' % k_i] = 2*(prec_at_k*rec_at_k)/total
            else:
                metrics['f1_at_%d' % k_i] = 0.

        roc_auc = auc_metrics(yhat_raw, y, ymic)
        # auc_metrics gives nothing back when there are too few instances;
        # dict.update(None) would raise TypeError, so only merge real results.
        if roc_auc:
            metrics.update(roc_auc)

    return metrics

def all_macro(yhat, y):
    """Return the macro (accuracy, precision, recall, F1) tuple for ``yhat`` against ``y``."""
    scorers = (macro_accuracy, macro_precision, macro_recall, macro_f1)
    return tuple(score(yhat, y) for score in scorers)

def all_micro(yhatmic, ymic):
    """Return the micro (accuracy, precision, recall, F1) tuple for the flattened inputs."""
    scorers = (micro_accuracy, micro_precision, micro_recall, micro_f1)
    return tuple(score(yhatmic, ymic) for score in scorers)

In [25]:
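# Summing over axis=0 collapses the instances and yields one value per label (label level);
# summing over axis=1 collapses the labels and yields one value per instance (instance level).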

def union_size(yhat, y, axis):
    """Count, along ``axis``, the positions where ``yhat`` or ``y`` is truthy (result is float)."""
    either_set = np.logical_or(yhat, y)
    return np.sum(either_set, axis=axis).astype(float)

def intersect_size(yhat, y, axis):
    """Count, along ``axis``, the positions where both ``yhat`` and ``y`` are truthy (result is float)."""
    both_set = np.logical_and(yhat, y)
    return np.sum(both_set, axis=axis).astype(float)

## Macro Metrics
Calculate metrics for each label and average across the labels. 

In [47]:
def macro_accuracy(yhat, y):
    """Average over labels of |predicted AND true| / |predicted OR true|."""
    eps = 1e-10  # keeps the ratio finite for labels whose union is empty
    true_pos = intersect_size(yhat, y, 0)  # (1, 1) cells per label
    any_pos = union_size(yhat, y, 0)  # (1, 1), (1, 0) and (0, 1) cells per label
    per_label = true_pos / (any_pos + eps)
    return np.mean(per_label)

def macro_precision(yhat, y):
    """Average over labels of TP / (TP + FP)."""
    eps = 1e-10  # keeps the ratio finite for labels that are never predicted
    true_pos = intersect_size(yhat, y, 0)
    predicted_pos = yhat.sum(axis=0)  # every 1 in yhat, i.e. TP + FP
    per_label = true_pos / (predicted_pos + eps)
    return np.mean(per_label)

def macro_recall(yhat, y):
    """Average over labels of TP / (TP + FN)."""
    eps = 1e-10  # keeps the ratio finite for labels with no true positives in y
    true_pos = intersect_size(yhat, y, 0)
    actual_pos = y.sum(axis=0)  # every 1 in y, i.e. TP + FN
    per_label = true_pos / (actual_pos + eps)
    return np.mean(per_label)

def macro_f1(yhat, y):
    """Harmonic mean of macro precision and macro recall.

    Note this combines the two already-averaged scores; it is not the
    average of per-label F1 values.
    """
    prec = macro_precision(yhat, y)
    rec = macro_recall(yhat, y)
    total = prec + rec
    # Nothing predicted correctly at all: define F1 as 0 instead of dividing by zero.
    if total == 0:
        return 0
    return 2*(prec*rec)/total

## Instance Averaged

In [49]:
def inst_precision(yhat, y):
    """Average over instances of (correct predicted labels) / (predicted labels).

    Instances with no predicted label contribute 0 to the average.
    """
    hits = intersect_size(yhat, y, 1)
    predicted = yhat.sum(axis=1)
    # Divide only where something was predicted; the other rows keep the 0.
    # pre-filled in `out`, so no 0/0 NaN (and no RuntimeWarning) is produced.
    per_instance = np.divide(hits, predicted, out=np.zeros_like(hits), where=predicted != 0)
    return np.mean(per_instance)

def inst_recall(yhat, y):
    """Average over instances of (correct predicted labels) / (true labels).

    Instances with no true label contribute 0 to the average.
    """
    hits = intersect_size(yhat, y, 1)
    actual = y.sum(axis=1)
    # Divide only where the instance has true labels; the other rows keep the 0.
    # pre-filled in `out`, so no 0/0 NaN (and no RuntimeWarning) is produced.
    per_instance = np.divide(hits, actual, out=np.zeros_like(hits), where=actual != 0)
    return np.mean(per_instance)

def inst_f1(yhat, y):
    """Harmonic mean of instance-averaged precision and recall (0. when both are zero)."""
    prec = inst_precision(yhat, y)
    rec = inst_recall(yhat, y)
    total = prec + rec
    # `not total > 0` is true for zero and for NaN, so neither reaches the division.
    if not total > 0:
        return 0.
    return 2*(prec*rec)/total

## @k

In [74]:
def recall_at_k(yhat_raw, y, k):
    """Mean over instances of (true labels among the top-k scores) / (true labels).

    Args:
        yhat_raw: 2-D score array, one row per instance and one column per label.
        y: binary ground-truth array with the same shape as ``yhat_raw``.
        k: number of highest-scoring labels to consider per instance.

    Returns:
        The average recall@k. Instances without any true label count as 0,
        and an input with no instances yields 0.
    """
    # Column indices of the k highest scores in each row, best first.
    topk = np.argsort(yhat_raw)[:, ::-1][:, :k]
    n_rows = topk.shape[0]
    if n_rows == 0:
        return 0.

    # Pick y at the top-k columns of every row and count the true labels found.
    rows = np.arange(n_rows)[:, None]
    hits = y[rows, topk].sum(axis=1)
    n_true = y.sum(axis=1)

    # Divide only where the instance has true labels; the rest keep the 0.
    # pre-filled in `out`, which avoids division by zero altogether.
    vals = np.divide(hits, n_true, out=np.zeros(n_rows, dtype=float), where=n_true != 0)
    return np.mean(vals)

def precision_at_k(yhat_raw, y, k):
    """Mean over instances of (true labels among the top-k scores) / (labels picked).

    Args:
        yhat_raw: 2-D score array, one row per instance and one column per label.
        y: binary ground-truth array with the same shape as ``yhat_raw``.
        k: number of highest-scoring labels to consider per instance.

    Returns:
        The average precision@k. The denominator is the number of labels
        actually picked, i.e. ``k`` capped at the number of label columns.
        Returns 0. when nothing is picked (``k == 0``) or there are no instances.
    """
    # Column indices of the k highest scores in each row, best first.
    topk = np.argsort(yhat_raw)[:, ::-1][:, :k]
    n_rows, n_picked = topk.shape
    if n_rows == 0 or n_picked == 0:
        # Nothing to average; averaging an empty list would give NaN and a warning.
        return 0.

    # Pick y at the top-k columns of every row and count the true labels found.
    rows = np.arange(n_rows)[:, None]
    hits = y[rows, topk].sum(axis=1)
    return np.mean(hits / float(n_picked))

## Micro metrics

In [82]:
def micro_accuracy(yhatmic, ymic):
    """|predicted AND true| / |predicted OR true| over all flattened predictions.

    Expects flattened 1-D inputs (``all_metrics`` passes ``ravel()``-ed arrays).
    Returns 0. when the union is empty instead of the NaN produced by 0/0.
    """
    union = union_size(yhatmic, ymic, 0)
    if union == 0:
        return 0.
    return intersect_size(yhatmic, ymic, 0) / union

def micro_precision(yhatmic, ymic):
    """TP / (TP + FP) over all flattened predictions.

    Expects flattened 1-D inputs (``all_metrics`` passes ``ravel()``-ed arrays).
    Returns 0. when nothing is predicted positive instead of the NaN produced by 0/0.
    """
    predicted_pos = yhatmic.sum(axis=0)
    if predicted_pos == 0:
        return 0.
    return intersect_size(yhatmic, ymic, 0) / predicted_pos

def micro_recall(yhatmic, ymic):
    """TP / (TP + FN) over all flattened predictions.

    Expects flattened 1-D inputs (``all_metrics`` passes ``ravel()``-ed arrays).
    Returns 0. when there are no true positives in ``ymic`` instead of the NaN produced by 0/0.
    """
    actual_pos = ymic.sum(axis=0)
    if actual_pos == 0:
        return 0.
    return intersect_size(yhatmic, ymic, 0) / actual_pos

def micro_f1(yhatmic, ymic):
    """Harmonic mean of micro precision and micro recall (0. when undefined)."""
    prec = micro_precision(yhatmic, ymic)
    rec = micro_recall(yhatmic, ymic)
    total = prec + rec
    # `not total > 0` is true for zero and for NaN; an `== 0` check would let a
    # NaN precision or recall through and return NaN.
    if not total > 0:
        return 0.
    return 2*(prec*rec)/total

## AUC

In [83]:
def auc_metrics(yhat_raw, y, ymic):
    """Compute macro- and micro-averaged ROC AUC from raw prediction scores.

    Args:
        yhat_raw: 2-D score array, one row per instance and one column per label.
        y: binary ground-truth array with the same shape as ``yhat_raw``.
        ymic: flattened ``y``, used for the micro-averaged curve.

    Returns:
        dict with ``'auc_macro'`` and ``'auc_micro'``. An empty dict is returned
        when there is at most one instance, so callers can pass the result
        straight to ``dict.update`` (a bare ``return`` would hand them ``None``).
    """
    roc_auc = {}
    if yhat_raw.shape[0] <= 1:
        return roc_auc

    # AUC of each label individually
    label_aucs = []
    for i in range(y.shape[1]):
        # only labels that have at least one positive example
        if y[:, i].sum() > 0:
            fpr, tpr, _ = roc_curve(y[:, i], yhat_raw[:, i])
            if len(fpr) > 1 and len(tpr) > 1:
                auc_score = auc(fpr, tpr)
                # a NaN score is left out of the macro average
                if not np.isnan(auc_score):
                    label_aucs.append(auc_score)

    # macro-AUC: plain average of the per-label scores; NaN when no label
    # qualified (stated explicitly rather than via np.mean([]) and its warning)
    roc_auc['auc_macro'] = np.mean(label_aucs) if label_aucs else float('nan')

    # micro-AUC: treat every (instance, label) score as one prediction
    fpr_mic, tpr_mic, _ = roc_curve(ymic, yhat_raw.ravel())
    roc_auc["auc_micro"] = auc(fpr_mic, tpr_mic)

    return roc_auc

In [86]:
# Compute all metrics for the binary predictions and raw scores; k=8 is the cutoff for the @k metrics.
metrics = all_metrics(yhat, y, k=8, yhat_raw=yhat_raw, calc_auc=True)

In [87]:
# Display the resulting metrics dictionary.
metrics

{'acc_macro': 0.19345074135362034,
 'acc_micro': 0.2543137084886846,
 'auc_macro': 0.791074331774784,
 'auc_micro': 0.8397917379414975,
 'f1_at_8': 0.4721469682885988,
 'f1_macro': 0.30314291789090314,
 'f1_micro': 0.405502557721554,
 'prec_at_8': 0.40202898550724636,
 'prec_macro': 0.49631257038914556,
 'prec_micro': 0.7263496780584447,
 'rec_at_8': 0.5718905523241907,
 'rec_macro': 0.21821250175767504,
 'rec_micro': 0.2812619869581895}