# Inter-Rater Agreement
#### Measure the agreement between every pair of raters in the HFC dataset (restricted to 5 of the 12 raters to simplify the analysis).

In [1]:
from typing import Optional, List

import numpy as np
import pandas as pd
import tqdm

import peyes
import analysis.utils as u
from analysis._article_results.hfc._helpers import *

# Rater identifiers used as the default set of annotators in this notebook.
# NOTE(review): GT1..GT5 are not defined in this notebook; presumably they come from the
# star-import of `analysis._article_results.hfc._helpers` above — confirm.
GT_LABELERS = [GT1, GT2, GT3, GT4, GT5]

In [2]:
# Load the HFC dataset through pEYES, pointing it at the project's datasets directory.
# NOTE(review): `save=False` presumably avoids writing a local copy — confirm against peyes.datasets.
dataset = peyes.datasets.hfc(
    directory=u.DATASETS_DIR,
    save=False,
    verbose=True,
)

In [3]:
def calc_sample_level_agreement(
        dataframe: pd.DataFrame,
        metrics: Optional[List[str]] = None,
        gt_labelers: List[str] = GT_LABELERS,
        pos_labels: Optional[np.ndarray] = None
) -> pd.DataFrame:
    """
    Compute sample-level agreement between every unordered pair of labelers, separately for each trial.

    For each unique trial ID in `dataframe`, the label columns of the requested labelers are extracted and
    each pair (in the order the labelers are listed) is passed to `peyes.sample_metrics.calculate`.

    :param dataframe: table with a trial-ID column (`peyes.constants.TRIAL_ID_STR`) and one column of
        per-sample labels for each entry of `gt_labelers`.
    :param metrics: names of the metrics to compute; when None or empty, defaults to
        balanced accuracy, Cohen's kappa, MCC and complement-NLD.
    :param gt_labelers: column names of the labelers to compare.
    :param pos_labels: forwarded unchanged to `peyes.sample_metrics.calculate`.
    :return: DataFrame with one row per (trial, first labeler, second labeler), indexed by a MultiIndex
        whose levels are named (`peyes.constants.TRIAL_ID_STR`, `u.GT_STR`, `u.PRED_STR`). Columns hold
        whatever `calculate` returns for the pair (the notebook output shows one column per metric).
    """
    # Function-scope import so this cell does not depend on an edit to the imports cell.
    from itertools import combinations

    metrics = metrics or ["balanced_accuracy", "cohen's_kappa", "mcc", "complement_nld"]
    # Use the same constant for listing trial IDs and for filtering rows (previously the filter
    # hard-coded the column name while the listing used the constant).
    trial_col = peyes.constants.TRIAL_ID_STR
    trial_ids = dataframe[trial_col].unique()
    results = {}
    for trial_id in tqdm.tqdm(trial_ids, total=len(trial_ids)):
        trial_data = dataframe[dataframe[trial_col] == trial_id]
        labels_per_labeler = {lblr: trial_data[lblr].values for lblr in gt_labelers}
        # combinations() yields each unordered pair once, in listing order — the same pairs and order
        # as a nested loop that skips i >= j, without reusing the loop index name.
        for lblr1, lblr2 in combinations(labels_per_labeler, 2):
            res = peyes.sample_metrics.calculate(
                labels_per_labeler[lblr1], labels_per_labeler[lblr2], *metrics, pos_labels=pos_labels
            )
            results[(trial_id, lblr1, lblr2)] = res
    # Keys are (trial, labeler1, labeler2) tuples, so transposing puts one pair-per-trial on each row.
    results = pd.DataFrame(results).T
    results.index.names = [peyes.constants.TRIAL_ID_STR, u.GT_STR, u.PRED_STR]
    return results

### Entire Dataset
#### (A) All Annotators

In [10]:
# Pairwise agreement for the default set of annotators over every trial in the dataset.
overall_agreement = calc_sample_level_agreement(dataset)

# Summarise each annotator pair across trials, then move the metric names into the row index.
overall_agreement_summary = (
    overall_agreement
    .groupby(level=[u.GT_STR, u.PRED_STR])
    .describe()
    .stack(0, future_stack=True)
    .rename_axis(index=[u.GT_STR, u.PRED_STR, "metric"])
)

overall_agreement_summary

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
100%|██████████| 70/70 [00:47<00:00,  1.47it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
gt,pred,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
DN,JV,balanced_accuracy,70.0,0.954145,0.03371,0.782539,0.945949,0.963407,0.971856,0.999172
DN,JV,cohen's_kappa,70.0,0.826188,0.12961,0.0,0.785086,0.847105,0.903284,0.993013
DN,JV,mcc,70.0,0.835474,0.12629,0.0,0.80323,0.850103,0.905752,0.993038
DN,JV,complement_nld,70.0,0.954126,0.025022,0.880498,0.942743,0.957655,0.968419,0.999172
DN,MN,balanced_accuracy,70.0,0.957476,0.030907,0.803571,0.947174,0.962382,0.975582,1.0
DN,MN,cohen's_kappa,69.0,0.864698,0.090459,0.4046,0.828217,0.877572,0.924482,0.992945
DN,MN,mcc,70.0,0.858964,0.131277,0.0,0.833054,0.881153,0.924973,0.992969
DN,MN,complement_nld,70.0,0.963158,0.025797,0.846132,0.951135,0.96737,0.97901,1.0
DN,RA,balanced_accuracy,70.0,0.951557,0.047473,0.728719,0.942947,0.968134,0.979433,1.0
DN,RA,cohen's_kappa,69.0,0.841203,0.12418,0.286615,0.815378,0.856382,0.921569,0.993016


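#### Mean agreement over the subset of GT annotators
GT annotators = ["IH", "DN", "JV", "MN", "RA"]. Each value below is a per-pair summary statistic (mean, std, min, median, max across trials) averaged over all annotator pairs.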

In [11]:
# Average the selected per-pair summary statistics over all annotator pairs, one row per metric.
overall_agreement_mean = (
    overall_agreement_summary.loc[:, ["mean", "std", "min", "50%", "max"]]
    .groupby(level="metric")
    .mean()
)
overall_agreement_mean

Unnamed: 0_level_0,mean,std,min,50%,max
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
balanced_accuracy,0.942564,0.056191,0.613455,0.957421,0.997823
cohen's_kappa,0.830965,0.134615,0.136049,0.854887,0.990621
complement_nld,0.954975,0.035766,0.79149,0.961927,0.999586
mcc,0.83724,0.138835,0.050768,0.860924,0.990658


#### (B) GT1-GT2 Agreement

In [5]:
# Agreement between the first two annotators only, over every trial in the dataset.
gt1_gt2_overall_agreement = calc_sample_level_agreement(dataset, gt_labelers=[GT1, GT2])

# Summarise the single pair across trials, then move the metric names into the row index.
gt1_gt2_overall_agreement_summary = (
    gt1_gt2_overall_agreement
    .groupby(level=[u.GT_STR, u.PRED_STR])
    .describe()
    .stack(0, future_stack=True)
    .rename_axis(index=[u.GT_STR, u.PRED_STR, "metric"])
)

gt1_gt2_overall_agreement_summary

100%|██████████| 70/70 [00:04<00:00, 14.13it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
gt,pred,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
IH,DN,balanced_accuracy,70.0,0.952527,0.065126,0.5,0.946946,0.965825,0.98368,1.0
IH,DN,cohen's_kappa,70.0,0.856607,0.133165,0.0,0.800699,0.88586,0.934016,1.0
IH,DN,mcc,70.0,0.861905,0.129886,0.0,0.817095,0.887913,0.934514,1.0
IH,DN,complement_nld,70.0,0.96832,0.025204,0.808415,0.961966,0.971785,0.980753,1.0


### Free-Viewing Subset
#### (A) All Annotators

In [6]:
# Keep only the rows whose stimulus type is free viewing.
fv_dataset = dataset.loc[dataset[peyes.constants.STIMULUS_TYPE_STR] == "free_viewing"]

# Pairwise agreement for the default set of annotators on the free-viewing trials.
fv_agreement = calc_sample_level_agreement(fv_dataset)
fv_agreement_summary = (
    fv_agreement
    .groupby(level=[u.GT_STR, u.PRED_STR])
    .describe()
    .stack(0, future_stack=True)
    .rename_axis(index=[u.GT_STR, u.PRED_STR, "metric"])
)

# Average the selected per-pair summary statistics over all annotator pairs, one row per metric.
fv_agreement_mean = (
    fv_agreement_summary.loc[:, ["mean", "std", "min", "50%", "max"]]
    .groupby(level="metric")
    .mean()
)
fv_agreement_mean

100%|██████████| 10/10 [00:23<00:00,  2.37s/it]


Unnamed: 0_level_0,mean,std,min,50%,max
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
balanced_accuracy,0.922851,0.04841,0.803203,0.937937,0.964052
cohen's_kappa,0.749121,0.112153,0.471965,0.781806,0.846246
complement_nld,0.940056,0.034906,0.850641,0.951405,0.966081
mcc,0.763777,0.104619,0.504275,0.795819,0.852501


#### (B) GT1-GT2 Agreement

In [7]:
# Agreement between the first two annotators only, on the free-viewing trials.
gt1_gt2_fv_agreement = calc_sample_level_agreement(fv_dataset, gt_labelers=[GT1, GT2])

# Summarise the single pair across trials, then move the metric names into the row index.
gt1_gt2_fv_agreement_summary = (
    gt1_gt2_fv_agreement
    .groupby(level=[u.GT_STR, u.PRED_STR])
    .describe()
    .stack(0, future_stack=True)
    .rename_axis(index=[u.GT_STR, u.PRED_STR, "metric"])
)

gt1_gt2_fv_agreement_summary

100%|██████████| 10/10 [00:02<00:00,  4.10it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
gt,pred,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
IH,DN,balanced_accuracy,10.0,0.944648,0.020104,0.900926,0.939128,0.94531,0.959501,0.970974
IH,DN,cohen's_kappa,10.0,0.806712,0.04206,0.764313,0.777827,0.781803,0.837265,0.891147
IH,DN,mcc,10.0,0.813266,0.038667,0.773348,0.786875,0.793781,0.839614,0.891203
IH,DN,complement_nld,10.0,0.964497,0.011655,0.935806,0.961901,0.96657,0.970491,0.978681


#### (C) RA-MN Agreement
(Raters _RA_ and _MN_ are the ground-truth raters for the _lund2013_ dataset, so let's check their agreement on this dataset too.)

In [8]:
# Agreement between raters RA and MN on the free-viewing trials.
ra_mn_fv_agreement = calc_sample_level_agreement(fv_dataset, gt_labelers=["RA", "MN"])

# Summarise the single pair across trials, then move the metric names into the row index.
ra_mn_fv_agreement_summary = (
    ra_mn_fv_agreement
    .groupby(level=[u.GT_STR, u.PRED_STR])
    .describe()
    .stack(0, future_stack=True)
    .rename_axis(index=[u.GT_STR, u.PRED_STR, "metric"])
)

ra_mn_fv_agreement_summary

100%|██████████| 10/10 [00:02<00:00,  4.12it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
gt,pred,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
RA,MN,balanced_accuracy,10.0,0.880287,0.112928,0.583651,0.887296,0.914027,0.938763,0.973741
RA,MN,cohen's_kappa,10.0,0.774451,0.218439,0.18479,0.796618,0.84791,0.883487,0.927445
RA,MN,mcc,10.0,0.776534,0.215308,0.194232,0.797772,0.848371,0.88353,0.927873
RA,MN,complement_nld,10.0,0.94038,0.069153,0.751241,0.936417,0.968995,0.974736,0.981572
