In [2]:
from typing import Optional, List

import numpy as np
import pandas as pd
import tqdm

import peyes
import analysis.utils as u

## Load Data

In [3]:
# Load the lund2013 dataset via pEYES from the project's datasets directory,
# then preview its first rows.
dataset = peyes.datasets.lund2013(
    directory=u.DATASETS_DIR,
    save=False,
    verbose=True,
)
dataset.head()

Unnamed: 0,trial_id,subject_id,stimulus_type,stimulus_name,t,x,y,pupil,pixel_size,viewer_distance,MN,RA
0,1,TH20,moving_dot,1,0.0,123.2532,22.6264,,0.037824,67.0,1.0,1.0
1,1,TH20,moving_dot,1,2.0,123.5395,22.9064,,0.037824,67.0,1.0,1.0
2,1,TH20,moving_dot,1,4.0,123.223,21.9909,,0.037824,67.0,1.0,1.0
3,1,TH20,moving_dot,1,6.0,123.1883,21.774,,0.037824,67.0,1.0,1.0
4,1,TH20,moving_dot,1,8.0,125.054,21.1805,,0.037824,67.0,1.0,1.0


## Rater Agreement
Calculate the sample-level agreement between the two human annotators (columns `RA` and `MN`) for each trial, then summarize it across all trials and separately for each stimulus type.

In [4]:
def calc_sample_level_agreement(
        dataframe: pd.DataFrame,
        labeler1: str,
        labeler2: str,
        metrics: Optional[List[str]] = None,
        pos_labels: Optional[np.ndarray] = None
) -> pd.DataFrame:
    """
    Compute sample-level agreement between two labelers, separately for each trial.

    Only trials in which *both* labelers have a non-null label for every sample are
    scored; trials with any missing label in either column are skipped entirely.

    :param dataframe: samples table containing the trial-id column
        (`peyes.constants.TRIAL_ID_STR`) and one label column per labeler.
    :param labeler1: name of the first labeler's column; passed as the first label
        sequence to `peyes.sample_metrics.calculate`.
    :param labeler2: name of the second labeler's column; passed as the second label
        sequence to `peyes.sample_metrics.calculate`.
    :param metrics: metric names forwarded to `peyes.sample_metrics.calculate`.
        When None (or empty), defaults to balanced accuracy, Cohen's kappa, MCC and
        complement-NLD.
    :param pos_labels: forwarded unchanged to `peyes.sample_metrics.calculate`.

    :return: DataFrame indexed by trial id (in order of first appearance in
        `dataframe`), holding whatever `peyes.sample_metrics.calculate` returned for
        each trial - presumably one column per requested metric.
    :raises KeyError: if the trial-id column or either labeler column is missing.
    """
    metrics = metrics or ["balanced_accuracy", "cohen's_kappa", "mcc", "complement_nld"]
    # Use the library constant everywhere instead of mixing it with a literal "trial_id".
    trial_col = peyes.constants.TRIAL_ID_STR
    missing = [col for col in (trial_col, labeler1, labeler2) if col not in dataframe.columns]
    if missing:
        raise KeyError(f"dataframe is missing required column(s): {missing}")

    # Split the frame once (instead of re-masking the whole frame for every trial) and
    # keep only fully-annotated trials. `sort=False` keeps trials in order of first
    # appearance; empty groups (possible with categorical keys) are never scored.
    fully_labeled = {
        trial_id: trial_data
        for trial_id, trial_data in dataframe.groupby(trial_col, sort=False)
        if not trial_data.empty
        and trial_data[labeler1].notna().all()
        and trial_data[labeler2].notna().all()
    }

    results = {}
    for trial_id, trial_data in tqdm.tqdm(fully_labeled.items(), total=len(fully_labeled)):
        results[trial_id] = peyes.sample_metrics.calculate(
            trial_data[labeler1].values,
            trial_data[labeler2].values,
            *metrics,
            pos_labels=pos_labels,
        )
    # Per-trial results are stored column-wise, so transpose to get one row per trial.
    results = pd.DataFrame(results).T
    results.index.name = trial_col
    return results

In [5]:
# Per-trial agreement between the two annotators over the whole dataset,
# followed by summary statistics across trials.
rater_agreement = calc_sample_level_agreement(
    dataset,
    labeler1="RA",
    labeler2="MN",
)
rater_agreement.describe()

100%|██████████| 33/33 [00:00<00:00, 112.65it/s]


Unnamed: 0,balanced_accuracy,cohen's_kappa,mcc,complement_nld
count,33.0,33.0,33.0,33.0
mean,0.789322,0.746069,0.766235,0.889908
std,0.113567,0.197002,0.164724,0.117654
min,0.548912,0.221206,0.354871,0.533183
25%,0.706671,0.677382,0.685639,0.883343
50%,0.805947,0.836427,0.838001,0.929044
75%,0.877022,0.872848,0.876027,0.956413
max,0.968041,0.962272,0.962441,0.988962


In [6]:
# Keep only the rows whose stimulus type is "image", then repeat the
# per-trial agreement analysis on that subset.
is_image_stimulus = dataset[peyes.constants.STIMULUS_TYPE_STR] == peyes.constants.IMAGE_STR
image_dataset = dataset[is_image_stimulus]

rater_agreement_image = calc_sample_level_agreement(
    image_dataset,
    labeler1="RA",
    labeler2="MN",
)
rater_agreement_image.describe()

100%|██████████| 14/14 [00:00<00:00, 75.47it/s]


Unnamed: 0,balanced_accuracy,cohen's_kappa,mcc,complement_nld
count,14.0,14.0,14.0,14.0
mean,0.748272,0.790884,0.80472,0.935951
std,0.121493,0.178359,0.145753,0.033054
min,0.548912,0.221206,0.354871,0.861067
25%,0.666809,0.790568,0.796352,0.922393
50%,0.718305,0.838506,0.844081,0.942161
75%,0.873997,0.8961,0.897724,0.955667
max,0.923596,0.9064,0.907393,0.974338
