In [1]:
from typing import Optional

import numpy as np
import pandas as pd
import tqdm

import peyes
import analysis.utils as u

## Load Data

In [2]:
# Load the Lund 2013 dataset through the peyes loader, pointing it at the
# project's datasets directory (loader flags: save=False, verbose=True).
dataset = peyes.datasets.lund2013(
    directory=u.DATASETS_DIR,
    save=False,
    verbose=True,
)
dataset.head()

Unnamed: 0,trial_id,subject_id,stimulus_type,stimulus_name,t,x,y,pupil,pixel_size,viewer_distance,MN,RA
0,1,TH20,moving_dot,1,0.0,123.2532,22.6264,,0.037824,67.0,1.0,1.0
1,1,TH20,moving_dot,1,2.0,123.5395,22.9064,,0.037824,67.0,1.0,1.0
2,1,TH20,moving_dot,1,4.0,123.223,21.9909,,0.037824,67.0,1.0,1.0
3,1,TH20,moving_dot,1,6.0,123.1883,21.774,,0.037824,67.0,1.0,1.0
4,1,TH20,moving_dot,1,8.0,125.054,21.1805,,0.037824,67.0,1.0,1.0


### Sampling Rate Distribution

In [3]:
# Step 1: one sampling rate per trial, computed by peyes from the trial's timestamp column.
trial_sampling_rates = (
    dataset
    .groupby(peyes.constants.TRIAL_ID_STR)[peyes.constants.T]
    .apply(lambda ts: peyes._utils.event_utils.calculate_sampling_rate(ts.values))
    .rename(peyes.constants.SAMPLING_RATE_STR)
)

# Step 2: invert the mapping - for every distinct sampling rate, collect the ids of its trials.
sampling_rates = (
    trial_sampling_rates
    .reset_index()
    .groupby(peyes.constants.SAMPLING_RATE_STR)[peyes.constants.TRIAL_ID_STR]
    .apply(lambda ids: ids.values)
    .to_frame()
)

# Step 3: number of trials per rate, and number of dataset rows belonging to those trials.
sampling_rates['counts'] = sampling_rates[peyes.constants.TRIAL_ID_STR].map(len)
sampling_rates['num_samples'] = sampling_rates[peyes.constants.TRIAL_ID_STR].apply(
    lambda ids: np.isin(dataset[peyes.constants.TRIAL_ID_STR], ids).sum()
)
sampling_rates

Unnamed: 0_level_0,trial_id,counts,num_samples
sampling_rate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200.0,"[33, 34, 39, 44, 54, 58, 63]",7,42770
500.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",56,340442


In [4]:
# Step 1: one sampling rate per (stimulus type, trial), computed by peyes from the timestamps.
stim_trial_srs = (
    dataset
    .groupby([peyes.constants.STIMULUS_TYPE_STR, peyes.constants.TRIAL_ID_STR])[peyes.constants.T]
    .apply(lambda ts: peyes._utils.event_utils.calculate_sampling_rate(ts.values))
    .rename(peyes.constants.SAMPLING_RATE_STR)
)

# Step 2: for every (stimulus type, sampling rate) pair, collect the ids of the matching trials.
stim_srs = (
    stim_trial_srs
    .reset_index()
    .groupby([peyes.constants.STIMULUS_TYPE_STR, peyes.constants.SAMPLING_RATE_STR])[peyes.constants.TRIAL_ID_STR]
    .apply(lambda ids: ids.values)
    .to_frame()
)

# Step 3: number of trials per pair, and number of dataset rows belonging to those trials.
stim_srs['counts'] = stim_srs[peyes.constants.TRIAL_ID_STR].map(len)
stim_srs['num_samples'] = stim_srs[peyes.constants.TRIAL_ID_STR].apply(
    lambda ids: np.isin(dataset[peyes.constants.TRIAL_ID_STR], ids).sum()
)

stim_srs

Unnamed: 0_level_0,Unnamed: 1_level_0,trial_id,counts,num_samples
stimulus_type,sampling_rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
image,200.0,"[33, 34, 39, 44]",4,7985
image,500.0,"[25, 26, 27, 28, 29, 30, 31, 32, 35, 36, 37, 3...",16,79805
moving_dot,500.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",24,21326
video,200.0,"[54, 58, 63]",3,34785
video,500.0,"[45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 5...",16,239311


### Label Distribution
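Extract the distribution of sample-level labels in the dataset for each stimulus type (image, video, moving dot) and for each human annotator ("_RA_" and "_MN_"). Label columns are percentages of the samples that the annotator labeled; the "BOTH" rows count all samples regardless of annotation. A "total" row aggregates across all stimulus types.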

In [5]:
def sample_stats(dataframe: pd.DataFrame, labeler: Optional[str]) -> pd.DataFrame:
    """
    Summarize sample, subject and trial counts per stimulus type, plus a "total" row.

    :param dataframe: sample-level data with "stimulus_type", "subject_id" and "trial_id"
        columns (and a `labeler` column when `labeler` is given).
    :param labeler: name of an annotator column. When given, only rows where that column is
        not null are considered, and the percentage of samples carrying each label value is
        appended as extra columns. When falsy, all rows are counted and only the count
        columns are returned.
    :return: DataFrame indexed by stimulus type (plus "total") with columns
        "num_samples", "num_subjects", "num_trials" and, if `labeler` is given, one
        percentage column per label value.
    """
    # Restrict to the samples this annotator actually labeled (or keep everything).
    subset = dataframe[dataframe[labeler].notnull()] if labeler else dataframe
    by_stim = subset.groupby("stimulus_type")

    counts = pd.concat(
        [
            by_stim.size().rename("num_samples"),
            by_stim["subject_id"].nunique().rename("num_subjects"),
            by_stim["trial_id"].nunique().rename("num_trials"),
        ],
        axis=1,
    )
    # Totals are recomputed on the whole subset: a subject may appear under several stimulus
    # types, so unique counts cannot simply be summed over the rows above.
    counts.loc["total"] = pd.Series(
        [len(subset), subset["subject_id"].nunique(), subset["trial_id"].nunique()],
        index=counts.columns,
        name="total",
    )
    if not labeler:
        return counts

    # Label frequencies as fractions: overall, and per stimulus type (labels on the rows).
    overall_freq = subset[labeler].value_counts(dropna=True, normalize=True).sort_index().rename("total")
    per_stim_freq = by_stim[labeler].value_counts(dropna=True, normalize=True).unstack().fillna(0).T
    # Transpose so stimulus types are on the rows, and convert fractions to percentages.
    stats = pd.concat([overall_freq, per_stim_freq], axis=1).T * 100
    stats.index.name = peyes.constants.LABEL_STR
    return pd.concat([counts, stats], axis=1)

In [6]:
# Per-stimulus counts over all samples, then counts + label percentages per annotator.
global_counts = sample_stats(dataset, None)
ra_stats = sample_stats(dataset, "RA")
mn_stats = sample_stats(dataset, "MN")

# Stack the three tables under a source key, then make the stimulus type the outer
# index level and put the rows in a fixed presentation order.
full_counts = (
    pd.concat({"BOTH": global_counts, "RA": ra_stats, "MN": mn_stats}, axis=0)
    .swaplevel(0, 1)
    .reindex(["image", "video", "moving_dot", "total"], axis=0, level=0)
)
full_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,num_samples,num_subjects,num_trials,0.0,1.0,2.0,3.0,4.0,5.0
stimulus_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
image,BOTH,87790,18,20,,,,,,
image,RA,87790,18,20,0.144663,76.455177,9.181,4.759084,4.777309,4.682766
image,MN,63849,13,14,0.198907,79.597175,8.592147,5.243622,0.853576,5.514573
video,BOTH,274096,18,19,,,,,,
video,RA,274096,18,19,0.079899,33.62654,4.413417,2.635573,57.883734,1.360837
video,MN,29029,9,9,0.055117,42.974267,5.174136,3.382824,46.381205,2.03245
moving_dot,BOTH,21326,19,24,,,,,,
moving_dot,RA,20000,19,23,0.95,12.845,4.72,1.425,79.53,0.53
moving_dot,MN,11867,10,11,1.415691,8.99132,4.533581,2.005562,81.638156,1.415691
total,BOTH,383212,30,63,,,,,,


### Event Distribution
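Extract the distribution of events in the dataset for each stimulus type (image, video, moving dot) and for each human annotator ("_RA_" and "_MN_"), keeping only events that span at least 2 samples. The "ALL" rows sum the counts across all stimulus types, and the "total" column sums the counts across event labels.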

In [7]:
def labels_to_events(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Convert per-sample annotator labels into events, one collection per (trial, annotator).

    For every trial in `dataframe` and every annotator column present in it (annotator names
    are taken from `u.DATASET_ANNOTATORS`), the trial's samples are passed to
    `peyes.create_events` together with that annotator's labels.

    :param dataframe: sample-level data. Must contain the trial-id, stimulus-type,
        stimulus-name, t, x, y, pupil, pixel-size and viewer-distance columns named by
        `peyes.constants`, plus one label column per annotator.
    :return: DataFrame indexed by (trial id, stimulus type, stimulus name, "annotator"), with
        one column per event position within the trial. Rows that contain no events at all
        are dropped.
    """
    known_annotators = {
        name for names in u.DATASET_ANNOTATORS.values() for name in names
    }
    # sorted(): a set iterates in hash order, which for strings can change between kernel
    # sessions; sorting keeps the row order of the result reproducible.
    # Assumes annotator names are mutually comparable (e.g. all strings).
    annotators = sorted(known_annotators.intersection(dataframe.columns))

    trial_col = peyes.constants.TRIAL_ID_STR
    stim_cols = [peyes.constants.STIMULUS_TYPE_STR, peyes.constants.STIMULUS_NAME_STR]
    trial_ids = dataframe[trial_col].unique()

    event_dict = {}
    for trial_id in tqdm.tqdm(trial_ids, total=len(trial_ids)):
        trial_data = dataframe[dataframe[trial_col] == trial_id]
        # Stimulus metadata and viewing geometry are read from the trial's first sample.
        stim_type, stim_name = trial_data[stim_cols].values[0]
        t = trial_data[peyes.constants.T].values
        x = trial_data[peyes.constants.X].values
        y = trial_data[peyes.constants.Y].values
        pupil = trial_data[peyes.constants.PUPIL].values
        ps = trial_data[peyes.constants.PIXEL_SIZE_STR].values[0]
        vd = trial_data[peyes.constants.VIEWER_DISTANCE_STR].values[0]
        for annotator in annotators:
            evnts = peyes.create_events(
                labels=trial_data[annotator].values,
                t=t, x=x, y=y, pupil=pupil, pixel_size=ps, viewer_distance=vd,
            )
            event_dict[(trial_id, stim_type, stim_name, annotator)] = pd.Series(
                evnts, name=(trial_id, annotator)
            )

    # Each dict entry becomes a column; transpose so every (trial, annotator) is a row, and
    # drop rows for which no event was produced.
    event_df = pd.DataFrame(event_dict).T.dropna(axis=0, how='all')
    event_df.index.names = [
        peyes.constants.TRIAL_ID_STR,
        peyes.constants.STIMULUS_TYPE_STR,
        peyes.constants.STIMULUS_NAME_STR,
        "annotator",
    ]
    return event_df


def events_df_to_series(events_df: pd.DataFrame, min_num_samples: int = 2) -> pd.Series:
    """
    Flatten a table of events into a single Series and drop events that are too short.

    :param events_df: DataFrame whose cells hold event objects (or NaN/None for padding);
        each cell object must expose a `num_samples` attribute.
    :param min_num_samples: events with fewer samples than this are discarded.
    :return: the remaining events, indexed by the levels of `events_df.index` plus the
        event's position within its flattened group.
    """
    every_level = list(range(events_df.index.nlevels))

    def _flatten_group(group: pd.DataFrame) -> pd.Series:
        # Lay all cells of the group out in one row-major sequence, then discard the padding.
        return pd.Series(group.to_numpy().flatten()).dropna()

    # NOTE(review): groups are expected to differ in their number of events, so that the
    # applied result is a Series rather than a wide DataFrame - confirm for other datasets.
    flat_events = events_df.groupby(level=every_level).apply(_flatten_group)
    long_enough = flat_events.map(lambda evnt: evnt.num_samples >= min_num_samples)
    return flat_events[long_enough]

In [8]:
# Build one row of events per (trial, annotator), then flatten to a single Series,
# discarding events shorter than 2 samples.
events = labels_to_events(dataframe=dataset)
events_series = events_df_to_series(events_df=events, min_num_samples=2)

100%|██████████| 63/63 [00:00<00:00, 92.52it/s] 


In [9]:
def _count_events_for_label(series: pd.Series, lbl) -> pd.DataFrame:
    """
    Count the events carrying label `lbl`, per stimulus type and annotator.

    :param series: Series of event objects (each exposing `.label`) whose index has a
        stimulus-type level and an "annotator" level.
    :param lbl: the label value to count.
    :return: DataFrame with stimulus types on the rows, one column per annotator, and a
        "BOTH" column holding the row-wise sum over annotators.
    """
    has_label = series.map(lambda evnt: evnt.label == lbl)
    per_stim_and_annotator = (
        series[has_label]
        .groupby(level=[peyes.constants.STIMULUS_TYPE_STR, "annotator"])
        .size()
        .unstack(1)  # move the annotator level into the columns
    )
    per_stim_and_annotator["BOTH"] = per_stim_and_annotator.sum(axis=1)
    return per_stim_and_annotator


def count_events(series: pd.Series) -> pd.DataFrame:
    """
    Tabulate event counts per (stimulus type, annotator) row and per event label column.

    :param series: Series of event objects, indexed as expected by `_count_events_for_label`.
    :return: DataFrame whose rows are (stimulus type, annotator) pairs - with an extra "ALL"
        stimulus group summing over stimulus types - and whose columns are the event-label
        names plus a "total" column holding the row-wise sum.
    """
    per_label = {
        lbl.name: _count_events_for_label(series, lbl)
        for lbl in peyes._DataModels.EventLabelEnum.EventLabelEnum
    }
    # Rows: (label, stimulus type); columns: annotators. Drop rows with no counts at all.
    stacked = pd.concat(per_label, axis=0).dropna(axis=0, how='all')
    # Pivot so that rows are (stimulus type, annotator) and columns are label names.
    table = stacked.unstack(0).stack(0, future_stack=True)

    # Sum over stimulus types for each annotator and file the result under "ALL".
    across_stimuli = table.groupby(level=1).sum()
    across_stimuli.index = [("ALL", lblr) for lblr in across_stimuli.index]

    table = pd.concat([table, across_stimuli], axis=0)
    table = table.reindex(["ALL", "image", "video", "moving_dot"], level=0, axis=0)
    table['total'] = table.sum(axis=1)
    return table

In [10]:
# Event counts per (stimulus type, annotator) and event label.
event_counts = count_events(series=events_series)
event_counts

Unnamed: 0,Unnamed: 1,FIXATION,SACCADE,PSO,SMOOTH_PURSUIT,BLINK,total
ALL,BOTH,1686.0,2373.0,1783.0,1004.0,93.0,6939.0
ALL,MN,495.0,543.0,442.0,105.0,27.0,1612.0
ALL,RA,1191.0,1830.0,1341.0,899.0,66.0,5327.0
image,BOTH,967.0,929.0,730.0,33.0,45.0,2704.0
image,MN,404.0,377.0,312.0,3.0,22.0,1118.0
image,RA,563.0,552.0,418.0,30.0,23.0,1586.0
video,BOTH,677.0,1310.0,979.0,833.0,46.0,3845.0
video,MN,81.0,117.0,97.0,51.0,4.0,350.0
video,RA,596.0,1193.0,882.0,782.0,42.0,3495.0
moving_dot,BOTH,42.0,134.0,74.0,138.0,2.0,390.0


In [11]:
# Express every count as a percentage of its row's total.
event_percents = event_counts.div(event_counts["total"], axis="index").mul(100)
event_percents

Unnamed: 0,Unnamed: 1,FIXATION,SACCADE,PSO,SMOOTH_PURSUIT,BLINK,total
ALL,BOTH,24.297449,34.198011,25.695345,14.468944,1.340251,100.0
ALL,MN,30.707196,33.684864,27.419355,6.513648,1.674938,100.0
ALL,RA,22.3578,34.353295,25.173644,16.876291,1.238971,100.0
image,BOTH,35.761834,34.356509,26.997041,1.220414,1.664201,100.0
image,MN,36.135957,33.72093,27.906977,0.268336,1.9678,100.0
image,RA,35.498108,34.80454,26.355612,1.891551,1.450189,100.0
video,BOTH,17.607282,34.070221,25.461638,21.664499,1.196359,100.0
video,MN,23.142857,33.428571,27.714286,14.571429,1.142857,100.0
video,RA,17.052933,34.134478,25.236052,22.374821,1.201717,100.0
moving_dot,BOTH,10.769231,34.358974,18.974359,35.384615,0.512821,100.0


In [12]:
# Ratio of fixation events to saccade events for every (stimulus type, annotator) row.
event_counts["FIXATION"].div(event_counts["SACCADE"])

ALL         BOTH    0.710493
            MN      0.911602
            RA      0.650820
image       BOTH    1.040904
            MN      1.071618
            RA      1.019928
video       BOTH    0.516794
            MN      0.692308
            RA      0.499581
moving_dot  BOTH    0.313433
            MN      0.204082
            RA      0.376471
dtype: float64