# Fig 6: Onset-Offset Comparisons
### Comparing Within-Detector Sensitivity to Onset vs. Offset

In [1]:
import copy
import warnings
from typing import Optional

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import scikit_posthocs as sp
import scipy.stats as st
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
from statsmodels import formula
from scipy.stats import wilcoxon, mannwhitneyu

import peyes

from analysis._article_results.lund2013._helpers import *
import analysis.statistics.channel_sdt as ch_sdt

pio.renderers.default = "browser"

  import scipy.linalg


### Set Constants

In [2]:
# --- analysis constants ---
THRESHOLD = 5       # temporal threshold for analyzing d'
METRIC = peyes.constants.D_PRIME_STR
ONSET_STR = peyes.constants.ONSET_STR
OFFSET_STR = peyes.constants.OFFSET_STR

ALPHA = 0.05
NUM_COMPARISONS = 6
# Bonferroni-corrected significance level (multiple comparisons) - used for the MW U-tests
ALPHA_BONF = ALPHA / NUM_COMPARISONS

# --- visualization constants ---
GRID_LINE_COLOR = "lightgray"
GRID_LINE_WIDTH = 1
ZERO_LINE_WIDTH = 2 * GRID_LINE_WIDTH

# faint traces for single measurements, opaque traces for their median
SINGLE_MEASURE_OPACITY = 0.2
MEDIAN_OPACITY = 1.0
SINGLE_MEASURE_LINE = {"width": 2 * GRID_LINE_WIDTH}
SINGLE_MEASURE_MARKER = {"symbol": "circle-open", "size": 6 * GRID_LINE_WIDTH}
MEADIAN_LINE = {"width": SINGLE_MEASURE_LINE["width"]}
MEADIAN_MARKER = {"symbol": "circle", "size": 2 * SINGLE_MEASURE_MARKER["size"]}

# fonts
FONT_FAMILY = "Calibri"
FONT_COLOR = "black"
TITLE_FONT = {"family": FONT_FAMILY, "size": 25, "color": FONT_COLOR}
SUBTITLE_FONT = {"family": FONT_FAMILY, "size": 20, "color": FONT_COLOR}
AXIS_LABEL_FONT = {"family": FONT_FAMILY, "size": 20, "color": FONT_COLOR}
AXIS_TICK_FONT = {"family": FONT_FAMILY, "size": 18, "color": FONT_COLOR}
AXIS_LABEL_STANDOFF = 3

In [3]:
def extract_subset(metric_df: pd.DataFrame, gt: str, pred: str) -> pd.DataFrame:
    """
    Selects the rows of `metric_df` that belong to a single (GT, PRED) annotator pair.
    The GT and PRED index levels are dropped from the returned DataFrame.
    """
    # narrow down by GT annotator first, then by PRED annotator
    rows_of_gt = metric_df.xs(gt, level=u.GT_STR, axis=0, drop_level=True)
    rows_of_pair = rows_of_gt.xs(pred, level=u.PRED_STR, axis=0, drop_level=True)
    return rows_of_pair


def human_annotators_figure(metric_df: pd.DataFrame, event_name: str) -> go.Figure:
    """
    Creates a grouped bar-plot of the mean (± s.e.m.) score per event boundary, for each human annotator when
    used as PRED against the other annotator as GT.

    :param metric_df: scores to plot; queried through `extract_subset` for the ("RA", "MN") and ("MN", "RA") pairs.
    :param event_name: name of the event, used only in the figure's title.
    :return: the bar-plot figure.
    """

    def _other_annotator(gt_name: str) -> str:
        """ Returns the annotator that serves as PRED when `gt_name` is the GT. """
        return "MN" if gt_name == "RA" else "RA"

    # stack both (GT, PRED) pairs, keyed by the GT annotator
    scores = pd.concat(
        [extract_subset(metric_df, "RA", "MN"), extract_subset(metric_df, "MN", "RA")],
        axis=0, keys=["RA", "MN"],
    )
    scores = scores.droplevel(peyes.constants.TRIAL_ID_STR)
    scores.index.name = u.GT_STR
    scores.columns.name = None

    # mean and s.e.m. per GT annotator, in long format (one row per GT x boundary)
    by_gt = scores.groupby(level=u.GT_STR)
    means = by_gt.mean().stack().rename("mean")
    sems = by_gt.sem().stack().rename("sem")
    summary = pd.concat([means, sems], axis=1).reset_index()
    summary = summary.rename(columns={"level_1": "boundary"})   # the stacked (unnamed) column level
    summary[u.PRED_STR] = summary[u.GT_STR].map(_other_annotator)

    fig = px.bar(
        summary,
        x=u.PRED_STR, y="mean", error_y="sem", color="boundary",
        barmode='group',
        hover_data=[u.GT_STR],
        category_orders={u.PRED_STR: ["MN", "RA"], "boundary": [ONSET_STR, OFFSET_STR]},
        labels={u.PRED_STR: "Pred. Annotator", u.GT_STR: "GT Annotator", "boundary": "Event Boundary", "mean": r"d'", },
    )

    # styling shared by both axes; only the grid- and zero-lines differ between them
    shared_axis_style = dict(
        title=dict(font=AXIS_LABEL_FONT, standoff=AXIS_LABEL_STANDOFF),
        showline=False,
        gridcolor=GRID_LINE_COLOR, gridwidth=GRID_LINE_WIDTH,
        zerolinecolor=GRID_LINE_COLOR, zerolinewidth=ZERO_LINE_WIDTH,
        tickfont=AXIS_TICK_FONT,
    )
    fig.update_xaxes(showgrid=False, zeroline=False, **shared_axis_style)
    fig.update_yaxes(showgrid=True, zeroline=True, **shared_axis_style)

    figure_title = f"{event_name.capitalize()} Boundary Sensitivity Comparison (Human Annotators)"
    fig.update_layout(
        title=dict(text=figure_title, font=TITLE_FONT),
        width=800, height=450,
        plot_bgcolor='rgba(0, 0, 0, 0)',
        margin=dict(l=0, r=0, t=50, b=0, pad=0),
        legend=dict(
            orientation="h", yanchor="top", xanchor="left", xref='paper', yref='paper', x=-0.05, y=1.04, itemwidth=30,
        ),
    )
    return fig


def multi_detector_figure(metric_df: pd.DataFrame, event_name: str) -> go.Figure:
    """
    Creates a figure with one subplot-row per GT annotator (`GT1`, `GT2`), comparing onset vs. offset scores
    within each PRED labeler.

    For every GT annotator and every PRED labeler found in `metric_df`, each row of the matching subset is drawn
    as a faint onset->offset line, and the per-column median is overlaid as an opaque line. An additional
    "all detectors" group pools every PRED labeler except the human annotators ("RA", "MN").
    (GT, PRED) pairs with no rows are skipped.

    :param metric_df: scores with the `u.GT_STR` and `u.PRED_STR` index levels and the `ONSET_STR` and
        `OFFSET_STR` columns.
    :param event_name: name of the event, used only in the figure's title.
    :return: the populated figure (its legend is configured but hidden).
    """

    all_detectors_key = "all detectors"     # pseudo-detector that pools all non-human PRED labelers

    def _rename_detector(det: str) -> str:
        """ Maps a labeler's internal name to the name displayed on the x-axis. """
        if det in [GT1, GT2]:
            return "2<sup>nd</sup> Ann."
        lowered = det.lower()
        if lowered == "remodnav":
            return "REMoDNaV"
        if lowered.startswith("i"):
            # hyphenate only the leading "i" (e.g. "idvt" -> "I-DVT"); `str.replace("i", "i-")` would also
            # hyphenate any later "i" and would silently skip an upper-case leading "I"
            return f"I-{det[1:].upper()}"
        if lowered == all_detectors_key:
            return "All Detectors"
        return det.upper()

    gt_annotators = [GT1, GT2]
    boundaries = [ONSET_STR, OFFSET_STR]
    fig = make_subplots(
        rows=len(gt_annotators), cols=1, shared_xaxes=True, shared_yaxes=False,
        subplot_titles=[f"GT: <i>{gt}</i>" for gt in gt_annotators],
        # x_title="Detector",
        vertical_spacing=0.005,
    )
    ordered_detectors = sorted(
        metric_df.index.get_level_values(u.PRED_STR).unique(), key=lambda dett: LABELER_PLOTTING_CONFIG[dett][0],
    )
    ordered_detectors.append(all_detectors_key)
    for r, gt in enumerate(gt_annotators):
        for pred in ordered_detectors:
            if pred == all_detectors_key:
                subset = metric_df.xs(gt, level=u.GT_STR, axis=0, drop_level=True)
                subset = subset.drop(["RA", "MN"], level=u.PRED_STR, axis=0, errors="ignore")    # all detectors, exclude 2nd annotator
                pred_color = u.DEFAULT_DISCRETE_COLORMAP[10]
            else:
                # a multi-level `xs` yields an empty frame (rather than raising) for a pair that has no rows
                subset = metric_df.xs((gt, pred), level=[u.GT_STR, u.PRED_STR], axis=0, drop_level=True)    # only specific detector
                pred_color = "#bab0ac" if pred in gt_annotators else LABELER_PLOTTING_CONFIG[pred][1]
            if subset.empty:
                continue
            # fix the column order so the y-values always line up with the (onset, offset) x-categories
            subset = subset[boundaries]
            pred_name = _rename_detector(pred)
            xs = [[pred_name, pred_name], list(boundaries)]

            # faint line per single measurement
            for onset, offset in subset.itertuples(index=False, name=None):
                fig.add_trace(
                    row=r + 1, col=1,
                    trace=go.Scatter(
                        x=xs, y=[onset, offset], mode='markers+lines',
                        name=pred_name, legendgroup=pred_name, showlegend=False,
                        marker={**SINGLE_MEASURE_MARKER, "color": pred_color},
                        line={**SINGLE_MEASURE_LINE, "color": pred_color},
                        opacity=SINGLE_MEASURE_OPACITY,
                    )
                )

            # opaque line for the median; only the first row contributes legend entries
            fig.add_trace(
                row=r + 1, col=1,
                trace=go.Scatter(
                    x=xs, y=subset.median().values, mode='markers+lines',
                    name=pred_name, legendgroup=pred_name, showlegend=(r == 0),
                    marker={**MEADIAN_MARKER, "color": pred_color},
                    line={**MEADIAN_LINE, "color": pred_color},
                    opacity=MEDIAN_OPACITY,
                )
            )

    # update layout
    fig.for_each_annotation(lambda ann: ann.update(
        font=SUBTITLE_FONT, textangle=0, xref='paper', xanchor="left", x=0.01 if ann.text.startswith("GT:") else 0.5,
    ))
    fig.update_xaxes(
        title=dict(font=AXIS_LABEL_FONT),
        showline=False,
        showgrid=False, gridcolor=GRID_LINE_COLOR, gridwidth=GRID_LINE_WIDTH,
        zeroline=False, zerolinecolor=GRID_LINE_COLOR, zerolinewidth=ZERO_LINE_WIDTH,
        tickfont=AXIS_TICK_FONT, ticks=None, dividercolor="rgba(0, 0, 0, 0)",
    )
    fig.update_yaxes(
        title=dict(text=r"$d'$", font=AXIS_LABEL_FONT, standoff=10),
        showline=False,
        showgrid=True, gridcolor=GRID_LINE_COLOR, gridwidth=GRID_LINE_WIDTH,
        zeroline=True, zerolinecolor=GRID_LINE_COLOR, zerolinewidth=ZERO_LINE_WIDTH,
        tickfont=AXIS_TICK_FONT,
    )
    fig.update_layout(
        title=dict(
            text=f"{event_name.capitalize()} Boundary Sensitivity", font=TITLE_FONT,
            xanchor="center", xref='paper', x=0.5,
        ),
        width=1200, height=500,
        # paper_bgcolor='rgba(0, 0, 0, 0)',
        plot_bgcolor='rgba(0, 0, 0, 0)',
        margin=dict(l=0, r=0, t=40, b=20, pad=0),
        legend=dict(
            font=AXIS_TICK_FONT,
            orientation="h", yanchor="top", xanchor="center", xref='paper', yref='paper', x=0.5, y=-0.125,
            visible=False,
        ),
    )
    return fig

## Saccade Onset vs. Offset
We compare the sensitivity ($d'$) to saccade onsets vs. offsets across all trials, using Wilcoxon's Signed-Rank test. We hypothesize that **saccade onset has higher $d'$ than saccade offset**.

## Load Data

In [4]:
ALTERNATIVE = "greater"  # alternative hypothesis for the Wilcoxon test
EVENT_LABEL = 2     # EventLabelEnum.SACCADE.value

# load the channel-SDT metrics of the saccade events
metrics = ch_sdt.load(
    dataset_name=DATASET_NAME, stimulus_type=STIMULUS_TYPE, output_dir=PROCESSED_DATA_DIR,
    label=EVENT_LABEL, threshold=THRESHOLD, channel_type=None,
)

# discard the metrics that are not used below, then the `threshold` index level
metrics.drop(index=['P', 'PP', 'N', 'TP'], level=peyes.constants.METRIC_STR, inplace=True)
metrics = metrics.droplevel('threshold')

# keep only `METRIC`, and transpose so that each row holds a single measurement
d_prime = metrics.xs(METRIC, level=peyes.constants.METRIC_STR, axis=0, drop_level=True)
d_prime = d_prime.T
d_prime.columns.name = None
d_prime

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,onset,offset
trial_id,gt,pred,Unnamed: 3_level_1,Unnamed: 4_level_1
25,RA,engbert,4.989576,1.777960
25,RA,idt,3.187557,0.881339
25,RA,ivt,4.935210,3.164184
25,RA,nh,1.792695,2.201913
25,RA,idvt,3.187557,0.881339
...,...,...,...,...
44,MN,nh,2.660013,1.251726
44,MN,idvt,1.707312,1.707312
44,MN,RA,5.091656,5.091656
44,MN,remodnav,4.807632,4.013900


### Human Annotator Sensitivity Comparison
Using each annotator (_RA_, _MN_) as GT and the other one as PRED, we compare PRED's sensitivity to **saccade** onset vs. offset in a within-trial design - using the Wilcoxon Signed-Rank test.

In [5]:
# --- GT: RA, PRED: MN ---
gt, pred = "RA", "MN"
gt_ra_pred_mn = extract_subset(d_prime, gt, pred)
raw_diffs = gt_ra_pred_mn[ONSET_STR] - gt_ra_pred_mn[OFFSET_STR]     # onset-minus-offset, per row
diffs = np.round(raw_diffs, decimals=3)
wilx_res = wilcoxon(diffs, zero_method="pratt", alternative=ALTERNATIVE)
W_mn, p_mn = wilx_res.statistic, wilx_res.pvalue
print(f"GT: {gt}\tPRED: {pred}\t::\tW={W_mn},\t\tp={p_mn:.5f},\tonset-offset diff: {diffs.mean():.2f} (±{diffs.sem():.2f} s.e.m.),\tN={len(diffs)}")

# --- GT: MN, PRED: RA ---
gt, pred = "MN", "RA"
gt_mn_pred_ra = extract_subset(d_prime, gt, pred)
raw_diffs = gt_mn_pred_ra[ONSET_STR] - gt_mn_pred_ra[OFFSET_STR]
diffs = np.round(raw_diffs, decimals=3)
wilx_res = wilcoxon(diffs, zero_method="pratt", alternative=ALTERNATIVE)
W_mn, p_mn = wilx_res.statistic, wilx_res.pvalue
print(f"GT: {gt}\tPRED: {pred}\t::\tW={W_mn},\t\tp={p_mn:.5f},\tonset-offset diff: {diffs.mean():.2f} (±{diffs.sem():.2f} s.e.m.),\tN={len(diffs)}")

# bar-plot of the human annotators' mean scores
# (for a transparent paper background, add: fig.update_layout(paper_bgcolor='rgba(0, 0, 0, 0)'))
fig = human_annotators_figure(d_prime, "saccade")
fig.show()

GT: RA	PRED: MN	::	W=96.0,		p=0.00231,	onset-offset diff: 0.88 (±0.25 s.e.m.),	N=14
GT: MN	PRED: RA	::	W=99.0,		p=0.00126,	onset-offset diff: 0.94 (±0.20 s.e.m.),	N=14


### Within-Detector Comparisons (across detectors)
We compare **saccade** onset vs. offset $d'$ scores using a within-detector comparison across all detectors. We use Wilcoxon's Signed-Rank test on the difference of $d'$ scores, across all trials and detectors but split between annotator _RA_ and _MN_. We hypothesize that **saccade onset has higher $d'$ than saccade offset**.

In [6]:
# extract the data
d_prime_no_humans = d_prime.drop(["MN", "RA"], level=u.PRED_STR, axis=0)    # exclude humans from PRED column (only include algs)
all_diffs = np.round(d_prime_no_humans[ONSET_STR] - d_prime_no_humans[OFFSET_STR], decimals=3)  # calculate onset-offset differences

# Per-GT score tables are extracted once and reused in both printouts. Keeping the `.xs("RA", ...)` calls out of
# the f-strings also avoids nesting double quotes inside a double-quoted f-string, which only parses on Python >= 3.12.
gt_ra_scores = d_prime_no_humans.xs("RA", level=u.GT_STR)
gt_mn_scores = d_prime_no_humans.xs("MN", level=u.GT_STR)
print(f"Median Onset:\tGT-RA:{gt_ra_scores[ONSET_STR].median():.3f}\tGT-MN:{gt_mn_scores[ONSET_STR].median():.3f}")
print(f"Median Offset:\tGT-RA:{gt_ra_scores[OFFSET_STR].median():.3f}\tGT-MN:{gt_mn_scores[OFFSET_STR].median():.3f}")

# Wilcoxon Signed-Rank test on the onset-offset differences, separately for each GT annotator
gt = "RA"
diffs = all_diffs.xs(gt, level=u.GT_STR)
wilx_res = wilcoxon(diffs.values, alternative=ALTERNATIVE, zero_method="pratt")
W_ra, p_ra = wilx_res
print(f"GT: {gt}\t::\tW={W_ra},\t\tp={p_ra:.8f},\tonset-offset diff: {diffs.mean():.2f} (±{diffs.sem():.2f} s.e.m.),\tN={len(diffs)}")

gt = "MN"
diffs = all_diffs.xs(gt, level=u.GT_STR)
wilx_res = wilcoxon(diffs.values, alternative=ALTERNATIVE, zero_method="pratt")
W_mn, p_mn = wilx_res
print(f"GT: {gt}\t::\tW={W_mn},\t\tp={p_mn:.8f},\tonset-offset diff: {diffs.mean():.2f} (±{diffs.sem():.2f} s.e.m.),\tN={len(diffs)}")

# create the figure
fig = multi_detector_figure(d_prime, "saccade")    # or use `d_prime_no_humans` to exclude human annotators from the plot
fig.update_layout(
    title=None,
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

# save the figure both as an image and as a JSON file
fig.write_image(os.path.join(FIGURES_DIR, "fig6-saccade.png"), scale=3)
fig.write_json(os.path.join(FIGURES_DIR, "fig6-saccade.json"))
fig.show()

Median Onset:	GT-RA:3.167	GT-MN:3.275
Median Offset:	GT-RA:1.900	GT-MN:1.603
GT: RA	::	W=9348.0,		p=0.00000000,	onset-offset diff: 1.49 (±0.10 s.e.m.),	N=140
GT: MN	::	W=4671.0,		p=0.00000000,	onset-offset diff: 1.56 (±0.12 s.e.m.),	N=98


### Pairwise Comparisons: Engbert vs. REMoDNaV
We compare **saccade** detection sensitivity ($d'$) between the _Engbert_ and _REMoDNaV_ detectors, using the Mann-Whitney U-test, **to determine whether the ability to detect PSOs at the end of saccades (REMoDNaV) improves saccade detection sensitivity compared to Engbert** (the null hypothesis is that PSO detection neither improves nor impedes saccade detection sensitivity).
We perform three comparisons for each GT annotator (six in total; significance is assessed against the Bonferroni-corrected threshold `ALPHA_BONF = ALPHA / NUM_COMPARISONS`):
1. **Onset**: determine which has higher onset detection sensitivity.
2. **Offset**: determine which has higher saccade offset detection sensitivity.
3. **Diffs**: compare the absolute differences of $d'$ scores between onset and offset, for both detectors (i.e., $|\Delta d'| := |d'_{onset} - d'_{offset}|$). This is a diff-of-diffs comparison, comparing the **stability** of detection sensitivity.

In [7]:
# The Mann-Whitney U-tests below read `ALTERNATIVE`; this replaces the one-sided value set when the data was loaded.
ALTERNATIVE = "two-sided"

##### GT Annotator: _RA_

In [8]:
# Engbert vs. REMoDNaV, scored against the GT annotator below.
# Each of the three comparisons is a Mann-Whitney U-test judged against the Bonferroni-corrected `ALPHA_BONF`.
GT = "RA"

# per-detector scores for this GT annotator
remodnav_scores = extract_subset(d_prime, GT, "remodnav")
engbert_scores = extract_subset(d_prime, GT, "engbert")

# (1) Onset Comparison: onset scores of REMoDNaV vs. Engbert
print("##################")
print(f"(1) Onset Comparison ({GT})")
remodnav_onset, engbert_onset = remodnav_scores[ONSET_STR], engbert_scores[ONSET_STR]
print(f"REMoDNaV Onset:\t{remodnav_onset.mean():.2f} (±{remodnav_onset.sem():.2f} s.e.m.),\tN={len(remodnav_onset)}")
print(f"Engbert Onset:\t{engbert_onset.mean():.2f} (±{engbert_onset.sem():.2f} s.e.m.),\tN={len(engbert_onset)}")
posthoc_onset = mannwhitneyu(remodnav_onset, engbert_onset, alternative=ALTERNATIVE)
print(f"Onset Results:\tU={posthoc_onset.statistic:.1f},\tp={posthoc_onset.pvalue:.4f}")
print("Significant!" if posthoc_onset.pvalue <= ALPHA_BONF else "Not significant.")

# (2) Offset Comparison: offset scores of REMoDNaV vs. Engbert
print("\n##################")
print(f"(2) Offset Comparison ({GT})")
remodnav_offset, engbert_offset = remodnav_scores[OFFSET_STR], engbert_scores[OFFSET_STR]
print(f"REMoDNaV Offset:\t{remodnav_offset.mean():.2f} (±{remodnav_offset.sem():.2f} s.e.m.),\tN={len(remodnav_offset)}")
print(f"Engbert Offset:\t{engbert_offset.mean():.2f} (±{engbert_offset.sem():.2f} s.e.m.),\tN={len(engbert_offset)}")
posthoc_offset = mannwhitneyu(remodnav_offset, engbert_offset, alternative=ALTERNATIVE)
print(f"Offset Results:\tU={posthoc_offset.statistic:.1f},\tp={posthoc_offset.pvalue:.4f}")
print("Significant!" if posthoc_offset.pvalue <= ALPHA_BONF else "Not significant.")

# (3) Diffs Comparison: absolute onset-offset gap of each detector (a smaller gap means more similar scores)
print("\n##################")
print(f"(3) Diffs Comparison ({GT})")
remodnav_diffs = abs(remodnav_onset - remodnav_offset)
engbert_diffs = abs(engbert_onset - engbert_offset)
print(f"REMoDNaV Abs-Diffs:\t{remodnav_diffs.mean():.2f} (±{remodnav_diffs.sem():.2f} s.e.m.),\tN={len(remodnav_diffs)}")
print(f"Engbert Abs-Diffs:\t{engbert_diffs.mean():.2f} (±{engbert_diffs.sem():.2f} s.e.m.),\tN={len(engbert_diffs)}")

# `diff_of_diffs` (an index-aligned subtraction) only feeds the descriptive mean/s.e.m. in the printout;
# the U-test itself is computed on the two samples of absolute differences.
diff_of_diffs = np.round(remodnav_diffs - engbert_diffs, decimals=3)
posthoc_diffs = mannwhitneyu(remodnav_diffs, engbert_diffs, alternative=ALTERNATIVE)
print(f"Diffs Results:\tU={posthoc_diffs.statistic:.1f},\tp={posthoc_diffs.pvalue:.4f},\tdiff={diff_of_diffs.mean():.2f} (±{diff_of_diffs.sem():.2f} s.e.m.),\tN={len(diff_of_diffs)}")
print("Significant!" if posthoc_diffs.pvalue <= ALPHA_BONF else "Not significant.")

##################
(1) Onset Comparison (RA)
REMoDNaV Onset:	4.58 (±0.14 s.e.m.),	N=20
Engbert Onset:	4.36 (±0.17 s.e.m.),	N=20
Onset Results:	U=209.0,	p=0.8181
Not significant.

##################
(2) Offset Comparison (RA)
REMoDNaV Offset:	2.87 (±0.16 s.e.m.),	N=20
Engbert Offset:	1.77 (±0.21 s.e.m.),	N=20
Offset Results:	U=336.0,	p=0.0002
Significant!

##################
(3) Diffs Comparison (RA)
REMoDNaV Abs-Diffs:	1.71 (±0.22 s.e.m.),	N=20
Engbert Abs-Diffs:	2.59 (±0.23 s.e.m.),	N=20
Diffs Results:	U=106.0,	p=0.0114,	diff=-0.88 (±0.15 s.e.m.),	N=20
Not significant.


##### GT Annotator: _MN_

In [9]:
# Engbert vs. REMoDNaV, scored against the GT annotator below.
# Each of the three comparisons is a Mann-Whitney U-test judged against the Bonferroni-corrected `ALPHA_BONF`.
GT = "MN"

# per-detector scores for this GT annotator
remodnav_scores = extract_subset(d_prime, GT, "remodnav")
engbert_scores = extract_subset(d_prime, GT, "engbert")

# (1) Onset Comparison: onset scores of REMoDNaV vs. Engbert
print("##################")
print(f"(1) Onset Comparison ({GT})")
remodnav_onset, engbert_onset = remodnav_scores[ONSET_STR], engbert_scores[ONSET_STR]
print(f"REMoDNaV Onset:\t{remodnav_onset.mean():.2f} (±{remodnav_onset.sem():.2f} s.e.m.),\tN={len(remodnav_onset)}")
print(f"Engbert Onset:\t{engbert_onset.mean():.2f} (±{engbert_onset.sem():.2f} s.e.m.),\tN={len(engbert_onset)}")
posthoc_onset = mannwhitneyu(remodnav_onset, engbert_onset, alternative=ALTERNATIVE)
print(f"Onset Results:\tU={posthoc_onset.statistic:.1f},\tp={posthoc_onset.pvalue:.4f}")
print("Significant!" if posthoc_onset.pvalue <= ALPHA_BONF else "Not significant.")

# (2) Offset Comparison: offset scores of REMoDNaV vs. Engbert
print("\n##################")
print(f"(2) Offset Comparison ({GT})")
remodnav_offset, engbert_offset = remodnav_scores[OFFSET_STR], engbert_scores[OFFSET_STR]
print(f"REMoDNaV Offset:\t{remodnav_offset.mean():.2f} (±{remodnav_offset.sem():.2f} s.e.m.),\tN={len(remodnav_offset)}")
print(f"Engbert Offset:\t{engbert_offset.mean():.2f} (±{engbert_offset.sem():.2f} s.e.m.),\tN={len(engbert_offset)}")
posthoc_offset = mannwhitneyu(remodnav_offset, engbert_offset, alternative=ALTERNATIVE)
print(f"Offset Results:\tU={posthoc_offset.statistic:.1f},\tp={posthoc_offset.pvalue:.5f}")
print("Significant!" if posthoc_offset.pvalue <= ALPHA_BONF else "Not significant.")

# (3) Diffs Comparison: absolute onset-offset gap of each detector (a smaller gap means more similar scores)
print("\n##################")
print(f"(3) Diffs Comparison ({GT})")
remodnav_diffs = abs(remodnav_onset - remodnav_offset)
engbert_diffs = abs(engbert_onset - engbert_offset)
print(f"REMoDNaV Abs-Diffs:\t{remodnav_diffs.mean():.2f} (±{remodnav_diffs.sem():.2f} s.e.m.),\tN={len(remodnav_diffs)}")
print(f"Engbert Abs-Diffs:\t{engbert_diffs.mean():.2f} (±{engbert_diffs.sem():.2f} s.e.m.),\tN={len(engbert_diffs)}")

# `diff_of_diffs` (an index-aligned subtraction) only feeds the descriptive mean/s.e.m. in the printout;
# the U-test itself is computed on the two samples of absolute differences.
diff_of_diffs = np.round(remodnav_diffs - engbert_diffs, decimals=3)
posthoc_diffs = mannwhitneyu(remodnav_diffs, engbert_diffs, alternative=ALTERNATIVE)
print(f"Diffs Results:\tU={posthoc_diffs.statistic:.1f},\tp={posthoc_diffs.pvalue:.4f},\tdiff={diff_of_diffs.mean():.2f} (±{diff_of_diffs.sem():.2f} s.e.m.),\tN={len(diff_of_diffs)}")
print("Significant!" if posthoc_diffs.pvalue <= ALPHA_BONF else "Not significant.")

##################
(1) Onset Comparison (MN)
REMoDNaV Onset:	4.59 (±0.15 s.e.m.),	N=14
Engbert Onset:	4.23 (±0.20 s.e.m.),	N=14
Onset Results:	U=119.0,	p=0.3462
Not significant.

##################
(2) Offset Comparison (MN)
REMoDNaV Offset:	2.88 (±0.21 s.e.m.),	N=14
Engbert Offset:	1.35 (±0.18 s.e.m.),	N=14
Offset Results:	U=187.0,	p=0.00005
Significant!

##################
(3) Diffs Comparison (MN)
REMoDNaV Abs-Diffs:	1.71 (±0.26 s.e.m.),	N=14
Engbert Abs-Diffs:	2.88 (±0.17 s.e.m.),	N=14
Diffs Results:	U=28.0,	p=0.0014,	diff=-1.17 (±0.22 s.e.m.),	N=14
Significant!


## Fixation Onset vs. Offset
We compare the sensitivity ($d'$) to fixation onsets vs. offsets across all trials, using Wilcoxon's Signed-Rank test. We hypothesize that **fixation onset has lower $d'$ than fixation offset**.

## Load Data

In [10]:
ALTERNATIVE = "less"  # alternative hypothesis for the Wilcoxon test
EVENT_LABEL = 1     # EventLabelEnum.FIXATION.value

# load the channel-SDT metrics of the fixation events
metrics = ch_sdt.load(
    dataset_name=DATASET_NAME, stimulus_type=STIMULUS_TYPE, output_dir=PROCESSED_DATA_DIR,
    label=EVENT_LABEL, threshold=THRESHOLD, channel_type=None,
)

# discard the metrics that are not used below, then the `threshold` index level
metrics.drop(index=['P', 'PP', 'N', 'TP'], level=peyes.constants.METRIC_STR, inplace=True)
metrics = metrics.droplevel('threshold')

# keep only `METRIC`, and transpose so that each row holds a single measurement
d_prime = metrics.xs(METRIC, level=peyes.constants.METRIC_STR, axis=0, drop_level=True)
d_prime = d_prime.T
d_prime.columns.name = None
d_prime

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,onset,offset
trial_id,gt,pred,Unnamed: 3_level_1,Unnamed: 4_level_1
25,RA,engbert,3.997164,5.048805
25,RA,idt,-0.051561,2.777103
25,RA,ivt,2.174709,3.598462
25,RA,nh,2.542822,1.872691
25,RA,idvt,-0.051561,2.777103
...,...,...,...,...
44,MN,nh,2.410594,3.047152
44,MN,idvt,-0.508018,2.545828
44,MN,RA,3.759736,4.011339
44,MN,remodnav,1.618094,1.618094


### Human Annotator Sensitivity Comparison
Using each annotator (_RA_, _MN_) as GT and the other one as PRED, we compare PRED's sensitivity to **fixation** onset vs. offset in a within-trial design - using the Wilcoxon Signed-Rank test.

In [11]:
# --- GT: RA, PRED: MN ---
gt, pred = "RA", "MN"
gt_ra_pred_mn = extract_subset(d_prime, gt, pred)
raw_diffs = gt_ra_pred_mn[ONSET_STR] - gt_ra_pred_mn[OFFSET_STR]     # onset-minus-offset, per row
diffs = np.round(raw_diffs, decimals=3)
wilx_res = wilcoxon(diffs, zero_method="pratt", alternative=ALTERNATIVE)
W_mn, p_mn = wilx_res.statistic, wilx_res.pvalue
print(f"GT: {gt}\tPRED: {pred}\t::\tW={W_mn},\t\tp={p_mn:.5f},\tonset-offset diff: {diffs.mean():.2f} (±{diffs.sem():.2f} s.e.m.),\tN={len(diffs)}")

# --- GT: MN, PRED: RA ---
gt, pred = "MN", "RA"
gt_mn_pred_ra = extract_subset(d_prime, gt, pred)
raw_diffs = gt_mn_pred_ra[ONSET_STR] - gt_mn_pred_ra[OFFSET_STR]
diffs = np.round(raw_diffs, decimals=3)
wilx_res = wilcoxon(diffs, zero_method="pratt", alternative=ALTERNATIVE)
W_mn, p_mn = wilx_res.statistic, wilx_res.pvalue
print(f"GT: {gt}\tPRED: {pred}\t::\tW={W_mn},\t\tp={p_mn:.5f},\tonset-offset diff: {diffs.mean():.2f} (±{diffs.sem():.2f} s.e.m.),\tN={len(diffs)}")

# bar-plot of the human annotators' mean scores
fig = human_annotators_figure(d_prime, "fixation")
fig.show()

GT: RA	PRED: MN	::	W=0.0,		p=0.00055,	onset-offset diff: -1.57 (±0.22 s.e.m.),	N=14
GT: MN	PRED: RA	::	W=0.0,		p=0.00055,	onset-offset diff: -1.36 (±0.20 s.e.m.),	N=14


### Within-Detector Comparisons (across detectors)
We compare **fixation** onset vs. offset $d'$ scores using a within-detector comparison across all detectors. We use Wilcoxon's Signed-Rank test on the difference of $d'$ scores, across all trials and detectors but split between annotator _RA_ and _MN_. We hypothesize that **fixation onset has lower $d'$ than fixation offset**.

In [12]:
# extract the data
d_prime_no_humans = d_prime.drop(["MN", "RA"], level=u.PRED_STR, axis=0)    # exclude humans from PRED column (only include algs)
all_diffs = np.round(d_prime_no_humans[ONSET_STR] - d_prime_no_humans[OFFSET_STR], decimals=3)  # calculate onset-offset differences

# Per-GT score tables are extracted once and reused in both printouts. Keeping the `.xs("RA", ...)` calls out of
# the f-strings also avoids nesting double quotes inside a double-quoted f-string, which only parses on Python >= 3.12.
gt_ra_scores = d_prime_no_humans.xs("RA", level=u.GT_STR)
gt_mn_scores = d_prime_no_humans.xs("MN", level=u.GT_STR)
print(f"Median Onset:\tGT-RA:{gt_ra_scores[ONSET_STR].median():.3f}\tGT-MN:{gt_mn_scores[ONSET_STR].median():.3f}")
print(f"Median Offset:\tGT-RA:{gt_ra_scores[OFFSET_STR].median():.3f}\tGT-MN:{gt_mn_scores[OFFSET_STR].median():.3f}")

# Wilcoxon Signed-Rank test on the onset-offset differences, separately for each GT annotator
gt = "RA"
diffs = all_diffs.xs(gt, level=u.GT_STR)
wilx_res = wilcoxon(diffs.values, alternative=ALTERNATIVE, zero_method="pratt")
W_ra, p_ra = wilx_res
print(f"GT: {gt}\t::\tW={W_ra},\t\tp={p_ra:.8f},\tonset-offset diff: {diffs.mean():.2f} (±{diffs.sem():.2f} s.e.m.),\tN={len(diffs)}")

gt = "MN"
diffs = all_diffs.xs(gt, level=u.GT_STR)
wilx_res = wilcoxon(diffs.values, alternative=ALTERNATIVE, zero_method="pratt")
W_mn, p_mn = wilx_res
print(f"GT: {gt}\t::\tW={W_mn},\t\tp={p_mn:.8f},\tonset-offset diff: {diffs.mean():.2f} (±{diffs.sem():.2f} s.e.m.),\tN={len(diffs)}")

# create the figure
fig = multi_detector_figure(d_prime, "fixation")    # or use `d_prime_no_humans` to exclude human annotators from the plot
fig.update_layout(
    title=None,
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

# save the figure both as an image and as a JSON file
fig.write_image(os.path.join(FIGURES_DIR, "fig6-fixation.png"), scale=3)
fig.write_json(os.path.join(FIGURES_DIR, "fig6-fixation.json"))
fig.show()

Median Onset:	GT-RA:1.983	GT-MN:1.950
Median Offset:	GT-RA:2.777	GT-MN:2.791
GT: RA	::	W=710.5,		p=0.00000000,	onset-offset diff: -1.29 (±0.10 s.e.m.),	N=140
GT: MN	::	W=262.0,		p=0.00000000,	onset-offset diff: -1.38 (±0.12 s.e.m.),	N=98


### Pairwise Comparisons: Engbert vs. REMoDNaV
We compare **fixation** detection sensitivity ($d'$) between the _Engbert_ and _REMoDNaV_ detectors, using the Mann-Whitney U-test, **to determine whether the ability to detect PSOs at the start of fixations (REMoDNaV) improves fixation detection sensitivity compared to Engbert** (the null hypothesis is that PSO detection neither improves nor impedes fixation detection sensitivity).
We perform three comparisons for each GT annotator (six in total; significance is assessed against the Bonferroni-corrected threshold `ALPHA_BONF = ALPHA / NUM_COMPARISONS`):
1. **Onset**: determine which has higher onset detection sensitivity.
2. **Offset**: determine which has higher fixation offset detection sensitivity.
3. **Diffs**: compare the absolute differences of $d'$ scores between onset and offset, for both detectors (i.e., $|\Delta d'| := |d'_{onset} - d'_{offset}|$). This is a diff-of-diffs comparison, comparing the **stability** of detection sensitivity.

In [13]:
# The Mann-Whitney U-tests below read `ALTERNATIVE`; this replaces the one-sided value set when the data was loaded.
ALTERNATIVE = "two-sided"

##### GT Annotator: _RA_

In [14]:
# Engbert vs. REMoDNaV, scored against the GT annotator below.
# Each of the three comparisons is a Mann-Whitney U-test judged against the Bonferroni-corrected `ALPHA_BONF`.
GT = "RA"

# per-detector scores for this GT annotator
remodnav_scores = extract_subset(d_prime, GT, "remodnav")
engbert_scores = extract_subset(d_prime, GT, "engbert")

# (1) Onset Comparison: onset scores of REMoDNaV vs. Engbert
print("##################")
print(f"(1) Onset Comparison ({GT})")
remodnav_onset, engbert_onset = remodnav_scores[ONSET_STR], engbert_scores[ONSET_STR]
print(f"REMoDNaV Onset:\t{remodnav_onset.mean():.2f} (±{remodnav_onset.sem():.2f} s.e.m.),\tN={len(remodnav_onset)}")
print(f"Engbert Onset:\t{engbert_onset.mean():.2f} (±{engbert_onset.sem():.2f} s.e.m.),\tN={len(engbert_onset)}")
posthoc_onset = mannwhitneyu(remodnav_onset, engbert_onset, alternative=ALTERNATIVE)
print(f"Onset Results:\tU={posthoc_onset.statistic:.1f},\tp={posthoc_onset.pvalue:.4f}")
print("Significant!" if posthoc_onset.pvalue <= ALPHA_BONF else "Not significant.")

# (2) Offset Comparison: offset scores of REMoDNaV vs. Engbert
print("\n##################")
print(f"(2) Offset Comparison ({GT})")
remodnav_offset, engbert_offset = remodnav_scores[OFFSET_STR], engbert_scores[OFFSET_STR]
print(f"REMoDNaV Offset:\t{remodnav_offset.mean():.2f} (±{remodnav_offset.sem():.2f} s.e.m.),\tN={len(remodnav_offset)}")
print(f"Engbert Offset:\t{engbert_offset.mean():.2f} (±{engbert_offset.sem():.2f} s.e.m.),\tN={len(engbert_offset)}")
posthoc_offset = mannwhitneyu(remodnav_offset, engbert_offset, alternative=ALTERNATIVE)
print(f"Offset Results:\tU={posthoc_offset.statistic:.1f},\tp={posthoc_offset.pvalue:.4f}")
print("Significant!" if posthoc_offset.pvalue <= ALPHA_BONF else "Not significant.")

# (3) Diffs Comparison: absolute onset-offset gap of each detector (a smaller gap means more similar scores)
print("\n##################")
print(f"(3) Diffs Comparison ({GT})")
remodnav_diffs = abs(remodnav_onset - remodnav_offset)
engbert_diffs = abs(engbert_onset - engbert_offset)
print(f"REMoDNaV Abs-Diffs:\t{remodnav_diffs.mean():.2f} (±{remodnav_diffs.sem():.2f} s.e.m.),\tN={len(remodnav_diffs)}")
print(f"Engbert Abs-Diffs:\t{engbert_diffs.mean():.2f} (±{engbert_diffs.sem():.2f} s.e.m.),\tN={len(engbert_diffs)}")

# `diff_of_diffs` (an index-aligned subtraction) only feeds the descriptive mean/s.e.m. in the printout;
# the U-test itself is computed on the two samples of absolute differences.
diff_of_diffs = np.round(remodnav_diffs - engbert_diffs, decimals=3)
posthoc_diffs = mannwhitneyu(remodnav_diffs, engbert_diffs, alternative=ALTERNATIVE)
print(f"Diffs Results:\tU={posthoc_diffs.statistic:.1f},\tp={posthoc_diffs.pvalue:.4f},\tdiff={diff_of_diffs.mean():.2f} (±{diff_of_diffs.sem():.2f} s.e.m.),\tN={len(diff_of_diffs)}")
print("Significant!" if posthoc_diffs.pvalue <= ALPHA_BONF else "Not significant.")

##################
(1) Onset Comparison (RA)
REMoDNaV Onset:	2.31 (±0.13 s.e.m.),	N=20
Engbert Onset:	3.01 (±0.19 s.e.m.),	N=20
Onset Results:	U=102.0,	p=0.0084
Not significant.

##################
(2) Offset Comparison (RA)
REMoDNaV Offset:	2.55 (±0.16 s.e.m.),	N=20
Engbert Offset:	4.34 (±0.14 s.e.m.),	N=20
Offset Results:	U=14.0,	p=0.0000
Significant!

##################
(3) Diffs Comparison (RA)
REMoDNaV Abs-Diffs:	0.56 (±0.08 s.e.m.),	N=20
Engbert Abs-Diffs:	1.33 (±0.20 s.e.m.),	N=20
Diffs Results:	U=93.0,	p=0.0040,	diff=-0.77 (±0.24 s.e.m.),	N=20
Significant!


##### GT Annotator: _MN_

In [15]:
# Engbert vs. REMoDNaV, scored against the GT annotator below.
# Each of the three comparisons is a Mann-Whitney U-test judged against the Bonferroni-corrected `ALPHA_BONF`.
GT = "MN"

# per-detector scores for this GT annotator
remodnav_scores = extract_subset(d_prime, GT, "remodnav")
engbert_scores = extract_subset(d_prime, GT, "engbert")

# (1) Onset Comparison: onset scores of REMoDNaV vs. Engbert
print("##################")
print(f"(1) Onset Comparison ({GT})")
remodnav_onset, engbert_onset = remodnav_scores[ONSET_STR], engbert_scores[ONSET_STR]
print(f"REMoDNaV Onset:\t{remodnav_onset.mean():.2f} (±{remodnav_onset.sem():.2f} s.e.m.),\tN={len(remodnav_onset)}")
print(f"Engbert Onset:\t{engbert_onset.mean():.2f} (±{engbert_onset.sem():.2f} s.e.m.),\tN={len(engbert_onset)}")
posthoc_onset = mannwhitneyu(remodnav_onset, engbert_onset, alternative=ALTERNATIVE)
print(f"Onset Results:\tU={posthoc_onset.statistic:.1f},\tp={posthoc_onset.pvalue:.4f}")
print("Significant!" if posthoc_onset.pvalue <= ALPHA_BONF else "Not significant.")

# (2) Offset Comparison: offset scores of REMoDNaV vs. Engbert
print("\n##################")
print(f"(2) Offset Comparison ({GT})")
remodnav_offset, engbert_offset = remodnav_scores[OFFSET_STR], engbert_scores[OFFSET_STR]
print(f"REMoDNaV Offset:\t{remodnav_offset.mean():.2f} (±{remodnav_offset.sem():.2f} s.e.m.),\tN={len(remodnav_offset)}")
print(f"Engbert Offset:\t{engbert_offset.mean():.2f} (±{engbert_offset.sem():.2f} s.e.m.),\tN={len(engbert_offset)}")
posthoc_offset = mannwhitneyu(remodnav_offset, engbert_offset, alternative=ALTERNATIVE)
print(f"Offset Results:\tU={posthoc_offset.statistic:.1f},\tp={posthoc_offset.pvalue:.5f}")
print("Significant!" if posthoc_offset.pvalue <= ALPHA_BONF else "Not significant.")

# (3) Diffs Comparison: absolute onset-offset gap of each detector (a smaller gap means more similar scores)
print("\n##################")
print(f"(3) Diffs Comparison ({GT})")
remodnav_diffs = abs(remodnav_onset - remodnav_offset)
engbert_diffs = abs(engbert_onset - engbert_offset)
print(f"REMoDNaV Abs-Diffs:\t{remodnav_diffs.mean():.2f} (±{remodnav_diffs.sem():.2f} s.e.m.),\tN={len(remodnav_diffs)}")
print(f"Engbert Abs-Diffs:\t{engbert_diffs.mean():.2f} (±{engbert_diffs.sem():.2f} s.e.m.),\tN={len(engbert_diffs)}")

# `diff_of_diffs` (an index-aligned subtraction) only feeds the descriptive mean/s.e.m. in the printout;
# the U-test itself is computed on the two samples of absolute differences.
diff_of_diffs = np.round(remodnav_diffs - engbert_diffs, decimals=3)
posthoc_diffs = mannwhitneyu(remodnav_diffs, engbert_diffs, alternative=ALTERNATIVE)
print(f"Diffs Results:\tU={posthoc_diffs.statistic:.1f},\tp={posthoc_diffs.pvalue:.4f},\tdiff={diff_of_diffs.mean():.2f} (±{diff_of_diffs.sem():.2f} s.e.m.),\tN={len(diff_of_diffs)}")
print("Significant!" if posthoc_diffs.pvalue <= ALPHA_BONF else "Not significant.")

##################
(1) Onset Comparison (MN)
REMoDNaV Onset:	2.34 (±0.14 s.e.m.),	N=14
Engbert Onset:	2.84 (±0.17 s.e.m.),	N=14
Onset Results:	U=49.0,	p=0.0258
Not significant.

##################
(2) Offset Comparison (MN)
REMoDNaV Offset:	2.57 (±0.21 s.e.m.),	N=14
Engbert Offset:	4.39 (±0.20 s.e.m.),	N=14
Offset Results:	U=12.0,	p=0.00009
Significant!

##################
(3) Diffs Comparison (MN)
REMoDNaV Abs-Diffs:	0.54 (±0.13 s.e.m.),	N=14
Engbert Abs-Diffs:	1.54 (±0.19 s.e.m.),	N=14
Diffs Results:	U=23.0,	p=0.0006,	diff=-1.01 (±0.24 s.e.m.),	N=14
Significant!
