# Results p. 1
## Sample-by-Sample Evaluation 

In [1]:
import copy

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

from analysis._article_results.hfc._helpers import *
import analysis.statistics.sample_metrics as sm

# Use plotly's "browser" renderer as the default for figures shown from this notebook
pio.renderers.default = "browser"

  import scipy.linalg


## Load Data

In [2]:
# Load the global (all-labels) sample-level metrics
global_measures = sm.load_global_metrics(
    DATASET_NAME, PROCESSED_DATA_DIR, stimulus_type=STIMULUS_TYPE, metric=None, iteration=1,
)

# Accuracy and balanced accuracy are excluded from this analysis
global_measures = global_measures.drop(
    index=[peyes.constants.ACCURACY_STR, peyes.constants.BALANCED_ACCURACY_STR],
)

# Keep only the two reference annotators (GT1, GT2) and the detectors:
# any other annotator is removed both as ground truth and as prediction
labelers_to_keep = (GT1, GT2, *DETECTORS.keys())
annotators_to_drop = [
    ann
    for ann in global_measures.columns.get_level_values(u.PRED_STR).unique()
    if ann not in labelers_to_keep
]
for column_level in (u.GT_STR, u.PRED_STR):
    global_measures = global_measures.drop(columns=annotators_to_drop, level=column_level)

global_measures

trial_id,1,1,1,1,1,1,1,1,1,1,...,10,10,10,10,10,10,10,10,10,10
gt,MN,MN,MN,MN,MN,MN,MN,MN,RA,RA,...,MN,MN,RA,RA,RA,RA,RA,RA,RA,RA
pred,remodnav,idvt,RA,engbert,ivt,nh,idt,ivvt,remodnav,idvt,...,idt,ivvt,remodnav,idvt,engbert,ivt,nh,idt,ivvt,MN
cohen's_kappa,0.155296,0.501069,0.790362,0.705601,0.074064,0.749462,0.28242,0.020994,0.165182,0.558357,...,0.418543,0.016072,0.601088,0.427103,0.794106,0.084939,0.783373,0.427103,0.019527,0.927445
complement_nld,0.566829,0.885213,0.941607,0.919849,0.35413,0.94294,0.850577,0.217362,0.589476,0.89254,...,0.886989,0.226243,0.889876,0.893428,0.949378,0.381217,0.958481,0.893428,0.219805,0.981572
mcc,0.220215,0.509764,0.791891,0.706488,0.173524,0.753124,0.30372,0.100405,0.224352,0.575647,...,0.501016,0.067917,0.605847,0.502959,0.795332,0.191513,0.783408,0.502959,0.084991,0.927873


In [3]:
# Load the signal-detection metrics for label 1 (presumably the fixation label, given the variable name)
fixation_sdt = sm.load_sdt(
    DATASET_NAME, PROCESSED_DATA_DIR, label=1, stimulus_type=STIMULUS_TYPE, metric=None, iteration=1,
)

# Keep only the d' and f1 rows and prefix their names with "fixation_"
sdt_rows_to_keep = [peyes.constants.D_PRIME_STR, peyes.constants.F1_STR]
fixation_sdt = fixation_sdt.loc[sdt_rows_to_keep].rename(index=lambda idx: f"fixation_{idx}")

# Keep only the two reference annotators (GT1, GT2) and the detectors:
# any other annotator is removed both as ground truth and as prediction
labelers_to_keep = (GT1, GT2, *DETECTORS.keys())
annotators_to_drop = [
    ann
    for ann in fixation_sdt.columns.get_level_values(u.PRED_STR).unique()
    if ann not in labelers_to_keep
]
for column_level in (u.GT_STR, u.PRED_STR):
    fixation_sdt = fixation_sdt.drop(columns=annotators_to_drop, level=column_level)

fixation_sdt

trial_id,1,1,1,1,1,1,1,1,1,1,...,10,10,10,10,10,10,10,10,10,10
gt,MN,MN,MN,MN,MN,MN,MN,MN,RA,RA,...,MN,MN,RA,RA,RA,RA,RA,RA,RA,RA
pred,remodnav,idvt,RA,engbert,ivt,nh,idt,ivvt,remodnav,idvt,...,idt,ivvt,remodnav,idvt,engbert,ivt,nh,idt,ivvt,MN
fixation_d_prime,0.816991,1.66521,2.812538,2.369833,0.961723,2.590326,1.079893,1.25886,0.791707,1.900235,...,2.177725,0.453761,2.019124,2.102245,2.808797,1.105628,2.744942,2.102245,0.666834,3.937406
fixation_f1,0.660068,0.929144,0.963768,0.949652,0.377801,0.962072,0.910081,0.120229,0.660208,0.933107,...,0.936413,0.143867,0.932228,0.939911,0.970138,0.420599,0.966837,0.939911,0.146966,0.98869


## Statistical Analyses
### Sample-by-Sample Agreement
Evaluate performance on the sample-by-sample level using metrics for all labels together:
- Cohen's $\kappa$
- MCC
- $1-NLD$ 

In [4]:
# Run sm.friedman_nemenyi on the global metrics, once per reference annotator
global_stats, global_pvalues, global_nemenyi, sm_global_Ns = sm.friedman_nemenyi(
    global_measures, [GT1, GT2],
)

# Summary table: statistic (Q), p-value and significance flag at level ALPHA
global_is_significant = global_pvalues <= ALPHA
pd.concat(
    [global_stats, global_pvalues, global_is_significant],
    axis=1,
    keys=['Q', 'p', 'is_sig'],
).stack(1, future_stack=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Q,p,is_sig
metric,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cohen's_kappa,MN,55.553957,3.582581e-10,True
cohen's_kappa,RA,55.985612,2.930527e-10,True
complement_nld,MN,58.834532,7.761097e-11,True
complement_nld,RA,58.834532,7.761097e-11,True
mcc,MN,55.553957,3.582581e-10,True
mcc,RA,55.769784,3.240238e-10,True


#### Post-Hoc Analysis
**Cohen's Kappa**

In [5]:
# Pairwise post-hoc table for Cohen's kappa, built from the Nemenyi results for both reference annotators
post_hoc_kappa = sm.post_hoc_table(
    global_nemenyi,
    peyes.constants.COHENS_KAPPA_STR,
    [GT1, GT2],
    alpha=ALPHA,
    marginal_alpha=MARGINAL_ALPHA,
)
post_hoc_kappa

Unnamed: 0_level_0,pred,ivt,ivvt,idt,idvt,engbert,nh,remodnav
pred,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ivt,MN,--,n.s.,n.s.,n.s.,***,***,n.s.
ivt,RA,--,n.s.,n.s.,n.s.,***,**,n.s.
ivvt,MN,0.9900,--,n.s.,*,***,***,n.s.
ivvt,RA,0.9788,--,n.s.,*,***,***,n.s.
idt,MN,0.6768,0.1969,--,n.s.,n.s.,n.s.,n.s.
idt,RA,0.6528,0.1361,--,n.s.,n.s.,n.s.,n.s.
idvt,MN,0.2221,0.0250,0.9933,--,n.s.,n.s.,n.s.
idvt,RA,0.2221,0.0163,0.9949,--,n.s.,n.s.,n.s.
engbert,MN,0.0003,0.0000,0.1700,0.5973,--,n.s.,n.s.
engbert,RA,0.0008,0.0000,0.2783,0.7232,--,n.s.,n.s.


In [6]:
# Pairwise post-hoc table for MCC, built from the Nemenyi results for both reference annotators
post_hoc_mcc = sm.post_hoc_table(
    global_nemenyi,
    peyes.constants.MCC_STR,
    [GT1, GT2],
    alpha=ALPHA,
    marginal_alpha=MARGINAL_ALPHA,
)
post_hoc_mcc

Unnamed: 0_level_0,pred,ivt,ivvt,idt,idvt,engbert,nh,remodnav
pred,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ivt,MN,--,n.s.,n.s.,n.s.,***,***,n.s.
ivt,RA,--,n.s.,n.s.,n.s.,**,**,n.s.
ivvt,MN,0.9906,--,n.s.,*,***,***,n.s.
ivvt,RA,0.9887,--,n.s.,*,***,***,n.s.
idt,MN,0.6886,0.2093,--,n.s.,n.s.,n.s.,n.s.
idt,RA,0.7118,0.2135,--,n.s.,n.s.,n.s.,n.s.
idvt,MN,0.2221,0.0258,0.9923,--,n.s.,n.s.,n.s.
idvt,RA,0.2493,0.0283,0.9933,--,n.s.,n.s.,n.s.
engbert,MN,0.0004,0.0000,0.1700,0.6097,--,n.s.,n.s.
engbert,RA,0.0013,0.0000,0.2733,0.7399,--,n.s.,n.s.


In [7]:
# Pairwise post-hoc table for 1-NLD, built from the Nemenyi results for both reference annotators
post_hoc_nld = sm.post_hoc_table(
    global_nemenyi,
    peyes.constants.COMPLEMENT_NLD_STR,
    [GT1, GT2],
    alpha=ALPHA,
    marginal_alpha=MARGINAL_ALPHA,
)
post_hoc_nld

Unnamed: 0_level_0,pred,ivt,ivvt,idt,idvt,engbert,nh,remodnav
pred,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ivt,MN,--,n.s.,n.s.,n.s.,***,***,n.s.
ivt,RA,--,n.s.,n.s.,n.s.,**,**,n.s.
ivvt,MN,0.9945,--,*,*,***,***,n.s.
ivvt,RA,0.9960,--,*,*,***,***,n.s.
idt,MN,0.2587,0.0420,--,n.s.,n.s.,n.s.,n.s.
idt,RA,0.1774,0.0266,--,n.s.,n.s.,n.s.,n.s.
idvt,MN,0.1183,0.0129,0.9999,--,n.s.,n.s.,n.s.
idvt,RA,0.0903,0.0101,1.0000,--,n.s.,n.s.,n.s.
engbert,MN,0.0008,0.0000,0.6708,0.8601,--,n.s.,†
engbert,RA,0.0014,0.0000,0.8392,0.9374,--,n.s.,n.s.


### Fixation Sample-by-Sample Detection
Evaluate performance by measuring how well each detector identifies _fixation samples_ out of all samples.
Evaluation is based on the _discriminability index_ ($d'$) and the _f1-score_.

In [8]:
# Run sm.friedman_nemenyi on the fixation d'/f1 metrics, once per reference annotator
sdt_statistics, sdt_pvalues, sdt_nemenyi, sdt_Ns = sm.friedman_nemenyi(
    fixation_sdt, [GT1, GT2],
)

# Summary table: statistic (Q), p-value and significance flag at level ALPHA
sdt_is_significant = sdt_pvalues <= ALPHA
pd.concat(
    [sdt_statistics, sdt_pvalues, sdt_is_significant],
    axis=1,
    keys=['Q', 'p', 'is_sig'],
).stack(1, future_stack=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Q,p,is_sig
metric,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fixation_d_prime,MN,52.791367,1.292447e-09,True
fixation_d_prime,RA,54.215827,6.673586e-10,True
fixation_f1,MN,58.489209,9.11948e-11,True
fixation_f1,RA,58.705036,8.245064e-11,True


#### Post-Hoc Analysis
**_d'_**

In [9]:
# Pairwise post-hoc table for fixation d', built from the Nemenyi results for both reference annotators
post_hoc_fix_dprime = sm.post_hoc_table(
    sdt_nemenyi,
    f"fixation_{peyes.constants.D_PRIME_STR}",
    [GT1, GT2],
    alpha=ALPHA,
    marginal_alpha=MARGINAL_ALPHA,
)
post_hoc_fix_dprime

Unnamed: 0_level_0,pred,ivt,ivvt,idt,idvt,engbert,nh,remodnav
pred,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ivt,MN,--,n.s.,n.s.,n.s.,***,***,n.s.
ivt,RA,--,n.s.,n.s.,n.s.,**,**,n.s.
ivvt,MN,0.9996,--,n.s.,n.s.,***,***,n.s.
ivvt,RA,1.0000,--,n.s.,n.s.,**,**,n.s.
idt,MN,0.5533,0.2783,--,n.s.,n.s.,n.s.,n.s.
idt,RA,0.7003,0.5973,--,n.s.,n.s.,n.s.,n.s.
idvt,MN,0.2354,0.0815,0.9990,--,n.s.,n.s.,n.s.
idvt,RA,0.3144,0.2309,0.9979,--,n.s.,n.s.,n.s.
engbert,MN,0.0007,0.0001,0.3471,0.6886,--,n.s.,†
engbert,RA,0.0041,0.0021,0.4471,0.8212,--,n.s.,*


**_f1_**

In [10]:
# Pairwise post-hoc table for fixation f1, built from the Nemenyi results for both reference annotators
post_hoc_fix_f1 = sm.post_hoc_table(
    sdt_nemenyi,
    f"fixation_{peyes.constants.F1_STR}",
    [GT1, GT2],
    alpha=ALPHA,
    marginal_alpha=MARGINAL_ALPHA,
)
post_hoc_fix_f1

Unnamed: 0_level_0,pred,ivt,ivvt,idt,idvt,engbert,nh,remodnav
pred,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ivt,MN,--,n.s.,n.s.,n.s.,**,**,n.s.
ivt,RA,--,n.s.,n.s.,n.s.,**,**,n.s.
ivvt,MN,0.9848,--,*,**,***,***,n.s.
ivvt,RA,0.9830,--,*,**,***,***,n.s.
idt,MN,0.2354,0.0220,--,n.s.,n.s.,n.s.,n.s.
idt,RA,0.1890,0.0143,--,n.s.,n.s.,n.s.,n.s.
idvt,MN,0.1155,0.0070,1.0000,--,n.s.,n.s.,n.s.
idvt,RA,0.1075,0.0058,1.0000,--,n.s.,n.s.,n.s.
engbert,MN,0.0013,0.0000,0.7616,0.9036,--,n.s.,†
engbert,RA,0.0016,0.0000,0.8392,0.9272,--,n.s.,n.s.


## Figures
### Agreement Figure (not in article)

In [11]:
W, H = 600, 450

# Violin distributions of the global agreement metrics, one color per labeler
global_metrics_fig = sm.global_metrics_distributions_figure(
    global_measures,
    gt1=GT1, gt2=GT2,
    colors={labeler: cfg[1] for labeler, cfg in LABELER_PLOTTING_CONFIG.items()},
    only_box=False,
    show_other_gt=True,
    share_x=True,
)

# Wider violins leave less empty space between neighbours
global_metrics_fig.update_traces(width=0.9)

# "Other GT" violins become double-sided (side=None); only those whose name starts with GT1 stay visible
for violin in global_metrics_fig.data:
    if violin["legendgroup"] == "Other GT":
        violin["visible"] = violin["name"].split(',')[0] == GT1
        violin["side"] = None

# The three y-axes share the same bare style and the [0, 1] range
unit_range_yaxes = {
    axis_name: dict(showgrid=False, zeroline=False, showline=False, range=[0, 1], tickfont=dict(size=14))
    for axis_name in ("yaxis", "yaxis2", "yaxis3")
}
global_metrics_fig.update_layout(
    title=None,
    width=W, height=H,
    paper_bgcolor='rgba(0, 0, 0, 0)', plot_bgcolor='rgba(0, 0, 0, 0)',
    xaxis3=dict(showgrid=False, tickfont=dict(size=14), tickangle=0),
    margin=dict(l=10, r=10, b=10, t=10, pad=0),
    showlegend=False,
    **unit_range_yaxes,
)
# global_metrics_fig.layout.annotations = []    # remove subtitles

# FIG_ID, IS_SUPP = 3, False
# save_fig(global_metrics_fig, FIG_ID, '', 'sample-global-metrics', IS_SUPP)
# global_metrics_fig.show()

### Sensitivity Figure (not in article)

In [12]:
W, H = 750, 400

# Violin distributions of the fixation d'/f1 metrics, one color per labeler
sdt_metrics_fig = sm.sdt_distributions_figure(
    fixation_sdt,
    GT1, GT2,
    colors={labeler: cfg[1] for labeler, cfg in LABELER_PLOTTING_CONFIG.items()},
    only_box=False,
    show_other_gt=True,
    share_x=True,
)

# Wider violins leave less empty space between neighbours
sdt_metrics_fig.update_traces(width=0.9)

# Strip grid, zero line and axis line from all four x/y axis pairs
bare_axes = {
    f"{direction}axis{suffix}": dict(showgrid=False, zeroline=False, showline=False)
    for suffix in ("", "2", "3", "4")
    for direction in ("x", "y")
}
sdt_metrics_fig.update_layout(
    title=None,
    width=W, height=H,
    paper_bgcolor='rgba(0, 0, 0, 0)', plot_bgcolor='rgba(0, 0, 0, 0)',
    # legend is positioned at the bottom, but currently hidden by showlegend=False
    legend=dict(orientation="h", yanchor="top", xanchor="center", xref='container', yref='container', x=0.5, y=0.05),
    showlegend=False,
    margin=dict(l=40, r=0, b=0, t=20, pad=0),
    **bare_axes,
)

# move annotations to the top/left of the plot
# sdt_metrics_fig.for_each_annotation(lambda ann: ann.update(x=-0.05, textangle=-90) if ann.text in row_titles else ann.update(y=0.99))
# sdt_metrics_fig.layout.annotations = []    # remove subtitles

# FIG_ID, IS_SUPP = 8, True
# save_fig(sdt_metrics_fig, FIG_ID, "", f"detection-sample_level", IS_SUPP)
# sdt_metrics_fig.show()

### Article Figure (Ridge plots)

In [13]:
# Grid styling; the zero line is drawn twice as thick as a regular grid line
GRID_LINE_COLOR = "lightgray"
GRID_LINE_WIDTH = 1
ZERO_LINE_WIDTH = 2 * GRID_LINE_WIDTH

# Typography shared by titles, axis labels and tick labels
FONT_FAMILY = "Calibri"
FONT_COLOR = "black"
TITLE_FONT = {"family": FONT_FAMILY, "size": 22, "color": FONT_COLOR}
AXIS_LABEL_FONT = {"family": FONT_FAMILY, "size": 18, "color": FONT_COLOR}
AXIS_TICK_FONT = {"family": FONT_FAMILY, "size": 16, "color": FONT_COLOR}
AXIS_LABEL_STANDOFF = 2

# Figure size in pixels and the subplot grid titles (one row per metric, one column per reference annotator)
WIDTH = 900
HEIGHT = 1000
ROW_TITLES = ["Cohen's Kappa", "MCC", "1-<i>NLD</i>", "Fixation <i>d'</i>"]
COL_TITLES = [GT1, GT2]

In [14]:
def convert_violin_to_ridge(tr: "go.Violin") -> "go.Violin":
    """Return a copy of a vertical violin trace restyled as one row of a horizontal ridge plot.

    The input trace is deep-copied and never modified. In the copy:

    * the trace is made visible, one-sided ("positive"), without box or points,
      and with a light-gray mean line;
    * the values stored in ``y`` are moved to ``x`` so the violin is drawn horizontally;
    * the category stored in ``x0`` becomes the display label, which is written to
      ``y0``, ``name`` and ``legendgroup``.

    Display labels are derived from ``x0`` as follows:

    * ``"Other GT"``         -> ``"2<sup>nd</sup> Ann."``
    * names starting with i  -> leading "i" becomes "I-", rest upper-cased (``"ivt"`` -> ``"I-VT"``)
    * ``"remodnav"``         -> ``"REMoDNaV"``
    * anything else          -> upper-cased (``"nh"`` -> ``"NH"``)

    :param tr: violin trace whose ``x0`` holds a string category and whose ``y`` holds the samples.
    :return: the restyled copy.
    :raises AttributeError: if ``x0`` is not a string (e.g. ``None``).
    """
    new_trace = copy.deepcopy(tr)

    # styling shared by all ridge rows
    new_trace["visible"] = True
    new_trace["side"] = "positive"
    new_trace["opacity"] = 0.95
    new_trace["width"] = 1.8
    new_trace["box"] = None
    new_trace["showlegend"] = False
    new_trace["points"] = False

    # derive the display label from the original category
    category = new_trace["x0"]
    if category == "Other GT":
        label = "2<sup>nd</sup> Ann."
    elif category.startswith("i"):
        # Only the leading "i" is a prefix; replacing every "i" would mangle
        # names that contain a second one (e.g. "ividt" -> "I-VI-DT").
        label = category.replace("i", "I-", 1).upper()
    elif category == "remodnav":
        label = "REMoDNaV"
    else:
        label = category.upper()
    new_trace["y0"] = new_trace["name"] = new_trace["legendgroup"] = label

    # swap orientation: samples go on the x-axis, the category on the y-axis
    new_trace["x"] = new_trace["y"]
    new_trace["y"] = new_trace["x0"] = None
    new_trace["meanline"] = dict(visible=True, width=3, color='lightgray')
    return new_trace

In [15]:
# Build the article figure: one row per metric, one column per reference annotator.
# Each cell holds the ridge-style distributions of that metric for every detector.
fig = make_subplots(
    rows=len(ROW_TITLES), cols=len(COL_TITLES),
    row_titles=ROW_TITLES, column_titles=COL_TITLES,
    shared_xaxes=False, shared_yaxes=True,
    vertical_spacing=0.06, horizontal_spacing=0.02,
)

# AGREEMENT MEASURES
# Row order of the agreement metrics; matches the first three entries of ROW_TITLES
agreement_metric_rows = [
    peyes.constants.COHENS_KAPPA_STR, peyes.constants.MCC_STR, peyes.constants.COMPLEMENT_NLD_STR
]
for tr in global_metrics_fig.data:
    new_tr = convert_violin_to_ridge(tr)
    gt_name = tr["name"].split(',')[0]      # trace names start with the GT annotator, before the first comma
    col = [GT1, GT2].index(gt_name) + 1
    row = agreement_metric_rows.index(tr["scalegroup"]) + 1
    # row/col are 1-based subplot indices, so the first subplot is (1, 1).
    # Comparing against 0 could never be True. The legend itself stays hidden
    # by `showlegend=False` in the layout below.
    new_tr["showlegend"] = row == 1 and col == 1
    fig.add_trace(new_tr, row=row, col=col)

# SENSITIVITY MEASURES
for tr in sdt_metrics_fig.data:
    if not tr["scalegroup"].endswith("d_prime"):
        # ignore all traces that aren't d'
        continue
    new_tr = convert_violin_to_ridge(tr)
    gt_name = tr["name"].split(',')[0]
    col = [GT1, GT2].index(gt_name) + 1
    fig.add_trace(new_tr, row=len(ROW_TITLES), col=col)     # d' always goes in the last row

# UPDATE LAYOUT
# update axes
fig.for_each_xaxis(lambda xax: xax.update(
    showline=False,
    zeroline=False, zerolinecolor=GRID_LINE_COLOR, zerolinewidth=ZERO_LINE_WIDTH,
    showgrid=False, gridcolor=GRID_LINE_COLOR, gridwidth=GRID_LINE_WIDTH,
    # x-axes anchored to y7/y8 (the d' row in this 4x2 grid) keep plotly's automatic range;
    # all other rows are fixed to [0, 1] with a small padding
    range=[-0.025, 1.025] if xax['anchor'] not in ['y7', 'y8'] else None,
    tickfont=AXIS_TICK_FONT,
))
fig.for_each_yaxis(lambda yax: yax.update(
    title=dict(
        # only y-axes anchored to x, x3, x5, x7 (the left column in this 4x2 grid) get a title
        text="Detector" if yax['anchor'] in ['x', 'x3', 'x5', 'x7'] else "", font=AXIS_LABEL_FONT, standoff=AXIS_LABEL_STANDOFF
    ),
    showline=False,
    zeroline=False, zerolinecolor=GRID_LINE_COLOR, zerolinewidth=ZERO_LINE_WIDTH,
    showgrid=True, gridcolor=GRID_LINE_COLOR, gridwidth=GRID_LINE_WIDTH,
    tickfont=AXIS_TICK_FONT,
))

# update titles: column titles use the title font, row titles the axis-label font
fig.for_each_annotation(lambda ann: ann.update(
    font=TITLE_FONT if ann.text in COL_TITLES else AXIS_LABEL_FONT,
    xref='paper', xanchor='center',
    yref='paper', yanchor='top',
    textangle=0,
))
# paper-coordinate y position of each row title, in ROW_TITLES order
subtitle_ycoords = [0.78, 0.51, 0.25, -0.025]
for ann in fig.layout.annotations:
    if ann.text in COL_TITLES:
        # column titles are centred above their column
        ann['x'] = 0.25 if ann.text == COL_TITLES[0] else 0.75
        ann['y'] = 1.025
    elif ann.text in ROW_TITLES:
        # row titles are centred horizontally between the two columns
        ann['x'] = 0.49
        ann['y'] = subtitle_ycoords[ROW_TITLES.index(ann.text)]
    else:
        print(f"Unknown annotation: {ann.text}")

# update figure layout
fig.update_layout(
    font_family=FONT_FAMILY,
    width=WIDTH, height=HEIGHT,
    paper_bgcolor='rgba(0, 0, 0, 0)', plot_bgcolor='rgba(0, 0, 0, 0)',
    margin=dict(l=0, r=0, t=15, b=45, pad=0),
    showlegend=False,
)

# Save the figure as a PNG (scale=3) and as JSON, then display it
NAME = "fig1-sample_agreement"
fig.write_image(os.path.join(FIGURES_DIR, f"{NAME}.png"), scale=3)
fig.write_json(os.path.join(FIGURES_DIR, f"{NAME}.json"))
fig.show()