# Results p. 1
## Sample-by-Sample Evaluation 

In [21]:
import copy

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

from analysis._article_results.hfc._helpers import *
import analysis.statistics.sample_metrics as sm

# pio.renderers.default = "browser"

## Load Data

In [22]:
# Load the global (all-labels) agreement metrics and discard the Accuracy / Balanced-Accuracy rows
global_measures = sm.load_global_metrics(
    DATASET_NAME, PROCESSED_DATA_DIR, stimulus_type=STIMULUS_TYPE, metric=None, iteration=1
).drop(index=[peyes.constants.ACCURACY_STR, peyes.constants.BALANCED_ACCURACY_STR])

# Drop human annotators that aren't GT1 or GT2 (and aren't detectors), both as ground truth and as prediction
annotators_to_drop = [
    ann for ann in global_measures.columns.get_level_values(u.PRED_STR).unique()
    if ann not in [GT1, GT2] and ann not in DETECTORS.keys()
]
# The list is derived from the PRED level only, so a label may be missing from the GT level (or the list may be
# empty): errors="ignore" drops whatever is present instead of raising.
global_measures = (
    global_measures
    .drop(columns=annotators_to_drop, level=u.GT_STR, errors="ignore")
    .drop(columns=annotators_to_drop, level=u.PRED_STR, errors="ignore")
)

global_measures

trial_id,1,1,1,1,1,1,1,1,1,1,...,10,10,10,10,10,10,10,10,10,10
gt,MN,MN,MN,MN,MN,MN,MN,MN,RA,RA,...,MN,MN,RA,RA,RA,RA,RA,RA,RA,RA
pred,idvt,nh,RA,ivvt,remodnav,engbert,ivt,idt,idvt,nh,...,ivt,idt,idvt,nh,ivvt,remodnav,engbert,MN,ivt,idt
cohen's_kappa,0.236827,0.345449,0.789915,0.006621,0.01896,0.357025,0.034027,0.141882,0.273426,0.329668,...,0.03519,0.200674,0.204895,0.363214,0.004242,0.019474,0.365619,0.927445,0.036265,0.204895
complement_nld,0.799422,0.820524,0.941582,0.059529,0.110395,0.80231,0.199689,0.800311,0.796535,0.807641,...,0.227575,0.837034,0.843917,0.827487,0.074378,0.150311,0.829485,0.981572,0.229352,0.843917
mcc,0.255316,0.374803,0.79145,0.018534,0.060351,0.382572,0.087158,0.160184,0.2995,0.362199,...,0.091059,0.250508,0.251479,0.392835,0.012381,0.057021,0.396546,0.927873,0.0956,0.251479


In [23]:
# Load the signal-detection metrics for label=1 and keep only the d' and f1 rows, prefixed with "fixation_"
fixation_sdt = (
    sm.load_sdt(DATASET_NAME, PROCESSED_DATA_DIR, label=1, stimulus_type=STIMULUS_TYPE, metric=None, iteration=1)
    .loc[[peyes.constants.D_PRIME_STR, peyes.constants.F1_STR]]
    .rename(index=lambda idx: f"fixation_{idx}")
)

# Drop human annotators that aren't GT1 or GT2 (and aren't detectors), both as ground truth and as prediction
annotators_to_drop = [
    ann for ann in fixation_sdt.columns.get_level_values(u.PRED_STR).unique()
    if ann not in [GT1, GT2] and ann not in DETECTORS.keys()
]
# The list is derived from the PRED level only, so a label may be missing from the GT level (or the list may be
# empty): errors="ignore" drops whatever is present instead of raising.
fixation_sdt = (
    fixation_sdt
    .drop(columns=annotators_to_drop, level=u.GT_STR, errors="ignore")
    .drop(columns=annotators_to_drop, level=u.PRED_STR, errors="ignore")
)

fixation_sdt

trial_id,1,1,1,1,1,1,1,1,1,1,...,10,10,10,10,10,10,10,10,10,10
gt,MN,MN,MN,MN,MN,MN,MN,MN,RA,RA,...,MN,MN,RA,RA,RA,RA,RA,RA,RA,RA
pred,idvt,nh,RA,ivvt,remodnav,engbert,ivt,idt,idvt,nh,...,ivt,idt,idvt,nh,ivvt,remodnav,engbert,MN,ivt,idt
fixation_d_prime,1.632837,2.588093,2.810875,1.257979,0.430529,2.367849,0.960429,1.047735,1.958827,2.48749,...,0.961187,2.177725,2.102245,2.729531,0.666834,0.762996,2.808797,3.937406,1.105628,2.102245
fixation_f1,0.927405,0.962072,0.963768,0.120229,0.212726,0.949652,0.377801,0.908329,0.935551,0.954533,...,0.418768,0.936413,0.939911,0.966601,0.146966,0.297901,0.970138,0.98869,0.420599,0.939911


## Statistical Analyses
### Sample-by-Sample Agreement
Evaluate performance on the sample-by-sample level using metrics for all labels together:
- Cohen's $\kappa$
- MCC
- $1-NLD$ 

In [24]:
# Run `sm.friedman_nemenyi` on the agreement metrics against each of the two ground-truth annotators.
# NOTE(review): presumably a Friedman omnibus test followed by Nemenyi post-hoc comparisons - see `sm` for details.
global_stats, global_pvalues, global_nemenyi, sm_global_Ns = sm.friedman_nemenyi(global_measures, [GT1, GT2])

# Summary table: statistic (Q), p-value (p) and whether p <= ALPHA (is_sig), one row per (metric, gt) pair
global_is_significant = global_pvalues <= ALPHA
global_friedman_summary = pd.concat(
    [global_stats, global_pvalues, global_is_significant],
    axis=1,
    keys=['Q', 'p', 'is_sig'],
)
global_friedman_summary.stack(1, future_stack=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Q,p,is_sig
metric,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cohen's_kappa,MN,57.145422,1.707174e-10,True
cohen's_kappa,RA,57.576302,1.396404e-10,True
complement_nld,MN,49.842011,5.056668e-09,True
complement_nld,RA,49.174147,6.880875e-09,True
mcc,MN,56.989247,1.836102e-10,True
mcc,RA,57.763441,1.279663e-10,True


#### Post-Hoc Analysis
**Cohen's Kappa**

In [25]:
# Pairwise detector comparison for Cohen's kappa, for both ground-truth annotators
post_hoc_kappa = sm.post_hoc_table(
    global_nemenyi,
    peyes.constants.COHENS_KAPPA_STR,
    [GT1, GT2],
    alpha=ALPHA,
    marginal_alpha=MARGINAL_ALPHA,
)
post_hoc_kappa

Unnamed: 0_level_0,pred,ivt,ivvt,idt,idvt,engbert,nh,remodnav
pred,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ivt,MN,--,n.s.,n.s.,n.s.,**,*,n.s.
ivt,RA,--,n.s.,n.s.,n.s.,*,*,n.s.
ivvt,MN,0.8023,--,†,*,***,***,n.s.
ivvt,RA,0.8166,--,†,*,***,***,n.s.
idt,MN,0.8413,0.0724,--,n.s.,n.s.,n.s.,n.s.
idt,RA,0.8095,0.0650,--,n.s.,n.s.,n.s.,n.s.
idvt,MN,0.5187,0.0126,0.9989,--,n.s.,n.s.,n.s.
idvt,RA,0.5124,0.0135,0.9994,--,n.s.,n.s.,n.s.
engbert,MN,0.0054,0.0000,0.3278,0.6738,--,n.s.,***
engbert,RA,0.0120,0.0000,0.4998,0.7998,--,n.s.,***


In [26]:
# Pairwise detector comparison for MCC, for both ground-truth annotators
post_hoc_mcc = sm.post_hoc_table(
    global_nemenyi,
    peyes.constants.MCC_STR,
    [GT1, GT2],
    alpha=ALPHA,
    marginal_alpha=MARGINAL_ALPHA,
)
post_hoc_mcc

Unnamed: 0_level_0,pred,ivt,ivvt,idt,idvt,engbert,nh,remodnav
pred,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ivt,MN,--,n.s.,n.s.,n.s.,**,*,n.s.
ivt,RA,--,n.s.,n.s.,n.s.,*,*,n.s.
ivvt,MN,0.7061,--,†,*,***,***,n.s.
ivvt,RA,0.7399,--,†,*,***,***,n.s.
idt,MN,0.8681,0.0528,--,n.s.,n.s.,n.s.,n.s.
idt,RA,0.8478,0.0544,--,n.s.,n.s.,n.s.,n.s.
idvt,MN,0.5910,0.0101,0.9994,--,n.s.,n.s.,n.s.
idvt,RA,0.5596,0.0104,0.9994,--,n.s.,n.s.,n.s.
engbert,MN,0.0075,0.0000,0.3416,0.6588,--,n.s.,***
engbert,RA,0.0124,0.0000,0.4532,0.7669,--,n.s.,***


In [27]:
# Pairwise detector comparison for 1-NLD, for both ground-truth annotators
post_hoc_nld = sm.post_hoc_table(
    global_nemenyi,
    peyes.constants.COMPLEMENT_NLD_STR,
    [GT1, GT2],
    alpha=ALPHA,
    marginal_alpha=MARGINAL_ALPHA,
)
post_hoc_nld

Unnamed: 0_level_0,pred,ivt,ivvt,idt,idvt,engbert,nh,remodnav
pred,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ivt,MN,--,n.s.,n.s.,n.s.,n.s.,n.s.,n.s.
ivt,RA,--,n.s.,n.s.,n.s.,n.s.,n.s.,n.s.
ivvt,MN,0.8258,--,***,**,**,***,n.s.
ivvt,RA,0.8212,--,**,**,**,**,n.s.
idt,MN,0.1541,0.0009,--,n.s.,n.s.,n.s.,*
idt,RA,0.1793,0.0012,--,n.s.,n.s.,n.s.,*
idvt,MN,0.1718,0.0012,1.0000,--,n.s.,n.s.,*
idvt,RA,0.1646,0.0010,1.0000,--,n.s.,n.s.,*
engbert,MN,0.2783,0.0033,1.0000,1.0000,--,n.s.,*
engbert,RA,0.2540,0.0026,1.0000,1.0000,--,n.s.,*


### Fixation Sample-by-Sample Detection
Evaluate performance by measuring how well each detector identifies _fixation samples_ among all samples.
Evaluation is based on the _discriminability index_ ($d'$) and the _F1-score_.

In [28]:
# Run `sm.friedman_nemenyi` on the fixation d' / f1 metrics against each of the two ground-truth annotators.
# NOTE(review): presumably a Friedman omnibus test followed by Nemenyi post-hoc comparisons - see `sm` for details.
sdt_statistics, sdt_pvalues, sdt_nemenyi, sdt_Ns = sm.friedman_nemenyi(fixation_sdt, [GT1, GT2])

# Summary table: statistic (Q), p-value (p) and whether p <= ALPHA (is_sig), one row per (metric, gt) pair
sdt_is_significant = sdt_pvalues <= ALPHA
sdt_friedman_summary = pd.concat(
    [sdt_statistics, sdt_pvalues, sdt_is_significant],
    axis=1,
    keys=['Q', 'p', 'is_sig'],
)
sdt_friedman_summary.stack(1, future_stack=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Q,p,is_sig
metric,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fixation_d_prime,MN,50.848921,3.176149e-09,True
fixation_d_prime,RA,55.467626,3.729411e-10,True
fixation_f1,MN,57.323741,1.570971e-10,True
fixation_f1,RA,57.280576,1.602913e-10,True


#### Post-Hoc Analysis
**_d'_**

In [29]:
# Pairwise detector comparison for fixation d', for both ground-truth annotators
post_hoc_fix_dprime = sm.post_hoc_table(
    sdt_nemenyi,
    f"fixation_{peyes.constants.D_PRIME_STR}",
    [GT1, GT2],
    alpha=ALPHA,
    marginal_alpha=MARGINAL_ALPHA,
)
post_hoc_fix_dprime

Unnamed: 0_level_0,pred,ivt,ivvt,idt,idvt,engbert,nh,remodnav
pred,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ivt,MN,--,n.s.,n.s.,n.s.,**,**,n.s.
ivt,RA,--,n.s.,n.s.,n.s.,*,*,n.s.
ivvt,MN,0.9988,--,n.s.,n.s.,***,***,n.s.
ivvt,RA,1.0000,--,n.s.,n.s.,**,*,n.s.
idt,MN,0.6886,0.3360,--,n.s.,n.s.,n.s.,n.s.
idt,RA,0.7668,0.6768,--,n.s.,n.s.,n.s.,n.s.
idvt,MN,0.4594,0.1664,0.9999,--,n.s.,n.s.,n.s.
idvt,RA,0.4349,0.3415,0.9991,--,n.s.,n.s.,†
engbert,MN,0.0048,0.0004,0.4842,0.7118,--,n.s.,**
engbert,RA,0.0116,0.0065,0.5470,0.8520,--,n.s.,***


**_f1_**

In [30]:
# Pairwise detector comparison for fixation f1, for both ground-truth annotators
post_hoc_fix_f1 = sm.post_hoc_table(
    sdt_nemenyi,
    f"fixation_{peyes.constants.F1_STR}",
    [GT1, GT2],
    alpha=ALPHA,
    marginal_alpha=MARGINAL_ALPHA,
)
post_hoc_fix_f1

Unnamed: 0_level_0,pred,ivt,ivvt,idt,idvt,engbert,nh,remodnav
pred,gt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ivt,MN,--,n.s.,n.s.,n.s.,*,*,n.s.
ivt,RA,--,n.s.,n.s.,n.s.,*,*,n.s.
ivvt,MN,0.8071,--,*,*,***,***,n.s.
ivvt,RA,0.8166,--,*,*,***,***,n.s.
idt,MN,0.6886,0.0320,--,n.s.,n.s.,n.s.,n.s.
idt,RA,0.5910,0.0206,--,n.s.,n.s.,n.s.,n.s.
idvt,MN,0.5092,0.0124,1.0000,--,n.s.,n.s.,n.s.
idvt,RA,0.4717,0.0108,1.0000,--,n.s.,n.s.,n.s.
engbert,MN,0.0283,0.0000,0.7874,0.9068,--,n.s.,**
engbert,RA,0.0330,0.0001,0.8795,0.9374,--,n.s.,**


## Figures
### Agreement Figure (not in article)

In [31]:
W, H = 600, 450

# Each labeler's color is the second entry of its plotting config
labeler_colors = {labeler: cfg[1] for labeler, cfg in LABELER_PLOTTING_CONFIG.items()}
global_metrics_fig = sm.global_metrics_distributions_figure(
    global_measures,
    gt1=GT1,
    gt2=GT2,
    colors=labeler_colors,
    only_box=False,
    show_other_gt=True,
    share_x=True,
)

# make violins wider so there's less space between them
global_metrics_fig.update_traces(width=0.9)

# "Other GT" violins: show only the ones whose name starts with GT1, and draw them double-sided
for violin in global_metrics_fig.data:
    if violin["legendgroup"] == "Other GT":
        violin["visible"] = violin["name"].split(',')[0] == GT1
        violin["side"] = None

# the three y-axes share the same style and are all pinned to [0, 1]
unit_range_yaxes = {
    axis_name: dict(showgrid=False, zeroline=False, showline=False, range=[0, 1], tickfont=dict(size=14))
    for axis_name in ("yaxis", "yaxis2", "yaxis3")
}
global_metrics_fig.update_layout(
    title=None,
    width=W,
    height=H,
    paper_bgcolor='rgba(0, 0, 0, 0)',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    xaxis3=dict(showgrid=False, tickfont=dict(size=14), tickangle=0),
    margin=dict(l=10, r=10, b=10, t=10, pad=0),
    showlegend=False,
    **unit_range_yaxes,
)
# global_metrics_fig.layout.annotations = []    # remove subtitles

# FIG_ID, IS_SUPP = 3, False
# save_fig(global_metrics_fig, FIG_ID, '', 'sample-global-metrics', IS_SUPP)
global_metrics_fig.show()

### Sensitivity Figure (not in article)

In [32]:
W, H = 750, 400

# Each labeler's color is the second entry of its plotting config
labeler_colors = {labeler: cfg[1] for labeler, cfg in LABELER_PLOTTING_CONFIG.items()}
sdt_metrics_fig = sm.sdt_distributions_figure(
    fixation_sdt,
    GT1,
    GT2,
    colors=labeler_colors,
    only_box=False,
    show_other_gt=True,
    share_x=True,
)

# make violins wider so there's less space between them
sdt_metrics_fig.update_traces(width=0.9)

# remove axis grids: the same bare style for xaxis..xaxis4 and yaxis..yaxis4
gridless_axes = {
    f"{direction}axis{suffix}": dict(showgrid=False, zeroline=False, showline=False)
    for suffix in ("", "2", "3", "4")
    for direction in ("x", "y")
}
sdt_metrics_fig.update_layout(
    title=None,
    width=W,
    height=H,
    paper_bgcolor='rgba(0, 0, 0, 0)',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    # legend is configured to sit at the bottom, but is hidden below
    legend=dict(orientation="h", yanchor="top", xanchor="center", xref='container', yref='container', x=0.5, y=0.05),
    showlegend=False,
    margin=dict(l=40, r=0, b=0, t=20, pad=0),
    **gridless_axes,
)

# move annotations to the top/left of the plot
# sdt_metrics_fig.for_each_annotation(lambda ann: ann.update(x=-0.05, textangle=-90) if ann.text in row_titles else ann.update(y=0.99))
# sdt_metrics_fig.layout.annotations = []    # remove subtitles

# FIG_ID, IS_SUPP = 8, True
# save_fig(sdt_metrics_fig, FIG_ID, "", f"detection-sample_level", IS_SUPP)
sdt_metrics_fig.show()

### Article Figure (Ridge plots)

In [62]:
# Grid / zero lines
GRID_LINE_COLOR = "lightgray"
GRID_LINE_WIDTH = 1
ZERO_LINE_WIDTH = 2 * GRID_LINE_WIDTH

# Fonts: one family and color, three sizes (title > axis label > axis tick)
FONT_FAMILY = "Calibri"
FONT_COLOR = "black"
TITLE_FONT = {"family": FONT_FAMILY, "size": 22, "color": FONT_COLOR}
AXIS_LABEL_FONT = {"family": FONT_FAMILY, "size": 18, "color": FONT_COLOR}
AXIS_TICK_FONT = {"family": FONT_FAMILY, "size": 16, "color": FONT_COLOR}
AXIS_LABEL_STANDOFF = 2

# Figure size and subplot titles (one row per measure, one column per ground-truth annotator)
WIDTH = 900
HEIGHT = 1000
ROW_TITLES = ["Cohen's Kappa", "MCC", "1-<i>NLD</i>", "Fixation <i>d'</i>"]
COL_TITLES = [GT1, GT2]

In [78]:
def convert_violin_to_ridge(tr: "go.Violin") -> "go.Violin":
    """
    Return a horizontal, one-sided ("ridge") copy of a vertical violin trace; the input trace is not modified.

    The copy is forced visible, drawn on the positive side only, stripped of its box and legend entry, and given a
    light-gray mean line. The samples stored in `y` are moved to `x`, and the category stored in `x0` is turned into
    a display label that is written to `y0`, `name` and `legendgroup`:
        - "Other GT"            -> "2<sup>nd</sup> Ann."
        - names starting in "i" -> leading "i" becomes "I-", the rest is upper-cased (e.g. "ivt" -> "I-VT")
        - "remodnav"            -> "REMoDNaV"
        - anything else         -> upper-cased

    :param tr: violin trace whose `x0` holds the category name and whose `y` holds the samples
    :return: a new trace with the ridge styling applied
    """
    ridge = copy.deepcopy(tr)

    # styling shared by all traces
    ridge["visible"] = True
    ridge["side"] = "positive"
    ridge["opacity"] = 0.95
    ridge["width"] = 1.8
    ridge["box"] = None
    ridge["showlegend"] = False

    # display label derived from the original category
    category = ridge["x0"]
    if category == "Other GT":
        label = "2<sup>nd</sup> Ann."
    elif category.startswith("i"):
        # only the leading "i" is the prefix; any later "i" in the name must stay untouched
        label = "I-" + category[1:].upper()
    elif category == "remodnav":
        label = "REMoDNaV"
    else:
        label = category.upper()
    ridge["y0"] = ridge["name"] = ridge["legendgroup"] = label

    # rotate: samples go on the x-axis, the category becomes the y position
    ridge["x"] = ridge["y"]
    ridge["y"] = ridge["x0"] = None
    ridge["meanline"] = dict(visible=True, width=3, color='lightgray')
    return ridge

In [79]:
# One row per measure (ROW_TITLES) and one column per ground-truth annotator (COL_TITLES)
fig = make_subplots(
    rows=len(ROW_TITLES), cols=len(COL_TITLES),
    row_titles=ROW_TITLES, column_titles=COL_TITLES,
    shared_xaxes=False, shared_yaxes=True,
    vertical_spacing=0.06, horizontal_spacing=0.02,
)

# AGREEMENT MEASURES
# a trace's `scalegroup` holds its metric name; the position in this list decides the subplot row
agreement_metric_rows = [
    peyes.constants.COHENS_KAPPA_STR, peyes.constants.MCC_STR, peyes.constants.COMPLEMENT_NLD_STR
]
for tr in global_metrics_fig.data:
    new_tr = convert_violin_to_ridge(tr)
    gt_name = tr["name"].split(',')[0]      # trace names start with the ground-truth annotator
    col = [GT1, GT2].index(gt_name) + 1
    row = agreement_metric_rows.index(tr["scalegroup"]) + 1
    # `row` and `col` are 1-based, so the top-left subplot is (1, 1); only its traces get a legend entry
    new_tr["showlegend"] = row == 1 and col == 1
    fig.add_trace(new_tr, row=row, col=col)

# SENSITIVITY MEASURES
for tr in sdt_metrics_fig.data:
    if not tr["scalegroup"].endswith("d_prime"):
        # ignore all traces that aren't d'
        continue
    new_tr = convert_violin_to_ridge(tr)
    gt_name = tr["name"].split(',')[0]
    col = [GT1, GT2].index(gt_name) + 1
    # d' always goes in the last row
    fig.add_trace(new_tr, row=len(ROW_TITLES), col=col)

# UPDATE LAYOUT
# update axes
def _style_ridge_xaxis(xax):
    """Apply the shared x-axis style; axes anchored to y7/y8 get `range=None`, all others a range just beyond [0, 1]."""
    # NOTE(review): y7/y8 are presumably the bottom (d') row of the grid - confirm against the subplot layout
    if xax['anchor'] in ['y7', 'y8']:
        value_range = None
    else:
        value_range = [-0.025, 1.025]
    xax.update(
        showline=False,
        zeroline=False, zerolinecolor=GRID_LINE_COLOR, zerolinewidth=ZERO_LINE_WIDTH,
        showgrid=False, gridcolor=GRID_LINE_COLOR, gridwidth=GRID_LINE_WIDTH,
        range=value_range,
        tickfont=AXIS_TICK_FONT,
    )


def _style_ridge_yaxis(yax):
    """Apply the shared y-axis style; only axes anchored to x, x3, x5 or x7 are titled "Detector"."""
    if yax['anchor'] in ['x', 'x3', 'x5', 'x7']:
        title_text = "Detector"
    else:
        title_text = ""
    yax.update(
        title=dict(text=title_text, font=AXIS_LABEL_FONT, standoff=AXIS_LABEL_STANDOFF),
        showline=False,
        zeroline=False, zerolinecolor=GRID_LINE_COLOR, zerolinewidth=ZERO_LINE_WIDTH,
        showgrid=True, gridcolor=GRID_LINE_COLOR, gridwidth=GRID_LINE_WIDTH,
        tickfont=AXIS_TICK_FONT,
    )


fig.for_each_xaxis(_style_ridge_xaxis)
fig.for_each_yaxis(_style_ridge_yaxis)

# update titles
# y position (paper coordinates) of each row title, in ROW_TITLES order
subtitle_ycoords = [0.78, 0.51, 0.25, -0.025]
for ann in fig.layout.annotations:
    is_column_title = ann.text in COL_TITLES

    # common styling: horizontal text, positioned in paper coordinates by its top-center point
    ann.update(
        font=TITLE_FONT if is_column_title else AXIS_LABEL_FONT,
        xref='paper', xanchor='center',
        yref='paper', yanchor='top',
        textangle=0,
    )

    if is_column_title:
        # column titles sit above the grid, centered over each half of the figure
        ann['x'] = 0.25 if ann.text == COL_TITLES[0] else 0.75
        ann['y'] = 1.025
    elif ann.text in ROW_TITLES:
        # row titles sit near the horizontal center of the figure, at a per-row height
        ann['x'] = 0.49
        ann['y'] = subtitle_ycoords[ROW_TITLES.index(ann.text)]
    else:
        print(f"Unknown annotation: {ann.text}")

# update figure layout
fig.update_layout(
    font_family=FONT_FAMILY,
    width=WIDTH,
    height=HEIGHT,
    paper_bgcolor='rgba(0, 0, 0, 0)',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    margin=dict(l=0, r=0, t=15, b=45, pad=0),
    showlegend=False,
)

# write the figure as a PNG (scale factor 3) into FIGURES_DIR, then display it
NAME = "fig1-sample_agreement"
png_path = os.path.join(FIGURES_DIR, f"{NAME}.png")
fig.write_image(png_path, scale=3)
# fig.write_json(os.path.join(FIGURES_DIR, f"{NAME}.json"), indent=4)
fig.show()