# Dreem Open Datasets validation

https://pubmed.ncbi.nlm.nih.gov/32746326/

https://github.com/Dreem-Organization/dreem-learning-open

https://github.com/Dreem-Organization/dreem-learning-evaluation

In [None]:
import io
import os
import glob
import numpy as np
import pandas as pd
import seaborn as sns
import pingouin as pg
import sklearn.metrics as skm
from tqdm.notebook import tqdm
import scipy.stats as sp_stats
import matplotlib.pyplot as plt

from helper_functions import NUM2STR, STR2NUM
from helper_functions import consensus_score, mean_std, median_iqr, perc_transition
# Seaborn theme applied to the figures created after this call
sns.set(style="ticks", font_scale=1.1)

# Path prefix (with trailing slash) prepended to the filename of each saved figure
outdir = "output/plots/"

## Data loading

In [None]:
# Read the cross-validation predictions of the two Dreem datasets
model = "eeg+eog+emg+demo"
path_dodh = "output/cv/%s/pred_dreem_dodh.csv" % model
path_dodo = "output/cv/%s/pred_dreem_dodo.csv" % model

# The first three CSV columns form the row index of each file
df = pd.concat([pd.read_csv(path, index_col=[0, 1, 2]) for path in (path_dodh, path_dodo)])

# Hypnogram columns: every column except the last three
labels = ['N1', 'N2', 'N3', 'R', 'W']
cols_stage = df.columns.tolist()[:-3]
print(cols_stage)

# Convert the numeric stages to their string label, and check that each
# hypnogram column contains exactly the five expected stages
for col in cols_stage:
    df[col] = df[col].replace(NUM2STR)
    assert np.unique(df[col]).tolist() == labels

# Move the three index levels back to regular columns
df = df.reset_index()

# Optional: keep specific dataset
# df = df[df['dataset'] == 'dodh'].reset_index(drop=True)

print(df['subj'].nunique(), 'subjects')
print(df.shape)
df.head().round(2)

In [None]:
# Optional: keep only the rows with `pad` <= 2, i.e. discard the subjects whose
# hypnogram is markedly shorter than the EEG
# (presumably `pad` is expressed in 30-s epochs — confirm)
keep = df['pad'] <= 2
df = df.loc[keep].reset_index(drop=True)
print(df['subj'].nunique(), 'subjects remaining')

In [None]:
# Optional: remove subjects with an average inter-rater agreement below 0.7
# df = df[df['avg_human_agreement'] > 0.7].reset_index(drop=True)
# print(df['subj'].nunique(), 'subjects remaining')

In [None]:
# Total amount of data: number of rows divided by 120
# (presumably one row per 30-s epoch, i.e. 120 epochs per hour — confirm)
print(f"{df.shape[0] / 120:.2f} hours of data")

********

## Calculate scores for each night

In [None]:
# Columns holding the individual human scorings (name starts with "scorer")
cols_scorer = df.columns[df.columns.str.startswith("scorer")].tolist()
print(cols_scorer)

# One dict of metrics per (night, scorer); `subjects` stores the matching row
# label so that the final dataframe is built in a single call instead of
# concatenating hundreds of one-row dataframes
records, subjects = [], []

# Loop across nights
for sub in tqdm(df['subj'].unique(), leave=False):
    df_sub = df[df['subj'] == sub]
    dataset = df_sub['dataset'].iloc[0]

    # Loop across scorers: consensus, the three algorithms, then each human
    for s in ['cons'] + ['yasa', 'stephansen', 'perslev'] + cols_scorer:
        if s in cols_scorer:
            # Consensus excluding current scorer (unbiased)
            other_scorers = np.setdiff1d(cols_scorer, s).tolist()
            yt = pd.Series(consensus_score(df_sub[other_scorers]), index=df_sub.index)
        else:
            yt = df_sub['cons']  # The reference is the human consensus

        n = yt.shape[0]
        yp = df_sub[s]

        sub_scores = {
            "dataset": dataset,
            "scorer": s,
            # Agreement with the reference, in percent
            'accuracy': 100 * skm.accuracy_score(yt, yp),
            'kappa': 100 * skm.cohen_kappa_score(yt, yp),
            'mcc': 100 * skm.matthews_corrcoef(yt, yp),
            'f1_macro': 100 * skm.f1_score(yt, yp, average='macro', zero_division=1),
            # % Transitions
            'perc_trans': perc_transition(yp),
        }

        # F1 for each stage (same order as `labels`)
        f1 = 100 * skm.f1_score(yt, yp, average=None, labels=labels, zero_division=1)
        sub_scores.update({'f1_' + l: f for f, l in zip(f1, labels)})

        # Proportion of each stage. `value_counts` omits the stages that never
        # occur in this hypnogram, which would leave NaN in the final table:
        # reindex on `labels` so that an absent stage is reported as 0%.
        counts = yp.value_counts().reindex(labels, fill_value=0)
        prop = 100 * (counts / n).add_prefix('perc_')
        sub_scores.update(prop.to_dict())

        records.append(sub_scores)
        subjects.append(sub)

# Rows are indexed by (subj, dataset, scorer), columns are sorted alphabetically
df_scores = pd.DataFrame(records, index=pd.Index(subjects, name='subj'))
df_scores = df_scores.sort_index(axis=1).set_index(["dataset", "scorer"], append=True)
df_scores.round(2)

In [None]:
# Fill the NaN in perc_XX by zero: CAREFUL
# NOTE(review): `fillna` is applied in place to the whole dataframe, so a NaN
# in any other column is replaced by zero as well. Uncomment the next line to
# inspect the number of NaN per column before filling.
# df_scores.isna().sum(0)
df_scores.fillna(0, inplace=True)

In [None]:
# One dataframe per dataset; `xs` drops the "dataset" index level
df_scores_dodh, df_scores_dodo = (
    df_scores.xs(name, level="dataset") for name in ("dodh", "dodo")
)

# Columns (scorers) and rows (metrics) reported in the summary tables
scorers = ['yasa', 'stephansen', 'perslev'] + cols_scorer
metrics = ['accuracy'] + ['f1_' + stage for stage in ('N1', 'N2', 'N3', 'R', 'W')] + ['f1_macro']

def median_iqr(x):
    """Return the median and interquartile range of `x` as a string.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to summarize (must expose ``.median()``).

    Returns
    -------
    str
        ``"<median> ± <IQR>"``, both formatted with one decimal.

    Notes
    -----
    Missing values are ignored by both statistics: ``Series.median`` skips
    NaN by default, so the IQR is computed with ``nan_policy="omit"`` to
    avoid reporting a valid median next to a ``nan`` IQR.
    """
    from scipy.stats import iqr
    return f"{x.median():.1f} ± {iqr(x, nan_policy='omit'):.1f}"

In [None]:
# Median ± IQR of every score column, grouped by the last index level
# (the scorer) and keeping the order of first appearance (sort=False)
df_scores_dodh.groupby(level=-1, sort=False).agg(median_iqr)

#### DODH

In [None]:
# DODH only: Table 2 (rows = metrics, columns = scorers, cells = "median ± IQR")
summary_dodh = df_scores_dodh.groupby(level=-1, sort=False).agg(median_iqr)
dodh_table = summary_dodh.T.loc[metrics, scorers]

# Scorers that are compared against YASA
compared = cols_scorer + ['stephansen', 'perslev']
long_dodh = df_scores_dodh.reset_index()

# Add significance
for metric in metrics:

    # All within-subject pairwise tests between scorers...
    ptests = long_dodh.pairwise_ttests(dv=metric, within="scorer", subject="subj", return_desc=False)
    ptests = ptests[['A', 'B', 'T', 'dof', 'p-unc', 'hedges']].set_index(['A', 'B'])
    # ...restricted to the rows where B is YASA, without the consensus row
    ptests = ptests.xs("yasa", level=1, drop_level=False)
    ptests = ptests.drop(index=('cons', 'yasa')).droplevel(1)

    # Adjust for multiple comparisons (Holm)
    ptests['p-corr'] = pg.multicomp(ptests['p-unc'].to_numpy(), method="holm")[1]

    # Uncomment to inspect the full table of tests for this metric
    # print(metric)
    # display(ptests.round(3))

    # Star the cells that differ significantly from YASA. The effect size is
    # available in ptests.loc[scorer, 'hedges'] if it needs to be reported.
    for scorer in compared:
        if ptests.loc[scorer, 'p-corr'] < 0.05:
            dodh_table.loc[metric, scorer] += "*"

dodh_table

In [None]:
# Print the table as comma-separated text (`to_csv` returns the CSV as a
# string when no output path is given)
print(dodh_table.to_csv(sep=',', index=True))

#### DODO

In [None]:
# Number of unique nights, i.e. distinct values of the first index level (subj)
df_scores_dodo.index.get_level_values(0).nunique()

In [None]:
# DODO only: Table 3 (rows = metrics, columns = scorers, cells = "median ± IQR")
dodo_table = df_scores_dodo.groupby(level=-1, sort=False).agg(median_iqr).T.loc[metrics, scorers]

# Add significance
for metric in metrics:

    # Calculate all pairwise tests yasa vs scorers.
    # `return_desc=False`, as in the DODH table: the descriptive columns are
    # not part of the subset selected right after, so computing them is wasted.
    ptests = (df_scores_dodo
              .reset_index()
              .pairwise_ttests(dv=metric, within="scorer", subject="subj", return_desc=False)
              [['A', 'B', 'T', 'dof', 'p-unc', 'hedges']]
              .set_index(['A', 'B'])
              .xs("yasa", level=1, drop_level=False)  # keep the rows where B is YASA
              .drop(index=('cons', 'yasa'))           # the consensus is the reference, not a competitor
              .droplevel(1))

    # Adjust for multiple comparisons
    ptests['p-corr'] = pg.multicomp(ptests['p-unc'].to_numpy(), method="holm")[1]

    # Uncomment to inspect the full table of tests for this metric
    # print(metric)
    # display(ptests.round(3))

    # Star the cells that differ significantly from YASA. The effect size is
    # available in ptests.loc[scorer, 'hedges'] if it needs to be reported.
    for scorer in cols_scorer + ['stephansen', 'perslev']:
        if ptests.loc[scorer, 'p-corr'] < 0.05:
            dodo_table.loc[metric, scorer] += "*"

dodo_table

In [None]:
# Print the table as comma-separated text (`to_csv` returns the CSV as a
# string when no output path is given)
print(dodo_table.to_csv(sep=',', index=True))

### Boxplots

In [None]:
# Colors: darkest blue of the palette for the algorithm, orange for the reference
cmap = list(sns.color_palette("Blues", n_colors=10, as_cmap=False, desat=1))
color_pred, color_ref = cmap[-1], "tab:orange"
# One color per stage column of the F1 boxplots
cmap_stages = ['#99d7f1', '#009DDC', 'xkcd:twilight blue', 'xkcd:rich purple', 'xkcd:sunflower']

# Per-stage F1-scores, columns renamed to the bare stage name ("f1_N1" -> "N1")
df_f1 = df_scores[['f1_N1', 'f1_N2', 'f1_N3', 'f1_R', 'f1_W']].rename(
    columns=lambda name: name.split('_')[1])

# Split by dataset (second index level)
df_f1_dodh = df_f1.xs("dodh", level="dataset")
df_f1_dodo = df_f1.xs("dodo", level="dataset")

#### DODH

In [None]:
# One panel per algorithm, each showing the distribution of per-night F1-scores by stage
fig, axes = plt.subplots(1, 3, figsize=(10, 4), sharex=True, sharey=True)
ax1, ax2, ax3 = axes

panels = [("yasa", "YASA"), ("stephansen", "Stephansen 2018"), ("perslev", "Perslev 2021")]
for ax, (algo, title) in zip(axes, panels):
    sns.boxplot(data=df_f1_dodh.xs(algo, level=-1), palette=cmap_stages, width=0.6, fliersize=0, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Stage")

# The y-axis is shared: label and limits are only set on the first panel
ax1.set_ylabel("F1-score")
ax1.set_ylim(0, 103)
sns.despine()

plt.savefig(outdir + "cv_F1_DODH_algorithms.png", dpi=300, bbox_inches="tight")

#### DODO

In [None]:
# One panel per algorithm, each showing the distribution of per-night F1-scores by stage
fig, axes = plt.subplots(1, 3, figsize=(10, 4), sharex=True, sharey=True)
ax1, ax2, ax3 = axes

panels = [("yasa", "YASA"), ("stephansen", "Stephansen 2018"), ("perslev", "Perslev 2021")]
for ax, (algo, title) in zip(axes, panels):
    sns.boxplot(data=df_f1_dodo.xs(algo, level=-1), palette=cmap_stages, width=0.6, fliersize=0, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Stage")

# The y-axis is shared: label and limits are only set on the first panel
ax1.set_ylabel("F1-score")
ax1.set_ylim(0, 103)
sns.despine()

plt.savefig(outdir + "cv_F1_DODO_algorithms.png", dpi=300, bbox_inches="tight")

*****

## Confusion matrices

In [None]:
# Rows of the predictions table belonging to each dataset
df_dodo = df.loc[df['dataset'].eq("dodo")]
df_dodh = df.loc[df['dataset'].eq("dodh")]

#### DODO

In [None]:
# Sensitivity confusion matrices: rows are normalized by the reference
# (normalize="true") and expressed in percent, rounded to one decimal
cm_yasa, cm_stephansen, cm_perslev = (
    pd.DataFrame(
        100 * skm.confusion_matrix(df_dodo['cons'], df_dodo[algo], labels=labels, normalize="true"),
        index=labels, columns=labels,
    ).round(1)
    for algo in ("yasa", "stephansen", "perslev")
)

# Plot
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10, 4), sharey=True)

hmap_params = dict(annot=True, vmin=0, vmax=100, cmap="Blues", square=True, cbar=False, fmt=".1f")
panels = zip(
    (ax1, ax2, ax3),
    (cm_yasa, cm_stephansen, cm_perslev),
    ("YASA", "Stephansen 2018", "Perslev 2021"),
)
for ax, cm, title in panels:
    sns.heatmap(cm, ax=ax, **hmap_params)
    ax.set_xlabel("Predicted")
    ax.set_title(title)

# Shared y-axis: only the first panel is labelled
ax1.set_ylabel("Reference (human consensus)")

plt.savefig(outdir + "cv_confusion_DODO_algorithms.png", dpi=300, bbox_inches="tight")

#### Individual human scorer

In [None]:
def _n1_consensus(frame, scorer):
    """Return the consensus of every human scorer except `scorer`.

    The consensus is computed night by night with `consensus_score`, the same
    way as in the per-night scoring loop of this notebook, and the result is
    aligned on `frame.index`.
    """
    others = np.setdiff1d(cols_scorer, scorer).tolist()
    per_night = [
        pd.Series(consensus_score(night[others]), index=night.index)
        for _, night in frame.groupby('subj', sort=False)
    ]
    return pd.concat(per_night).reindex(frame.index)


def _human_cm(frame, scorer):
    """Sensitivity confusion matrix (in %) of `scorer` against the N-1 consensus."""
    cm = 100 * skm.confusion_matrix(
        _n1_consensus(frame, scorer), frame[scorer], labels=labels, normalize="true")
    return pd.DataFrame(cm, index=labels, columns=labels).round(1)


# Calculate sensitivity confusion matrices.
# The reference of each human scorer is the consensus of the OTHER scorers
# (N-1 consensus), as stated on the y-axis label: using the full consensus
# would include the scorer's own vote in the reference and inflate agreement.
cm_h1, cm_h2, cm_h3, cm_h4, cm_h5 = (
    _human_cm(df_dodo, name)
    for name in ('scorer_1', 'scorer_2', 'scorer_3', 'scorer_4', 'scorer_5')
)

# Plot
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, figsize=(10, 8), sharey=True)

hmap_params = dict(annot=True, vmin=0, vmax=100, cmap="Blues", square=True, cbar=False, fmt=".1f")
panels = zip(
    (ax1, ax2, ax3, ax4, ax5),
    (cm_h1, cm_h2, cm_h3, cm_h4, cm_h5),
    ("H1", "H2", "H3", "H4", "H5"),
)
for ax, cm, title in panels:
    sns.heatmap(cm, ax=ax, **hmap_params)
    ax.set_xlabel("Predicted")
    ax.set_title(title)

# Label the first panel of each row
ax1.set_ylabel("N-1 consensus")
ax4.set_ylabel("N-1 consensus")

# Five scorers on a 2 x 3 grid: hide the unused last panel
ax6.axis('off');

plt.savefig(outdir + "cv_confusion_DODO_humans.png", dpi=300, bbox_inches="tight")

#### DODH

In [None]:
# Sensitivity confusion matrices: rows are normalized by the reference
# (normalize="true") and expressed in percent, rounded to one decimal
cm_yasa, cm_stephansen, cm_perslev = (
    pd.DataFrame(
        100 * skm.confusion_matrix(df_dodh['cons'], df_dodh[algo], labels=labels, normalize="true"),
        index=labels, columns=labels,
    ).round(1)
    for algo in ("yasa", "stephansen", "perslev")
)

# Plot
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10, 4), sharey=True)

hmap_params = dict(annot=True, vmin=0, vmax=100, cmap="Blues", square=True, cbar=False, fmt=".1f")
panels = zip(
    (ax1, ax2, ax3),
    (cm_yasa, cm_stephansen, cm_perslev),
    ("YASA", "Stephansen 2018", "Perslev 2021"),
)
for ax, cm, title in panels:
    sns.heatmap(cm, ax=ax, **hmap_params)
    ax.set_xlabel("Predicted")
    ax.set_title(title)

# Shared y-axis: only the first panel is labelled
ax1.set_ylabel("Reference (human consensus)")

plt.savefig(outdir + "cv_confusion_DODH_algorithms.png", dpi=300, bbox_inches="tight")

#### Individual human scorer

In [None]:
def _n1_consensus(frame, scorer):
    """Return the consensus of every human scorer except `scorer`.

    The consensus is computed night by night with `consensus_score`, the same
    way as in the per-night scoring loop of this notebook, and the result is
    aligned on `frame.index`.
    """
    others = np.setdiff1d(cols_scorer, scorer).tolist()
    per_night = [
        pd.Series(consensus_score(night[others]), index=night.index)
        for _, night in frame.groupby('subj', sort=False)
    ]
    return pd.concat(per_night).reindex(frame.index)


def _human_cm(frame, scorer):
    """Sensitivity confusion matrix (in %) of `scorer` against the N-1 consensus."""
    cm = 100 * skm.confusion_matrix(
        _n1_consensus(frame, scorer), frame[scorer], labels=labels, normalize="true")
    return pd.DataFrame(cm, index=labels, columns=labels).round(1)


# Calculate sensitivity confusion matrices.
# The reference of each human scorer is the consensus of the OTHER scorers
# (N-1 consensus), as stated on the y-axis label: using the full consensus
# would include the scorer's own vote in the reference and inflate agreement.
cm_h1, cm_h2, cm_h3, cm_h4, cm_h5 = (
    _human_cm(df_dodh, name)
    for name in ('scorer_1', 'scorer_2', 'scorer_3', 'scorer_4', 'scorer_5')
)

# Plot
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, figsize=(10, 8), sharey=True)

hmap_params = dict(annot=True, vmin=0, vmax=100, cmap="Blues", square=True, cbar=False, fmt=".1f")
panels = zip(
    (ax1, ax2, ax3, ax4, ax5),
    (cm_h1, cm_h2, cm_h3, cm_h4, cm_h5),
    ("H1", "H2", "H3", "H4", "H5"),
)
for ax, cm, title in panels:
    sns.heatmap(cm, ax=ax, **hmap_params)
    ax.set_xlabel("Predicted")
    ax.set_title(title)

# Label the first panel of each row
ax1.set_ylabel("N-1 consensus")
ax4.set_ylabel("N-1 consensus")

# Five scorers on a 2 x 3 grid: hide the unused last panel
ax6.axis('off');

plt.savefig(outdir + "cv_confusion_DODH_humans.png", dpi=300, bbox_inches="tight")

********

## Stage discrepancies

### Percentage of transitions

In [None]:
# DODO: paired T-test on the percentage of stage transitions, consensus vs YASA
perc_trans_dodo = df_scores_dodo['perc_trans']
pg.ttest(
    perc_trans_dodo.xs("cons", level=-1),
    perc_trans_dodo.xs("yasa", level=-1),
    paired=True,
).round(4)

In [None]:
# DODH: paired T-test on the percentage of stage transitions, consensus vs YASA
perc_trans_dodh = df_scores_dodh['perc_trans']
pg.ttest(
    perc_trans_dodh.xs("cons", level=-1),
    perc_trans_dodh.xs("yasa", level=-1),
    paired=True,
).round(4)

### Stage proportion

In [None]:
# Long-format table of the stage proportions: one row per (night, scorer, stage)
cols_perc = ['perc_' + c for c in labels]
df_prop = df_scores[cols_perc].melt(var_name="stage", value_name="proportion", ignore_index=False).reset_index()

# Keep the consensus and YASA only. The explicit copy makes `df_prop` an
# independent dataframe, so the column assignments below neither raise a
# SettingWithCopyWarning nor depend on view/copy semantics.
df_prop = df_prop[df_prop['scorer'].isin(['cons', 'yasa'])].copy()

# Human-readable scorer names (used as hue levels in the boxplot). The result
# is assigned back to the column: an in-place `replace` on `df_prop['scorer']`
# acts on a temporary object and is a no-op when pandas Copy-on-Write is enabled.
df_prop['scorer'] = df_prop['scorer'].replace({"cons": "Consensus", "yasa": "YASA"})

# "perc_N1" -> "N1"
df_prop['stage'] = df_prop['stage'].str.split("_").str.get(1)
df_prop

In [None]:
# Calculate the effect size
# NOTE(review): `.iloc[11:, :]` selects rows by position. Presumably the first
# 11 rows are the main-effect contrasts (10 pairs of stages + 1 pair of
# scorers) and the remaining rows are the Consensus vs YASA test within each
# stage — confirm against the row order returned by pingouin.
ptest = df_prop.pairwise_ttests(dv="proportion", within=['stage', "scorer"], subject="subj", effsize="cohen").iloc[11:, :].round(3)
# Absolute Cohen's d, indexed by stage
ef = ptest.loc[:, ['stage', 'cohen']].set_index("stage").abs()
display(ef)

# Plot
fig, ax = plt.subplots(1, 1, figsize=(4, 4))

# Notched boxplots of the proportion of each stage, consensus (color_ref)
# next to YASA (color_pred); outliers are hidden (fliersize=0)
sns.boxplot(
    y=df_prop['proportion'], x=df_prop['stage'], hue=df_prop['scorer'],
    hue_order=['Consensus', 'YASA'], 
    palette=[color_ref, color_pred], 
    saturation=1, width=0.6, fliersize=0, linewidth=1.5, notch=True);

plt.ylim(0, 80)
plt.yticks([0, 20, 40, 60, 80])
plt.legend(frameon=False, loc="upper right")
plt.ylabel("Proportion of time in bed (%)");

sns.despine()
plt.tight_layout()

*******

## Additional analyses 

### Stage transition and confidence

Here, we use the PSG consensus hypnogram to define the transitions between stages.

In [None]:
# One single-row dataframe per night, concatenated after the loop
rows = []

for sub in tqdm(df['subj'].unique(), leave=False):
    df_sub = df[df['subj'] == sub]
    dataset = df_sub['dataset'].iloc[0]
    yt, yp = df_sub['cons'], df_sub['yasa']
    n = yt.size

    # Stable epochs: the three previous and the three next epochs all have the
    # same consensus stage as the current one. The hypnogram is padded with
    # its first epoch (backward shifts) or its last epoch (forward shifts).
    first_ep, last_ep = yt.iloc[0], yt.iloc[-1]
    stable = np.logical_and.reduce([
        yt.shift(lag, fill_value=first_ep if lag > 0 else last_ep) == yt
        for lag in (1, -1, 2, -2, 3, -3)
    ])
    trans = ~stable

    # Unanimous epochs: all the human scorers gave the same stage
    consensus = (df_sub[cols_scorer].nunique(axis=1) == 1).to_numpy()
    nocons = ~consensus

    # Epochs for which the confidence of the algorithm is at least 0.8
    highconf = (df_sub['confidence'] >= 0.8).to_numpy()
    lowconf = ~highconf

    # All the values below are proportions in [0, 1]
    sub_scores = {
        # Stage transition
        'p_stable': stable.sum() / n,
        'p_trans': trans.sum() / n,
        'p_consensus': consensus.sum() / n,
        'p_nocons': nocons.sum() / n,

        'p_stable_and_consensus': (stable & consensus).sum() / n,
        'p_stable_and_nocons': (stable & nocons).sum() / n,
        'p_trans_and_consensus': (trans & consensus).sum() / n,
        'p_trans_and_nocons': (trans & nocons).sum() / n,
        'acc_stable': skm.accuracy_score(yt[stable], yp[stable]),
        'acc_trans': skm.accuracy_score(yt[trans], yp[trans]),

        # Confidence
        'accuracy': skm.accuracy_score(yt, yp),
        'avg_confidence': df_sub['confidence'].mean(),
        'p_highconf': highconf.sum() / n,
        'p_lowconf': lowconf.sum() / n,
        'p_highconf_and_consensus': (highconf & consensus).sum() / n,
        'p_highconf_and_nocons': (highconf & nocons).sum() / n,
        'p_lowconf_and_consensus': (lowconf & consensus).sum() / n,
        'p_lowconf_and_nocons': (lowconf & nocons).sum() / n,
        'acc_highconf': skm.accuracy_score(yt[highconf], yp[highconf]),
        'acc_lowconf': skm.accuracy_score(yt[lowconf], yp[lowconf]),
    }

    # Convert to percent and index the row by (subj, dataset)
    row_index = pd.MultiIndex.from_tuples([(sub, dataset)], names=["subj", "dataset"])
    rows.append(100 * pd.DataFrame(sub_scores, index=row_index))

df_trans = pd.concat(rows).sort_index(axis=1)
df_trans.round(3)

In [None]:
# Accuracy x Stage transition
# NOTE(review): both columns hold one value per night of the same nights, yet
# the test is run with paired=False (independent samples) — confirm that this
# is intended.
display(df_trans[['acc_stable', 'acc_trans']].apply(mean_std))
pg.ttest(df_trans['acc_stable'], df_trans['acc_trans'], paired=False).round(3)

In [None]:
# Accuracy x Confidence
# NOTE(review): both columns hold one value per night of the same nights, yet
# the test is run with paired=False (independent samples) — confirm that this
# is intended.
display(df_trans[['acc_highconf', 'acc_lowconf']].apply(mean_std))
pg.ttest(df_trans['acc_highconf'], df_trans['acc_lowconf'], paired=False).round(3)

In [None]:
# Stage transition x unanimous consensus
# NOTE(review): both columns hold one value per night of the same nights, yet
# the test is run with paired=False (independent samples) — confirm that this
# is intended.
display(df_trans[['p_stable_and_consensus', 'p_trans_and_consensus']].apply(mean_std))
pg.ttest(df_trans['p_stable_and_consensus'], df_trans['p_trans_and_consensus'], paired=False).round(3)

In [None]:
# Confidence x unanimous consensus
# NOTE(review): both columns hold one value per night of the same nights, yet
# the test is run with paired=False (independent samples) — confirm that this
# is intended.
display(df_trans[['p_highconf_and_consensus', 'p_lowconf_and_consensus']].apply(mean_std))
pg.ttest(df_trans['p_highconf_and_consensus'], df_trans['p_lowconf_and_consensus'], paired=False).round(3)

In [None]:
# Correlation % high confidence epochs vs % unanimous consensus epochs
# (one pair of values per night, all nights of both datasets pooled)
pg.corr(df_trans['p_highconf'], df_trans['p_consensus']).round(3)

In [None]:
# Average YASA confidence in sleep apnea vs healthy individuals
# Descriptive statistics per dataset, then a between-subject test across datasets
display(df_trans.groupby('dataset')['avg_confidence'].apply(mean_std))
df_trans.reset_index().pairwise_ttests(dv="avg_confidence", between="dataset")

In [None]:
# Percent high confidence in sleep apnea vs healthy individuals
# Descriptive statistics per dataset, then a between-subject test across datasets
display(df_trans.groupby('dataset')['p_highconf'].apply(mean_std))
df_trans.reset_index().pairwise_ttests(dv="p_highconf", between="dataset")

In [None]:
# Percent unanimous consensus in sleep apnea vs healthy individuals
# Descriptive statistics per dataset, then a between-subject test across datasets
display(df_trans.groupby('dataset')['p_consensus'].apply(mean_std))
df_trans.reset_index().pairwise_ttests(dv="p_consensus", between="dataset")

### Confidence x Accuracy

In [None]:
# Correlation between the average confidence and the accuracy, one point per DODH night
trans_dodh = df_trans.xs("dodh", level=-1)
corr_conf = trans_dodh.pairwise_corr(['avg_confidence', 'accuracy']).round(3)
display(corr_conf)

# Scatter plot with a linear fit (order=1) limited to the range of the data
fig, ax = plt.subplots(1, 1, figsize=(3.5, 3.5), dpi=100)
sns.regplot(
    data=trans_dodh, x="avg_confidence", y="accuracy", truncate=True, order=1,
    scatter_kws={"s": 20, "alpha": .2, "lw": 1},
    line_kws={"color": "k", "lw": 3},
    color=color_pred, ax=ax)
ax.set_xlim(60, 100)
ax.set_ylim(50, 100)
ax.set_xlabel("Average confidence")
ax.set_ylabel("Accuracy")

# Correlation coefficient, bottom-right corner of the axes
ax.annotate("r=%.2f" % corr_conf.loc[0, 'r'], (0.6, 0.1), xycoords="axes fraction", fontstyle="italic")
sns.despine()
plt.tight_layout()
plt.savefig(outdir + "cv_accuracy_confidence_DODH.png", dpi=300, bbox_inches="tight")

In [None]:
# Correlation between the average confidence and the accuracy, one point per DODO night
trans_dodo = df_trans.xs("dodo", level=-1)
corr_conf = trans_dodo.pairwise_corr(['avg_confidence', 'accuracy']).round(3)
display(corr_conf)

# Scatter plot with a linear fit (order=1) limited to the range of the data
fig, ax = plt.subplots(1, 1, figsize=(3.5, 3.5), dpi=100)
sns.regplot(
    data=trans_dodo, x="avg_confidence", y="accuracy", truncate=True, order=1,
    scatter_kws={"s": 20, "alpha": .2, "lw": 1},
    line_kws={"color": "k", "lw": 3},
    color=color_pred, ax=ax)
ax.set_xlim(60, 100)
ax.set_ylim(50, 100)
ax.set_xlabel("Average confidence")
ax.set_ylabel("Accuracy")

# Correlation coefficient, bottom-right corner of the axes
ax.annotate("r=%.2f" % corr_conf.loc[0, 'r'], (0.6, 0.1), xycoords="axes fraction", fontstyle="italic")
sns.despine()
plt.tight_layout()
plt.savefig(outdir + "cv_accuracy_confidence_DODO.png", dpi=300, bbox_inches="tight")

***
## Plot hypnogram

Ranked by YASA accuracy (from highest to lowest accuracy)

In [None]:
from matplotlib.backends.backend_pdf import PdfPages

# Change Seaborn style
sns.set(style="darkgrid", font_scale=1.2)

# Y-axis label of each hypnogram panel, keyed by column name
dic_ylabel = {
    'cons': "Consensus",
    "yasa": "YASA",
    'stephansen': "Stephansen 2018",
    "perslev": "Perslev 2021"}

for dataset in ['dodh', 'dodo']:
    # Nights of this dataset, from highest to lowest YASA accuracy
    order = (
        df_scores.xs((dataset, "yasa"), level=[1, 2])
                 .sort_values("accuracy", ascending=False)
                 .index.get_level_values(0).tolist()
    )

    # One multi-page PDF per dataset (one page per night). The context manager
    # finalizes and closes the file even if a figure fails midway, which would
    # otherwise leave an unreadable PDF behind.
    with PdfPages("output/plots/%s_hypnograms.pdf" % dataset) as pp:
        for subj in tqdm(order):
            df_subj = df[df['subj'] == subj].copy().replace(STR2NUM)
            t_hyp = np.arange(df_subj.shape[0]) / 120
            hypnos = df_subj[['cons', 'yasa', 'stephansen', 'perslev']].copy()
            # Reorder the numeric codes so that REM is plotted right below wake
            hypnos.replace({0: 0, 1: 2, 2: 3, 3: 4, 4: 1}, inplace=True)  # REM is now 1
            # Same table with everything but REM masked, to overlay REM in red
            hypnos_REM = hypnos.where(hypnos == 1)

            fig, axes = plt.subplots(nrows=4, figsize=(10, 10), sharex=True, sharey=True)
            plt.subplots_adjust(hspace=0.2)

            for i, ax in enumerate(axes):
                # Hypnogram in black, REM periods overlaid in red
                ax.step(t_hyp, -1 * hypnos.iloc[:, i], lw=1.5, color='k')
                ax.step(t_hyp, -1 * hypnos_REM.iloc[:, i], lw=1.5, color='tab:red')

                # Five stage levels only (no artefact / unscored level)
                ax.set_yticks([0, -1, -2, -3, -4])
                ax.set_yticklabels(['W', 'R', 'N1', 'N2', 'N3'])
                ax.set_ylim(-4.5, 0.5)

                ax.set_xlim(0, t_hyp.max())
                ax.xaxis.set_visible(False)
                ax.spines['right'].set_visible(False)
                ax.spines['top'].set_visible(False)
                ax.set_ylabel(dic_ylabel[hypnos.iloc[:, i].name], fontweight="bold")

                # Annotate accuracy against the consensus (first column)
                if i > 0:
                    acc = 100 * skm.accuracy_score(hypnos.iloc[:, 0], hypnos.iloc[:, i])
                    ax.annotate(
                        f"Accuracy = {acc:.2f}%", xy=(1, 0.1), xycoords="axes fraction",
                        ha="right", color="tab:blue", fontweight="bold")

            # Only the bottom panel shows the time axis
            axes[-1].xaxis.set_visible(True)
            axes[-1].set_xlabel("Time (hours)")

            axes[0].set_title(f"{subj}", fontweight="bold")
            plt.tight_layout()
            # Save and close this figure explicitly rather than relying on
            # matplotlib's implicit "current figure"
            pp.savefig(fig, dpi=300)
            plt.close(fig)