# Demographics of the NSRR training/testing sets

In [None]:
import os
import glob
import numpy as np
import pandas as pd
import seaborn as sns
import pingouin as pg
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
sns.set(style="ticks", font_scale=1.25)

from helper_functions import mean_std

# Define paths
# Both paths keep their trailing slash because file names are appended with `+`.
# Directory holding the demographics table (demo_nsrr_all.csv is read from here).
wdir_demo = "output/demo/"
# Directory where figures are saved (e.g. hist_age.png).
outdir = "output/plots/"

## Data loading

In [None]:
# Load the main demographics table for all NSRR recordings
df_demo = pd.read_csv(f"{wdir_demo}demo_nsrr_all.csv")

# Table dimensions, then the number of rows assigned to each set
print(df_demo.shape, df_demo["set"].value_counts(), sep="\n")
df_demo.head()

### Training set

In [None]:
# Only the `age` column is read from the feature file; reset_index() then
# exposes the index levels as regular columns.
# NOTE(review): `subj` is presumably one of those index levels - confirm.
df_training = pd.read_parquet(
    "output/features/features_all.parquet",
    columns=["age"],
).reset_index()

print(f"{df_training['subj'].nunique()} subjects")
print(df_training.shape)
df_training.head().round(2)

### Testing set

In [None]:
# Choose model
model = "eeg+eog+emg+demo"

# glob.glob returns matches in arbitrary (filesystem-dependent) order: sort
# them so that the concatenated rows come out in the same order on every run.
feat_files = sorted(glob.glob("output/cv/%s/cv_loo_nsrr_*.parquet" % model))
if not feat_files:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError(
        "No cv_loo_nsrr_*.parquet file found in output/cv/%s/" % model)

# Stack the content of all matching cross-validation files
df_testing = pd.concat([pd.read_parquet(f) for f in feat_files])

# Subject IDs as strings and dataset names in upper case
# NOTE(review): presumably to match the formatting used in df_demo - confirm.
df_testing['subj'] = df_testing['subj'].astype(str)
df_testing['dataset'] = df_testing['dataset'].str.upper()

print(df_testing['subj'].nunique(), 'subjects')
print(df_testing.shape)
df_testing.head().round(2)

In [None]:
# Subjects that appear in the training set and in the testing set
idx_training = df_training["subj"].unique().tolist()
idx_testing = df_testing["subj"].unique().tolist()

# Restrict the demographics to subjects found in either set
df_demo = df_demo.loc[
    df_demo["subj"].isin([*idx_training, *idx_testing])
].reset_index(drop=True)

# Apnea severity from the AHI, using left-closed intervals (right=False):
# [0, 5) None, [5, 15) Mild, [15, 30) Moderate, [30, inf) Severe
df_demo["apnea"] = pd.cut(
    x=df_demo["ahi"],
    bins=[0, 5, 15, 30, np.inf],
    labels=["None", "Mild", "Moderate", "Severe"],
    right=False,
    include_lowest=True,
)

df_demo.round(2)

In [None]:
# Optional: remove HOMEPAP
# df_demo = df_demo[~df_demo['dataset'].isin(['HOMEPAP'])].reset_index(drop=True)

********

## Descriptive statistics

In [None]:
# Group the demographics by set once; the descriptive cells below reuse it
grp_set = df_demo.groupby(by="set")

### Continuous variables

In [None]:
# Number of nights per set (only rows with a non-missing age are counted)
grp_set["age"].agg("count")

In [None]:
# Per-set summary of each continuous variable, one row per variable
# NOTE(review): mean_std is a project helper, presumably formatting mean and SD - confirm.
table_desc = grp_set[["age", "bmi", "ahi"]].agg(mean_std).T

# Add the between-set comparison, taken from the first row of pingouin's output
for dv in table_desc.index:
    tt = df_demo.pairwise_ttests(dv=dv, between="set")
    for stat in ("T", "dof", "p-unc", "hedges"):
        table_desc.loc[dv, stat] = tt.loc[0, stat]

table_desc.round(3)

#### Age

In [None]:
# Age summary per set and dataset
display(
    df_demo.groupby(["set", "dataset"])["age"]
    .agg(["mean", "std", "min", "max"])
    .round(2)
)

# Side-by-side age histograms (10 equal-width bins from 0 to 100, shown as proportions)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4.5), sharey=True, sharex=True)
age_bins = np.linspace(0, 100, 11)
panels = ((ax1, "training", "Training set"), (ax2, "testing", "Testing set"))
for ax, set_name, title in panels:
    sns.histplot(df_demo.loc[df_demo["set"] == set_name, "age"],
                 bins=age_bins, stat="probability", alpha=0.95, ax=ax)
    ax.set_xlabel("Age (yrs)")
    ax.set_title(title)

# plt.* acts on the current axes; the x-axis is shared between panels (sharex=True)
plt.xlim(0, 100)
plt.xticks([0, 20, 40, 60, 80, 100])

sns.despine()

plt.savefig(outdir + "hist_age.png", dpi=300, bbox_inches="tight")

#### AHI

In [None]:
# AHI summary (mean, SD, min, max) per set and dataset
(
    df_demo.groupby(["set", "dataset"])["ahi"]
    .agg(["mean", "std", "min", "max"])
    .round(2)
)

### Categorical variables

In [None]:
def value_counts_chi2(dv, normalize=True):
    """Display the per-set distribution of `dv` and a chi-square test against `set`.

    Relies on the notebook-level `grp_set` (groupby object) and `df_demo`.

    Parameters
    ----------
    dv : str
        Name of the categorical column to describe.
    normalize : bool
        If True (default), show percentages within each set; otherwise show
        raw counts.

    Notes
    -----
    NaN values are dropped by chi2_independence (see pd.crosstab).
    Only the first row of the statistics table returned by pingouin is shown.
    """
    per_set = grp_set[dv]
    if not normalize:
        summary = per_set.value_counts(sort=False).round(5)
    else:
        # Proportions are rounded first, then scaled to percentages
        summary = 100 * per_set.value_counts(normalize=normalize, sort=False).round(5)
    display(summary)
    print("")

    # The third element of the returned tuple is the table of test statistics
    stats = pg.chi2_independence(df_demo, x=dv, y='set', correction=True)[2]
    display(stats.iloc[0, :])

In [None]:
# Sex: distribution within each set (%) and chi-square test against set
value_counts_chi2(dv="male")

In [None]:
# Ethnicity: distribution within each set (%) and chi-square test against set
value_counts_chi2(dv="ethnicity")

In [None]:
# Dataset: distribution within each set (%) and chi-square test against set
value_counts_chi2(dv="dataset")

In [None]:
# Absolute number of rows from each dataset within each set
grp_set["dataset"].value_counts(normalize=False)

In [None]:
# Apnea severity category: distribution within each set (%) and chi-square test
value_counts_chi2(dv="apnea")

In [None]:
# Insomnia: distribution within each set (%) and chi-square test against set
value_counts_chi2(dv="insomnia")

In [None]:
# Narcolepsy: distribution within each set (%) and chi-square test against set
value_counts_chi2(dv="narcolepsy")

In [None]:
# Depression: distribution within each set (%) and chi-square test against set
value_counts_chi2(dv="depression")

In [None]:
# Diabetes (column `diabete`): distribution within each set (%) and chi-square test
value_counts_chi2(dv="diabete")

In [None]:
# Hypertension: distribution within each set (%) and chi-square test against set
value_counts_chi2(dv="hypertension")