# NSRR SHHS (Visit 1) feature extraction

https://sleepdata.org/datasets/shhs/pages/10-montage-and-sampling-rate-information-shhs1.md

https://sleepdata.org/datasets/shhs/variables/overall_shhs1

The EEG channel is C4-M1 (sf = 125 Hz). Careful: the EOG channels have a sampling rate of 50 Hz (Nyquist = 25 Hz), so the signals need to be resampled.

In [None]:
import os
import yasa
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from mne.io import read_raw_edf

# Locations of the SHHS files on the NSRR volume
root_dir = '/Volumes/NSRR/shhs/'
eeg_dir = root_dir + 'polysomnography/edfs/shhs1/'
hypno_dir = root_dir + 'polysomnography/annotations-events-profusion/shhs1/'

# CSV file holding the per-subject descriptive variables
desc_dir = root_dir + 'datasets/shhs1-dataset-0.14.0.csv'

usecols = ['nsrrid', 'visitnumber', 'gender', 'age_s1', 'overall_shhs1',
           'race', 'bmi_s1', 'ahi_a0h3', 'ethnicity']

df_subj = pd.read_csv(desc_dir, usecols=usecols)

# Rename columns to shorter names. Keys that are not present in the dataframe
# (e.g. 'gender1', which is not part of `usecols`) are ignored by `rename`.
df_subj = df_subj.rename(columns={'nsrrid': 'subj',
                                  'gender1': 'gender',
                                  'age_s1': 'age',
                                  'overall_shhs1': 'overall',
                                  'bmi_s1': 'bmi',
                                  'ahi_a0h3': 'ahi',
                                  })

# Recode race as strings. The result is assigned back to the column instead of
# calling `replace(..., inplace=True)` on `df_subj['race']`: that chained
# in-place form modifies a temporary object under pandas Copy-on-Write and
# would silently leave `df_subj` unchanged.
df_subj['race'] = df_subj['race'].replace(
    {1: 'caucasian', 2: 'african', 3: 'other'})
# Subjects flagged with ethnicity == 1 are labelled 'hispanic', whatever
# their race code was.
df_subj.loc[df_subj['ethnicity'] == 1, 'race'] = 'hispanic'
# The combined race/ethnicity column replaces the original 'ethnicity' column
df_subj = df_subj.drop(columns=['ethnicity'])
df_subj = df_subj.rename(columns={'race': 'ethnicity'})

# Keep only "Excellent" quality study (overall score of 6 or more).
# The number of removed subjects is derived from the same mask that is used
# for filtering, so rows with a missing score (dropped by the filter) are
# counted as well.
good_quality = df_subj['overall'] >= 6
print(int((~good_quality).sum()),
      'subjects with bad PSG data quality will be removed.')
# `.copy()` so that the column assignments below act on an independent frame
df_subj = df_subj[good_quality].copy()

# Binary sex indicator: 1 where gender == 1, 0 otherwise
df_subj['male'] = (df_subj['gender'] == 1).astype(int)

# Keep only first visit
df_subj = df_subj[df_subj['visitnumber'] == 1].copy()

# Convert the subject identifier to str (left-padded with zeros to at least
# 4 characters) and use it as index
df_subj['subj'] = df_subj['subj'].apply(lambda x: str(x).zfill(4))
df_subj = df_subj.set_index('subj')

# Export demographics of ALL remaining subjects to CSV file
# (done before the random subsampling below)
df_subj['dataset'] = 'SHHS'
df_subj.to_csv("/Volumes/JAWA/SHERLOCK/demo/demo_nsrr_shhs.csv")

# Keep only a random subset of 600 subjects to avoid dataset imbalance.
# The fixed `random_state` makes the selection reproducible.
df_subj = df_subj.sample(n=600, replace=False, random_state=42)

print(df_subj.shape[0], 'subjects remaining')
df_subj.head(10)

In [None]:
# Build the epoch-level feature table: for every selected subject, load the
# PSG recording and its hypnogram, trim long wake periods at both ends, apply
# the inclusion criteria and extract the YASA sleep-staging features.
feature_frames = []  # one features dataframe per included subject
include = ['EEG', 'EOG(L)', 'EMG']  # channels to keep: EEG, EOG, EMG (in order)
sf = 100  # sampling frequency (Hz) that every recording is resampled to

for sub in tqdm(df_subj.index):
    eeg_file = eeg_dir + 'shhs1-' + str(sub) + '.edf'
    hypno_file = hypno_dir + 'shhs1-' + str(sub) + '-profusion.xml'

    # LOAD EEG DATA
    # The file is opened a first time without preloading, only to list its
    # channel names, and then re-read while excluding every channel that is
    # not in `include`.
    try:
        raw = read_raw_edf(eeg_file, preload=False, verbose=0)
        raw = read_raw_edf(eeg_file, preload=True,
                           exclude=np.setdiff1d(raw.info['ch_names'], include),
                           verbose=0)
    except Exception as exc:
        # Best effort: skip subjects whose recording cannot be read.
        # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt
        # propagate so that this long-running loop can still be stopped.
        print(f"- Could not load EEG data ({type(exc).__name__}: {exc}).")
        continue

    # Resample all channels to a common sampling frequency
    # (no filtering is applied here)
    raw.resample(sf, npad="auto")

    # LOAD HYPNOGRAM
    hypno, sf_hyp = yasa.load_profusion_hypno(hypno_file)
    # Indices of all non-wake epochs (wake is coded as 0, see mapping below)
    sleep_idx = np.nonzero(hypno)[0]
    if sleep_idx.size == 0:
        # Without this guard, indexing the first/last sleep epoch would
        # raise an IndexError and abort the whole loop.
        print("- No sleep epochs in hypnogram.")
        continue
    # We keep up to 15 minutes before / after sleep.
    # The arithmetic below assumes 30-sec epochs, i.e. 2 epochs per minute.
    start_to_firstsleep_min = sleep_idx[0] / 2
    lastsleep = sleep_idx[-1]
    lastsleep_to_end_min = (len(hypno) - lastsleep) / 2
    tmin, tmax = 0, None  # must be in seconds
    if start_to_firstsleep_min > 15:
        tmin = (start_to_firstsleep_min - 15) * 60
    if lastsleep_to_end_min > 15:
        tmax = lastsleep * 30 + 15 * 60
    # Crop the recording and the hypnogram to the same time window
    raw.crop(tmin, tmax)
    if tmax is None:
        hypno = hypno[int(tmin / 60 * 2):]
    else:
        hypno = hypno[int(tmin / 60 * 2):int(tmax / 60 * 2)]
    # Hypno and data must have the same number of (complete) 30-sec epochs
    n_epochs = hypno.shape[0]
    if n_epochs != np.floor(raw.n_times / sf / 30):
        print("- Hypno and data size do not match.")
        continue

    # Convert hypnogram to str
    df_hypno = pd.Series(hypno)
    df_hypno.replace({0: 'W', 1: 'N1', 2: 'N2', 3: 'N3', 4: 'R'}, inplace=True)
    # Minutes spent in each stage (only used by the disabled criterion below)
    stage_min = df_hypno.value_counts(sort=False) / 2

    # INCLUSION CRITERIA
    # Hypnogram must include all stages (and no other value)
    if np.unique(hypno).tolist() != [0, 1, 2, 3, 4]:
        print("- Not all stages are present.")
        continue
    # If the duration is not between 4 to 12 hours, skip subject
    # (120 epochs of 30 sec = 1 hour)
    if not(4 < n_epochs / 120 < 12):
        print("- Recording too short/long.")
        continue
    # Requires at least 5 min of each stage (currently disabled)
    # if (stage_min < 5).any():
    #    print("- Not 5 min of each stage.")
    #    continue

    # EXTRACT FEATURES
    metadata = dict(age=df_subj.loc[sub, 'age'], male=df_subj.loc[sub, 'male'])
    sls = yasa.SleepStaging(raw, eeg_name=include[0], eog_name=include[1],
                            emg_name=include[2], metadata=metadata)

    # Index the features by (subject, epoch)
    features = sls.get_features().reset_index()
    features['subj'] = sub
    features.set_index(['subj', 'epoch'], inplace=True)

    # Add hypnogram (one stage label per epoch)
    features['stage'] = df_hypno.to_numpy()
    feature_frames.append(features)

# Stack all subjects into a single dataframe
df = pd.concat(feature_frames)

In [None]:
# Tag every epoch with the name of the dataset it comes from
df['dataset'] = 'shhs1'

# Store the dataset and stage labels as categorical columns
for col in ('dataset', 'stage'):
    df[col] = df[col].astype('category')

In [None]:
# Proportion of epochs in each sleep stage, sorted by frequency
stage_proportions = df['stage'].value_counts(sort=True, normalize=True)
stage_proportions

In [None]:
# Median value of the EEG IQR per stage.
# 'stage' is a categorical column, so `observed` is passed explicitly: this
# avoids the pandas FutureWarning about its changing default. Every category
# was inferred from the data and therefore occurs, so the result is the same.
df.groupby('stage', observed=True)['eeg_iqr'].median()

In [None]:
# Count the distinct values of the first index level (one per night)
night_ids = df.index.get_level_values(0)
night_ids.nunique()

In [None]:
# Write the complete features table to a Parquet file
features_file = "/Volumes/JAWA/SHERLOCK/features/features_nsrr_shhs1.parquet"
df.to_parquet(features_file)