# NSRR HomePAP Features Extraction

Note that we can only use the in-lab recordings because the at-home recordings do not have the necessary channels.

We also only use the "full" night recordings and not the split-lab recordings (during which the participants proceeded with CPAP titration).

**WARNING:** 

1) The channel labelled C4 is actually C4-FPZ (see the montage information linked below).

https://sleepdata.org/datasets/homepap/pages/montage-and-sampling-rate-information.md

In [None]:
import os
import os
import yasa
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from mne.io import read_raw_edf
from preprocessing import crop_hypno, extract_features

# Input locations on the NSRR volume: in-lab, full-night recordings only
root_dir = '/Volumes/NSRR/homepap/'
eeg_dir = f"{root_dir}polysomnography/edfs/lab/full/"
hypno_dir = f"{root_dir}polysomnography/annotations-events-profusion/lab/full/"

# Outputs live next to the directory the notebook is run from
parent_dir = os.path.dirname(os.getcwd())
out_dir = f"{parent_dir}/output/features/"

# Subject table restricted to the HomePAP training set, indexed by subject ID
df_subj = (
    pd.read_csv(f"{parent_dir}/output/demo/demo_nsrr_all.csv")
    .query("dataset == 'HOMEPAP' and set == 'training'")
    .set_index("subj")
)

In [None]:
# Per-subject feature extraction.
# For each subject of `df_subj`, load the in-lab full-night EDF, keep one EEG,
# one EOG and one EMG channel, resample, align the recording with its
# hypnogram, apply the inclusion criteria and collect the features in `df`.
df = []
sf = 100  # Sampling frequency (Hz) the recordings are resampled to

for sub in tqdm(df_subj.index):
    eeg_file = eeg_dir + 'homepap-lab-full-' + sub + '.edf'
    hypno_file = hypno_dir + 'homepap-lab-full-' + sub + '-profusion.xml'

    try:
        raw = read_raw_edf(eeg_file, preload=False, verbose=0)
        chan = raw.info['ch_names']
        # Try different combinations of channels
        # Do not delete! Channels have different names in HomePAP.
        # `[0]` raises IndexError when none of the candidate names is present.
        eeg_chan = np.intersect1d(chan, ['C4-M1', 'C4'])[0]
        loc_chan = np.intersect1d(chan, ['E1', 'E-1', 'L-EOG'])[0]
        emg_chan = np.intersect1d(chan, ['Lchin', 'LChin', 'Chin1-Chin2', 'EMG1', 'LCHIN'])[0]
        include = [eeg_chan, loc_chan, emg_chan]
        # Drop everything else before loading so only 3 channels are read
        raw.drop_channels(np.setdiff1d(raw.info['ch_names'], include))
        raw.load_data()
    except Exception as err:
        # Skip subjects whose file could not be read or whose channels were
        # not found. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt / SystemExit stop the loop as expected.
        print(f"- Skipping {sub}: {type(err).__name__}: {err}")
        continue

    # Resample and low-pass filter
    raw.resample(sf, npad="auto")

    # LOAD HYPNOGRAM
    hypno, sf_hyp = yasa.load_profusion_hypno(hypno_file)
    # We keep up to 15 minutes before / after sleep
    hypno, tmin, tmax = crop_hypno(hypno)
    # Crop EEG data
    raw.crop(tmin, tmax)

    # Hypno and data must have the same number of 30-sec epochs
    n_epochs = hypno.shape[0]
    if n_epochs != np.floor(raw.n_times / sf / 30):
        print("- Hypno and data size do not match.")
        continue

    # INCLUSION CRITERIA
    # Hypnogram must include all stages
    if np.unique(hypno).tolist() != [0, 1, 2, 3, 4]:
        print("- Not all stages are present.")
        continue
    # If the duration is not between 4 to 12 hours, skip subject
    # (120 epochs of 30 sec = 1 hour)
    if not (4 < n_epochs / 120 < 12):
        print("- Recording too short/long.")
        continue

    # Convert hypnogram to str (only needed for subjects that are kept)
    df_hypno = pd.Series(hypno).replace(
        {0: 'W', 1: 'N1', 2: 'N2', 3: 'N3', 4: 'R'})

    # EXTRACT FEATURES
    features = extract_features(df_subj, sub, raw, include)
    # Add hypnogram
    features['stage'] = df_hypno.to_numpy()
    df.append(features)

# Stack the per-subject feature tables into a single DataFrame
df = pd.concat(df)

In [None]:
# Tag every row with the name of the source dataset
df['dataset'] = 'homepap'

# Store the dataset and sleep-stage labels as categoricals
for col in ('dataset', 'stage'):
    df[col] = df[col].astype('category')

In [None]:
# Proportion of epochs in each sleep stage, most frequent first
df['stage'].value_counts(sort=True, normalize=True)

In [None]:
# Count the distinct values of the first index level (one per night)
df.index.get_level_values(level=0).nunique(dropna=True)

In [None]:
# Per-stage median of the `eeg_iqr` feature
df.groupby(by='stage')['eeg_iqr'].median()

In [None]:
# Write the full feature table to the output directory as Parquet
parquet_path = out_dir + "features_nsrr_homepap.parquet"
df.to_parquet(parquet_path)