https://github.com/Dreem-Organization/dreem-learning-evaluation

In [None]:
import os
import mne
import json
import glob
import yasa
import h5py
import pyedflib
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
from preprocessing import consensus_scores

# Name of the Dreem Open Dataset subset processed by this notebook; it is interpolated into
# every input / output path below.
DATASET = 'dodo'

# Define paths
eeg_dir = "/Volumes/JAWA/PSG_DATASETS/dod/%s/h5/" % DATASET
hypno_dir = "/Volumes/JAWA/PSG_DATASETS/dod/%s/scores/" % DATASET
parent_dir = os.path.dirname(os.getcwd())
edf_output_dir = "/Volumes/JAWA/PSG_DATASETS/dod/%s/edf/" % DATASET

# Create the EDF output folder on first run. Only the leaf directory is created, so a missing
# parent folder still raises instead of being silently created.
if not os.path.isdir(edf_output_dir):
    os.mkdir(edf_output_dir)

# Scorers and stage mapping (defined once; used both to locate the score files and to convert
# the predicted stage labels to integers)
scorers = ["scorer_" + str(i) for i in [1, 2, 3, 4, 5]]
stage_mapping = {'W': 0, "N1": 1, "N2": 2, "N3": 3, "R": 4}

# YASA classifier
model = "eeg+eog+emg+demo"
path_to_model = parent_dir + '/output/classifiers/clf_%s_lgb_gbdt_custom.joblib' % model

# Sorted list of the .h5 recordings to process
all_files = sorted(glob.glob(eeg_dir + "*.h5"))
print(len(all_files), "files were found.")

In [None]:
# Per-recording loop: load the PSG signals and the human hypnograms, build the consensus
# hypnogram, run YASA, and collect one dataframe per recording (one row per 30-sec epoch).
df_pred = []

for f in tqdm(all_files):
    
    ###############################################################################################
    # LOAD AND PROCESS PSG DATA
    ###############################################################################################
    
    # The context manager guarantees that the HDF5 handle is released for every recording, even
    # if an exception is raised or the recording is skipped further down.
    with h5py.File(f, 'r') as h5:
        record_id = h5.attrs['record_id']
        # Depending on the h5py version and on how the attribute was stored, string attributes
        # are returned either as bytes or as str: only decode when needed.
        fname = record_id.decode('UTF-8') if isinstance(record_id, bytes) else str(record_id)
        # print(fname)
        desc = json.loads(h5.attrs['description'])
        # list(h5.attrs)
        sf = desc[0]['fs']  # = 250 Hz
        # [:] copies the signals into memory, so they remain usable after the file is closed
        eeg = h5["signals/eeg/C3_M2"][:]
        eog = h5["signals/eog/EOG1"][:]
        emg = h5["signals/emg/EMG"][:]
        # Check the unit
        # display(pd.DataFrame(np.vstack((eeg, eog, emg))).T.describe().round(2))
        # start_time is only needed by the (commented-out) EDF export below
        try:
            start_time = datetime.fromtimestamp(h5.attrs['start_time'])
        except KeyError:
            start_time = None

        # Export to EDF for validation against other algorithms (apply only once)
        # NOTE: kept inside the `with` block because it reads from the open h5 file.
#         header = pyedflib.highlevel.make_header(patientname=subj, startdate=start_time)
#         signal_headers, edf_data = [], []
#         for i, c in enumerate(desc):
#             # Skip ECG because of invalid physical min/max
#             if c['name'] == "ECG":
#                 continue
#             # print(c['name'], h5[c['path']][:].min(), h5[c['path']][:].max())
#             if c['name'] == "EOG":
#                 c['name'] = c['path'].split('/')[-1]  # EOG1 / EOG2
#             signal_headers.append(
#                 dict(
#                     label=c['name'], dimension='uV', sample_rate=c['fs'], physical_min=-2000.0, 
#                     physical_max=2000.0, digital_min=-32768, digital_max=32767, transducer="", 
#                     prefilter=""))
#             edf_data.append(h5[c['path']][:])

#         pyedflib.highlevel.write_edf(
#             edf_file=edf_output_dir + subj + ".edf",
#             signals=edf_data, 
#             signal_headers=signal_headers,
#             header=header)

    subj = fname.split('-')[0]
    # Number of 30-sec epochs in the EEG (float: not necessarily a whole number)
    n_epochs_eeg = eeg.size / sf / 30
    
    ###############################################################################################
    # LOAD AND PROCESS HYPNOGRAMS
    ###############################################################################################
    
    # Load the hypnogram of each scorer and stack them into a 2D array (n_scorers, n_epochs).
    # `available_scorers` keeps track of which scorer each row belongs to, so that a missing
    # score file does not shift the scorer labels.
    hypnos, available_scorers = [], []
    for s in scorers:
        hyp_file = hypno_dir + s + "/" + fname + ".json"
        if not os.path.isfile(hyp_file):
            continue
        with open(hyp_file, "r") as fp:
            hypnos.append(json.load(fp))
        available_scorers.append(s)
    if not hypnos:
        # No human scoring at all for this recording
        continue
    hypnos = np.vstack(hypnos)
    
    # Crop to TIB: drop every epoch that at least one scorer marked as -1
    if hypnos.shape[1] != n_epochs_eeg:
        idx_outside_tib = (hypnos == -1).any(0)
        hypnos = hypnos[:, ~idx_outside_tib]

    # Check that size matches and pad if needed
    if hypnos.shape[1] != n_epochs_eeg:
        print(fname, "HYPNO AND EEG DO NOT MATCH!", hypnos.shape[1], n_epochs_eeg)
        to_pad = int(n_epochs_eeg - hypnos.shape[1])
        # if to_pad > 2:
            # If the EEG and hypno differ by more than one minute in length, skip subject
            # The reason is that we don't know which direction we should pad (before / after)?
            # print("SKIPPING SUBJECT")
            # continue
        # If it's only 1 or 2 epochs, we just repeat the value at the end
        # NOTE(review): np.pad raises a ValueError when to_pad is negative, i.e. when the
        # hypnogram is longer than the EEG.
        hypnos = np.pad(hypnos, [(0, 0), (0, to_pad)], mode="edge")
    else:
        to_pad = 0
        
    # Replace -1 by zero in hypnogram
    # TODO: Should we just remove these instead?
    hypnos[hypnos == -1] = 0
    
    # Create consensus score (Guillot et al. 2020)
    #   To merge multiple sleep stagings into a single consensus sleep staging, we simply take the 
    #   majority vote on each 30-second epoch. When a tie occurs on a specific epoch, we take the 
    #   sleep stage scored by the most reliable scorer, i.e. the one with the highest agreement
    #   with all the other scorers.
    df_hypnos = pd.DataFrame(dict(zip(available_scorers, hypnos)))
    # Pairwise accuracy between scorers, averaged per scorer, best scorer first
    scorer_rank = (
        df_hypnos
        .corr(accuracy_score)
        .mean()
        .sort_values(ascending=False)
        .index.tolist())
    # Row of `hypnos` that belongs to the most reliable scorer
    idx_best_hypno = available_scorers.index(scorer_rank[0])
    hyp_cons = consensus_scores(hypnos, idx_best_hypno)
    
    ###############################################################################################
    # APPLY YASA
    ###############################################################################################
    
    # Convert PSG data to a Raw array, keeping only a subset of channels
    info = mne.create_info(ch_names=['C3_M2', 'EOG1', 'EMG'], 
                           sfreq=sf, ch_types=['eeg', 'eog', 'emg'])
    # Scale by 1e-6: presumably the H5 signals are in uV and are converted to Volts here
    data = np.vstack((eeg, eog, emg)) / 1e6
    raw = mne.io.RawArray(data, info, verbose=False)
    # Predict sleep stages and confidence
    metadata = dict(age=46, male=1)  # Average demographic data from Guillot 2020
    sls = yasa.SleepStaging(raw, eeg_name="C3_M2", eog_name="EOG1", emg_name="EMG", metadata=metadata)
    hyp_pred = pd.Series(sls.predict(path_to_model)).map(stage_mapping).to_numpy()
    assert hyp_pred.size == hyp_cons.size
    proba = sls.predict_proba(path_to_model)
    # Confidence = highest stage probability of each epoch
    confidence = proba.max(1).to_numpy()
    
    # Add predictions to dataframe
    df_hypnos['cons'] = hyp_cons
    df_hypnos['yasa'] = hyp_pred
    df_hypnos['confidence'] = confidence
    df_hypnos['subj'] = subj
    df_hypnos['dataset'] = DATASET
    df_hypnos['pad'] = to_pad
    df_hypnos.index.name = "epoch"
    df_pred.append(df_hypnos.reset_index())
    
# Stack all recordings into a single dataframe indexed by (dataset, subj, epoch)
df_pred = pd.concat(df_pred, ignore_index=True).set_index(["dataset", "subj", "epoch"])
df_pred.round(2)

In [None]:
# Save the per-epoch predictions to a CSV file whose location depends on the model and on the
# dataset name
csv_rel_path = "/output/cv/%s/pred_dreem_%s.csv" % (model, DATASET)
out_file = parent_dir + csv_rel_path
df_pred.to_csv(out_file)