In [1]:
import glob
import os

In [9]:
# List every recording (.edf) and scoring (.rml) file found below the Stanford MWT root

root_folder = "..\\Stanford_MWT\\"
for current_dir, _, filenames in os.walk(root_folder):
    # str.endswith accepts a tuple, so both extensions are tested in one call
    wanted = [name for name in filenames if name.endswith((".edf", ".rml"))]
    for name in wanted:
        print(os.path.join(current_dir, name))

..\Stanford_MWT\Originals\NT1MWT01.edf
..\Stanford_MWT\Originals\NT1MWT02.edf
..\Stanford_MWT\Originals\NT1MWT04.edf
..\Stanford_MWT\Originals\NT1MWT05.edf
..\Stanford_MWT\Originals\NT1MWT06.edf
..\Stanford_MWT\Originals\NT2MWT01.edf
..\Stanford_MWT\Originals\NT2MWT02.edf
..\Stanford_MWT\Scored\Kelvin\NT1MWT01\00000023-Edf-192.168.22.37\00000023-Edf-192.168.22.37.rml
..\Stanford_MWT\Scored\Kelvin\NT1MWT01\00000023-Edf-192.168.22.37\00000023-Edf-192.168.22.37[001]-T.edf
..\Stanford_MWT\Scored\Kelvin\NT1MWT01\00000023-Edf-192.168.22.37\00000023-Edf-192.168.22.37[001].edf
..\Stanford_MWT\Scored\Kelvin\NT2MWT01\00000034-Edf-10.1.12.219\00000034-Edf-10.1.12.219.rml
..\Stanford_MWT\Scored\Kelvin\NT2MWT01\00000034-Edf-10.1.12.219\00000034-Edf-10.1.12.219[001]-T.edf
..\Stanford_MWT\Scored\Margie\NT1MWT01\00000041-Edf-192.168.1.9.rml
..\Stanford_MWT\Scored\Margie\NT1MWT01\00000041-Edf-192.168.1.9[001]-T.edf
..\Stanford_MWT\Scored\Margie\NT1MWT02\00000047-Edf-192.168.1.12.rml
..\Stanford_MWT\Sco

In [10]:
import numpy as np
import pandas as pd

# Group folders and files into original edfs, scored edfs, and event files (.rml)

original_folder = os.path.join(root_folder, "Originals")
# Build the glob pattern with os.path.join instead of a hard-coded separator
original_files = np.array(glob.glob(os.path.join(original_folder, "*.edf")))

scored_folder = os.path.join(root_folder, "Scored")
# Search only below the scored folder: the scorer name is later derived from the
# path relative to `scored_folder`, so .rml files elsewhere would yield garbage.
scored_files = [os.path.join(r, x)
                for r, _, files in os.walk(scored_folder)
                for x in files if x.endswith(".rml")]
scored_edfs = [os.path.join(r, x)
               for r, _, files in os.walk(scored_folder)
               for x in files if x.endswith(".edf")]

# Create a mapping data frame between files and subject id.
# Only .edf entries define a recording id; other files in the folder are ignored.
ids = sorted(os.path.splitext(x)[0]
             for x in os.listdir(original_folder)
             if x.endswith(".edf"))


# For each recording id, collect every scored file together with the original
# file path, the scored file path, and the name of the scorer.
info = []
for rec_id in ids:

    # Get the original file (exact match on the file stem, not a substring match,
    # so ids that are prefixes of other ids cannot collide)
    original = [x for x in original_files
                if os.path.splitext(os.path.basename(x))[0] == rec_id]
    assert len(original) == 1, f"Expected exactly one original file for {rec_id}, found {len(original)}"
    original = original[0]

    # Find scored files pertaining to this id (the id appears somewhere in the path)
    scored = [x for x in scored_files if rec_id in x]

    # For each scored file get the scorer name and store the essential info.
    # The scorer is the path segment between the scored folder and the id.
    for s in scored:
        scorer = s.split(scored_folder)[-1].split(rec_id)[0].strip('\\')

        info.append({
            "id": rec_id,
            "original_path": original,
            "scored_path": s,
            "scorer": scorer
        })

# Explicit columns keep the frame well-formed even when nothing was found
file_df = pd.DataFrame(info, columns=["id", "original_path", "scored_path", "scorer"])

In [16]:
import xml.etree.ElementTree as ET

def get_events_from_rml(rml_file, event_mapping = {"Arousal": "MS", "Bruxism": "W_alpha"}) -> pd.DataFrame:
    """Parse an RML (XML) scoring file and return its ``Neuro`` events.

    Parameters
    ----------
    rml_file : path or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.
    event_mapping : dict
        Maps event ``Type`` values to the labels used in the returned
        ``description`` column. Types not in the mapping are kept unchanged.
        The default dict is only read, never mutated.

    Returns
    -------
    pd.DataFrame
        One row per event whose ``Family`` attribute is ``"Neuro"``, with the
        columns ``description``, ``onset`` and ``duration``. The frame is empty
        (but still has these columns) when no such event exists.

    Raises
    ------
    ValueError
        If the file has no ``ScoringData`` element or no ``Events`` element
        inside it.
    """
    columns = ["description", "onset", "duration"]

    root = ET.parse(rml_file).getroot()

    # ElementTree spells namespaced tags as "{uri}Tag"; keep the "{uri}" part
    # so child tags can be built. Without a namespace the prefix is empty.
    namespace = root.tag[: root.tag.index("}") + 1] if root.tag.startswith("{") else ""

    scoring_root = root.find(f"{namespace}ScoringData")
    if scoring_root is None:
        raise ValueError(f"No ScoringData element found in {rml_file!r}")

    # The first child whose tag mentions "Events" holds the scored events
    event_root = next((child for child in scoring_root if "Events" in child.tag), None)
    if event_root is None:
        raise ValueError(f"No Events element found in the ScoringData of {rml_file!r}")

    # Keep only Neuro-family events; events lacking a Family attribute are skipped
    scored_events = [
        {
            "description": attrs["Type"],
            "onset": float(attrs["Start"]),
            "duration": float(attrs["Duration"]),
        }
        for attrs in (event.attrib for event in event_root)
        if attrs.get("Family") == "Neuro"
    ]

    # Explicit columns/dtypes so an event-free file still yields a usable frame
    out = pd.DataFrame(scored_events, columns=columns).astype({"onset": float, "duration": float})
    out["description"] = out["description"].replace(event_mapping)

    return out


In [40]:
import mne
from IPython.display import clear_output

# Set to True to write each annotated, channel-reduced recording to disk via export_edf
export = False

# Accumulates the annotation tables of all recordings, tagged with scorer and id
all_df = pd.DataFrame()

def export_edf(raw, info, root_folder = "..\\Stanford_MWT\\"):
    """Export a recording to ``<root_folder>/Parsed/<scorer>_<id>.edf``.

    Parameters
    ----------
    raw : object
        The recording passed on to ``mne.export.export_raw``.
    info : object
        Must expose ``scorer`` and ``id`` attributes; both are used to build
        the export file name.
    root_folder : str
        Folder under which the ``Parsed`` export folder is created.

    Notes
    -----
    If the target file already exists the user is asked interactively whether
    to overwrite it; any answer other than ``y`` skips the export. Genuine
    export errors are no longer mistaken for "file exists" and propagate.
    """
    # Folder (created together with any missing parents; no error if present)
    export_folder = os.path.join(root_folder, "Parsed")
    os.makedirs(export_folder, exist_ok=True)

    export_filename = f"{info.scorer}_{info.id}.edf"
    export_path = os.path.join(export_folder, export_filename)

    # Ask before clobbering an existing export instead of catching every exception
    overwrite = False
    if os.path.exists(export_path):
        answer = input(f"Do you wish to overwrite {export_path} [y/n]:")
        if answer.strip().lower() != "y":
            return
        overwrite = True

    mne.export.export_raw(export_path, raw, overwrite=overwrite)
            
# Channels retained in the exported recording (loop-invariant, so defined once)
keep_channels = ['M2:F3',
                 'M1:F4',
                 'M2:C3',
                 'M1:C4',
                 'M1:O2',
                 'M2:O1',
                 'Chin',
                 'E1:M2',
                 'E2:M1']

# Per-recording annotation tables; concatenated once after the loop instead of
# growing `all_df` on every iteration (which copies the frame each time)
score_frames = []

for _, tmp_rec in file_df.iterrows():

    print(f"Scorer: {tmp_rec.scorer}\t\tID: {tmp_rec.id}")

    # Read the events from the scored file
    events = get_events_from_rml(tmp_rec.scored_path)
    print(events.groupby("description").duration.describe())

    # Read the raw edf
    raw_original = mne.io.read_raw_edf(tmp_rec.original_path)

    # Append the scored events as annotations to the original recording
    for ev in events.itertuples(index=False):
        raw_original.annotations.append(onset=ev.onset, duration=ev.duration, description=ev.description)

    # Tag the annotation table with its scorer and recording id
    tmp_scores = raw_original.annotations.to_data_frame()
    tmp_scores['scorer'] = tmp_rec.scorer
    tmp_scores['id'] = tmp_rec.id
    score_frames.append(tmp_scores)

    # Drops the text printed above so only the latest state stays visible
    clear_output(wait=False)

    # Reduce channels: compute the full drop list first and drop in one call,
    # rather than dropping one channel at a time while iterating the channel list
    unwanted_channels = [ch for ch in raw_original.ch_names if ch not in keep_channels]
    if unwanted_channels:
        raw_original.drop_channels(unwanted_channels)

    # Export
    if export:
        export_edf(raw_original, tmp_rec)

all_df = pd.concat([all_df, *score_frames])

In [41]:
# Summary statistics of the "MS" annotations, per recording id and scorer
(
    all_df
    .loc[all_df["description"] == "MS"]
    .groupby(["id", "scorer"])
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,duration,duration,duration,duration,duration,duration,duration,duration
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
id,scorer,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
NT1MWT01,Kelvin,52.0,9.278846,3.501279,3.5,6.0,9.5,12.0,15.0
NT1MWT01,Margie,7.0,9.285714,4.008919,4.0,5.75,11.0,12.5,13.5
NT1MWT01,Sheila,42.0,8.083333,3.334807,3.0,5.0,7.75,11.25,14.5
NT1MWT02,Margie,4.0,12.75,3.227486,8.5,11.125,13.5,15.125,15.5
NT1MWT02,Sheila,23.0,8.195652,3.353218,3.5,5.25,8.5,10.5,14.5
NT1MWT04,Margie,26.0,9.807692,3.328894,5.0,7.125,9.5,12.0,20.0
NT1MWT04,Sheila,60.0,7.825,3.068187,3.0,5.0,7.5,10.0,14.5
NT1MWT05,Margie,26.0,8.711538,2.867658,4.0,6.25,8.0,11.0,15.0
NT1MWT05,Sheila,130.0,7.292308,3.008892,3.0,5.0,6.5,9.5,14.0
NT1MWT06,Margie,23.0,9.891304,7.92573,3.0,3.0,7.0,14.25,28.0


In [38]:
# NOTE(review): this cell repeats the "MS" summary of the previous cell and carries
# an earlier execution count; it looks like a leftover from out-of-order execution.
(
    all_df
    .loc[all_df["description"] == "MS"]
    .groupby(["id", "scorer"])
    .describe()
)

id
