# Process MASS data structure 2.0
get sleep stages

In [1]:
import os
import pandas as pd
import numpy as np
import pyedflib
import pickle as pkl
from plotly import express as px
from matplotlib import pyplot as plt

In [72]:
edfpath="D:\SpinCo\MASS\edfs"
annopath="D:\SpinCo\MASS\\annotations"
signalspath="D:\SpinCo\MASS\signals"
stagespath="D:\SpinCo\MASS\stages"

## Signals = Subjects -> SS
we are assuming 1:1 relation signal:subject from now on, so the concepts are merged

In [11]:
allfiles=os.listdir(edfpath)
subjectIds=list()
extensions=list()
types=list()
edfFiles=list()
for file in allfiles:
    aux1=file.split(".")
    if aux1[1]=="edf":
        aux2=aux1[0].split(" ")
        aux3=aux2[0].split("-")
        subjectIds.append(aux3[2])
        extensions.append(aux1[1])
        types.append(aux2[1])
        edfFiles.append(file)

edfsData=pd.DataFrame({
    "subjectId":subjectIds,
    "edfFile":edfFiles,
    "extension":extensions,
    "type":types
})

In [12]:
edfsData.head(5)

Unnamed: 0,subjectId,edfFile,extension,type
0,1,01-02-0001 Base.edf,edf,Base
1,1,01-02-0001 PSG.edf,edf,PSG
2,2,01-02-0002 Base.edf,edf,Base
3,2,01-02-0002 PSG.edf,edf,PSG
4,3,01-02-0003 Base.edf,edf,Base


In [13]:
ssStages=edfsData[edfsData.type=="Base"].reset_index(drop=True)
ssStages

Unnamed: 0,subjectId,edfFile,extension,type
0,1,01-02-0001 Base.edf,edf,Base
1,2,01-02-0002 Base.edf,edf,Base
2,3,01-02-0003 Base.edf,edf,Base
3,4,01-02-0004 Base.edf,edf,Base
4,5,01-02-0005 Base.edf,edf,Base
5,6,01-02-0006 Base.edf,edf,Base
6,7,01-02-0007 Base.edf,edf,Base
7,8,01-02-0008 Base.edf,edf,Base
8,9,01-02-0009 Base.edf,edf,Base
9,10,01-02-0010 Base.edf,edf,Base


In [65]:
subjectIds=list()
starts=list()
durations=list()
types=list()
comments=list()
files=list()
stages=list()
startTimes=list()

for ind, row in ssStages.iterrows():
    print("**********************************************")
    print(ind)
    print(row)
    fullPath=edfpath+"/"+row.edfFile
    signals, signal_headers, header = pyedflib.highlevel.read_edf(fullPath)
    thisStartTime=header['startdate']
    for anno in header['annotations']:
        subjectIds.append(row.subjectId)
        starts.append(anno[0])
        durations.append(anno[1])
        types.append('stage')
        comments.append(anno[2])
        startTimes.append(thisStartTime)
        if '?' in anno[2]:
            stages.append(np.nan)
        elif 'W' in anno[2]:
            stages.append(0)
        elif str(1) in anno[2]:
            stages.append(1)
        elif str(2) in anno[2]:
            stages.append(2)
        elif str(3) in anno[2]:
            stages.append(3)
        elif str(4) in anno[2]:
            stages.append(3)
        elif 'R' in anno[2]:
            stages.append(5)
        else:
            print("WARNING: you should never be here")
            break
            
    print("**********************************************")

""" #check
sum(np.array(samplerates)==np.array(aux))==len(samplerates) """

stageAnnotations=pd.DataFrame({
    'subjectId':subjectIds,
    'startTime':starts,
    'duration':durations,
    'type':types,
    'value':stages,
    'comment':comments,
    'fileDateTime':startTimes
})


**********************************************
0
subjectId                   0001
edfFile      01-02-0001 Base.edf
extension                    edf
type                        Base
Name: 0, dtype: object
**********************************************
**********************************************
1
subjectId                   0002
edfFile      01-02-0002 Base.edf
extension                    edf
type                        Base
Name: 1, dtype: object
**********************************************
**********************************************
2
subjectId                   0003
edfFile      01-02-0003 Base.edf
extension                    edf
type                        Base
Name: 2, dtype: object
**********************************************
**********************************************
3
subjectId                   0004
edfFile      01-02-0004 Base.edf
extension                    edf
type                        Base
Name: 3, dtype: object
*******************************************

### Sinchronize with signal start time

In [58]:
stageAnnotations.head(5)

Unnamed: 0,subjectId,start,duration,type,value,comment,fileDateTime
0,1,561.000009,19.99872,stage,,Sleep stage ?,2000-01-01 23:00:59
1,1,581.000009,20.002626,stage,,Sleep stage ?,2000-01-01 23:00:59
2,1,601.000009,19.99872,stage,,Sleep stage ?,2000-01-01 23:00:59
3,1,621.000009,19.99872,stage,,Sleep stage ?,2000-01-01 23:00:59
4,1,641.00001,20.002626,stage,,Sleep stage ?,2000-01-01 23:00:59


In [59]:
from EDFlib.edfreader import EDFreader #<-need this to load datetime of the signals

allfiles=os.listdir(edfpath)
subjectIds=list()
extensions=list()
types=list()
edfFiles=list()
startTimes=list()

for file in allfiles:
    aux1=file.split(".")
    if aux1[1]=="edf":
        aux2=aux1[0].split(" ")
        aux3=aux2[0].split("-")
        subjectIds.append(aux3[2])
        extensions.append(aux1[1])
        types.append(aux2[1])
        edfFiles.append(file)
        reader=EDFreader(edfpath+"/"+file)
        startTimes.append(reader.getStartDateTime())

edfsData=pd.DataFrame({
    "subjectId":subjectIds,
    "edfFile":edfFiles,
    "extension":extensions,
    "type":types,
    "fileDateTime":startTimes
})

In [61]:
ssMetadata=edfsData[edfsData.type=="PSG"].reset_index(drop=True)
ssMetadata

Unnamed: 0,subjectId,edfFile,extension,type,fileDateTime
0,1,01-02-0001 PSG.edf,edf,PSG,2000-01-01 23:00:59.241696
1,2,01-02-0002 PSG.edf,edf,PSG,2000-01-01 22:35:58.036561
2,3,01-02-0003 PSG.edf,edf,PSG,2000-01-01 00:33:32.658647
3,4,01-02-0004 PSG.edf,edf,PSG,2000-01-01 23:23:02.767035
4,5,01-02-0005 PSG.edf,edf,PSG,2000-01-01 22:22:11.859389
5,6,01-02-0006 PSG.edf,edf,PSG,2000-01-01 00:02:19.084621
6,7,01-02-0007 PSG.edf,edf,PSG,2000-01-01 23:17:16.949841
7,8,01-02-0008 PSG.edf,edf,PSG,2000-01-01 23:45:03.904196
8,9,01-02-0009 PSG.edf,edf,PSG,2000-01-01 23:48:54.761051
9,10,01-02-0010 PSG.edf,edf,PSG,2000-01-01 00:02:39.746518


In [67]:
#1. Merge with ssMetadata to have fileDateTime from the signal in each annotation
annotationsMerged=pd.merge(stageAnnotations,ssMetadata,on='subjectId',suffixes=['','_sig'])
annotationsMerged

Unnamed: 0,subjectId,startTime,duration,type,value,comment,fileDateTime,edfFile,extension,type_sig,fileDateTime_sig
0,0001,561.000009,19.998720,stage,,Sleep stage ?,2000-01-01 23:00:59,01-02-0001 PSG.edf,edf,PSG,2000-01-01 23:00:59.241696
1,0001,581.000009,20.002626,stage,,Sleep stage ?,2000-01-01 23:00:59,01-02-0001 PSG.edf,edf,PSG,2000-01-01 23:00:59.241696
2,0001,601.000009,19.998720,stage,,Sleep stage ?,2000-01-01 23:00:59,01-02-0001 PSG.edf,edf,PSG,2000-01-01 23:00:59.241696
3,0001,621.000009,19.998720,stage,,Sleep stage ?,2000-01-01 23:00:59,01-02-0001 PSG.edf,edf,PSG,2000-01-01 23:00:59.241696
4,0001,641.000010,20.002626,stage,,Sleep stage ?,2000-01-01 23:00:59,01-02-0001 PSG.edf,edf,PSG,2000-01-01 23:00:59.241696
...,...,...,...,...,...,...,...,...,...,...,...
27372,0019,26298.000362,20.002626,stage,,Sleep stage ?,2000-01-01 23:55:22,01-02-0019 PSG.edf,edf,PSG,2000-01-01 23:55:22.911189
27373,0019,26318.000362,19.998720,stage,,Sleep stage ?,2000-01-01 23:55:22,01-02-0019 PSG.edf,edf,PSG,2000-01-01 23:55:22.911189
27374,0019,26338.000363,19.998720,stage,,Sleep stage ?,2000-01-01 23:55:22,01-02-0019 PSG.edf,edf,PSG,2000-01-01 23:55:22.911189
27375,0019,26358.000363,20.002626,stage,,Sleep stage ?,2000-01-01 23:55:22,01-02-0019 PSG.edf,edf,PSG,2000-01-01 23:55:22.911189


In [68]:
#2. Operate using built-in datetime operations
annotationsMerged['annotationDelay']=annotationsMerged.apply(
    lambda row: (row.fileDateTime_sig-row.fileDateTime).total_seconds(),
    axis=1)
annotationsMerged[['type','subjectId','startTime','type','comment','annotationDelay']]

Unnamed: 0,type,subjectId,startTime,type.1,comment,annotationDelay
0,stage,0001,561.000009,stage,Sleep stage ?,0.241696
1,stage,0001,581.000009,stage,Sleep stage ?,0.241696
2,stage,0001,601.000009,stage,Sleep stage ?,0.241696
3,stage,0001,621.000009,stage,Sleep stage ?,0.241696
4,stage,0001,641.000010,stage,Sleep stage ?,0.241696
...,...,...,...,...,...,...
27372,stage,0019,26298.000362,stage,Sleep stage ?,0.911189
27373,stage,0019,26318.000362,stage,Sleep stage ?,0.911189
27374,stage,0019,26338.000363,stage,Sleep stage ?,0.911189
27375,stage,0019,26358.000363,stage,Sleep stage ?,0.911189


In [69]:
#3. Check that there is only one unique value for each combination of type_ann subjectId and labelerId ...
annotationsMerged[['type','subjectId','annotationDelay']].drop_duplicates().groupby(['subjectId']).describe()

#... in fact there is only one delay for each subjectId, the delay comes from the signal start time

Unnamed: 0_level_0,annotationDelay,annotationDelay,annotationDelay,annotationDelay,annotationDelay,annotationDelay,annotationDelay,annotationDelay
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
subjectId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,1.0,0.241696,,0.241696,0.241696,0.241696,0.241696,0.241696
2,1.0,0.036561,,0.036561,0.036561,0.036561,0.036561,0.036561
3,1.0,0.658647,,0.658647,0.658647,0.658647,0.658647,0.658647
4,1.0,0.767035,,0.767035,0.767035,0.767035,0.767035,0.767035
5,1.0,0.859389,,0.859389,0.859389,0.859389,0.859389,0.859389
6,1.0,0.084621,,0.084621,0.084621,0.084621,0.084621,0.084621
7,1.0,0.949841,,0.949841,0.949841,0.949841,0.949841,0.949841
8,1.0,0.904196,,0.904196,0.904196,0.904196,0.904196,0.904196
9,1.0,0.761051,,0.761051,0.761051,0.761051,0.761051,0.761051
10,1.0,0.746518,,0.746518,0.746518,0.746518,0.746518,0.746518


In [70]:
#4. Now that we are sure of what we do, discount the delay from all start times:
annotationsMerged['originalStartTime']=annotationsMerged['startTime']
annotationsMerged['startTime']=annotationsMerged['startTime']-annotationsMerged['annotationDelay']
annotationsMerged

Unnamed: 0,subjectId,startTime,duration,type,value,comment,fileDateTime,edfFile,extension,type_sig,fileDateTime_sig,annotationDelay,originalStartTime
0,0001,560.758313,19.998720,stage,,Sleep stage ?,2000-01-01 23:00:59,01-02-0001 PSG.edf,edf,PSG,2000-01-01 23:00:59.241696,0.241696,561.000009
1,0001,580.758313,20.002626,stage,,Sleep stage ?,2000-01-01 23:00:59,01-02-0001 PSG.edf,edf,PSG,2000-01-01 23:00:59.241696,0.241696,581.000009
2,0001,600.758313,19.998720,stage,,Sleep stage ?,2000-01-01 23:00:59,01-02-0001 PSG.edf,edf,PSG,2000-01-01 23:00:59.241696,0.241696,601.000009
3,0001,620.758313,19.998720,stage,,Sleep stage ?,2000-01-01 23:00:59,01-02-0001 PSG.edf,edf,PSG,2000-01-01 23:00:59.241696,0.241696,621.000009
4,0001,640.758314,20.002626,stage,,Sleep stage ?,2000-01-01 23:00:59,01-02-0001 PSG.edf,edf,PSG,2000-01-01 23:00:59.241696,0.241696,641.000010
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27372,0019,26297.089173,20.002626,stage,,Sleep stage ?,2000-01-01 23:55:22,01-02-0019 PSG.edf,edf,PSG,2000-01-01 23:55:22.911189,0.911189,26298.000362
27373,0019,26317.089173,19.998720,stage,,Sleep stage ?,2000-01-01 23:55:22,01-02-0019 PSG.edf,edf,PSG,2000-01-01 23:55:22.911189,0.911189,26318.000362
27374,0019,26337.089174,19.998720,stage,,Sleep stage ?,2000-01-01 23:55:22,01-02-0019 PSG.edf,edf,PSG,2000-01-01 23:55:22.911189,0.911189,26338.000363
27375,0019,26357.089174,20.002626,stage,,Sleep stage ?,2000-01-01 23:55:22,01-02-0019 PSG.edf,edf,PSG,2000-01-01 23:55:22.911189,0.911189,26358.000363


In [71]:
#5. clean the dataframe
cleanedAnnotations=annotationsMerged[['subjectId','type','startTime','duration','value','comment']]
cleanedAnnotations

Unnamed: 0,subjectId,type,startTime,duration,value,comment
0,0001,stage,560.758313,19.998720,,Sleep stage ?
1,0001,stage,580.758313,20.002626,,Sleep stage ?
2,0001,stage,600.758313,19.998720,,Sleep stage ?
3,0001,stage,620.758313,19.998720,,Sleep stage ?
4,0001,stage,640.758314,20.002626,,Sleep stage ?
...,...,...,...,...,...,...
27372,0019,stage,26297.089173,20.002626,,Sleep stage ?
27373,0019,stage,26317.089173,19.998720,,Sleep stage ?
27374,0019,stage,26337.089174,19.998720,,Sleep stage ?
27375,0019,stage,26357.089174,20.002626,,Sleep stage ?


In [73]:
#6. save a csv
cleanedAnnotations.to_csv(stagespath+"\\stages.csv",index=False)