# Data Split
creating the data split for the experiment

In [2]:
#external libraries
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.colors as clt
import plotly
import plotly.subplots as sb
import plotly.express as px
import plotly.graph_objects as go
import dotenv
import pandas as pd
import scipy.fft as fft
import scipy.signal as sg
import scipy.io as sio
import pickle as pkl
import xgboost as xgb
import time
import uuid

#project library
from spinco import *

#environment variables
dotenv.load_dotenv('lab.env')

#project variables
datapath=os.environ['DATAPATH']
cognipath=datapath+"\\COGNITION"
dreamspath=datapath+"\\DREAMS"
masspath=datapath+"\\MASS"


## define a fixed samplerate

In [4]:
samplerate=200

## load mass

In [5]:
annotations, signalsMetadata = loadMASSSpindles(masspath,forceSamplerate=samplerate)

In [6]:
annotations.head()

Unnamed: 0,type,expert,subjectId,labelerId,startTime,duration,samplerate,stopTime,startInd,stopInd
0,kcomplex,E1,1,1,830.596676,0.699174,200,831.29585,166119,166259
1,kcomplex,E1,1,1,840.981316,0.492156,200,841.473472,168196,168295
2,kcomplex,E1,1,1,970.596678,0.578088,200,971.174766,194119,194235
3,kcomplex,E1,1,1,1049.772807,0.695268,200,1050.468075,209955,210094
4,kcomplex,E1,1,1,1077.231575,0.648396,200,1077.879971,215446,215576


In [7]:
signalsMetadata.head()

Unnamed: 0,subjectId,file,channel,duration,samplerate,isOriginalSamplerate,database
0,1,MASS_0001.pkl,C3-CLE,28956.0,200,False,MASS
1,2,MASS_0002.pkl,C3-CLE,35016.0,200,False,MASS
2,3,MASS_0003.pkl,C3-CLE,36760.0,200,False,MASS
3,4,MASS_0004.pkl,C3-CLE,28004.0,200,False,MASS
4,5,MASS_0005.pkl,C3-CLE,31244.0,200,False,MASS


## data splits definition
after Tapia and Estévez 2020, the paper of RED

1. Remove subjects with only one labeler

2. 4 subjects for testing: 0002, 0006, 0012, 0013

3. 10 fold CV with 8 training and 3 validation subjects

In [10]:
#define subjects used in the experiment
usedSubjects=np.array(signalsMetadata['subjectId'])
usedSubjects=np.setdiff1d(usedSubjects,['0004','0008','0015','0016'])
usedSubjects

array(['0001', '0002', '0003', '0005', '0006', '0007', '0009', '0010',
       '0011', '0012', '0013', '0014', '0017', '0018', '0019'],
      dtype=object)

In [11]:
valCount=3  #number of signals for validation (selected randomly)
testSubjects=np.array(['0002','0006','0012','0013'])

trainSplits=[]
valSplits=[]
testSplits=[]
for i in range(10):
    thisNoTest=np.setdiff1d(usedSubjects,testSubjects)
    thisVal=np.random.choice(thisNoTest,size=valCount,replace=False)
    thisTrain=np.setdiff1d(thisNoTest,thisVal)
    trainSplits.append(thisTrain)
    valSplits.append(thisVal)
    testSplits.append(testSubjects)

dataSplits=pd.DataFrame({
    'train':trainSplits,
    'val':valSplits,
    'test':testSplits
})

dataSplits

Unnamed: 0,train,val,test
0,"[0001, 0003, 0005, 0007, 0010, 0014, 0017, 0019]","[0011, 0018, 0009]","[0002, 0006, 0012, 0013]"
1,"[0001, 0007, 0009, 0010, 0014, 0017, 0018, 0019]","[0011, 0003, 0005]","[0002, 0006, 0012, 0013]"
2,"[0003, 0007, 0010, 0011, 0014, 0017, 0018, 0019]","[0009, 0005, 0001]","[0002, 0006, 0012, 0013]"
3,"[0003, 0005, 0007, 0009, 0010, 0011, 0017, 0019]","[0001, 0018, 0014]","[0002, 0006, 0012, 0013]"
4,"[0001, 0003, 0005, 0007, 0010, 0014, 0018, 0019]","[0009, 0011, 0017]","[0002, 0006, 0012, 0013]"
5,"[0001, 0003, 0007, 0009, 0014, 0017, 0018, 0019]","[0011, 0010, 0005]","[0002, 0006, 0012, 0013]"
6,"[0001, 0005, 0007, 0010, 0011, 0014, 0017, 0019]","[0018, 0003, 0009]","[0002, 0006, 0012, 0013]"
7,"[0001, 0003, 0005, 0007, 0009, 0014, 0018, 0019]","[0010, 0011, 0017]","[0002, 0006, 0012, 0013]"
8,"[0003, 0005, 0007, 0009, 0011, 0014, 0017, 0018]","[0001, 0019, 0010]","[0002, 0006, 0012, 0013]"
9,"[0001, 0003, 0005, 0007, 0009, 0010, 0011, 0018]","[0017, 0019, 0014]","[0002, 0006, 0012, 0013]"


In [15]:
dumpPickle("dataSplits_likeRED.pkl",dataSplits)
