# MODEL TRAINING - using N2 spindles, training with N2 data
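Models are saved to a separate experiment folder (one XGBoost JSON file per model) and the experiment metadata is pickled alongside them; storing the model objects inside the dataframe causes xgboost to crash.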

**DO NOT RUN AGAIN** — every run draws a new experiment id, retrains all the models and saves them in a new folder.

In [1]:
#external libraries
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.colors as clt
import plotly
import plotly.subplots as sb
import plotly.express as px
import plotly.graph_objects as go
import dotenv
import pandas as pd
import scipy.fft as fft
import scipy.signal as sg
import scipy.io as sio
import pickle as pkl
import xgboost as xgb
import time
import uuid

#project library
from spinco import *

#environment variables
#load_dotenv reads 'lab.env' so that DATAPATH is available in os.environ below
dotenv.load_dotenv('lab.env')

#project variables
#os.environ[...] raises KeyError when DATAPATH is not defined
datapath=os.environ['DATAPATH']
#one sub-folder per database; os.path.join picks the separator of the running OS
#instead of hard-coding the Windows-only "\\"
cognipath=os.path.join(datapath,"COGNITION")
dreamspath=os.path.join(datapath,"DREAMS")
masspath=os.path.join(datapath,"MASS")


## experiment id
we'll use it to create a folder at the end of the script

In [2]:
#random UUID4 identifying this run; it names the experiment folder created in the "save results" section
experimentId=str(uuid.uuid4())

## define a fixed samplerate

In [3]:
#samples per second assumed for every signal; used to convert times and durations into sample indexes
samplerate=256

## load mass

In [4]:
#load the MASS spindle annotations together with the metadata of each signal
#forceSamplerate=samplerate is deliberately not passed (an earlier variant of this call did pass it)
#NOTE(review): presumably unnecessary because the signals are already sampled at 256 - confirm
annotations, signalsMetadata = loadMASSSpindles(masspath,onlySpindlesFilteredN2=True)

In [5]:
#check with EDA results (not needed)
#prints the fraction of annotations kept relative to 33458
#NOTE(review): 33458 is presumably the total spindle count found in the EDA - confirm
print(len(annotations)/33458)

0.9797357881523103


In [6]:
annotations.head()

Unnamed: 0,type,expert,subjectId,labelerId,startTime,duration,samplerate,stopTime,startInd,stopInd
0,spindle,E1,1,1,888.327805,0.640579,256,888.968384,227412,227576
1,spindle,E1,1,1,905.758061,0.578094,256,906.336155,231874,232022
2,spindle,E1,1,1,917.731574,0.847603,256,918.579177,234939,235156
3,spindle,E1,1,1,922.078189,0.878845,256,922.957034,236052,236277
4,spindle,E1,1,1,939.055445,0.757767,256,939.813212,240398,240592


In [7]:
signalsMetadata.head()

Unnamed: 0,subjectId,file,channel,duration,samplerate,isOriginalSamplerate,database
0,1,MASS_0001.pkl,C3-CLE,28956.0,256,True,MASS
1,2,MASS_0002.pkl,C3-CLE,35016.0,256,True,MASS
2,3,MASS_0003.pkl,C3-CLE,36760.0,256,True,MASS
3,4,MASS_0004.pkl,C3-CLE,28004.0,256,True,MASS
4,5,MASS_0005.pkl,C3-CLE,31244.0,256,True,MASS


## define a fixed feature selection to use

In [8]:
#fixed set of features (characteristic, band, window) used by every model of the experiment
#the file name is relative, NOTE(review): presumably resolved against the working directory - confirm
#NOTE(review): loadPickle presumably unpickles the file; only load pickles from trusted sources
featureSelection=loadPickle("EXT_F1_iteration_7.pkl")
featureSelection

Unnamed: 0,characteristic,bandName,window
21,hjortActivity,sigma,0.5
66,petrosian,broadband,0.5
22,hjortActivity,theta,0.5
79,relativePower,beta1,0.5
29,hjortComplexity,sigma,0.5


## load data split
10 train/validation splits (8 training and 3 validation subjects each) that all share the same fixed set of 4 test subjects

In [9]:
#for now we use fixed, precomputed CV data splits instead of generating them here
#NOTE(review): loadPickle presumably unpickles the file; only load pickles from trusted sources
dataSplits=loadPickle("dataSplits_likeRED.pkl")
dataSplits

Unnamed: 0,train,val,test
0,"[0001, 0003, 0005, 0007, 0010, 0014, 0017, 0019]","[0011, 0018, 0009]","[0002, 0006, 0012, 0013]"
1,"[0001, 0007, 0009, 0010, 0014, 0017, 0018, 0019]","[0011, 0003, 0005]","[0002, 0006, 0012, 0013]"
2,"[0003, 0007, 0010, 0011, 0014, 0017, 0018, 0019]","[0009, 0005, 0001]","[0002, 0006, 0012, 0013]"
3,"[0003, 0005, 0007, 0009, 0010, 0011, 0017, 0019]","[0001, 0018, 0014]","[0002, 0006, 0012, 0013]"
4,"[0001, 0003, 0005, 0007, 0010, 0014, 0018, 0019]","[0009, 0011, 0017]","[0002, 0006, 0012, 0013]"
5,"[0001, 0003, 0007, 0009, 0014, 0017, 0018, 0019]","[0011, 0010, 0005]","[0002, 0006, 0012, 0013]"
6,"[0001, 0005, 0007, 0010, 0011, 0014, 0017, 0019]","[0018, 0003, 0009]","[0002, 0006, 0012, 0013]"
7,"[0001, 0003, 0005, 0007, 0009, 0014, 0018, 0019]","[0010, 0011, 0017]","[0002, 0006, 0012, 0013]"
8,"[0003, 0005, 0007, 0009, 0011, 0014, 0017, 0018]","[0001, 0019, 0010]","[0002, 0006, 0012, 0013]"
9,"[0001, 0003, 0005, 0007, 0009, 0010, 0011, 0018]","[0017, 0019, 0014]","[0002, 0006, 0012, 0013]"


## define annotation criteria
We want to train and test the models under three different annotation criteria: expert E1 only, expert E2 only, and the union of both experts.

In [10]:
#one row per annotation criterium: its id, a readable name and the labelers whose annotations it accepts
annotationCriteria=pd.DataFrame([
    {'criteriumId':'0001','criteriumName':'E1','labelerIdList':['0001']},
    {'criteriumId':'0002','criteriumName':'E2','labelerIdList':['0002']},
    {'criteriumId':'0003','criteriumName':'union','labelerIdList':['0001','0002']},
])

In [11]:
annotationCriteria

Unnamed: 0,criteriumId,criteriumName,labelerIdList
0,1,E1,[0001]
1,2,E2,[0002]
2,3,union,"[0001, 0002]"


In [12]:
#cartesian product: every annotation criterium is combined with every data split (one row per model to train)
experimentModels=annotationCriteria.merge(dataSplits,how='cross')

In [13]:
experimentModels

Unnamed: 0,criteriumId,criteriumName,labelerIdList,train,val,test
0,1,E1,[0001],"[0001, 0003, 0005, 0007, 0010, 0014, 0017, 0019]","[0011, 0018, 0009]","[0002, 0006, 0012, 0013]"
1,1,E1,[0001],"[0001, 0007, 0009, 0010, 0014, 0017, 0018, 0019]","[0011, 0003, 0005]","[0002, 0006, 0012, 0013]"
2,1,E1,[0001],"[0003, 0007, 0010, 0011, 0014, 0017, 0018, 0019]","[0009, 0005, 0001]","[0002, 0006, 0012, 0013]"
3,1,E1,[0001],"[0003, 0005, 0007, 0009, 0010, 0011, 0017, 0019]","[0001, 0018, 0014]","[0002, 0006, 0012, 0013]"
4,1,E1,[0001],"[0001, 0003, 0005, 0007, 0010, 0014, 0018, 0019]","[0009, 0011, 0017]","[0002, 0006, 0012, 0013]"
5,1,E1,[0001],"[0001, 0003, 0007, 0009, 0014, 0017, 0018, 0019]","[0011, 0010, 0005]","[0002, 0006, 0012, 0013]"
6,1,E1,[0001],"[0001, 0005, 0007, 0010, 0011, 0014, 0017, 0019]","[0018, 0003, 0009]","[0002, 0006, 0012, 0013]"
7,1,E1,[0001],"[0001, 0003, 0005, 0007, 0009, 0014, 0018, 0019]","[0010, 0011, 0017]","[0002, 0006, 0012, 0013]"
8,1,E1,[0001],"[0003, 0005, 0007, 0009, 0011, 0014, 0017, 0018]","[0001, 0019, 0010]","[0002, 0006, 0012, 0013]"
9,1,E1,[0001],"[0001, 0003, 0005, 0007, 0009, 0010, 0011, 0018]","[0017, 0019, 0014]","[0002, 0006, 0012, 0013]"


## load hypnograms

In [14]:
#sleep stage annotations of MASS
#the path is built with os.path.join: the former "\MASS\stages\stages.csv" literal relied on
#invalid escape sequences (\M, \s) and on the Windows separator
stagesAnnotations=pd.read_csv(os.path.join(masspath,"stages","stages.csv"))
stagesAnnotations['samplerate']=samplerate
#subject ids as zero-padded 4 character strings, e.g. 1 -> '0001'
stagesAnnotations['subjectId']=stagesAnnotations['subjectId'].astype(str).str.zfill(4)
#end of each stage in seconds
stagesAnnotations['stopTime']=stagesAnnotations['startTime']+stagesAnnotations['duration']
#start and stop converted to sample indexes; every row carries the same fixed samplerate
stagesAnnotations['startInd']=stagesAnnotations['startTime'].apply(
    lambda startTime: seconds2index(startTime,samplerate))
stagesAnnotations['stopInd']=stagesAnnotations['stopTime'].apply(
    lambda stopTime: seconds2index(stopTime,samplerate))
stagesAnnotations.head(5)

Unnamed: 0,subjectId,type,startTime,duration,value,comment,samplerate,stopTime,startInd,stopInd
0,1,stage,560.758313,19.99872,,Sleep stage ?,256,580.757033,143554,148674
1,1,stage,580.758313,20.002626,,Sleep stage ?,256,600.760939,148674,153795
2,1,stage,600.758313,19.99872,,Sleep stage ?,256,620.757033,153794,158914
3,1,stage,620.758313,19.99872,,Sleep stage ?,256,640.757033,158914,164034
4,1,stage,640.758314,20.002626,,Sleep stage ?,256,660.76094,164034,169155


In [15]:
#sample-wise hypnogram of every subject, keyed by subjectId
hypnograms={}
for _, signalRow in signalsMetadata.iterrows():
    subjectId=signalRow.subjectId
    excerptDimension=int(signalRow.duration*signalRow.samplerate)
    #samples not covered by any stage annotation stay as NaN
    thisHypnogram=np.full((excerptDimension,),np.nan)
    subjectStages=stagesAnnotations[stagesAnnotations.subjectId==subjectId]
    for stageStart, stageStop, stageValue in zip(
            subjectStages.startInd,subjectStages.stopInd,subjectStages.value):
        thisHypnogram[stageStart:stageStop]=stageValue
    hypnograms[subjectId]=thisHypnogram

    #check all our spindles are N2: zeroing the labels outside N2 must leave them unchanged
    thisAnnotations=annotations[annotations.subjectId==subjectId].reset_index(drop=True)
    thisLabels=excerptAnnotationsToLabels(thisAnnotations,excerptDimension)
    labelsInsideN2=thisLabels*(thisHypnogram==2)
    print("*************** "+subjectId)
    print(np.sum(labelsInsideN2==thisLabels)==excerptDimension)
    

*************** 0001
True
*************** 0002
True
*************** 0003
True
*************** 0004
True
*************** 0005
True
*************** 0006
True
*************** 0007
True
*************** 0008
True
*************** 0009
True
*************** 0010
True
*************** 0011
True
*************** 0012
True
*************** 0013
True
*************** 0014
True
*************** 0015
True
*************** 0016
True
*************** 0017
True
*************** 0018
True
*************** 0019
True


In [16]:
def getN2mask(subjectId,hypnograms):
    """Return the element-wise comparison of a subject's hypnogram against stage value 2 (N2).

    subjectId: key of the subject inside hypnograms
    hypnograms: mapping from subjectId to the hypnogram of that subject
    returns: the result of hypnogram==2 (a boolean array when the hypnogram is a numpy array;
             NaN samples compare as False)
    """
    subjectHypnogram=hypnograms[subjectId]
    return subjectHypnogram==2

In [17]:
def loadN2maskVector(subjectList,hypnograms,signalsMetadata,samplerate):
    """Concatenate the N2 masks of the selected subjects in a single boolean vector.

    subjectList: subject ids to include
    hypnograms: mapping from subjectId to the sample-wise hypnogram (numpy array) of the subject
    signalsMetadata: dataframe with, at least, the columns subjectId and duration
    samplerate: samples per second used to convert each duration into a number of samples
    returns: boolean numpy vector, True where the hypnogram value is 2; it is empty when
             no signal matches subjectList
    raises: KeyError if a selected subject has no hypnogram, ValueError if the length of a
            hypnogram does not match int(duration*samplerate)

    The subjects are concatenated in the row order of signalsMetadata, not in the order
    of subjectList.
    NOTE(review): presumably the same order used by loadFeatureMatrix and loadLabelsVector - confirm
    """
    # operate on the signal lengths of the subjects selected
    thisSignals=signalsMetadata[signalsMetadata.subjectId.isin(subjectList)]
    # computed straight from the column: a row-wise apply breaks when the selection is empty
    excerptDimensions=[int(duration*samplerate) for duration in thisSignals['duration']]
    # initialise vector of labels
    maskVector=np.zeros((sum(excerptDimensions),),dtype=bool)
    auxStartInd=0
    # iterate the signals selected, each one fills its own segment of the vector
    for subject, excerptDim in zip(thisSignals['subjectId'],excerptDimensions):
        # same comparison as getN2mask(subject,hypnograms)
        maskVector[auxStartInd:auxStartInd+excerptDim]=hypnograms[subject]==2
        auxStartInd=auxStartInd+excerptDim
    return maskVector

## model fitting

In [18]:
n_jobs=4 # parallelisation parameter for xgboost (keep it at 8 or below)
learning_rate=0.5
subsample=0.6

num_boost_round=30  #number of boosting rounds per model

#the booster parameters are the same for every model, so they are built only once
params={
    'n_jobs':n_jobs,
    'learning_rate':learning_rate,
    'subsample':subsample,
    'objective':'binary:logistic'
}

#one entry per row of experimentModels, in the same order
models=[]
modelIds=[]
spindleTimeRates=[]
#no validation features, labels or metrics are computed here: we won't evaluate in this notebook
for index, row in experimentModels.iterrows():
    print('**********************')
    print(str(index+1)+' of '+str(len(experimentModels)))
    #Define annotations criterium
    usedAnnotations=annotations[annotations.labelerId.isin(row.labelerIdList)].reset_index(drop=True)
    #Features
    trainFeatures=loadFeatureMatrix(row.train,featureSelection,signalsMetadata,samplerate,datapath)
    #Labels
    trainLabels=loadLabelsVector(row.train,usedAnnotations,signalsMetadata,samplerate)

    # Restrict training to N2 -------------------------------------------------------------->
    maskVector=loadN2maskVector(row.train,hypnograms,signalsMetadata,samplerate)
    trainFeatures=trainFeatures[maskVector,:]
    trainLabels=trainLabels[maskVector]
    #<---------------------------------------------------------------------------------------

    #Train the models
    trainDMatrix=xgb.DMatrix(data=trainFeatures,label=trainLabels)
    xgb_model = xgb.train(params,trainDMatrix,num_boost_round=num_boost_round)
    #fraction of the N2 training samples labeled as spindle
    #np.sum is used because the builtin sum iterates the numpy vector element by element
    spindleTimeRates.append(np.sum(trainLabels)/len(trainLabels))
    models.append(xgb_model)
    #random id, later used as the file name of the saved model
    modelIds.append(str(uuid.uuid4()))
    

**********************
1 of 30
**********************
2 of 30
**********************
3 of 30
**********************
4 of 30
**********************
5 of 30
**********************
6 of 30
**********************
7 of 30
**********************
8 of 30
**********************
9 of 30
**********************
10 of 30
**********************
11 of 30
**********************
12 of 30
**********************
13 of 30
**********************
14 of 30
**********************
15 of 30
**********************
16 of 30
**********************
17 of 30
**********************
18 of 30
**********************
19 of 30
**********************
20 of 30
**********************
21 of 30
**********************
22 of 30
**********************
23 of 30
**********************
24 of 30
**********************
25 of 30
**********************
26 of 30
**********************
27 of 30
**********************
28 of 30
**********************
29 of 30
**********************
30 of 30


In [19]:
#attach to every row the id of its trained model and the spindle rate of its training data
experimentModels=experimentModels.assign(
    modelId=modelIds,
    spindleTimeRate=spindleTimeRates,
)

In [20]:
experimentModels

Unnamed: 0,criteriumId,criteriumName,labelerIdList,train,val,test,modelId,spindleTimeRate
0,1,E1,[0001],"[0001, 0003, 0005, 0007, 0010, 0014, 0017, 0019]","[0011, 0018, 0009]","[0002, 0006, 0012, 0013]",c018bd0b-b31d-416f-852b-b9bcedeace69,0.034389
1,1,E1,[0001],"[0001, 0007, 0009, 0010, 0014, 0017, 0018, 0019]","[0011, 0003, 0005]","[0002, 0006, 0012, 0013]",c183426a-37bc-44b3-b756-8e38fc67ca74,0.044139
2,1,E1,[0001],"[0003, 0007, 0010, 0011, 0014, 0017, 0018, 0019]","[0009, 0005, 0001]","[0002, 0006, 0012, 0013]",012084d7-035c-484e-842a-66eecc38a8fd,0.038282
3,1,E1,[0001],"[0003, 0005, 0007, 0009, 0010, 0011, 0017, 0019]","[0001, 0018, 0014]","[0002, 0006, 0012, 0013]",de35ed40-282f-4c33-bf3e-50d6ab683bef,0.0338
4,1,E1,[0001],"[0001, 0003, 0005, 0007, 0010, 0014, 0018, 0019]","[0009, 0011, 0017]","[0002, 0006, 0012, 0013]",b14bfeb7-33dc-4cdd-bf0f-79b85b8d8372,0.040247
5,1,E1,[0001],"[0001, 0003, 0007, 0009, 0014, 0017, 0018, 0019]","[0011, 0010, 0005]","[0002, 0006, 0012, 0013]",49da3a3b-1822-4692-b398-418217e71dc0,0.040892
6,1,E1,[0001],"[0001, 0005, 0007, 0010, 0011, 0014, 0017, 0019]","[0018, 0003, 0009]","[0002, 0006, 0012, 0013]",1d1b374f-4f6f-42f1-b8f2-d9b8153111d9,0.037453
7,1,E1,[0001],"[0001, 0003, 0005, 0007, 0009, 0014, 0018, 0019]","[0010, 0011, 0017]","[0002, 0006, 0012, 0013]",5ddee1df-e627-4685-8cc7-7cc03f69af5f,0.040871
8,1,E1,[0001],"[0003, 0005, 0007, 0009, 0011, 0014, 0017, 0018]","[0001, 0019, 0010]","[0002, 0006, 0012, 0013]",f3ae2081-6b6d-426d-96ae-bf2f2db86098,0.040168
9,1,E1,[0001],"[0001, 0003, 0005, 0007, 0009, 0010, 0011, 0018]","[0017, 0019, 0014]","[0002, 0006, 0012, 0013]",17947e1b-05e6-4d88-91b8-50242078e358,0.045972


## save results

In [21]:
#create parent folder
#os.makedirs also creates the shared "experiments" folder the first time it is needed,
#and still raises FileExistsError if the folder of this experiment already exists
experimentpath=os.path.join(datapath,"experiments",experimentId)
os.makedirs(experimentpath)
#save each of the models as <modelId>.json
#the loop variable is named modelId to avoid shadowing the builtin id()
for modelId, model in zip(modelIds,models):
    model.save_model(os.path.join(experimentpath,modelId+".json"))
#save experiment information
dumpPickle(os.path.join(experimentpath,"experimentModels.pkl"),experimentModels)
dumpPickle(os.path.join(experimentpath,"featureSelection.pkl"),featureSelection)

In [22]:
#you need to use this id in the following parts of the experiment:
#it is the name of the folder where the models and the experiment information were saved
print(experimentId)

b1fb375f-42f7-47f2-99dc-dbe2d0864185
