# MODEL TRAINING - DEMO

In [1]:
#external libraries
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.colors as clt
import plotly
import plotly.subplots as sb
import plotly.express as px
import plotly.graph_objects as go
import dotenv
import pandas as pd
import scipy.fft as fft
import scipy.signal as sg
import scipy.io as sio
import pickle as pkl
import xgboost as xgb
import time
import uuid

#project library
from spinco import *

#environment variables: read them from the 'lab.env' file
dotenv.load_dotenv('lab.env')

#project variables: the current working directory is the base path passed to the loaders and used when saving results
demopath=os.getcwd()


## experiment id
A unique identifier for this run. It names the folder where the models and experiment information are saved at the end of the notebook.

In [2]:
#random UUID (version 4) rendered as text; identifies this training run
experimentId = f"{uuid.uuid4()}"

## define a fixed samplerate

In [3]:
#common sampling rate passed to the feature and label loaders (presumably in Hz; the loader output reports "resampling from ... to 200")
samplerate=200

## load dreams

In [4]:
#load the demo data; the loader receives demopath and returns the signals, the annotations table and the signals metadata table
signals, annotations, signalsMetadata = loadDREAMSSpindlesDemo(demopath)

SubjectId: 0001--------------
resampling from 100 to 200
SubjectId: 0003--------------
resampling from 50 to 200
SubjectId: 0006--------------
duration discrepancy, removing last 0.7 seconds


In [5]:
#keep only the rows whose type is 'spindle', renumber them, and show how many remain
isSpindle = annotations['type'] == 'spindle'
annotations = annotations[isSpindle].reset_index(drop=True)
len(annotations)

764

In [6]:
#duration bounds (both exclusive) for an annotation to be kept
minDuration = 0.3
maxDuration = 5

#apply both bounds in a single pass, renumber the rows, and show how many remain
durationInRange = (annotations.duration > minDuration) & (annotations.duration < maxDuration)
annotations = annotations[durationInRange].reset_index(drop=True)
len(annotations)

764

In [7]:
#preview the first rows of the filtered annotations
annotations.head()

Unnamed: 0,index,startTime,duration,channel,subjectId,labelerId,type,samplerate,stopTime,startInd,stopInd
0,0,282.24,0.72,C3-A1,1,1,spindle,200,282.96,56448,56592
1,1,311.72,1.54,C3-A1,1,1,spindle,200,313.26,62344,62652
2,2,340.28,0.72,C3-A1,1,1,spindle,200,341.0,68056,68200
3,3,366.83,0.65,C3-A1,1,1,spindle,200,367.48,73366,73496
4,4,373.74,0.5,C3-A1,1,1,spindle,200,374.24,74748,74848


In [8]:
#preview the first rows of the signals metadata
signalsMetadata.head()

Unnamed: 0,filename,channel,subjectId,duration,samplerate,database
0,excerpt1.txt,C3-A1,1,1800,200,DREAMS
1,excerpt2.txt,CZ-A1,2,1800,200,DREAMS
2,excerpt3.txt,C3-A1,3,1800,200,DREAMS
3,excerpt4.txt,CZ-A1,4,1800,200,DREAMS
4,excerpt5.txt,CZ-A1,5,1800,200,DREAMS


## define a fixed feature selection to use
36 is the number of features used in the article; change it to run additional tests.

In [9]:
#full table of candidate features, loaded from the pickle file
featureSelectionx = loadPickle("featureSelectionComplete_bootstrapppingR2.pkl")

#order by the 'Times_selected' column (highest first) and keep the first 36 rows
rankedFeatures = featureSelectionx.sort_values('Times_selected', ascending=False)
featureSelection = rankedFeatures.head(36).copy().reset_index(drop=True)
featureSelection

Unnamed: 0,window,characteristic,bandName,Times_selected
0,2.0,sigmaIndex,broadband,1000.0
1,2.0,hjortActivity,sigma,1000.0
2,1.5,sigmaIndex,broadband,1000.0
3,1.0,hjortActivity,sigma,1000.0
4,1.0,sigmaIndex,broadband,1000.0
5,2.0,hjortActivity,beta1,1000.0
6,2.0,hjortActivity,beta2,1000.0
7,1.5,hjortActivity,sigma,999.0
8,2.0,hjortMobility,beta1,994.0
9,2.0,hjortActivity,delta2,986.0


## load data split
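Leave-one-out cross-validation (LOOCV) over 5 subjects: each split holds out one subject for testing and one for validation, and trains on the remaining three.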

In [10]:
#for now we use fixed, previously saved CV data splits (loaded from a pickle file) instead of generating them here
dataSplits=loadPickle("dataSplits_LOOCV_5S.pkl")
dataSplits

Unnamed: 0,train,val,test
0,"[0002, 0003, 0006]",[0005],1
1,"[0001, 0003, 0005]",[0006],2
2,"[0001, 0002, 0006]",[0005],3
3,"[0001, 0002, 0006]",[0003],5
4,"[0002, 0003, 0005]",[0001],6


## define annotation criteria

In [11]:
#one row per annotation criterium: its id, its name, and the labelers whose annotations it uses
annotationCriteria = pd.DataFrame([
    {
        'criteriumId': '0001',
        'criteriumName': 'union',
        'labelerIdList': ['0001', '0002'],
    },
])

In [12]:
#display the annotation criteria table
annotationCriteria

Unnamed: 0,criteriumId,criteriumName,labelerIdList
0,1,union,"[0001, 0002]"


In [13]:
#cross join: one row (one model to train) per combination of annotation criterium and data split
experimentModels = annotationCriteria.merge(dataSplits, how='cross')

In [14]:
#display the table of models to train
experimentModels

Unnamed: 0,criteriumId,criteriumName,labelerIdList,train,val,test
0,1,union,"[0001, 0002]","[0002, 0003, 0006]",[0005],1
1,1,union,"[0001, 0002]","[0001, 0003, 0005]",[0006],2
2,1,union,"[0001, 0002]","[0001, 0002, 0006]",[0005],3
3,1,union,"[0001, 0002]","[0001, 0002, 0006]",[0003],5
4,1,union,"[0001, 0002]","[0002, 0003, 0005]",[0001],6


## model fitting

In [15]:
#hyperparameters shared by every model
n_jobs = 4  #parallelization parameter for xgboost (keep it 8 as maximum)
learning_rate = 0.4
subsample = 0.6
num_boost_round = 60  #number of boosting rounds per model

#per-model results, appended in the row order of experimentModels
#(valF1s is initialised here but not filled in this cell)
models, modelIds, valF1s, spindleTimeRates = [], [], [], []

totalModels = len(experimentModels)
for index, row in experimentModels.iterrows():
    print('**********************')
    print(f"{index+1} of {totalModels}")

    #annotation criterium: keep only the annotations made by the listed labelers
    labelerMask = annotations.labelerId.isin(row.labelerIdList)
    usedAnnotations = annotations[labelerMask].reset_index(drop=True)

    #features and labels for the training subjects of this split
    trainFeatures = loadFeatureMatrix(row.train, featureSelection, signalsMetadata, samplerate, demopath)
    trainLabels = loadLabelsVector(row.train, usedAnnotations, signalsMetadata, samplerate)

    #train the model
    params = dict(
        n_jobs=n_jobs,
        learning_rate=learning_rate,
        subsample=subsample,
        objective='binary:logistic',
    )
    trainDMatrix = xgb.DMatrix(data=trainFeatures, label=trainLabels)
    xgb_model = xgb.train(params, trainDMatrix, num_boost_round=num_boost_round)

    #record the label sum divided by the label count, the model, and a fresh id for it
    spindleTimeRates.append(sum(trainLabels) / len(trainLabels))
    models.append(xgb_model)
    modelIds.append(str(uuid.uuid4()))

**********************
1 of 5
**********************
2 of 5
**********************
3 of 5
**********************
4 of 5
**********************
5 of 5


In [16]:
#attach the generated model ids and the per-model spindle time rates as new columns
experimentModels = experimentModels.assign(
    modelId=modelIds,
    spindleTimeRate=spindleTimeRates,
)

In [17]:
#display the experiment table, now including modelId and spindleTimeRate
experimentModels

Unnamed: 0,criteriumId,criteriumName,labelerIdList,train,val,test,modelId,spindleTimeRate
0,1,union,"[0001, 0002]","[0002, 0003, 0006]",[0005],1,63acdb2a-2cda-4182-a04e-39cea4f2891e,0.042294
1,1,union,"[0001, 0002]","[0001, 0003, 0005]",[0006],2,ec0c0a53-857b-437e-906c-57e9027ea72b,0.051389
2,1,union,"[0001, 0002]","[0001, 0002, 0006]",[0005],3,09017763-6741-4a9b-996c-84c37fd0e52f,0.058704
3,1,union,"[0001, 0002]","[0001, 0002, 0006]",[0003],5,6e7273c0-5925-42d9-9eaf-bcbc5e1b5bd8,0.058704
4,1,union,"[0001, 0002]","[0002, 0003, 0005]",[0001],6,1f55c1c2-7355-43c3-9ff5-55b71f796218,0.040015


## save results

In [18]:
#create the experiment folder
#os.makedirs also creates any missing parent folder, and exist_ok=True lets this
#cell be re-run without raising FileExistsError
experimentpath=demopath+"/DREAMS/experiments/"+experimentId
os.makedirs(experimentpath,exist_ok=True)
#save each of the models as <modelId>.json
#(the loop variable is not called `id`, so the builtin id() is not shadowed for the rest of the session)
for modelId, model in zip(modelIds,models):
    model.save_model(experimentpath+"/"+modelId+".json")
#save experiment information: the models table and the feature selection used to train them
dumpPickle(experimentpath+"/experimentModels.pkl",experimentModels)
dumpPickle(experimentpath+"/featureSelection.pkl",featureSelection)

# COPY THE UUID BELOW TO THE NEXT NOTEBOOK

In [19]:
#print the experiment id, which is also the name of the folder the results were saved in
print(experimentId)

310d725f-07ef-4d0d-8d3a-779b06df7546
