# MODEL TRAINING - testing the full approach - COGNITION
Models are saved in a separate folder (as XGBoost JSON files, with the experiment tables pickled alongside); storing them in the dataframe causes xgboost to crash.

**DO NOT RUN AGAIN**

In [1]:
#external libraries
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.colors as clt
import plotly
import plotly.subplots as sb
import plotly.express as px
import plotly.graph_objects as go
import dotenv
import pandas as pd
import scipy.fft as fft
import scipy.signal as sg
import scipy.io as sio
import pickle as pkl
import xgboost as xgb
import time
import uuid

#project library
from spinco import *

# Load environment variables (such as DATAPATH) from the local 'lab.env' file.
dotenv.load_dotenv('lab.env')

# Root data directory plus one sub-directory per dataset.
datapath = os.environ['DATAPATH']
cognipath = f"{datapath}/COGNITION"
dreamspath = f"{datapath}/DREAMS"
masspath = f"{datapath}/MASS"


## experiment id
We'll use it to name the folder created at the end of the script.

In [2]:
# Random UUID (as a string) that identifies this experiment run.
experimentId = str(uuid.uuid4())

## define a fixed samplerate

In [3]:
# Sampling rate (samples per second) used for every signal in this experiment.
samplerate = 200

## load cognition

In [4]:
# Load the COGNITION annotations table and the per-signal metadata table.
annotations, signalsMetadata = loadCOGNITIONSpindles(cognipath)

In [5]:
# Keep only the rows whose type is 'spindle' and renumber them from 0;
# the last line shows how many annotations remain.
annotations = annotations.loc[annotations["type"] == 'spindle'].reset_index(drop=True)
len(annotations)

2785

In [6]:
annotations.head()

Unnamed: 0,subjectId,channel,startTime,stopTime,duration,phase,labelerId,type,samplerate,startInd,stopInd
0,1,C3,1938.132297,1939.202326,1.070029,2,1,spindle,200,387626,387840
1,1,C3,2047.217898,2048.112834,0.894936,2,1,spindle,200,409444,409623
2,1,C3,2170.856032,2171.634237,0.778205,3,1,spindle,200,434171,434327
3,1,C3,2219.766538,2220.894943,1.128405,3,1,spindle,200,443953,444179
4,1,C3,2263.482492,2264.357974,0.875482,3,1,spindle,200,452696,452872


In [7]:
signalsMetadata.head()

Unnamed: 0,filename,channel,subjectId,duration,samplerate,isOriginalSamplerate,database
0,COG001_canal_C3.txt,C3-M2,1,39600,200,False,COGNITION
1,COG002_canal_C3.txt,C3-M2,2,39600,200,False,COGNITION
2,COG003_canal_C3.txt,C3-M2,3,39600,200,False,COGNITION
3,COG004_canal_C3.txt,C3-M2,4,39600,200,False,COGNITION
4,COG005_canal_C3.txt,C3-M2,5,39600,200,False,COGNITION


## define a fixed feature selection to use

In [8]:
# Load the complete feature-selection table, then keep the 60 features with
# the highest 'Times_selected' count, re-indexed from 0.
featureSelectionx = loadPickle("featureSelectionComplete_bootstrapppingR2.pkl")
featureSelection = (
    featureSelectionx
    .sort_values('Times_selected', ascending=False)
    .head(60)
    .copy()
    .reset_index(drop=True)
)
featureSelection

Unnamed: 0,window,characteristic,bandName,Times_selected
0,2.0,sigmaIndex,broadband,1000.0
1,2.0,hjortActivity,sigma,1000.0
2,1.5,sigmaIndex,broadband,1000.0
3,1.0,hjortActivity,sigma,1000.0
4,1.0,sigmaIndex,broadband,1000.0
5,2.0,hjortActivity,beta1,1000.0
6,2.0,hjortActivity,beta2,1000.0
7,1.5,hjortActivity,sigma,999.0
8,2.0,hjortMobility,beta1,994.0
9,2.0,hjortActivity,delta2,986.0


## load data split
LOOCV with 7 subjects

In [9]:
# For now we use fixed, precomputed CV data splits
# (train / val / test subjects per fold) instead.
dataSplits = loadPickle("dataSplits_LOOCV.pkl")
dataSplits

Unnamed: 0,train,val,test
0,"[0004, 0005, 0006, 0007]","[0003, 0008]",2
1,"[0002, 0005, 0006, 0008]","[0007, 0004]",3
2,"[0002, 0003, 0005, 0007]","[0006, 0008]",4
3,"[0002, 0003, 0007, 0008]","[0006, 0004]",5
4,"[0002, 0003, 0007, 0008]","[0004, 0005]",6
5,"[0002, 0004, 0005, 0008]","[0003, 0006]",7
6,"[0002, 0005, 0006, 0007]","[0003, 0004]",8


## define annotation criteria

In [10]:
np.unique(annotations.labelerId)

array(['0001'], dtype=object)

In [11]:
# A single annotation criterium ('E1') that uses only labeler '0001'.
annotationCriteria = pd.DataFrame([
    {
        'criteriumId': '0001',
        'criteriumName': 'E1',
        'labelerIdList': ['0001'],
    },
])

In [12]:
annotationCriteria

Unnamed: 0,criteriumId,criteriumName,labelerIdList
0,1,E1,[0001]


In [13]:
# Cross join: one row (one model to train) per (annotation criterium, data split) pair.
experimentModels = annotationCriteria.merge(dataSplits, how='cross')

In [14]:
experimentModels

Unnamed: 0,criteriumId,criteriumName,labelerIdList,train,val,test
0,1,E1,[0001],"[0004, 0005, 0006, 0007]","[0003, 0008]",2
1,1,E1,[0001],"[0002, 0005, 0006, 0008]","[0007, 0004]",3
2,1,E1,[0001],"[0002, 0003, 0005, 0007]","[0006, 0008]",4
3,1,E1,[0001],"[0002, 0003, 0007, 0008]","[0006, 0004]",5
4,1,E1,[0001],"[0002, 0003, 0007, 0008]","[0004, 0005]",6
5,1,E1,[0001],"[0002, 0004, 0005, 0008]","[0003, 0006]",7
6,1,E1,[0001],"[0002, 0005, 0006, 0007]","[0003, 0004]",8


In [16]:
# Sanity check: print the training subjects of every row (fold).
for trainSubjects in experimentModels["train"]:
    print(trainSubjects)

['0004' '0005' '0006' '0007']
['0002' '0005' '0006' '0008']
['0002' '0003' '0005' '0007']
['0002' '0003' '0007' '0008']
['0002' '0003' '0007' '0008']
['0002' '0004' '0005' '0008']
['0002' '0005' '0006' '0007']


In [23]:
from typing import List, Callable
class Iterator(xgb.DataIter):
    """Feed XGBoost one entry of ``paths`` at a time while it builds a ``DMatrix``.

    Each entry is wrapped in a one-element list and handed to
    ``loadFeatureMatrix`` / ``loadLabelsVector``.  The notebook globals
    ``featureSelection``, ``signalsMetadata``, ``samplerate``, ``datapath`` and
    ``usedAnnotations`` are looked up when ``next`` runs, so they must be
    defined before XGBoost consumes the iterator.
    """

    def __init__(self, paths: List[str]):
        self._paths = paths
        self._it = 0
        # XGBoost writes its cache files in the current directory, using the
        # prefix "cache".
        super().__init__(cache_prefix=os.path.join(".", "cache"))

    def next(self, input_data: Callable):
        """Pass the next batch to XGBoost.

        XGBoost calls this during the construction of ``DMatrix``;
        ``input_data`` is a callback supplied by XGBoost that has the same
        signature as ``DMatrix``.

        Returns 1 after a batch has been passed on, or 0 once every entry has
        been consumed.
        """
        if self._it == len(self._paths):
            # Nothing left: 0 signals the end of the iteration to XGBoost.
            return 0

        current = self._paths[self._it]
        # Features first, then the matching labels, for the current entry only.
        features = loadFeatureMatrix(
            [current], featureSelection, signalsMetadata, samplerate, datapath
        )
        labels = loadLabelsVector(
            [current], usedAnnotations, signalsMetadata, samplerate
        )
        input_data(data=features, label=labels)

        self._it += 1
        # 1 tells XGBoost there may be more batches to fetch.
        return 1

    def reset(self):
        """Rewind the iterator to its first entry."""
        self._it = 0

## model fitting

In [24]:
n_jobs = 4  # parallelization parameter for xgboost (keep it at 8 maximum)
learning_rate = 0.4
subsample = 0.6

num_boost_round = 60  # number of boosting rounds per model

# The booster hyperparameters are identical for every row, so build them once
# instead of on each loop iteration.
params = {
    'n_jobs': n_jobs,
    'learning_rate': learning_rate,
    'subsample': subsample,
    'objective': 'binary:logistic',
}

models = []
modelIds = []
valF1s = []
spindleTimeRates = []
for index, row in experimentModels.iterrows():
    print('**********************')
    print(f"{index + 1} of {len(experimentModels)}")

    # Annotation criterium: keep only the annotations made by this row's labelers.
    # NOTE: Iterator.next() reads the global `usedAnnotations`, so it has to be
    # bound before the DMatrix below is constructed.
    usedAnnotations = annotations[annotations.labelerId.isin(row.labelerIdList)].reset_index(drop=True)

    # Train the model, feeding the training subjects through the iterator.
    it = Iterator(row.train)
    trainDMatrix = xgb.DMatrix(it)
    xgb_model = xgb.train(params, trainDMatrix, num_boost_round=num_boost_round)

    # Mean of the training label vector (presumably the fraction of samples
    # labelled as spindle -- assumes 0/1 labels). np.sum does the reduction in
    # native code; the builtin sum() would loop over every sample in Python.
    trainLabels = loadLabelsVector(row.train, usedAnnotations, signalsMetadata, samplerate)
    spindleTimeRates.append(np.sum(trainLabels) / len(trainLabels))

    models.append(xgb_model)
    modelIds.append(str(uuid.uuid4()))
    # Drop the per-row objects before the next iteration allocates new ones.
    del usedAnnotations, xgb_model, trainDMatrix, trainLabels


**********************
1 of 7
0005
**********************
2 of 7
0005
**********************
3 of 7
0003
**********************
4 of 7
0003
**********************
5 of 7
0003
**********************
6 of 7
0004
**********************
7 of 7
0005


In [25]:
# Attach each model's id and its training-set spindle rate to the matching row.
experimentModels["modelId"] = modelIds
experimentModels["spindleTimeRate"] = spindleTimeRates

In [26]:
experimentModels

Unnamed: 0,criteriumId,criteriumName,labelerIdList,train,val,test,modelId,spindleTimeRate
0,1,E1,[0001],"[0004, 0005, 0006, 0007]","[0003, 0008]",2,ec118284-7c28-475b-b08d-30da2b97e296,0.008075
1,1,E1,[0001],"[0002, 0005, 0006, 0008]","[0007, 0004]",3,75f2d56f-da8c-4186-bc13-14d7f1ffcbf3,0.005
2,1,E1,[0001],"[0002, 0003, 0005, 0007]","[0006, 0008]",4,1a143af3-f4b7-4371-8f1f-946f611f4c6b,0.012035
3,1,E1,[0001],"[0002, 0003, 0007, 0008]","[0006, 0004]",5,de7f05e3-81bd-45d8-aa1e-26befd15ee4d,0.01034
4,1,E1,[0001],"[0002, 0003, 0007, 0008]","[0004, 0005]",6,026134de-dbae-4d74-bc43-5b0497d8d12a,0.01034
5,1,E1,[0001],"[0002, 0004, 0005, 0008]","[0003, 0006]",7,76a84bb4-8341-48d0-8e8c-fed47301eade,0.006673
6,1,E1,[0001],"[0002, 0005, 0006, 0007]","[0003, 0004]",8,f1bfe900-989a-4460-a8a9-32617cc85c64,0.006822


## save results

In [27]:
# Create the experiment folder. makedirs also creates a missing "experiments"
# parent folder, and it still raises FileExistsError if this experiment's
# folder already exists, so an earlier run is not written over.
experimentpath = datapath + "/experiments/" + experimentId
os.makedirs(experimentpath)
# Save each model as a JSON file named after its model id.
# (The loop variable is not called `id`, which would shadow the builtin.)
for modelId, model in zip(modelIds, models):
    model.save_model(f"{experimentpath}/{modelId}.json")
# Save the experiment tables needed by the later parts of the experiment.
dumpPickle(experimentpath + "/experimentModels.pkl", experimentModels)
dumpPickle(experimentpath + "/featureSelection.pkl", featureSelection)

In [28]:
# The experiment id is needed in the following parts of the experiment
# (it is the name of the folder the results were saved in).
print(experimentId)

b3bcc156-7b29-4721-806e-ed0dbf71de6b
