# Hyperparameter optimization for E2 - using all spindles, training with all data
First, define the UUID of the experiment to use.

In [1]:
#UUID of the stored experiment; it is passed to loadExperiment and loadBooster further down
experimentId="a74605bf-c0bb-40f7-9b50-444123efb629"

-------------------------------------------------------

In [2]:
#external libraries
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.colors as clt
import plotly
import plotly.subplots as sb
import plotly.express as px
import plotly.graph_objects as go
import dotenv
import pandas as pd
import scipy.fft as fft
import scipy.signal as sg
import scipy.io as sio
import pickle as pkl
import xgboost as xgb
import time
import sklearn.metrics as skm

#project library
from spinco import *

#environment variables
dotenv.load_dotenv('lab.env')

#project variables
#DATAPATH has to be present in the environment; a missing value raises KeyError here
datapath=os.environ['DATAPATH']
#os.path.join uses the separator of the running platform instead of a hard-coded "\\",
#so the notebook is not tied to Windows paths
cognipath=os.path.join(datapath,"COGNITION")
dreamspath=os.path.join(datapath,"DREAMS")
masspath=os.path.join(datapath,"MASS")


## Load MASS

In [3]:
samplerate=200  #forced on the MASS loader and reused by the feature/label/processing helpers. Should rethink this

In [4]:
#load the MASS annotations and the per-signal metadata, forcing the common samplerate
annotations, signalsMetadata = loadMASSSpindles(masspath,forceSamplerate=samplerate)

In [5]:
#keep only 'spindle' annotations whose duration lies strictly between minDuration and maxDuration
#consider to include this in a function
minDuration=0.3
maxDuration=2.9
isSpindle=annotations.type=='spindle'
hasValidDuration=(annotations.duration>minDuration)&(annotations.duration<maxDuration)
annotations=annotations[isSpindle&hasValidDuration].reset_index(drop=True)

In [6]:
#check with EDA results (not needed): one minus the ratio of remaining annotations to 33458
#NOTE(review): 33458 is presumably the annotation count found in the EDA - confirm
print(1-len(annotations)/33458)

0.011297746428357902


In [7]:
#preview the first rows of the filtered annotations
annotations.head()

Unnamed: 0,type,expert,subjectId,labelerId,startTime,duration,samplerate,stopTime,startInd,stopInd
0,spindle,E1,1,1,888.327805,0.640579,200,888.968384,177666,177794
1,spindle,E1,1,1,905.758061,0.578094,200,906.336155,181152,181267
2,spindle,E1,1,1,917.731574,0.847603,200,918.579177,183546,183716
3,spindle,E1,1,1,922.078189,0.878845,200,922.957034,184416,184591
4,spindle,E1,1,1,939.055445,0.757767,200,939.813212,187811,187963


In [8]:
#preview the first rows of the signal metadata returned by the loader
signalsMetadata.head()

Unnamed: 0,subjectId,file,channel,duration,samplerate,isOriginalSamplerate,database
0,1,MASS_0001.pkl,C3-CLE,28956.0,200,False,MASS
1,2,MASS_0002.pkl,C3-CLE,35016.0,200,False,MASS
2,3,MASS_0003.pkl,C3-CLE,36760.0,200,False,MASS
3,4,MASS_0004.pkl,C3-CLE,28004.0,200,False,MASS
4,5,MASS_0005.pkl,C3-CLE,31244.0,200,False,MASS


## Load experiment results

In [9]:
#retrieve the table of models and the feature selection stored for this experiment
experimentModels, featureSelection = loadExperiment(experimentId,datapath)

In [10]:
#one row per model: annotation criterium, train/val/test subject lists and model UUID
experimentModels

Unnamed: 0,criteriumId,criteriumName,labelerIdList,train,val,test,modelId,spindleTimeRate
0,1,E1,[0001],"[0001, 0003, 0005, 0007, 0010, 0014, 0017, 0019]","[0011, 0018, 0009]","[0002, 0006, 0012, 0013]",afd523c8-e3bf-44e0-a0c7-85a7b0d747e4,0.016462
1,1,E1,[0001],"[0001, 0007, 0009, 0010, 0014, 0017, 0018, 0019]","[0011, 0003, 0005]","[0002, 0006, 0012, 0013]",d90313d0-46a5-4957-b4bd-1c2e66388a31,0.02328
2,1,E1,[0001],"[0003, 0007, 0010, 0011, 0014, 0017, 0018, 0019]","[0009, 0005, 0001]","[0002, 0006, 0012, 0013]",825f07e0-9a06-44aa-bb72-93cdd7d64ca2,0.018776
3,1,E1,[0001],"[0003, 0005, 0007, 0009, 0010, 0011, 0017, 0019]","[0001, 0018, 0014]","[0002, 0006, 0012, 0013]",0b4afad9-a6fb-43ae-8f49-a5d65994e109,0.016077
4,1,E1,[0001],"[0001, 0003, 0005, 0007, 0010, 0014, 0018, 0019]","[0009, 0011, 0017]","[0002, 0006, 0012, 0013]",bcaeb868-00a0-4038-bc73-fb5d86ea09ee,0.01895
5,1,E1,[0001],"[0001, 0003, 0007, 0009, 0014, 0017, 0018, 0019]","[0011, 0010, 0005]","[0002, 0006, 0012, 0013]",00a66953-1236-49a2-bbdc-78c361fa15f0,0.019888
6,1,E1,[0001],"[0001, 0005, 0007, 0010, 0011, 0014, 0017, 0019]","[0018, 0003, 0009]","[0002, 0006, 0012, 0013]",adcf962a-53df-4cb8-ae1f-7a94e2ad3e0c,0.019369
7,1,E1,[0001],"[0001, 0003, 0005, 0007, 0009, 0014, 0018, 0019]","[0010, 0011, 0017]","[0002, 0006, 0012, 0013]",21c95c6e-be0b-49bb-a43b-807238a1b12c,0.019076
8,1,E1,[0001],"[0003, 0005, 0007, 0009, 0011, 0014, 0017, 0018]","[0001, 0019, 0010]","[0002, 0006, 0012, 0013]",216f16f3-c2d8-4735-bf51-0daed207c25e,0.018514
9,1,E1,[0001],"[0001, 0003, 0005, 0007, 0009, 0010, 0011, 0018]","[0017, 0019, 0014]","[0002, 0006, 0012, 0013]",09cbec08-c8af-49fb-aabd-4702bef80265,0.020993


In [11]:
#we show the difference in class imbalance (spindleTimeRate) for the annotation criteria considered
experimentModels[['criteriumName','spindleTimeRate']].groupby('criteriumName').describe()

Unnamed: 0_level_0,spindleTimeRate,spindleTimeRate,spindleTimeRate,spindleTimeRate,spindleTimeRate,spindleTimeRate,spindleTimeRate,spindleTimeRate
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
criteriumName,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
E1,10.0,0.019138,0.002063,0.016077,0.018579,0.019013,0.019758,0.02328
E2,10.0,0.059377,0.004969,0.054294,0.056372,0.057708,0.06094,0.069221
union,10.0,0.060948,0.005032,0.055834,0.05806,0.059096,0.062695,0.071169


In [12]:
#selected features: characteristic, frequency band name and window
featureSelection

Unnamed: 0,characteristic,bandName,window
21,hjortActivity,sigma,0.5
66,petrosian,broadband,0.5
22,hjortActivity,theta,0.5
79,relativePower,beta1,0.5
29,hjortComplexity,sigma,0.5


## Optimal hyperparameter estimation with the E2 criterion
We search for the optimal prediction threshold and number of boosting iterations across the different validation groups.

In [13]:
#restrict the analysis to the models of the E2 criterium
isE2=experimentModels['criteriumName']=='E2'
experimentModels=experimentModels.loc[isE2].reset_index(drop=True)
experimentModels

Unnamed: 0,criteriumId,criteriumName,labelerIdList,train,val,test,modelId,spindleTimeRate
0,2,E2,[0002],"[0001, 0003, 0005, 0007, 0010, 0014, 0017, 0019]","[0011, 0018, 0009]","[0002, 0006, 0012, 0013]",1623c979-486c-4f45-ba54-dbbac3770588,0.057678
1,2,E2,[0002],"[0001, 0007, 0009, 0010, 0014, 0017, 0018, 0019]","[0011, 0003, 0005]","[0002, 0006, 0012, 0013]",f8abeec1-7d46-4573-8a01-7827b8584472,0.069221
2,2,E2,[0002],"[0003, 0007, 0010, 0011, 0014, 0017, 0018, 0019]","[0009, 0005, 0001]","[0002, 0006, 0012, 0013]",c556cd55-5f8b-420c-b6b2-0ca323bee5df,0.057084
3,2,E2,[0002],"[0003, 0005, 0007, 0009, 0010, 0011, 0017, 0019]","[0001, 0018, 0014]","[0002, 0006, 0012, 0013]",b9ca9323-c013-423a-8b12-a893e0a8be97,0.054695
4,2,E2,[0002],"[0001, 0003, 0005, 0007, 0010, 0014, 0018, 0019]","[0009, 0011, 0017]","[0002, 0006, 0012, 0013]",e31d0f92-1914-4bb5-9156-f9101fe1307c,0.057739
5,2,E2,[0002],"[0001, 0003, 0007, 0009, 0014, 0017, 0018, 0019]","[0011, 0010, 0005]","[0002, 0006, 0012, 0013]",6fd86953-d3aa-4c9c-8069-b22b951dce3f,0.058764
6,2,E2,[0002],"[0001, 0005, 0007, 0010, 0011, 0014, 0017, 0019]","[0018, 0003, 0009]","[0002, 0006, 0012, 0013]",87913851-d47b-4963-b2fa-1c65d7aec5a1,0.066493
7,2,E2,[0002],"[0001, 0003, 0005, 0007, 0009, 0014, 0018, 0019]","[0010, 0011, 0017]","[0002, 0006, 0012, 0013]",0f0da0b8-d512-47b2-808a-c92b1d073468,0.056135
8,2,E2,[0002],"[0003, 0005, 0007, 0009, 0011, 0014, 0017, 0018]","[0001, 0019, 0010]","[0002, 0006, 0012, 0013]",006d0918-7072-4365-953e-4b399728048d,0.054294
9,2,E2,[0002],"[0001, 0003, 0005, 0007, 0009, 0010, 0011, 0018]","[0017, 0019, 0014]","[0002, 0006, 0012, 0013]",ca54dd96-c49f-4680-a78d-2aabf3059e5c,0.061666


In [14]:
#candidate values of each hyperparameter
thresholdValues=[0.15,0.20,0.25,0.30,0.35,0.4]
treeValues=[10,15,20,25,30]
auxThres=pd.DataFrame({'hyperThres':thresholdValues})
auxTrees=pd.DataFrame({'hyperTrees':treeValues})

#cartesian product: one row per (threshold, number of trees) combination
hyperParams=auxThres.merge(auxTrees,how='cross')
hyperParams

Unnamed: 0,hyperThres,hyperTrees
0,0.15,10
1,0.15,15
2,0.15,20
3,0.15,25
4,0.15,30
5,0.2,10
6,0.2,15
7,0.2,20
8,0.2,25
9,0.2,30


In [15]:
hyperClose=0.25
hyperDuration=0.3

def binaryMetrics(predictedLabels,trueLabels):
    """Sample-wise F1, precision and recall of a binary prediction.

    Parameters
    ----------
    predictedLabels, trueLabels : array-like
        Vectors of the same length holding 0/1 or boolean values
        (assumed; this is what the multiplications below rely on).

    Returns
    -------
    tuple
        (f1, precision, recall). A ratio whose denominator is zero is not
        guarded: with numpy scalars it evaluates to nan and emits a warning.
    """
    tp=np.sum(predictedLabels*trueLabels)
    fp=np.sum(predictedLabels*(1-trueLabels))
    fn=np.sum((1-predictedLabels)*trueLabels)
    f1=2*tp/(2*tp+fp+fn)
    precision=tp/(tp+fp)
    recall=tp/(tp+fn)
    return f1,precision,recall

#per (hyperparameter row, model row): one list per metric with one entry per validation subject
subjectScores={
    (ind_hyper,ind):{'f1':[],'precision':[],'recall':[],'rawF1':[],'rawPrecision':[],'rawRecall':[]}
    for ind_hyper in hyperParams.index
    for ind in experimentModels.index
}

#Models, features and labels do not depend on the hyperparameters, so they are loaded once
#per model / validation subject and every hyperparameter combination is evaluated on them.
for ind,row in experimentModels.iterrows():
    #load model
    model=loadBooster(row.modelId,experimentId,datapath)
    #Define annotations criterium
    usedAnnotations=annotations[annotations.labelerId.isin(row.labelerIdList)].reset_index(drop=True)

    #iterate validation subjects
    for valSubjectId in row.val:
        #Load features and labels
        valFeatures=loadFeatureMatrix([valSubjectId],featureSelection,signalsMetadata,samplerate,datapath)
        valLabels=loadLabelsVector([valSubjectId],usedAnnotations,signalsMetadata,samplerate)
        valDMatrix=xgb.DMatrix(data=valFeatures)

        #the predicted probabilities only depend on the number of trees...
        for hyperTrees in hyperParams.hyperTrees.unique():
            probabilities=model.predict(valDMatrix,iteration_range=(0,int(hyperTrees)))
            sameTrees=hyperParams[hyperParams.hyperTrees==hyperTrees]

            #...and are shared by every threshold combined with that number of trees
            for ind_hyper,hyperThres in sameTrees.hyperThres.items():
                rawLabels=probabilities>=hyperThres
                #Raw metrics (computed before the post-processing step)
                rawF1,rawPrecision,rawRecall=binaryMetrics(rawLabels,valLabels)
                #Process
                processedLabels=labelingProcess(rawLabels,hyperClose,hyperDuration,samplerate)
                #Processed metrics
                f1,precision,recall=binaryMetrics(processedLabels,valLabels)

                scores=subjectScores[(ind_hyper,ind)]
                scores['f1'].append(f1)
                scores['precision'].append(precision)
                scores['recall'].append(recall)
                scores['rawF1'].append(rawF1)
                scores['rawPrecision'].append(rawPrecision)
                scores['rawRecall'].append(rawRecall)

#(summary column, per-subject metric, statistic over the validation subjects), in column order
summaryColumns=[
    ('meanF1','f1',np.mean),
    ('meanPrecision','precision',np.mean),
    ('meanRecall','recall',np.mean),
    ('rawMeanF1','rawF1',np.mean),
    ('rawMeanPrecision','rawPrecision',np.mean),
    ('rawMeanRecall','rawRecall',np.mean),
    ('stdF1','f1',np.std),
    ('stdPrecision','precision',np.std),
    ('stdRecall','recall',np.std),
    ('rawStdF1','rawF1',np.std),
    ('rawStdPrecision','rawPrecision',np.std),
    ('rawStdRecall','rawRecall',np.std),
]

hyperExperimentModels=[]
hyperMeanF1=[]
hyperStdF1=[]

#one summary table per hyperparameter row, in the order of hyperParams
for ind_hyper in hyperParams.index:
    thisExperimentModels=experimentModels.copy()
    for columnName,metricName,statistic in summaryColumns:
        thisExperimentModels[columnName]=[
            statistic(subjectScores[(ind_hyper,ind)][metricName])
            for ind in experimentModels.index
        ]

    hyperExperimentModels.append(thisExperimentModels)
    #mean and spread, across models, of the per-model mean F1
    hyperMeanF1.append(np.mean(thisExperimentModels['meanF1']))
    hyperStdF1.append(np.std(thisExperimentModels['meanF1']))

In [16]:
#attach to every hyperparameter combination the mean and std (across models) of the mean F1
hyperParams=hyperParams.assign(meanMeanF1=hyperMeanF1,stdMeanF1=hyperStdF1)

In [17]:
#mean F1 (error bars: stdMeanF1) against the number of trees, one panel per threshold
px.scatter(
    hyperParams,
    x='hyperTrees',
    y='meanMeanF1',
    error_y='stdMeanF1',
    facet_col='hyperThres',
)

In [18]:
#mean F1 (error bars: stdMeanF1) against the threshold, one panel per number of trees
px.scatter(
    hyperParams,
    x='hyperThres',
    y='meanMeanF1',
    error_y='stdMeanF1',
    facet_col='hyperTrees',
)

In [19]:
#idxmax returns the index label of the first row reaching the maximum; get_loc converts it
#to a position, because optimalInd is used positionally (iloc here, list indexing afterwards)
optimalInd=hyperParams.index.get_loc(hyperParams.meanMeanF1.idxmax())
print("maximal mean score at:")
hyperParams.iloc[optimalInd]

maximal mean score at:


hyperThres     0.300000
hyperTrees    30.000000
meanMeanF1     0.628502
stdMeanF1      0.038819
Name: 19, dtype: float64

In [20]:
#idxmin returns the index label of the first row reaching the minimum; get_loc converts it
#to the position expected by iloc
optimalInd_2=hyperParams.index.get_loc(hyperParams.stdMeanF1.idxmin())
print("minimum variability of score at:")
hyperParams.iloc[optimalInd_2]

minimum variability of score at:


hyperThres     0.250000
hyperTrees    30.000000
meanMeanF1     0.622762
stdMeanF1      0.037456
Name: 14, dtype: float64

## CODA: study the behaviour at the 'optimal' point

In [21]:
#processed against raw mean F1 of every model at the optimal hyperparameters, with OLS trendline
optimalModels=hyperExperimentModels[optimalInd]
fig=px.scatter(
    optimalModels,
    x='rawMeanF1',
    y='meanF1',
    marginal_y="histogram",
    trendline='ols',
    hover_name='modelId',
)
#reference line y=x
identityTrace=go.Scatter(
    x=optimalModels['rawMeanF1'],
    y=optimalModels['rawMeanF1'],
    name="identity",
    mode='lines',
    fill="toself",
)
fig.add_trace(identityTrace)
fig.show()

In [22]:
#first rows of the per-model summary table obtained with the optimal hyperparameters
hyperExperimentModels[optimalInd].head(5)

Unnamed: 0,criteriumId,criteriumName,labelerIdList,train,val,test,modelId,spindleTimeRate,meanF1,meanPrecision,meanRecall,rawMeanF1,rawMeanPrecision,rawMeanRecall,stdF1,stdPrecision,stdRecall,rawStdF1,rawStdPrecision,rawStdRecall
0,2,E2,[0002],"[0001, 0003, 0005, 0007, 0010, 0014, 0017, 0019]","[0011, 0018, 0009]","[0002, 0006, 0012, 0013]",1623c979-486c-4f45-ba54-dbbac3770588,0.057678,0.658579,0.596821,0.78375,0.65639,0.631796,0.725912,0.038779,0.122012,0.105789,0.026299,0.111913,0.117169
1,2,E2,[0002],"[0001, 0007, 0009, 0010, 0014, 0017, 0018, 0019]","[0011, 0003, 0005]","[0002, 0006, 0012, 0013]",f8abeec1-7d46-4573-8a01-7827b8584472,0.069221,0.654287,0.57952,0.758208,0.651482,0.612554,0.701325,0.017972,0.02226,0.071028,0.029296,0.013352,0.075131
2,2,E2,[0002],"[0003, 0007, 0010, 0011, 0014, 0017, 0018, 0019]","[0009, 0005, 0001]","[0002, 0006, 0012, 0013]",c556cd55-5f8b-420c-b6b2-0ca323bee5df,0.057084,0.578582,0.775179,0.4989,0.5482,0.792426,0.450288,0.103334,0.087059,0.167505,0.113945,0.073427,0.159395
3,2,E2,[0002],"[0003, 0005, 0007, 0009, 0010, 0011, 0017, 0019]","[0001, 0018, 0014]","[0002, 0006, 0012, 0013]",b9ca9323-c013-423a-8b12-a893e0a8be97,0.054695,0.56736,0.553939,0.743286,0.567163,0.578773,0.701193,0.032767,0.204412,0.214517,0.042516,0.187261,0.226255
4,2,E2,[0002],"[0001, 0003, 0005, 0007, 0010, 0014, 0018, 0019]","[0009, 0011, 0017]","[0002, 0006, 0012, 0013]",e31d0f92-1914-4bb5-9156-f9101fe1307c,0.057739,0.66497,0.698438,0.654442,0.638143,0.726753,0.587107,0.031335,0.069169,0.110875,0.048559,0.058446,0.118993


In [24]:
#mean precision against mean F1 of every model at the optimal hyperparameters
optimalModels=hyperExperimentModels[optimalInd]
fig=px.scatter(
    optimalModels,
    x='meanF1',
    y='meanPrecision',
    marginal_y="histogram",
    hover_name='modelId',
)
#reference line y=x
identityTrace=go.Scatter(
    x=optimalModels['meanF1'],
    y=optimalModels['meanF1'],
    name="identity",
    mode='lines',
    fill="toself",
)
fig.add_trace(identityTrace)
fig.show()

In [25]:
#mean recall against mean F1 of every model at the optimal hyperparameters
optimalModels=hyperExperimentModels[optimalInd]
fig=px.scatter(
    optimalModels,
    x='meanF1',
    y='meanRecall',
    marginal_y="histogram",
    hover_name='modelId',
)
#reference line y=x
identityTrace=go.Scatter(
    x=optimalModels['meanF1'],
    y=optimalModels['meanF1'],
    name="identity",
    mode='lines',
    fill="toself",
)
fig.add_trace(identityTrace)
fig.show()

In [26]:
#mean recall against mean precision of every model at the optimal hyperparameters
optimalModels=hyperExperimentModels[optimalInd]
fig=px.scatter(
    optimalModels,
    x='meanPrecision',
    y='meanRecall',
    marginal_y="histogram",
    hover_name='modelId',
)
#reference line y=x
identityTrace=go.Scatter(
    x=optimalModels['meanPrecision'],
    y=optimalModels['meanPrecision'],
    name="identity",
    mode='lines',
    fill="toself",
)
fig.add_trace(identityTrace)
fig.show()