In [1]:
pip install -r requirements.txt --user




# Imports

In [33]:
import helperfunctions as hf
from pycaret.classification import *

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from IPython.display import clear_output
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sktime.transformations.panel.padder import PaddingTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sktime.classification.compose import ColumnEnsembleClassifier
from sktime.utils.mlflow_sktime import save_model

To generate pip requirements.txt:

`pip list --format=freeze > requirements.txt`

# Set up directories
```
-- Directory Structure --

├── Main
│   ├── Code
│   │   ├── Submission.ipynb
│   │   ├── helperfunction.py
│   ├── dataPackage
│   │   ├── task-ils
│   │   ├── task-rest
│   │   ├── EvalSet_StartEndTimes.csv
│   ├── dataPackageEval
│   │   ├── EvalSet_StartEndTimes.csv
```

In [34]:
# set directories
cwd = os.getcwd()
main_dir = os.path.split(cwd)[0]
data_pkg_trg_dir = os.path.join(main_dir, 'dataPackage')
data_ils_trg_dir = os.path.join(main_dir, 'dataPackage', 'task-ils')
# data_rest_trg_dir = os.path.join(main_dir, 'dataPackage', 'task-rest')

data_pkg_eval_dir = os.path.join(main_dir, 'dataPackageEval', 'dataPackage_eval_withOcuevts_230203')
data_ils_eval_dir = os.path.join(main_dir, 'dataPackageEval', 'dataPackage_eval_withOcuevts_230203', 'task-ils')
# data_rest_trg_dir = os.path.join(main_dir, 'dataPackageEval', 'dataPackage_eval_withOcuevts_230203', 'task-rest')

In [35]:
# create directory if not exist
output_trg_dir = os.path.join(main_dir,'Cleaned Trg Data')
if not os.path.isdir(output_trg_dir):
    os.makedirs(output_trg_dir)

output_eval_dir = os.path.join(main_dir,'Cleaned Eval Data')
if not os.path.isdir(output_eval_dir):
    os.makedirs(output_eval_dir)

# rest data cleaning (did not use rest data)
# rest_output_dir = os.path.join(main_dir,'Cleaned Data Rest')
# if not os.path.isdir(rest_output_dir):
#     os.makedirs(rest_output_dir)

ML_final_output_dir = os.path.join(main_dir,'Data Ready for ML_final')
if not os.path.isdir(ML_final_output_dir):
    os.makedirs(ML_final_output_dir)

ML_validation_output_dir = os.path.join(main_dir,'Data Ready for ML_validation')
if not os.path.isdir(ML_validation_output_dir):
    os.makedirs(ML_validation_output_dir)

model_dir = os.path.join(main_dir,'Trained Models')
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

processed_eval_dir = os.path.join(main_dir,'Data Ready for ML_eval')
if not os.path.isdir(processed_eval_dir):
    os.makedirs(processed_eval_dir)

ensembled_dir = os.path.join(main_dir,'Data Ensembled')
if not os.path.isdir(ensembled_dir):
    os.makedirs(ensembled_dir)

# Data Cleaning

## Data Cleaning for Training Set

### Subject list, level list, downsampling rates for selected sensors for training dataset

In [36]:
# Generate 'cap_name_list', 'level_list', 'sensor_freq_list'
cap_name_list = []
for cp in os.listdir(data_ils_trg_dir):
    if 'cp' in cp:
        cap_name_list.append(cp[-5:])

cap_name_list = pd.DataFrame(data=cap_name_list, columns=['cp_ID'])
cap_name_list.to_csv(os.path.join(data_pkg_trg_dir,'cap_name_list.csv'), index=False)

level_list = {'level': ['01B','02B','03B','04B']}
level_list = pd.DataFrame(level_list)
level_list.to_csv(os.path.join(data_pkg_trg_dir,'level_list.csv'), index=False)

sensor_cutoff_freq_list = {
    'sensor_name': ['lslshimmertorsoacc','lslshimmereda','lslshimmeremg','lslshimmerresp','lslrespitrace','lslshimmerecg'] ,
    'freq': [10,30,5,30,30,30]
    }
sensor_cutoff_freq_list = pd.DataFrame(sensor_cutoff_freq_list)
sensor_cutoff_freq_list.to_csv(os.path.join(data_pkg_trg_dir,'selected_sensor_cutoff_freq.csv'), index=False)

In [37]:
cap_name_list = pd.read_csv(os.path.join(data_pkg_trg_dir,'cap_name_list.csv'))
level_list = pd.read_csv(os.path.join(data_pkg_trg_dir,'level_list.csv'))
sensor_freq_list = pd.read_csv(os.path.join(data_pkg_trg_dir,'selected_sensor_cutoff_freq.csv'))

### Cleaning for selected sensors
`['lslshimmertorsoacc','lslshimmereda','lslshimmeremg','lslshimmerresp','lslrespitrace','lslshimmerecg']`

In [7]:
# data cleaning for ils trg data - output as 1 file per csv
# failed files are printed as output
for cap in cap_name_list['cp_ID']:
    for level in level_list['level']:
        for sensor in sensor_freq_list['sensor_name']:
            try:
                data_csv_list = hf.get_dirs_to_csv(data_ils_trg_dir, cap, level, sensor)
                run = 0
                for csv_dir in data_csv_list:
                    run = run + 1
                    sr = hf.get_csv_freq(csv_dir)
                    cut_off_freq = sensor_freq_list.loc[sensor_freq_list['sensor_name'] == sensor, 'freq'].iloc[0]
                    dsr = np.rint(sr/(cut_off_freq*2))
                    df = pd.read_csv(csv_dir)
                    # remove from df before simulation start and after simulation ends
                    head, tail = hf.get_head_tail_time_to_remove(csv_dir)
                    # remove before start
                    if head > 0:
                        head_rows = head * sr
                        head_rows = int(head_rows) + (head_rows % 1 > 0)
                        df = df.drop(df.index[:head_rows])
                    #remove after stop
                    if tail > 0:
                        tail_rows = tail * sr
                        tail_rows = int(tail_rows) + (tail_rows % 1 > 0)
                        df = df.drop(df.index[-tail_rows:])
                        
                    df_out = pd.DataFrame()
                    cols, times = [], []
                    for column in df:
                        if column != 'time_dn':
                            cols.append(column)
                            df1 = df[column]
                            X = np.fft.fft(df1,axis=0)
                            X_lpf = X
                            X_lpf[cut_off_freq*sr:] = 0
                            Y_lpf = np.fft.ifft(X_lpf,axis=0)
                            Y_lpf = Y_lpf.real
                            Y_dsp = Y_lpf[::int(dsr)]
                            df_out = pd.concat([df_out, pd.DataFrame(Y_dsp)], axis=1)
                            
                    df_out.columns = cols
                    # add time column
                    for i in range(df_out.shape[0]):
                        times.append(i*1/(cut_off_freq*2))
                    df_out.insert(0, 'Time', times)
                    output_csv = os.path.join(output_trg_dir, f"{sensor}_{level}_{cap}_{run}.csv")
                    df_out.to_csv(output_csv, index=False)
            except:
                print (cap, level, sensor, run)
                pass

cp009 01B lslshimmereda 1
cp009 01B lslshimmerresp 1
cp009 01B lslshimmerecg 1
cp009 02B lslshimmereda 1
cp009 02B lslshimmerresp 1
cp009 02B lslshimmerecg 1
cp009 03B lslshimmereda 1
cp009 03B lslshimmerresp 1
cp009 03B lslshimmerecg 1
cp009 04B lslshimmereda 1
cp009 04B lslshimmerresp 1
cp009 04B lslshimmerecg 1
cp028 01B lslshimmeremg 3


### Move HTC Vive Eye Data to clean data folder

In [38]:
# Removes head and tail of data where simulation is paused
# Test data only
# Failed files are printed as output

sensor = 'lslhtcviveeye'
feature_list = ['gaze_origin_l','gaze_direction_l','pupil_diameter_l','eye_openness_l','pupil_position_l']

for cap in cap_name_list['cp_ID']:
    for level in level_list['level']:
        try:
            data_csv_list = hf.get_dirs_to_csv(data_ils_trg_dir, cap, level, sensor)
            run = 0
            for csv_dir in data_csv_list:
                run = run + 1
                sr = hf.get_csv_freq(csv_dir)
                df = pd.read_csv(csv_dir)
                # remove from df before simulation start and after simulation ends
                head, tail = hf.get_head_tail_time_to_remove(csv_dir)
                # remove before start
                if head > 0:
                    head_rows = head * sr
                    head_rows = int(head_rows) + (head_rows % 1 > 0)
                    df = df.drop(df.index[:head_rows])
                # remove after stop
                if tail > 0:
                    tail_rows = tail * sr
                    tail_rows = int(tail_rows) + (tail_rows % 1 > 0)
                    df = df.drop(df.index[-tail_rows:])
                # generate csv files per feature
                for feature in feature_list:
                    df_out = df.filter(like=feature)
                    output_csv = os.path.join(output_trg_dir, f"{sensor}_{feature}_{level}_{cap}_{run}.csv")
                    print(df_out)
                    df_out.to_csv(output_csv, index=False)
        except:
            print (cap, level, sensor, run)
            pass

cp003 01B lslhtcviveeye 1
cp003 02B lslhtcviveeye 1
cp003 03B lslhtcviveeye 1
cp003 04B lslhtcviveeye 1
        gaze_origin_l_x_mm  gaze_origin_l_y_mm  gaze_origin_l_z_mm
23586            33.114700           -0.435181          -20.425568
23587            33.114700           -0.435181          -20.425568
23588            33.118561           -0.458527          -20.425568
23589            33.118561           -0.458527          -20.425568
23590            33.121277           -0.489563          -20.392609
...                    ...                 ...                 ...
129933           33.908722           -0.378754          -20.776947
129934           33.926346           -0.373276          -20.801682
129935           33.926346           -0.373276          -20.801682
129936           33.934250           -0.362595          -20.843109
129937           33.934250           -0.362595          -20.843109

[106352 rows x 3 columns]
        gaze_direction_l_x  gaze_direction_l_y  gaze_direction_l_

## Data Cleaning for Eval Set

### Subject list for eval dataset

In [39]:
# Generate 'cap_name_list', 'level_list', 'sensor_freq_list'
cap_name_eval_list = []
for cp in os.listdir(data_ils_eval_dir):
    if 'cp' in cp:
        cap_name_eval_list.append(cp[-5:])

cap_name_eval_list = pd.DataFrame(data=cap_name_eval_list, columns=['cp_ID'])
cap_name_eval_list.to_csv(os.path.join(data_pkg_eval_dir,'cap_name_list.csv'), index=False)

In [40]:
cap_name_eval_list = pd.read_csv(os.path.join(data_pkg_eval_dir,'cap_name_list.csv'))
sensor_freq_list = pd.read_csv(os.path.join(data_pkg_trg_dir,'selected_sensor_cutoff_freq.csv'))

### Load eval data set start times

In [41]:
start_times_file = os.path.join(main_dir, 'dataPackageEval', 'EvalSet_StartEndTimes.csv')
start_times = pd.read_csv(start_times_file)

### Cleaning for selected sensors
`['lslshimmertorsoacc','lslshimmereda','lslshimmeremg','lslshimmerresp','lslrespitrace','lslshimmerecg']`

In [12]:
# data cleaning for ils eval data - output as 1 file per csv
# failed files are printed as output
level = None
for cap in cap_name_eval_list['cp_ID']:
    for sensor in sensor_freq_list['sensor_name']:
        try:
            data_csv_list = hf.get_dirs_to_csv(data_ils_eval_dir, cap, level, sensor)
            run = 0
            for csv_dir in data_csv_list:
                run = run + 1
                sr = hf.get_csv_freq(csv_dir)
                cut_off_freq = sensor_freq_list.loc[sensor_freq_list['sensor_name'] == sensor, 'freq'].iloc[0]
                dsr = np.rint(sr/(cut_off_freq*2))
                df = pd.read_csv(csv_dir)
                
                # remove from df before simulation start and after it ends
                fname = csv_dir.split('\\')[-1]
                subj = fname.split('_')[0]
                run_num = fname.split('_')[5][-3:]
                start_time = start_times.loc[(start_times['Subject'].str.contains(subj)) & (start_times['Run'] == int(run_num))]['StartTime'].values[0]
                end_time = start_times.loc[(start_times['Subject'].str.contains(subj)) & (start_times['Run'] == int(run_num))]['EndTime'].values[0]
                df = df.loc[(df['time_dn'] > start_time) & (df['time_dn'] < end_time)]
                    
                df_out = pd.DataFrame()
                cols, times = [], []
                for column in df:
                    if column != 'time_dn':
                        cols.append(column)
                        df1 = df[column]
                        X = np.fft.fft(df1,axis=0)
                        X_lpf = X
                        X_lpf[cut_off_freq*sr:] = 0
                        Y_lpf = np.fft.ifft(X_lpf,axis=0)
                        Y_lpf = Y_lpf.real
                        Y_dsp = Y_lpf[::int(dsr)]
                        df_out = pd.concat([df_out, pd.DataFrame(Y_dsp)], axis=1)
                        
                df_out.columns = cols
                # add time column
                for i in range(df_out.shape[0]):
                    times.append(i*1/(cut_off_freq*2))
                df_out.insert(0, 'Time', times)
                output_csv = os.path.join(output_eval_dir, f"{sensor}_{cap}_{run}.csv")
                df_out.to_csv(output_csv, index=False)
        except:
            print (cap, sensor, run)
            pass

### Move HTC Vive Eye Data to clean data folder

In [42]:
# No Cleaning Required
output_eval_dir
# os.path.split(output_trg_dir)[1]

'c:\\Users\\tan_k\\Cleaned Eval Data'

In [43]:
# Removes head of data where simulation is paused
# Test data only
# Failed files are printed as output

sensor = 'lslhtcviveeye'
feature_list = ['gaze_origin_l','gaze_direction_l','pupil_diameter_l','eye_openness_l','pupil_position_l']
level = None

for cap in cap_name_eval_list['cp_ID']:
    try:
        data_csv_list = hf.get_dirs_to_csv(data_ils_eval_dir, cap, level, sensor)
        run = 0
        for csv_dir in data_csv_list:
            run = run + 1
            # sr = hf.get_csv_freq(csv_dir)
            df = pd.read_csv(csv_dir)

            # remove from df before simulation start and after it ends
            fname = os.path.split(csv_dir)[1]
            # fname = csv_dir.split('\\')[-1]
            subj = fname.split('_')[0]
            run_num = fname.split('_')[5][-3:]
            start_time = start_times.loc[(start_times['Subject'].str.contains(subj)) & (start_times['Run'] == int(run_num))]['StartTime'].values[0]
            end_time = start_times.loc[(start_times['Subject'].str.contains(subj)) & (start_times['Run'] == int(run_num))]['EndTime'].values[0]
            df = df.loc[(df['time_dn'] > start_time) & (df['time_dn'] < end_time)]
            # generate csv files per feature
            for feature in feature_list:
                df_out = df.filter(like=feature)
                output_csv = os.path.join(output_eval_dir, f"{sensor}_{feature}_{cap}_{run}.csv")
                df_out.to_csv(output_csv, index=False)
            
    except:
        print (cap, sensor, run)
        pass

In [26]:
output_trg_dir

'c:\\Users\\tan_k\\Cleaned Trg Data'

## Generate 5 folds for train test split

In [44]:
data_dir = output_trg_dir
target_dir = os.path.join(main_dir, 'Data Ready for ML_folds')

# Get list of subjects
data_files = hf.get_all_data_csv_filenames(data_dir)
split_files = [i.split('_') for i in data_files]
subjects = [i[2] for i in split_files]
subjects = list(set(subjects))
subjects.sort()

# Split
train, test = [], []

grpkfold = GroupKFold(n_splits=5)
for train_i, test_i in grpkfold.split(X=subjects, groups=subjects):
    train.append(train_i)
    test.append(test_i)

train_splits, test_splits = {}, {}

for i, fold in enumerate(train):
    train_splits[i] = []
    for j in fold:
        train_splits[i].append(subjects[j])

for i, fold in enumerate(test):
    test_splits[i] = []
    for j in fold:
        test_splits[i].append(subjects[j])

In [45]:
# Check splits
for i in range(5):
    print(train_splits[i])
    print()
    print(test_splits[i])
    print(len(train_splits[i]) + len(test_splits[i]))
    print()

['diameter', 'direction', 'openness', 'origin']

['position']
5

['diameter', 'direction', 'openness', 'position']

['origin']
5

['diameter', 'direction', 'origin', 'position']

['openness']
5

['diameter', 'openness', 'origin', 'position']

['direction']
5

['direction', 'openness', 'origin', 'position']

['diameter']
5



# Individual data types

## EMG

In [None]:
# Did not use

## EDA

In [None]:
# Did not use

## ECG

### Find max length of datasets across train and eval sets for padding

In [6]:
hz = 60
time_step = 1/hz
sensor = 'lslshimmerecg'

# calculating pad length should use longest of all runs for same sensor
df_runs_forpadlength_trg = hf.get_df_runs(output_trg_dir, sensor, '', '', False)
pad_length_trg = int(df_runs_forpadlength_trg['time'].max()/time_step)+2

df_runs_forpadlength_eval = hf.get_df_runs(output_eval_dir, sensor, '', '', False, True)
pad_length_eval = int(df_runs_forpadlength_eval['time'].max()/time_step)+2

pad_length = max(pad_length_trg, pad_length_eval)
pad_length

55418

### Validate model on training data folds
#### Generate folds

In [None]:
# Generate folds of training data

data_dir = output_trg_dir
target_dir = os.path.join(main_dir,'Data Ready for ML_validation')
hz = 60
sensor = 'lslshimmerecg'
level = ''

scaler = StandardScaler()

for i in train_splits:
    file_suffix = f'train_lslshimmerecg_standard_padtransform{i}'
    subject = train_splits[i]
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level)
    print()

In [8]:
# Generate folds of test data

for i in test_splits:
    file_suffix = f'test_lslshimmerecg_standard_padtransform{i}'
    subject = test_splits[i]
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level)
    print()

Number of runs detected: 84
X shape: (84, 3, 55418)
y shape: (84,)
Saved files:
	X_test_lslshimmerecg_standard_padtransform0.npy
	y_test_lslshimmerecg_standard_padtransform0.npy

Number of runs detected: 72
X shape: (72, 3, 55418)
y shape: (72,)
Saved files:
	X_test_lslshimmerecg_standard_padtransform1.npy
	y_test_lslshimmerecg_standard_padtransform1.npy

Number of runs detected: 84
X shape: (84, 3, 55418)
y shape: (84,)
Saved files:
	X_test_lslshimmerecg_standard_padtransform2.npy
	y_test_lslshimmerecg_standard_padtransform2.npy

Number of runs detected: 83
X shape: (83, 3, 55418)
y shape: (83,)
Saved files:
	X_test_lslshimmerecg_standard_padtransform3.npy
	y_test_lslshimmerecg_standard_padtransform3.npy

Number of runs detected: 84
X shape: (84, 3, 55418)
y shape: (84,)
Saved files:
	X_test_lslshimmerecg_standard_padtransform4.npy
	y_test_lslshimmerecg_standard_padtransform4.npy



#### Set up model

In [7]:
from sktime.classification.interval_based import SupervisedTimeSeriesForest

def ecg_folds(n_estimators=250, n_jobs=-1, random_state=42, result_CV = None):
    data_dir = os.path.join(main_dir,'Data Ready for ML_validation')
    probabilities = {}

    for i in range(5):
        X_train = np.load(os.path.join(data_dir, f'X_train_lslshimmerecg_standard_padtransform{i}.npy'))
        y_train = np.load(os.path.join(data_dir, f'y_train_lslshimmerecg_standard_padtransform{i}.npy'))
        X_test = np.load(os.path.join(data_dir, f'X_test_lslshimmerecg_standard_padtransform{i}.npy'))
        y_test = np.load(os.path.join(data_dir, f'y_test_lslshimmerecg_standard_padtransform{i}.npy'))

        # only use first 2 columns of data on la_ra and ll_ra.
        clf = ColumnEnsembleClassifier(
            estimators=[
                ("est1", SupervisedTimeSeriesForest(n_estimators, n_jobs, random_state), [0]),
                ("est2", SupervisedTimeSeriesForest(n_estimators, n_jobs, random_state), [1])
            ]
        )

        clf.fit(X_train, y_train)
        y_pred_proba_SupervisedTimeSeriesForest = clf.predict_proba(X_test)
        probabilities[i] = y_pred_proba_SupervisedTimeSeriesForest
        # Log results
        class_list = [0,1,2,3]
        result_CV = hf.log_result(f'Set {i}', class_list, y_test, y_pred_proba_SupervisedTimeSeriesForest, result_CV)
        
        print(f'\rFold {i} complete', end='')
    print()
    return result_CV, probabilities

#### Runs

In [None]:
%%time

result, _ = ecg_folds(100, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 45min 32s
Wall time: 1h 17min 50s


accuracy_score    0.361264
AUC_score         0.648099
F1_score          0.321160
dtype: float64

In [None]:
%%time

result, _ = ecg_folds(150, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 47min 27s
Wall time: 1h 33min 41s


accuracy_score    0.362880
AUC_score         0.653857
F1_score          0.321408
dtype: float64

In [None]:
%%time

result, _ = ecg_folds(200, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 48min 37s
Wall time: 1h 50min 57s


accuracy_score    0.372375
AUC_score         0.653808
F1_score          0.333089
dtype: float64

In [None]:
%%time

result, _ = ecg_folds(240, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 49min 27s
Wall time: 2h 4min 13s


accuracy_score    0.383090
AUC_score         0.656757
F1_score          0.342569
dtype: float64

In [None]:
%%time

result, _ = ecg_folds(250, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 49min 17s
Wall time: 2h 12min 56s


accuracy_score    0.390261
AUC_score         0.655580
F1_score          0.351113
dtype: float64

In [None]:
%%time

result, _ = ecg_folds(260, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 50min 4s
Wall time: 2h 11min 18s


accuracy_score    0.375182
AUC_score         0.655679
F1_score          0.334513
dtype: float64

### Generate probabilities for ensemble model

In [10]:
%%time

result, probabilities = ecg_folds(250, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 56min 36s
Wall time: 2h 28min 54s


accuracy_score    0.334194
AUC_score         0.650603
F1_score          0.299353
dtype: float64

In [11]:
# Check
pd.DataFrame(probabilities[0])

Unnamed: 0,0,1,2,3
0,0.526000,0.118,0.108000,0.248000
1,0.582000,0.138,0.084000,0.196000
2,0.542667,0.104,0.129000,0.224333
3,0.342667,0.194,0.206667,0.256667
4,0.302000,0.266,0.244000,0.188000
...,...,...,...,...
79,0.040000,0.338,0.332000,0.290000
80,0.276000,0.274,0.256000,0.194000
81,0.030000,0.334,0.354000,0.282000
82,0.058000,0.334,0.350000,0.258000


In [15]:
# Get run info

data_dir = output_trg_dir
hz = 60
sensor = 'lslshimmerecg'
level = ''

df_combined = pd.DataFrame()
split_df_list = []
for i in range(5):
    subject = test_splits[i]
    df_runs = hf.get_df_runs(data_dir, sensor, subject, level, True)
    df_runs = df_runs.drop('time', axis=1)
    df_proba = pd.DataFrame(probabilities[i])
    df_proba.columns = ['lslshimmerecg_01B', 'lslshimmerecg_02B', 'lslshimmerecg_03B', 'lslshimmerecg_04B']
    df_cur = pd.concat([df_runs, df_proba], axis=1)
    split_df_list.append(df_cur)
    df_combined = pd.concat([df_combined, df_cur], axis=0)

NameError: name 'test_splits' is not defined

In [14]:
# Combine probabilities with run info and save it

target_dir = os.path.join(main_dir,'Data Ensembled')

df_combined_lslshimmerecg = df_combined.copy()
df_combined_lslshimmerecg = df_combined_lslshimmerecg.sort_values(['subject','difficulty','run'], axis = 0)
df_combined_lslshimmerecg = df_combined_lslshimmerecg.reset_index(drop = True)
df_combined_lslshimmerecg.to_csv(os.path.join(target_dir, 'lslshimmerecg_ensembled_training.csv'), index=False)
df_combined_lslshimmerecg

NameError: name 'df_combined' is not defined

In [6]:
# Load saved probabilities
target_dir = os.path.join(main_dir,'Data Ensembled')
df_combined_lslshimmerecg_load = pd.read_csv(target_dir+r'\lslshimmerecg_ensembled_training.csv')
df_combined_lslshimmerecg_load

Unnamed: 0,subject,difficulty,run,lslshimmerecg_01B,lslshimmerecg_02B,lslshimmerecg_03B,lslshimmerecg_04B
0,cp003,01B,1,0.288,0.178,0.272,0.262
1,cp003,01B,2,0.220,0.190,0.236,0.354
2,cp003,01B,3,0.262,0.190,0.280,0.268
3,cp003,02B,1,0.126,0.278,0.314,0.282
4,cp003,02B,2,0.110,0.318,0.318,0.254
...,...,...,...,...,...,...,...
402,cp043,03B,2,0.040,0.338,0.332,0.290
403,cp043,03B,3,0.276,0.274,0.256,0.194
404,cp043,04B,1,0.030,0.334,0.354,0.282
405,cp043,04B,2,0.058,0.334,0.350,0.258


### Generate combined numpy data file for ML model training

In [8]:
# Generate full training data for respiration from Shimmer

data_dir = output_trg_dir
target_dir = os.path.join(main_dir,'Data Ready for ML_final')
hz = 60
sensor = 'lslshimmerecg'
subject = ''
level = ''

# StandardScaler
file_suffix = 'train_lslshimmerecg_standard_padtransform'
scaler = StandardScaler()
hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level)

Number of runs detected: 407
X shape: (407, 3, 55418)
y shape: (407,)
Saved files:
	X_train_lslshimmerecg_standard_padtransform.npy
	y_train_lslshimmerecg_standard_padtransform.npy


### Train model on full set of training data

In [9]:
from sktime.classification.interval_based import SupervisedTimeSeriesForest

data_dir = os.path.join(main_dir,'Data Ready for ML_final')
model_dir = os.path.join(main_dir,'Trained Models')

X_train = np.load(os.path.join(data_dir, 'X_train_lslshimmerecg_standard_padtransform.npy'))
y_train = np.load(os.path.join(data_dir, 'y_train_lslshimmerecg_standard_padtransform.npy'))

# only use first 2 columns of data on la_ra and ll_ra.
clf = ColumnEnsembleClassifier(
    estimators=[
        ("est1", SupervisedTimeSeriesForest(n_estimators=250, n_jobs=-1, random_state=42), [0]),
        ("est2", SupervisedTimeSeriesForest(n_estimators=250, n_jobs=-1, random_state=42), [1])
    ]
)

clf.fit(X_train, y_train)

# Save model
with open(os.path.join(model_dir, 'shimmerECG.pkl'), 'wb') as f:
    pickle.dump(clf, f)

### Generate combined numpy data file for eval set

In [9]:
# Generate full eval data for ECG

data_dir = output_eval_dir
target_dir = os.path.join(main_dir,'Data Ready for ML_eval')
hz = 60
sensor = 'lslshimmerecg'
subject = ''
level = ''

# StandardScaler
file_suffix = 'eval_lslshimmerecg_standard_padtransform'
scaler = StandardScaler()
hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level, True)

Number of runs detected: 96
X shape: (96, 3, 55418)
Saved files:
	X_eval_lslshimmerecg_standard_padtransform.npy


### Apply trained model to eval data

In [20]:
# Load model
with open(os.path.join(model_dir, 'shimmerECG.pkl'), 'rb') as f:
    clf = pickle.load(f)

# Do predictions
X_eval = np.load(os.path.join(processed_eval_dir, 'X_eval_lslshimmerecg_standard_padtransform.npy'))
y_eval_proba = clf.predict_proba(X_eval)

In [21]:
# Get dataframe of probabilities
df_proba = pd.DataFrame(y_eval_proba)
df_proba.columns = ['lslshimmerecg_01B', 'lslshimmerecg_02B', 'lslshimmerecg_03B', 'lslshimmerecg_04B']

# Runs data
sensor = 'lslshimmerecg'
df_runs_eval = hf.get_df_runs(output_eval_dir, sensor, '', '', False, True)

# Save probability df
df_probabilities = pd.concat([df_runs_eval, df_proba], axis=1)
df_probabilities.drop(columns=['difficulty', 'time'], inplace=True)
df_probabilities.loc[:,'run'] = df_probabilities['run'].astype('int')
df_probabilities.sort_values(['subject','run'], inplace=True)
df_probabilities.to_csv(os.path.join(ensembled_dir, 'lslshimmerecg_ensembled_eval.csv'), index=False)
df_probabilities

Unnamed: 0,subject,run,lslshimmerecg_01B,lslshimmerecg_02B,lslshimmerecg_03B,lslshimmerecg_04B
0,cp040,1,0.436000,0.160,0.132000,0.272000
4,cp040,2,0.518000,0.144,0.086000,0.252000
5,cp040,3,0.472000,0.156,0.156000,0.216000
6,cp040,4,0.491727,0.154,0.134727,0.219545
7,cp040,5,0.516400,0.136,0.114800,0.232800
...,...,...,...,...,...,...
94,cp049,8,0.458000,0.140,0.138000,0.264000
95,cp049,9,0.560000,0.148,0.106000,0.186000
85,cp049,10,0.504000,0.184,0.126000,0.186000
86,cp049,11,0.504000,0.184,0.126000,0.186000


## Respiration (from Shimmer Device)

### Find max length of datasets across train and eval sets for padding

In [19]:
hz = 60
time_step = 1/hz
sensor = 'lslshimmerresp'

# calculating pad length should use longest of all runs for same sensor
df_runs_forpadlength_trg = hf.get_df_runs(output_trg_dir, sensor, '', '', False)
pad_length_trg = int(df_runs_forpadlength_trg['time'].max()/time_step)+2

df_runs_forpadlength_eval = hf.get_df_runs(output_eval_dir, sensor, '', '', False, True)
pad_length_eval = int(df_runs_forpadlength_eval['time'].max()/time_step)+2

pad_length = max(pad_length_trg, pad_length_eval)
pad_length

55418

### Validate model on training data folds
#### Generate folds

In [20]:
# Generate folds of training data

data_dir = output_trg_dir
target_dir = os.path.join(main_dir,'Data Ready for ML_validation')
hz = 60
sensor = 'lslshimmerresp'
level = ''

scaler = StandardScaler()

for i in train_splits:
    file_suffix = f'train_lslshimmerresp_standard_padtransform{i}'
    subject = train_splits[i]
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level)
    print()

Number of runs detected: 323
X shape: (323, 1, 55418)
y shape: (323,)
Saved files:
	X_train_lslshimmerresp_standard_padtransform0.npy
	y_train_lslshimmerresp_standard_padtransform0.npy

Number of runs detected: 335
X shape: (335, 1, 55418)
y shape: (335,)
Saved files:
	X_train_lslshimmerresp_standard_padtransform1.npy
	y_train_lslshimmerresp_standard_padtransform1.npy

Number of runs detected: 323
X shape: (323, 1, 55418)
y shape: (323,)
Saved files:
	X_train_lslshimmerresp_standard_padtransform2.npy
	y_train_lslshimmerresp_standard_padtransform2.npy

Number of runs detected: 324
X shape: (324, 1, 55418)
y shape: (324,)
Saved files:
	X_train_lslshimmerresp_standard_padtransform3.npy
	y_train_lslshimmerresp_standard_padtransform3.npy

Number of runs detected: 323
X shape: (323, 1, 55418)
y shape: (323,)
Saved files:
	X_train_lslshimmerresp_standard_padtransform4.npy
	y_train_lslshimmerresp_standard_padtransform4.npy



In [21]:
# Generate folds of test data

for i in test_splits:
    file_suffix = f'test_lslshimmerresp_standard_padtransform{i}'
    subject = test_splits[i]
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level)
    print()

Number of runs detected: 84
X shape: (84, 1, 55418)
y shape: (84,)
Saved files:
	X_test_lslshimmerresp_standard_padtransform0.npy
	y_test_lslshimmerresp_standard_padtransform0.npy

Number of runs detected: 72
X shape: (72, 1, 55418)
y shape: (72,)
Saved files:
	X_test_lslshimmerresp_standard_padtransform1.npy
	y_test_lslshimmerresp_standard_padtransform1.npy

Number of runs detected: 84
X shape: (84, 1, 55418)
y shape: (84,)
Saved files:
	X_test_lslshimmerresp_standard_padtransform2.npy
	y_test_lslshimmerresp_standard_padtransform2.npy

Number of runs detected: 83
X shape: (83, 1, 55418)
y shape: (83,)
Saved files:
	X_test_lslshimmerresp_standard_padtransform3.npy
	y_test_lslshimmerresp_standard_padtransform3.npy

Number of runs detected: 84
X shape: (84, 1, 55418)
y shape: (84,)
Saved files:
	X_test_lslshimmerresp_standard_padtransform4.npy
	y_test_lslshimmerresp_standard_padtransform4.npy



#### Set up model

In [7]:
from sktime.classification.interval_based import SupervisedTimeSeriesForest

def respshimmer_folds(n_estimators=40, n_jobs=-1, random_state=42, result_CV = None):
    data_dir = os.path.join(main_dir,'Data Ready for ML_validation')
    probabilities = {}

    for i in range(5):
        X_train = np.load(os.path.join(data_dir, f'X_train_lslshimmerresp_standard_padtransform{i}.npy'))
        y_train = np.load(os.path.join(data_dir, f'y_train_lslshimmerresp_standard_padtransform{i}.npy'))
        X_test = np.load(os.path.join(data_dir, f'X_test_lslshimmerresp_standard_padtransform{i}.npy'))
        y_test = np.load(os.path.join(data_dir, f'y_test_lslshimmerresp_standard_padtransform{i}.npy'))

        clf = SupervisedTimeSeriesForest(n_estimators, n_jobs, random_state)

        clf.fit(X_train, y_train)
        y_pred_proba_SupervisedTimeSeriesForest = clf.predict_proba(X_test)
        probabilities[i] = y_pred_proba_SupervisedTimeSeriesForest
        # Log results
        class_list = [0,1,2,3]
        result_CV = hf.log_result(f'Set {i}', class_list, y_test, y_pred_proba_SupervisedTimeSeriesForest, result_CV)
        
        print(f'\rFold {i} complete', end='')
    print()
    return result_CV, probabilities

#### Runs

In [None]:
%%time

result, _ = respshimmer_folds(40, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 12.4 s
Wall time: 8min 49s


accuracy_score    0.373111
AUC_score         0.630991
F1_score          0.354807
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(50, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 17.2 s
Wall time: 9min 24s


accuracy_score    0.402931
AUC_score         0.647100
F1_score          0.378989
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(60, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 24.2 s
Wall time: 11min 11s


accuracy_score    0.373848
AUC_score         0.644333
F1_score          0.347194
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(70, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 29.3 s
Wall time: 12min 56s


accuracy_score    0.370731
AUC_score         0.644125
F1_score          0.342514
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(80, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 38.3 s
Wall time: 15min 20s


accuracy_score    0.375464
AUC_score         0.651196
F1_score          0.347768
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(90, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 44.7 s
Wall time: 18min 20s


accuracy_score    0.378356
AUC_score         0.648205
F1_score          0.348122
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(100, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 55.3 s
Wall time: 19min


accuracy_score    0.384648
AUC_score         0.645720
F1_score          0.354331
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(110, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 1min 6s
Wall time: 19min 57s


accuracy_score    0.379518
AUC_score         0.645322
F1_score          0.349614
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(200, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 2min 49s
Wall time: 35min 54s


accuracy_score    0.371806
AUC_score         0.646544
F1_score          0.344480
dtype: float64

### Generate probabilities for ensemble model

In [8]:
%%time

result, probabilities = respshimmer_folds(50, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 19.1 s
Wall time: 12min 35s


accuracy_score    0.352362
AUC_score         0.623237
F1_score          0.324980
dtype: float64

In [9]:
# Check
pd.DataFrame(probabilities[0])

Unnamed: 0,0,1,2,3
0,0.54,0.20,0.14,0.12
1,0.58,0.16,0.10,0.16
2,0.58,0.14,0.16,0.12
3,0.36,0.26,0.20,0.18
4,0.24,0.26,0.20,0.30
...,...,...,...,...
79,0.12,0.12,0.36,0.40
80,0.28,0.26,0.20,0.26
81,0.08,0.30,0.38,0.24
82,0.12,0.16,0.44,0.28


In [10]:
# Get run info

data_dir = output_trg_dir
hz = 60
sensor = 'lslshimmerresp'
level = ''

df_combined = pd.DataFrame()
split_df_list = []
for i in range(5):
    subject = test_splits[i]
    df_runs = hf.get_df_runs(data_dir, sensor, subject, level, True)
    df_runs = df_runs.drop('time', axis=1)
    df_proba = pd.DataFrame(probabilities[i])
    df_proba.columns = ['lslshimmerresp_01B', 'lslshimmerresp_02B', 'lslshimmerresp_03B', 'lslshimmerresp_04B']
    df_cur = pd.concat([df_runs, df_proba], axis=1)
    split_df_list.append(df_cur)
    df_combined = pd.concat([df_combined, df_cur], axis=0)

Number of runs detected: 84
Number of runs detected: 72
Number of runs detected: 84
Number of runs detected: 83
Number of runs detected: 84


In [11]:
# Combine probabilities with run info and save it

target_dir = os.path.join(main_dir,'Data Ensembled')

df_combined_lslshimmerresp = df_combined.copy()
df_combined_lslshimmerresp = df_combined_lslshimmerresp.sort_values(['subject','difficulty','run'], axis = 0)
df_combined_lslshimmerresp = df_combined_lslshimmerresp.reset_index(drop = True)
df_combined_lslshimmerresp.to_csv(os.path.join(target_dir, 'lslshimmerresp_ensembled_training.csv'), index=False)
df_combined_lslshimmerresp

Unnamed: 0,subject,difficulty,run,lslshimmerresp_01B,lslshimmerresp_02B,lslshimmerresp_03B,lslshimmerresp_04B
0,cp003,01B,1,0.48,0.10,0.12,0.30
1,cp003,01B,2,0.40,0.12,0.18,0.30
2,cp003,01B,3,0.40,0.14,0.24,0.22
3,cp003,02B,1,0.24,0.16,0.32,0.28
4,cp003,02B,2,0.16,0.18,0.38,0.28
...,...,...,...,...,...,...,...
402,cp043,03B,2,0.12,0.12,0.36,0.40
403,cp043,03B,3,0.28,0.26,0.20,0.26
404,cp043,04B,1,0.08,0.30,0.38,0.24
405,cp043,04B,2,0.12,0.16,0.44,0.28


### Generate combined numpy data file for ML model training

In [20]:
# Generate full training data for respiration from Shimmer

data_dir = output_trg_dir
target_dir = os.path.join(main_dir,'Data Ready for ML_final')
sensor = 'lslshimmerresp'
subject = ''
level = ''

# StandardScaler
file_suffix = 'train_lslshimmerresp_standard_padtransform'
scaler = StandardScaler()
hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level)

Number of runs detected: 407
X shape: (407, 1, 55418)
y shape: (407,)
Saved files:
	X_train_lslshimmerresp_standard_padtransform.npy
	y_train_lslshimmerresp_standard_padtransform.npy


### Train model on full set of training data

In [29]:
from sktime.classification.interval_based import SupervisedTimeSeriesForest

data_dir = os.path.join(main_dir,'Data Ready for ML_final')
model_dir = os.path.join(main_dir,'Trained Models')

X_train = np.load(os.path.join(data_dir, 'X_train_lslshimmerresp_standard_padtransform.npy'))
y_train = np.load(os.path.join(data_dir, 'y_train_lslshimmerresp_standard_padtransform.npy'))

clf = SupervisedTimeSeriesForest(n_estimators=50, n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)

# Save model
with open(os.path.join(model_dir, 'shimmerResp.pkl'), 'wb') as f:
    pickle.dump(clf, f)

### Generate combined numpy data file for eval set

In [21]:
# Generate full eval data for respiration from Shimmer

data_dir = output_eval_dir
target_dir = os.path.join(main_dir,'Data Ready for ML_eval')
hz = 60
sensor = 'lslshimmerresp'
subject = ''
level = ''

# StandardScaler
file_suffix = 'eval_lslshimmerresp_standard_padtransform'
scaler = StandardScaler()
hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level, True)

Number of runs detected: 96
X shape: (96, 1, 55418)
Saved files:
	X_eval_lslshimmerresp_standard_padtransform.npy


### Apply trained model to eval data

In [22]:
# Load model
with open(os.path.join(model_dir, 'shimmerResp.pkl'), 'rb') as f:
    clf = pickle.load(f)

# Do predictions
X_eval = np.load(os.path.join(processed_eval_dir, 'X_eval_lslshimmerresp_standard_padtransform.npy'))
y_eval_proba = clf.predict_proba(X_eval)

In [23]:
# Get dataframe of probabilities
df_proba = pd.DataFrame(y_eval_proba)
df_proba.columns = ['lslshimmerresp_01B', 'lslshimmerresp_02B', 'lslshimmerresp_03B', 'lslshimmerresp_04B']

# Runs data
sensor = 'lslshimmerresp'
df_runs_eval = hf.get_df_runs(output_eval_dir, sensor, '', '', False, True)

# Save probability df
df_probabilities = pd.concat([df_runs_eval, df_proba], axis=1)
df_probabilities.drop(columns=['difficulty', 'time'], inplace=True)
df_probabilities.loc[:,'run'] = df_probabilities['run'].astype('int')
df_probabilities.sort_values(['subject','run'], inplace=True)
df_probabilities.to_csv(os.path.join(ensembled_dir, 'lslshimmerresp_ensembled_eval.csv'), index=False)
df_probabilities

Unnamed: 0,subject,run,lslshimmerresp_01B,lslshimmerresp_02B,lslshimmerresp_03B,lslshimmerresp_04B
0,cp040,1,0.54,0.24,0.02,0.20
4,cp040,2,0.52,0.12,0.10,0.26
5,cp040,3,0.58,0.10,0.08,0.24
6,cp040,4,0.26,0.22,0.02,0.50
7,cp040,5,0.44,0.10,0.16,0.30
...,...,...,...,...,...,...
94,cp049,8,0.50,0.06,0.12,0.32
95,cp049,9,0.58,0.18,0.12,0.12
85,cp049,10,0.32,0.18,0.16,0.34
86,cp049,11,0.60,0.10,0.16,0.14


## Respiration (from Respitrace Device)

In [None]:
# Did not use

## TorsoACC

### Find max length of datasets across train and eval sets for padding

In [13]:
hz = 20
time_step = 1/hz
sensor = 'lslshimmertorsoacc'

# calculating pad length should use longest of all runs for same sensor
df_runs_forpadlength_trg = hf.get_df_runs(output_trg_dir, sensor, '', '', False)
pad_length_trg = int(df_runs_forpadlength_trg['time'].max()/time_step)+2

df_runs_forpadlength_eval = hf.get_df_runs(output_eval_dir, sensor, '', '', False, True)
pad_length_eval = int(df_runs_forpadlength_eval['time'].max()/time_step)+2

pad_length = max(pad_length_trg, pad_length_eval)
pad_length

ValueError: cannot convert float NaN to integer

### Validate model on training data folds
#### Generate folds

In [25]:
# Generate folds of training data

data_dir = output_trg_dir
target_dir = os.path.join(main_dir,'Data Ready for ML_validation')
hz = 20
sensor = 'lslshimmertorsoacc'
level = ''

scaler = StandardScaler()

for i in train_splits:
    file_suffix = f'train_lslshimmertorsoacc_standard_padtransform{i}'
    subject = train_splits[i]
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level)
    print()

Number of runs detected: 306
X shape: (306, 3, 18429)
y shape: (306,)
Saved files:
	X_train_lslshimmertorsoacc_standard_padtransform0.npy
	y_train_lslshimmertorsoacc_standard_padtransform0.npy

Number of runs detected: 318
X shape: (318, 3, 18429)
y shape: (318,)
Saved files:
	X_train_lslshimmertorsoacc_standard_padtransform1.npy
	y_train_lslshimmertorsoacc_standard_padtransform1.npy

Number of runs detected: 311
X shape: (311, 3, 18429)
y shape: (311,)
Saved files:
	X_train_lslshimmertorsoacc_standard_padtransform2.npy
	y_train_lslshimmertorsoacc_standard_padtransform2.npy

Number of runs detected: 307
X shape: (307, 3, 18429)
y shape: (307,)
Saved files:
	X_train_lslshimmertorsoacc_standard_padtransform3.npy
	y_train_lslshimmertorsoacc_standard_padtransform3.npy

Number of runs detected: 318
X shape: (318, 3, 18429)
y shape: (318,)
Saved files:
	X_train_lslshimmertorsoacc_standard_padtransform4.npy
	y_train_lslshimmertorsoacc_standard_padtransform4.npy



In [26]:
# Generate folds of test data

for i in test_splits:
    file_suffix = f'test_lslshimmertorsoacc_standard_padtransform{i}'
    subject = test_splits[i]
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level)
    print()

Number of runs detected: 84
X shape: (84, 3, 18429)
y shape: (84,)
Saved files:
	X_test_lslshimmertorsoacc_standard_padtransform0.npy
	y_test_lslshimmertorsoacc_standard_padtransform0.npy

Number of runs detected: 72
X shape: (72, 3, 18429)
y shape: (72,)
Saved files:
	X_test_lslshimmertorsoacc_standard_padtransform1.npy
	y_test_lslshimmertorsoacc_standard_padtransform1.npy

Number of runs detected: 79
X shape: (79, 3, 18429)
y shape: (79,)
Saved files:
	X_test_lslshimmertorsoacc_standard_padtransform2.npy
	y_test_lslshimmertorsoacc_standard_padtransform2.npy

Number of runs detected: 83
X shape: (83, 3, 18429)
y shape: (83,)
Saved files:
	X_test_lslshimmertorsoacc_standard_padtransform3.npy
	y_test_lslshimmertorsoacc_standard_padtransform3.npy

Number of runs detected: 72
X shape: (72, 3, 18429)
y shape: (72,)
Saved files:
	X_test_lslshimmertorsoacc_standard_padtransform4.npy
	y_test_lslshimmertorsoacc_standard_padtransform4.npy



#### Set up model

In [6]:
from sktime.classification.feature_based import SignatureClassifier

def torsoracc_folds(window_name, window_length, window_step, depth, random_state=None, result_CV=None):
    data_dir = os.path.join(main_dir,'Data Ready for ML_validation')
    probabilities = {}

    for i in range(5):
        X_train = np.load(os.path.join(data_dir, f'X_train_lslshimmertorsoacc_standard_padtransform{i}.npy'))
        y_train = np.load(os.path.join(data_dir, f'y_train_lslshimmertorsoacc_standard_padtransform{i}.npy'))
        X_test = np.load(os.path.join(data_dir, f'X_test_lslshimmertorsoacc_standard_padtransform{i}.npy'))
        y_test = np.load(os.path.join(data_dir, f'y_test_lslshimmertorsoacc_standard_padtransform{i}.npy'))

        clf = ColumnEnsembleClassifier(
            estimators=[
                ("est1", SignatureClassifier(window_name = window_name, window_length = window_length, window_step = window_step, depth = depth, random_state = random_state), [0]),
                ("est2", SignatureClassifier(window_name = window_name, window_length = window_length, window_step = window_step, depth = depth, random_state = random_state), [1]),
                ("est3", SignatureClassifier(window_name = window_name, window_length = window_length, window_step = window_step, depth = depth, random_state = random_state), [2]),
            ]
        )

        clf.fit(X_train, y_train)
        y_pred_proba_SignatureClassifier = clf.predict_proba(X_test)
        probabilities[i] = y_pred_proba_SignatureClassifier
        # Log results
        class_list = [0,1,2,3]
        result_CV = hf.log_result(f'Set {i}', class_list, y_test, y_pred_proba_SignatureClassifier, result_CV)
        
        print(f'\rFold {i} complete', end='')
    print()
    return result_CV, probabilities

### Generate probabilities for ensemble model

In [7]:
%%time

result, probabilities = torsoracc_folds(window_name="sliding", window_length=15, window_step=15, depth=4, random_state=42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 2h 5min 52s
Wall time: 2h 14min 54s


accuracy_score    0.408444
AUC_score         0.649584
F1_score          0.370554
dtype: float64

In [8]:
# Check
pd.DataFrame(probabilities[0])

Unnamed: 0,0,1,2,3
0,0.483333,0.196667,0.123333,0.196667
1,0.490000,0.176667,0.113333,0.220000
2,0.436667,0.150000,0.203333,0.210000
3,0.330000,0.233333,0.210000,0.226667
4,0.266667,0.330000,0.200000,0.203333
...,...,...,...,...
79,0.113333,0.306667,0.340000,0.240000
80,0.233333,0.363333,0.203333,0.200000
81,0.106667,0.320000,0.230000,0.343333
82,0.096667,0.290000,0.340000,0.273333


In [10]:
# Get run info

data_dir = output_trg_dir
hz = 20
sensor = 'lslshimmertorsoacc'
level = ''

df_combined = pd.DataFrame()
split_df_list = []
for i in range(5):
    subject = test_splits[i]
    df_runs = hf.get_df_runs(data_dir, sensor, subject, level, True)
    df_runs = df_runs.drop('time', axis=1)
    df_proba = pd.DataFrame(probabilities[i])
    df_proba.columns = ['lslshimmertorsoacc_01B', 'lslshimmertorsoacc_02B', 'lslshimmertorsoacc_03B', 'lslshimmertorsoacc_04B']
    df_cur = pd.concat([df_runs, df_proba], axis=1)
    split_df_list.append(df_cur)
    df_combined = pd.concat([df_combined, df_cur], axis=0)

Number of runs detected: 84
Number of runs detected: 72
Number of runs detected: 79
Number of runs detected: 83
Number of runs detected: 72


In [11]:
# Combine probabilities with run info and save it

target_dir = os.path.join(main_dir,'Data Ensembled')

df_combined_lslshimmertorsoacc = df_combined.copy()
df_combined_lslshimmertorsoacc = df_combined_lslshimmertorsoacc.sort_values(['subject','difficulty','run'], axis = 0)
df_combined_lslshimmertorsoacc = df_combined_lslshimmertorsoacc.reset_index(drop = True)
df_combined_lslshimmertorsoacc.to_csv(os.path.join(target_dir, 'lslshimmertorsoacc_ensembled_training.csv'), index=False)
df_combined_lslshimmertorsoacc

Unnamed: 0,subject,difficulty,run,lslshimmertorsoacc_01B,lslshimmertorsoacc_02B,lslshimmertorsoacc_03B,lslshimmertorsoacc_04B
0,cp004,01B,1,0.483333,0.196667,0.123333,0.196667
1,cp004,01B,2,0.490000,0.176667,0.113333,0.220000
2,cp004,01B,3,0.436667,0.150000,0.203333,0.210000
3,cp004,02B,1,0.330000,0.233333,0.210000,0.226667
4,cp004,02B,2,0.266667,0.330000,0.200000,0.203333
...,...,...,...,...,...,...,...
385,cp043,03B,2,0.113333,0.306667,0.340000,0.240000
386,cp043,03B,3,0.233333,0.363333,0.203333,0.200000
387,cp043,04B,1,0.106667,0.320000,0.230000,0.343333
388,cp043,04B,2,0.096667,0.290000,0.340000,0.273333


### Generate combined numpy data file for ML model training

In [8]:
# Generate full training data for TorsoACC

data_dir = output_trg_dir
target_dir = os.path.join(main_dir,'Data Ready for ML_final')
hz = 20
sensor = 'lslshimmertorsoacc'
subject = ''
level = ''

# StandardScaler
file_suffix = 'train_lslshimmertorsoacc_standard_padtransform'
scaler = StandardScaler()
hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level)

Number of runs detected: 390
X shape: (390, 3, 18429)
y shape: (390,)
Saved files:
	X_train_lslshimmertorsoacc_standard_padtransform.npy
	y_train_lslshimmertorsoacc_standard_padtransform.npy


### Train model on full set of training data

In [9]:
%%time
from sktime.classification.feature_based import SignatureClassifier

data_dir = os.path.join(main_dir,'Data Ready for ML_final')
model_dir = os.path.join(main_dir,'Trained Models')

X_train = np.load(os.path.join(data_dir, 'X_train_lslshimmertorsoacc_standard_padtransform.npy'))
y_train = np.load(os.path.join(data_dir, 'y_train_lslshimmertorsoacc_standard_padtransform.npy'))

clf = ColumnEnsembleClassifier(
    estimators=[
        ("est1", SignatureClassifier(estimator=None, augmentation_list=('basepoint', 'addtime'), window_name="sliding", window_length=15, window_step=15, rescaling=None, sig_tfm='signature', depth=4, random_state=42), [0]),
        ("est2", SignatureClassifier(estimator=None, augmentation_list=('basepoint', 'addtime'), window_name="sliding", window_length=15, window_step=15, rescaling=None, sig_tfm='signature', depth=4, random_state=42), [1]),
        ("est3", SignatureClassifier(estimator=None, augmentation_list=('basepoint', 'addtime'), window_name="sliding", window_length=15, window_step=15, rescaling=None, sig_tfm='signature', depth=4, random_state=42), [2]),
    ]
)

clf.fit(X_train, y_train)

# Save model
with open(os.path.join(model_dir, 'torsoACC.pkl'), 'wb') as f:
    pickle.dump(clf, f)

CPU times: total: 26min 6s
Wall time: 26min 18s


### Generate combined numpy data file for eval set

In [25]:
# Generate full training data for torsoracc

data_dir = output_eval_dir
target_dir = os.path.join(main_dir,'Data Ready for ML_eval')
sensor = 'lslshimmertorsoacc'
subject = ''
level = ''

# StandardScaler
file_suffix = 'eval_lslshimmertorsoacc_standard_padtransform'
scaler = StandardScaler()
hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level, True)

Number of runs detected: 96
X shape: (96, 3, 18429)
Saved files:
	X_eval_lslshimmertorsoacc_standard_padtransform.npy


### Apply trained model to eval data

In [26]:
# Load model
with open(os.path.join(model_dir, 'torsoACC.pkl'), 'rb') as f:
    clf = pickle.load(f)

# Do predictions
X_eval = np.load(os.path.join(processed_eval_dir, 'X_eval_lslshimmertorsoacc_standard_padtransform.npy'))
y_eval_proba = clf.predict_proba(X_eval)

In [27]:
# Get dataframe of probabilities
df_proba = pd.DataFrame(y_eval_proba)
df_proba.columns = ['lslshimmertorsoacc_01B', 'lslshimmertorsoacc_02B', 'lslshimmertorsoacc_03B', 'lslshimmertorsoacc_04B']

# Runs data
sensor = 'lslshimmertorsoacc'
df_runs_eval = hf.get_df_runs(output_eval_dir, sensor, '', '', False, True)

# Save probability df
df_probabilities = pd.concat([df_runs_eval, df_proba], axis=1)
df_probabilities.drop(columns=['difficulty', 'time'], inplace=True)
df_probabilities.loc[:,'run'] = df_probabilities['run'].astype('int')
df_probabilities.sort_values(['subject','run'], inplace=True)
df_probabilities.to_csv(os.path.join(ensembled_dir, 'lslshimmertorsoacc_ensembled_eval.csv'), index=False)
df_probabilities

Unnamed: 0,subject,run,lslshimmertorsoacc_01B,lslshimmertorsoacc_02B,lslshimmertorsoacc_03B,lslshimmertorsoacc_04B
0,cp040,1,0.450000,0.140000,0.146667,0.263333
4,cp040,2,0.403333,0.166667,0.126667,0.303333
5,cp040,3,0.406667,0.186667,0.153333,0.253333
6,cp040,4,0.406667,0.143333,0.173333,0.276667
7,cp040,5,0.480000,0.133333,0.140000,0.246667
...,...,...,...,...,...,...
94,cp049,8,0.406667,0.156667,0.150000,0.286667
95,cp049,9,0.486667,0.140000,0.150000,0.223333
85,cp049,10,0.376667,0.156667,0.170000,0.296667
86,cp049,11,0.483333,0.156667,0.133333,0.226667


## HTC Vive Eye Data

### Find max length of datasets across train and eval sets for padding

In [52]:
hz = 250
time_step = 1/hz
sensor = 'lslhtcviveeye_pupil_position_l'

# calculating pad length should use longest of all runs for same sensor
df_runs_forpadlength_trg = hf.get_df_runs_htceye(output_trg_dir, sensor, '', '', False)
pad_length_trg = df_runs_forpadlength_trg['length'].max()+2

df_runs_forpadlength_eval = hf.get_df_runs_htceye(output_eval_dir, sensor, '', '', False, True)
pad_length_eval = df_runs_forpadlength_eval['length'].max()+2

pad_length = max(pad_length_trg, pad_length_eval)
pad_length

219209

### Pupil Position
Use only left eye, xy axes

In [57]:
# Generate folds of training data

data_dir = output_trg_dir
target_dir = os.path.join(main_dir,'Data Ready for ML_validation')
hz = 250
sensor = 'lslhtcviveeye_pupil_position_l'
level = ''
scaler = StandardScaler()
print(train_splits)
for i in train_splits:
    file_suffix = f'train_lslhtcviveeye_pupil_position_l_standard_padtransform{i}'
    subject = train_splits[i]
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level)
    print()

# Generate folds of test data

for i in test_splits:
    file_suffix = f'test_lslhtcviveeye_pupil_position_l_standard_padtransform{i}'
    subject = test_splits[i]
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, pad_length, sensor, subject, level)
    print()

{0: ['diameter', 'direction', 'openness', 'origin'], 1: ['diameter', 'direction', 'openness', 'position'], 2: ['diameter', 'direction', 'origin', 'position'], 3: ['diameter', 'openness', 'origin', 'position'], 4: ['direction', 'openness', 'origin', 'position']}
Number of runs detected: 0


IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
from sktime.classification.feature_based import SignatureClassifier

def torsoracc_folds(window_name, window_length, window_step, depth, random_state=None, result_CV=None):
    data_dir = os.path.join(main_dir,'Data Ready for ML_validation')
    probabilities = {}

    for i in range(5):
        X_train = np.load(os.path.join(data_dir, f'X_train_lslshimmertorsoacc_standard_padtransform{i}.npy'))
        y_train = np.load(os.path.join(data_dir, f'y_train_lslshimmertorsoacc_standard_padtransform{i}.npy'))
        X_test = np.load(os.path.join(data_dir, f'X_test_lslshimmertorsoacc_standard_padtransform{i}.npy'))
        y_test = np.load(os.path.join(data_dir, f'y_test_lslshimmertorsoacc_standard_padtransform{i}.npy'))

        clf = ColumnEnsembleClassifier(
            estimators=[
                ("est1", SignatureClassifier(window_name = window_name, window_length = window_length, window_step = window_step, depth = depth, random_state = random_state), [0]),
                ("est2", SignatureClassifier(window_name = window_name, window_length = window_length, window_step = window_step, depth = depth, random_state = random_state), [1]),
                ("est3", SignatureClassifier(window_name = window_name, window_length = window_length, window_step = window_step, depth = depth, random_state = random_state), [2]),
            ]
        )

        clf.fit(X_train, y_train)
        y_pred_proba_SignatureClassifier = clf.predict_proba(X_test)
        probabilities[i] = y_pred_proba_SignatureClassifier
        # Log results
        class_list = [0,1,2,3]
        result_CV = hf.log_result(f'Set {i}', class_list, y_test, y_pred_proba_SignatureClassifier, result_CV)
        
        print(f'\rFold {i} complete', end='')
    print()
    return result_CV, probabilities

%%time


### Pupil Diameter
Use only left eye, xyz axes

### Eye Openess
Use only left eye, xyz axes

### Gaze Direction
Use only left eye, xyz axes

### Gaze Origin
Use only left eye, yz axes

# Ensemble Model
## Combine probability tables

In [5]:
from collections import defaultdict

types = defaultdict(lambda: 'float')
types['subject'] = 'str'
types['difficulty'] = 'str'
types['run'] = 'int'

data_dir = os.path.join(main_dir,'Data Ensembled')

df_trg_lslshimmerecg = pd.read_csv(os.path.join(data_dir, 'lslshimmerecg_ensembled_training.csv'), dtype=types)
df_trg_lslshimmerresp = pd.read_csv(os.path.join(data_dir, 'lslshimmerresp_ensembled_training.csv'), dtype=types)
df_trg_lslshimmertorsoacc = pd.read_csv(os.path.join(data_dir, 'lslshimmertorsoacc_ensembled_training.csv'), dtype=types)
df_trg_eye = pd.read_csv(os.path.join(data_dir, 'train_eye_probabilities.csv'), dtype=types)

df_eval_lslshimmerecg = pd.read_csv(os.path.join(data_dir, 'lslshimmerecg_ensembled_eval.csv'), dtype=types)
df_eval_lslshimmerresp = pd.read_csv(os.path.join(data_dir, 'lslshimmerresp_ensembled_eval.csv'), dtype=types)
df_eval_lslshimmertorsoacc = pd.read_csv(os.path.join(data_dir, 'lslshimmertorsoacc_ensembled_eval.csv'), dtype=types)
# df_eval_eye = pd.read_csv(os.path.join(data_dir, 'eval_eye_probabilities.csv'), dtype=types)

In [9]:
df_trg_full = df_trg_lslshimmerecg.merge(df_trg_lslshimmerresp, on=['subject','difficulty','run'], how='outer')
df_trg_full = df_trg_full.merge(df_trg_lslshimmertorsoacc, on=['subject','difficulty','run'], how='outer')
df_trg_full = df_trg_full.merge(df_trg_eye, on=['subject','difficulty','run'], how='outer')
df_trg_full.drop(['subject','run'], axis=1, inplace=True)
df_trg_full

Unnamed: 0,difficulty,lslshimmerecg_01B,lslshimmerecg_02B,lslshimmerecg_03B,lslshimmerecg_04B,lslshimmerresp_01B,lslshimmerresp_02B,lslshimmerresp_03B,lslshimmerresp_04B,lslshimmertorsoacc_01B,...,HTC_pupilposi_03B,HTC_pupilposi_04B,HTC_Diameter_01B,HTC_Diameter_02B,HTC_Diameter_03B,HTC_Diameter_04B,HTC_gazedirection_01B,HTC_gazedirection_02B,HTC_gazedirection_03B,HTC_gazedirection_04B
0,01B,0.288,0.178,0.272,0.262,0.48,0.10,0.12,0.30,,...,,,,,,,,,,
1,01B,0.220,0.190,0.236,0.354,0.40,0.12,0.18,0.30,,...,,,,,,,,,,
2,01B,0.262,0.190,0.280,0.268,0.40,0.14,0.24,0.22,,...,,,,,,,,,,
3,02B,0.126,0.278,0.314,0.282,0.24,0.16,0.32,0.28,,...,,,,,,,,,,
4,02B,0.110,0.318,0.318,0.254,0.16,0.18,0.38,0.28,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414,03B,,,,,,,,,,...,0.2700,0.36,0.37,0.34,0.11,0.18,0.1300,0.31,0.2700,0.29
415,03B,,,,,,,,,,...,0.2800,0.30,0.09,0.43,0.27,0.21,0.1000,0.22,0.3100,0.37
416,04B,,,,,,,,,,...,0.2100,0.26,0.11,0.14,0.29,0.46,0.3500,0.17,0.2600,0.22
417,04B,,,,,,,,,,...,0.1900,0.21,0.04,0.09,0.43,0.44,0.2900,0.22,0.2200,0.27


In [None]:
# df_eval_full = df_eval_lslshimmerecg.merge(df_eval_lslshimmerresp, on=['subject','difficulty','run'], how='outer')
# df_eval_full = df_eval_full.merge(df_eval_lslshimmertorsoacc, on=['subject','difficulty','run'], how='outer')
# df_eval_full = df_eval_full.merge(df_eval_eye, on=['subject','difficulty','run'], how='outer')
# df_eval_full

## Train ensemble model

In [10]:
from pycaret.classification import *
s = setup(df_trg_full, target = 'difficulty')

Unnamed: 0,Description,Value
0,Session id,1060
1,Target,difficulty
2,Target type,Multiclass
3,Target mapping,"01B: 0, 02B: 1, 03B: 2, 04B: 3"
4,Original data shape,"(419, 33)"
5,Transformed data shape,"(419, 33)"
6,Transformed train set shape,"(293, 33)"
7,Transformed test set shape,"(126, 33)"
8,Numeric features,32
9,Rows with missing values,6.9%


In [13]:
best = compare_models(sort='AUC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.5768,0.8035,0.5768,0.6201,0.5632,0.4369,0.4509,0.034
rf,Random Forest Classifier,0.5595,0.7879,0.5595,0.5843,0.5554,0.4137,0.4223,0.075
lda,Linear Discriminant Analysis,0.5661,0.78,0.5661,0.6033,0.5634,0.4225,0.4344,0.023
et,Extra Trees Classifier,0.5321,0.7771,0.5321,0.5645,0.5272,0.376,0.3849,0.073
lightgbm,Light Gradient Boosting Machine,0.5323,0.771,0.5323,0.56,0.5236,0.3759,0.3851,0.11
nb,Naive Bayes,0.4575,0.7479,0.4575,0.4924,0.4231,0.2781,0.2989,0.024
gbc,Gradient Boosting Classifier,0.5423,0.7478,0.5423,0.5613,0.5325,0.3902,0.3985,0.113
knn,K Neighbors Classifier,0.4677,0.7292,0.4677,0.4736,0.4612,0.2906,0.2946,0.032
qda,Quadratic Discriminant Analysis,0.4508,0.6777,0.4508,0.4533,0.4415,0.2689,0.2733,0.024
ada,Ada Boost Classifier,0.4436,0.6655,0.4436,0.4645,0.4355,0.2575,0.2638,0.046


In [16]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

### Set up various dataframes, starting with most important features

In [None]:
df_trg_full.columns

Index(['difficulty', 'lslshimmerecg_01B', 'lslshimmerecg_02B',
       'lslshimmerecg_03B', 'lslshimmerecg_04B', 'lslshimmerresp_01B',
       'lslshimmerresp_02B', 'lslshimmerresp_03B', 'lslshimmerresp_04B',
       'lslshimmertorsoacc_01B', 'lslshimmertorsoacc_02B',
       'lslshimmertorsoacc_03B', 'lslshimmertorsoacc_04B',
       'HTC_gazeorigin_01B', 'HTC_gazeorigin_02B', 'HTC_gazeorigin_03B',
       'HTC_gazeorigin_04B', 'HTC_eyeopenness_01B', 'HTC_eyeopenness_02B',
       'HTC_eyeopenness_03B', 'HTC_eyeopenness_04B', 'HTC_pupilposi_01B',
       'HTC_pupilposi_02B', 'HTC_pupilposi_03B', 'HTC_pupilposi_04B',
       'HTC_Diameter_01B', 'HTC_Diameter_02B', 'HTC_Diameter_03B',
       'HTC_Diameter_04B', 'HTC_gazedirection_01B', 'HTC_gazedirection_02B',
       'HTC_gazedirection_03B', 'HTC_gazedirection_04B'],
      dtype='object')

In [61]:
# Eye diameter
df_trg_diameter = df_trg_full.copy()
df_trg_diameter = df_trg_diameter[['difficulty', 'HTC_Diameter_01B', 'HTC_Diameter_02B', 'HTC_Diameter_03B', 'HTC_Diameter_04B']]

# Respiration
df_trg_resp = df_trg_full.copy()
df_trg_resp = df_trg_resp[['difficulty','lslshimmerresp_01B', 'lslshimmerresp_02B', 'lslshimmerresp_03B', 'lslshimmerresp_04B']]

# ECG
df_trg_ecg = df_trg_full.copy()
df_trg_ecg = df_trg_ecg[['difficulty','lslshimmerecg_01B', 'lslshimmerecg_02B', 'lslshimmerecg_03B', 'lslshimmerecg_04B']]

# Gaze origin
df_trg_gazeorigin = df_trg_full.copy()
df_trg_gazeorigin = df_trg_gazeorigin[['difficulty', 'HTC_gazeorigin_01B', 'HTC_gazeorigin_02B', 'HTC_gazeorigin_03B', 'HTC_gazeorigin_04B']]

# Gaze direction
df_trg_gazedir = df_trg_full.copy()
df_trg_gazedir = df_trg_gazedir[['difficulty', 'HTC_gazedirection_01B', 'HTC_gazedirection_02B',
'HTC_gazedirection_03B', 'HTC_gazedirection_04B']]

# Eye openess
df_trg_eyeopen = df_trg_full.copy()
df_trg_eyeopen = df_trg_eyeopen[['difficulty', 'HTC_eyeopenness_01B', 'HTC_eyeopenness_02B', 'HTC_eyeopenness_03B', 'HTC_eyeopenness_04B']]

### Try with just diameter

In [53]:
diameter_setup = setup(df_trg_diameter, target = 'difficulty')
best_diameter = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.6384,0.8316,0.6384,0.668,0.6322,0.5172,0.5257,0.142
et,Extra Trees Classifier,0.6079,0.846,0.6079,0.6202,0.5998,0.4763,0.4823,0.124
lightgbm,Light Gradient Boosting Machine,0.6044,0.8099,0.6044,0.6147,0.5948,0.4713,0.4789,0.12
dt,Decision Tree Classifier,0.597,0.7411,0.597,0.6201,0.592,0.462,0.4688,0.022
gbc,Gradient Boosting Classifier,0.5944,0.8138,0.5944,0.6107,0.5872,0.4581,0.4662,0.201
nb,Naive Bayes,0.5634,0.78,0.5634,0.5707,0.5474,0.4185,0.4275,0.021
lda,Linear Discriminant Analysis,0.5471,0.7867,0.5471,0.5576,0.5374,0.3957,0.4031,0.021
ridge,Ridge Classifier,0.5506,0.0,0.5506,0.5575,0.5288,0.4012,0.4124,0.018
lr,Logistic Regression,0.5406,0.7757,0.5406,0.5365,0.5259,0.3874,0.3937,0.026
svm,SVM - Linear Kernel,0.5367,0.0,0.5367,0.5245,0.503,0.3799,0.3953,0.028


### Try with just respiration

In [55]:
resp_setup = setup(df_trg_resp, target = 'difficulty')
best_resp = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.334,0.5829,0.334,0.3544,0.3294,0.1127,0.1158,0.06
lda,Linear Discriminant Analysis,0.3615,0.6129,0.3615,0.3363,0.3282,0.1508,0.1585,0.018
nb,Naive Bayes,0.3614,0.6117,0.3614,0.3057,0.3036,0.1502,0.1652,0.023
lightgbm,Light Gradient Boosting Machine,0.3102,0.5271,0.3102,0.3064,0.3029,0.0789,0.08,0.03
rf,Random Forest Classifier,0.3102,0.5478,0.3102,0.3036,0.2986,0.0782,0.0796,0.143
gbc,Gradient Boosting Classifier,0.3034,0.5688,0.3034,0.308,0.2979,0.0703,0.0717,0.218
lr,Logistic Regression,0.3445,0.6132,0.3445,0.2777,0.2857,0.1304,0.1435,0.027
ridge,Ridge Classifier,0.3511,0.0,0.3511,0.2944,0.2841,0.1392,0.1561,0.017
et,Extra Trees Classifier,0.293,0.5388,0.293,0.2883,0.2824,0.056,0.0556,0.123
dt,Decision Tree Classifier,0.2861,0.5324,0.2861,0.2931,0.282,0.0465,0.0465,0.021


### Try with just ECG

In [56]:
ecg_setup = setup(df_trg_ecg, target = 'difficulty')
best_ecg = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.3516,0.6111,0.3516,0.3461,0.3417,0.1346,0.1365,0.255
ada,Ada Boost Classifier,0.3552,0.5956,0.3552,0.3536,0.3376,0.1386,0.1431,0.079
lda,Linear Discriminant Analysis,0.3889,0.6265,0.3889,0.3888,0.3333,0.1852,0.204,0.018
knn,K Neighbors Classifier,0.3479,0.573,0.3479,0.324,0.3229,0.1304,0.1359,0.033
lightgbm,Light Gradient Boosting Machine,0.3141,0.5992,0.3141,0.3182,0.3103,0.0835,0.0847,0.038
rf,Random Forest Classifier,0.3207,0.5917,0.3207,0.301,0.3041,0.0934,0.0952,0.174
lr,Logistic Regression,0.379,0.6157,0.379,0.3196,0.3026,0.1728,0.1945,0.023
et,Extra Trees Classifier,0.3177,0.5782,0.3177,0.3021,0.3018,0.0896,0.0917,0.149
nb,Naive Bayes,0.3823,0.6138,0.3823,0.2942,0.2959,0.1756,0.2017,0.021
dt,Decision Tree Classifier,0.297,0.5326,0.297,0.2944,0.2891,0.0627,0.0638,0.023


### Try with just gaze origin

In [57]:
gazeorigin_setup = setup(df_trg_gazeorigin, target = 'difficulty')
best_gazeorigin = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.3482,0.5941,0.3482,0.3628,0.3427,0.1302,0.1328,0.073
lr,Logistic Regression,0.3549,0.6334,0.3549,0.3706,0.3334,0.1428,0.1499,0.024
lda,Linear Discriminant Analysis,0.3248,0.639,0.3248,0.3345,0.3156,0.1007,0.1035,0.019
nb,Naive Bayes,0.342,0.6276,0.342,0.3111,0.312,0.1245,0.1297,0.024
et,Extra Trees Classifier,0.3176,0.5489,0.3176,0.3169,0.3096,0.0906,0.0922,0.134
lightgbm,Light Gradient Boosting Machine,0.314,0.5761,0.314,0.324,0.3089,0.0845,0.0861,0.039
gbc,Gradient Boosting Classifier,0.317,0.5824,0.317,0.3083,0.3043,0.0886,0.0895,0.25
ridge,Ridge Classifier,0.3246,0.0,0.3246,0.2966,0.2963,0.1021,0.1073,0.019
knn,K Neighbors Classifier,0.2967,0.5451,0.2967,0.3118,0.2891,0.063,0.0647,0.03
rf,Random Forest Classifier,0.297,0.5369,0.297,0.29,0.2852,0.0622,0.0629,0.168


### Try with just gaze direction

In [54]:
gazedir_setup = setup(df_trg_gazedir, target = 'difficulty')
best_gazedir = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.3717,0.6148,0.3717,0.3934,0.3483,0.162,0.1745,0.021
lda,Linear Discriminant Analysis,0.3482,0.6177,0.3482,0.3495,0.3398,0.1309,0.133,0.02
ada,Ada Boost Classifier,0.3307,0.5537,0.3307,0.33,0.3241,0.1074,0.1086,0.063
ridge,Ridge Classifier,0.348,0.0,0.348,0.3221,0.3233,0.1306,0.1346,0.019
lr,Logistic Regression,0.3413,0.6138,0.3413,0.3146,0.3155,0.1219,0.1261,0.024
et,Extra Trees Classifier,0.3076,0.5984,0.3076,0.3208,0.3042,0.0788,0.08,0.13
gbc,Gradient Boosting Classifier,0.3038,0.6057,0.3038,0.3032,0.293,0.0726,0.0741,0.205
rf,Random Forest Classifier,0.304,0.6269,0.304,0.3051,0.2919,0.0744,0.0756,0.125
dt,Decision Tree Classifier,0.2974,0.5354,0.2974,0.2968,0.2901,0.0645,0.066,0.021
knn,K Neighbors Classifier,0.2974,0.6174,0.2974,0.2893,0.2837,0.0639,0.0657,0.03


### Try with just eye openess

In [63]:
eyeopen_setup = setup(df_trg_eyeopen, target = 'difficulty')
best_eyeopen = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.4026,0.6243,0.4026,0.4006,0.3834,0.2033,0.2094,0.02
nb,Naive Bayes,0.3889,0.6205,0.3889,0.397,0.3655,0.1845,0.1917,0.022
lr,Logistic Regression,0.3855,0.6278,0.3855,0.3668,0.3497,0.1816,0.1912,0.025
ridge,Ridge Classifier,0.3721,0.0,0.3721,0.3353,0.329,0.1635,0.1725,0.02
ada,Ada Boost Classifier,0.3175,0.5433,0.3175,0.3121,0.3054,0.091,0.0936,0.061
et,Extra Trees Classifier,0.3036,0.5643,0.3036,0.3064,0.2979,0.0726,0.074,0.12
rf,Random Forest Classifier,0.2937,0.5438,0.2937,0.2966,0.2889,0.0591,0.0601,0.159
lightgbm,Light Gradient Boosting Machine,0.2868,0.5459,0.2868,0.2816,0.2791,0.0487,0.0493,0.138
dt,Decision Tree Classifier,0.2762,0.5177,0.2762,0.2764,0.2709,0.0342,0.0343,0.023
knn,K Neighbors Classifier,0.2725,0.5274,0.2725,0.2869,0.2648,0.0303,0.0317,0.034


### Try with diameter and respiration

In [58]:
df_trg_diameter_resp = df_trg_full.copy()
df_trg_diameter_resp = df_trg_diameter_resp[['difficulty', 'HTC_Diameter_01B', 'HTC_Diameter_02B', 'HTC_Diameter_03B', 'HTC_Diameter_04B', 'lslshimmerresp_01B', 'lslshimmerresp_02B', 'lslshimmerresp_03B', 'lslshimmerresp_04B']]

diameter_resp_setup = setup(df_trg_diameter_resp, target = 'difficulty')
best_diameter_resp = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.598,0.8192,0.598,0.6091,0.5878,0.4633,0.4716,0.029
nb,Naive Bayes,0.5914,0.8163,0.5914,0.6033,0.5855,0.4549,0.461,0.024
lda,Linear Discriminant Analysis,0.5876,0.8253,0.5876,0.6027,0.5799,0.45,0.4565,0.023
ridge,Ridge Classifier,0.591,0.0,0.591,0.6023,0.5784,0.4537,0.4629,0.017
et,Extra Trees Classifier,0.5875,0.8015,0.5875,0.5937,0.5773,0.4494,0.4559,0.132
rf,Random Forest Classifier,0.5775,0.8057,0.5775,0.5961,0.5714,0.4361,0.4431,0.157
lightgbm,Light Gradient Boosting Machine,0.5639,0.7809,0.5639,0.5848,0.5608,0.418,0.4233,0.041
gbc,Gradient Boosting Classifier,0.5605,0.7969,0.5605,0.5838,0.5579,0.4132,0.419,0.242
knn,K Neighbors Classifier,0.5564,0.7537,0.5564,0.5687,0.5485,0.4086,0.4152,0.032
qda,Quadratic Discriminant Analysis,0.5534,0.7905,0.5534,0.6128,0.5441,0.4048,0.4254,0.022


### Try with diameter and ECG

In [59]:
df_trg_diameter_ecg = df_trg_full.copy()
df_trg_diameter_ecg = df_trg_diameter_ecg[['difficulty', 'HTC_Diameter_01B', 'HTC_Diameter_02B', 'HTC_Diameter_03B', 'HTC_Diameter_04B', 'lslshimmerecg_01B', 'lslshimmerecg_02B', 'lslshimmerecg_03B', 'lslshimmerecg_04B']]

diameter_ecg_setup = setup(df_trg_diameter_ecg, target = 'difficulty')
best_diameter_ecg = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.5905,0.8047,0.5905,0.6168,0.579,0.4546,0.4653,0.021
rf,Random Forest Classifier,0.5877,0.8035,0.5877,0.5986,0.5789,0.4501,0.4575,0.128
lr,Logistic Regression,0.5939,0.799,0.5939,0.5861,0.5781,0.4589,0.4667,0.022
gbc,Gradient Boosting Classifier,0.58,0.7866,0.58,0.5951,0.5762,0.4396,0.4451,0.23
lda,Linear Discriminant Analysis,0.5839,0.7973,0.5839,0.5787,0.5708,0.4452,0.4522,0.019
ridge,Ridge Classifier,0.587,0.0,0.587,0.5734,0.5665,0.4494,0.4581,0.017
et,Extra Trees Classifier,0.5598,0.7962,0.5598,0.5667,0.5536,0.4124,0.4167,0.121
lightgbm,Light Gradient Boosting Machine,0.5428,0.7931,0.5428,0.5623,0.5396,0.3888,0.3943,0.032
knn,K Neighbors Classifier,0.5392,0.752,0.5392,0.5497,0.5268,0.3842,0.3914,0.026
svm,SVM - Linear Kernel,0.5326,0.0,0.5326,0.5571,0.5062,0.3791,0.4018,0.019


### Try with diameter and gaze direction

In [60]:
df_trg_diameter_gazedir = df_trg_full.copy()
df_trg_diameter_gazedir = df_trg_diameter_gazedir[['difficulty', 'HTC_Diameter_01B', 'HTC_Diameter_02B', 'HTC_Diameter_03B', 'HTC_Diameter_04B', 'HTC_gazedirection_01B', 'HTC_gazedirection_02B',
'HTC_gazedirection_03B', 'HTC_gazedirection_04B']]

diameter_gazedir_setup = setup(df_trg_diameter_gazedir, target = 'difficulty')
best_diameter_gazedir = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.5909,0.7896,0.5909,0.6214,0.5908,0.4531,0.4599,0.021
lr,Logistic Regression,0.5803,0.7919,0.5803,0.6028,0.572,0.4394,0.4479,0.021
nb,Naive Bayes,0.57,0.7893,0.57,0.5994,0.5652,0.4256,0.4338,0.02
ridge,Ridge Classifier,0.5668,0.0,0.5668,0.5843,0.5535,0.4215,0.4323,0.018
rf,Random Forest Classifier,0.5566,0.7859,0.5566,0.5737,0.5472,0.4083,0.4161,0.132
gbc,Gradient Boosting Classifier,0.5326,0.7715,0.5326,0.562,0.5326,0.3761,0.3826,0.227
et,Extra Trees Classifier,0.5359,0.7717,0.5359,0.5552,0.5302,0.3801,0.3864,0.123
knn,K Neighbors Classifier,0.5322,0.7523,0.5322,0.5522,0.5285,0.376,0.3816,0.024
lightgbm,Light Gradient Boosting Machine,0.5084,0.7597,0.5084,0.527,0.505,0.3438,0.3483,0.035
qda,Quadratic Discriminant Analysis,0.4779,0.7213,0.4779,0.5093,0.4597,0.3013,0.3157,0.018


### Try with diameter and eye openess

In [65]:
df_trg_diameter_eyeopen = df_trg_full.copy()
df_trg_diameter_eyeopen = df_trg_diameter_eyeopen[['difficulty', 'HTC_Diameter_01B', 'HTC_Diameter_02B', 'HTC_Diameter_03B', 'HTC_Diameter_04B', 'HTC_eyeopenness_01B', 'HTC_eyeopenness_02B', 'HTC_eyeopenness_03B', 'HTC_eyeopenness_04B']]

diameter_eyeopen_setup = setup(df_trg_diameter_eyeopen, target = 'difficulty')
best_diameter_eyeopen = compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.611,0.8172,0.611,0.617,0.601,0.4806,0.4864,0.123
rf,Random Forest Classifier,0.5836,0.8078,0.5836,0.598,0.5769,0.4448,0.4509,0.133
lightgbm,Light Gradient Boosting Machine,0.5733,0.8146,0.5733,0.5936,0.5691,0.4316,0.4393,0.034
lr,Logistic Regression,0.5801,0.7779,0.5801,0.5775,0.5655,0.4401,0.4481,0.023
lda,Linear Discriminant Analysis,0.5598,0.7792,0.5598,0.569,0.5568,0.4129,0.4169,0.018
ridge,Ridge Classifier,0.5597,0.0,0.5597,0.5656,0.54,0.4132,0.4239,0.016
gbc,Gradient Boosting Classifier,0.5456,0.806,0.5456,0.5546,0.5361,0.3945,0.401,0.229
nb,Naive Bayes,0.5426,0.772,0.5426,0.5501,0.536,0.39,0.3955,0.018
knn,K Neighbors Classifier,0.5152,0.7465,0.5152,0.5192,0.5039,0.3535,0.3593,0.027
svm,SVM - Linear Kernel,0.5287,0.0,0.5287,0.5553,0.4942,0.3705,0.3976,0.019
