# Imports

In [1]:
import helperfunctions as hf

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from IPython.display import clear_output
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sktime.transformations.panel.padder import PaddingTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sktime.classification.compose import ColumnEnsembleClassifier
from sktime.utils.mlflow_sktime import save_model

To generate the pip `requirements.txt` file, run:

`pip list --format=freeze > requirements.txt`

# Training Data

## Data Cleaning for Training Set
### Set up directory
```
-- Directory Structure --

Main Directory
| Code - put code files here: 'Submission.ipynb' and 'helperfunctions.py'
| dataPackage - the original dataPackage with training set data
```

In [2]:
# set directories
# The project root (main_dir) is the parent of the current working directory,
# so the notebook is expected to be run from the 'Code' folder of the layout
# described above.
cwd = os.getcwd()
main_dir = os.path.split(cwd)[0]  # parent directory of cwd
data_ils_dir = os.path.join(main_dir, 'dataPackage', 'task-ils')  # ILS task recordings
data_rest_dir = os.path.join(main_dir, 'dataPackage', 'task-rest')  # rest task recordings (only referenced by the disabled rest-cleaning cell)
data_pkg_dir = os.path.join(main_dir, 'dataPackage')  # data package root; the lookup CSVs are written here

In [14]:
# Resolve every output location under the project root, then create the ones
# that do not exist yet (existing directories are left untouched).
output_dir = os.path.join(main_dir,'Cleaned Data')
# rest data cleaning (disabled - rest data is not used)
# rest_output_dir = os.path.join(main_dir,'Cleaned Data Rest')
ML_final_output_dir = os.path.join(main_dir,'Data Ready for ML_final')
ML_validation_output_dir = os.path.join(main_dir,'Data Ready for ML_validation')
model_dir = os.path.join(main_dir,'Trained Models')

for _required_dir in (output_dir, ML_final_output_dir, ML_validation_output_dir, model_dir):
    os.makedirs(_required_dir, exist_ok=True)

### Cleaning for selected sensors
`['lslshimmertorsoacc','lslshimmereda','lslshimmeremg','lslshimmerresp','lslrespitrace','lslshimmerecg']`

In [4]:
# Build the three lookup tables ('cap_name_list', 'level_list', sensor cutoff
# frequencies) and persist each one as a CSV inside the data package folder.

# Participant IDs: last five characters of every 'task-ils' entry containing 'cp'.
cap_name_list = pd.DataFrame(
    data=[entry[-5:] for entry in os.listdir(data_ils_dir) if 'cp' in entry],
    columns=['cp_ID'],
)
cap_name_list.to_csv(os.path.join(data_pkg_dir,'cap_name_list.csv'), index=False)

# Difficulty levels.
level_list = pd.DataFrame({'level': ['01B','02B','03B','04B']})
level_list.to_csv(os.path.join(data_pkg_dir,'level_list.csv'), index=False)

# Selected sensors and the low-pass cutoff frequency used for each of them.
sensor_cutoff_freq_list = pd.DataFrame({
    'sensor_name': ['lslshimmertorsoacc','lslshimmereda','lslshimmeremg','lslshimmerresp','lslrespitrace','lslshimmerecg'],
    'freq': [10,30,5,30,30,30],
})
sensor_cutoff_freq_list.to_csv(os.path.join(data_pkg_dir,'selected_sensor_cutoff_freq.csv'), index=False)

In [5]:
cap_name_list = pd.read_csv(os.path.join(data_pkg_dir,'cap_name_list.csv'))
level_list = pd.read_csv(os.path.join(data_pkg_dir,'level_list.csv'))
sensor_freq_list = pd.read_csv(os.path.join(data_pkg_dir,'selected_sensor_cutoff_freq.csv'))

In [6]:
# Data cleaning for the ILS task data - one cleaned CSV is written per raw CSV.
# Each raw file is trimmed to the simulation window, low-pass filtered channel
# by channel via FFT, decimated, and given a synthetic 'Time' column.
# Combinations that fail are printed (cap, level, sensor, run, error) and skipped.
for cap in cap_name_list['cp_ID']:
    for level in level_list['level']:
        for sensor in sensor_freq_list['sensor_name']:
            # Reset before the try block so the failure report never refers to an
            # unbound or stale run number when the file lookup itself fails.
            run = 0
            try:
                data_csv_list = hf.get_dirs_to_csv(data_ils_dir, cap, level, sensor)
                # The cutoff depends only on the sensor, so look it up once.
                cut_off_freq = sensor_freq_list.loc[sensor_freq_list['sensor_name'] == sensor, 'freq'].iloc[0]
                for run, csv_dir in enumerate(data_csv_list, start=1):
                    sr = hf.get_csv_freq(csv_dir)
                    # Decimation factor that brings the signal to roughly 2 x cutoff samples/s.
                    dsr = np.rint(sr/(cut_off_freq*2))
                    df = pd.read_csv(csv_dir)
                    # remove from df before simulation start and after simulation ends (12 Dec new)
                    head, tail = hf.get_head_tail_time_to_remove(csv_dir)
                    # remove before start (row count rounded up)
                    if head > 0:
                        head_rows = head * sr
                        head_rows = int(head_rows) + (head_rows % 1 > 0)
                        df = df.drop(df.index[:head_rows])
                    # remove after stop (row count rounded up)
                    if tail > 0:
                        tail_rows = tail * sr
                        tail_rows = int(tail_rows) + (tail_rows % 1 > 0)
                        df = df.drop(df.index[-tail_rows:])
                    # end of 12 Dec New added code

                    filtered = {}
                    for column in df:
                        if column == 'time_dn':
                            continue
                        X = np.fft.fft(df[column], axis=0)
                        # NOTE(review): this zeroes every FFT bin from index
                        # cut_off_freq*sr upward. In an N-point FFT the bin for f Hz
                        # is f*N/sr, and the upper half of the spectrum holds the
                        # negative frequencies - confirm this is the intended
                        # low-pass. Left unchanged to keep the cleaned data
                        # reproducible.
                        X[cut_off_freq*sr:] = 0
                        filtered[column] = np.fft.ifft(X, axis=0).real[::int(dsr)]
                    df_out = pd.DataFrame(filtered)

                    # add time column (constant step of 1 / (2 * cutoff) seconds)
                    times = [i / (cut_off_freq*2) for i in range(df_out.shape[0])]
                    df_out.insert(0, 'Time', times)
                    output_csv = os.path.join(output_dir, f"{sensor}_{level}_{cap}_{run}.csv")
                    df_out.to_csv(output_csv, index=False)
            except Exception as exc:
                # Best effort: report the failing combination and its cause, then
                # continue. Catching Exception (not a bare except) keeps Ctrl-C working.
                print(cap, level, sensor, run, repr(exc))

cp009 01B lslshimmereda 1
cp009 01B lslshimmerresp 1
cp009 01B lslshimmerecg 1
cp009 02B lslshimmereda 1
cp009 02B lslshimmerresp 1
cp009 02B lslshimmerecg 1
cp009 03B lslshimmereda 1
cp009 03B lslshimmerresp 1
cp009 03B lslshimmerecg 1
cp009 04B lslshimmereda 1
cp009 04B lslshimmerresp 1
cp009 04B lslshimmerecg 1
cp028 01B lslshimmeremg 3


In [10]:
## DID NOT USE REST DATA ##
# data cleaning for rest data - output as 1 file per csv
# failed files are printed as output
# level = '000'
# for cap in cap_name_list['cp_ID']:
#         for sensor in sensor_freq_list['sensor_name']:
#             try:
#                 data_csv_list = hf.get_dirs_to_csv(data_rest_dir, cap, level, sensor)
#                 run = 0
#                 for csv_dir in data_csv_list:
#                     run = run + 1
#                     sr = hf.get_csv_freq(csv_dir)
#                     cut_off_freq = sensor_freq_list.loc[sensor_freq_list['sensor_name'] == sensor, 'freq'].iloc[0]
#                     dsr = np.rint(sr/(cut_off_freq*2))
#                     df = pd.read_csv(csv_dir)

#                     df_out = pd.DataFrame()
#                     cols, times = [], []
#                     for column in df:
#                         if column != 'time_dn':
#                             cols.append(column)
#                             df1 = df[column]
#                             X = np.fft.fft(df1,axis=0)
#                             X_lpf = X
#                             X_lpf[cut_off_freq*sr:] = 0
#                             Y_lpf = np.fft.ifft(X_lpf,axis=0)
#                             Y_lpf = Y_lpf.real
#                             Y_dsp = Y_lpf[::int(dsr)]
#                             df_out = pd.concat([df_out, pd.DataFrame(Y_dsp)], axis=1)

#                             # output_csv = os.path.join(rest_output_dir, f"{sensor}_{column}_{level}_{cap}_{run}.csv")
#                             # df_out = pd.DataFrame(Y_dsp)
#                             # df_out.columns = [column]
#                             # df_out.to_csv(output_csv, index=False)
                        
#                     df_out.columns = cols
#                     # add time column
#                     for i in range(df_out.shape[0]):
#                         times.append(i*1/(cut_off_freq*2))
#                     df_out.insert(0, 'Time', times)
#                     output_csv = os.path.join(rest_output_dir, f"{sensor}_{level}_{cap}_{run}.csv")
#                     df_out.to_csv(output_csv, index=False)
#             except:
#                  print (cap, level, sensor, run)
#                  pass

cp009 000 lslshimmereda 1
cp009 000 lslshimmerresp 1
cp009 000 lslshimmerecg 1
cp028 000 lslshimmeremg 2
cp042 000 lslshimmeremg 1


### Move HTC Vive Eye Data to clean data folder

In [7]:
# Copy the HTC Vive eye-tracking CSVs to the cleaned-data folder, trimmed to the
# simulation window (no filtering or resampling is applied here).
# Timestamps are in original matlab format
# Test data only
# Failed files are printed as output (cap, level, sensor, run, error)

sensor = 'lslhtcviveeye'

for cap in cap_name_list['cp_ID']:
    for level in level_list['level']:
        # Reset before the try block so the failure report never refers to an
        # unbound or stale run number when the file lookup itself fails.
        run = 0
        try:
            data_csv_list = hf.get_dirs_to_csv(data_ils_dir, cap, level, sensor)
            for run, csv_dir in enumerate(data_csv_list, start=1):
                sr = hf.get_csv_freq(csv_dir)
                df = pd.read_csv(csv_dir)
                # remove from df before simulation start and after simulation ends (12 Dec new)
                head, tail = hf.get_head_tail_time_to_remove(csv_dir)
                # remove before start (row count rounded up)
                if head > 0:
                    head_rows = head * sr
                    head_rows = int(head_rows) + (head_rows % 1 > 0)
                    df = df.drop(df.index[:head_rows])
                # remove after stop (row count rounded up)
                if tail > 0:
                    tail_rows = tail * sr
                    tail_rows = int(tail_rows) + (tail_rows % 1 > 0)
                    df = df.drop(df.index[-tail_rows:])
                output_csv = os.path.join(output_dir, f"{sensor}_{level}_{cap}_{run}.csv")
                df.to_csv(output_csv, index=False)
        except Exception as exc:
            # Best effort: report the failing combination and its cause, then
            # continue. Catching Exception (not a bare except) keeps Ctrl-C working.
            print(cap, level, sensor, run, repr(exc))

cp003 01B lslhtcviveeye 1
cp003 02B lslhtcviveeye 1
cp003 03B lslhtcviveeye 1
cp003 04B lslhtcviveeye 1
cp027 01B lslhtcviveeye 1
cp027 03B lslhtcviveeye 1


## Generate 5 folds for train test split

In [3]:
# Dry run of the 5-fold grouped split on placeholder subject numbers 1..33.
subjects2 = list(range(1, 34))

# Split: every subject is its own group, so no subject appears in both the
# train and the test part of a fold.
grpkfold = GroupKFold(n_splits=5)
train, test = [], []
for train_i, test_i in grpkfold.split(X=subjects2, groups=subjects2):
    train.append(train_i)
    test.append(test_i)

# Translate the positional indices of each fold back into subject values.
train_splits = {i: [subjects2[j] for j in fold] for i, fold in enumerate(train)}
test_splits = {i: [subjects2[j] for j in fold] for i, fold in enumerate(test)}

# Check splits: train members, test members and their combined count per fold.
for i in range(5):
    print(train_splits[i])
    print(test_splits[i])
    print(len(train_splits[i]) + len(test_splits[i]))
    print()

[1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 31, 32]
[5, 10, 15, 20, 25, 30, 33]
33

[1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15, 18, 19, 20, 22, 23, 24, 25, 27, 28, 29, 30, 32, 33]
[6, 11, 16, 17, 21, 26, 31]
33

[3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 33]
[1, 2, 7, 12, 22, 27, 32]
33

[1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 16, 17, 19, 20, 21, 22, 24, 25, 26, 27, 29, 30, 31, 32, 33]
[3, 8, 13, 18, 23, 28]
33

[1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 15, 16, 17, 18, 20, 21, 22, 23, 25, 26, 27, 28, 30, 31, 32, 33]
[4, 9, 14, 19, 24, 29]
33



In [8]:
# Split the participants found in the cleaned-data folder into 5 grouped folds.
# Paths are derived from main_dir (as in the rest of the notebook) rather than
# from a machine-specific absolute path.
data_dir = os.path.join(main_dir, 'Cleaned Data')
target_dir = os.path.join(main_dir, 'Data Ready for ML_folds')

# Get list of subjects
# Cleaned files are named "{sensor}_{level}_{cap}_{run}.csv", so field 2 of the
# '_'-split name is the participant ID.
# NOTE(review): assumes hf.get_all_data_csv_filenames returns names whose
# '_'-split index 2 is the participant - confirm if it returns full paths.
data_files = hf.get_all_data_csv_filenames(data_dir)
split_files = [i.split('_') for i in data_files]
subjects = sorted({fields[2] for fields in split_files})

# Split: every subject is its own group, so no subject appears in both the
# train and the test part of a fold.
train, test = [], []

grpkfold = GroupKFold(n_splits=5)
for train_i, test_i in grpkfold.split(X=subjects, groups=subjects):
    train.append(train_i)
    test.append(test_i)

# Translate the positional indices of each fold back into participant IDs.
train_splits = {i: [subjects[j] for j in fold] for i, fold in enumerate(train)}
test_splits = {i: [subjects[j] for j in fold] for i, fold in enumerate(test)}

In [9]:
# Check splits
for i in range(5):
    print(train_splits[i])
    print()
    print(test_splits[i])
    print(len(train_splits[i]) + len(test_splits[i]))
    print()

['cp003', 'cp005', 'cp006', 'cp008', 'cp009', 'cp012', 'cp013', 'cp014', 'cp015', 'cp017', 'cp018', 'cp019', 'cp020', 'cp022', 'cp023', 'cp024', 'cp025', 'cp026', 'cp027', 'cp029', 'cp030', 'cp031', 'cp032', 'cp035', 'cp036', 'cp037', 'cp038', 'cp042']

['cp004', 'cp011', 'cp016', 'cp028', 'cp033', 'cp039', 'cp043']
35

['cp003', 'cp004', 'cp005', 'cp006', 'cp008', 'cp011', 'cp012', 'cp014', 'cp016', 'cp017', 'cp018', 'cp019', 'cp022', 'cp023', 'cp024', 'cp025', 'cp026', 'cp028', 'cp029', 'cp030', 'cp032', 'cp033', 'cp035', 'cp036', 'cp037', 'cp039', 'cp042', 'cp043']

['cp009', 'cp013', 'cp015', 'cp020', 'cp027', 'cp031', 'cp038']
35

['cp003', 'cp004', 'cp005', 'cp006', 'cp009', 'cp011', 'cp012', 'cp013', 'cp015', 'cp016', 'cp017', 'cp018', 'cp020', 'cp022', 'cp025', 'cp027', 'cp028', 'cp029', 'cp030', 'cp031', 'cp032', 'cp033', 'cp035', 'cp036', 'cp038', 'cp039', 'cp042', 'cp043']

['cp008', 'cp014', 'cp019', 'cp023', 'cp024', 'cp026', 'cp037']
35

['cp003', 'cp004', 'cp005', 'cp008

# Individual data types

## EMG

In [None]:
# Did not use

## EDA

In [None]:
# Did not use

## ECG

### Validate model on training data folds
#### Generate folds

In [None]:
# Generate folds of training data
# For each fold, hf.generate_ml_data is called with that fold's training
# subjects; the file suffix carries the fold number. The variables defined here
# (data_dir, target_dir, hz, sensor, level, scaler) are reused by the
# test-fold cell that follows.

data_dir = os.path.join(main_dir,'Cleaned Data')
target_dir = os.path.join(main_dir,'Data Ready for ML_validation')
hz = 60  # presumably the rate of the cleaned ECG files (2 x the 30 Hz cutoff) - confirm against hf.generate_ml_data
sensor = 'lslshimmerecg'
level = ''  # presumably an empty string selects every difficulty level - confirm

# NOTE(review): one scaler instance is passed to every call; whether it is
# refit per call depends on hf.generate_ml_data - confirm.
scaler = StandardScaler()

for i in train_splits:
    file_suffix = f'train_lslshimmerecg_standard_padtransform{i}'
    subject = train_splits[i]  # list of subjects in the training part of fold i
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, hz, sensor, subject, level)
    print()

Number of runs detected: 323
X shape: (323, 3, 48024)
y shape: (323,)
Saved files:
	X_train_lslshimmerecg_standard_padtransform0.npy
	y_train_lslshimmerecg_standard_padtransform0.npy

Number of runs detected: 335
X shape: (335, 3, 48024)
y shape: (335,)
Saved files:
	X_train_lslshimmerecg_standard_padtransform1.npy
	y_train_lslshimmerecg_standard_padtransform1.npy

Number of runs detected: 323
X shape: (323, 3, 48024)
y shape: (323,)
Saved files:
	X_train_lslshimmerecg_standard_padtransform2.npy
	y_train_lslshimmerecg_standard_padtransform2.npy

Number of runs detected: 324
X shape: (324, 3, 48024)
y shape: (324,)
Saved files:
	X_train_lslshimmerecg_standard_padtransform3.npy
	y_train_lslshimmerecg_standard_padtransform3.npy

Number of runs detected: 323
X shape: (323, 3, 48024)
y shape: (323,)
Saved files:
	X_train_lslshimmerecg_standard_padtransform4.npy
	y_train_lslshimmerecg_standard_padtransform4.npy



In [None]:
# Generate folds of test data

for i in test_splits:
    file_suffix = f'test_lslshimmerecg_standard_padtransform{i}'
    subject = test_splits[i]
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, hz, sensor, subject, level)
    print()

Number of runs detected: 84
X shape: (84, 3, 48024)
y shape: (84,)
Saved files:
	X_test_lslshimmerecg_standard_padtransform0.npy
	y_test_lslshimmerecg_standard_padtransform0.npy

Number of runs detected: 72
X shape: (72, 3, 48024)
y shape: (72,)
Saved files:
	X_test_lslshimmerecg_standard_padtransform1.npy
	y_test_lslshimmerecg_standard_padtransform1.npy

Number of runs detected: 84
X shape: (84, 3, 48024)
y shape: (84,)
Saved files:
	X_test_lslshimmerecg_standard_padtransform2.npy
	y_test_lslshimmerecg_standard_padtransform2.npy

Number of runs detected: 83
X shape: (83, 3, 48024)
y shape: (83,)
Saved files:
	X_test_lslshimmerecg_standard_padtransform3.npy
	y_test_lslshimmerecg_standard_padtransform3.npy

Number of runs detected: 84
X shape: (84, 3, 48024)
y shape: (84,)
Saved files:
	X_test_lslshimmerecg_standard_padtransform4.npy
	y_test_lslshimmerecg_standard_padtransform4.npy



#### Set up model

In [None]:
from sktime.classification.interval_based import SupervisedTimeSeriesForest

def ecg_folds(n_estimators=250, n_jobs=-1, random_state=42, result_CV=None):
    """Fit and evaluate the ECG model on each of the 5 pre-generated folds.

    For every fold the saved train/test arrays are loaded from
    'Data Ready for ML_validation', a two-column ensemble of
    SupervisedTimeSeriesForest classifiers is fitted, and the predicted class
    probabilities for the test part are logged through ``hf.log_result``.

    Args:
        n_estimators: Number of estimators for each SupervisedTimeSeriesForest.
        n_jobs: Parallel jobs passed to each SupervisedTimeSeriesForest.
        random_state: Seed passed to each SupervisedTimeSeriesForest.
        result_CV: Existing result container handed to ``hf.log_result`` on
            the first fold (``None`` to start fresh).

    Returns:
        Tuple ``(result_CV, probabilities)`` where ``result_CV`` is whatever
        ``hf.log_result`` returned for the last fold and ``probabilities``
        maps the fold number to the ``predict_proba`` output for that fold.
    """
    data_dir = os.path.join(main_dir,'Data Ready for ML_validation')
    class_list = [0,1,2,3]
    probabilities = {}

    for i in range(5):
        # os.path.join keeps the paths valid on every platform, not only Windows.
        X_train = np.load(os.path.join(data_dir, f'X_train_lslshimmerecg_standard_padtransform{i}.npy'))
        y_train = np.load(os.path.join(data_dir, f'y_train_lslshimmerecg_standard_padtransform{i}.npy'))
        X_test = np.load(os.path.join(data_dir, f'X_test_lslshimmerecg_standard_padtransform{i}.npy'))
        y_test = np.load(os.path.join(data_dir, f'y_test_lslshimmerecg_standard_padtransform{i}.npy'))

        # only use first 2 columns of data on la_ra and ll_ra.
        clf = ColumnEnsembleClassifier(
            estimators=[
                ("est1", SupervisedTimeSeriesForest(n_estimators=n_estimators, n_jobs=n_jobs, random_state=random_state), [0]),
                ("est2", SupervisedTimeSeriesForest(n_estimators=n_estimators, n_jobs=n_jobs, random_state=random_state), [1])
            ]
        )

        clf.fit(X_train, y_train)
        y_pred_proba_SupervisedTimeSeriesForest = clf.predict_proba(X_test)
        probabilities[i] = y_pred_proba_SupervisedTimeSeriesForest
        # Log results
        result_CV = hf.log_result(f'Set {i}', class_list, y_test, y_pred_proba_SupervisedTimeSeriesForest, result_CV)

        # '\r' rewrites the same console line as a simple progress indicator.
        print(f'\rFold {i} complete', end='')
    print()
    return result_CV, probabilities

#### Runs

In [None]:
%%time

result, _ = ecg_folds(100, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 45min 32s
Wall time: 1h 17min 50s


accuracy_score    0.361264
AUC_score         0.648099
F1_score          0.321160
dtype: float64

In [None]:
%%time

result, _ = ecg_folds(150, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 47min 27s
Wall time: 1h 33min 41s


accuracy_score    0.362880
AUC_score         0.653857
F1_score          0.321408
dtype: float64

In [None]:
%%time

result, _ = ecg_folds(200, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 48min 37s
Wall time: 1h 50min 57s


accuracy_score    0.372375
AUC_score         0.653808
F1_score          0.333089
dtype: float64

In [None]:
%%time

result, _ = ecg_folds(240, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 49min 27s
Wall time: 2h 4min 13s


accuracy_score    0.383090
AUC_score         0.656757
F1_score          0.342569
dtype: float64

In [None]:
%%time

result, _ = ecg_folds(250, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 49min 17s
Wall time: 2h 12min 56s


accuracy_score    0.390261
AUC_score         0.655580
F1_score          0.351113
dtype: float64

In [None]:
%%time

result, _ = ecg_folds(260, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 50min 4s
Wall time: 2h 11min 18s


accuracy_score    0.375182
AUC_score         0.655679
F1_score          0.334513
dtype: float64

### Generate probabilities for ensemble model

In [None]:
%%time

result, probabilities = ecg_folds(250, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

In [None]:
pd.DataFrame(probabilities[0])

In [None]:
# Get run info
# Attach the per-fold ECG probabilities to the run metadata of the matching
# test subjects and stack all folds into one frame.

data_dir = os.path.join(main_dir,'Cleaned Data')
hz = 60
sensor = 'lslshimmerecg'
level = ''

split_df_list = []
for i in range(5):
    subject = test_splits[i]
    df_runs = hf.get_df_runs(data_dir, sensor, subject, level, True)
    df_runs = df_runs.drop('time', axis=1)
    df_proba = pd.DataFrame(
        probabilities[i],
        columns=['lslshimmerecg_01B', 'lslshimmerecg_02B', 'lslshimmerecg_03B', 'lslshimmerecg_04B'],
    )
    # NOTE(review): concat(axis=1) aligns on the index - assumes df_runs has a
    # default 0..n-1 index in the same order as the fold's X_test rows; confirm.
    df_cur = pd.concat([df_runs, df_proba], axis=1)
    split_df_list.append(df_cur)

# Single concat over all folds instead of growing a frame inside the loop.
df_combined = pd.concat(split_df_list, axis=0)

Number of runs detected: 84
Number of runs detected: 72
Number of runs detected: 84
Number of runs detected: 83
Number of runs detected: 84


In [None]:
# Combine probabilities with run info and save it

target_dir = os.path.join(main_dir,'Data Ensembled')
# 'Data Ensembled' is not among the directories created during setup, so make
# sure it exists before writing.
os.makedirs(target_dir, exist_ok=True)

# sort_values returns a new frame, so df_combined itself is left untouched.
df_combined_lslshimmerecg = df_combined.sort_values(['subject','difficulty','run'], axis = 0)
df_combined_lslshimmerecg = df_combined_lslshimmerecg.reset_index(drop = True)
# os.path.join keeps the output path valid on every platform, not only Windows.
df_combined_lslshimmerecg.to_csv(os.path.join(target_dir, 'lslshimmerecg_ensembled_training.csv'), index=False)
df_combined_lslshimmerecg

Unnamed: 0,subject,difficulty,run,lslshimmerresp_01B,lslshimmerresp_02B,lslshimmerresp_03B,lslshimmerresp_04B
0,cp003,01B,1,0.44,0.22,0.14,0.20
1,cp003,01B,2,0.24,0.22,0.22,0.32
2,cp003,01B,3,0.46,0.24,0.18,0.12
3,cp003,02B,1,0.20,0.28,0.28,0.24
4,cp003,02B,2,0.12,0.22,0.42,0.24
...,...,...,...,...,...,...,...
402,cp043,03B,2,0.04,0.28,0.44,0.24
403,cp043,03B,3,0.26,0.30,0.20,0.24
404,cp043,04B,1,0.08,0.28,0.36,0.28
405,cp043,04B,2,0.04,0.22,0.46,0.28


### Generate combined numpy data file for ML model training

In [12]:
# Generate full training data for ECG from Shimmer
# An empty `subject` is passed here, unlike the fold cells, which pass a list of
# participant IDs.

data_dir = os.path.join(main_dir,'Cleaned Data')
target_dir = os.path.join(main_dir,'Data Ready for ML_final')
hz = 60  # presumably the rate of the cleaned ECG files (2 x the 30 Hz cutoff) - confirm against hf.generate_ml_data
sensor = 'lslshimmerecg'
subject = ''  # presumably an empty string selects every participant - confirm
level = ''  # presumably an empty string selects every difficulty level - confirm

# StandardScaler
file_suffix = 'train_lslshimmerecg_standard_padtransform'
scaler = StandardScaler()
hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, hz, sensor, subject, level)

Number of runs detected: 407
X shape: (407, 3, 48024)
y shape: (407,)
Saved files:
	X_train_lslshimmerecg_standard_padtransform.npy
	y_train_lslshimmerecg_standard_padtransform.npy


### Train model on full set of training data

In [15]:
from sktime.classification.interval_based import SupervisedTimeSeriesForest

# Train the final ECG model on the full training set and pickle it.
data_dir = os.path.join(main_dir,'Data Ready for ML_final')
model_dir = os.path.join(main_dir,'Trained Models')

# os.path.join keeps the paths valid on every platform, not only Windows.
X_train = np.load(os.path.join(data_dir, 'X_train_lslshimmerecg_standard_padtransform.npy'))
y_train = np.load(os.path.join(data_dir, 'y_train_lslshimmerecg_standard_padtransform.npy'))

# only use first 2 columns of data on la_ra and ll_ra.
# Same configuration (250 estimators) as the cross-validated run used to
# generate the ensemble probabilities.
clf = ColumnEnsembleClassifier(
    estimators=[
        ("est1", SupervisedTimeSeriesForest(n_estimators=250, n_jobs=-1, random_state=42), [0]),
        ("est2", SupervisedTimeSeriesForest(n_estimators=250, n_jobs=-1, random_state=42), [1])
    ]
)

clf.fit(X_train, y_train)

# Save model
with open(os.path.join(model_dir, 'shimmerECG.pkl'), 'wb') as f:
    pickle.dump(clf, f)

### Load model code

In [None]:
# Load model
# with open(model_dir + r'\shimmerECG.pkl', 'rb') as f:
#     clf = pickle.load(f)

# y_pred_proba = clf.predict_proba(X_train)
# y_pred = clf.predict(X_train)

# acc = accuracy_score(y_train, y_pred)
# auc = roc_auc_score(y_train, y_pred_proba, multi_class='ovr', average = 'macro')
# f1 = f1_score(y_train, y_pred, average='macro')

# print(acc)
# print(auc)
# print(f1)

1.0
1.0
1.0


## Respiration (from Shimmer Device)

### Validate model on training data folds
#### Generate folds

In [None]:
# Generate folds of training data

data_dir = os.path.join(main_dir,'Cleaned Data')
target_dir = os.path.join(main_dir,'Data Ready for ML_validation')
hz = 60
sensor = 'lslshimmerresp'
level = ''

scaler = StandardScaler()

for i in train_splits:
    file_suffix = f'train_lslshimmerresp_standard_padtransform{i}'
    subject = train_splits[i]
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, hz, sensor, subject, level)
    print()

Number of runs detected: 323
X shape: (323, 1, 48024)
y shape: (323,)
Saved files:
	X_train_lslshimmerresp_standard_padtransform0.npy
	y_train_lslshimmerresp_standard_padtransform0.npy

Number of runs detected: 335
X shape: (335, 1, 48024)
y shape: (335,)
Saved files:
	X_train_lslshimmerresp_standard_padtransform1.npy
	y_train_lslshimmerresp_standard_padtransform1.npy

Number of runs detected: 323
X shape: (323, 1, 48024)
y shape: (323,)
Saved files:
	X_train_lslshimmerresp_standard_padtransform2.npy
	y_train_lslshimmerresp_standard_padtransform2.npy

Number of runs detected: 324
X shape: (324, 1, 48024)
y shape: (324,)
Saved files:
	X_train_lslshimmerresp_standard_padtransform3.npy
	y_train_lslshimmerresp_standard_padtransform3.npy

Number of runs detected: 323
X shape: (323, 1, 48024)
y shape: (323,)
Saved files:
	X_train_lslshimmerresp_standard_padtransform4.npy
	y_train_lslshimmerresp_standard_padtransform4.npy



In [None]:
# Generate folds of test data

for i in test_splits:
    file_suffix = f'test_lslshimmerresp_standard_padtransform{i}'
    subject = test_splits[i]
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, hz, sensor, subject, level)
    print()

Number of runs detected: 84
X shape: (84, 1, 48024)
y shape: (84,)
Saved files:
	X_test_lslshimmerresp_standard_padtransform0.npy
	y_test_lslshimmerresp_standard_padtransform0.npy

Number of runs detected: 72
X shape: (72, 1, 48024)
y shape: (72,)
Saved files:
	X_test_lslshimmerresp_standard_padtransform1.npy
	y_test_lslshimmerresp_standard_padtransform1.npy

Number of runs detected: 84
X shape: (84, 1, 48024)
y shape: (84,)
Saved files:
	X_test_lslshimmerresp_standard_padtransform2.npy
	y_test_lslshimmerresp_standard_padtransform2.npy

Number of runs detected: 83
X shape: (83, 1, 48024)
y shape: (83,)
Saved files:
	X_test_lslshimmerresp_standard_padtransform3.npy
	y_test_lslshimmerresp_standard_padtransform3.npy

Number of runs detected: 84
X shape: (84, 1, 48024)
y shape: (84,)
Saved files:
	X_test_lslshimmerresp_standard_padtransform4.npy
	y_test_lslshimmerresp_standard_padtransform4.npy



#### Set up model

In [None]:
from sktime.classification.interval_based import SupervisedTimeSeriesForest

def respshimmer_folds(n_estimators=40, n_jobs=-1, random_state=42, result_CV=None):
    """Fit and evaluate the Shimmer respiration model on each of the 5 folds.

    For every fold the saved train/test arrays are loaded from
    'Data Ready for ML_validation', a SupervisedTimeSeriesForest is fitted,
    and the predicted class probabilities for the test part are logged
    through ``hf.log_result``.

    Args:
        n_estimators: Number of estimators for the SupervisedTimeSeriesForest.
        n_jobs: Parallel jobs passed to the SupervisedTimeSeriesForest.
        random_state: Seed passed to the SupervisedTimeSeriesForest.
        result_CV: Existing result container handed to ``hf.log_result`` on
            the first fold (``None`` to start fresh).

    Returns:
        Tuple ``(result_CV, probabilities)`` where ``result_CV`` is whatever
        ``hf.log_result`` returned for the last fold and ``probabilities``
        maps the fold number to the ``predict_proba`` output for that fold.
    """
    data_dir = os.path.join(main_dir,'Data Ready for ML_validation')
    class_list = [0,1,2,3]
    probabilities = {}

    for i in range(5):
        # os.path.join keeps the paths valid on every platform, not only Windows.
        X_train = np.load(os.path.join(data_dir, f'X_train_lslshimmerresp_standard_padtransform{i}.npy'))
        y_train = np.load(os.path.join(data_dir, f'y_train_lslshimmerresp_standard_padtransform{i}.npy'))
        X_test = np.load(os.path.join(data_dir, f'X_test_lslshimmerresp_standard_padtransform{i}.npy'))
        y_test = np.load(os.path.join(data_dir, f'y_test_lslshimmerresp_standard_padtransform{i}.npy'))

        clf = SupervisedTimeSeriesForest(n_estimators=n_estimators, n_jobs=n_jobs, random_state=random_state)

        clf.fit(X_train, y_train)
        y_pred_proba_SupervisedTimeSeriesForest = clf.predict_proba(X_test)
        probabilities[i] = y_pred_proba_SupervisedTimeSeriesForest
        # Log results
        result_CV = hf.log_result(f'Set {i}', class_list, y_test, y_pred_proba_SupervisedTimeSeriesForest, result_CV)

        # '\r' rewrites the same console line as a simple progress indicator.
        print(f'\rFold {i} complete', end='')
    print()
    return result_CV, probabilities

#### Runs

In [None]:
%%time

result, _ = respshimmer_folds(40, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 12.4 s
Wall time: 8min 49s


accuracy_score    0.373111
AUC_score         0.630991
F1_score          0.354807
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(50, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 17.2 s
Wall time: 9min 24s


accuracy_score    0.402931
AUC_score         0.647100
F1_score          0.378989
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(60, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 24.2 s
Wall time: 11min 11s


accuracy_score    0.373848
AUC_score         0.644333
F1_score          0.347194
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(70, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 29.3 s
Wall time: 12min 56s


accuracy_score    0.370731
AUC_score         0.644125
F1_score          0.342514
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(80, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 38.3 s
Wall time: 15min 20s


accuracy_score    0.375464
AUC_score         0.651196
F1_score          0.347768
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(90, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 44.7 s
Wall time: 18min 20s


accuracy_score    0.378356
AUC_score         0.648205
F1_score          0.348122
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(100, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 55.3 s
Wall time: 19min


accuracy_score    0.384648
AUC_score         0.645720
F1_score          0.354331
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(110, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 1min 6s
Wall time: 19min 57s


accuracy_score    0.379518
AUC_score         0.645322
F1_score          0.349614
dtype: float64

In [None]:
%%time

result, _ = respshimmer_folds(200, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 2min 49s
Wall time: 35min 54s


accuracy_score    0.371806
AUC_score         0.646544
F1_score          0.344480
dtype: float64

### Generate probabilities for ensemble model

In [None]:
%%time

result, probabilities = respshimmer_folds(50, -1, 42)
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 16.8 s
Wall time: 8min 44s


accuracy_score    0.402931
AUC_score         0.647100
F1_score          0.378989
dtype: float64

In [None]:
pd.DataFrame(probabilities[0])

Unnamed: 0,0,1,2,3
0,0.58,0.10,0.14,0.18
1,0.54,0.10,0.12,0.24
2,0.42,0.14,0.18,0.26
3,0.36,0.22,0.28,0.14
4,0.24,0.18,0.22,0.36
...,...,...,...,...
79,0.04,0.28,0.44,0.24
80,0.26,0.30,0.20,0.24
81,0.08,0.28,0.36,0.28
82,0.04,0.22,0.46,0.28


In [None]:
# Get run info
# Attach the per-fold respiration probabilities to the run metadata of the
# matching test subjects and stack all folds into one frame.

data_dir = os.path.join(main_dir,'Cleaned Data')
hz = 60
sensor = 'lslshimmerresp'
level = ''

split_df_list = []
for i in range(5):
    subject = test_splits[i]
    df_runs = hf.get_df_runs(data_dir, sensor, subject, level, True)
    df_runs = df_runs.drop('time', axis=1)
    df_proba = pd.DataFrame(
        probabilities[i],
        columns=['lslshimmerresp_01B', 'lslshimmerresp_02B', 'lslshimmerresp_03B', 'lslshimmerresp_04B'],
    )
    # NOTE(review): concat(axis=1) aligns on the index - assumes df_runs has a
    # default 0..n-1 index in the same order as the fold's X_test rows; confirm.
    df_cur = pd.concat([df_runs, df_proba], axis=1)
    split_df_list.append(df_cur)

# Single concat over all folds instead of growing a frame inside the loop.
df_combined = pd.concat(split_df_list, axis=0)

Number of runs detected: 84
Number of runs detected: 72
Number of runs detected: 84
Number of runs detected: 83
Number of runs detected: 84


In [None]:
# Combine probabilities with run info and save it

target_dir = os.path.join(main_dir,'Data Ensembled')
# 'Data Ensembled' is not among the directories created during setup, so make
# sure it exists before writing.
os.makedirs(target_dir, exist_ok=True)

# sort_values returns a new frame, so df_combined itself is left untouched.
df_combined_lslshimmerresp = df_combined.sort_values(['subject','difficulty','run'], axis = 0)
df_combined_lslshimmerresp = df_combined_lslshimmerresp.reset_index(drop = True)
# os.path.join keeps the output path valid on every platform, not only Windows.
df_combined_lslshimmerresp.to_csv(os.path.join(target_dir, 'lslshimmerresp_ensembled_training.csv'), index=False)
df_combined_lslshimmerresp

Unnamed: 0,subject,difficulty,run,lslshimmerresp_01B,lslshimmerresp_02B,lslshimmerresp_03B,lslshimmerresp_04B
0,cp003,01B,1,0.44,0.22,0.14,0.20
1,cp003,01B,2,0.24,0.22,0.22,0.32
2,cp003,01B,3,0.46,0.24,0.18,0.12
3,cp003,02B,1,0.20,0.28,0.28,0.24
4,cp003,02B,2,0.12,0.22,0.42,0.24
...,...,...,...,...,...,...,...
402,cp043,03B,2,0.04,0.28,0.44,0.24
403,cp043,03B,3,0.26,0.30,0.20,0.24
404,cp043,04B,1,0.08,0.28,0.36,0.28
405,cp043,04B,2,0.04,0.22,0.46,0.28


### Generate combined numpy data file for ML model training

In [4]:
# Generate full training data for respiration from Shimmer

data_dir = os.path.join(main_dir,'Cleaned Data')
target_dir = os.path.join(main_dir,'Data Ready for ML_final')
hz = 60
sensor = 'lslshimmerresp'
subject = ''
level = ''

# StandardScaler
file_suffix = 'train_lslshimmerresp_standard_padtransform'
scaler = StandardScaler()
hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, hz, sensor, subject, level)

(407, 4)
407
X shape: (407, 1, 48024)
y shape: (407,)
Saved files:
	X_train_lslshimmerresp_standard_padtransform.npy
	y_train_lslshimmerresp_standard_padtransform.npy


### Train model on full set of training data

In [6]:
from sktime.classification.interval_based import SupervisedTimeSeriesForest

# Input arrays and model output folder, both located under main_dir.
data_dir = os.path.join(main_dir,'Data Ready for ML_final')
model_dir = os.path.join(main_dir,'Trained Models')

# os.path.join keeps these paths valid on any OS (the previous '\\'
# concatenation only resolved correctly on Windows).
X_train = np.load(os.path.join(data_dir, 'X_train_lslshimmerresp_standard_padtransform.npy'))
y_train = np.load(os.path.join(data_dir, 'y_train_lslshimmerresp_standard_padtransform.npy'))

# Fit on the full training set; random_state is fixed for reproducibility.
clf = SupervisedTimeSeriesForest(n_estimators=40, n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)

# Save model
with open(os.path.join(model_dir, 'shimmerResp.pkl'), 'wb') as f:
    pickle.dump(clf, f)

### Load model code

In [7]:
# Load model
# Kept commented out as a reference snippet for reloading the pickled
# respiration model and re-scoring it.
# NOTE(review): the metrics below are computed on X_train / y_train, i.e. the
# arrays the model in the training cell was fitted on, so they describe fit
# quality rather than performance on unseen data.
# with open(model_dir + r'\shimmerResp.pkl', 'rb') as f:
#     clf = pickle.load(f)

# y_pred_proba = clf.predict_proba(X_train)
# y_pred = clf.predict(X_train)

# acc = accuracy_score(y_train, y_pred)
# auc = roc_auc_score(y_train, y_pred_proba, multi_class='ovr', average = 'macro')
# f1 = f1_score(y_train, y_pred, average='macro')

# print(acc)
# print(auc)
# print(f1)

1.0
1.0
1.0


## Respiration (from Respitrace Device)

In [None]:
# Did not use

## TorsoACC

### Generate combined numpy data file for ML model training

In [None]:
# Generate full training data for TorsoACC

# Derive the folders from main_dir (as the respiration section does) instead
# of a user-specific absolute path, so the notebook runs on any machine that
# follows the documented directory structure.
data_dir = os.path.join(main_dir, 'Cleaned Data')
target_dir = os.path.join(main_dir, 'Data Ready for ML_final')
hz = 20  # presumably the sampling rate in Hz expected by the helper -- confirm in helperfunctions
sensor = 'lslshimmertorsoacc'
# Empty strings are passed for subject and level; presumably "no filter" --
# confirm in helperfunctions.
subject = ''
level = ''

# MinMaxScaler
file_suffix = 'train_lslshimmertorsoacc_minmax_padtransform'
scaler = MinMaxScaler()
hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, hz, sensor, subject, level)

# StandardScaler
file_suffix = 'train_lslshimmertorsoacc_standard_padtransform'
scaler = StandardScaler()
hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, hz, sensor, subject, level)

(390, 4)
390
X shape: (390, 3, 15969)
y shape: (390,)
Saved files:
	X_train_lslshimmertorsoacc_minmax_padtransform.npy
	y_train_lslshimmertorsoacc_minmax_padtransform.npy
(390, 4)
390
X shape: (390, 3, 15969)
y shape: (390,)
Saved files:
	X_train_lslshimmertorsoacc_standard_padtransform.npy
	y_train_lslshimmertorsoacc_standard_padtransform.npy


### Train model on full set of training data

In [48]:
%%time
from sktime.classification.feature_based import SignatureClassifier

# Folders derived from main_dir rather than a user-specific absolute path.
data_dir = os.path.join(main_dir, 'Data Ready for ML_final')
model_dir = os.path.join(main_dir, 'Trained Models')

X_train = np.load(os.path.join(data_dir, 'X_train_lslshimmertorsoacc_standard_padtransform.npy'))
y_train = np.load(os.path.join(data_dir, 'y_train_lslshimmertorsoacc_standard_padtransform.npy'))

# One shared hyper-parameter set, so the three per-channel classifiers cannot
# drift apart through copy/paste edits.
signature_params = dict(
    estimator=None,
    augmentation_list=('basepoint', 'addtime'),
    window_name="sliding",
    window_length=15,
    window_step=15,
    rescaling=None,
    sig_tfm='signature',
    depth=4,
    random_state=42,
)

# One independent SignatureClassifier per channel (columns 0, 1 and 2),
# named est1..est3 and combined by ColumnEnsembleClassifier.
clf = ColumnEnsembleClassifier(
    estimators=[
        (f"est{channel + 1}", SignatureClassifier(**signature_params), [channel])
        for channel in range(3)
    ]
)

clf.fit(X_train, y_train)

# Save model
with open(os.path.join(model_dir, 'torsoACC.pkl'), 'wb') as f:
    pickle.dump(clf, f)

CPU times: total: 22min 16s
Wall time: 22min 53s


In [8]:
# Load model
# with open(model_dir + r'\torsoACC2.pkl', 'rb') as f:
#     clf = pickle.load(f)

### Validate model on training data folds

#### Generate folds

In [44]:
# Generate folds of training data

# Folders derived from main_dir rather than a user-specific absolute path.
# These names (and scaler/hz/sensor/level) are reused by the test-fold cell.
data_dir = os.path.join(main_dir, 'Cleaned Data')
target_dir = os.path.join(main_dir, 'Data Ready for ML_validation')
hz = 20
sensor = 'lslshimmertorsoacc'
level = ''

scaler = StandardScaler()

# train_splits is indexed by fold id; each entry is passed to the helper as
# its `subject` argument, and the fold id is appended to the file suffix.
for i in train_splits:
    file_suffix = f'train_lslshimmertorsoacc_standard_padtransform{i}'
    subject = train_splits[i]
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, hz, sensor, subject, level)
    print()

Number of runs detected: 306
X shape: (306, 3, 15969)
y shape: (306,)
Saved files:
	X_train_lslshimmertorsoacc_standard_padtransform0.npy
	y_train_lslshimmertorsoacc_standard_padtransform0.npy

Number of runs detected: 318
X shape: (318, 3, 15969)
y shape: (318,)
Saved files:
	X_train_lslshimmertorsoacc_standard_padtransform1.npy
	y_train_lslshimmertorsoacc_standard_padtransform1.npy

Number of runs detected: 311
X shape: (311, 3, 15969)
y shape: (311,)
Saved files:
	X_train_lslshimmertorsoacc_standard_padtransform2.npy
	y_train_lslshimmertorsoacc_standard_padtransform2.npy

Number of runs detected: 307
X shape: (307, 3, 15969)
y shape: (307,)
Saved files:
	X_train_lslshimmertorsoacc_standard_padtransform3.npy
	y_train_lslshimmertorsoacc_standard_padtransform3.npy

Number of runs detected: 318
X shape: (318, 3, 15969)
y shape: (318,)
Saved files:
	X_train_lslshimmertorsoacc_standard_padtransform4.npy
	y_train_lslshimmertorsoacc_standard_padtransform4.npy



In [45]:
# Generate folds of test data

# Relies on data_dir, target_dir, scaler, hz, sensor and level still being
# defined by the training-fold cell above; run that cell first.
# NOTE(review): the same `scaler` object is passed here as for the training
# folds -- whether the helper refits it per call is not visible from this
# notebook; confirm in helperfunctions.
for i in test_splits:
    file_suffix = f'test_lslshimmertorsoacc_standard_padtransform{i}'
    subject = test_splits[i]
    hf.generate_ml_data(data_dir, target_dir, file_suffix, scaler, hz, sensor, subject, level)
    print()

Number of runs detected: 84
X shape: (84, 3, 15969)
y shape: (84,)
Saved files:
	X_test_lslshimmertorsoacc_standard_padtransform0.npy
	y_test_lslshimmertorsoacc_standard_padtransform0.npy

Number of runs detected: 72
X shape: (72, 3, 15969)
y shape: (72,)
Saved files:
	X_test_lslshimmertorsoacc_standard_padtransform1.npy
	y_test_lslshimmertorsoacc_standard_padtransform1.npy

Number of runs detected: 79
X shape: (79, 3, 15969)
y shape: (79,)
Saved files:
	X_test_lslshimmertorsoacc_standard_padtransform2.npy
	y_test_lslshimmertorsoacc_standard_padtransform2.npy

Number of runs detected: 83
X shape: (83, 3, 15969)
y shape: (83,)
Saved files:
	X_test_lslshimmertorsoacc_standard_padtransform3.npy
	y_test_lslshimmertorsoacc_standard_padtransform3.npy

Number of runs detected: 72
X shape: (72, 3, 15969)
y shape: (72,)
Saved files:
	X_test_lslshimmertorsoacc_standard_padtransform4.npy
	y_test_lslshimmertorsoacc_standard_padtransform4.npy



#### Set up model

In [2]:
from sktime.classification.feature_based import SignatureClassifier

def torsoracc_folds(window_name, window_length, window_step, depth, random_state=None, result_CV=None, path=None, n_folds=5):
    """Fit and score a per-channel SignatureClassifier ensemble on each saved fold.

    For every fold ``i`` the function loads the ``X_train``/``y_train``/
    ``X_test``/``y_test`` arrays named
    ``<name>_lslshimmertorsoacc_standard_padtransform<i>.npy`` from ``path``,
    fits a ``ColumnEnsembleClassifier`` holding one ``SignatureClassifier``
    per channel (columns 0, 1 and 2), predicts class probabilities for the
    test fold and logs them through ``hf.log_result``.

    Parameters
    ----------
    window_name, window_length, window_step, depth, random_state
        Forwarded unchanged to every ``SignatureClassifier``.
    result_CV
        Running result container handed to ``hf.log_result`` and replaced by
        its return value after each fold; its structure is defined by that
        helper.
    path
        Folder holding the fold ``.npy`` files. Defaults to
        ``'Data Ready for ML_validation'`` next to the working directory's
        parent, the same location the notebook derives from ``main_dir``.
    n_folds
        Number of folds to evaluate (fold ids ``0 .. n_folds - 1``).

    Returns
    -------
    tuple
        ``(result_CV, probabilities)`` where ``probabilities`` maps each fold
        id to the ``predict_proba`` output for that fold's test set.
    """
    if path is None:
        # Derived from the working directory instead of a user-specific
        # absolute path so the notebook is portable.
        path = os.path.join(os.path.split(os.getcwd())[0], 'Data Ready for ML_validation')

    suffix = 'lslshimmertorsoacc_standard_padtransform'
    class_list = [0,1,2,3]  # class labels handed to hf.log_result; identical for every fold
    probabilities = {}

    for i in range(n_folds):
        X_train, y_train, X_test, y_test = (
            np.load(os.path.join(path, f'{name}_{suffix}{i}.npy'))
            for name in ('X_train', 'y_train', 'X_test', 'y_test')
        )

        # A fresh, identically configured classifier per channel and per fold.
        clf = ColumnEnsembleClassifier(
            estimators=[
                (
                    f"est{channel + 1}",
                    SignatureClassifier(window_name = window_name, window_length = window_length, window_step = window_step, depth = depth, random_state = random_state),
                    [channel],
                )
                for channel in range(3)
            ]
        )

        clf.fit(X_train, y_train)
        y_pred_proba_SignatureClassifier = clf.predict_proba(X_test)
        probabilities[i] = y_pred_proba_SignatureClassifier
        # Log results
        result_CV = hf.log_result(f'Set {i}', class_list, y_test, y_pred_proba_SignatureClassifier, result_CV)

        # '\r' rewrites the same console line so progress stays on one line.
        print(f'\rFold {i} complete', end='')
    print()
    return result_CV, probabilities

#### Generate probabilities for ensemble model

In [3]:
%%time

# Earlier call kept for reference only: torsoracc_folds does not accept
# estimator / augmentation_list / rescaling / sig_tfm, so it cannot be run as written.
# result, probabilities = torsoracc_folds(estimator=None, augmentation_list=('basepoint', 'addtime'), window_name="sliding", window_length=15, window_step=15, rescaling=None, sig_tfm='signature', depth=4, random_state=42)
# Run the fold evaluation; `probabilities` (fold id -> predict_proba output)
# is reused below to build the ensemble input table.
result, probabilities = torsoracc_folds(window_name="sliding", window_length=15, window_step=15, depth=4, random_state=42)
# Column-wise mean of the logged results (presumably one row per fold -- confirm in hf.log_result).
tmp = pd.DataFrame(result)
tmp.mean()

Fold 4 complete
CPU times: total: 1h 48min 2s
Wall time: 1h 49min 15s


accuracy_score    0.400967
AUC_score         0.645512
F1_score          0.363241
dtype: float64

In [4]:
pd.DataFrame(probabilities[0])

Unnamed: 0,0,1,2,3
0,0.486667,0.200000,0.120000,0.193333
1,0.510000,0.150000,0.120000,0.220000
2,0.406667,0.140000,0.196667,0.256667
3,0.323333,0.283333,0.196667,0.196667
4,0.300000,0.306667,0.183333,0.210000
...,...,...,...,...
79,0.103333,0.286667,0.306667,0.303333
80,0.246667,0.366667,0.170000,0.216667
81,0.076667,0.306667,0.300000,0.316667
82,0.100000,0.280000,0.396667,0.223333


In [9]:
# Attach each fold's predicted probabilities to the run info of that fold's
# test subjects and stack all folds into df_combined.

# Folder derived from main_dir rather than a user-specific absolute path.
data_dir = os.path.join(main_dir, 'Cleaned Data')
hz = 20
sensor = 'lslshimmertorsoacc'
level = ''
# subject = test_splits[0]
# df_runs = hf.get_df_runs(data_dir, sensor, subject, level, True)

split_df_list = []
for i in range(5):
    subject = test_splits[i]
    df_runs = hf.get_df_runs(data_dir, sensor, subject, level, True)
    df_runs = df_runs.drop('time', axis=1)
    df_proba = pd.DataFrame(probabilities[i])
    # The column-wise concat below aligns on the index; a row-count mismatch
    # would silently pad with NaN, so fail loudly instead.
    if len(df_runs) != len(df_proba):
        raise ValueError(
            f'Fold {i}: {len(df_runs)} runs but {len(df_proba)} probability rows'
        )
    df_proba.columns = ['lslshimmertorsoacc_01B', 'lslshimmertorsoacc_02B', 'lslshimmertorsoacc_03B', 'lslshimmertorsoacc_04B']
    df_cur = pd.concat([df_runs, df_proba], axis=1)
    split_df_list.append(df_cur)

# Stack all folds once, instead of growing an initially empty DataFrame with
# a concat on every iteration.
df_combined = pd.concat(split_df_list, axis=0)

Number of runs detected: 84
Number of runs detected: 72
Number of runs detected: 79
Number of runs detected: 83
Number of runs detected: 72


In [10]:
# Save the probabilities

# Folder derived from main_dir rather than a user-specific absolute path, and
# created when missing: DataFrame.to_csv does not create parent directories.
target_dir = os.path.join(main_dir, 'Data Ensembled')
os.makedirs(target_dir, exist_ok=True)

# Work on a copy so df_combined itself stays untouched, then order the rows
# by run identifier and renumber the index.
df_combined_lslshimmertorsoacc = df_combined.copy()
df_combined_lslshimmertorsoacc = df_combined_lslshimmertorsoacc.sort_values(['subject','difficulty','run'], axis = 0)
df_combined_lslshimmertorsoacc = df_combined_lslshimmertorsoacc.reset_index(drop = True)
df_combined_lslshimmertorsoacc.to_csv(os.path.join(target_dir, 'lslshimmertorsoacc_ensembled_training.csv'), index=False)
df_combined_lslshimmertorsoacc

Unnamed: 0,subject,difficulty,run,lslshimmertorsoacc_01B,lslshimmertorsoacc_02B,lslshimmertorsoacc_03B,lslshimmertorsoacc_04B
0,cp004,01B,1,0.486667,0.200000,0.120000,0.193333
1,cp004,01B,2,0.510000,0.150000,0.120000,0.220000
2,cp004,01B,3,0.406667,0.140000,0.196667,0.256667
3,cp004,02B,1,0.323333,0.283333,0.196667,0.196667
4,cp004,02B,2,0.300000,0.306667,0.183333,0.210000
...,...,...,...,...,...,...,...
385,cp043,03B,2,0.103333,0.286667,0.306667,0.303333
386,cp043,03B,3,0.246667,0.366667,0.170000,0.216667
387,cp043,04B,1,0.076667,0.306667,0.300000,0.316667
388,cp043,04B,2,0.100000,0.280000,0.396667,0.223333


# Ensemble Model
## Combine probability tables

In [26]:
from collections import defaultdict

# Column dtypes for read_csv: the three run identifiers are typed
# explicitly; every other column (the probabilities) falls back to float.
types = defaultdict(lambda: 'float')
types['subject'] = 'str'
types['difficulty'] = 'str'
types['run'] = 'int'

# Folder derived from main_dir rather than a user-specific absolute path.
path = os.path.join(main_dir, 'Data Ensembled')

df_combined_lslshimmerresp = pd.read_csv(os.path.join(path, 'lslshimmerresp_ensembled_training.csv'), dtype=types)
df_combined_lslshimmertorsoacc = pd.read_csv(os.path.join(path, 'lslshimmertorsoacc_ensembled_training.csv'), dtype=types)

In [30]:
# Outer join on the run identifiers: runs present in only one sensor's table
# are kept, with NaN in the other sensor's probability columns.
df_full = df_combined_lslshimmerresp.merge(df_combined_lslshimmertorsoacc, on=['subject','difficulty','run'], how='outer')

In [34]:
df_full

Unnamed: 0,subject,difficulty,run,lslshimmerresp_01B,lslshimmerresp_02B,lslshimmerresp_03B,lslshimmerresp_04B,lslshimmertorsoacc_01B,lslshimmertorsoacc_02B,lslshimmertorsoacc_03B,lslshimmertorsoacc_04B
0,cp003,01B,1,0.44,0.22,0.14,0.20,,,,
1,cp003,01B,2,0.24,0.22,0.22,0.32,,,,
2,cp003,01B,3,0.46,0.24,0.18,0.12,,,,
3,cp003,02B,1,0.20,0.28,0.28,0.24,,,,
4,cp003,02B,2,0.12,0.22,0.42,0.24,,,,
...,...,...,...,...,...,...,...,...,...,...,...
402,cp043,03B,2,0.04,0.28,0.44,0.24,0.103333,0.286667,0.306667,0.303333
403,cp043,03B,3,0.26,0.30,0.20,0.24,0.246667,0.366667,0.170000,0.216667
404,cp043,04B,1,0.08,0.28,0.36,0.28,0.076667,0.306667,0.300000,0.316667
405,cp043,04B,2,0.04,0.22,0.46,0.28,0.100000,0.280000,0.396667,0.223333
