In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from collections import Counter
from tqdm import tqdm
import scipy.signal as signal
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

In [2]:
# Subject numbers 2..17 with 12 left out (presumably no usable recording for S12 — TODO confirm).
SUBJECT_IDS = [*range(2, 12), *range(13, 18)]

In [3]:
# LOAD ALL DATA
subject_cols = ['age', 'height_cm', 'weight_kg', 'gender', 'is_smoker']


def _readme_value(line: str, separator: str) -> str:
    """Return the text following the first `separator` on a readme line.

    Surrounding whitespace (including the trailing newline) is stripped, so the
    result is correct even when the line has no newline or ends in '\\r\\n'.
    """
    return line.split(separator)[1].strip()


# Collect one record per subject and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop is quadratic and leaves object-dtype columns.
subject_records = []

for subject_id in tqdm(SUBJECT_IDS):
    with open(f'WESAD/S{subject_id}/S{subject_id}_readme.txt') as f:
        lines = f.readlines()

    # The readme is parsed purely by line position: lines 1-4 are "key: value"
    # entries and line 10 is a question whose answer follows the '?'.
    subject_records.append({
        'age': int(_readme_value(lines[1], ':')),
        'height_cm': int(_readme_value(lines[2], ':')),
        'weight_kg': int(_readme_value(lines[3], ':')),
        'gender': _readme_value(lines[4], ':'),
        'is_smoker': _readme_value(lines[10], '?'),
    })

# Rows are indexed by subject number, in the same order as SUBJECT_IDS.
subject_info = pd.DataFrame(subject_records, columns = subject_cols, index = SUBJECT_IDS)

print('-- Loaded All Subject Info --')

# Per-subject sensor tables, keyed by subject number.
subject_chest_data: dict[int, pd.DataFrame] = dict()
subject_wrist_data: dict[int, pd.DataFrame] = dict()

chest_features = ['ACC', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp']
wrist_features = ['ACC', 'BVP', 'EDA', 'TEMP']

# Down-sampling factors for the wrist channels and the label track. Every wrist
# column must end up the same length for the DataFrame constructor below.
# NOTE(review): presumably these bring ACC, BVP and the labels down to the
# EDA/TEMP sampling rate — confirm against the dataset documentation.
WRIST_ACC_DECIMATION_STAGES = (8,)
# 16x overall, done as two 4x passes: scipy recommends calling decimate several
# times rather than once when the IIR down-sampling factor exceeds 13.
# The output length is unchanged: ceil(ceil(n / 4) / 4) == ceil(n / 16).
WRIST_BVP_DECIMATION_STAGES = (4, 4)
LABEL_STRIDE = 175
N_LABELS = 8


def _decimate_in_stages(samples, factors):
    """Run `signal.decimate` once per entry of `factors`, feeding each result into the next."""
    for factor in factors:
        samples = signal.decimate(samples, factor)
    return samples


for subject_id in tqdm(SUBJECT_IDS):
    data = joblib.load(f'WESAD/S{subject_id}/S{subject_id}.joblib')
    labels = data['label']

    # Load in all chest information
    chest_info = data['signal']['chest']
    chest_data = {feature: chest_info[feature] for feature in chest_features}

    # The accelerometer is split into one column per axis; the remaining
    # channels are flattened to 1-D. Chest rows are paired one-to-one with labels.
    chest_dict = {
        'ACC_X': chest_data['ACC'][:, 0],
        'ACC_Y': chest_data['ACC'][:, 1],
        'ACC_Z': chest_data['ACC'][:, 2],
        'ECG': chest_data['ECG'].flatten(),
        'EMG': chest_data['EMG'].flatten(),
        'EDA': chest_data['EDA'].flatten(),
        'TEMP': chest_data['Temp'].flatten(),
        'RESP': chest_data['Resp'].flatten(),
        'STATE': labels.flatten()
    }

    chest_df = pd.DataFrame(chest_dict)
    subject_chest_data[subject_id] = chest_df

    # Load in all wrist information
    wrist_info = data['signal']['wrist']
    wrist_data = {feature: wrist_info[feature] for feature in wrist_features}

    # ACC and BVP are low-pass filtered and down-sampled; EDA and TEMP are used
    # as-is; the label track is thinned by taking every LABEL_STRIDE-th entry.
    wrist_dict = {
        'ACC_X': _decimate_in_stages(wrist_data['ACC'][:, 0], WRIST_ACC_DECIMATION_STAGES),
        'ACC_Y': _decimate_in_stages(wrist_data['ACC'][:, 1], WRIST_ACC_DECIMATION_STAGES),
        'ACC_Z': _decimate_in_stages(wrist_data['ACC'][:, 2], WRIST_ACC_DECIMATION_STAGES),
        'BVP': _decimate_in_stages(wrist_data['BVP'].flatten(), WRIST_BVP_DECIMATION_STAGES),
        'EDA': wrist_data['EDA'].flatten(),
        'TEMP': wrist_data['TEMP'].flatten(),
        'STATE': labels[::LABEL_STRIDE]
    }

    wrist_df = pd.DataFrame(wrist_dict)

    # Sanity check: for every label value, the full-rate count divided by the
    # thinned count should be ~LABEL_STRIDE. Adding zero counts guarantees all
    # N_LABELS keys exist; labels missing from the thinned track give NaN and
    # are treated as passing.
    total_labels = Counter(labels)
    total_labels.update({i: 0 for i in range(N_LABELS)})
    label_ratio = (pd.Series(total_labels) / wrist_df['STATE'].value_counts()).fillna(LABEL_STRIDE)
    assert np.allclose(label_ratio.to_numpy(), np.full(N_LABELS, LABEL_STRIDE), atol = 1e-2), \
        f'Label stride check failed for subject {subject_id}'

    subject_wrist_data[subject_id] = wrist_df

print('-- Loaded All Wrist & Chest Sensor Info --')

100%|██████████| 15/15 [03:06<00:00, 12.41s/it]


-- Loaded All Subject Info --


 27%|██▋       | 4/15 [02:01<05:34, 30.45s/it]


KeyboardInterrupt: 

In [4]:
# PRE-EDA PREPROCESSING

# Rows labelled 5, 6 or 7 are discarded from every chest and wrist table
# (ignored as demanded by experiment); the row index is renumbered afterwards.
excluded_states = [5, 6, 7]

for sensor_frames in (subject_chest_data, subject_wrist_data):
    for subject_id in sensor_frames.keys():
        frame = sensor_frames[subject_id]
        keep_mask = ~frame['STATE'].isin(excluded_states)
        sensor_frames[subject_id] = frame[keep_mask].reset_index(drop=True)

In [5]:
# POST-EDA PREPROCESSING

# Tag every row of each per-subject table (chest and wrist) with its subject number.
# `.assign` returns a fresh frame, so the previously stored frame is not modified.
for sensor_frames in (subject_chest_data, subject_wrist_data):
    for subject_id in sensor_frames.keys():
        sensor_frames[subject_id] = sensor_frames[subject_id].assign(SUBJECT_ID=subject_id)

# Stack the per-subject tables into one table per sensor, with a fresh 0..n-1 index
all_chest_data = pd.concat(subject_chest_data.values(), axis=0, ignore_index=True)
all_wrist_data = pd.concat(subject_wrist_data.values(), axis=0, ignore_index=True)

In [6]:
# Must stratify on 75 distinct categories (5 States, 15 Subjects) to ensure equal representation in train/val/test
chest_train_perc, chest_val_perc, chest_test_perc = 0.8, 0.1, 0.1
wrist_train_perc, wrist_val_perc, wrist_test_perc = 0.6, 0.2, 0.2


def _stratified_three_way_split(frame, train_perc, val_perc, test_perc, random_state = 42):
    """Split `frame` into train/val/test, stratified on its 'STATE_SUBJECT' column.

    The split is done in two passes: train vs. the rest, then the rest into
    val vs. test using the test share of what remains.

    Parameters
    ----------
    frame : DataFrame holding at least the 'STATE' and 'STATE_SUBJECT' columns.
    train_perc, val_perc, test_perc : fractions of `frame` for each split.
    random_state : seed forwarded to both `train_test_split` calls.

    Returns
    -------
    (X_train, X_val, X_test, y_train, y_val, y_test). The X frames have the
    'STATE_SUBJECT' helper column dropped (all other columns, including
    'STATE', are kept); the y Series are the 'STATE' column. All six have
    their index reset, so each X and its y share the same 0..n-1 index.
    """
    X_train, X_other, y_train, y_other = train_test_split(
        frame, frame['STATE'],
        train_size = train_perc,
        stratify = frame['STATE_SUBJECT'],
        random_state = random_state,
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_other, y_other,
        test_size = test_perc / (val_perc + test_perc),
        stratify = X_other['STATE_SUBJECT'],
        random_state = random_state,
    )

    X_splits = [part.drop(columns = ['STATE_SUBJECT']).reset_index(drop=True) for part in (X_train, X_val, X_test)]
    # The targets are re-indexed as well: leaving their shuffled original labels
    # in place would misalign them with X on any index-aligned operation.
    y_splits = [part.reset_index(drop=True) for part in (y_train, y_val, y_test)]
    return (*X_splits, *y_splits)


# Combined "<state>_<subject>" key used only for stratification
all_chest_data['STATE_SUBJECT'] = all_chest_data['STATE'].astype(str) + '_' + all_chest_data['SUBJECT_ID'].astype(str)
all_wrist_data['STATE_SUBJECT'] = all_wrist_data['STATE'].astype(str) + '_' + all_wrist_data['SUBJECT_ID'].astype(str)

(chest_X_train, chest_X_val, chest_X_test,
 chest_y_train, chest_y_val, chest_y_test) = _stratified_three_way_split(
    all_chest_data, chest_train_perc, chest_val_perc, chest_test_perc)

(wrist_X_train, wrist_X_val, wrist_X_test,
 wrist_y_train, wrist_y_val, wrist_y_test) = _stratified_three_way_split(
    all_wrist_data, wrist_train_perc, wrist_val_perc, wrist_test_perc)

print('Train Chest Size:', chest_X_train.shape)
print('Val Chest Size:', chest_X_val.shape)
print('Test Chest Size:', chest_X_test.shape)

print('Train Wrist Size:', wrist_X_train.shape)
print('Val Wrist Size:', wrist_X_val.shape)
print('Test Wrist Size:', wrist_X_test.shape)

Train Chest Size: (47300400, 10)
Val Chest Size: (5912550, 10)
Test Chest Size: (5912550, 10)
Train Wrist Size: (202716, 8)
Val Wrist Size: (67572, 8)
Test Wrist Size: (67572, 8)


In [7]:
# Verify overarching split proportion sizes: for every subject, the share of
# its rows that landed in each split must match the configured fraction.
train_chest_subjects = chest_X_train['SUBJECT_ID'].value_counts()
val_chest_subjects = chest_X_val['SUBJECT_ID'].value_counts()
test_chest_subjects = chest_X_test['SUBJECT_ID'].value_counts()

train_wrist_subjects = wrist_X_train['SUBJECT_ID'].value_counts()
val_wrist_subjects = wrist_X_val['SUBJECT_ID'].value_counts()
test_wrist_subjects = wrist_X_test['SUBJECT_ID'].value_counts()

# Per-subject row totals across the three splits (aligned on subject number)
chest_subject_totals = train_chest_subjects + val_chest_subjects + test_chest_subjects
wrist_subject_totals = train_wrist_subjects + val_wrist_subjects + test_wrist_subjects

proportion_checks = [
    (train_chest_subjects, chest_subject_totals, chest_train_perc),
    (val_chest_subjects, chest_subject_totals, chest_val_perc),
    (test_chest_subjects, chest_subject_totals, chest_test_perc),
    (train_wrist_subjects, wrist_subject_totals, wrist_train_perc),
    (val_wrist_subjects, wrist_subject_totals, wrist_val_perc),
    (test_wrist_subjects, wrist_subject_totals, wrist_test_perc),
]

for split_counts, subject_totals, expected_perc in proportion_checks:
    assert np.allclose(split_counts / subject_totals, expected_perc, atol = 1e-2)

In [9]:
# One-hot encode SUBJECT_ID. Each sensor gets its own encoder, fitted on the
# training split only and then applied unchanged to val and test; an unseen
# category raises (handle_unknown = 'error').
oh_features = ['SUBJECT_ID']

chest_subject_ohe_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'error')
chest_X_train_ohe = chest_subject_ohe_encoder.fit_transform(chest_X_train[oh_features])
chest_X_val_ohe, chest_X_test_ohe = (
    chest_subject_ohe_encoder.transform(held_out[oh_features])
    for held_out in (chest_X_val, chest_X_test)
)

wrist_subject_ohe_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'error')
wrist_X_train_ohe = wrist_subject_ohe_encoder.fit_transform(wrist_X_train[oh_features])
wrist_X_val_ohe, wrist_X_test_ohe = (
    wrist_subject_ohe_encoder.transform(held_out[oh_features])
    for held_out in (wrist_X_val, wrist_X_test)
)

In [13]:
# Standardise every non-target sensor feature. Each sensor gets its own scaler,
# fitted on the training split only and then applied unchanged to val and test.
chest_ss_features = ['ACC_X', 'ACC_Y', 'ACC_Z', 'ECG', 'EMG', 'EDA', 'TEMP', 'RESP']
wrist_ss_features = ['ACC_X', 'ACC_Y', 'ACC_Z', 'BVP', 'EDA', 'TEMP']

chest_subject_ss_encoder = StandardScaler()
chest_X_train_ss = chest_subject_ss_encoder.fit_transform(chest_X_train[chest_ss_features])
chest_X_val_ss, chest_X_test_ss = (
    chest_subject_ss_encoder.transform(held_out[chest_ss_features])
    for held_out in (chest_X_val, chest_X_test)
)

wrist_subject_ss_encoder = StandardScaler()
wrist_X_train_ss = wrist_subject_ss_encoder.fit_transform(wrist_X_train[wrist_ss_features])
wrist_X_val_ss, wrist_X_test_ss = (
    wrist_subject_ss_encoder.transform(held_out[wrist_ss_features])
    for held_out in (wrist_X_val, wrist_X_test)
)

In [22]:
# Concatenate all (transformed) features back together and back to DataFrame form

def _assemble_features(ohe_block, scaled_block, encoder, scaled_names):
    """Join one-hot subject columns and scaled sensor columns into a labelled DataFrame.

    Parameters
    ----------
    ohe_block : 2-D array produced by `encoder` (one column per subject category).
    scaled_block : 2-D array of scaled sensor features with the same number of rows.
    encoder : the fitted OneHotEncoder that produced `ohe_block`.
    scaled_names : column labels for `scaled_block`, in column order.

    Returns
    -------
    DataFrame whose first columns are 'IS_S<subject>' (one per encoder category)
    followed by `scaled_names`.
    """
    # Take the labels from the fitted encoder so each 'IS_S<subject>' name is
    # guaranteed to sit on the column the encoder actually emitted for that
    # subject, instead of relying on a separate list being in the same order.
    subject_columns = [f'IS_S{subject}' for subject in encoder.categories_[0]]
    return pd.DataFrame(
        np.concatenate([ohe_block, scaled_block], axis=1),
        columns = subject_columns + list(scaled_names),
    )


# NOTE: this rebinds chest_X_* / wrist_X_* to frames that no longer contain a
# 'SUBJECT_ID' column, so the encoding cells that read that column cannot be
# re-run after this one without re-running the split first.
chest_X_train = _assemble_features(chest_X_train_ohe, chest_X_train_ss, chest_subject_ohe_encoder, chest_ss_features)
chest_X_val = _assemble_features(chest_X_val_ohe, chest_X_val_ss, chest_subject_ohe_encoder, chest_ss_features)
chest_X_test = _assemble_features(chest_X_test_ohe, chest_X_test_ss, chest_subject_ohe_encoder, chest_ss_features)

wrist_X_train = _assemble_features(wrist_X_train_ohe, wrist_X_train_ss, wrist_subject_ohe_encoder, wrist_ss_features)
wrist_X_val = _assemble_features(wrist_X_val_ohe, wrist_X_val_ss, wrist_subject_ohe_encoder, wrist_ss_features)
wrist_X_test = _assemble_features(wrist_X_test_ohe, wrist_X_test_ss, wrist_subject_ohe_encoder, wrist_ss_features)

: 

: 

In [21]:
# Quick look at the assembled chest training frame: column labels, then the first five rows
print(chest_X_train.columns, chest_X_train.head(5), sep='\n')

RangeIndex(start=0, stop=23, step=1)
