Libraries

In [45]:
import pandas as pd
import numpy as np
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler

# Load and transform the data

In [46]:
def get_standardizable_features(dataframe: pd.DataFrame):
    """Return the labels of the columns that should be standardized.

    A column qualifies when its label starts with the (case-sensitive)
    prefix ``'HR'``.

    Args:
        dataframe: Frame whose column labels are inspected. Labels are
            expected to be strings, since ``str.startswith`` is called on
            each of them.

    Returns:
        A list of the matching column labels, in their original column order.
    """
    hr_prefix = 'HR'
    selected = []
    for label in dataframe.columns:
        if label.startswith(hr_prefix):
            selected.append(label)
    return selected


scaler = StandardScaler()

# Load data.
# 'Unnamed: 0' is the label pandas assigns to a column whose header cell is
# empty (presumably a row index saved by an earlier export -- confirm). It is
# not a feature, so it is discarded.
hr_data = pd.read_csv('HR_data_transformed.csv', sep=',', header=0)
hr_data = hr_data.drop(columns=['Unnamed: 0'])
tmp = hr_data.copy()

# Transform data
# Every transformed column is renamed with a suffix naming the transform that
# was applied. The new labels keep their 'HR' prefix, so they are still picked
# up by the standardization step below.
transform_data = True
if transform_data:
    to_log_transform = ['HR_std']
    for col in to_log_transform:
        # np.log gives -inf for 0 and NaN for negative input; fail loudly
        # instead of letting those values flow into the later steps.
        if (tmp[col] <= 0).any():
            raise ValueError(f"Cannot log-transform '{col}': it contains non-positive values.")
        tmp[col] = np.log(tmp[col])
        tmp = tmp.rename(columns={col: col + '_log'})
    to_inverse_transform = ['HR_AUC']
    for col in to_inverse_transform:
        # A zero would turn into +/-inf under the reciprocal.
        if (tmp[col] == 0).any():
            raise ValueError(f"Cannot inverse-transform '{col}': it contains zeros.")
        tmp[col] = 1 / tmp[col]
        tmp = tmp.rename(columns={col: col + '_inverse'})
    to_boxcox_transform = ['HR_Max']
    for col in to_boxcox_transform:
        # Box-Cox needs strictly positive input, so the column is shifted to
        # have a minimum of exactly 1. The fitted lambda is not needed here.
        tmp[col], _ = boxcox(tmp[col] - tmp[col].min() + 1)
        tmp = tmp.rename(columns={col: col + '_boxcox'})

# Standardize data
# Only the columns whose label starts with 'HR' are scaled; the remaining
# columns are left untouched.
standardized = True
if standardized:
    cols = get_standardizable_features(tmp)
    tmp[cols] = scaler.fit_transform(tmp[cols])

working_dataset = tmp.copy()
working_dataset.head()

Unnamed: 0,HR_Mean,HR_Median,HR_std_log,HR_Min,HR_Max_boxcox,HR_AUC_inverse,Round,Phase,Individual,Puzzler,Frustrated,Cohort
0,0.261529,0.393515,-0.592147,0.827134,-0.269578,-0.009167,3,3,1,1,1,1
1,-0.672305,-0.526346,-1.034575,-0.04311,-0.863799,0.305442,3,2,1,1,5,1
2,-0.352626,-0.180934,-0.632567,0.065136,-0.609914,0.400355,3,1,1,1,0,1
3,0.388771,0.378658,-0.292721,0.697523,-0.121668,-0.650189,2,3,1,1,1,1
4,0.17556,-0.033608,0.329823,0.44115,0.31146,-0.289564,2,2,1,1,5,1


# All attributes (Without HR_Median & HR_Min)

In [47]:
# Keep every attribute except the HR features that were not retained by
# feature selection. `drop` returns a new frame, so `working_dataset` is left
# unchanged.
dropped = ["HR_Median", "HR_Min"]
all_filtered_hr_data = working_dataset.drop(columns=dropped)

# Select the target by name rather than by position: in this frame the last
# column is 'Cohort', so a positional split would use 'Cohort' as the target
# and leave 'Frustrated' among the features.
target_column = "Frustrated"
X = all_filtered_hr_data.drop(columns=[target_column])  # All columns except the target
Y = all_filtered_hr_data[target_column]                 # Target column

all_filtered_hr_data.head()


Unnamed: 0,HR_Mean,HR_std_log,HR_Max_boxcox,HR_AUC_inverse,Round,Phase,Individual,Puzzler,Frustrated,Cohort
0,0.261529,-0.592147,-0.269578,-0.009167,3,3,1,1,1,1
1,-0.672305,-1.034575,-0.863799,0.305442,3,2,1,1,5,1
2,-0.352626,-0.632567,-0.609914,0.400355,3,1,1,1,0,1
3,0.388771,-0.292721,-0.121668,-0.650189,2,3,1,1,1,1
4,0.17556,0.329823,0.31146,-0.289564,2,2,1,1,5,1


# Only HR Features (Without HR_Median & HR_Min)

In [48]:
# Reduce the working frame to the retained HR features plus 'Frustrated' by
# discarding the unselected HR features and every other non-HR attribute.
dropped = ["HR_Median", "HR_Min", "Round", "Phase", "Individual", "Puzzler", "Cohort"]
filtered_hr_data = working_dataset.drop(columns=dropped)

# Positional split: once the columns above are removed, 'Frustrated' is the
# last remaining column, so it becomes Y and everything before it becomes X.
X = filtered_hr_data.iloc[:, :-1]  # Every column but the last
Y = filtered_hr_data.iloc[:, -1]   # The last column

filtered_hr_data.head()

Unnamed: 0,HR_Mean,HR_std_log,HR_Max_boxcox,HR_AUC_inverse,Frustrated
0,0.261529,-0.592147,-0.269578,-0.009167,1
1,-0.672305,-1.034575,-0.863799,0.305442,5
2,-0.352626,-0.632567,-0.609914,0.400355,0
3,0.388771,-0.292721,-0.121668,-0.650189,1
4,0.17556,0.329823,0.31146,-0.289564,5
