Libraries

In [37]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import random
from sklearn.model_selection import cross_validate,cross_val_score,LeaveOneOut
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,roc_auc_score

Loading the data

In [38]:
def get_standardizable_features(dataframe: pd.DataFrame):
    return [x for x in dataframe.columns if x.startswith('HR')]


scaler = StandardScaler()

# Load data.
hr_data = pd.read_csv('HR_data_transformed.csv', sep=',', header=0)
hr_data.drop(['Unnamed: 0'], axis=1, inplace=True)
tmp = hr_data.copy()

# Transform data
transform_data = True
if transform_data:
    to_log_transform = ['HR_std']
    for col in to_log_transform:
        tmp[col] = np.log(tmp[col])
        tmp[col].name = col + '_log'
        tmp.rename(columns={col: col + '_log'}, inplace=True)
    to_inverse_transform = ['HR_AUC']
    for col in to_inverse_transform:
        tmp[col] = 1 / tmp[col]
        tmp[col].name = col + '_inverse'
        tmp.rename(columns={col: col + '_inverse'}, inplace=True)
    to_boxcox_transform = ['HR_Max']
    for col in to_boxcox_transform:
        tmp[col], _ = boxcox(tmp[col] - tmp[col].min() + 1)
        tmp[col].name = col + '_boxcox'
        tmp.rename(columns={col: col + '_boxcox'}, inplace=True)

# Standardize data
standardized = True
if standardized:
    cols = get_standardizable_features(tmp)
    tmp[cols] = scaler.fit_transform(tmp[cols])

working_dataset = tmp.copy()

# filtering off the non-HR features
filtered_hr_data = working_dataset.copy()
dropped = ["HR_Median","HR_Min","Round", "Phase", "Individual", "Puzzler", "Cohort"]

for column_to_drop in dropped:
    filtered_hr_data.drop(column_to_drop, axis=1, inplace=True)

X = filtered_hr_data.iloc[:, :-1]  # All columns except the last one
Y = filtered_hr_data.iloc[:, -1]  # Last column


threshold = 5
# Create a new binary variable indicating High or Low frustration based on the threshold
Y_grouped = (Y >= threshold).astype(int)
Y = Y_grouped
# Replace the original 'Frustrated' variable with the grouped variable in your DataFrame
filtered_hr_data['Frustrated'] = Y_grouped

filtered_hr_data.head()

Unnamed: 0,HR_Mean,HR_std_log,HR_Max_boxcox,HR_AUC_inverse,Frustrated
0,0.261529,-0.592147,-0.269578,-0.009167,0
1,-0.672305,-1.034575,-0.863799,0.305442,1
2,-0.352626,-0.632567,-0.609914,0.400355,0
3,0.388771,-0.292721,-0.121668,-0.650189,0
4,0.17556,0.329823,0.31146,-0.289564,1


Majority Votin' LOOCV

In [39]:
%%capture
accuracyArray = []

# Initialize confusion matrix
overall_conf_matrix = np.zeros((2, 2), dtype=int)

for i in range(len(Y)):
    X_train = X.drop(i)
    X_test = X.loc[i].values.reshape(1, -1)
    y_train = Y.drop(i)

    # Use majority vote as the prediction
    majority_vote_prediction = y_train.mode().iloc[0]

    # Update confusion matrix
    conf_matrix = confusion_matrix([Y.iloc[i]], [majority_vote_prediction])
    overall_conf_matrix += conf_matrix

    # Quick and easy evaluation
    accuracy = accuracy_score([Y.iloc[i]], [majority_vote_prediction])
    accuracyArray.append(accuracy)

average_accuracy = np.mean(accuracyArray)

In [40]:
print("Average Accuracy (Majority Vote):", average_accuracy * 100)
print("Confusion Matrix:")
print(overall_conf_matrix)

Average Accuracy (Majority Vote): 85.71428571428571
Confusion Matrix:
[[144 144]
 [168 144]]
