# Det her er bare en test for at se om det faktisk er en mulighed at bruge linear regression

Libraries

In [8]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler

Loading the data

In [9]:
def get_standardizable_features(dataframe: pd.DataFrame):
    """Return the column names of *dataframe* that begin with ``'HR'``.

    The names are returned as a list, in the same order in which they
    appear in ``dataframe.columns``.
    """
    hr_columns = []
    for column_name in dataframe.columns:
        if column_name.startswith('HR'):
            hr_columns.append(column_name)
    return hr_columns


# Scaler used further down to standardize the HR feature columns.
scaler = StandardScaler()

# Load data.
# The 'Unnamed: 0' column is discarded right after reading
# (presumably a row index written when the CSV was saved; confirm).
hr_data = pd.read_csv(
    'HR_data_transformed.csv', sep=',', header=0
).drop(columns=['Unnamed: 0'])
# All further processing happens on a copy, so hr_data keeps the loaded values.
tmp = hr_data.copy()

# Transform data
# Each listed feature is overwritten with its transformed values, and the
# column is then renamed with a suffix naming the transform (for example
# 'HR_std' becomes 'HR_std_log'). The rename is what changes the column label;
# assigning to `tmp[col].name` only touches a temporary Series and has no
# lasting effect, so it is not done here.
transform_data = True
if transform_data:
    to_log_transform = ['HR_std']
    for col in to_log_transform:
        tmp[col] = np.log(tmp[col])
    tmp.rename(
        columns={name: name + '_log' for name in to_log_transform},
        inplace=True,
    )

    to_inverse_transform = ['HR_AUC']
    for col in to_inverse_transform:
        tmp[col] = 1 / tmp[col]
    tmp.rename(
        columns={name: name + '_inverse' for name in to_inverse_transform},
        inplace=True,
    )

    to_boxcox_transform = ['HR_Max']
    for col in to_boxcox_transform:
        # Box-Cox needs strictly positive input, so the column is shifted to
        # make its minimum exactly 1 first. The fitted lambda that boxcox
        # returns as its second value is not used.
        tmp[col], _ = boxcox(tmp[col] - tmp[col].min() + 1)
    tmp.rename(
        columns={name: name + '_boxcox' for name in to_boxcox_transform},
        inplace=True,
    )

# Standardize data
# Every column whose name starts with 'HR' is passed through the scaler and
# written back in place; all other columns are left as they are.
standardized = True
if standardized:
    cols = get_standardizable_features(tmp)
    scaled_values = scaler.fit_transform(tmp[cols])
    tmp[cols] = scaled_values

# Snapshot of the prepared data; later steps work on copies of it.
working_dataset = tmp.copy()

# Keep only the columns used by the regression: the remaining HR features,
# the 'Frustrated' target and an intercept column of ones.
filtered_hr_data = working_dataset.copy()
dropped = ["HR_Median","HR_Min","Round", "Phase", "Individual", "Puzzler", "Cohort"]
filtered_hr_data['const'] = 1
filtered_hr_data.drop(columns=dropped, inplace=True)

# Select the target and the predictors by name rather than by position.
# 'const' is appended as the last column, so slicing with iloc[:, -1] would
# pick the constant as the target and leave 'Frustrated' among the predictors.
Y = filtered_hr_data['Frustrated']  # Target
X = filtered_hr_data.drop(columns=['Frustrated'])  # Predictors incl. 'const'

filtered_hr_data.head()

Unnamed: 0,HR_Mean,HR_std_log,HR_Max_boxcox,HR_AUC_inverse,Frustrated,const
0,0.261529,-0.592147,-0.269578,-0.009167,1,1
1,-0.672305,-1.034575,-0.863799,0.305442,5,1
2,-0.352626,-0.632567,-0.609914,0.400355,0,1
3,0.388771,-0.292721,-0.121668,-0.650189,1,1
4,0.17556,0.329823,0.31146,-0.289564,5,1


Fitting

In [10]:
# Ordinary least squares: regress 'Frustrated' on the intercept column and the
# four transformed/standardized HR features, then print the fit summary.
predictor_columns = ['const', 'HR_Mean', 'HR_std_log', 'HR_Max_boxcox', "HR_AUC_inverse"]
target = filtered_hr_data['Frustrated']
design_matrix = filtered_hr_data[predictor_columns]
model = sm.OLS(target, design_matrix)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             Frustrated   R-squared:                       0.037
Model:                            OLS   Adj. R-squared:                  0.014
Method:                 Least Squares   F-statistic:                     1.580
Date:                Tue, 16 Jan 2024   Prob (F-statistic):              0.182
Time:                        12:48:17   Log-Likelihood:                -344.69
No. Observations:                 168   AIC:                             699.4
Df Residuals:                     163   BIC:                             715.0
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              2.2917      0.147     15.