In [353]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import StandardScaler
import random
from sklearn.model_selection import cross_validate,cross_val_score,LeaveOneOut
from sklearn.metrics import recall_score


In [354]:
# Fixed seed so the train/test split (and every metric derived from it) is
# reproducible under Restart Kernel -> Run All. Later cells reuse `seed`.
seed = 42

# Load the data and discard the 'Unnamed: 0' column
# (presumably the row index written out when the CSV was saved).
hr_data = pd.read_csv('HR_data_transformed.csv', sep=',', header=0)
hr_data = hr_data.drop(columns=['Unnamed: 0'])

# Filter the data so the variables listed in `dropped` are excluded: only the
# HR variables are needed, so everything else is removed. `drop` returns a new
# frame here, which leaves `hr_data` untouched and keeps the cell idempotent.
dropped = ["HR_Median", "HR_std", "Round", "Phase", "Individual", "Puzzler", "Cohort"]
filtered_hr_data = hr_data.drop(columns=dropped)

X = filtered_hr_data.iloc[:, :-1]  # All columns except the last one
Y = filtered_hr_data.iloc[:, -1]   # Last column

# Split the data into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


filtered_hr_data.head()

Unnamed: 0,HR_Mean,HR_Min,HR_Max,HR_AUC,Frustrated
0,77.965186,73.23,83.37,22924.945,1
1,70.981097,67.12,78.22,21930.4,5
2,73.371959,67.88,80.22,21647.085,0
3,78.916822,72.32,84.92,25258.905,1
4,77.322226,70.52,90.15,23890.565,5


In [355]:
# Baseline model: a plain linear regression fitted on the standardized
# training features (`fit` returns the estimator itself).
reg = LinearRegression().fit(X_train_scaled, y_train)

# Round the continuous predictions to whole numbers so they can be scored
# with accuracy_score as well as with the mean squared error.
raw_pred = reg.predict(X_test_scaled)
y_pred = raw_pred.round().astype(int)

# Side-by-side table of true and predicted values, plus the two metrics.
predictions_df = pd.DataFrame({'TrueValues': y_test, 'PredictedValues': y_pred})
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(predictions_df, "\nWithout some HR features", "\nAccuracy: ", accuracy*100, "\nMSE: ", mse)

     TrueValues  PredictedValues
38            2                2
130           4                3
11            0                3
47            2                3
125           1                2
97            5                2
121           3                2
64            7                2
80            3                2
44            2                3
162           4                2
82            3                3
2             0                2
58            3                2
137           1                2
19            5                2
30            4                3
112           2                3
120           0                2
72            3                2
113           1                2
96            2                2
14            1                2
49            3                2
0             1                2
71            0                3
3             1                2
67            5                2
91            3                3
106       

In [356]:
# Now trying the same baseline with all HR features.
# Load the data and discard the 'Unnamed: 0' column.
hr_data = pd.read_csv('HR_data_transformed.csv', sep=',', header=0).drop(columns=['Unnamed: 0'])

# Filter the data so the variables in `dropped` are excluded: only the HR
# variables are needed, so all the others are removed.
dropped = ["Round", "Phase", "Individual", "Puzzler", "Cohort"]
filtered_hr_data = hr_data.drop(columns=dropped)

# Features are every column but the last; the target is the last column.
X, Y = filtered_hr_data.iloc[:, :-1], filtered_hr_data.iloc[:, -1]

# 80/20 train/test split, reusing the `seed` defined in an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed
)

# Standardize: fit the scaler on the training split, apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Baseline linear regression (`fit` returns the estimator itself).
reg2 = LinearRegression().fit(X_train_scaled, y_train)

# Round the continuous predictions to whole numbers before scoring.
raw_pred_reg2 = reg2.predict(X_test_scaled)
y_pred_reg2 = raw_pred_reg2.round().astype(int)

# Quick evaluation: table of true vs. predicted values and the two metrics.
predictions_df = pd.DataFrame({'TrueValues': y_test, 'PredictedValues': y_pred_reg2})
accuracy = accuracy_score(y_test, y_pred_reg2)
mse = mean_squared_error(y_test, y_pred_reg2)

print(predictions_df, "\nWith all HR Features", "\nAccuracy: ", accuracy*100, "\nMSE: ", mse)

     TrueValues  PredictedValues
38            2                2
130           4                3
11            0                3
47            2                3
125           1                2
97            5                2
121           3                2
64            7                2
80            3                2
44            2                2
162           4                2
82            3                3
2             0                2
58            3                2
137           1                2
19            5                2
30            4                3
112           2                2
120           0                2
72            3                2
113           1                2
96            2                2
14            1                2
49            3                2
0             1                2
71            0                3
3             1                2
67            5                2
91            3                3
106       

Ved ikke helt hvad jeg skal konkludere dér, for de to modeller har omtrent samme MSE tror jeg.

# Now for the CV | Cross-Validation

# Without HR_Median and HR_std

Loading the Data | This is the data without HR_Median & HR_std!

In [357]:

# Load the data and discard the 'Unnamed: 0' column.
hr_data = pd.read_csv('HR_data_transformed.csv', sep=',', header=0).drop(columns=['Unnamed: 0'])

# Filter the data so the variables in `dropped` are excluded: only the HR
# variables are needed, so all the others are removed.
dropped = ["HR_Median", "HR_std", "Round", "Phase", "Individual", "Puzzler", "Cohort"]
filtered_hr_data = hr_data.drop(columns=dropped)

# Features are every column but the last; the target is the last column.
X, Y = filtered_hr_data.iloc[:, :-1], filtered_hr_data.iloc[:, -1]
print(filtered_hr_data)

       HR_Mean  HR_Min  HR_Max     HR_AUC  Frustrated
0    77.965186   73.23   83.37  22924.945           1
1    70.981097   67.12   78.22  21930.400           5
2    73.371959   67.88   80.22  21647.085           0
3    78.916822   72.32   84.92  25258.905           1
4    77.322226   70.52   90.15  23890.565           5
..         ...     ...     ...        ...         ...
163  73.594539   57.43   93.53  21482.985           8
164  57.839897   52.97   74.14  16825.740           0
165  64.237295   58.97   72.63  18691.065           1
166  70.834320   66.65   76.07  20753.005           4
167  71.133878   57.17  114.33  20820.320           0

[168 rows x 5 columns]


In [358]:
%%capture
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
import pandas as pd
import numpy as np

# Leave-one-out cross-validation of a linear regression whose predictions are
# rounded to whole numbers and scored with accuracy.
model = LinearRegression()
accuracyArray = []
X = filtered_hr_data.iloc[:, :-1]  # All columns except the last one
Y = filtered_hr_data.iloc[:, -1]   # Last column

# LeaveOneOut yields positional indices, so the split is correct for any row
# index, not only for a default 0..n-1 RangeIndex.
for train_idx, test_idx in LeaveOneOut().split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    # Scale the data: the scaler is fitted on the training fold only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    # X_test is a one-row DataFrame, so the scaler receives the same feature
    # names it was fitted with and emits no feature-name warning.
    X_test_scaled = scaler.transform(X_test)

    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled).round().astype(int)

    # With a single held-out sample the fold accuracy is either 0.0 or 1.0.
    accuracy = accuracy_score(y_test, y_pred)
    accuracyArray.append(accuracy)

# Mean of the per-sample hits = overall leave-one-out accuracy.
average_accuracy = np.mean(accuracyArray)



In [359]:
# Report the mean leave-one-out accuracy as a percentage.
print("Average Accuracy:", average_accuracy*100)

Average Accuracy: 15.476190476190476


# With all HR

In [360]:

%%capture
# Load the data and discard the 'Unnamed: 0' column.
hr_data = pd.read_csv('HR_data_transformed.csv', sep=',', header=0).drop(columns=['Unnamed: 0'])

# Filter the data so the variables in `dropped` are excluded: only the HR
# variables are needed, so all the others are removed.
dropped = ["Round", "Phase", "Individual", "Puzzler", "Cohort"]
filtered_hr_data = hr_data.drop(columns=dropped)

# Features are every column but the last; the target is the last column.
X, Y = filtered_hr_data.iloc[:, :-1], filtered_hr_data.iloc[:, -1]
print(filtered_hr_data)


from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
import pandas as pd
import numpy as np

# Leave-one-out cross-validation of a linear regression whose predictions are
# rounded to whole numbers and scored with accuracy.
model = LinearRegression()
accuracyArray = []
X = filtered_hr_data.iloc[:, :-1]  # All columns except the last one
Y = filtered_hr_data.iloc[:, -1]   # Last column

# LeaveOneOut yields positional indices, so the split is correct for any row
# index, not only for a default 0..n-1 RangeIndex.
for train_idx, test_idx in LeaveOneOut().split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    # Scale the data: the scaler is fitted on the training fold only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    # X_test is a one-row DataFrame, so the scaler receives the same feature
    # names it was fitted with and emits no feature-name warning.
    X_test_scaled = scaler.transform(X_test)

    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled).round().astype(int)

    # With a single held-out sample the fold accuracy is either 0.0 or 1.0.
    accuracy = accuracy_score(y_test, y_pred)
    accuracyArray.append(accuracy)

# Mean of the per-sample hits = overall leave-one-out accuracy.
average_accuracy = np.mean(accuracyArray)



In [361]:
# Report the mean leave-one-out accuracy as a percentage.
print("Average Accuracy:", average_accuracy*100)

Average Accuracy: 15.476190476190476


# Without the HR_Median & HR_Min features!

In [362]:
%%capture
# Load the data and discard the 'Unnamed: 0' column.
hr_data = pd.read_csv('HR_data_transformed.csv', sep=',', header=0).drop(columns=['Unnamed: 0'])

# Filter the data so the variables in `dropped` are excluded: only the HR
# variables are needed, so all the others are removed.
dropped = ["HR_Median", "HR_Min","Round", "Phase", "Individual", "Puzzler", "Cohort"]
filtered_hr_data = hr_data.drop(columns=dropped)

# Features are every column but the last; the target is the last column.
X, Y = filtered_hr_data.iloc[:, :-1], filtered_hr_data.iloc[:, -1]
print(filtered_hr_data)


from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
import pandas as pd
import numpy as np

# Leave-one-out cross-validation of a linear regression whose predictions are
# rounded to whole numbers and scored with accuracy.
model = LinearRegression()
accuracyArray = []
X = filtered_hr_data.iloc[:, :-1]  # All columns except the last one
Y = filtered_hr_data.iloc[:, -1]   # Last column

# LeaveOneOut yields positional indices, so the split is correct for any row
# index, not only for a default 0..n-1 RangeIndex.
for train_idx, test_idx in LeaveOneOut().split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    # Scale the data: the scaler is fitted on the training fold only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    # X_test is a one-row DataFrame, so the scaler receives the same feature
    # names it was fitted with and emits no feature-name warning.
    X_test_scaled = scaler.transform(X_test)

    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled).round().astype(int)

    # With a single held-out sample the fold accuracy is either 0.0 or 1.0.
    accuracy = accuracy_score(y_test, y_pred)
    accuracyArray.append(accuracy)

# Mean of the per-sample hits = overall leave-one-out accuracy.
average_accuracy = np.mean(accuracyArray)



In [363]:
# Report the mean leave-one-out accuracy as a percentage.
print("Average Accuracy:", average_accuracy*100)

Average Accuracy: 17.261904761904763


# Så der er ingen forskel udover når man fjerner "HR_Median", "HR_Min"

# Som giver højeste accuracy