## Import packages and datasets

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from stability_selection import StabilitySelection
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from evaluate_regression_models import *
from HelperFunctions import *


In [2]:
# Load the cleaned 2019-2020 tables, one CSV per data source, from Cleaned_Datasets/2019_2020/.
# Later cells merge these frames on the `pid` column.
pre_2019_2020 = pd.read_csv("Cleaned_Datasets/2019_2020/pre_2019_2020.csv")
stress_endterm_2019_2020 = pd.read_csv("Cleaned_Datasets/2019_2020/stress_endterm_2019_2020.csv")
ema_2019_2020 = pd.read_csv("Cleaned_Datasets/2019_2020/ema_2019_2020.csv")
calls_2019_2020 = pd.read_csv("Cleaned_Datasets/2019_2020/calls_2019_2020.csv")
bluetooth_2019_2020 = pd.read_csv("Cleaned_Datasets/2019_2020/bluetooth_2019_2020.csv")
screen_2019_2020 = pd.read_csv("Cleaned_Datasets/2019_2020/screen_2019_2020.csv")
location_2019_2020 = pd.read_csv("Cleaned_Datasets/2019_2020/location_2019_2020.csv")
steps_2019_2020 = pd.read_csv("Cleaned_Datasets/2019_2020/steps_2019_2020.csv")
sleep_2019_2020 = pd.read_csv("Cleaned_Datasets/2019_2020/sleep_2019_2020.csv")

# Load the matching 2021 tables from Cleaned_Datasets/2021/.
# NOTE(review): the six sensor files below carry an `_ssa` suffix while pre/stress_endterm/ema
# do not; what `_ssa` stands for is not visible in this notebook — confirm these are the
# intended inputs.
pre_2021 = pd.read_csv("Cleaned_Datasets/2021/pre_2021.csv")
stress_endterm_2021 = pd.read_csv("Cleaned_Datasets/2021/stress_endterm_2021.csv")
ema_2021 = pd.read_csv("Cleaned_Datasets/2021/ema_2021.csv")
# calls_2021 is loaded but not referenced by any of the cells shown in this notebook.
calls_2021 = pd.read_csv("Cleaned_Datasets/2021/calls_2021_ssa.csv")
bluetooth_2021 = pd.read_csv("Cleaned_Datasets/2021/bluetooth_2021_ssa.csv")
screen_2021 = pd.read_csv("Cleaned_Datasets/2021/screen_2021_ssa.csv")
location_2021 = pd.read_csv("Cleaned_Datasets/2021/location_2021_ssa.csv")
steps_2021 = pd.read_csv("Cleaned_Datasets/2021/steps_2021_ssa.csv")
sleep_2021 = pd.read_csv("Cleaned_Datasets/2021/sleep_2021_ssa.csv")


## Perform feature selection (RFECV) and evaluate prediction models

In [3]:
# Values handed to evaluate_multiple_models as its `random_states` argument: 0 through 9.
random_states = range(10)

# Candidate regressors, shared by every dataset combination evaluated below.
# Each entry is: display name -> (unfitted estimator, hyper-parameter grid).
# NOTE(review): only Lasso is imported explicitly at the top of this notebook; the other
# estimator classes presumably arrive through the star imports — confirm.
models_with_params = dict([
    # linear family
    ('Linear Regression', (LinearRegression(), {'fit_intercept': [True]})),
    ('Lasso', (Lasso(random_state=0, max_iter=10000), {'alpha': [0.001, 0.01, 0.1, 1]})),
    ('Ridge', (Ridge(random_state=0), {'alpha': [0.001, 0.01, 0.1, 1]})),
    # tree ensembles
    ('Random Forest', (RandomForestRegressor(random_state=0), {'max_depth': [3, 5, 7]})),
    ('XGBoost', (XGBRegressor(random_state=0), {'max_depth': [3, 5, 7]})),
    # kernel method
    ('SVM', (SVR(), {'C': [0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear', 'rbf']})),
    # baseline with an empty grid
    ('Median Regressor', (DummyRegressor(strategy='median'), {})),
])

### Pre and Endterm

In [4]:
# Combine the pre-term frame with the end-of-term stress frame via merge_datasets
# (on 'pid', how='inner'), then drop every row that has a missing value.
df_pre_post = merge_datasets(
    [pre_2019_2020, stress_endterm_2019_2020],
    on_column='pid',
    how='inner',
).dropna()
print(df_pre_post.shape)

# perform_rfecv returns a pair; the first value is discarded and the second is the
# reduced frame used from here on.
# NOTE(review): selection runs on the full frame before evaluate_multiple_models is called;
# if that helper cross-validates, the selected features have already seen the held-out
# rows — confirm whether this leakage is acceptable.
_, df_pre_post_selected = perform_rfecv(df_pre_post)
print(df_pre_post_selected.shape)

(354, 29)
Optimal number of features: 15
Features selected: ['UCLA_10items_PRE', 'SocialFit_PRE', '2waySSS_giving_emotional_PRE', 'ERQ_reappraisal_PRE', 'ERQ_suppression_PRE', 'BRS_PRE', 'CHIPS_PRE', 'PSS_10items_PRE', 'PSS_14items_PRE', 'STAI_PRE', 'MAAS_15items_PRE', 'CESD_9items_PRE', 'BDI2_PRE', 'GQ_PRE', 'FSPWB_PRE']
(354, 17)


In [5]:
# Evaluate every entry of models_with_params on the reduced pre-term frame. The helper prints
# per-model MSE / RMSE / MAE / R² (average and std) and the best model by R², as recorded below.
pre_post_results = evaluate_multiple_models(df_pre_post_selected, models_with_params, random_states)

Evaluating model: Linear Regression


10it [00:00, 187.30it/s]
10it [00:00, 244.27it/s]
10it [00:00, 236.45it/s]
10it [00:00, 233.79it/s]
10it [00:00, 242.42it/s]
10it [00:00, 223.54it/s]
10it [00:00, 199.38it/s]
10it [00:00, 227.78it/s]
10it [00:00, 229.99it/s]
10it [00:00, 236.84it/s]


Results for Linear Regression:
MSE: Avg = 25.3332, Std = 0.4280
RMSE: Avg = 5.0330, Std = 0.0422
MAE: Avg = 3.9750, Std = 0.0307
R²: Avg = 0.4414, Std = 0.0094
Evaluating model: Lasso


10it [00:00, 54.00it/s]
10it [00:00, 53.58it/s]
10it [00:00, 47.50it/s]
10it [00:00, 49.22it/s]
10it [00:00, 48.05it/s]
10it [00:00, 48.20it/s]
10it [00:00, 49.80it/s]
10it [00:00, 50.68it/s]
10it [00:00, 49.32it/s]
10it [00:00, 49.99it/s]


Results for Lasso:
MSE: Avg = 25.5802, Std = 0.5086
RMSE: Avg = 5.0574, Std = 0.0500
MAE: Avg = 4.0074, Std = 0.0363
R²: Avg = 0.4360, Std = 0.0112
Evaluating model: Ridge


10it [00:00, 74.80it/s]
10it [00:00, 77.32it/s]
10it [00:00, 76.78it/s]
10it [00:00, 78.05it/s]
10it [00:00, 75.75it/s]
10it [00:00, 77.24it/s]
10it [00:00, 86.18it/s]
10it [00:00, 82.35it/s]
10it [00:00, 82.16it/s]
10it [00:00, 83.56it/s]


Results for Ridge:
MSE: Avg = 25.4137, Std = 0.3242
RMSE: Avg = 5.0411, Std = 0.0320
MAE: Avg = 3.9929, Std = 0.0242
R²: Avg = 0.4397, Std = 0.0071
Evaluating model: Random Forest


10it [00:12,  1.23s/it]
10it [00:12,  1.21s/it]
10it [00:13,  1.30s/it]
10it [00:13,  1.31s/it]
10it [00:12,  1.28s/it]
10it [00:13,  1.35s/it]
10it [00:13,  1.35s/it]
10it [00:13,  1.37s/it]
10it [00:13,  1.38s/it]
10it [00:13,  1.38s/it]


Results for Random Forest:
MSE: Avg = 25.7137, Std = 0.2952
RMSE: Avg = 5.0708, Std = 0.0291
MAE: Avg = 4.0068, Std = 0.0221
R²: Avg = 0.4331, Std = 0.0065
Evaluating model: XGBoost


10it [00:15,  1.55s/it]
10it [00:15,  1.51s/it]
10it [00:15,  1.52s/it]
10it [00:15,  1.59s/it]
10it [00:16,  1.66s/it]
10it [00:16,  1.69s/it]
10it [00:16,  1.61s/it]
10it [00:18,  1.82s/it]
10it [00:22,  2.23s/it]
10it [00:17,  1.74s/it]


Results for XGBoost:
MSE: Avg = 29.2908, Std = 1.0225
RMSE: Avg = 5.4113, Std = 0.0941
MAE: Avg = 4.2663, Std = 0.0994
R²: Avg = 0.3542, Std = 0.0225
Evaluating model: SVM


10it [00:01,  8.23it/s]
10it [00:01,  8.41it/s]
10it [00:01,  7.54it/s]
10it [00:01,  8.23it/s]
10it [00:01,  8.22it/s]
10it [00:01,  8.46it/s]
10it [00:01,  8.80it/s]
10it [00:01,  8.82it/s]
10it [00:01,  7.66it/s]
10it [00:01,  8.23it/s]


Results for SVM:
MSE: Avg = 25.3065, Std = 0.3369
RMSE: Avg = 5.0304, Std = 0.0334
MAE: Avg = 4.0044, Std = 0.0298
R²: Avg = 0.4420, Std = 0.0074
Evaluating model: Median Regressor


10it [00:00, 275.44it/s]
10it [00:00, 290.15it/s]
10it [00:00, 284.83it/s]
10it [00:00, 173.97it/s]
10it [00:00, 154.92it/s]
10it [00:00, 286.05it/s]
10it [00:00, 174.51it/s]
10it [00:00, 260.78it/s]
10it [00:00, 291.55it/s]
10it [00:00, 300.39it/s]

Results for Median Regressor:
MSE: Avg = 46.1740, Std = 0.1939
RMSE: Avg = 6.7951, Std = 0.0143
MAE: Avg = 5.4260, Std = 0.0224
R²: Avg = -0.0181, Std = 0.0043

Best model based on R²: SVM with R² = 0.4420





### EMA and Endterm

In [6]:
# Combine the EMA frame with the end-of-term stress frame via merge_datasets
# (on 'pid', how='inner'), then drop every row that has a missing value.
df_ema_post = merge_datasets(
    [ema_2019_2020, stress_endterm_2019_2020],
    on_column='pid',
    how='inner',
).dropna()
print(df_ema_post.shape)

# Keep only the second value returned by perform_rfecv: the reduced frame.
_, df_ema_post_selected = perform_rfecv(df_ema_post)
print(df_ema_post_selected.shape)


(300, 52)
Optimal number of features: 42
Features selected: ['phq4_EMA_mean', 'phq4_EMA_median', 'phq4_EMA_max', 'phq4_EMA_min', 'phq4_EMA_std', 'phq4_EMA_skew', 'phq4_EMA_kurt', 'phq4_EMA_iqr', 'phq4_EMA_autocorr', 'phq4_EMA_rmsd', 'phq4_anxiety_EMA_mean', 'phq4_anxiety_EMA_std', 'phq4_anxiety_EMA_skew', 'phq4_anxiety_EMA_kurt', 'phq4_anxiety_EMA_autocorr', 'phq4_anxiety_EMA_rmsd', 'phq4_depression_EMA_mean', 'phq4_depression_EMA_std', 'phq4_depression_EMA_skew', 'phq4_depression_EMA_kurt', 'phq4_depression_EMA_iqr', 'phq4_depression_EMA_autocorr', 'phq4_depression_EMA_rmsd', 'positive_affect_EMA_mean', 'positive_affect_EMA_median', 'positive_affect_EMA_max', 'positive_affect_EMA_min', 'positive_affect_EMA_std', 'positive_affect_EMA_skew', 'positive_affect_EMA_kurt', 'positive_affect_EMA_iqr', 'positive_affect_EMA_autocorr', 'positive_affect_EMA_rmsd', 'negative_affect_EMA_mean', 'negative_affect_EMA_median', 'negative_affect_EMA_max', 'negative_affect_EMA_std', 'negative_affect_EMA_s

In [7]:
# Evaluate every entry of models_with_params on the reduced EMA frame; per-model metrics and
# the best model by R² are printed by the helper (see the recorded output below).
ema_post_results = evaluate_multiple_models(df_ema_post_selected, models_with_params, random_states)

Evaluating model: Linear Regression


10it [00:00, 20.98it/s]
10it [00:00, 40.79it/s]
10it [00:00, 11.39it/s]
10it [00:00, 68.44it/s]
10it [00:00, 58.19it/s]
10it [00:00, 65.78it/s]
10it [00:00, 75.81it/s]
10it [00:00, 66.65it/s]
10it [00:00, 72.66it/s]
10it [00:00, 59.86it/s]


Results for Linear Regression:
MSE: Avg = 27.3840, Std = 0.7056
RMSE: Avg = 5.2325, Std = 0.0675
MAE: Avg = 4.0202, Std = 0.0600
R²: Avg = 0.3372, Std = 0.0171
Evaluating model: Lasso


10it [00:01,  7.67it/s]
10it [00:01,  6.60it/s]
10it [00:01,  7.67it/s]
10it [00:01,  6.13it/s]
10it [00:01,  6.90it/s]
10it [00:01,  9.20it/s]
10it [00:01,  7.55it/s]
10it [00:02,  4.07it/s]
10it [00:01,  6.62it/s]
10it [00:01,  6.61it/s]


Results for Lasso:
MSE: Avg = 23.7230, Std = 0.2363
RMSE: Avg = 4.8706, Std = 0.0243
MAE: Avg = 3.7837, Std = 0.0208
R²: Avg = 0.4258, Std = 0.0057
Evaluating model: Ridge


10it [00:00, 19.53it/s]
10it [00:00, 28.64it/s]
10it [00:00, 31.89it/s]
10it [00:00, 44.30it/s]
10it [00:00, 19.51it/s]
10it [00:00, 32.02it/s]
10it [00:00, 39.74it/s]
10it [00:00, 38.74it/s]
10it [00:00, 37.65it/s]
10it [00:00, 40.12it/s]


Results for Ridge:
MSE: Avg = 24.1780, Std = 0.3103
RMSE: Avg = 4.9170, Std = 0.0316
MAE: Avg = 3.7855, Std = 0.0264
R²: Avg = 0.4148, Std = 0.0075
Evaluating model: Random Forest


10it [00:27,  2.77s/it]
10it [00:27,  2.71s/it]
10it [00:27,  2.72s/it]
10it [00:28,  2.82s/it]
10it [00:27,  2.80s/it]
10it [00:27,  2.79s/it]
10it [00:27,  2.80s/it]
10it [00:27,  2.79s/it]
10it [00:27,  2.80s/it]
10it [00:27,  2.73s/it]


Results for Random Forest:
MSE: Avg = 23.3968, Std = 0.2796
RMSE: Avg = 4.8369, Std = 0.0288
MAE: Avg = 3.8397, Std = 0.0324
R²: Avg = 0.4337, Std = 0.0068
Evaluating model: XGBoost


10it [00:20,  2.07s/it]
10it [00:18,  1.86s/it]
10it [00:20,  2.00s/it]
10it [00:20,  2.04s/it]
10it [00:20,  2.01s/it]
10it [00:20,  2.06s/it]
10it [00:19,  1.95s/it]
10it [00:22,  2.28s/it]
10it [00:19,  1.97s/it]
10it [00:18,  1.85s/it]


Results for XGBoost:
MSE: Avg = 27.6549, Std = 0.9449
RMSE: Avg = 5.2580, Std = 0.0887
MAE: Avg = 4.2000, Std = 0.0608
R²: Avg = 0.3306, Std = 0.0229
Evaluating model: SVM


10it [00:01,  7.02it/s]
10it [00:01,  7.06it/s]
10it [00:01,  6.73it/s]
10it [00:01,  7.01it/s]
10it [00:01,  6.84it/s]
10it [00:01,  7.19it/s]
10it [00:01,  6.80it/s]
10it [00:01,  6.91it/s]
10it [00:01,  6.79it/s]
10it [00:01,  6.48it/s]


Results for SVM:
MSE: Avg = 23.3473, Std = 0.2812
RMSE: Avg = 4.8318, Std = 0.0290
MAE: Avg = 3.7603, Std = 0.0219
R²: Avg = 0.4349, Std = 0.0068
Evaluating model: Median Regressor


10it [00:00, 272.07it/s]
10it [00:00, 267.25it/s]
10it [00:00, 266.85it/s]
10it [00:00, 272.00it/s]
10it [00:00, 270.49it/s]
10it [00:00, 285.14it/s]
10it [00:00, 268.68it/s]
10it [00:00, 289.75it/s]
10it [00:00, 285.54it/s]
10it [00:00, 274.33it/s]

Results for Median Regressor:
MSE: Avg = 42.2332, Std = 0.2944
RMSE: Avg = 6.4987, Std = 0.0227
MAE: Avg = 5.1100, Std = 0.0181
R²: Avg = -0.0223, Std = 0.0071

Best model based on R²: SVM with R² = 0.4349





### Pre, EMA, and Endterm

In [8]:
# Disabled alternative, kept for reference: merge the raw frames instead of the already
# reduced ones. It refers to `ema_2019_2020_transformed`, which is not defined in the
# cells shown in this notebook.
#df_pre_ema_post = merge_datasets([pre_2019_2020, ema_2019_2020_transformed, stress_endterm_2019_2020], on_column = 'pid', how = 'inner')

# Combine the reduced pre-term and EMA frames with the end-of-term stress frame.
# Both 'pid' and 'PSS_10items_POST' are passed as merge keys — presumably because each
# reduced frame still carries the outcome column; confirm against merge_datasets.
df_pre_ema_post = merge_datasets(
    [df_pre_post_selected, df_ema_post_selected, stress_endterm_2019_2020],
    on_column=['pid', 'PSS_10items_POST'],
    how='inner',
).dropna()
print(df_pre_ema_post.shape)

# Second round of selection on the combined frame. Unlike the other cells, the first
# returned value is kept here under the name results_filtered.
results_filtered, df_pre_ema_post_selected = perform_rfecv(df_pre_ema_post)
print(df_pre_ema_post_selected.shape)


(283, 59)
Optimal number of features: 51
Features selected: ['UCLA_10items_PRE', 'SocialFit_PRE', '2waySSS_giving_emotional_PRE', 'ERQ_reappraisal_PRE', 'ERQ_suppression_PRE', 'BRS_PRE', 'CHIPS_PRE', 'PSS_10items_PRE', 'PSS_14items_PRE', 'STAI_PRE', 'MAAS_15items_PRE', 'CESD_9items_PRE', 'BDI2_PRE', 'GQ_PRE', 'FSPWB_PRE', 'phq4_EMA_mean', 'phq4_EMA_median', 'phq4_EMA_max', 'phq4_EMA_min', 'phq4_EMA_std', 'phq4_EMA_skew', 'phq4_EMA_kurt', 'phq4_EMA_autocorr', 'phq4_EMA_rmsd', 'phq4_anxiety_EMA_mean', 'phq4_anxiety_EMA_std', 'phq4_anxiety_EMA_skew', 'phq4_anxiety_EMA_kurt', 'phq4_anxiety_EMA_rmsd', 'phq4_depression_EMA_mean', 'phq4_depression_EMA_std', 'phq4_depression_EMA_kurt', 'phq4_depression_EMA_rmsd', 'positive_affect_EMA_mean', 'positive_affect_EMA_median', 'positive_affect_EMA_max', 'positive_affect_EMA_min', 'positive_affect_EMA_std', 'positive_affect_EMA_skew', 'positive_affect_EMA_kurt', 'positive_affect_EMA_autocorr', 'positive_affect_EMA_rmsd', 'negative_affect_EMA_mean', 'n

In [9]:
# Evaluate every entry of models_with_params on the reduced pre-term + EMA frame; per-model
# metrics and the best model by R² are printed by the helper (see the recorded output below).
pre_ema_post_results = evaluate_multiple_models(df_pre_ema_post_selected, models_with_params, random_states)

Evaluating model: Linear Regression


10it [00:00, 15.32it/s]
10it [00:00, 17.71it/s]
10it [00:00, 21.32it/s]
10it [00:00, 20.26it/s]
10it [00:00, 18.70it/s]
10it [00:00, 12.49it/s]
10it [00:00, 15.01it/s]
10it [00:00, 25.93it/s]
10it [00:00, 30.75it/s]
10it [00:00, 10.24it/s]


Results for Linear Regression:
MSE: Avg = 25.0257, Std = 0.8354
RMSE: Avg = 5.0019, Std = 0.0838
MAE: Avg = 3.9929, Std = 0.0622
R²: Avg = 0.3974, Std = 0.0201
Evaluating model: Lasso


10it [00:06,  1.55it/s]
10it [00:05,  1.97it/s]
10it [00:07,  1.41it/s]
10it [00:05,  1.76it/s]
10it [00:06,  1.46it/s]
10it [00:05,  1.90it/s]
10it [00:06,  1.55it/s]
10it [00:05,  1.71it/s]
10it [00:06,  1.66it/s]
10it [00:06,  1.59it/s]


Results for Lasso:
MSE: Avg = 21.5254, Std = 0.4252
RMSE: Avg = 4.6393, Std = 0.0459
MAE: Avg = 3.6545, Std = 0.0445
R²: Avg = 0.4817, Std = 0.0102
Evaluating model: Ridge


10it [00:02,  4.35it/s]
10it [00:01,  8.96it/s]
10it [00:00, 13.17it/s]
10it [00:00, 12.87it/s]
10it [00:00, 10.55it/s]
10it [00:00, 12.34it/s]
10it [00:01,  8.65it/s]
10it [00:02,  4.02it/s]
10it [00:01,  6.35it/s]
10it [00:01,  5.43it/s]


Results for Ridge:
MSE: Avg = 21.7486, Std = 0.5534
RMSE: Avg = 4.6632, Std = 0.0594
MAE: Avg = 3.7124, Std = 0.0525
R²: Avg = 0.4763, Std = 0.0133
Evaluating model: Random Forest


10it [00:29,  2.96s/it]
10it [00:30,  3.03s/it]
10it [00:30,  3.02s/it]
10it [00:30,  3.04s/it]
10it [00:30,  3.01s/it]
10it [00:29,  2.97s/it]
10it [00:30,  3.02s/it]
10it [00:29,  3.00s/it]
10it [00:29,  2.98s/it]
10it [00:29,  2.98s/it]


Results for Random Forest:
MSE: Avg = 22.0688, Std = 0.2340
RMSE: Avg = 4.6977, Std = 0.0249
MAE: Avg = 3.8090, Std = 0.0224
R²: Avg = 0.4686, Std = 0.0056
Evaluating model: XGBoost


10it [00:20,  2.05s/it]
10it [00:21,  2.11s/it]
10it [00:19,  1.98s/it]
10it [00:20,  2.01s/it]
10it [00:20,  2.02s/it]
10it [00:19,  1.99s/it]
10it [00:19,  1.97s/it]
10it [00:19,  1.92s/it]
10it [00:18,  1.82s/it]
10it [00:18,  1.84s/it]


Results for XGBoost:
MSE: Avg = 24.9814, Std = 0.8072
RMSE: Avg = 4.9975, Std = 0.0811
MAE: Avg = 4.0204, Std = 0.0693
R²: Avg = 0.3984, Std = 0.0194
Evaluating model: SVM


10it [00:01,  7.23it/s]
10it [00:01,  7.37it/s]
10it [00:01,  7.39it/s]
10it [00:01,  7.15it/s]
10it [00:01,  7.07it/s]
10it [00:01,  7.22it/s]
10it [00:01,  7.24it/s]
10it [00:01,  7.13it/s]
10it [00:01,  7.28it/s]
10it [00:01,  7.05it/s]


Results for SVM:
MSE: Avg = 21.7871, Std = 0.5239
RMSE: Avg = 4.6673, Std = 0.0560
MAE: Avg = 3.7002, Std = 0.0505
R²: Avg = 0.4754, Std = 0.0126
Evaluating model: Median Regressor


10it [00:00, 265.67it/s]
10it [00:00, 285.81it/s]
10it [00:00, 276.97it/s]
10it [00:00, 260.04it/s]
10it [00:00, 273.58it/s]
10it [00:00, 263.02it/s]
10it [00:00, 261.17it/s]
10it [00:00, 263.87it/s]
10it [00:00, 269.42it/s]
10it [00:00, 257.07it/s]

Results for Median Regressor:
MSE: Avg = 42.5113, Std = 0.2885
RMSE: Avg = 6.5200, Std = 0.0221
MAE: Avg = 5.1583, Std = 0.0155
R²: Avg = -0.0237, Std = 0.0069

Best model based on R²: Lasso with R² = 0.4817





### Call and Endterm

In [10]:
# Combine the calls frame with the end-of-term stress frame via merge_datasets
# (on 'pid', how='inner'), then drop every row that has a missing value.
df_call_post = merge_datasets(
    [calls_2019_2020, stress_endterm_2019_2020],
    on_column='pid',
    how='inner',
).dropna()
print(df_call_post.shape)

# Keep only the second value returned by perform_rfecv: the reduced frame.
# The reduced calls frame is not passed to any later cell shown in this notebook.
_, df_call_post_selected = perform_rfecv(df_call_post)
print(df_call_post_selected.shape)

(193, 82)
Optimal number of features: 57
Features selected: ['f_call:phone_calls_rapids_missed_count:allday_slope_mean', 'f_call:phone_calls_rapids_missed_count:allday_slope_variance', 'f_call:phone_calls_rapids_missed_count:allday_curvature_mean', 'f_call:phone_calls_rapids_missed_count:allday_curvature_max', 'f_call:phone_calls_rapids_missed_count:allday_absolute_auc', 'f_call:phone_calls_rapids_missed_count:allday_slope', 'f_call:phone_calls_rapids_missed_count:allday_mean', 'f_call:phone_calls_rapids_missed_count:allday_variance', 'f_call:phone_calls_rapids_incoming_count:allday_slope_mean', 'f_call:phone_calls_rapids_incoming_count:allday_curvature_mean', 'f_call:phone_calls_rapids_incoming_count:allday_curvature_max', 'f_call:phone_calls_rapids_incoming_count:allday_total_variation', 'f_call:phone_calls_rapids_incoming_count:allday_absolute_auc', 'f_call:phone_calls_rapids_incoming_count:allday_slope', 'f_call:phone_calls_rapids_incoming_count:allday_mean', 'f_call:phone_calls_ra

In [11]:
#call_post_results = evaluate_multiple_models(df_call_post_selected, models_with_params, random_states)

### Bluetooth and Endterm

In [12]:
# Combine the bluetooth frame with the end-of-term stress frame via merge_datasets
# (on 'pid', how='inner'), then drop every row that has a missing value.
df_bluetooth_post = merge_datasets(
    [bluetooth_2019_2020, stress_endterm_2019_2020],
    on_column='pid',
    how='inner',
).dropna()
print(df_bluetooth_post.shape)

# Keep only the second value returned by perform_rfecv: the reduced frame,
# which the "Sensors and Endterm" cell merges with the other sensor modalities.
_, df_bluetooth_post_selected = perform_rfecv(df_bluetooth_post)
print(df_bluetooth_post_selected.shape)

(333, 32)
Optimal number of features: 10
Features selected: ['f_blue:phone_bluetooth_rapids_countscans:allday_slope_mean', 'f_blue:phone_bluetooth_rapids_countscans:allday_slope', 'f_blue:phone_bluetooth_rapids_countscans:allday_mean', 'f_blue:phone_bluetooth_rapids_uniquedevices:allday_total_variation', 'f_blue:phone_bluetooth_rapids_uniquedevices:allday_slope', 'f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday_slope_mean', 'f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday_curvature_mean', 'f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday_curvature_max', 'f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday_slope', 'f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday_mean']
(333, 12)


In [13]:
#bluetooth_post_results = evaluate_multiple_models(df_bluetooth_post_selected, models_with_params, random_states)

### Phone usage and Endterm

In [14]:
# Combine the screen (phone usage) frame with the end-of-term stress frame via
# merge_datasets (on 'pid', how='inner'), then drop every row that has a missing value.
df_screen_post = merge_datasets(
    [screen_2019_2020, stress_endterm_2019_2020],
    on_column='pid',
    how='inner',
).dropna()
print(df_screen_post.shape)

# Keep only the second value returned by perform_rfecv: the reduced frame,
# which the "Sensors and Endterm" cell merges with the other sensor modalities.
_, df_screen_post_selected = perform_rfecv(df_screen_post)
print(df_screen_post_selected.shape)

(339, 32)
Optimal number of features: 19
Features selected: ['f_screen:phone_screen_rapids_sumdurationunlock:allday_slope_mean', 'f_screen:phone_screen_rapids_sumdurationunlock:allday_slope_variance', 'f_screen:phone_screen_rapids_sumdurationunlock:allday_curvature_mean', 'f_screen:phone_screen_rapids_sumdurationunlock:allday_curvature_max', 'f_screen:phone_screen_rapids_sumdurationunlock:allday_num_inflection_points', 'f_screen:phone_screen_rapids_sumdurationunlock:allday_total_variation', 'f_screen:phone_screen_rapids_sumdurationunlock:allday_absolute_auc', 'f_screen:phone_screen_rapids_sumdurationunlock:allday_slope', 'f_screen:phone_screen_rapids_sumdurationunlock:allday_variance', 'f_screen:phone_screen_rapids_countepisodeunlock:allday_slope_variance', 'f_screen:phone_screen_rapids_countepisodeunlock:allday_curvature_mean', 'f_screen:phone_screen_rapids_countepisodeunlock:allday_curvature_max', 'f_screen:phone_screen_rapids_countepisodeunlock:allday_slope', 'f_screen:phone_screen_

In [15]:
#screen_post_results = evaluate_multiple_models(df_screen_post_selected, models_with_params, random_states)

### Location and Endterm

In [16]:
# Combine the location frame with the end-of-term stress frame via merge_datasets
# (on 'pid', how='inner'), then drop every row that has a missing value.
df_location_post = merge_datasets(
    [location_2019_2020, stress_endterm_2019_2020],
    on_column='pid',
    how='inner',
).dropna()
print(df_location_post.shape)

# Keep only the second value returned by perform_rfecv: the reduced frame,
# which the "Sensors and Endterm" cell merges with the other sensor modalities.
_, df_location_post_selected = perform_rfecv(df_location_post)
print(df_location_post_selected.shape)

(298, 62)
Optimal number of features: 39
Features selected: ['f_loc:phone_locations_barnett_hometime:allday_slope_mean', 'f_loc:phone_locations_barnett_hometime:allday_curvature_mean', 'f_loc:phone_locations_barnett_hometime:allday_num_inflection_points', 'f_loc:phone_locations_barnett_hometime:allday_total_variation', 'f_loc:phone_locations_barnett_hometime:allday_absolute_auc', 'f_loc:phone_locations_barnett_hometime:allday_slope', 'f_loc:phone_locations_barnett_hometime:allday_variance', 'f_loc:phone_locations_barnett_disttravelled:allday_slope_mean', 'f_loc:phone_locations_barnett_disttravelled:allday_curvature_mean', 'f_loc:phone_locations_barnett_disttravelled:allday_num_inflection_points', 'f_loc:phone_locations_barnett_disttravelled:allday_absolute_auc', 'f_loc:phone_locations_barnett_disttravelled:allday_slope', 'f_loc:phone_locations_barnett_rog:allday_num_inflection_points', 'f_loc:phone_locations_barnett_rog:allday_total_variation', 'f_loc:phone_locations_barnett_rog:allday

In [17]:
#location_post_results = evaluate_multiple_models(df_location_post_selected, models_with_params, random_states)

### Sleep and Endterm

In [18]:
# Combine the sleep frame with the end-of-term stress frame via merge_datasets
# (on 'pid', how='inner'), then drop every row that has a missing value.
df_sleep_post = merge_datasets(
    [sleep_2019_2020, stress_endterm_2019_2020],
    on_column='pid',
    how='inner',
).dropna()
print(df_sleep_post.shape)

# Keep only the second value returned by perform_rfecv: the reduced frame,
# which the "Sensors and Endterm" cell merges with the other sensor modalities.
_, df_sleep_post_selected = perform_rfecv(df_sleep_post)
print(df_sleep_post_selected.shape)

(320, 62)
Optimal number of features: 54
Features selected: ['f_slp:fitbit_sleep_intraday_rapids_sumdurationasleepunifiedmain:allday_slope_mean', 'f_slp:fitbit_sleep_intraday_rapids_sumdurationasleepunifiedmain:allday_slope_variance', 'f_slp:fitbit_sleep_intraday_rapids_sumdurationasleepunifiedmain:allday_curvature_mean', 'f_slp:fitbit_sleep_intraday_rapids_sumdurationasleepunifiedmain:allday_curvature_max', 'f_slp:fitbit_sleep_intraday_rapids_sumdurationasleepunifiedmain:allday_num_inflection_points', 'f_slp:fitbit_sleep_intraday_rapids_sumdurationasleepunifiedmain:allday_total_variation', 'f_slp:fitbit_sleep_intraday_rapids_sumdurationasleepunifiedmain:allday_absolute_auc', 'f_slp:fitbit_sleep_intraday_rapids_sumdurationasleepunifiedmain:allday_slope', 'f_slp:fitbit_sleep_intraday_rapids_sumdurationasleepunifiedmain:allday_mean', 'f_slp:fitbit_sleep_intraday_rapids_sumdurationasleepunifiedmain:allday_variance', 'f_slp:fitbit_sleep_intraday_rapids_sumdurationawakeunifiedmain:allday_sl

In [19]:
#sleep_post_results = evaluate_multiple_models(df_sleep_post_selected, models_with_params, random_states)

### Steps and Endterm

In [20]:
# Combine the steps frame with the end-of-term stress frame via merge_datasets
# (on 'pid', how='inner'), then drop every row that has a missing value.
df_steps_post = merge_datasets(
    [steps_2019_2020, stress_endterm_2019_2020],
    on_column='pid',
    how='inner',
).dropna()
print(df_steps_post.shape)

# Keep only the second value returned by perform_rfecv: the reduced frame,
# which the "Sensors and Endterm" cell merges with the other sensor modalities.
_, df_steps_post_selected = perform_rfecv(df_steps_post)
print(df_steps_post_selected.shape)

(329, 52)
Optimal number of features: 37
Features selected: ['f_steps:fitbit_steps_intraday_rapids_sumsteps:allday_num_inflection_points', 'f_steps:fitbit_steps_intraday_rapids_sumsteps:allday_total_variation', 'f_steps:fitbit_steps_intraday_rapids_sumsteps:allday_absolute_auc', 'f_steps:fitbit_steps_intraday_rapids_sumsteps:allday_slope', 'f_steps:fitbit_steps_intraday_rapids_sumsteps:allday_mean', 'f_steps:fitbit_steps_intraday_rapids_sumsteps:allday_variance', 'f_steps:fitbit_steps_intraday_rapids_countepisodesedentarybout:allday_slope_mean', 'f_steps:fitbit_steps_intraday_rapids_countepisodesedentarybout:allday_curvature_mean', 'f_steps:fitbit_steps_intraday_rapids_countepisodesedentarybout:allday_curvature_max', 'f_steps:fitbit_steps_intraday_rapids_countepisodesedentarybout:allday_num_inflection_points', 'f_steps:fitbit_steps_intraday_rapids_countepisodesedentarybout:allday_total_variation', 'f_steps:fitbit_steps_intraday_rapids_countepisodesedentarybout:allday_absolute_auc', 'f_

In [21]:
#steps_post_results = evaluate_multiple_models(df_steps_post_selected, models_with_params, random_states)

### Sensors and Endterm

In [22]:
# Calls data is left out of the sensor merge because of its high missing rate.
# Disabled alternative, kept for reference: merge the raw sensor frames instead of the
# per-modality reduced ones.
#df_sensors_post = merge_datasets([bluetooth_2019_2020, location_2019_2020, screen_2019_2020, sleep_2019_2020, steps_2019_2020, stress_endterm_2019_2020], on_column='pid', how='inner')

# Combine the five reduced sensor frames with the end-of-term stress frame, keyed on both
# 'pid' and 'PSS_10items_POST', then drop every row that has a missing value.
df_sensors_post = merge_datasets(
    [
        df_bluetooth_post_selected,
        df_location_post_selected,
        df_screen_post_selected,
        df_sleep_post_selected,
        df_steps_post_selected,
        stress_endterm_2019_2020,
    ],
    on_column=['pid', 'PSS_10items_POST'],
    how='inner',
).dropna()
print(df_sensors_post.shape)

# Second round of selection on the combined sensor frame; only the reduced frame is kept.
_, df_sensors_post_selected = perform_rfecv(df_sensors_post)
print(df_sensors_post_selected.shape)


(278, 161)
Optimal number of features: 121
Features selected: ['f_blue:phone_bluetooth_rapids_countscans:allday_slope_mean', 'f_blue:phone_bluetooth_rapids_countscans:allday_slope', 'f_blue:phone_bluetooth_rapids_uniquedevices:allday_slope', 'f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday_slope_mean', 'f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday_curvature_mean', 'f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday_curvature_max', 'f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday_slope', 'f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday_mean', 'f_loc:phone_locations_barnett_hometime:allday_slope_mean', 'f_loc:phone_locations_barnett_hometime:allday_curvature_mean', 'f_loc:phone_locations_barnett_hometime:allday_num_inflection_points', 'f_loc:phone_locations_barnett_hometime:allday_total_variation', 'f_loc:phone_locations_barnett_hometime:allday_absolute_auc', 'f_loc:phone_locations_barnett_hometime:allday_slo

In [24]:
# Evaluate every entry of models_with_params on the reduced all-sensor frame.
# NOTE(review): the recorded output for this cell shows an average R² of about -23752 for
# Linear Regression, -1853 for Ridge and -44 for SVM, while Lasso and Random Forest stay
# near 0. Possibly unscaled or collinear features (121 selected features, 278 rows) —
# confirm whether evaluate_multiple_models scales its inputs.
sensors_post_results = evaluate_multiple_models(df_sensors_post_selected, models_with_params, random_states)


Evaluating model: Linear Regression


10it [00:03,  2.91it/s]
10it [00:01,  5.04it/s]
10it [00:02,  4.43it/s]
10it [00:02,  4.06it/s]
10it [00:04,  2.12it/s]
10it [00:01,  5.43it/s]
10it [00:02,  3.66it/s]
10it [00:04,  2.41it/s]
10it [00:02,  4.06it/s]
10it [00:03,  3.06it/s]


Results for Linear Regression:
MSE: Avg = 1018133.2654, Std = 456650.7606
RMSE: Avg = 980.9663, Std = 236.3014
MAE: Avg = 87.6349, Std = 18.8101
R²: Avg = -23751.8746, Std = 10653.5840
Evaluating model: Lasso


  model = cd_fast.enet_coordinate_descent(
10it [00:27,  2.77s/it]
10it [00:27,  2.74s/it]
10it [00:27,  2.80s/it]
10it [00:22,  2.25s/it]
  model = cd_fast.enet_coordinate_descent(
10it [00:30,  3.10s/it]
10it [00:17,  1.74s/it]
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
10it [00:25,  2.60s/it]
10it [00:17,  1.77s/it]
10it [00:16,  1.63s/it]
10it [00:19,  1.96s/it]


Results for Lasso:
MSE: Avg = 41.2907, Std = 0.3047
RMSE: Avg = 6.4257, Std = 0.0237
MAE: Avg = 5.1247, Std = 0.0252
R²: Avg = 0.0367, Std = 0.0071
Evaluating model: Ridge


10it [01:21,  8.17s/it]
10it [01:26,  8.61s/it]
10it [01:28,  8.81s/it]
10it [01:26,  8.68s/it]
10it [01:23,  8.32s/it]
10it [01:26,  8.60s/it]
10it [01:19,  7.95s/it]
10it [01:13,  7.40s/it]
10it [01:13,  7.37s/it]
10it [01:17,  7.71s/it]


Results for Ridge:
MSE: Avg = 79472.9452, Std = 36911.9853
RMSE: Avg = 272.0028, Std = 74.0772
MAE: Avg = 25.3684, Std = 4.8813
R²: Avg = -1853.0902, Std = 861.1503
Evaluating model: Random Forest


10it [01:16,  7.67s/it]
10it [01:15,  7.57s/it]
10it [01:15,  7.53s/it]
10it [01:15,  7.56s/it]
10it [01:15,  7.52s/it]
10it [01:14,  7.48s/it]
10it [01:12,  7.30s/it]
10it [01:11,  7.16s/it]
10it [01:16,  7.61s/it]
10it [01:15,  7.52s/it]


Results for Random Forest:
MSE: Avg = 41.8946, Std = 0.6084
RMSE: Avg = 6.4724, Std = 0.0470
MAE: Avg = 5.1702, Std = 0.0477
R²: Avg = 0.0226, Std = 0.0142
Evaluating model: XGBoost


10it [00:30,  3.05s/it]
10it [00:31,  3.20s/it]
10it [00:30,  3.04s/it]
10it [00:29,  2.97s/it]
10it [00:31,  3.10s/it]
10it [00:30,  3.03s/it]
10it [00:29,  2.99s/it]
10it [00:30,  3.06s/it]
10it [00:30,  3.01s/it]
10it [00:32,  3.23s/it]


Results for XGBoost:
MSE: Avg = 48.7364, Std = 1.7825
RMSE: Avg = 6.9800, Std = 0.1287
MAE: Avg = 5.6837, Std = 0.1055
R²: Avg = -0.1370, Std = 0.0416
Evaluating model: SVM


10it [00:02,  4.84it/s]
10it [00:01,  5.13it/s]
10it [00:01,  5.23it/s]
10it [00:01,  5.28it/s]
10it [00:01,  5.16it/s]
10it [00:01,  5.27it/s]
10it [00:01,  5.24it/s]
10it [00:01,  5.20it/s]
10it [00:01,  5.13it/s]
10it [00:01,  5.18it/s]


Results for SVM:
MSE: Avg = 1911.2877, Std = 2143.2278
RMSE: Avg = 33.7792, Std = 27.7534
MAE: Avg = 7.3324, Std = 2.0170
R²: Avg = -43.5900, Std = 50.0011
Evaluating model: Median Regressor


10it [00:00, 222.40it/s]
10it [00:00, 246.94it/s]
10it [00:00, 255.85it/s]
10it [00:00, 252.33it/s]
10it [00:00, 255.17it/s]
10it [00:00, 255.12it/s]
10it [00:00, 257.37it/s]
10it [00:00, 252.30it/s]
10it [00:00, 257.82it/s]
10it [00:00, 257.47it/s]

Results for Median Regressor:
MSE: Avg = 42.8813, Std = 0.0000
RMSE: Avg = 6.5484, Std = 0.0000
MAE: Avg = 5.1763, Std = 0.0000
R²: Avg = -0.0004, Std = 0.0000

Best model based on R²: Lasso with R² = 0.0367





### EMA, Sensors, and Endterm

In [25]:
# Calls data is left out here because of its high missing rate.
# Disabled alternative, kept for reference: merge the raw frames instead of the reduced
# ones. It refers to `ema_2019_2020_transformed`, which is not defined in the cells shown.
#df_ema_sensors_post = merge_datasets([ema_2019_2020_transformed, bluetooth_2019_2020, location_2019_2020, screen_2019_2020, sleep_2019_2020, steps_2019_2020, stress_endterm_2019_2020], on_column='pid', how='inner')

# Combine the reduced EMA frame and the reduced all-sensor frame with the end-of-term
# stress frame, keyed on both 'pid' and 'PSS_10items_POST', then drop incomplete rows.
df_ema_sensors_post = merge_datasets(
    [df_ema_post_selected, df_sensors_post_selected, stress_endterm_2019_2020],
    on_column=['pid', 'PSS_10items_POST'],
    how='inner',
).dropna()
print(df_ema_sensors_post.shape)

# Another round of selection on the combined frame; only the reduced frame is kept.
_, df_ema_sensors_post_selected = perform_rfecv(df_ema_sensors_post)
print(df_ema_sensors_post_selected.shape)


(221, 165)
Optimal number of features: 124
Features selected: ['phq4_EMA_mean', 'phq4_EMA_median', 'phq4_EMA_max', 'phq4_EMA_min', 'phq4_EMA_std', 'phq4_EMA_skew', 'phq4_EMA_kurt', 'phq4_EMA_autocorr', 'phq4_EMA_rmsd', 'phq4_anxiety_EMA_mean', 'phq4_anxiety_EMA_std', 'phq4_anxiety_EMA_skew', 'phq4_anxiety_EMA_autocorr', 'phq4_anxiety_EMA_rmsd', 'phq4_depression_EMA_mean', 'phq4_depression_EMA_std', 'phq4_depression_EMA_skew', 'phq4_depression_EMA_iqr', 'phq4_depression_EMA_autocorr', 'phq4_depression_EMA_rmsd', 'positive_affect_EMA_mean', 'positive_affect_EMA_median', 'positive_affect_EMA_max', 'positive_affect_EMA_min', 'positive_affect_EMA_std', 'positive_affect_EMA_skew', 'positive_affect_EMA_kurt', 'positive_affect_EMA_autocorr', 'positive_affect_EMA_rmsd', 'negative_affect_EMA_mean', 'negative_affect_EMA_median', 'negative_affect_EMA_max', 'negative_affect_EMA_std', 'negative_affect_EMA_skew', 'negative_affect_EMA_kurt', 'negative_affect_EMA_iqr', 'negative_affect_EMA_autocorr', '

In [27]:
# Evaluate every entry of models_with_params on the reduced EMA + sensor frame.
# NOTE(review): the recorded run of this cell ends in KeyboardInterrupt while Lasso was being
# evaluated (after repeated cd_fast.enet_coordinate_descent warnings), so
# ema_sensors_post_results was not assigned in that run.
ema_sensors_post_results = evaluate_multiple_models(df_ema_sensors_post_selected, models_with_params, random_states)


Evaluating model: Linear Regression


10it [00:00, 11.54it/s]
10it [00:00, 11.59it/s]
10it [00:02,  4.92it/s]
10it [00:00, 10.14it/s]
10it [00:00, 10.93it/s]
10it [00:02,  4.73it/s]
10it [00:01,  8.70it/s]
10it [00:01,  7.94it/s]
10it [00:01,  7.37it/s]
10it [00:01,  8.19it/s]


Results for Linear Regression:
MSE: Avg = 227872.3767, Std = 55714.7662
RMSE: Avg = 473.1123, Std = 63.5381
MAE: Avg = 59.1241, Std = 7.8507
R²: Avg = -5749.6411, Std = 1406.0310
Evaluating model: Lasso


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
10it [00:35,  3.51s/it]
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
10it [00:35,  3.50s/it]
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
10it [00:27,  2.71s/it]
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_co

KeyboardInterrupt: 

### Pre, EMA, Sensors, and Endterm

In [None]:
# Calls data is left out here because of its high missing rate.
# Several ways of assembling the full frame were tried; the disabled ones are kept below
# for reference. The first refers to `ema_2019_2020_transformed`, which is not defined in
# the cells shown in this notebook.
#df_pre_ema_sensors_post = merge_datasets([pre_2019_2020, ema_2019_2020_transformed, bluetooth_2019_2020, location_2019_2020, screen_2019_2020, sleep_2019_2020, steps_2019_2020, stress_endterm_2019_2020], on_column='pid', how='inner')

#df_pre_ema_sensors_post = merge_datasets([df_pre_post_selected, df_ema_post_selected, df_sensors_post_selected, stress_endterm_2019_2020], on_column=['pid', 'PSS_10items_POST'], how='inner')

#df_pre_ema_sensors_post = merge_datasets([df_pre_ema_post_selected, df_sensors_post_selected, stress_endterm_2019_2020], on_column=['pid', 'PSS_10items_POST'], how='inner')

#df_pre_ema_sensors_post = df_pre_ema_sensors_post.drop(columns = ['CESD_10items_PRE', 'BYAACQ_PRE'])

# Active variant: reduced pre-term frame + reduced EMA/sensor frame + end-of-term stress
# frame, keyed on both 'pid' and 'PSS_10items_POST', with incomplete rows dropped.
df_pre_ema_sensors_post = merge_datasets(
    [df_pre_post_selected, df_ema_sensors_post_selected, stress_endterm_2019_2020],
    on_column=['pid', 'PSS_10items_POST'],
    how='inner',
).dropna()
print(df_pre_ema_sensors_post.shape)

# Final round of selection; only the reduced frame is kept.
_, df_pre_ema_sensors_post_selected = perform_rfecv(df_pre_ema_sensors_post)
print(df_pre_ema_sensors_post_selected.shape)


In [None]:
# Display the column index of the final reduced frame (includes whatever non-feature
# columns perform_rfecv kept, e.g. the 'pid' and 'PSS_10items_POST' columns dropped later).
df_pre_ema_sensors_post_selected.columns

In [None]:

# Evaluate every entry of models_with_params on the reduced pre-term + EMA + sensor frame.
# The name pre_ema_sensors_post_results is assigned again by the final train/test cell
# at the end of the notebook.
pre_ema_sensors_post_results = evaluate_multiple_models(df_pre_ema_sensors_post_selected, models_with_params, random_states)


Best model based on R²: Ridge with R² = 0.4746

### SHAP analysis of the best model on the best dataset

### Fitting the Final model: 2019_2020 train and 2021 test

In [None]:
# Frames that make up the 2021 table: pre-term, EMA, five sensor modalities and the
# end-of-term stress frame. calls_2021 is not part of this list.
sources_2021 = [
    pre_2021,
    ema_2021,
    bluetooth_2021,
    location_2021,
    screen_2021,
    sleep_2021,
    steps_2021,
    stress_endterm_2021,
]

# Merge them via merge_datasets (on 'pid', how='inner'). No rows are dropped here;
# missing values are handled when the test frame is built.
df_2021 = merge_datasets(sources_2021, on_column='pid', how='inner')

In [None]:
# Shorter labels for two long sensor feature columns, applied to both the 2019-2020
# training frame and the 2021 frame so their column names stay aligned.
feature_rename = {
    'f_slp:fitbit_sleep_intraday_rapids_sumdurationasleepunifiedmain:allday_mean': 'sumduration_main_sleep_mean',
    # NOTE(review): the source column is an `allday_slope` feature, yet the new label ends
    # in `_mean` — confirm whether the key or the label is the one that is wrong.
    'f_loc:phone_locations_barnett_circanrtn:allday_slope': 'circadianroutine_mean'
}

# Rebind the names to renamed copies instead of renaming in place: the frames were produced
# (and displayed) by earlier cells, and mutating them there would silently change any other
# reference to the same objects. Columns absent from a frame are ignored by rename().
df_pre_ema_sensors_post_selected = df_pre_ema_sensors_post_selected.rename(columns=feature_rename)
df_2021 = df_2021.rename(columns=feature_rename)

In [None]:
# The whole reduced 2019-2020 frame serves as training data.
df_train = df_pre_ema_sensors_post_selected

# Target: the end-of-term PSS-10 column. Features: everything except the target and 'pid'.
y_train = df_train['PSS_10items_POST']
X_train = df_train.drop(['PSS_10items_POST', 'pid'], axis=1)

In [None]:
# Restrict the 2021 frame to exactly the columns of the training frame (same order),
# then drop every row that has a missing value in any of them.
selected_columns = df_pre_ema_sensors_post_selected.columns.to_list()
df_test = df_2021[selected_columns].dropna()

# Target: the end-of-term PSS-10 column. Features: everything except the target and 'pid'.
y_test = df_test['PSS_10items_POST']
X_test = df_test.drop(['PSS_10items_POST', 'pid'], axis=1)

In [None]:
# Star import placed mid-notebook; train_and_evaluate_model presumably comes from it.
# NOTE(review): any name it exports silently shadows same-named objects defined earlier.
from evaluate_final_model import *
# Train on the 2019-2020 frame and evaluate on the 2021 frame with a random forest and a
# max_depth grid of 3/5/7.
# NOTE(review): the recorded summary above reports Ridge as the best model by R², but the
# model fitted here is RandomForestRegressor(random_state=42) — confirm this is intended.
# NOTE(review): this reassigns pre_ema_sensors_post_results, replacing the value returned
# by the earlier evaluate_multiple_models call.
pre_ema_sensors_post_results = train_and_evaluate_model(X_train, y_train, X_test, y_test, RandomForestRegressor(random_state=42), {'max_depth': [3, 5, 7]})
