# Dataset Exploration

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import scipy.stats
import warnings
from HelperFunctions import *

# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)
# Remove pandas' display limits: show every row and every column of a printed DataFrame.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Pre-Survey Data

### 2018, 2019, 2020, 2021

In [2]:
# Load the pre-survey responses of every study year, print each frame's shape and
# save one correlation heatmap per year through plot_correlation_matrix.
_pre_survey_files = {
    2018: "Datasets/INS-W_1/SurveyData/pre.csv",
    2019: "Datasets/INS-W_2/SurveyData/pre.csv",
    2020: "Datasets/INS-W_3/SurveyData/pre.csv",
    2021: "Datasets/INS-W_4/SurveyData/pre.csv",
}

_pre_by_year = {}
for _pre_year, _pre_path in _pre_survey_files.items():
    _pre_by_year[_pre_year] = pd.read_csv(_pre_path)
    # Uniform "PRE_<year>: <shape>" label (the 2021 label previously lacked the space).
    print(f"PRE_{_pre_year}: {_pre_by_year[_pre_year].shape}")
    plot_correlation_matrix(
        _pre_by_year[_pre_year],
        exclude_columns=['Unnamed: 0', 'pid', 'date'],
        plot_name=f"correlation_heatmap_pre_{_pre_year}",
    )

# Keep the original per-year variable names available.
pre_2018, pre_2019, pre_2020, pre_2021 = (_pre_by_year[y] for y in (2018, 2019, 2020, 2021))

# The 2021 frame is written out exactly as loaded.
pre_2021.to_csv('Cleaned_Datasets/2021/pre_2021.csv', index=False)

### Combining the 2019 and 2020 datasets to be used for model development
# 'CESD_10items_PRE' and 'BYAACQ_PRE' are dropped because of their high missing rates;
# 'Unnamed: 0' and 'date' are dropped as well.
pre_2019_2020 = pd.concat([pre_2019, pre_2020], axis=0, ignore_index=True).drop(
    columns=['Unnamed: 0', 'date', 'CESD_10items_PRE', 'BYAACQ_PRE']
)
print(f"PRE_2019_2020: {pre_2019_2020.shape}")
plot_correlation_matrix(pre_2019_2020, exclude_columns=['pid'], plot_name="correlation_heatmap_pre_2019_2020")
pre_2019_2020.to_csv('Cleaned_Datasets/2019_2020/pre_2019_2020.csv', index=False)


PRE_2018: (193, 23)
Plot saved as Correlation_Plots/correlation_heatmap_pre_2018.html
PRE_2019: (253, 31)
Plot saved as Correlation_Plots/correlation_heatmap_pre_2019.html
PRE_2020: (147, 32)
Plot saved as Correlation_Plots/correlation_heatmap_pre_2020.html
PRE_2021:(209, 32)
Plot saved as Correlation_Plots/correlation_heatmap_pre_2021.html
PRE_2019_2020: (400, 28)
Plot saved as Correlation_Plots/correlation_heatmap_pre_2019_2020.html


In [3]:
# Disabled leftover: cell In [2] already drops 'Unnamed: 0' and 'date' from pre_2019_2020
# and writes Cleaned_Datasets/2019_2020/pre_2019_2020.csv, so these lines are not needed.
# pre_2019_2020 = pre_2019_2020.drop(columns = ['Unnamed: 0', 'date'])
# pre_2019_2020.info()
# pre_2019_2020.to_csv('Cleaned_Datasets/2019_2020/pre_2019_2020.csv', index=False)


## EMA Data

### 2018, 2019, 2020, 2021

In [4]:
# Load the raw EMA file of every study year, clean it with process_ema_data and show
# the structure of each cleaned frame.
_ema_files = {
    2018: "Datasets/INS-W_1/SurveyData/ema.csv",
    2019: "Datasets/INS-W_2/SurveyData/ema.csv",
    2020: "Datasets/INS-W_3/SurveyData/ema.csv",
    2021: "Datasets/INS-W_4/SurveyData/ema.csv",
}

_ema_raw_by_year = {}
_ema_cleaned_by_year = {}
for _ema_year, _ema_path in _ema_files.items():
    _ema_raw_by_year[_ema_year] = pd.read_csv(_ema_path)
    _ema_cleaned_by_year[_ema_year] = process_ema_data(
        _ema_raw_by_year[_ema_year], start_dates, date_column='date'
    )
    # DataFrame.info() prints its report itself and returns None, so it must not be
    # interpolated into an f-string (that printed "EMA_<year>: None").
    print(f"EMA_{_ema_year}:")
    _ema_cleaned_by_year[_ema_year].info()
    print("\n")

# Keep the original per-year variable names available.
ema_2018, ema_2019, ema_2020, ema_2021 = (_ema_raw_by_year[y] for y in (2018, 2019, 2020, 2021))
ema_2018_cleaned, ema_2019_cleaned, ema_2020_cleaned, ema_2021_cleaned = (
    _ema_cleaned_by_year[y] for y in (2018, 2019, 2020, 2021)
)

### Combining the 2019 and 2020 datasets to be used for model development
ema_2019_2020 = pd.concat([ema_2019_cleaned, ema_2020_cleaned], axis=0, ignore_index=True)
print("EMA_2019_2020:")
ema_2019_2020.info()
# Saved under an EMA-specific name: the previous name "correlation_heatmap_pre_2019_2020"
# is the one used for the pre-survey heatmap, so this plot overwrote that file.
plot_correlation_matrix(ema_2019_2020, exclude_columns=['pid', 'sequential_week', 'week_label'], plot_name="correlation_heatmap_ema_2019_2020")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1873 entries, 0 to 1872
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pid                  1873 non-null   object 
 1   week_label           1873 non-null   object 
 2   sequential_week      1873 non-null   int64  
 3   negative_affect_EMA  1873 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 58.7+ KB
EMA_2018: None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2362 entries, 0 to 2361
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pid                  2362 non-null   object 
 1   week_label           2362 non-null   object 
 2   sequential_week      2362 non-null   int64  
 3   phq4_EMA             2066 non-null   float64
 4   phq4_anxiety_EMA     2067 non-null   float64
 5   phq4_depression_EMA  2066 non-null   float64
 6   pss

In [5]:
# Build the transformed EMA tables for the combined 2019+2020 data and for 2021,
# then write both to Cleaned_Datasets.
#
# 'pss4_EMA' is removed before the transformation. errors='ignore' keeps this cell
# re-runnable: the frames are rebound to their own result, so on a second run the
# column is already gone and a plain drop would raise KeyError.
# NOTE(review): the saved output shows many "invalid value encountered in divide"
# warnings; presumably they come from statistics computed inside
# transform_ema_dataset on constant or very short series - confirm.
ema_2019_2020 = ema_2019_2020.drop(columns=['pss4_EMA'], errors='ignore')
ema_2019_2020_transformed = transform_ema_dataset(ema_2019_2020, min_data_ratio=0.1)
# DataFrame.info() prints its report and returns None; wrapping it in print() only
# added a stray "None" line to the output.
ema_2019_2020_transformed.info()
print(ema_2019_2020_transformed.shape)

ema_2019_2020_transformed.to_csv('Cleaned_Datasets/2019_2020/ema_2019_2020.csv', index=False)

ema_2021_cleaned = ema_2021_cleaned.drop(columns=['pss4_EMA'], errors='ignore')
ema_2021_transformed = transform_ema_dataset(ema_2021_cleaned, min_data_ratio=0.1)
ema_2021_transformed.info()
print(ema_2021_transformed.shape)

ema_2021_transformed.to_csv('Cleaned_Datasets/2021/ema_2021.csv', index=False)


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid val

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 51 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pid                           397 non-null    object 
 1   phq4_EMA_mean                 395 non-null    float64
 2   phq4_EMA_median               395 non-null    float64
 3   phq4_EMA_max                  395 non-null    float64
 4   phq4_EMA_min                  395 non-null    float64
 5   phq4_EMA_std                  393 non-null    float64
 6   phq4_EMA_skew                 389 non-null    float64
 7   phq4_EMA_kurt                 388 non-null    float64
 8   phq4_EMA_iqr                  395 non-null    float64
 9   phq4_EMA_autocorr             370 non-null    float64
 10  phq4_EMA_rmsd                 393 non-null    float64
 11  phq4_anxiety_EMA_mean         395 non-null    float64
 12  phq4_anxiety_EMA_median       395 non-null    float64
 13  phq4_


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid val

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 51 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pid                           209 non-null    object 
 1   phq4_EMA_mean                 209 non-null    float64
 2   phq4_EMA_median               209 non-null    float64
 3   phq4_EMA_max                  209 non-null    float64
 4   phq4_EMA_min                  209 non-null    float64
 5   phq4_EMA_std                  207 non-null    float64
 6   phq4_EMA_skew                 206 non-null    float64
 7   phq4_EMA_kurt                 205 non-null    float64
 8   phq4_EMA_iqr                  209 non-null    float64
 9   phq4_EMA_autocorr             192 non-null    float64
 10  phq4_EMA_rmsd                 207 non-null    float64
 11  phq4_anxiety_EMA_mean         209 non-null    float64
 12  phq4_anxiety_EMA_median       209 non-null    float64
 13  phq4_


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid val

## Post Dataset

### 2018, 2019, 2020, 2021

In [6]:
# For every study year: load the post-survey file, save its correlation heatmap,
# extract the end-of-term stress target (pid, date, PSS_10items_POST), print its
# summary statistics and value counts (NaN included), and write it to Cleaned_Datasets.
# The four per-year stanzas were identical apart from the year, so they run as one loop.
_post_survey_files = {
    2018: "Datasets/INS-W_1/SurveyData/post.csv",
    2019: "Datasets/INS-W_2/SurveyData/post.csv",
    2020: "Datasets/INS-W_3/SurveyData/post.csv",
    2021: "Datasets/INS-W_4/SurveyData/post.csv",
}

_post_by_year = {}
_stress_by_year = {}
for _post_year, _post_path in _post_survey_files.items():
    print(f"POST_{_post_year}")
    _post_by_year[_post_year] = pd.read_csv(_post_path)
    plot_correlation_matrix(
        _post_by_year[_post_year],
        exclude_columns=['Unnamed: 0', 'pid', 'date'],
        plot_name=f"correlation_heatmap_post_{_post_year}",
    )
    _stress_by_year[_post_year] = _post_by_year[_post_year][['pid', 'date', 'PSS_10items_POST']]
    print(_stress_by_year[_post_year]['PSS_10items_POST'].describe())
    print(_stress_by_year[_post_year]['PSS_10items_POST'].value_counts(dropna=False))
    _stress_by_year[_post_year].to_csv(
        f'Cleaned_Datasets/{_post_year}/stress_endterm_{_post_year}.csv', index=False
    )
    print("\n")

# Keep the original per-year variable names available.
post_2018, post_2019, post_2020, post_2021 = (_post_by_year[y] for y in (2018, 2019, 2020, 2021))
stress_endterm_2018, stress_endterm_2019, stress_endterm_2020, stress_endterm_2021 = (
    _stress_by_year[y] for y in (2018, 2019, 2020, 2021)
)

# Combined 2019+2020 target table; the 'date' column is dropped from it.
print("POST_2019_2020")
stress_endterm_2019_2020 = pd.concat(
    [stress_endterm_2019, stress_endterm_2020], axis=0, ignore_index=True
).drop(columns=['date'])
stress_endterm_2019_2020.info()
stress_endterm_2019_2020.to_csv('Cleaned_Datasets/2019_2020/stress_endterm_2019_2020.csv', index=False)
print("\n")

POST_2018
Plot saved as Correlation_Plots/correlation_heatmap_post_2018.html
count    175.000000
mean      18.977143
std        6.364371
min        6.000000
25%       14.000000
50%       18.000000
75%       23.000000
max       38.000000
Name: PSS_10items_POST, dtype: float64
PSS_10items_POST
18.0    16
22.0    14
21.0    11
13.0    11
20.0    10
17.0    10
23.0    10
15.0     9
16.0     8
24.0     7
12.0     7
14.0     7
25.0     6
11.0     6
29.0     5
19.0     5
6.0      4
8.0      4
30.0     3
28.0     3
10.0     3
27.0     3
31.0     2
32.0     2
33.0     2
26.0     2
9.0      2
NaN      1
34.0     1
38.0     1
7.0      1
Name: count, dtype: int64


POST_2019
Plot saved as Correlation_Plots/correlation_heatmap_post_2019.html
count    229.000000
mean      18.620087
std        6.813927
min        2.000000
25%       14.000000
50%       19.000000
75%       24.000000
max       34.000000
Name: PSS_10items_POST, dtype: float64
PSS_10items_POST
18.0    22
19.0    17
20.0    14
25.0    11
2

In [7]:
# Disabled leftover: the same three steps (drop 'date', info(), to_csv) are already
# executed at the end of cell In [6], so these lines are not needed.
# stress_endterm_2019_2020 = stress_endterm_2019_2020.drop(columns = ['date'])
# stress_endterm_2019_2020.info()
# stress_endterm_2019_2020.to_csv('Cleaned_Datasets/2019_2020/stress_endterm_2019_2020.csv', index=False)


## Wearable Data

Notes:
Each wearable data feature is reported for the following time segments:
1) morning (6 am - 12 pm)
2) afternoon (12 pm - 6 pm)
3) evening (6 pm - 12 am)
4) night (12 am - 6 am)
5) all day
6) 7-day history
7) 14-day history
8) weekday
9) weekend

All numeric features have two extra versions - in addition to their original state:
1) normalized (each participant's median is subtracted, then the result is divided by that participant's 5-95 quantile range)
2) discretized (low/medium/high split by 33/66 quantile of each participant's feature value)

In [8]:
from ssa_features import *

#### Calls

For calls, we are interested in the following daily semantic features:
- Total number of missed calls - 'f_call:phone_calls_rapids_missed_count:allday'
- Total number of incoming calls - 'f_call:phone_calls_rapids_incoming_count:allday'
- Total number of outgoing calls - 'f_call:phone_calls_rapids_outgoing_count:allday'
- Duration of incoming calls (seconds) - 'f_call:phone_calls_rapids_incoming_sumduration:allday'
- Duration of outgoing calls (seconds) - 'f_call:phone_calls_rapids_outgoing_sumduration:allday'
- Total distinct missed contacts - 'f_call:phone_calls_rapids_missed_distinctcontacts:allday'
- Total distinct incoming contacts - 'f_call:phone_calls_rapids_incoming_distinctcontacts:allday'
- Total distinct outgoing contacts - 'f_call:phone_calls_rapids_outgoing_distinctcontacts:allday'

These are features we can create from existing ones:
- Total number of all calls - Total number of missed calls + Total number of incoming calls + Total number of outgoing calls
- Duration of all calls (seconds) - Duration of incoming calls (seconds) + Duration of outgoing calls (seconds)

In [9]:
# Call feature file of each study year. Each file is processed with
# process_call_data_reduced; process_call_data takes the same (path, year)
# arguments and is currently not used here.
_call_files = {
    "2018": "Datasets/INS-W_1/FeatureData/call.csv",
    "2019": "Datasets/INS-W_2/FeatureData/call.csv",
    "2020": "Datasets/INS-W_3/FeatureData/call.csv",
    "2021": "Datasets/INS-W_4/FeatureData/call.csv",
}
_call_outputs = {
    _year: process_call_data_reduced(_csv, _year) for _year, _csv in _call_files.items()
}
# Each result unpacks into (data, missing, summary).
call_2018_data, missing_2018, summary_2018 = _call_outputs["2018"]
call_2019_data, missing_2019, summary_2019 = _call_outputs["2019"]
call_2020_data, missing_2020, summary_2020 = _call_outputs["2020"]
call_2021_data, missing_2021, summary_2021 = _call_outputs["2021"]

# Identical settings for every ssa_first_component_and_plot call in this cell.
_ssa_kwargs = dict(window_size=7, smoothing_factor=5, threshold=0.8, plot=False)
calls_2018_ssa, calls_2019_ssa, calls_2020_ssa, calls_2021_ssa = [
    ssa_first_component_and_plot(_daily, **_ssa_kwargs)
    for _daily in (call_2018_data, call_2019_data, call_2020_data, call_2021_data)
]
calls_2021_ssa.to_csv('Cleaned_Datasets/2021/calls_2021_ssa.csv', index=False)

# 2019 and 2020 are stacked into one table and saved together.
calls_2019_2020 = pd.concat(
    [calls_2019_ssa, calls_2020_ssa],
    axis=0,
    ignore_index=True,
)
print(calls_2019_2020.shape)
calls_2019_2020.to_csv('Cleaned_Datasets/2019_2020/calls_2019_2020.csv', index=False)


(11005, 786)
date                                                          0.000000
f_call:phone_calls_rapids_missed_count:allday                 0.642799
f_call:phone_calls_rapids_incoming_count:allday               0.642799
f_call:phone_calls_rapids_outgoing_count:allday               0.642799
f_call:phone_calls_rapids_incoming_sumduration:allday         0.782190
f_call:phone_calls_rapids_outgoing_sumduration:allday         0.724671
f_call:phone_calls_rapids_missed_distinctcontacts:allday      0.642799
f_call:phone_calls_rapids_incoming_distinctcontacts:allday    0.642799
f_call:phone_calls_rapids_outgoing_distinctcontacts:allday    0.642799
Name: mean, dtype: float64
Plot saved as Missingness_Plots_Sensor_Data/Total_Duration_of_Incoming_Calls_2018_-_10_Weeks.png
Plot saved as Time_Series_Plots/Duration of Incoming Calls 2018.png


(15478, 786)
date                                                          0.000000
f_call:phone_calls_rapids_missed_count:allday                 0.519770



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



Processing feature: f_call:phone_calls_rapids_outgoing_sumduration:allday
Initial shape: (218, 71)
Filtered shape: (145, 71)
Processing feature: f_call:phone_calls_rapids_missed_distinctcontacts:allday
Initial shape: (218, 71)
Filtered shape: (182, 71)
Processing feature: f_call:phone_calls_rapids_incoming_distinctcontacts:allday
Initial shape: (218, 71)
Filtered shape: (182, 71)
Processing feature: f_call:phone_calls_rapids_outgoing_distinctcontacts:allday
Initial shape: (218, 71)
Filtered shape: (182, 71)
Final shape: (218, 81)
Processing feature: f_call:phone_calls_rapids_missed_count:allday
Initial shape: (137, 71)
Filtered shape: (121, 71)
Processing feature: f_call:phone_calls_rapids_incoming_count:allday
Initial shape: (137, 71)
Filtered shape: (121, 71)
Processing feature: f_call:phone_calls_rapids_outgoing_count:allday
Initial shape: (137, 71)
Filtered shape: (121, 71)
Processing feature: f_call:phone_calls_rapids_incoming_sumduration:allday
Initial shape: (137, 71)
Filtered s



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reac

Processing feature: f_call:phone_calls_rapids_missed_distinctcontacts:allday
Initial shape: (137, 71)
Filtered shape: (121, 71)
Processing feature: f_call:phone_calls_rapids_incoming_distinctcontacts:allday
Initial shape: (137, 71)
Filtered shape: (121, 71)
Processing feature: f_call:phone_calls_rapids_outgoing_distinctcontacts:allday
Initial shape: (137, 71)
Filtered shape: (121, 71)
Final shape: (137, 81)
Processing feature: f_call:phone_calls_rapids_missed_count:allday
Initial shape: (195, 71)
Filtered shape: (181, 71)
Processing feature: f_call:phone_calls_rapids_incoming_count:allday
Initial shape: (195, 71)
Filtered shape: (181, 71)
Processing feature: f_call:phone_calls_rapids_outgoing_count:allday
Initial shape: (195, 71)
Filtered shape: (181, 71)
Processing feature: f_call:phone_calls_rapids_incoming_sumduration:allday
Initial shape: (195, 71)
Filtered shape: (155, 71)




The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reac

Processing feature: f_call:phone_calls_rapids_outgoing_sumduration:allday
Initial shape: (195, 71)
Filtered shape: (158, 71)




The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



Processing feature: f_call:phone_calls_rapids_missed_distinctcontacts:allday
Initial shape: (195, 71)
Filtered shape: (181, 71)
Processing feature: f_call:phone_calls_rapids_incoming_distinctcontacts:allday
Initial shape: (195, 71)
Filtered shape: (181, 71)
Processing feature: f_call:phone_calls_rapids_outgoing_distinctcontacts:allday
Initial shape: (195, 71)
Filtered shape: (181, 71)
Final shape: (195, 81)
(355, 81)


#### Bluetooth

For Bluetooth, we are interested in the following daily semantic features:
- Total number of scans - 'f_blue:phone_bluetooth_rapids_countscans:allday'
- Total number of unique scanned devices - 'f_blue:phone_bluetooth_rapids_uniquedevices:allday'
- Number of scans of the most frequently scanned unique device - 'f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday'

In [10]:
# Disabled direct reads of the raw Bluetooth feature files; the next cell passes the
# same file paths to process_bluetooth_data_reduced instead.
# bluetooth_2018 = pd.read_csv("Datasets/INS-W_1/FeatureData/bluetooth.csv", low_memory=False)
# bluetooth_2019 = pd.read_csv("Datasets/INS-W_2/FeatureData/bluetooth.csv", low_memory=False)
# bluetooth_2020 = pd.read_csv("Datasets/INS-W_3/FeatureData/bluetooth.csv", low_memory=False)
# bluetooth_2021 = pd.read_csv("Datasets/INS-W_4/FeatureData/bluetooth.csv", low_memory=False)

In [11]:
# Bluetooth feature file of each study year. Each file is processed with
# process_bluetooth_data_reduced; process_bluetooth_data takes the same (path, year)
# arguments and is currently not used here.
_bluetooth_files = {
    "2018": "Datasets/INS-W_1/FeatureData/bluetooth.csv",
    "2019": "Datasets/INS-W_2/FeatureData/bluetooth.csv",
    "2020": "Datasets/INS-W_3/FeatureData/bluetooth.csv",
    "2021": "Datasets/INS-W_4/FeatureData/bluetooth.csv",
}
_bluetooth_outputs = {
    _year: process_bluetooth_data_reduced(_csv, _year) for _year, _csv in _bluetooth_files.items()
}
# Each result unpacks into (data, missing, summary).
bluetooth_2018_data, missing_2018, summary_2018 = _bluetooth_outputs["2018"]
bluetooth_2019_data, missing_2019, summary_2019 = _bluetooth_outputs["2019"]
bluetooth_2020_data, missing_2020, summary_2020 = _bluetooth_outputs["2020"]
bluetooth_2021_data, missing_2021, summary_2021 = _bluetooth_outputs["2021"]

# Save the processed per-year frames before the SSA step.
for _frame, _target in (
    (bluetooth_2018_data, 'Cleaned_Datasets/2018/bluetooth_2018_data.csv'),
    (bluetooth_2019_data, 'Cleaned_Datasets/2019/bluetooth_2019_data.csv'),
    (bluetooth_2020_data, 'Cleaned_Datasets/2020/bluetooth_2020_data.csv'),
    (bluetooth_2021_data, 'Cleaned_Datasets/2021/bluetooth_2021_data.csv'),
):
    _frame.to_csv(_target, index=False)

# Identical settings for every ssa_first_component_and_plot call in this cell.
_ssa_kwargs = dict(window_size=7, smoothing_factor=5, threshold=0.8, plot=False)
bluetooth_2018_ssa, bluetooth_2019_ssa, bluetooth_2020_ssa, bluetooth_2021_ssa = [
    ssa_first_component_and_plot(_daily, **_ssa_kwargs)
    for _daily in (bluetooth_2018_data, bluetooth_2019_data, bluetooth_2020_data, bluetooth_2021_data)
]
bluetooth_2021_ssa.to_csv('Cleaned_Datasets/2021/bluetooth_2021_ssa.csv', index=False)

# 2019 and 2020 are stacked into one table and saved together.
bluetooth_2019_2020 = pd.concat(
    [bluetooth_2019_ssa, bluetooth_2020_ssa],
    axis=0,
    ignore_index=True,
)
print(bluetooth_2019_2020.shape)
bluetooth_2019_2020.to_csv('Cleaned_Datasets/2019_2020/bluetooth_2019_2020.csv', index=False)


(11005, 894)
                                                        mean       50%  min  \
date                                                0.000000  0.000000  0.0   
f_blue:phone_bluetooth_rapids_countscans:allday     0.159382  0.112676  0.0   
f_blue:phone_bluetooth_rapids_uniquedevices:allday  0.159382  0.112676  0.0   
f_blue:phone_bluetooth_rapids_countscansmostuni...  0.159382  0.112676  0.0   
f_blue:phone_bluetooth_rapids_countscans_norm:a...  0.159382  0.112676  0.0   
f_blue:phone_bluetooth_rapids_uniquedevices_nor...  0.159382  0.112676  0.0   
f_blue:phone_bluetooth_rapids_countscansmostuni...  0.159382  0.112676  0.0   

                                                        max  
date                                                0.00000  
f_blue:phone_bluetooth_rapids_countscans:allday     0.71831  
f_blue:phone_bluetooth_rapids_uniquedevices:allday  0.71831  
f_blue:phone_bluetooth_rapids_countscansmostuni...  0.71831  
f_blue:phone_bluetooth_rapids_countscans_nor



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



Processing feature: f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday
Initial shape: (218, 71)
Filtered shape: (217, 71)
Final shape: (218, 31)
Processing feature: f_blue:phone_bluetooth_rapids_countscans:allday
Initial shape: (137, 71)
Filtered shape: (130, 71)
Processing feature: f_blue:phone_bluetooth_rapids_uniquedevices:allday
Initial shape: (137, 71)
Filtered shape: (130, 71)
Processing feature: f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday
Initial shape: (137, 71)
Filtered shape: (130, 71)
Final shape: (137, 31)
Processing feature: f_blue:phone_bluetooth_rapids_countscans:allday
Initial shape: (195, 71)
Filtered shape: (193, 71)
Processing feature: f_blue:phone_bluetooth_rapids_uniquedevices:allday
Initial shape: (195, 71)
Filtered shape: (193, 71)
Processing feature: f_blue:phone_bluetooth_rapids_countscansmostuniquedevice:allday
Initial shape: (195, 71)
Filtered shape: (193, 71)
Final shape: (195, 31)
(355, 31)


#### Phone Usage

For phone usage, we are interested in the following daily semantic features:
- Total duration of unlock episodes (minutes) - 'f_screen:phone_screen_rapids_sumdurationunlock:allday'
- Total number of unlock episodes - 'f_screen:phone_screen_rapids_countepisodeunlock:allday'
- Minutes until the first unlock episode (minutes) - 'f_screen:phone_screen_rapids_firstuseafter00unlock:allday'

In [12]:
# Disabled direct reads of the raw screen feature files; the next cell passes the
# same file paths to process_screen_data_reduced instead.
# screen_2018 = pd.read_csv("Datasets/INS-W_1/FeatureData/screen.csv", low_memory=False)
# screen_2019 = pd.read_csv("Datasets/INS-W_2/FeatureData/screen.csv", low_memory=False)
# screen_2020 = pd.read_csv("Datasets/INS-W_3/FeatureData/screen.csv", low_memory=False)
# screen_2021 = pd.read_csv("Datasets/INS-W_4/FeatureData/screen.csv", low_memory=False)

In [13]:
# Screen feature file of each study year. Each file is processed with
# process_screen_data_reduced; process_screen_data takes the same (path, year)
# arguments and is currently not used here.
_screen_files = {
    "2018": "Datasets/INS-W_1/FeatureData/screen.csv",
    "2019": "Datasets/INS-W_2/FeatureData/screen.csv",
    "2020": "Datasets/INS-W_3/FeatureData/screen.csv",
    "2021": "Datasets/INS-W_4/FeatureData/screen.csv",
}
_screen_outputs = {
    _year: process_screen_data_reduced(_csv, _year) for _year, _csv in _screen_files.items()
}
# Each result unpacks into (data, missing, summary).
screen_2018_data, missing_2018, summary_2018 = _screen_outputs["2018"]
screen_2019_data, missing_2019, summary_2019 = _screen_outputs["2019"]
screen_2020_data, missing_2020, summary_2020 = _screen_outputs["2020"]
screen_2021_data, missing_2021, summary_2021 = _screen_outputs["2021"]

# Save the processed per-year frames before the SSA step.
for _frame, _target in (
    (screen_2018_data, 'Cleaned_Datasets/2018/screen_2018_data.csv'),
    (screen_2019_data, 'Cleaned_Datasets/2019/screen_2019_data.csv'),
    (screen_2020_data, 'Cleaned_Datasets/2020/screen_2020_data.csv'),
    (screen_2021_data, 'Cleaned_Datasets/2021/screen_2021_data.csv'),
):
    _frame.to_csv(_target, index=False)

# Identical settings for every ssa_first_component_and_plot call in this cell.
_ssa_kwargs = dict(window_size=7, smoothing_factor=5, threshold=0.8, plot=False)
screen_2018_ssa, screen_2019_ssa, screen_2020_ssa, screen_2021_ssa = [
    ssa_first_component_and_plot(_daily, **_ssa_kwargs)
    for _daily in (screen_2018_data, screen_2019_data, screen_2020_data, screen_2021_data)
]
screen_2021_ssa.to_csv('Cleaned_Datasets/2021/screen_2021_ssa.csv', index=False)

# 2019 and 2020 are stacked into one table and saved together.
screen_2019_2020 = pd.concat(
    [screen_2019_ssa, screen_2020_ssa],
    axis=0,
    ignore_index=True,
)
print(screen_2019_2020.shape)
screen_2019_2020.to_csv('Cleaned_Datasets/2019_2020/screen_2019_2020.csv', index=False)


(11005, 1137)
                                                        mean       50%  min  \
date                                                0.000000  0.000000  0.0   
f_screen:phone_screen_rapids_sumdurationunlock:...  0.121218  0.070423  0.0   
f_screen:phone_screen_rapids_countepisodeunlock...  0.121218  0.070423  0.0   
f_screen:phone_screen_rapids_firstuseafter00unl...  0.121218  0.070423  0.0   

                                                         max  
date                                                0.000000  
f_screen:phone_screen_rapids_sumdurationunlock:...  0.591549  
f_screen:phone_screen_rapids_countepisodeunlock...  0.591549  
f_screen:phone_screen_rapids_firstuseafter00unl...  0.591549  
Plot saved as Missingness_Plots_Sensor_Data/Daily_Screen_Time_(Minutes)_2018_-_10_Weeks.png
Plot saved as Time_Series_Plots/Daily Screen Time (Minutes) 2018.png


(15478, 1137)
                                                        mean       50%  min  \
date               



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



Processing feature: f_screen:phone_screen_rapids_countepisodeunlock:allday
Initial shape: (218, 71)
Filtered shape: (217, 71)
Processing feature: f_screen:phone_screen_rapids_firstuseafter00unlock:allday
Initial shape: (218, 71)
Filtered shape: (217, 71)




The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



Final shape: (218, 31)
Processing feature: f_screen:phone_screen_rapids_sumdurationunlock:allday
Initial shape: (137, 71)
Filtered shape: (136, 71)
Processing feature: f_screen:phone_screen_rapids_countepisodeunlock:allday
Initial shape: (137, 71)
Filtered shape: (136, 71)
Processing feature: f_screen:phone_screen_rapids_firstuseafter00unlock:allday
Initial shape: (137, 71)
Filtered shape: (136, 71)




The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



Final shape: (137, 31)
Processing feature: f_screen:phone_screen_rapids_sumdurationunlock:allday
Initial shape: (195, 71)
Filtered shape: (194, 71)
Processing feature: f_screen:phone_screen_rapids_countepisodeunlock:allday
Initial shape: (195, 71)
Filtered shape: (194, 71)
Processing feature: f_screen:phone_screen_rapids_firstuseafter00unlock:allday
Initial shape: (195, 71)
Filtered shape: (194, 71)
Final shape: (195, 31)
(355, 31)




The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



#### Location

For location, we are interested in the following daily semantic features:
- Time spent at home (minutes) - 'f_loc:phone_locations_barnett_hometime:allday'
- Distance travelled (meters) - 'f_loc:phone_locations_barnett_disttravelled:allday'
- Radius of gyration: a measure of the area covered by a person over a day (meters) - 'f_loc:phone_locations_barnett_rog:allday'
- Number of significant locations visited during the day - 'f_loc:phone_locations_barnett_siglocsvisited:allday'
- Circadian routine - 'f_loc:phone_locations_barnett_circdnrtn:allday'
- The fraction of a day spent in a pause (as opposed to a flight) - 'f_loc:phone_locations_barnett_probpause:allday'

In [14]:
# Optional raw loads of the per-year location feature tables, left disabled.
# The next cell passes these same file paths straight to
# process_location_data_reduced and does not reference these frames; re-enable
# a line only for ad-hoc inspection of a raw table.
# location_2018 = pd.read_csv("Datasets/INS-W_1/FeatureData/location.csv", low_memory=False)
# location_2019 = pd.read_csv("Datasets/INS-W_2/FeatureData/location.csv", low_memory=False)
# location_2020 = pd.read_csv("Datasets/INS-W_3/FeatureData/location.csv", low_memory=False)
# location_2021 = pd.read_csv("Datasets/INS-W_4/FeatureData/location.csv", low_memory=False)

In [15]:
# Location pipeline for the four cohorts:
#   reduced feature extraction -> per-year CSV -> SSA -> pooled 2019+2020 set.
# process_location_data (without the `_reduced` suffix) was the earlier call
# here, taking the same (path, year) arguments; the reduced variant is used below.

# Study year -> location feature file for that cohort.
location_paths = {
    "2018": "Datasets/INS-W_1/FeatureData/location.csv",
    "2019": "Datasets/INS-W_2/FeatureData/location.csv",
    "2020": "Datasets/INS-W_3/FeatureData/location.csv",
    "2021": "Datasets/INS-W_4/FeatureData/location.csv",
}

# One shared set of SSA arguments so every cohort is processed identically.
# NOTE(review): the saved output of this cell shows many spline warnings
# ("maximal number of iterations maxit ... s too small") that appear to come
# from this step; smoothing_factor=5 may be too small for some series - confirm.
location_ssa_kwargs = dict(window_size=7, smoothing_factor=5, threshold=0.8, plot=False)

# Loop variables carry a `location_` prefix because loops in a notebook cell
# leak their variables into the shared global namespace.

# 1) Feature extraction. The helper returns three values per cohort; judging by
#    the names they are the cleaned frame plus missingness/summary reports
#    (confirm in HelperFunctions). They are kept in per-modality dicts because
#    the generic missing_YYYY / summary_YYYY names are reused by other cells.
location_data, location_missing, location_summary = {}, {}, {}
for location_year, location_path in location_paths.items():
    (location_data[location_year],
     location_missing[location_year],
     location_summary[location_year]) = process_location_data_reduced(location_path, location_year)

# 2) Persist each cohort's frame, creating the target directory if needed so
#    the cell also works on a fresh checkout.
for location_year, location_frame in location_data.items():
    location_csv = f'Cleaned_Datasets/{location_year}/location_{location_year}_data.csv'
    os.makedirs(os.path.dirname(location_csv), exist_ok=True)
    location_frame.to_csv(location_csv, index=False)

# 3) SSA step for every cohort, in year order.
location_ssa = {}
for location_year, location_frame in location_data.items():
    location_ssa[location_year] = ssa_first_component_and_plot(location_frame, **location_ssa_kwargs)

# Re-bind the original per-year names so any cell that uses them keeps working.
location_2018_data, location_2019_data, location_2020_data, location_2021_data = (location_data[y] for y in location_paths)
missing_2018, missing_2019, missing_2020, missing_2021 = (location_missing[y] for y in location_paths)
summary_2018, summary_2019, summary_2020, summary_2021 = (location_summary[y] for y in location_paths)
location_2018_ssa, location_2019_ssa, location_2020_ssa, location_2021_ssa = (location_ssa[y] for y in location_paths)

# Only the 2021 SSA output is written on its own; 2019 and 2020 are pooled below.
location_2021_ssa.to_csv('Cleaned_Datasets/2021/location_2021_ssa.csv', index=False)

# Pool 2019 and 2020 (the combination used for model development elsewhere in
# this notebook) and save the stacked frame.
location_2019_2020 = pd.concat([location_2019_ssa, location_2020_ssa], axis=0, ignore_index=True)
print(location_2019_2020.shape)
os.makedirs('Cleaned_Datasets/2019_2020', exist_ok=True)
location_2019_2020.to_csv('Cleaned_Datasets/2019_2020/location_2019_2020.csv', index=False)


(11005, 1110)
                                                        mean       50%  min  \
date                                                0.000000  0.000000  0.0   
f_loc:phone_locations_barnett_hometime:allday       0.118492  0.056338  0.0   
f_loc:phone_locations_barnett_disttravelled:allday  0.118492  0.056338  0.0   
f_loc:phone_locations_barnett_rog:allday            0.118492  0.056338  0.0   
f_loc:phone_locations_barnett_siglocsvisited:al...  0.118492  0.056338  0.0   
f_loc:phone_locations_barnett_circdnrtn:allday      0.118492  0.056338  0.0   
f_loc:phone_locations_barnett_probpause:allday      0.118492  0.056338  0.0   

                                                         max  
date                                                0.000000  
f_loc:phone_locations_barnett_hometime:allday       0.591549  
f_loc:phone_locations_barnett_disttravelled:allday  0.591549  
f_loc:phone_locations_barnett_rog:allday            0.591549  
f_loc:phone_locations_barnett_siglocsv



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reac

Processing feature: f_loc:phone_locations_barnett_rog:allday
Initial shape: (155, 71)
Filtered shape: (155, 71)




The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



Processing feature: f_loc:phone_locations_barnett_siglocsvisited:allday
Initial shape: (155, 71)
Filtered shape: (155, 71)
Processing feature: f_loc:phone_locations_barnett_circdnrtn:allday
Initial shape: (155, 71)
Filtered shape: (155, 71)
Processing feature: f_loc:phone_locations_barnett_probpause:allday
Initial shape: (155, 71)
Filtered shape: (155, 71)
Final shape: (155, 61)
Processing feature: f_loc:phone_locations_barnett_hometime:allday
Initial shape: (218, 71)
Filtered shape: (217, 71)




The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reac

Processing feature: f_loc:phone_locations_barnett_disttravelled:allday
Initial shape: (218, 71)
Filtered shape: (217, 71)




The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reac

Processing feature: f_loc:phone_locations_barnett_rog:allday
Initial shape: (218, 71)
Filtered shape: (217, 71)




The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reac

Processing feature: f_loc:phone_locations_barnett_siglocsvisited:allday
Initial shape: (218, 71)
Filtered shape: (217, 71)
Processing feature: f_loc:phone_locations_barnett_circdnrtn:allday
Initial shape: (218, 71)
Filtered shape: (217, 71)
Processing feature: f_loc:phone_locations_barnett_probpause:allday
Initial shape: (218, 71)
Filtered shape: (217, 71)
Final shape: (218, 61)
Processing feature: f_loc:phone_locations_barnett_hometime:allday
Initial shape: (137, 71)
Filtered shape: (95, 71)
Processing feature: f_loc:phone_locations_barnett_disttravelled:allday
Initial shape: (137, 71)
Filtered shape: (95, 71)




The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reac

Processing feature: f_loc:phone_locations_barnett_rog:allday
Initial shape: (137, 71)
Filtered shape: (95, 71)
Processing feature: f_loc:phone_locations_barnett_siglocsvisited:allday
Initial shape: (137, 71)
Filtered shape: (95, 71)
Processing feature: f_loc:phone_locations_barnett_circdnrtn:allday
Initial shape: (137, 71)
Filtered shape: (95, 71)
Processing feature: f_loc:phone_locations_barnett_probpause:allday
Initial shape: (137, 71)
Filtered shape: (95, 71)
Final shape: (137, 61)
Processing feature: f_loc:phone_locations_barnett_hometime:allday
Initial shape: (195, 71)
Filtered shape: (168, 71)
Processing feature: f_loc:phone_locations_barnett_disttravelled:allday
Initial shape: (195, 71)
Filtered shape: (168, 71)




The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reac

Processing feature: f_loc:phone_locations_barnett_rog:allday
Initial shape: (195, 71)
Filtered shape: (168, 71)




The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reac

Processing feature: f_loc:phone_locations_barnett_siglocsvisited:allday
Initial shape: (195, 71)
Filtered shape: (168, 71)
Processing feature: f_loc:phone_locations_barnett_circdnrtn:allday
Initial shape: (195, 71)
Filtered shape: (168, 71)
Processing feature: f_loc:phone_locations_barnett_probpause:allday
Initial shape: (195, 71)
Filtered shape: (168, 71)
Final shape: (195, 61)
(355, 61)


#### Sleep

For sleep, we are interested in the following daily semantic features:
- Time asleep of main sleep (minutes) - 'f_slp:fitbit_sleep_intraday_rapids_sumdurationasleepunifiedmain:allday'
- Time awake of main sleep (minutes) - 'f_slp:fitbit_sleep_intraday_rapids_sumdurationawakeunifiedmain:allday'
- Count of asleep episodes in main sleep - 'f_slp:fitbit_sleep_intraday_rapids_countepisodeasleepunifiedmain:allday'
- Count of awake episodes in main sleep - 'f_slp:fitbit_sleep_intraday_rapids_countepisodeawakeunifiedmain:allday'
- Average sleep efficiency of main sleep (score) - 'f_slp:fitbit_sleep_summary_rapids_avgefficiencymain:allday'
- Total time spent in bed during main sleep (minutes) - 'f_slp:fitbit_sleep_summary_rapids_sumdurationinbedmain:allday'

In [16]:
# Optional raw loads of the per-year sleep feature tables, left disabled.
# The next cell passes these same file paths straight to
# process_sleep_data_reduced and does not reference these frames; re-enable a
# line only for ad-hoc inspection of a raw table.
# sleep_2018 = pd.read_csv("Datasets/INS-W_1/FeatureData/sleep.csv")
# sleep_2019 = pd.read_csv("Datasets/INS-W_2/FeatureData/sleep.csv")
# sleep_2020 = pd.read_csv("Datasets/INS-W_3/FeatureData/sleep.csv")
# sleep_2021 = pd.read_csv("Datasets/INS-W_4/FeatureData/sleep.csv")


In [17]:
# Sleep pipeline for the four cohorts:
#   reduced feature extraction -> per-year CSV -> SSA -> pooled 2019+2020 set.
# process_sleep_data (without the `_reduced` suffix) was the earlier call here,
# taking the same (path, year) arguments; the reduced variant is used below.

# Study year -> sleep feature file for that cohort.
sleep_paths = {
    "2018": "Datasets/INS-W_1/FeatureData/sleep.csv",
    "2019": "Datasets/INS-W_2/FeatureData/sleep.csv",
    "2020": "Datasets/INS-W_3/FeatureData/sleep.csv",
    "2021": "Datasets/INS-W_4/FeatureData/sleep.csv",
}

# One shared set of SSA arguments so every cohort is processed identically.
sleep_ssa_kwargs = dict(window_size=7, smoothing_factor=5, threshold=0.8, plot=False)

# Loop variables carry a `sleep_` prefix because loops in a notebook cell leak
# their variables into the shared global namespace.

# 1) Feature extraction. The helper returns three values per cohort; judging by
#    the names they are the cleaned frame plus missingness/summary reports
#    (confirm in HelperFunctions). They are kept in per-modality dicts because
#    the generic missing_YYYY / summary_YYYY names are reused by other cells.
sleep_data, sleep_missing, sleep_summary = {}, {}, {}
for sleep_year, sleep_path in sleep_paths.items():
    (sleep_data[sleep_year],
     sleep_missing[sleep_year],
     sleep_summary[sleep_year]) = process_sleep_data_reduced(sleep_path, sleep_year)

# 2) Persist each cohort's frame, creating the target directory if needed so
#    the cell also works on a fresh checkout.
for sleep_year, sleep_frame in sleep_data.items():
    sleep_csv = f'Cleaned_Datasets/{sleep_year}/sleep_{sleep_year}_data.csv'
    os.makedirs(os.path.dirname(sleep_csv), exist_ok=True)
    sleep_frame.to_csv(sleep_csv, index=False)

# 3) SSA step for every cohort, in year order.
sleep_ssa = {}
for sleep_year, sleep_frame in sleep_data.items():
    sleep_ssa[sleep_year] = ssa_first_component_and_plot(sleep_frame, **sleep_ssa_kwargs)

# Re-bind the original per-year names so any cell that uses them keeps working.
sleep_2018_data, sleep_2019_data, sleep_2020_data, sleep_2021_data = (sleep_data[y] for y in sleep_paths)
missing_2018, missing_2019, missing_2020, missing_2021 = (sleep_missing[y] for y in sleep_paths)
summary_2018, summary_2019, summary_2020, summary_2021 = (sleep_summary[y] for y in sleep_paths)
sleep_2018_ssa, sleep_2019_ssa, sleep_2020_ssa, sleep_2021_ssa = (sleep_ssa[y] for y in sleep_paths)

# Only the 2021 SSA output is written on its own; 2019 and 2020 are pooled below.
sleep_2021_ssa.to_csv('Cleaned_Datasets/2021/sleep_2021_ssa.csv', index=False)

# Pool 2019 and 2020 (the combination used for model development elsewhere in
# this notebook) and save the stacked frame.
sleep_2019_2020 = pd.concat([sleep_2019_ssa, sleep_2020_ssa], axis=0, ignore_index=True)
print(sleep_2019_2020.shape)
os.makedirs('Cleaned_Datasets/2019_2020', exist_ok=True)
sleep_2019_2020.to_csv('Cleaned_Datasets/2019_2020/sleep_2019_2020.csv', index=False)


(11005, 921)
                                                        mean       50%  min  \
date                                                0.000000  0.000000  0.0   
f_slp:fitbit_sleep_intraday_rapids_sumdurationa...  0.340027  0.140845  0.0   
f_slp:fitbit_sleep_intraday_rapids_sumdurationa...  0.340027  0.140845  0.0   
f_slp:fitbit_sleep_intraday_rapids_countepisode...  0.340027  0.140845  0.0   
f_slp:fitbit_sleep_intraday_rapids_countepisode...  0.340027  0.140845  0.0   
f_slp:fitbit_sleep_summary_rapids_avgefficiency...  0.356747  0.169014  0.0   
f_slp:fitbit_sleep_summary_rapids_sumdurationin...  0.349750  0.169014  0.0   
f_slp:fitbit_sleep_intraday_rapids_sumdurationa...  0.340027  0.140845  0.0   
f_slp:fitbit_sleep_intraday_rapids_sumdurationa...  0.340027  0.140845  0.0   
f_slp:fitbit_sleep_intraday_rapids_countepisode...  0.340027  0.140845  0.0   
f_slp:fitbit_sleep_intraday_rapids_countepisode...  0.340027  0.140845  0.0   
f_slp:fitbit_sleep_summary_rapids_avgef

#### Steps

For steps, we are interested in the following daily semantic features:
- Total steps per day - 'f_steps:fitbit_steps_intraday_rapids_sumsteps:allday'
- Count of sedentary bouts per day - 'f_steps:fitbit_steps_intraday_rapids_countepisodesedentarybout:allday'
- Total duration of sedentary bouts per day (minutes) - 'f_steps:fitbit_steps_intraday_rapids_sumdurationsedentarybout:allday'
- Count of active bouts per day - 'f_steps:fitbit_steps_intraday_rapids_countepisodeactivebout:allday'
- Total duration of active bouts per day (minutes) - 'f_steps:fitbit_steps_intraday_rapids_sumdurationactivebout:allday'

In [18]:
# Steps pipeline for the four cohorts:
#   reduced feature extraction -> per-year CSV -> SSA -> pooled 2019+2020 set.
# process_steps_data (without the `_reduced` suffix) was the earlier call here,
# taking the same (path, year) arguments; the reduced variant is used below.

# Study year -> steps feature file for that cohort.
steps_paths = {
    "2018": "Datasets/INS-W_1/FeatureData/steps.csv",
    "2019": "Datasets/INS-W_2/FeatureData/steps.csv",
    "2020": "Datasets/INS-W_3/FeatureData/steps.csv",
    "2021": "Datasets/INS-W_4/FeatureData/steps.csv",
}

# One shared set of SSA arguments so every cohort is processed identically.
# NOTE(review): the saved output of this cell shows repeated spline warnings
# ("maximal number of iterations maxit ... s too small") that appear to come
# from this step; smoothing_factor=5 may be too small for some series - confirm.
steps_ssa_kwargs = dict(window_size=7, smoothing_factor=5, threshold=0.8, plot=False)

# Loop variables carry a `steps_` prefix because loops in a notebook cell leak
# their variables into the shared global namespace.

# 1) Feature extraction. The helper returns three values per cohort; judging by
#    the names they are the cleaned frame plus missingness/summary reports
#    (confirm in HelperFunctions). They are kept in per-modality dicts because
#    the generic missing_YYYY / summary_YYYY names are reused by other cells.
steps_data, steps_missing, steps_summary = {}, {}, {}
for steps_year, steps_path in steps_paths.items():
    (steps_data[steps_year],
     steps_missing[steps_year],
     steps_summary[steps_year]) = process_steps_data_reduced(steps_path, steps_year)

# 2) Persist each cohort's frame, creating the target directory if needed so
#    the cell also works on a fresh checkout.
for steps_year, steps_frame in steps_data.items():
    steps_csv = f'Cleaned_Datasets/{steps_year}/steps_{steps_year}_data.csv'
    os.makedirs(os.path.dirname(steps_csv), exist_ok=True)
    steps_frame.to_csv(steps_csv, index=False)

# 3) SSA step for every cohort, in year order.
steps_ssa = {}
for steps_year, steps_frame in steps_data.items():
    steps_ssa[steps_year] = ssa_first_component_and_plot(steps_frame, **steps_ssa_kwargs)

# Re-bind the original per-year names so any cell that uses them keeps working.
steps_2018_data, steps_2019_data, steps_2020_data, steps_2021_data = (steps_data[y] for y in steps_paths)
missing_2018, missing_2019, missing_2020, missing_2021 = (steps_missing[y] for y in steps_paths)
summary_2018, summary_2019, summary_2020, summary_2021 = (steps_summary[y] for y in steps_paths)
steps_2018_ssa, steps_2019_ssa, steps_2020_ssa, steps_2021_ssa = (steps_ssa[y] for y in steps_paths)

# Only the 2021 SSA output is written on its own; 2019 and 2020 are pooled below.
steps_2021_ssa.to_csv('Cleaned_Datasets/2021/steps_2021_ssa.csv', index=False)

# Pool 2019 and 2020 (the combination used for model development elsewhere in
# this notebook) and save the stacked frame.
steps_2019_2020 = pd.concat([steps_2019_ssa, steps_2020_ssa], axis=0, ignore_index=True)
print(steps_2019_2020.shape)
os.makedirs('Cleaned_Datasets/2019_2020', exist_ok=True)
steps_2019_2020.to_csv('Cleaned_Datasets/2019_2020/steps_2019_2020.csv', index=False)


(11005, 597)
                                                        mean       50%  min  \
date                                                0.000000  0.000000  0.0   
f_steps:fitbit_steps_intraday_rapids_sumsteps:a...  0.289959  0.084507  0.0   
f_steps:fitbit_steps_intraday_rapids_countepiso...  0.289959  0.084507  0.0   
f_steps:fitbit_steps_intraday_rapids_sumduratio...  0.289959  0.084507  0.0   
f_steps:fitbit_steps_intraday_rapids_countepiso...  0.289959  0.084507  0.0   
f_steps:fitbit_steps_intraday_rapids_sumduratio...  0.289959  0.084507  0.0   
f_steps:fitbit_steps_intraday_rapids_sumsteps_n...  0.289959  0.084507  0.0   
f_steps:fitbit_steps_intraday_rapids_countepiso...  0.289959  0.084507  0.0   
f_steps:fitbit_steps_intraday_rapids_sumduratio...  0.289959  0.084507  0.0   
f_steps:fitbit_steps_intraday_rapids_countepiso...  0.289959  0.084507  0.0   
f_steps:fitbit_steps_intraday_rapids_sumduratio...  0.289959  0.084507  0.0   

                                      



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



Processing feature: f_steps:fitbit_steps_intraday_rapids_sumdurationsedentarybout:allday
Initial shape: (155, 71)
Filtered shape: (131, 71)
Processing feature: f_steps:fitbit_steps_intraday_rapids_countepisodeactivebout:allday
Initial shape: (155, 71)
Filtered shape: (131, 71)
Processing feature: f_steps:fitbit_steps_intraday_rapids_sumdurationactivebout:allday
Initial shape: (155, 71)
Filtered shape: (131, 71)
Final shape: (155, 51)
Processing feature: f_steps:fitbit_steps_intraday_rapids_sumsteps:allday
Initial shape: (218, 71)
Filtered shape: (208, 71)
Processing feature: f_steps:fitbit_steps_intraday_rapids_countepisodesedentarybout:allday
Initial shape: (218, 71)
Filtered shape: (208, 71)
Processing feature: f_steps:fitbit_steps_intraday_rapids_sumdurationsedentarybout:allday
Initial shape: (218, 71)
Filtered shape: (208, 71)
Processing feature: f_steps:fitbit_steps_intraday_rapids_countepisodeactivebout:allday
Initial shape: (218, 71)
Filtered shape: (208, 71)
Processing feature:



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



Processing feature: f_steps:fitbit_steps_intraday_rapids_sumdurationsedentarybout:allday
Initial shape: (137, 71)
Filtered shape: (132, 71)
Processing feature: f_steps:fitbit_steps_intraday_rapids_countepisodeactivebout:allday
Initial shape: (137, 71)
Filtered shape: (132, 71)
Processing feature: f_steps:fitbit_steps_intraday_rapids_sumdurationactivebout:allday
Initial shape: (137, 71)
Filtered shape: (132, 71)
Final shape: (137, 51)
Processing feature: f_steps:fitbit_steps_intraday_rapids_sumsteps:allday
Initial shape: (195, 71)
Filtered shape: (189, 71)




The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.



The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reac

Processing feature: f_steps:fitbit_steps_intraday_rapids_countepisodesedentarybout:allday
Initial shape: (195, 71)
Filtered shape: (189, 71)
Processing feature: f_steps:fitbit_steps_intraday_rapids_sumdurationsedentarybout:allday
Initial shape: (195, 71)
Filtered shape: (189, 71)
Processing feature: f_steps:fitbit_steps_intraday_rapids_countepisodeactivebout:allday
Initial shape: (195, 71)
Filtered shape: (189, 71)
Processing feature: f_steps:fitbit_steps_intraday_rapids_sumdurationactivebout:allday
Initial shape: (195, 71)
Filtered shape: (189, 71)
Final shape: (195, 51)
(355, 51)
