In [1]:
import os
import warnings
from collections import defaultdict
import pandas as pd
import numpy as np
import pingouin as pg
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns
# from passivefeatureslite.extract.applications import *
from passivefeatureslite.extract.locations import *
from passivefeatureslite.extract.bluetooth import *
from passivefeatureslite.extract.call import *
from passivefeatureslite.extract.message import *
from passivefeatureslite.extract.wifi import *
from passivefeatureslite.extract.screen import *

from visualutils import plot_one_col
from syncutils import get_sensor_samples_per_survey_measure
from statsutils import norm_per_person_wrapper, cut_dates, get_95_ci_lower, get_95_ci_upper, get_95_ci_med_lower, get_95_ci_med_upper
from timesettings import COVID_CUT_OFF, LOCKDOWN_YELLOW_START, LOCKDOWN_GREEN_START

from timeutils import sensor_time_to_local, EASTERN, add_survey_time_cols
from utils import PHQ_BIWEEKLY_EVENTS_MAIN, PHQ_BIWEEKLY_EVENTS_EXTENDED, MONTHLY_EVENTS_MAIN, MONTHLY_EVENTS_EXTENDED

# Event names used further down to pick the surveys that anchor the biweekly / monthly labels
# (they are matched against the survey's `redcap_event_name` column).
# Each one is the MAIN constant concatenated with its EXTENDED counterpart.
BIWEEKLY_EVENTS = PHQ_BIWEEKLY_EVENTS_MAIN + PHQ_BIWEEKLY_EVENTS_EXTENDED
MONTHLY_EVENTS = MONTHLY_EVENTS_MAIN + MONTHLY_EVENTS_EXTENDED

# Notebook-wide pandas behaviour.
# Chained-assignment warnings are switched off: several helpers below add columns to filtered frames.
pd.options.mode.chained_assignment = None
pd.options.display.max_rows = 150
pd.options.display.max_columns = 500
# Escalate every RuntimeWarning to an exception so it cannot pass unnoticed.
warnings.filterwarnings('error', category=RuntimeWarning)


DATA_FOLDER = "./data/"
# Sub-folders shared by most of the paths below.
_SENSORS_CLEAN_DIR = os.path.join(DATA_FOLDER, "sensors_clean")
_SENSORS_FITBIT_DIR = os.path.join(DATA_FOLDER, "sensors_fitbit")

## SENSOR DATA PATHS (phone sensors)
BLUE_PATH = os.path.join(_SENSORS_CLEAN_DIR, "bluetooth_fixed_did.csv")
WIFI_PATH = os.path.join(_SENSORS_CLEAN_DIR, "wifi_fixed_did.csv")

DEVICE_TYPE_PATH = os.path.join(_SENSORS_CLEAN_DIR, "aware_device.csv")
CALLS_PATH = os.path.join(_SENSORS_CLEAN_DIR, "calls_fixed_did.csv")
MSGS_PATH = os.path.join(_SENSORS_CLEAN_DIR, "messages_fixed_did.csv")

LOC_PATH = os.path.join(_SENSORS_CLEAN_DIR, "locations_semantic_start_date_to_end_date.csv")
SCR_PATH = os.path.join(_SENSORS_CLEAN_DIR, "screen_clean.csv")

## SENSOR DATA PATHS (Fitbit daily summaries)
SLP_PATH = os.path.join(_SENSORS_FITBIT_DIR, "sleep.csv")
STEPS_PATH = os.path.join(_SENSORS_FITBIT_DIR, "steps_total.csv")
STEPS_LEVELS_PATH = os.path.join(_SENSORS_FITBIT_DIR, "activity_types_minutes.csv")
HR_PATH = os.path.join(_SENSORS_FITBIT_DIR, "hr.csv")

## @TODO - activity levels, 
## notif, touch, steps intraday, HR

## INTRADAY "FOLDER" PATHS
## ("{0}" is a placeholder to be filled in with str.format)
SLP_INTRADAY_PATH_LIKE = os.path.join(_SENSORS_FITBIT_DIR, "sleep_intraday", "{0}.csv")
HR_INTRADAY_PATH_LIKE = os.path.join(_SENSORS_FITBIT_DIR, "hr_intraday", "{0}.csv")
STEPS_INTRADAY_PATH_LIKE = os.path.join(_SENSORS_FITBIT_DIR, "steps_intraday", "{0}.csv")
VERY_ACT_INTRADAY_PATH_LIKE = os.path.join(_SENSORS_FITBIT_DIR, "minutesVeryActive_intraday", "{0}.csv")
FAIRLY_ACT_INTRADAY_PATH_LIKE = os.path.join(_SENSORS_FITBIT_DIR, "minutesFairlyActive_intraday", "{0}.csv")
LIGHTLY_ACT_INTRADAY_PATH_LIKE = os.path.join(_SENSORS_FITBIT_DIR, "minutesLightlyActive_intraday", "{0}.csv")
SED_ACT_INTRADAY_PATH_LIKE = os.path.join(_SENSORS_FITBIT_DIR, "minutesSedentary_intraday", "{0}.csv")

## SURVEY PATH
SURVEY_PATH = os.path.join(DATA_FOLDER, "QTotalScoresMod.csv")

## EMA PATH
# EMA_PATH = os.path.join(DATA_FOLDER, "sensors_clean", "survey_clean.csv")

## MAPPING FILE
PRT_TO_DID_MAPPING_FILE = os.path.join(DATA_FOLDER, "mapping_ids/participant_id_MAP_fitbit_id_MAP_device_id.csv")

## OUTPUT SETTINGS
OUTPUT_FEATS = True
OUTPUT_FEATS_FOLDER = os.path.join(DATA_FOLDER, "feats_for_multimodal_time_slices")
def output_features(df_feats, sensor_name, feat_type):
    """Write one feature table to ``OUTPUT_FEATS_FOLDER``.

    The file is named ``feats_<sensor_name>_<feat_type>.csv`` and is written
    with pandas' default ``to_csv`` settings (the index is included).

    Nothing is written when the module-level ``OUTPUT_FEATS`` switch is falsy.
    The output folder is created on demand so a fresh checkout does not fail
    on the first export.

    Parameters
    ----------
    df_feats : pandas.DataFrame
        Feature table to export.
    sensor_name : str
        Sensor tag used in the file name (e.g. ``"blue"``).
    feat_type : str
        Time-slice tag used in the file name (e.g. ``"per_phase"``).
    """
    if not OUTPUT_FEATS:
        return
    os.makedirs(OUTPUT_FEATS_FOLDER, exist_ok=True)
    out_path = os.path.join(OUTPUT_FEATS_FOLDER, "feats_{0}_{1}.csv".format(sensor_name, feat_type))
    df_feats.to_csv(out_path)


# ## NORM FEATURES FOR EACH PERSON - ignore for now. may make more sense to norm on per day feats
# TO_NORM_PER_PERSON = False 


  return warn(


# MUST READ
## While reading, pay special attention to:
### - When a feature is NaN, decide explicitly whether it can safely be replaced with 0.
### - For per-phase features, make sure every feature is computed only from rows where is_during_study is True (apply the same check to the other time slices too).


In [2]:
## device_id -> device type lookup, taken from the "brand" column of the device table
DID_TO_TYPE = (
    pd.read_csv(DEVICE_TYPE_PATH)
    .set_index("device_id")["brand"]
    .to_dict()
)
# display(DID_TO_TYPE)




# Read and pre-process survey and ema data


In [3]:
## GET SURVEY DATA AND MAP TO DEVICE_ID, ADD EXTRA COLS
MAPPING_DF = pd.read_csv(PRT_TO_DID_MAPPING_FILE)
# participant_id -> final_device_id and fitbit_id -> final_device_id
MAPPING_DICT, FB_MAPPING_DICT = (
    MAPPING_DF.set_index(key_col)["final_device_id"].to_dict()
    for key_col in ("participant_id", "fitbit_id")
)
# final_device_id -> fitbit_id
FB_MAPPING_DICT_INV = MAPPING_DF.set_index("final_device_id")["fitbit_id"].to_dict()

df_survey = pd.read_csv(SURVEY_PATH)
# Strict lookup on purpose: a record_id missing from the mapping file raises KeyError.
df_survey["device_id"] = df_survey["record_id"].apply(lambda record_id: MAPPING_DICT[record_id])
df_survey = add_survey_time_cols(df_survey).sort_values(by=["device_id", "timestamp_dt"])

# Devices that have at least one survey row
DIDS_SEL = list(df_survey["device_id"].unique())
print (len(DIDS_SEL))

# ## GET CLEAN EMA DATA, ADD EXTRA COLS
# df_ema = pd.read_csv(EMA_PATH)
# df_ema["timestamp_dt"] = df_ema["timestamp"].apply(sensor_time_to_local)
# df_ema["date_dt"] = df_ema["timestamp_dt"].dt.date
# df_ema = df_ema.sort_values(by=["device_id", "timestamp_dt"])
# EMA_DIDS_SEL = list(df_ema["device_id"].unique())
# print (len(EMA_DIDS_SEL))

# ## limit dates
# df_ema = cut_dates(df_ema, "timestamp_dt")

# ## NORM 
# if TO_NORM_PER_PERSON:
#     df_ema = norm_per_person_wrapper(df_ema, ["depression", "tiredness"], "device_id")



104


In [4]:
# Sanity check of the survey frame: first rows, column names, and the covered time range.
display(df_survey.head(2))
# display(df_ema.head(2))
# display(df_survey["device_id"].unique())

print(df_survey.columns)

# tmp = df_ema.sort_values(by="timestamp", ascending=False)
# display(tmp)

for survey_ts_bound in ("min", "max"):
    print(getattr(df_survey["timestamp_dt"], survey_ts_bound)())

# print (df_ema["timestamp_dt"].min())
# print (df_ema["timestamp_dt"].max())

# print (df_ema["timestamp_dt"].min())

# print (df_ema["timestamp_dt"].max())


Unnamed: 0.1,Unnamed: 0,record_id,redcap_event_name,complete_status,age,doe,timestamp,survey_complete,phq9_score,phq12_functioning,msrsr_m,pss_m,mfis_m,psqi_total,pdds,device_id,timestamp_dt,date_dt,timestamp_dt_dow,timestamp_dt_dow_wrt_sat,date_dt_prev_sat,date_dt_win_start
460,461,PRT180756,week_0,Completed,58,2019-11-12,2019-11-17 10:09:46,Complete,0.0,0,7.0,24.0,7.0,10.0,3.0,0017eb47-eecc-4737-9582-9749dba7f48e,2019-11-17 10:09:46,2019-11-17,6,1,2019-11-16,2019-11-03
461,462,PRT180756,week_2,Completed,58,2019-11-12,2019-11-30 12:07:38,Complete,0.0,0,,,,,,0017eb47-eecc-4737-9582-9749dba7f48e,2019-11-30 12:07:38,2019-11-30,5,0,2019-11-30,2019-11-16


Index(['Unnamed: 0', 'record_id', 'redcap_event_name', 'complete_status',
       'age', 'doe', 'timestamp', 'survey_complete', 'phq9_score',
       'phq12_functioning', 'msrsr_m', 'pss_m', 'mfis_m', 'psqi_total', 'pdds',
       'device_id', 'timestamp_dt', 'date_dt', 'timestamp_dt_dow',
       'timestamp_dt_dow_wrt_sat', 'date_dt_prev_sat', 'date_dt_win_start'],
      dtype='object')
2019-11-16 09:08:18
2021-01-24 08:48:28


# Add a phase and week label

In [5]:
def add_phase_label(df, ts_col):
    """Add a ``phase`` column ("pre" / "lock" / "yellow" / "green") based on ``df[ts_col]``.

    The boundaries are the module-level constants ``COVID_CUT_OFF``,
    ``LOCKDOWN_YELLOW_START`` and ``LOCKDOWN_GREEN_START``; each phase starts
    at its boundary (inclusive) and ends just before the next one.
    Rows that match none of the first three windows get "green".

    The column is added to ``df`` in place and the same frame is returned.
    """
    ts = df[ts_col]
    phase_windows = [
        (ts < COVID_CUT_OFF, "pre"),
        ((ts >= COVID_CUT_OFF) & (ts < LOCKDOWN_YELLOW_START), "lock"),
        ((ts >= LOCKDOWN_YELLOW_START) & (ts < LOCKDOWN_GREEN_START), "yellow"),
    ]
    df["phase"] = np.select(
        [np.asarray(in_window) for in_window, _ in phase_windows],
        [phase_name for _, phase_name in phase_windows],
        default="green",
    )
    return df

def add_phase_label_wrapper(df, ts_col):
    """Add ``phase`` (via ``add_phase_label``) plus a coarser ``phase_wrapper`` column.

    ``phase_wrapper`` merges the "yellow" and "green" phases into "post" and
    copies every other ``phase`` value unchanged.
    """
    df = add_phase_label(df, ts_col)
    is_post_lockdown = df["phase"].isin(["yellow", "green"])
    df["phase_wrapper"] = np.where(is_post_lockdown, "post", df["phase"])
    return df


# def add_week_label_row_of_person(df_survey_did, data_row, ts_col):
# #     display (data_row)
# #     display (df_survey_did)
#     data_ts = data_row[ts_col]
# #     display ((df_survey_did[ts_col]-data_ts).dt.days)
# #     print (type((df_survey_did[ts_col]-data_ts)))
#     df_survey_gte = df_survey_did[(df_survey_did[ts_col]>=data_ts)]
#     df_survey_gte_biweekly = df_survey_gte[(df_survey_gte["redcap_event_name"].isin(BIWEEKLY_EVENTS))]
#     df_survey_gte_monthly = df_survey_gte[(df_survey_gte["redcap_event_name"].isin(MONTHLY_EVENTS))]
#     df_survey_lte = df_survey_did[(df_survey_did[ts_col]< data_ts)] # is it in between the study
# #     display (df_survey_gte)

#     ## BIWEEKLY
#     if len(df_survey_gte_biweekly)==0: # after the last week
#         biweekly_lbl = "extra"
#         biweekly_date = np.nan
#     else:
#         df_survey_select_14_days = df_survey_gte_biweekly[((df_survey_gte_biweekly[ts_col]-data_ts).dt.days)<=14]
#         if len(df_survey_select_14_days)==0: ## there is a survey after data point but diff is > 14 days
#             if len(df_survey_lte)==0: # if there is no survey before data then "pre-study"
#                 biweekly_lbl = "pre"
#             else:
#                 biweekly_lbl = np.nan # else, missing in between due to delay etc 
#             biweekly_date = np.nan
#         else:
#             biweekly_lbl = df_survey_select_14_days["redcap_event_name"].iloc[0]
#             biweekly_date = df_survey_select_14_days["date_dt"].iloc[0]

#     ## MONTHLY
#     if len(df_survey_gte_monthly)==0: # after the last week
#         monthly_lbl = "extra"
#         monthly_date = np.nan
#     else:
#         df_survey_select_31_days = df_survey_gte_monthly[((df_survey_gte_monthly[ts_col]-data_ts).dt.days)<=31]
#         if len(df_survey_select_31_days)==0: ## there is a survey after data point but diff is > 14 days
#             if len(df_survey_lte)==0:
#                 monthly_lbl = "pre"
#             else:
#                 monthly_lbl = "np.nan"
#             monthly_date = np.nan
#         else:
#             monthly_lbl = df_survey_select_31_days["redcap_event_name"].iloc[0]
#             monthly_date = df_survey_select_31_days["date_dt"].iloc[0]
#     data_row["biweekly_lbl"] = biweekly_lbl
#     data_row["monthly_lbl"] = monthly_lbl
#     data_row["biweekly_date"] = biweekly_date
#     data_row["monthly_date"] = monthly_date
#     return data_row
            
    

# def add_week_label(df, ts_col, df_survey):
# #     print (type(df[ts_col].iloc[0]))
# #     print (type(df_survey[ts_col].iloc[0]))
    
# #     display(df_survey.head(1))
    
#     df_survey = df_survey.sort_values(by=["device_id", "timestamp_dt"])
#     df_did_list = []
#     print ("Processing did counts...")
#     cnt = 0
#     for did in df_survey["device_id"].unique():
#         cnt += 1
# #         if cnt != 6: # debugging
# #             continue
#         print ("{0}, ".format(cnt), end='')
#         df_survey_did = df_survey[(df_survey["device_id"]==did)][[ts_col, "redcap_event_name"]]
#         df_did = df[(df["device_id"]==did)]
#         if len(df_did)==0:
#             continue
#         df_did = df_did.apply((lambda x: (add_week_label_row_of_person(df_survey_did, x, ts_col))), axis=1)
#         df_did["biweekly_date_dt_diff"] = df_did["biweekly_date"] - df_did["date_dt"] 
#         df_did["monthly_date_dt_diff"] = df_did["monthly_date"] - df_did["date_dt"] 
#         cols_disp = ["timestamp_dt", "date_dt", "biweekly_lbl", "monthly_lbl", \
#                         "biweekly_date", "monthly_date",\
#                         "biweekly_date_dt_diff", "monthly_date_dt_diff"
#                        ]
# #         display(df_did[cols_disp])
# #         display(df_did)
#         df_did_list.append(df_did)
# #         break # debug
#     df_dids_all = pd.concat(df_did_list)
#     return df_dids_all

def add_week_label(df, ts_col, df_survey, print_out=True):
    """Label each sensor row with the survey whose look-back window contains it.

    For every device present in ``df_survey`` the rows of ``df`` for that
    device receive four new columns:

    * ``biweekly_lbl`` / ``biweekly_date``: event name and date of the first
      survey in ``BIWEEKLY_EVENTS`` taken within the 14 days *after* the row,
      i.e. ``survey_date - 14 days <= row date < survey_date``.
    * ``monthly_lbl`` / ``monthly_date``: the same for ``MONTHLY_EVENTS`` with
      a 31-day window.

    Surveys are visited in ``timestamp_dt`` order and a row keeps the first
    label it receives. Rows matching no window keep ``None``.

    Parameters
    ----------
    df : pandas.DataFrame
        Sensor rows; needs ``device_id`` and ``ts_col``. Not modified.
    ts_col : str
        Date column compared in both ``df`` and ``df_survey``
        (the notebook passes ``"date_dt"``).
    df_survey : pandas.DataFrame
        Needs ``device_id``, ``timestamp_dt``, ``redcap_event_name`` and ``ts_col``.
    print_out : bool
        Print a running device counter.

    Returns
    -------
    pandas.DataFrame
        Labelled rows of all devices found in ``df_survey``, concatenated.
        Devices that are not in ``df_survey`` are dropped. If no device
        matches, an empty frame with the label columns is returned.
    """
    # `datetime` is not imported explicitly at the top of the notebook, so import locally.
    from datetime import timedelta

    label_cols = ["biweekly_lbl", "monthly_lbl", "biweekly_date", "monthly_date"]
    # (column prefix, event names anchoring the window, window length)
    windows = (
        ("biweekly", BIWEEKLY_EVENTS, timedelta(days=14)),
        ("monthly", MONTHLY_EVENTS, timedelta(days=31)),
    )

    df_survey = df_survey.sort_values(by=["device_id", "timestamp_dt"])
    df_did_list = []
    if print_out:
        print ("Processing did counts (tweak)...")
    for cnt, did in enumerate(df_survey["device_id"].unique(), start=1):
        if print_out:
            print ("{0}, ".format(cnt), end='')
        # Work on a copy so the caller's frame is never written to.
        df_did = df[df["device_id"] == did].copy()
        if len(df_did) == 0:
            continue
        df_survey_did = df_survey.loc[df_survey["device_id"] == did, [ts_col, "redcap_event_name"]]
        for col in label_cols:
            df_did[col] = None

        for prefix, event_names, window in windows:
            lbl_col = "{0}_lbl".format(prefix)
            date_col = "{0}_date".format(prefix)
            anchors = df_survey_did[df_survey_did["redcap_event_name"].isin(event_names)]
            for _, survey_row in anchors.iterrows():
                curr_dt = survey_row[ts_col]
                window_start = curr_dt - window
                # Only rows that are still unlabelled can be claimed by this survey.
                claim = (
                    df_did[lbl_col].isnull()
                    & (df_did[ts_col] < curr_dt)
                    & (df_did[ts_col] >= window_start)
                )
                df_did.loc[claim, lbl_col] = survey_row["redcap_event_name"]
                df_did.loc[claim, date_col] = curr_dt
        df_did_list.append(df_did)

    if not df_did_list:
        # pd.concat([]) raises; hand back an empty, correctly shaped frame instead.
        empty = df.iloc[0:0].copy()
        for col in label_cols:
            empty[col] = None
        return empty
    return pd.concat(df_did_list)
        
    
def add_dow_label(df, ts_col="timestamp_dt"):
    """Add ``dow`` (pandas day-of-week number) and ``dow_lbl`` to ``df`` in place.

    ``dow_lbl`` is "wkend" for day numbers 5 and 6 and "wkdy" for everything else.
    Returns the same frame.
    """
    weekend_day_numbers = [5, 6]
    df["dow"] = df[ts_col].dt.dayofweek
    falls_on_weekend = df["dow"].isin(weekend_day_numbers)
    df["dow_lbl"] = np.where(falls_on_weekend, "wkend", "wkdy")
    return df

def add_tod_label(df, ts_col="timestamp_dt"):
    """Add ``hour_dt`` (hour of ``ts_col``) and a six-hour time-of-day label ``tod_lbl`` in place.

    Hours 0-5 -> "ni", 6-11 -> "mo", 12-17 -> "af", anything else -> "ev".
    Returns the same frame.
    """
    df["hour_dt"] = df[ts_col].dt.hour
    hour = df["hour_dt"]
    six_hour_bins = [
        (hour.isin(list(range(0, 6))), "ni"),
        (hour.isin(list(range(6, 12))), "mo"),
        (hour.isin(list(range(12, 18))), "af"),
    ]
    df["tod_lbl"] = np.select(
        [np.asarray(in_bin) for in_bin, _ in six_hour_bins],
        [bin_name for _, bin_name in six_hour_bins],
        default="ev",
    )
    return df

def add_wknum_wrt_wk12(df, ts_col, df_survey, print_out=True, keep_without_wk12=False):
    """Number the 12 weeks leading up to each device's ``week_12`` survey.

    Starting from the date of the device's first ``week_12`` survey row, twelve
    consecutive 7-day windows are walked backwards. A row whose ``ts_col``
    falls in window ``i`` (``start <= date < end``) gets
    ``wknum_wrt_wk12 = "wknum_<i>"``; window 12 ends at the survey date.
    Rows outside all twelve windows keep ``None``.

    Parameters
    ----------
    df : pandas.DataFrame
        Sensor rows; needs ``device_id`` and ``ts_col``. Not modified.
    ts_col : str
        Date column compared in both ``df`` and ``df_survey``
        (the notebook passes ``"date_dt"``).
    df_survey : pandas.DataFrame
        Needs ``device_id``, ``timestamp_dt``, ``redcap_event_name`` and ``ts_col``.
    print_out : bool
        Print a one-line progress header.
    keep_without_wk12 : bool, default False
        With the default, devices that have no ``week_12`` survey are dropped
        from the result entirely (the historical behaviour, which every
        later step in ``add_all_epoch_labels`` inherits). Pass True to keep
        their rows with ``wknum_wrt_wk12 = None``.

    Returns
    -------
    pandas.DataFrame
        Labelled rows, concatenated per device. Devices not in ``df_survey``
        are always dropped. If nothing is left, an empty frame with the new
        column is returned.
    """
    # `datetime` is not imported explicitly at the top of the notebook, so import locally.
    from datetime import timedelta

    df_survey = df_survey.sort_values(by=["device_id", "timestamp_dt"])
    df_did_list = []
    if print_out:
        print ("Processing did counts...")
    for did in df_survey["device_id"].unique():
        df_survey_did = df_survey.loc[df_survey["device_id"] == did, [ts_col, "redcap_event_name"]]
        df_survey_did_week_12 = df_survey_did[df_survey_did["redcap_event_name"] == "week_12"]
        # Copy so the caller's frame is never written to.
        df_did = df[df["device_id"] == did].copy()
        df_did["wknum_wrt_wk12"] = None
        if len(df_survey_did_week_12) == 0:
            if keep_without_wk12:
                df_did_list.append(df_did)
            continue

        curr_end_date = df_survey_did_week_12[ts_col].iloc[0]
        for week_num in range(12, 0, -1):
            curr_start_date = curr_end_date - timedelta(days=7)
            in_week = (df_did[ts_col] >= curr_start_date) & (df_did[ts_col] < curr_end_date)
            df_did.loc[in_week, "wknum_wrt_wk12"] = "wknum_{0}".format(week_num)
            curr_end_date = curr_start_date
        df_did_list.append(df_did)

    if not df_did_list:
        # pd.concat([]) raises; hand back an empty, correctly shaped frame instead.
        empty = df.iloc[0:0].copy()
        empty["wknum_wrt_wk12"] = None
        return empty
    return pd.concat(df_did_list)


def get_during_study_lbl(df, df_survey, print_out=True):
    ''' Generates a label that can be used to exclude samples before week_0 and after the last week
    Includes the days week_0 and week_last surveys were taken

    For every device in ``df_survey`` the first and last survey ``date_dt``
    (in ``timestamp_dt`` order) delimit the study period. Rows of ``df`` whose
    ``date_dt`` lies inside it (both ends inclusive) get ``is_during_study = True``;
    all other rows keep ``None`` (not ``False``), so filter with ``== True``.

    ``df`` is not modified. Devices that do not appear in ``df_survey`` are
    dropped from the result. If ``df_survey`` has no devices, an empty frame
    with the new column is returned.
    '''
    df_survey = df_survey.sort_values(by=["device_id", "timestamp_dt"])
    df_did_list = []
    if print_out:
        print ("Processing did counts...")
    for did in df_survey["device_id"].unique():
        survey_dates = df_survey.loc[df_survey["device_id"] == did, "date_dt"]
        first_survey_date = survey_dates.iloc[0]
        last_survey_date = survey_dates.iloc[-1]
        # Copy so the caller's frame is never written to.
        df_did = df[df["device_id"] == did].copy()
        df_did["is_during_study"] = None
        in_study = (df_did["date_dt"] >= first_survey_date) & (df_did["date_dt"] <= last_survey_date)
        df_did.loc[in_study, "is_during_study"] = True
        df_did_list.append(df_did)

    if not df_did_list:
        # pd.concat([]) raises; hand back an empty, correctly shaped frame instead.
        empty = df.iloc[0:0].copy()
        empty["is_during_study"] = None
        return empty
    return pd.concat(df_did_list)

        
        

def add_all_epoch_labels(df, df_survey, print_out=True):
    """Run every labelling helper on ``df`` and return the fully labelled frame.

    Order: phase, biweekly/monthly survey labels, day-of-week, time-of-day,
    week number relative to week 12, is_during_study.
    ``df`` needs ``device_id``, ``timestamp_dt`` and ``date_dt``.

    Note that the per-device helpers only return rows of devices they
    processed: devices absent from ``df_survey`` and, in ``add_wknum_wrt_wk12``,
    devices without a ``week_12`` survey do not appear in the result.
    """
    labelling_steps = (
        # phase / phase_wrapper
        lambda frame: add_phase_label_wrapper(frame, "timestamp_dt"),
        # biweekly / monthly survey labels
        lambda frame: add_week_label(frame, "date_dt", df_survey, print_out),
        # weekday vs weekend
        lambda frame: add_dow_label(frame, "timestamp_dt"),
        # six-hour time-of-day bins
        lambda frame: add_tod_label(frame, "timestamp_dt"),
        # week number with respect to week 12
        lambda frame: add_wknum_wrt_wk12(frame, "date_dt", df_survey, print_out),
        # inside the first..last survey span
        lambda frame: get_during_study_lbl(frame, df_survey, print_out),
    )
    for apply_step in labelling_steps:
        df = apply_step(df)
    return df
    
    
        
        


In [6]:
# # ## GET SURVEY DATA AND MAP TO DEVICE_ID, ADD EXTRA COLS
# MAPPING_DF = pd.read_csv(PRT_TO_DID_MAPPING_FILE)
# MAPPING_DICT = MAPPING_DF.set_index("participant_id")["final_device_id"].to_dict()
# FB_MAPPING_DICT = MAPPING_DF.set_index("fitbit_id")["final_device_id"].to_dict()

# df_survey = pd.read_csv(SURVEY_PATH)
# df_survey["device_id"] = df_survey["record_id"].apply((lambda x: MAPPING_DICT[x]))
# df_survey = add_survey_time_cols(df_survey)
# df_survey = df_survey.sort_values(by=["device_id", "timestamp_dt"])

# ## Add PHASE
# df_survey =  add_phase_label_wrapper(df_survey, "timestamp_dt")  


# # ## GET CLEAN EMA DATA, ADD EXTRA COLS
# # df_ema = pd.read_csv(EMA_PATH)
# # df_ema["timestamp_dt"] = df_ema["timestamp"].apply(sensor_time_to_local)
# # df_ema["date_dt"] = df_ema["timestamp_dt"].dt.date
# # df_ema = df_ema.sort_values(by=["device_id", "timestamp_dt"])

# # ## NORM 
# # if TO_NORM_PER_PERSON:
# #     df_ema = norm_per_person_wrapper(df_ema, ["depression", "tiredness"], "device_id")



# # df_ema_per_day = df_ema.groupby(["device_id", "date_dt"]).agg(mean_dep=("depression", "mean"),\
# #                                                               max_dep=("depression", "max"),\
# #                                                               n_dep=("depression", "count"),\
# #                                                               mean_tir=("tiredness", "mean"),\
# #                                                               max_tir=("tiredness", "max"),\
# #                                                               n_tir=("tiredness", "count"),\
# #                                                              ).reset_index()






In [7]:
# display(df_survey.columns)
# display(df_survey.head(20))


# Create feature function for means

In [8]:
def add_features_mean(g, INPUT_FEAT_NAMES):
    """Collapse a group ``g`` into one value per feature in ``INPUT_FEAT_NAMES``.

    * empty group  -> NaN for every feature (columns are not even read)
    * single row   -> that row's value, taken as is
    * otherwise    -> the column mean

    Returns a ``pd.Series`` indexed by feature name.
    """
    n_rows = len(g)
    if n_rows == 0:
        summarise = lambda feat: np.nan
    elif n_rows == 1:
        summarise = lambda feat: g[feat].iloc[0]
    else:
        summarise = lambda feat: g[feat].mean()
    return pd.Series({feat: summarise(feat) for feat in INPUT_FEAT_NAMES})


# Bluetooth - clean and add phase

In [None]:
# Load the raw Bluetooth scans, derive local timestamp / date columns,
# and order the rows per device in time.
df_blue = (
    pd.read_csv(BLUE_PATH)
    .assign(timestamp_dt=lambda raw: raw["timestamp"].apply(sensor_time_to_local))
    .assign(date_dt=lambda raw: raw["timestamp_dt"].dt.date)
    .sort_values(by=["device_id", "timestamp_dt"])
)

# blue_input_feats = ["num_scans_of_most_frequent_device", \
#                     "num_scans_of_least_frequent_device", \
#                     "number_unique_devices", \
#                     "num_scans_of_most_frequent_device_of_others", \
#                     "num_scans_of_least_frequent_device_of_others", \
#                     "number_unique_devices_of_others", \
#                     "num_scans_of_most_frequent_device_of_self", \
#                     "num_scans_of_least_frequent_device_of_self", \
#                     "number_unique_devices_of_self", \
#                     "sum_num_scans_of_all_devices_of_self", \
#                     "sum_num_scans_of_all_devices_of_others", \
#                     "avg_num_scans_of_all_devices_of_self", \
#                     "avg_num_scans_of_all_devices_of_others"]

## ADD ALL LABELS
df_blue = add_all_epoch_labels(df_blue, df_survey)
# is_during_study holds True or None, hence the explicit comparison with True.
# Keeping only in-study rows helps exclude incomplete days.
df_blue_in_study = df_blue.loc[df_blue["is_during_study"].eq(True)]





In [None]:
# display(df_blue[df_blue["biweekly_lbl"].isnull()])
# display(df_blue_in_study[df_blue_in_study["biweekly_lbl"].isnull()])

# display(df_blue.iloc[550:700])
# display(df_blue.head(20))
# Peek at the first two in-study Bluetooth rows, including the label columns added above.
display(df_blue_in_study.head(2))


In [None]:
# display (df_blue.head(2))

def get_df_blue_clusters(df_blue):
    """Cluster each device's Bluetooth addresses by scan frequency and pick out its own devices.

    Steps:
    1. Per (device_id, bt_address): ``freq`` (number of rows with a non-null
       ``date_dt``), ``numdays`` (distinct ``date_dt`` values) and
       ``avgfreq = freq / numdays``.
    2. Per device_id: pass that device's rows to ``cluster_address_freq``.
       When it reports a cluster count other than None / 0, ``getOwnDevices``
       is called on the clustered rows; otherwise the own-device list is empty.

    Returns a 4-tuple:
    * all clustered rows concatenated,
    * a per-device frame (index ``device_id``; columns ``num_clusters``,
      ``own_devices_list``, ``num_own_devices``),
    * the ``own_devices_list`` column of that frame (a Series, despite the
      ``_dict`` suffix used by callers),
    * a plain dict device_id -> own-device list.
    """
    pair_cols = ["device_id", "bt_address"]

    ## Scan count and number of distinct days for each (device, address) pair
    scan_counts = df_blue.groupby(pair_cols).count()[["date_dt"]].rename(columns={"date_dt": "freq"})
    day_counts = df_blue.groupby(pair_cols)["date_dt"].nunique().to_frame().rename(columns={"date_dt": "numdays"})

    df_blue_avgfreq = scan_counts.join(day_counts)
    df_blue_avgfreq["avgfreq"] = df_blue_avgfreq["freq"] / df_blue_avgfreq["numdays"]
    df_blue_avgfreq = df_blue_avgfreq.reset_index().sort_values(
        by=["device_id", "freq", "bt_address"], ascending=[True, False, True]
    )

    ## Clustering for each device_id separately
    clustered_frames = []
    per_user = {"device_id": [], "num_clusters": [], "own_devices_list": [], "num_own_devices": []}
    did_to_own_devices_list = {}
    for did in df_blue_avgfreq["device_id"].unique():
        did_rows = df_blue_avgfreq[df_blue_avgfreq["device_id"] == did]
        clustered_did, numclust_did = cluster_address_freq(did_rows)
        clustered_frames.append(clustered_did)

        no_clusters = (numclust_did is None) or (numclust_did == 0)
        if no_clusters:
            own_for_frame, own_for_lookup = [], []
        else:
            own_for_frame = own_for_lookup = getOwnDevices(clustered_did)

        per_user["device_id"].append(did)
        per_user["num_clusters"].append(numclust_did)
        per_user["own_devices_list"].append(own_for_frame)
        per_user["num_own_devices"].append(len(own_for_frame))
        did_to_own_devices_list[did] = own_for_lookup

    df_blue_clust = pd.concat(clustered_frames)
    df_blue_user_clust = pd.DataFrame(per_user).set_index("device_id")
    df_blue_user_clust_owndevices_dict = df_blue_user_clust["own_devices_list"]
    return (df_blue_clust, df_blue_user_clust, df_blue_user_clust_owndevices_dict, did_to_own_devices_list)



In [None]:
## FOR CLUSTERING ONLY USE SAMPLES IN_DURING_STUDY (complete days)
## Unpacks the four results of get_df_blue_clusters; `did_to_own_devices_list` is the
## device_id -> own-device list lookup consumed by the feature functions below.
df_blue_clust, df_blue_user_clust, df_blue_user_clust_owndevices_dict, did_to_own_devices_list = get_df_blue_clusters(df_blue_in_study)


In [None]:
# display (df_blue_clust.head(2))
# display (df_blue_user_clust.head(2))
# Row counts: clustered (device, address) rows, then one row per device.
print (len(df_blue_clust))
print (len(df_blue_user_clust))

# First two per-device cluster summaries.
display (df_blue_user_clust.head(2))


### Bluetooth get per phase/week features

In [None]:
def add_features_blue(g, did_to_own_devices_list):
    """Compute the Bluetooth features for one group of scans ``g``.

    The device id is read from ``g["device_id"]`` when that column is present,
    otherwise from the first element of the group key (``g.name[0]``). It is
    used to look up the device's own Bluetooth addresses in
    ``did_to_own_devices_list``.

    Returns a ``pd.Series`` with one entry per feature, keyed by the name of
    the feature function. The std-based features are left out on purpose:
    they produced too many nulls.
    """
    did = g["device_id"].iloc[0] if "device_id" in g.columns else g.name[0]
    owndevices = did_to_own_devices_list[did]

    # Features over every scanned device: called as fn(g).
    all_device_feats = (
        ("num_scans_of_most_frequent_device", num_scans_of_most_frequent_device),
        ("num_scans_of_least_frequent_device", num_scans_of_least_frequent_device),
        ("number_unique_devices", number_unique_devices),
    )
    # Features split into own vs. other devices: called as fn(g, [owndevices]).
    own_vs_other_feats = (
        ("num_scans_of_most_frequent_device_of_others", num_scans_of_most_frequent_device_of_others),
        ("num_scans_of_least_frequent_device_of_others", num_scans_of_least_frequent_device_of_others),
        ("number_unique_devices_of_others", number_unique_devices_of_others),
        ("num_scans_of_most_frequent_device_of_self", num_scans_of_most_frequent_device_of_self),
        ("num_scans_of_least_frequent_device_of_self", num_scans_of_least_frequent_device_of_self),
        ("number_unique_devices_of_self", number_unique_devices_of_self),
        ("sum_num_scans_of_all_devices_of_self", sum_num_scans_of_all_devices_of_self),
        ("sum_num_scans_of_all_devices_of_others", sum_num_scans_of_all_devices_of_others),
        ("avg_num_scans_of_all_devices_of_self", avg_num_scans_of_all_devices_of_self),
        ("avg_num_scans_of_all_devices_of_others", avg_num_scans_of_all_devices_of_others),
    )

    feats_dict = {}
    for feat_name, feat_fn in all_device_feats:
        feats_dict[feat_name] = feat_fn(g)
    for feat_name, feat_fn in own_vs_other_feats:
        # A fresh one-element list per call, exactly as each feature function was called before.
        feats_dict[feat_name] = feat_fn(g, [owndevices])
    return pd.Series(feats_dict)


def mean_blue_feats_across_days(df_grp, did_to_own_devices_list):
    """Average the daily Bluetooth features of one group.

    ``add_features_blue`` is evaluated once per ``date_dt`` in ``df_grp`` and
    the resulting per-day feature rows are averaged column by column.

    Returns a one-row DataFrame (index ``[0]``) with one column per feature.
    """
    # Keep date_dt in the index rather than using as_index=False: depending on the
    # pandas version the key can otherwise come back as a regular column, and
    # averaging a column of date objects fails on recent releases.
    df_grp_feats_per_day = df_grp.groupby("date_dt").apply(
        lambda day_rows: add_features_blue(day_rows, did_to_own_devices_list)
    )
    df_grp_feats_per_day_mean = df_grp_feats_per_day.mean().to_frame().transpose()
    return df_grp_feats_per_day_mean



In [None]:
##per phase - use only complete days so that mean is not skewed
##we are averaging across days as people may have different number of days in each phase
# Each entry: (extra grouping columns on top of device_id + phase_wrapper, feat_type used in the file name)
BLUE_PHASE_SLICES = [
    ([], "per_phase"),
    (["tod_lbl"], "per_phase_tod"),
    (["dow_lbl"], "per_phase_dow"),
    (["tod_lbl", "dow_lbl"], "per_phase_tod_dow"),
]

for blue_extra_cols, blue_feat_type in BLUE_PHASE_SLICES:
    blue_group_cols = ["device_id", "phase_wrapper"] + blue_extra_cols
    df_blue_phase_feats = (
        df_blue_in_study
        .groupby(blue_group_cols)
        .apply(lambda x: mean_blue_feats_across_days(x, did_to_own_devices_list))
        .reset_index()
    )
    # mean_blue_feats_across_days returns a one-row frame, so reset_index() leaves one
    # unnamed index level behind; it is named after its position, i.e. the number of group columns.
    df_blue_phase_feats = df_blue_phase_feats.drop(columns=["level_{0}".format(len(blue_group_cols))])
    output_features(df_blue_phase_feats, "blue", blue_feat_type)




# # per-phase
# df_blue_phase_feats = df_blue.groupby(["device_id", "phase_wrapper"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_phase_feats, "blue", "per_phase")
# # per-phase tod
# df_blue_phase_feats = df_blue.groupby(["device_id", "phase_wrapper", "tod_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_phase_feats, "blue", "per_phase_tod")
# # per-phase dow
# df_blue_phase_feats = df_blue.groupby(["device_id", "phase_wrapper", "dow_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_phase_feats, "blue", "per_phase_dow")
# # per-phase tod, dow
# df_blue_phase_feats = df_blue.groupby(["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_phase_feats, "blue", "per_phase_tod_dow")


In [None]:
# display(df_blue_phase_feats.head(10))


In [None]:
# # biweekly
# df_blue_biweekly_feats = df_blue.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_biweekly_feats, "blue", "biweekly")
# # biweekly tod
# df_blue_biweekly_feats = df_blue.groupby(["device_id", "biweekly_lbl", "tod_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_biweekly_feats, "blue", "biweekly_tod")
# # biweekly dow
# df_blue_biweekly_feats = df_blue.groupby(["device_id", "biweekly_lbl", "dow_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_biweekly_feats, "blue", "biweekly_dow")
# # biweekly tod, dow
# df_blue_biweekly_feats = df_blue.groupby(["device_id", "biweekly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_biweekly_feats, "blue", "biweekly_tod_dow")


In [None]:
# # monthly
# df_blue_monthly_feats = df_blue.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_monthly_feats, "blue", "monthly")
# # monthly tod
# df_blue_monthly_feats = df_blue.groupby(["device_id", "monthly_lbl", "tod_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_monthly_feats, "blue", "monthly_tod")
# # monthly dow
# df_blue_monthly_feats = df_blue.groupby(["device_id", "monthly_lbl", "dow_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_monthly_feats, "blue", "monthly_dow")
# # monthly tod, dow
# df_blue_monthly_feats = df_blue.groupby(["device_id", "monthly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_monthly_feats, "blue", "monthly_tod_dow")


In [None]:
# # weekly
# df_blue_weekly_feats = df_blue.groupby(["device_id", "wknum_wrt_wk12"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_weekly_feats, "blue", "weekly")
# # weekly tod
# df_blue_weekly_feats = df_blue.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_weekly_feats, "blue", "weekly_tod")
# # weekly dow
# df_blue_weekly_feats = df_blue.groupby(["device_id", "wknum_wrt_wk12", "dow_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_weekly_feats, "blue", "weekly_dow")
# # weekly tod, dow
# df_blue_weekly_feats = df_blue.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_blue(x, did_to_own_devices_list))).reset_index()
# output_features(df_blue_weekly_feats, "blue", "weekly_tod_dow")


In [None]:
# display(df_blue_biweekly_feats.head(2))
# display(df_blue_monthly_feats.head(2))

# display(df_blue_biweekly_feats.isnull().sum(axis = 0))
# display(df_blue_monthly_feats.isnull().sum(axis = 0))


# Calls - clean and add phase


In [None]:
# Load the raw call log, derive local timestamp / date columns,
# and order the rows per device in time.
df_calls = (
    pd.read_csv(CALLS_PATH)
    .assign(timestamp_dt=lambda raw: raw["timestamp"].apply(sensor_time_to_local))
    .assign(date_dt=lambda raw: raw["timestamp_dt"].dt.date)
    .sort_values(by=["device_id", "timestamp_dt"])
)




In [None]:
# Cache: original device id -> resolved device type, filled by get_device_type.
DID_MAPPED = {}
def get_device_type(did_old, df_calls_in):
    """Return the device type for the original device id ``did_old``.

    Resolution order:
    1. the ``DID_MAPPED`` cache,
    2. the ``DID_TO_TYPE`` lookup,
    3. a guess from the call log: "iPhone" if the device has at least one row
       with ``call_type == 4``, otherwise "Android". The guess is printed.

    The result is stored in ``DID_MAPPED``. Because the cache is keyed by
    ``did_old`` only, a guess made from one call frame is reused for every
    later frame.

    Parameters
    ----------
    did_old : hashable
        Value of the ``device_id_old`` column.
    df_calls_in : pandas.DataFrame
        Call log with ``device_id_old`` and ``call_type``; only read, and only
        when the id is neither cached nor in ``DID_TO_TYPE``.
    """
    if did_old in DID_MAPPED:
        return DID_MAPPED[did_old]

    if did_old in DID_TO_TYPE:
        dtype = DID_TO_TYPE[did_old]
    else:
        # Read-only check, so no copy of the (large) call frame is needed.
        has_call_type_4 = (
            (df_calls_in["device_id_old"] == did_old) & (df_calls_in["call_type"] == 4)
        ).any()
        dtype = "iPhone" if has_call_type_4 else "Android"
        print ("{0} not in aware_device (guess = {1})".format(did_old, dtype))
    DID_MAPPED[did_old] = dtype
    return dtype
        

def reformat_ios_data(g_in):
    """Collapse iPhone call-event sequences into one row per call.

    Rows whose ``device_type`` is "iPhone" are reduced to the rows with
    ``call_type == 4`` that close a recognised sequence, and those rows are
    relabelled:

      * previous two events ``1, 2``  -> ``call_type = 1`` (incoming)
      * previous two events ``3, 2``  -> ``call_type = 2`` (outgoing)
      * previous event ``1`` or ``3`` -> ``call_type = 3`` (missed)

    All other iPhone rows are dropped. Non-iPhone rows pass through unchanged.

    "Previous" is evaluated within each ``device_id`` when that column exists,
    so the last events of one device cannot complete a sequence for the next
    device in the frame. Rows must already be in time order within each device.

    Parameters
    ----------
    g_in : DataFrame with ``device_type``, ``call_type`` and ``timestamp_dt``
        columns (optionally ``device_id``), or None.

    Returns
    -------
    DataFrame sorted by ``timestamp_dt`` (original index labels kept), or None
    when ``g_in`` is None.
    """
    if g_in is None:
        return None
    is_ios = g_in["device_type"] == "iPhone"
    g = g_in[is_ios]
    g_android = g_in[~is_ios]
    if len(g) > 0:
        call_type = g["call_type"]
        if "device_id" in g.columns:
            by_device = g.groupby("device_id")["call_type"]
            prev_1, prev_2 = by_device.shift(1), by_device.shift(2)
        else:
            # No device column: treat the frame as a single event stream.
            prev_1, prev_2 = call_type.shift(1), call_type.shift(2)
        ended = call_type == 4
        # .assign returns new frames, so the filtered slices are never written to.
        df_incoming = g[ended & (prev_1 == 2) & (prev_2 == 1)].assign(call_type=1)
        df_outgoing = g[ended & (prev_1 == 2) & (prev_2 == 3)].assign(call_type=2)
        df_missed = g[ended & prev_1.isin([1, 3])].assign(call_type=3)
        # DataFrame.append was removed in pandas 2.0; concat only the non-empty
        # pieces so empty frames cannot influence the resulting dtypes.
        parts = [part for part in (df_incoming, df_outgoing, df_missed, g_android) if len(part) > 0]
        df = pd.concat(parts) if parts else g.iloc[0:0]
    else:
        df = g_android
    df = df.sort_values(by=['timestamp_dt'])
    return df

      

# Resolve the device type of every call row from its device_id_old, then pass the
# frame through reformat_ios_data to rewrite the iPhone rows.
df_calls["device_type"] = df_calls["device_id_old"].map(
    lambda did_old: get_device_type(did_old, df_calls)
)
df_calls = reformat_ios_data(df_calls)


# calls_input_feats = ["number_outgoing_calls", "number_incoming_calls", "number_missed_calls", \
#                      "duration_outgoing_calls_seconds", "duration_incoming_calls_seconds", \
#                      "number_of_correspondents_phone"]


# ## Add PHASE
# df_calls =  add_phase_label_wrapper(df_calls, "timestamp_dt")  

# # Add week label
# df_calls =  add_week_label(df_calls, "date_dt", df_survey)  

## ADD ALL LABELS
df_calls = add_all_epoch_labels(df_calls, df_survey)
# keep only rows flagged as during the study -- helps exclude incomplete days
df_calls_in_study = df_calls.loc[df_calls["is_during_study"] == True]



In [None]:
# Sanity check: show the first two rows of the call frame.
display(df_calls.head(2))

# tmp = df_calls[(df_calls["call_type"]==4)]
# print (len(tmp))


### Calls - feature function

In [None]:
def add_features_calls(g):
    """Compute the call features for one group of call rows.

    Every feature is the result of calling the extractor of the same name on
    ``g``; the extractors run in the order listed below.

    Returns
    -------
    pd.Series indexed by feature name.
    """
    extractors = (
        ("number_outgoing_calls", number_outgoing_calls),
        ("number_incoming_calls", number_incoming_calls),
        ("number_missed_calls", number_missed_calls),
        ("duration_outgoing_calls_seconds", duration_outgoing_calls_seconds),
        ("duration_incoming_calls_seconds", duration_incoming_calls_seconds),
        ("number_of_correspondents_phone", number_of_correspondents_phone),
    )
    return pd.Series({feat_name: extract(g) for feat_name, extract in extractors})

def mean_calls_feats_across_days(df_grp):
    """Average the per-day call features of one group.

    The rows are split by ``date_dt``, ``add_features_calls`` is applied to
    each day, and the column-wise mean over days is returned as a one-row
    DataFrame.
    """
    per_day = df_grp.groupby("date_dt", as_index=False).apply(add_features_calls)
    per_day = per_day.reset_index(drop=True)
    return per_day.mean().to_frame().T


### Calls - Get per-phase/week features


In [None]:
## per phase - use only complete days so that the mean is not skewed
## we average across days because people may have a different number of days in each phase
calls_phase_groupings = [
    (["device_id", "phase_wrapper"], "per_phase"),
    (["device_id", "phase_wrapper", "tod_lbl"], "per_phase_tod"),
    (["device_id", "phase_wrapper", "dow_lbl"], "per_phase_dow"),
    (["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"], "per_phase_tod_dow"),
]
for calls_grp_cols, calls_epoch_name in calls_phase_groupings:
    df_calls_phase_feats = (
        df_calls_in_study
        .groupby(calls_grp_cols)
        .apply(lambda x: mean_calls_feats_across_days(x))
        .reset_index()
    )
    # the index of the one-row frame returned per group comes back as an extra
    # "level_<n>" column after the n group keys; it carries no information
    df_calls_phase_feats = df_calls_phase_feats.drop(columns=["level_{0}".format(len(calls_grp_cols))])
    output_features(df_calls_phase_feats, "calls", calls_epoch_name)




# # per-phase
# df_calls_phase_feats = df_calls.groupby(["device_id", "phase_wrapper"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_phase_feats, "calls", "per_phase")
# # per-phase tod
# df_calls_phase_feats = df_calls.groupby(["device_id", "phase_wrapper", "tod_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_phase_feats, "calls", "per_phase_tod")
# # per-phase dow
# df_calls_phase_feats = df_calls.groupby(["device_id", "phase_wrapper", "dow_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_phase_feats, "calls", "per_phase_dow")
# # per-phase tod, dow
# df_calls_phase_feats = df_calls.groupby(["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_phase_feats, "calls", "per_phase_tod_dow")


In [None]:
# display(df_calls_phase_feats.head(2))
# display(df_calls_phase_feats.isnull().sum(axis = 0))


# # biweekly
# df_calls_biweekly_feats = df_calls.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_biweekly_feats, "calls", "biweekly")
# # biweekly tod
# df_calls_biweekly_feats = df_calls.groupby(["device_id", "biweekly_lbl", "tod_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_biweekly_feats, "calls", "biweekly_tod")
# # biweekly dow
# df_calls_biweekly_feats = df_calls.groupby(["device_id", "biweekly_lbl", "dow_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_biweekly_feats, "calls", "biweekly_dow")
# # biweekly tod, dow
# df_calls_biweekly_feats = df_calls.groupby(["device_id", "biweekly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_biweekly_feats, "calls", "biweekly_tod_dow")



In [None]:
# df_calls_biweekly_feats = df_calls.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# df_calls_monthly_feats = df_calls.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()

# # monthly
# df_calls_monthly_feats = df_calls.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_monthly_feats, "calls", "monthly")
# # monthly tod
# df_calls_monthly_feats = df_calls.groupby(["device_id", "monthly_lbl", "tod_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_monthly_feats, "calls", "monthly_tod")
# # monthly dow
# df_calls_monthly_feats = df_calls.groupby(["device_id", "monthly_lbl", "dow_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_monthly_feats, "calls", "monthly_dow")
# # monthly tod, dow
# df_calls_monthly_feats = df_calls.groupby(["device_id", "monthly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_monthly_feats, "calls", "monthly_tod_dow")



In [None]:
# # weekly
# df_calls_weekly_feats = df_calls.groupby(["device_id", "wknum_wrt_wk12"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_weekly_feats, "calls", "weekly")
# # weekly tod
# df_calls_weekly_feats = df_calls.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_weekly_feats, "calls", "weekly_tod")
# # weekly dow
# df_calls_weekly_feats = df_calls.groupby(["device_id", "wknum_wrt_wk12", "dow_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_weekly_feats, "calls", "weekly_dow")
# # weekly tod, dow
# df_calls_weekly_feats = df_calls.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_calls(x))).reset_index()
# output_features(df_calls_weekly_feats, "calls", "weekly_tod_dow")


In [None]:
# display(df_calls_biweekly_feats.head(2))
# display(df_calls_biweekly_feats.isnull().sum(axis = 0))
# display(df_calls_monthly_feats.head(2))
# display(df_calls_monthly_feats.isnull().sum(axis = 0))

# ## OUTPUT
# output_features(df_calls_biweekly_feats, "calls", "biweekly")
# output_features(df_calls_monthly_feats, "calls", "monthly")


# Msgs - clean and add phase


In [None]:
# Load the message log, add a converted timestamp (via sensor_time_to_local) and
# its calendar date, and order the rows by time within each device.
df_msgs = (
    pd.read_csv(MSGS_PATH)
    .assign(timestamp_dt=lambda msgs: msgs["timestamp"].apply(sensor_time_to_local))
    .assign(date_dt=lambda msgs: msgs["timestamp_dt"].dt.date)
    .sort_values(by=["device_id", "timestamp_dt"])
)

# msgs_input_feats = ["number_of_outgoing_messages", \
#                     "number_of_incoming_messages", \
#                     "number_of_correspondents"
#                    ]



# ## Add PHASE
# df_msgs =  add_phase_label_wrapper(df_msgs, "timestamp_dt")  

# # Add week label
# df_msgs =  add_week_label(df_msgs, "date_dt", df_survey)

## ADD ALL LABELS
df_msgs = add_all_epoch_labels(df_msgs, df_survey)
# keep only rows flagged as during the study -- helps exclude incomplete days
df_msgs_in_study = df_msgs.loc[df_msgs["is_during_study"] == True]


In [None]:
# Sanity check: show the first two rows of the message frame.
display(df_msgs.head(2))


### Messages - feature function


In [None]:
def add_features_msgs(g):
    """Compute the message features for one group of message rows.

    Every feature is the result of calling the extractor of the same name on
    ``g``; the extractors run in the order listed below.

    Returns
    -------
    pd.Series indexed by feature name.
    """
    extractors = (
        ("number_of_outgoing_messages", number_of_outgoing_messages),
        ("number_of_incoming_messages", number_of_incoming_messages),
        ("number_of_correspondents", number_of_correspondents),
    )
    return pd.Series({feat_name: extract(g) for feat_name, extract in extractors})

def mean_msgs_feats_across_days(df_grp):
    """Average the per-day message features of one group.

    The rows are split by ``date_dt``, ``add_features_msgs`` is applied to each
    day, and the column-wise mean over days is returned as a one-row DataFrame.
    """
    per_day = df_grp.groupby("date_dt", as_index=False).apply(add_features_msgs)
    per_day = per_day.reset_index(drop=True)
    return per_day.mean().to_frame().T



### Msgs - Get per-phase/week features


In [None]:
# per phase - use only complete days so that mean is not skewed
# we are averaging across days as people may have different number of days in each phase
# df_msgs_phase_feats = df_msgs_in_study.groupby(["device_id", "phase_wrapper"]).apply((lambda x: mean_msgs_feats_across_days(x))).reset_index()
# df_msgs_phase_feats = df_msgs_phase_feats.drop(columns=["level_2"])
# output_features(df_msgs_phase_feats, "msgs", "per_phase")

# per-phase tod
# df_msgs_phase_feats = df_msgs_in_study.groupby(["device_id", "phase_wrapper", "tod_lbl"]).apply((lambda x: mean_msgs_feats_across_days(x))).reset_index()
# df_msgs_phase_feats = df_msgs_phase_feats.drop(columns=["level_3"])
# output_features(df_msgs_phase_feats, "msgs", "per_phase_tod")

# per-phase dow
# df_msgs_phase_feats = df_msgs_in_study.groupby(["device_id", "phase_wrapper", "dow_lbl"]).apply((lambda x: mean_msgs_feats_across_days(x))).reset_index()
# df_msgs_phase_feats = df_msgs_phase_feats.drop(columns=["level_3"])
# output_features(df_msgs_phase_feats, "msgs", "per_phase_dow")

# per-phase tod, dow
# df_msgs_phase_feats = df_msgs_in_study.groupby(["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"]).apply((lambda x: mean_msgs_feats_across_days(x))).reset_index()
# df_msgs_phase_feats = df_msgs_phase_feats.drop(columns=["level_4"])
# output_features(df_msgs_phase_feats, "msgs", "per_phase_tod_dow")




# # per-phase
# df_msgs_phase_feats = df_msgs.groupby(["device_id", "phase_wrapper"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_phase_feats, "msgs", "per_phase")
# # per-phase tod
# df_msgs_phase_feats = df_msgs.groupby(["device_id", "phase_wrapper", "tod_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_phase_feats, "msgs", "per_phase_tod")
# # per-phase dow
# df_msgs_phase_feats = df_msgs.groupby(["device_id", "phase_wrapper", "dow_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_phase_feats, "msgs", "per_phase_dow")
# # per-phase tod, dow
# df_msgs_phase_feats = df_msgs.groupby(["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_phase_feats, "msgs", "per_phase_tod_dow")


In [None]:
# display(df_msgs_phase_feats.head(2))
# display(df_msgs_phase_feats.isnull().sum(axis = 0))
# df_msgs_biweekly_feats = df_msgs.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# df_msgs_monthly_feats = df_msgs.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()

# # biweekly
# df_msgs_biweekly_feats = df_msgs.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_biweekly_feats, "msgs", "biweekly")
# # biweekly tod
# df_msgs_biweekly_feats = df_msgs.groupby(["device_id", "biweekly_lbl", "tod_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_biweekly_feats, "msgs", "biweekly_tod")
# # biweekly dow
# df_msgs_biweekly_feats = df_msgs.groupby(["device_id", "biweekly_lbl", "dow_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_biweekly_feats, "msgs", "biweekly_dow")
# # biweekly tod, dow
# df_msgs_biweekly_feats = df_msgs.groupby(["device_id", "biweekly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_biweekly_feats, "msgs", "biweekly_tod_dow")




In [None]:
# # monthly
# df_msgs_monthly_feats = df_msgs.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_monthly_feats, "msgs", "monthly")
# # monthly tod
# df_msgs_monthly_feats = df_msgs.groupby(["device_id", "monthly_lbl", "tod_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_monthly_feats, "msgs", "monthly_tod")
# # monthly dow
# df_msgs_monthly_feats = df_msgs.groupby(["device_id", "monthly_lbl", "dow_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_monthly_feats, "msgs", "monthly_dow")
# # monthly tod, dow
# df_msgs_monthly_feats = df_msgs.groupby(["device_id", "monthly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_monthly_feats, "msgs", "monthly_tod_dow")


In [None]:
# # weekly
# df_msgs_weekly_feats = df_msgs.groupby(["device_id", "wknum_wrt_wk12"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_weekly_feats, "msgs", "weekly")
# # weekly tod
# df_msgs_weekly_feats = df_msgs.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_weekly_feats, "msgs", "weekly_tod")
# # weekly dow
# df_msgs_weekly_feats = df_msgs.groupby(["device_id", "wknum_wrt_wk12", "dow_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_weekly_feats, "msgs", "weekly_dow")
# # weekly tod, dow
# df_msgs_weekly_feats = df_msgs.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_msgs(x))).reset_index()
# output_features(df_msgs_weekly_feats, "msgs", "weekly_tod_dow")


In [None]:
# display(df_msgs_biweekly_feats.head(2))
# display(df_msgs_biweekly_feats.isnull().sum(axis = 0))
# display(df_msgs_monthly_feats.head(2))
# display(df_msgs_monthly_feats.isnull().sum(axis = 0))

# ## OUTPUT
# output_features(df_msgs_biweekly_feats, "msgs", "biweekly")
# output_features(df_msgs_monthly_feats, "msgs", "monthly")


# Sleep - Clean and add phase

In [None]:
# Load the daily sleep summary, parse its date column, map each fitbit_id to the
# study device_id (a fitbit_id missing from FB_MAPPING_DICT raises KeyError), and
# order the rows by date within each device.
df_slp = (
    pd.read_csv(SLP_PATH)
    .assign(timestamp_dt=lambda slp: pd.to_datetime(slp["date"], format="%Y-%m-%d"))
    .assign(date_dt=lambda slp: slp["timestamp_dt"].dt.date)
    .assign(device_id=lambda slp: slp["fitbit_id"].apply(lambda fid: FB_MAPPING_DICT[fid]))
    .sort_values(by=["device_id", "date_dt"])
)

# Daily sleep-summary columns used as feature inputs.
# 'mainSlp_duration' is left out on purpose: it is the same as mainSlp_timeInBed.
slp_input_feats = [
    'totalMinutesAsleep',
    'totalTimeInBed',
    'totalSleepRecords',
    'mainSlp_minutesAsleep',
    'mainSlp_timeInBed',
    'mainSlp_efficiency',
    'mainSlp_restlessDuration',
    'mainSlp_restlessCount',
    'mainSlp_startMinusMidntMin',
    'mainSlp_endMinusMidntMin',
]

# ## NORM 
# if TO_NORM_PER_PERSON:
#     df_slp = norm_per_person_wrapper(df_slp, ["totalMinutesAsleep", "mainSlp_minutesAsleep",\
#                                               "mainSlp_efficiency", "mainSlp_restlessDuration", \
#                                               "mainSlp_startMinusMidntMin", "mainSlp_endMinusMidntMin"
#                                              ], "device_id")
    
# ## Add PHASE
# df_slp =  add_phase_label_wrapper(df_slp, "timestamp_dt")  

# # Add week label
# df_slp =  add_week_label(df_slp, "date_dt", df_survey)  

## ADD ALL LABELS
df_slp = add_all_epoch_labels(df_slp, df_survey)
# keep only rows flagged as during the study -- helps exclude incomplete days
df_slp_in_study = df_slp.loc[df_slp["is_during_study"] == True]


In [None]:
# display(df_slp.head(2))
# print(df_slp.columns)

# display(df_slp["timestamp_dt"].head(2))

# tmp = df_slp[(df_slp["mainSlp_duration"]!=df_slp["mainSlp_timeInBed"])]
# display(tmp)

### Sleep - intraday features for tod_lbl AND tod_lbl+dow_lbl
- 'totalMinutesAsleep', 'totalTimeInBed', 'totalSleepRecords'
- Then for the longest sleep record, get 'mainSlp_minutesAsleep', 'mainSlp_timeInBed',
'mainSlp_efficiency', 'mainSlp_restlessDuration', 'mainSlp_restlessCount', 
'mainSlp_startMinusMidntMin', 'mainSlp_endMinusMidntMin'


In [None]:
# strptime format of the "datetime" strings in the intraday sleep data
DATETIME_FORMAT_FOR_SLEEP_INTRADAY = "%Y-%m-%d %H:%M:%S"


def get_time_minus_midnt(dt, dt_midnt):
    """Return the signed difference ``dt - dt_midnt`` in minutes, as a float.

    The value is negative when ``dt`` lies before ``dt_midnt``.
    """
    elapsed = dt - dt_midnt
    minutes = elapsed.total_seconds() / 60.0
    return minutes

def get_slp_feats_per_day(df_grp):
    """Summarise the intraday sleep rows of one calendar day.

    Expects the columns ``value`` ("asleep" / "restless" / "awake"), ``logId``
    and ``datetime`` (string in DATETIME_FORMAT_FOR_SLEEP_INTRADAY), and a
    ``df_grp.name`` holding the day's date, as set by
    ``groupby("date_dt").apply``. Rows are presumably one per minute -- confirm
    against the intraday export.

    Returns
    -------
    pd.Series with row counts per state over all records, the number of
    distinct records, and the same counts plus restless-episode count and
    start/end offsets from midnight (minutes) for the "main" record, i.e. the
    ``logId`` with the most rows in this group.
    """
    state = df_grp["value"]

    # main record = the logId with the most rows on this day
    rows_per_log = (df_grp.groupby("logId")["value"].count()).reset_index()
    rows_per_log = rows_per_log.sort_values(by=["value"], ascending=False)
    main_logId = rows_per_log.iloc[0]["logId"]
    main_rec = df_grp[(df_grp["logId"] == main_logId)]
    main_state = main_rec["value"]

    # A restless episode begins wherever the state differs from the previous row
    # and the new state is "restless"; the first row has no predecessor and so
    # always counts as a change.
    state_changed = main_state != main_state.shift(periods=1)
    n_restless_episodes = int((state_changed & (main_state == "restless")).sum())

    main_first_dt = datetime.datetime.strptime(main_rec["datetime"].iloc[0], DATETIME_FORMAT_FOR_SLEEP_INTRADAY)
    main_last_dt = datetime.datetime.strptime(main_rec["datetime"].iloc[-1], DATETIME_FORMAT_FOR_SLEEP_INTRADAY)
    dt_midnt = datetime.datetime.combine(df_grp.name, datetime.time(0, 0))

    return pd.Series({
        "asleep": int((state == "asleep").sum()),
        "restless": int((state == "restless").sum()),
        "awake": int((state == "awake").sum()),
        "n_recs": len(list(df_grp["logId"].unique())),
        "main_logId": main_logId,
        "main_asleep": int((main_state == "asleep").sum()),
        "main_restless": int((main_state == "restless").sum()),
        "main_awake": int((main_state == "awake").sum()),
        "main_restless_count": n_restless_episodes,
        "main_startMinusMidntMin": get_time_minus_midnt(main_first_dt, dt_midnt),
        "main_endMinusMidntMin": get_time_minus_midnt(main_last_dt, dt_midnt),
    })



def get_slp_intraday_feats_for_grp(df_grp):
    """Average the per-day intraday sleep summaries of one group.

    ``get_slp_feats_per_day`` is applied to every ``date_dt`` in ``df_grp``;
    the returned Series holds the mean over days of each derived quantity,
    named like the columns of the daily sleep summary.
    """
    per_day = df_grp.groupby("date_dt").apply(get_slp_feats_per_day)

    # NOTE(review): time in bed is asleep + awake here; "restless" rows are not
    # included -- confirm this matches the intended definition.
    time_in_bed = per_day["asleep"] + per_day["awake"]
    main_time_in_bed = per_day["main_asleep"] + per_day["main_awake"]
    # per-day efficiency in percent, rounded to a whole number before averaging
    main_efficiency = round((per_day["main_asleep"] / main_time_in_bed) * 100.0, 0)

    return pd.Series({
        "totalMinutesAsleep": per_day["asleep"].mean(),
        "totalTimeInBed": time_in_bed.mean(),
        "totalSleepRecords": per_day["n_recs"].mean(),
        "mainSlp_minutesAsleep": per_day["main_asleep"].mean(),
        "mainSlp_timeInBed": main_time_in_bed.mean(),
        "mainSlp_efficiency": main_efficiency.mean(),
        "mainSlp_restlessDuration": per_day["main_restless"].mean(),
        "mainSlp_restlessCount": per_day["main_restless_count"].mean(),
        "mainSlp_startMinusMidntMin": per_day["main_startMinusMidntMin"].mean(),
        "mainSlp_endMinusMidntMin": per_day["main_endMinusMidntMin"].mean(),
    })
    

def get_slp_intraday_feats_for_did(did, df_survey, grpbycols, only_incld_during_study):
    """Compute grouped intraday sleep features for one device.

    Steps:
      1. map ``did`` to its fitbit id and read that id's intraday sleep CSV;
      2. add ``device_id``, ``timestamp_dt``, ``date_dt`` and the epoch labels
         from ``add_all_epoch_labels``;
      3. optionally keep only rows with ``is_during_study == True``
         (helps exclude incomplete days);
      4. group by ``grpbycols`` and apply ``get_slp_intraday_feats_for_grp``.

    Returns
    -------
    DataFrame with the group keys as columns followed by the feature columns.
    """
    fitbit_id = FB_MAPPING_DICT_INV[did]
    intraday = pd.read_csv(SLP_INTRADAY_PATH_LIKE.format(fitbit_id))
    intraday["device_id"] = did
    intraday["timestamp_dt"] = pd.to_datetime(intraday["datetime"], format="%Y-%m-%d %H:%M:%S")
    intraday["date_dt"] = intraday["timestamp_dt"].dt.date

    intraday = add_all_epoch_labels(intraday, df_survey, print_out=False)
    if only_incld_during_study:
        intraday = intraday[intraday["is_during_study"] == True]

    return intraday.groupby(grpbycols).apply(get_slp_intraday_feats_for_grp).reset_index()
    
    
def get_slp_intraday_feats(did_list, df_survey, grpbycols, only_incld_during_study):
    """Compute grouped intraday sleep features for every device in ``did_list``.

    Calls ``get_slp_intraday_feats_for_did`` once per device, printing a running
    1-based counter as progress, and concatenates the per-device frames.
    """
    print ("Processing did cnts intraday...")
    per_device_frames = []
    for position, did in enumerate(did_list, start=1):
        print ("{0}, ".format(position), end='')
        per_device_frames.append(
            get_slp_intraday_feats_for_did(did, df_survey, grpbycols, only_incld_during_study)
        )
    return pd.concat(per_device_frames)
    
    

In [None]:
# tmp = get_slp_intraday_feats(list(df_slp["device_id"].unique()), df_survey, ["device_id", "phase_wrapper", "tod_lbl"])

# display (tmp.head(5))




In [None]:
# print (len(list(tmp["device_id"].unique())))



### Sleep - Get per-phase/week features

In [None]:
# df_slp_phase_feats = df_slp.groupby(["device_id", "phase_wrapper"]).apply((lambda x: add_features_mean(x, slp_input_feats))).reset_index()
# output_features(df_slp_phase_feats, "slp", "per_phase")

# per-phase
slp_dids_in_study = list(df_slp_in_study["device_id"].unique())
slp_phase_outputs = [
    # (group-by columns, output label, computed from intraday data?)
    (["device_id", "phase_wrapper"], "per_phase", False),
    (["device_id", "phase_wrapper", "tod_lbl"], "per_phase_tod", True),  # VALID
    (["device_id", "phase_wrapper", "dow_lbl"], "per_phase_dow", False),
    (["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"], "per_phase_tod_dow", True),  # VALID
]
for slp_grp_cols, slp_epoch_name, slp_from_intraday in slp_phase_outputs:
    if slp_from_intraday:
        # time-of-day splits are computed from the intraday files
        df_slp_phase_feats = get_slp_intraday_feats(slp_dids_in_study, df_survey, slp_grp_cols, only_incld_during_study=True)
    else:
        # everything else is a mean over the daily sleep summaries
        df_slp_phase_feats = df_slp_in_study.groupby(slp_grp_cols).apply((lambda x: add_features_mean(x, slp_input_feats))).reset_index()
    output_features(df_slp_phase_feats, "slp", slp_epoch_name)


In [None]:
# display(df_slp_phase_feats.head(2))
# df_slp_biweekly_feats = df_slp.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_mean(x, slp_input_feats))).reset_index()

# # biweekly
# df_slp_biweekly_feats = df_slp.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_mean(x, slp_input_feats))).reset_index()
# output_features(df_slp_biweekly_feats, "slp", "biweekly")
# # biweekly tod -- VALID
# df_slp_biweekly_feats = get_slp_intraday_feats(list(df_slp["device_id"].unique()), df_survey, ["device_id", "biweekly_lbl", "tod_lbl"])
# output_features(df_slp_biweekly_feats, "slp", "biweekly_tod")
# # biweekly dow
# df_slp_biweekly_feats = df_slp.groupby(["device_id", "biweekly_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, slp_input_feats))).reset_index()
# output_features(df_slp_biweekly_feats, "slp", "biweekly_dow")
# # biweekly tod, dow -- VALID
# df_slp_biweekly_feats = get_slp_intraday_feats(list(df_slp["device_id"].unique()), df_survey, ["device_id", "biweekly_lbl", "tod_lbl", "dow_lbl"])
# output_features(df_slp_biweekly_feats, "slp", "biweekly_tod_dow")


In [None]:
# df_slp_monthly_feats = df_slp.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_mean(x, slp_input_feats))).reset_index()

# # monthly
# df_slp_monthly_feats = df_slp.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_mean(x, slp_input_feats))).reset_index()
# output_features(df_slp_monthly_feats, "slp", "monthly")
# # monthly tod -- VALID
# df_slp_monthly_feats = get_slp_intraday_feats(list(df_slp["device_id"].unique()), df_survey, ["device_id", "monthly_lbl", "tod_lbl"])
# output_features(df_slp_monthly_feats, "slp", "monthly_tod")
# # monthly dow
# df_slp_monthly_feats = df_slp.groupby(["device_id", "monthly_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, slp_input_feats))).reset_index()
# output_features(df_slp_monthly_feats, "slp", "monthly_dow")
# # monthly tod, dow -- VALID
# df_slp_monthly_feats = get_slp_intraday_feats(list(df_slp["device_id"].unique()), df_survey, ["device_id", "monthly_lbl", "tod_lbl", "dow_lbl"])
# output_features(df_slp_monthly_feats, "slp", "monthly_tod_dow")


In [None]:
# display(df_slp_biweekly_feats.head(2))
# display(df_slp_biweekly_feats.isnull().sum(axis = 0))
# display(df_slp_monthly_feats.head(2))
# display(df_slp_monthly_feats.isnull().sum(axis = 0))

# ## OUTPUT
# output_features(df_slp_biweekly_feats, "slp", "biweekly")
# output_features(df_slp_monthly_feats, "slp", "monthly")


# # weekly
# df_slp_weekly_feats = df_slp.groupby(["device_id", "wknum_wrt_wk12"]).apply((lambda x: add_features_mean(x, slp_input_feats))).reset_index()
# output_features(df_slp_weekly_feats, "slp", "weekly")
# # weekly tod -- VALID
# df_slp_weekly_feats = get_slp_intraday_feats(list(df_slp["device_id"].unique()), df_survey, ["device_id", "wknum_wrt_wk12", "tod_lbl"])
# output_features(df_slp_weekly_feats, "slp", "weekly_tod")
# # weekly dow
# df_slp_weekly_feats = df_slp.groupby(["device_id", "wknum_wrt_wk12", "dow_lbl"]).apply((lambda x: add_features_mean(x, slp_input_feats))).reset_index()
# output_features(df_slp_weekly_feats, "slp", "weekly_dow")
# # weekly tod, dow -- VALID
# df_slp_weekly_feats = get_slp_intraday_feats(list(df_slp["device_id"].unique()), df_survey, ["device_id", "wknum_wrt_wk12", "tod_lbl", "dow_lbl"])
# output_features(df_slp_weekly_feats, "slp", "weekly_tod_dow")


# Steps - Clean and add phase

In [None]:
# Load the daily step totals, parse the date column, map each fitbit_id to the
# study device_id (a fitbit_id missing from FB_MAPPING_DICT raises KeyError), and
# order the rows by date within each device.
df_steps = (
    pd.read_csv(STEPS_PATH)
    .assign(timestamp_dt=lambda steps: pd.to_datetime(steps["date"], format="%Y-%m-%d"))
    .assign(date_dt=lambda steps: steps["timestamp_dt"].dt.date)
    .assign(device_id=lambda steps: steps["fitbit_id"].apply(lambda fid: FB_MAPPING_DICT[fid]))
    .sort_values(by=["device_id", "date_dt"])
)

steps_input_feats = ["steps_total"]

# ## NORM 
# if TO_NORM_PER_PERSON:
#     df_steps = norm_per_person_wrapper(df_steps, ["steps_total"], "device_id")


# ## Add PHASE
# df_steps =  add_phase_label_wrapper(df_steps, "timestamp_dt")  

# # Add week label
# df_steps =  add_week_label(df_steps, "date_dt", df_survey)  


## ADD ALL LABELS
df_steps = add_all_epoch_labels(df_steps, df_survey)
# keep only rows flagged as during the study -- helps exclude incomplete days
df_steps_in_study = df_steps.loc[df_steps["is_during_study"] == True]


In [None]:
# Inspect rows 100-159 (by position) of the steps frame and list its columns.
display(df_steps.iloc[100:160])
print(df_steps.columns)

In [None]:
# Load the daily activity-level summary, parse the date column, map each
# fitbit_id to the study device_id (a fitbit_id missing from FB_MAPPING_DICT
# raises KeyError), and order the rows by date within each device.
df_steps_levels = (
    pd.read_csv(STEPS_LEVELS_PATH)
    .assign(timestamp_dt=lambda levels: pd.to_datetime(levels["date"], format="%Y-%m-%d"))
    .assign(date_dt=lambda levels: levels["timestamp_dt"].dt.date)
    .assign(device_id=lambda levels: levels["fitbit_id"].apply(lambda fid: FB_MAPPING_DICT[fid]))
    .sort_values(by=["device_id", "date_dt"])
)


In [None]:
# Activity-level column names used as feature inputs.
steps_levels_input_feats = [
    "veryActiveMinutes",
    "fairlyActiveMinutes",
    "lightlyActiveMinutes",
    "sedentaryMinutes",
]

# ## Add PHASE
# df_steps_levels =  add_phase_label_wrapper(df_steps_levels, "timestamp_dt")  
# # Add week label
# df_steps_levels =  add_week_label(df_steps_levels, "date_dt", df_survey)  


## ADD ALL LABELS
df_steps_levels = add_all_epoch_labels(df_steps_levels, df_survey)
# keep only rows flagged as during the study -- helps exclude incomplete days
df_steps_levels_in_study = df_steps_levels.loc[df_steps_levels["is_during_study"] == True]


In [None]:
# Sanity check: show the first two rows of the steps-levels frame and list its columns.
display(df_steps_levels.head(2))
print(df_steps_levels.columns)

### Steps and steps levels - intraday features for tod_lbl AND tod_lbl+dow_lbl


In [None]:
def get_steps_intraday_feats_for_grp(df_grp):
    """Return the mean daily total of ``value`` for one group.

    ``value`` is summed within each ``date_dt`` and the daily sums are then
    averaged, so days with more rows do not weigh more than other days.

    Returns
    -------
    pd.Series with the single entry ``"value"``.
    """
    daily_totals = df_grp.groupby("date_dt")["value"].sum()
    return pd.Series({"value": daily_totals.mean()})
    
    
def add_did_ts_epoch_cols(data_df, did, only_incld_during_study):
    data_df["device_id"] = did
    data_df["timestamp_dt"] = pd.to_datetime(data_df["datetime"], format="%Y-%m-%d %H:%M:%S")
    data_df["date_dt"] = data_df["timestamp_dt"].dt.date
    ## (2) add phase, tod, and dow lbls
    data_df =  add_all_epoch_labels(data_df, df_survey, print_out=False)
    if only_incld_during_study:
        data_df = data_df[(data_df["is_during_study"]==True)] # helps exclude incomplete days
    return data_df

def get_steps_intraday_feats_for_did(did, df_survey, grpbycols, only_incld_during_study):
    """Build the intraday steps / activity-level feature table for one device.

    For each of the five intraday sources (steps plus four activity levels) the
    device's CSV is loaded, labelled through ``add_did_ts_epoch_cols``, grouped
    by ``grpbycols`` and reduced with ``get_steps_intraday_feats_for_grp``; the
    five single-column results are then placed side by side.

    Parameters
    ----------
    did : object
        Device id; translated to a Fitbit id through ``FB_MAPPING_DICT_INV``.
    df_survey : pandas.DataFrame
        Accepted for interface compatibility but not referenced in this body
        (``add_did_ts_epoch_cols`` reads ``df_survey`` from the outer scope).
    grpbycols : list of str
        Columns to group by before aggregating.
    only_incld_during_study : bool
        Forwarded to ``add_did_ts_epoch_cols``.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping columns followed by one column
        per feature name listed below.
    """
    ## (1) did to fitbit id and load parsed csv
    fid = FB_MAPPING_DICT_INV[did]
    # Output column name -> path template of the matching intraday CSV.
    sources = (
        ("steps_total", STEPS_INTRADAY_PATH_LIKE),
        ("veryActiveMinutes", VERY_ACT_INTRADAY_PATH_LIKE),
        ("fairlyActiveMinutes", FAIRLY_ACT_INTRADAY_PATH_LIKE),
        ("lightlyActiveMinutes", LIGHTLY_ACT_INTRADAY_PATH_LIKE),
        ("sedentaryMinutes", SED_ACT_INTRADAY_PATH_LIKE),
    )
    # Load and label every source first, then aggregate (same order as before).
    labelled_frames = [
        (
            feat_name,
            add_did_ts_epoch_cols(
                pd.read_csv(path_like.format(fid)), did, only_incld_during_study
            ),
        )
        for feat_name, path_like in sources
    ]
    per_feat_tables = [
        frame.groupby(grpbycols)
        .apply(get_steps_intraday_feats_for_grp)
        .rename(columns={"value": feat_name})
        for feat_name, frame in labelled_frames
    ]
    return pd.concat(per_feat_tables, axis=1).reset_index()
    
    
def get_steps_intraday_feats(did_list, df_survey, grpbycols, only_incld_during_study):
    """Stack the intraday steps features of every device in ``did_list``.

    A running counter is printed (without newline) as each device is processed.

    Parameters
    ----------
    did_list : iterable
        Device ids to process, in order.
    df_survey : pandas.DataFrame
        Forwarded unchanged to ``get_steps_intraday_feats_for_did``.
    grpbycols : list of str
        Grouping columns, forwarded to ``get_steps_intraday_feats_for_did``.
    only_incld_during_study : bool
        Forwarded to ``get_steps_intraday_feats_for_did``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-device tables. When ``did_list`` is
        empty, an empty frame carrying only the grouping columns is returned
        instead of letting ``pd.concat`` fail on an empty list.
    """
    print ("Processing did cnts intraday...")
    did_df_list = []
    for cnt, did in enumerate(did_list, start=1):
        print ("{0}, ".format(cnt), end='')
        did_df_list.append(
            get_steps_intraday_feats_for_did(did, df_survey, grpbycols, only_incld_during_study)
        )
    if not did_df_list:
        # pd.concat([]) raises ValueError("No objects to concatenate").
        empty_cols = [grpbycols] if isinstance(grpbycols, str) else list(grpbycols)
        return pd.DataFrame(columns=empty_cols)
    return pd.concat(did_df_list)


In [None]:
# tmp = get_steps_intraday_feats(list(df_steps["device_id"].unique()), df_survey, ["device_id", "phase_wrapper", "tod_lbl"])


In [None]:
# display(tmp.head(10))

### Steps and steps levels - Get per-phase/week features

In [None]:
# per phase: step features left-joined with the activity-level features
df_steps_phase_feats = (
    df_steps_in_study
    .groupby(["device_id", "phase_wrapper"])
    .apply(lambda grp: add_features_mean(grp, steps_input_feats))
    .reset_index()
)
df_steps_levels_phase_feats = (
    df_steps_levels_in_study
    .groupby(["device_id", "phase_wrapper"])
    .apply(lambda grp: add_features_mean(grp, steps_levels_input_feats))
    .reset_index()
)
df_steps_phase_feats = df_steps_phase_feats.merge(
    df_steps_levels_phase_feats,
    how='left',
    on=['device_id', 'phase_wrapper'],
)
output_features(df_steps_phase_feats, "steps", "per_phase")


# per phase dow: same as above with the day-of-week label as an extra key
df_steps_phase_feats = (
    df_steps_in_study
    .groupby(["device_id", "phase_wrapper", "dow_lbl"])
    .apply(lambda grp: add_features_mean(grp, steps_input_feats))
    .reset_index()
)
df_steps_levels_phase_feats = (
    df_steps_levels_in_study
    .groupby(["device_id", "phase_wrapper", "dow_lbl"])
    .apply(lambda grp: add_features_mean(grp, steps_levels_input_feats))
    .reset_index()
)
df_steps_phase_feats = df_steps_phase_feats.merge(
    df_steps_levels_phase_feats,
    how='left',
    on=['device_id', 'phase_wrapper', "dow_lbl"],
)
output_features(df_steps_phase_feats, "steps", "per_phase_dow")



# # per phase tod -- INVALID
# df_steps_phase_feats = df_steps.groupby(["device_id", "phase_wrapper", "tod_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_phase_feats = df_steps_levels.groupby(["device_id", "phase_wrapper", "tod_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_phase_feats = pd.merge(df_steps_phase_feats, df_steps_levels_phase_feats,  how='left', left_on=['device_id','phase_wrapper', "tod_lbl"], right_on = ['device_id','phase_wrapper', "tod_lbl"])
# output_features(df_steps_phase_feats, "steps", "per_phase_tod")

# # per phase tod dow -- INVALID
# df_steps_phase_feats = df_steps.groupby(["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_phase_feats = df_steps_levels.groupby(["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_phase_feats = pd.merge(df_steps_phase_feats, df_steps_levels_phase_feats,  how='left', left_on=['device_id','phase_wrapper', "tod_lbl", "dow_lbl"], right_on = ['device_id','phase_wrapper', "tod_lbl", "dow_lbl"])
# output_features(df_steps_phase_feats, "steps", "per_phase_tod_dow")


In [None]:
## per phase tod (built from the intraday files, in-study rows only)
df_steps_phase_feats = get_steps_intraday_feats(
    did_list=list(df_steps_in_study["device_id"].unique()),
    df_survey=df_survey,
    grpbycols=["device_id", "phase_wrapper", "tod_lbl"],
    only_incld_during_study=True,
)
output_features(df_steps_phase_feats, "steps", "per_phase_tod")

## per phase tod dow
df_steps_phase_feats = get_steps_intraday_feats(
    did_list=list(df_steps_in_study["device_id"].unique()),
    df_survey=df_survey,
    grpbycols=["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"],
    only_incld_during_study=True,
)
output_features(df_steps_phase_feats, "steps", "per_phase_tod_dow")




In [None]:
# # biweekly
# df_steps_biweekly_feats = df_steps.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_biweekly_feats = df_steps_levels.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_biweekly_feats = pd.merge(df_steps_biweekly_feats, df_steps_levels_biweekly_feats,  how='left', left_on=['device_id','biweekly_lbl'], right_on = ['device_id','biweekly_lbl'])
# output_features(df_steps_biweekly_feats, "steps", "biweekly_lbl")

# # biweekly tod
# df_steps_biweekly_feats = df_steps.groupby(["device_id", "biweekly_lbl", "tod_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_biweekly_feats = df_steps_levels.groupby(["device_id", "biweekly_lbl", "tod_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_biweekly_feats = pd.merge(df_steps_biweekly_feats, df_steps_levels_biweekly_feats,  how='left', left_on=['device_id','biweekly_lbl', "tod_lbl"], right_on = ['device_id','biweekly_lbl', "tod_lbl"])
# output_features(df_steps_biweekly_feats, "steps", "per_biweekly_tod")

# # biweekly dow
# df_steps_biweekly_feats = df_steps.groupby(["device_id", "biweekly_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_biweekly_feats = df_steps_levels.groupby(["device_id", "biweekly_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_biweekly_feats = pd.merge(df_steps_biweekly_feats, df_steps_levels_biweekly_feats,  how='left', left_on=['device_id','biweekly_lbl', "dow_lbl"], right_on = ['device_id','biweekly_lbl', "dow_lbl"])
# output_features(df_steps_biweekly_feats, "steps", "per_biweekly_dow")

# # biweekly tod
# df_steps_biweekly_feats = df_steps.groupby(["device_id", "biweekly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_biweekly_feats = df_steps_levels.groupby(["device_id", "biweekly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_biweekly_feats = pd.merge(df_steps_biweekly_feats, df_steps_levels_biweekly_feats,  how='left', left_on=['device_id','biweekly_lbl', "tod_lbl", "dow_lbl"], right_on = ['device_id','biweekly_lbl', "tod_lbl", "dow_lbl"])
# output_features(df_steps_biweekly_feats, "steps", "per_biweekly_tod_dow")


In [None]:
# # monthly
# df_steps_monthly_feats = df_steps.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_monthly_feats = df_steps_levels.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_monthly_feats = pd.merge(df_steps_monthly_feats, df_steps_levels_monthly_feats,  how='left', left_on=['device_id','monthly_lbl'], right_on = ['device_id','monthly_lbl'])
# output_features(df_steps_monthly_feats, "steps", "monthly_lbl")

# # monthly tod
# df_steps_monthly_feats = df_steps.groupby(["device_id", "monthly_lbl", "tod_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_monthly_feats = df_steps_levels.groupby(["device_id", "monthly_lbl", "tod_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_monthly_feats = pd.merge(df_steps_monthly_feats, df_steps_levels_monthly_feats,  how='left', left_on=['device_id','monthly_lbl', "tod_lbl"], right_on = ['device_id','monthly_lbl', "tod_lbl"])
# output_features(df_steps_monthly_feats, "steps", "per_monthly_tod")

# # monthly dow
# df_steps_monthly_feats = df_steps.groupby(["device_id", "monthly_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_monthly_feats = df_steps_levels.groupby(["device_id", "monthly_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_monthly_feats = pd.merge(df_steps_monthly_feats, df_steps_levels_monthly_feats,  how='left', left_on=['device_id','monthly_lbl', "dow_lbl"], right_on = ['device_id','monthly_lbl', "dow_lbl"])
# output_features(df_steps_monthly_feats, "steps", "per_monthly_dow")

# # monthly tod
# df_steps_monthly_feats = df_steps.groupby(["device_id", "monthly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_monthly_feats = df_steps_levels.groupby(["device_id", "monthly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_monthly_feats = pd.merge(df_steps_monthly_feats, df_steps_levels_monthly_feats,  how='left', left_on=['device_id','monthly_lbl', "tod_lbl", "dow_lbl"], right_on = ['device_id','monthly_lbl', "tod_lbl", "dow_lbl"])
# output_features(df_steps_monthly_feats, "steps", "per_monthly_tod_dow")


In [None]:
# # weekly
# df_steps_weekly_feats = df_steps.groupby(["device_id", "wknum_wrt_wk12"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_weekly_feats = df_steps_levels.groupby(["device_id", "wknum_wrt_wk12"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_weekly_feats = pd.merge(df_steps_weekly_feats, df_steps_levels_weekly_feats,  how='left', left_on=['device_id','wknum_wrt_wk12'], right_on = ['device_id','wknum_wrt_wk12'])
# output_features(df_steps_weekly_feats, "steps", "weekly")

# # weekly tod
# df_steps_weekly_feats = df_steps.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_weekly_feats = df_steps_levels.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_weekly_feats = pd.merge(df_steps_weekly_feats, df_steps_levels_weekly_feats,  how='left', left_on=['device_id','wknum_wrt_wk12', "tod_lbl"], right_on = ['device_id','wknum_wrt_wk12', "tod_lbl"])
# output_features(df_steps_weekly_feats, "steps", "weekly_tod")

# # weekly dow
# df_steps_weekly_feats = df_steps.groupby(["device_id", "wknum_wrt_wk12", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_weekly_feats = df_steps_levels.groupby(["device_id", "wknum_wrt_wk12", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_weekly_feats = pd.merge(df_steps_weekly_feats, df_steps_levels_weekly_feats,  how='left', left_on=['device_id','wknum_wrt_wk12', "dow_lbl"], right_on = ['device_id','wknum_wrt_wk12', "dow_lbl"])
# output_features(df_steps_weekly_feats, "steps", "weekly_dow")

# # weekly tod
# df_steps_weekly_feats = df_steps.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()
# df_steps_levels_weekly_feats = df_steps_levels.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()
# df_steps_weekly_feats = pd.merge(df_steps_weekly_feats, df_steps_levels_weekly_feats,  how='left', left_on=['device_id','wknum_wrt_wk12', "tod_lbl", "dow_lbl"], right_on = ['device_id','wknum_wrt_wk12', "tod_lbl", "dow_lbl"])
# output_features(df_steps_weekly_feats, "steps", "weekly_tod_dow")


In [None]:
# display(df_steps_phase_feats.head(2))
# # display(df_steps_levels_phase_feats.head(2))

# display(df_steps_phase_feats.isnull().sum(axis = 0))

# display(df_steps_phase_feats[(df_steps_phase_feats["veryActiveMinutes"].isnull())])


# ## OUTPUT
# output_features(df_steps_phase_feats, "steps", "per_phase")

In [None]:
# df_steps_biweekly_feats = df_steps.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()

# df_steps_levels_biweekly_feats = df_steps_levels.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()


# df_steps_biweekly_feats = pd.merge(df_steps_biweekly_feats, df_steps_levels_biweekly_feats,  how='left', left_on=['device_id','phase_wrapper'], right_on = ['device_id','phase_wrapper'])



# df_steps_monthly_feats = df_steps.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_mean(x, steps_input_feats))).reset_index()

# df_steps_levels_monthly_feats = df_steps_levels.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_mean(x, steps_levels_input_feats))).reset_index()


# df_steps_monthly_feats = pd.merge(df_steps_monthly_feats, df_steps_levels_monthly_feats,  how='left', left_on=['device_id','phase_wrapper'], right_on = ['device_id','phase_wrapper'])



In [None]:
# display(df_steps_biweekly_feats.head(2))

# display(df_steps_biweekly_feats.isnull().sum(axis = 0))

# display(df_steps_biweekly_feats[(df_steps_biweekly_feats["veryActiveMinutes"].isnull())])


# display(df_steps_monthly_feats.head(2))

# display(df_steps_monthly_feats.isnull().sum(axis = 0))

# display(df_steps_monthly_feats[(df_steps_monthly_feats["veryActiveMinutes"].isnull())])



# ## OUTPUT
# output_features(df_steps_biweekly_feats, "steps", "biweekly")
# output_features(df_steps_monthly_feats, "steps", "monthly")

# Loc - Clean and add phase

In [9]:
# Location samples: convert the raw timestamp with sensor_time_to_local, derive
# the calendar date, and order each device's rows chronologically.
df_loc = (
    pd.read_csv(LOC_PATH)
    .assign(
        timestamp_dt=lambda loc: loc["timestamp"].apply(sensor_time_to_local),
        date_dt=lambda loc: loc["timestamp_dt"].dt.date,
    )
    .sort_values(by=["device_id", "timestamp_dt"])
)

# ## Add PHASE
# df_loc =  add_phase_label_wrapper(df_loc, "timestamp_dt")  

# # Add week label
# df_loc =  add_week_label(df_loc, "date_dt", df_survey) 


## ADD ALL LABELS
df_loc = add_all_epoch_labels(df_loc, df_survey)
# Rows flagged is_during_study only -- helps exclude incomplete days.
df_loc_in_study = df_loc.loc[df_loc["is_during_study"].eq(True)]


Processing did counts (tweak)...
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, Processing did counts...
Processing did counts...


In [10]:
# display(df_loc.head(20))
print(df_loc.columns)

# Distinct values present in the semantic-location and stationary columns.
display(df_loc["semantic_loc"].unique(), df_loc["stationary"].unique())


Index(['datetime_EST', '_id', 'timestamp', 'device_id', 'latitude',
       'longitude', 'stationary', 'time_label', 'location_label',
       'semantic_loc', 'timestamp_dt', 'date_dt', 'phase', 'phase_wrapper',
       'biweekly_lbl', 'monthly_lbl', 'biweekly_date', 'monthly_date', 'dow',
       'dow_lbl', 'hour_dt', 'tod_lbl', 'wknum_wrt_wk12', 'is_during_study'],
      dtype='object')


array(['other', 'transit', 'home', 'work'], dtype=object)

array([1, 0])

### Loc - feature function

In [11]:
# Device ids already announced by the progress counter in add_features_loc.
LOC_DIDS = []
def add_features_loc(g):
    """Compute the location feature set for one group of location rows.

    Side effect: the first time a device id is encountered it is appended to
    the module-level ``LOC_DIDS`` list and the new list length is printed
    (no newline), which acts as a coarse progress indicator.

    Parameters
    ----------
    g : pandas.DataFrame
        One group of location rows. The device id is taken from its
        ``device_id`` column when that column is present, otherwise from the
        first element of the group key (``g.name[0]``).

    Returns
    -------
    pandas.Series
        One entry per feature, in the order the features are computed below.
    """
    did = g["device_id"].iloc[0] if "device_id" in g.columns else g.name[0]
    if did not in LOC_DIDS:
        LOC_DIDS.append(did)
        print ("{0}, ".format(len(LOC_DIDS)), end='')

    # The extractors are called in the same order as the resulting Series keys.
    feats_dict = {
        "number_of_clusters": number_of_clusters(g),
        "home_stay_time_percent": home_stay_time_percent(g),
        "radius_of_gyration": radius_of_gyration(g),
    }

    # A single call yields both the std and the mean of the cluster stay lengths.
    stay_stats = len_stay_at_clusters_in_minutes(g, SAMPLE_RATE=5)
    feats_dict["std_len_stay_at_clusters_in_minutes"] = stay_stats["std_len_stay_at_clusters_in_minutes"]
    feats_dict["mean_len_stay_at_clusters_in_minutes"] = stay_stats["mean_len_stay_at_clusters_in_minutes"]

    feats_dict.update({
        "pct_time_at_top_cluster_1": pct_time_at_top_cluster_x(g, 1),
        "pct_time_at_top_cluster_2": pct_time_at_top_cluster_x(g, 2),
        "pct_time_at_top_cluster_3": pct_time_at_top_cluster_x(g, 3),
        "circadian_movement": circadian_movement(g),
        "location_entropy": location_entropy(g),
        "location_entropy_normalized": location_entropy_normalized(g),
        "location_variance": location_variance(g),
        "location_variance_log": location_variance_log(g),
        # "number_location_transitions": number_location_transitions(g) -- no need
        # sample rate is 5, but we can include distances that are 10 min apart.
        "total_distance": travel_distance_meters(g, SAMPLE_RATE=10),
        "moving_time_percent": moving_time_percent(g),
        "outliers_time_percent": outliers_time_percent(g),
    })
    return pd.Series(feats_dict)


def mean_loc_feats_across_days(df_grp):
    """Average the per-day location features of one group.

    ``add_features_loc`` is evaluated separately for every ``date_dt`` in
    ``df_grp``; the resulting per-day rows are then averaged column-wise.

    Parameters
    ----------
    df_grp : pandas.DataFrame
        Location rows of a single group; must contain a ``date_dt`` column.

    Returns
    -------
    pandas.DataFrame
        A one-row frame holding the mean of each per-day feature column.
    """
    per_day_feats = df_grp.groupby("date_dt", as_index=False).apply(add_features_loc)
    day_means = per_day_feats.reset_index(drop=True).mean()
    return day_means.to_frame().T



### Loc - Get per-phase/week features


In [12]:
# tmp = df_loc.iloc[0:50000].groupby(["device_id", "phase_wrapper"]).apply((lambda x: add_features_loc(x))).reset_index()
# display(tmp)

In [13]:
## per phase - only in-study (complete) days are used so that the mean is not skewed.
## Features are averaged across days because people may have a different number
## of days in each phase.
## In every stanza the "level_N" column created by reset_index() is dropped
## (presumably the positional index of the one-row frame returned per group).
df_loc_phase_feats = (
    df_loc_in_study
    .groupby(["device_id", "phase_wrapper"])
    .apply(mean_loc_feats_across_days)
    .reset_index()
    .drop(columns=["level_2"])
)
output_features(df_loc_phase_feats, "loc", "per_phase")

## per-phase tod
df_loc_phase_feats = (
    df_loc_in_study
    .groupby(["device_id", "phase_wrapper", "tod_lbl"])
    .apply(mean_loc_feats_across_days)
    .reset_index()
    .drop(columns=["level_3"])
)
output_features(df_loc_phase_feats, "loc", "per_phase_tod")

## per-phase dow
df_loc_phase_feats = (
    df_loc_in_study
    .groupby(["device_id", "phase_wrapper", "dow_lbl"])
    .apply(mean_loc_feats_across_days)
    .reset_index()
    .drop(columns=["level_3"])
)
output_features(df_loc_phase_feats, "loc", "per_phase_dow")

## per-phase tod, dow
df_loc_phase_feats = (
    df_loc_in_study
    .groupby(["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"])
    .apply(mean_loc_feats_across_days)
    .reset_index()
    .drop(columns=["level_4"])
)
output_features(df_loc_phase_feats, "loc", "per_phase_tod_dow")





# # per-phase
# df_loc_phase_feats = df_loc.groupby(["device_id", "phase_wrapper"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_phase_feats, "loc", "per_phase")
# # per-phase tod
# df_loc_phase_feats = df_loc.groupby(["device_id", "phase_wrapper", "tod_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_phase_feats, "loc", "per_phase_tod")
# # per-phase dow
# df_loc_phase_feats = df_loc.groupby(["device_id", "phase_wrapper", "dow_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_phase_feats, "loc", "per_phase_dow")
# # per-phase tod, dow
# df_loc_phase_feats = df_loc.groupby(["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_phase_feats, "loc", "per_phase_tod_dow")



1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 

In [None]:
# # biweekly
# df_loc_biweekly_feats = df_loc.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_biweekly_feats, "loc", "biweekly")
# # biweekly tod
# df_loc_biweekly_feats = df_loc.groupby(["device_id", "biweekly_lbl", "tod_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_biweekly_feats, "loc", "biweekly_tod")
# # biweekly dow
# df_loc_biweekly_feats = df_loc.groupby(["device_id", "biweekly_lbl", "dow_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_biweekly_feats, "loc", "biweekly_dow")
# # biweekly tod, dow
# df_loc_biweekly_feats = df_loc.groupby(["device_id", "biweekly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_biweekly_feats, "loc", "biweekly_tod_dow")



In [None]:
# # monthly
# df_loc_monthly_feats = df_loc.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_monthly_feats, "loc", "monthly")
# # monthly tod
# df_loc_monthly_feats = df_loc.groupby(["device_id", "monthly_lbl", "tod_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_monthly_feats, "loc", "monthly_tod")
# # monthly dow
# df_loc_monthly_feats = df_loc.groupby(["device_id", "monthly_lbl", "dow_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_monthly_feats, "loc", "monthly_dow")
# # monthly tod, dow
# df_loc_monthly_feats = df_loc.groupby(["device_id", "monthly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_monthly_feats, "loc", "monthly_tod_dow")


# df_loc_biweekly_feats = df_loc.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# df_loc_monthly_feats = df_loc.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()



In [None]:
# display(df_loc_biweekly_feats.head(2))
# display(df_loc_monthly_feats.head(2))

# # OUTPUT
# output_features(df_loc_biweekly_feats, "loc", "biweekly")
# output_features(df_loc_monthly_feats, "loc", "monthly")


# # weekly
# df_loc_weekly_feats = df_loc.groupby(["device_id", "wknum_wrt_wk12"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_weekly_feats, "loc", "weekly")
# # weekly tod
# df_loc_weekly_feats = df_loc.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_weekly_feats, "loc", "weekly_tod")
# # weekly dow
# df_loc_weekly_feats = df_loc.groupby(["device_id", "wknum_wrt_wk12", "dow_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_weekly_feats, "loc", "weekly_dow")
# # weekly tod, dow
# df_loc_weekly_feats = df_loc.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_loc(x))).reset_index()
# output_features(df_loc_weekly_feats, "loc", "weekly_tod_dow")


# Wifi - Clean and add phase



In [None]:
# Wifi samples: convert the raw timestamp with sensor_time_to_local, derive the
# calendar date, and order each device's rows chronologically.
df_wifi = (
    pd.read_csv(WIFI_PATH)
    .assign(
        timestamp_dt=lambda wf: wf["timestamp"].apply(sensor_time_to_local),
        date_dt=lambda wf: wf["timestamp_dt"].dt.date,
    )
    .sort_values(by=["device_id", "timestamp_dt"])
)

# ## Add PHASE
# df_wifi =  add_phase_label_wrapper(df_wifi, "timestamp_dt")  

# # Add week label
# df_wifi =  add_week_label(df_wifi, "date_dt", df_survey)  

## ADD ALL LABELS
df_wifi = add_all_epoch_labels(df_wifi, df_survey)
# Rows flagged is_during_study only -- helps exclude incomplete days.
df_wifi_in_study = df_wifi.loc[df_wifi["is_during_study"].eq(True)]



In [None]:
# Show the first two rows and list the available columns.
display(df_wifi.head(n=2))
print(df_wifi.columns)

### Wifi - feature function

In [None]:
def add_features_wifi(g):
    """Compute the wifi feature set for one group of wifi rows.

    Parameters
    ----------
    g : pandas.DataFrame
        One group of wifi rows, passed as-is to ``number_unique_wifi_hotspots``.

    Returns
    -------
    pandas.Series
        A single entry keyed ``"number_unique_wifi_hotspots"``.
    """
    return pd.Series({"number_unique_wifi_hotspots": number_unique_wifi_hotspots(g)})

def mean_wifi_feats_across_days(df_grp):
    """Average the per-day wifi features of one group.

    ``add_features_wifi`` is evaluated separately for every ``date_dt`` in
    ``df_grp``; the resulting per-day rows are then averaged column-wise.

    Parameters
    ----------
    df_grp : pandas.DataFrame
        Wifi rows of a single group; must contain a ``date_dt`` column.

    Returns
    -------
    pandas.DataFrame
        A one-row frame holding the mean of each per-day feature column.
    """
    per_day_feats = df_grp.groupby("date_dt", as_index=False).apply(add_features_wifi)
    day_means = per_day_feats.reset_index(drop=True).mean()
    return day_means.to_frame().T


### Wifi - Get per-phase/week features


In [None]:
# per phase - use only complete days so that mean is not skewed
# we are averaging across days as people may have different number of days in each phase
# df_wifi_phase_feats = df_wifi_in_study.groupby(["device_id", "phase_wrapper"]).apply((lambda x: mean_wifi_feats_across_days(x))).reset_index()
# df_wifi_phase_feats = df_wifi_phase_feats.drop(columns=["level_2"])
# output_features(df_wifi_phase_feats, "wifi", "per_phase")

# per-phase tod
# df_wifi_phase_feats = df_wifi_in_study.groupby(["device_id", "phase_wrapper", "tod_lbl"]).apply((lambda x: mean_wifi_feats_across_days(x))).reset_index()
# df_wifi_phase_feats = df_wifi_phase_feats.drop(columns=["level_3"])
# output_features(df_wifi_phase_feats, "wifi", "per_phase_tod")

# per-phase dow
# df_wifi_phase_feats = df_wifi_in_study.groupby(["device_id", "phase_wrapper", "dow_lbl"]).apply((lambda x: mean_wifi_feats_across_days(x))).reset_index()
# df_wifi_phase_feats = df_wifi_phase_feats.drop(columns=["level_3"])
# output_features(df_wifi_phase_feats, "wifi", "per_phase_dow")

# per-phase tod, dow
# df_wifi_phase_feats = df_wifi_in_study.groupby(["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"]).apply((lambda x: mean_wifi_feats_across_days(x))).reset_index()
# df_wifi_phase_feats = df_wifi_phase_feats.drop(columns=["level_4"])
# output_features(df_wifi_phase_feats, "wifi", "per_phase_tod_dow")



# # per-phase
# df_wifi_phase_feats = df_wifi.groupby(["device_id", "phase_wrapper"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_phase_feats, "wifi", "per_phase")
# # per-phase tod
# df_wifi_phase_feats = df_wifi.groupby(["device_id", "phase_wrapper", "tod_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_phase_feats, "wifi", "per_phase_tod")
# # per-phase dow
# df_wifi_phase_feats = df_wifi.groupby(["device_id", "phase_wrapper", "dow_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_phase_feats, "wifi", "per_phase_dow")
# # per-phase tod, dow
# df_wifi_phase_feats = df_wifi.groupby(["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_phase_feats, "wifi", "per_phase_tod_dow")


In [None]:
# # biweekly
# df_wifi_biweekly_feats = df_wifi.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_biweekly_feats, "wifi", "biweekly")
# # biweekly tod
# df_wifi_biweekly_feats = df_wifi.groupby(["device_id", "biweekly_lbl", "tod_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_biweekly_feats, "wifi", "biweekly_tod")
# # biweekly dow
# df_wifi_biweekly_feats = df_wifi.groupby(["device_id", "biweekly_lbl", "dow_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_biweekly_feats, "wifi", "biweekly_dow")
# # biweekly tod, dow
# df_wifi_biweekly_feats = df_wifi.groupby(["device_id", "biweekly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_biweekly_feats, "wifi", "biweekly_tod_dow")


In [None]:
# df_wifi_biweekly_feats = df_wifi.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# df_wifi_monthly_feats = df_wifi.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()

# # monthly
# df_wifi_monthly_feats = df_wifi.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_monthly_feats, "wifi", "monthly")
# # monthly tod
# df_wifi_monthly_feats = df_wifi.groupby(["device_id", "monthly_lbl", "tod_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_monthly_feats, "wifi", "monthly_tod")
# # monthly dow
# df_wifi_monthly_feats = df_wifi.groupby(["device_id", "monthly_lbl", "dow_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_monthly_feats, "wifi", "monthly_dow")
# # monthly tod, dow
# df_wifi_monthly_feats = df_wifi.groupby(["device_id", "monthly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_monthly_feats, "wifi", "monthly_tod_dow")


In [None]:
# display(df_wifi_biweekly_feats.head(2))
# display(df_wifi_monthly_feats.head(2))

# # # OUTPUT
# # output_features(df_wifi_biweekly_feats, "wifi", "biweekly")
# # output_features(df_wifi_monthly_feats, "wifi", "monthly")


# # weekly
# df_wifi_weekly_feats = df_wifi.groupby(["device_id", "wknum_wrt_wk12"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_weekly_feats, "wifi", "weekly")
# # weekly tod
# df_wifi_weekly_feats = df_wifi.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_weekly_feats, "wifi", "weekly_tod")
# # weekly dow
# df_wifi_weekly_feats = df_wifi.groupby(["device_id", "wknum_wrt_wk12", "dow_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_weekly_feats, "wifi", "weekly_dow")
# # weekly tod, dow
# df_wifi_weekly_feats = df_wifi.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_wifi(x))).reset_index()
# output_features(df_wifi_weekly_feats, "wifi", "weekly_tod_dow")


# Screen - Clean and add phase


In [None]:
# Screen events: convert the raw timestamp with sensor_time_to_local, derive the
# calendar date, and order each device's rows chronologically.
df_scr = (
    pd.read_csv(SCR_PATH)
    .assign(
        timestamp_dt=lambda sc: sc["timestamp"].apply(sensor_time_to_local),
        date_dt=lambda sc: sc["timestamp_dt"].dt.date,
    )
    .sort_values(by=["device_id", "timestamp_dt"])
)

# ## Add PHASE
# df_scr =  add_phase_label_wrapper(df_scr, "timestamp_dt")  

# # Add week label
# df_scr =  add_week_label(df_scr, "date_dt", df_survey)  

## ADD ALL LABELS
df_scr = add_all_epoch_labels(df_scr, df_survey)
# Rows flagged is_during_study only -- helps exclude incomplete days.
df_scr_in_study = df_scr.loc[df_scr["is_during_study"].eq(True)]



In [None]:
# Show the first two rows and list the available columns.
display(df_scr.head(n=2))
print(df_scr.columns)


### Screen - feature function


In [None]:
def add_features_scr(g):
    """Compute the enabled screen features for one group of screen rows.

    Parameters
    ----------
    g : object
        Passed unchanged to every feature extractor.
        NOTE(review): presumably a DataFrame of screen events for one
        group -- confirm against the extractor signatures.

    Returns
    -------
    pd.Series
        One entry per feature, keyed by feature name, in the order
        number_of_unlocks, mean_unlocks_per_minute,
        interaction_time_minutes, median_interaction_time_per_use_secs.
    """
    # median_unlocks_per_minute and mean_interaction_time_per_use_secs are
    # deliberately not computed; they were disabled in this feature set.
    return pd.Series({
        "number_of_unlocks": number_of_unlocks(g),
        "mean_unlocks_per_minute": mean_unlocks_per_minute(g),
        "interaction_time_minutes": interaction_time_minutes(g),
        "median_interaction_time_per_use_secs": median_interaction_time_per_use_secs(g),
    })

def mean_scr_feats_across_days(df_grp):
    """Average the daily screen features over all dates present in a group.

    Parameters
    ----------
    df_grp : pd.DataFrame
        Screen rows carrying a "date_dt" column; they are split by that
        column and add_features_scr is evaluated once per date.

    Returns
    -------
    pd.DataFrame
        A single-row frame holding the column-wise mean of the per-day
        feature table.
    """
    # One row of features per calendar date, re-indexed 0..n-1.
    daily_feats = df_grp.groupby("date_dt", as_index=False).apply(add_features_scr)
    daily_feats = daily_feats.reset_index(drop=True)
    # Collapse the days into one mean row (Series -> one-row DataFrame).
    return daily_feats.mean().to_frame().transpose()


### Scr - Get per-phase features


In [None]:
## Per-phase screen features, computed on in-study rows only so that incomplete
## days do not skew the mean. Features are averaged across days because people
## may have a different number of days in each phase.
## Each entry is (output label, group-by columns).
scr_phase_groupings = [
    ("per_phase", ["device_id", "phase_wrapper"]),
    ("per_phase_tod", ["device_id", "phase_wrapper", "tod_lbl"]),
    ("per_phase_dow", ["device_id", "phase_wrapper", "dow_lbl"]),
    ("per_phase_tod_dow", ["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"]),
]
for scr_feat_label, scr_grp_cols in scr_phase_groupings:
    df_scr_phase_feats = df_scr_in_study.groupby(scr_grp_cols).apply(mean_scr_feats_across_days).reset_index()
    # mean_scr_feats_across_days returns a one-row frame, so reset_index leaves an
    # extra unnamed index level called "level_<number of group keys>"; drop it.
    df_scr_phase_feats = df_scr_phase_feats.drop(columns=["level_{0}".format(len(scr_grp_cols))])
    output_features(df_scr_phase_feats, "scr", scr_feat_label)




# # per-phase
# df_scr_phase_feats = df_scr.groupby(["device_id", "phase_wrapper"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_phase_feats, "scr", "per_phase")
# # per-phase tod
# df_scr_phase_feats = df_scr.groupby(["device_id", "phase_wrapper", "tod_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_phase_feats, "scr", "per_phase_tod")
# # per-phase dow
# df_scr_phase_feats = df_scr.groupby(["device_id", "phase_wrapper", "dow_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_phase_feats, "scr", "per_phase_dow")
# # per-phase tod, dow
# df_scr_phase_feats = df_scr.groupby(["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_phase_feats, "scr", "per_phase_tod_dow")



In [None]:
# # df_scr_biweekly_feats = df_scr.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# # df_scr_monthly_feats = df_scr.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()

# # biweekly
# df_scr_biweekly_feats = df_scr.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_biweekly_feats, "scr", "biweekly")
# # biweekly tod
# df_scr_biweekly_feats = df_scr.groupby(["device_id", "biweekly_lbl", "tod_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_biweekly_feats, "scr", "biweekly_tod")
# # biweekly dow
# df_scr_biweekly_feats = df_scr.groupby(["device_id", "biweekly_lbl", "dow_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_biweekly_feats, "scr", "biweekly_dow")
# # biweekly tod, dow
# df_scr_biweekly_feats = df_scr.groupby(["device_id", "biweekly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_biweekly_feats, "scr", "biweekly_tod_dow")



In [None]:
# # monthly
# df_scr_monthly_feats = df_scr.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_monthly_feats, "scr", "monthly")
# # monthly tod
# df_scr_monthly_feats = df_scr.groupby(["device_id", "monthly_lbl", "tod_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_monthly_feats, "scr", "monthly_tod")
# # monthly dow
# df_scr_monthly_feats = df_scr.groupby(["device_id", "monthly_lbl", "dow_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_monthly_feats, "scr", "monthly_dow")
# # monthly tod, dow
# df_scr_monthly_feats = df_scr.groupby(["device_id", "monthly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_monthly_feats, "scr", "monthly_tod_dow")



In [None]:
# # weekly
# df_scr_weekly_feats = df_scr.groupby(["device_id", "wknum_wrt_wk12"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_weekly_feats, "scr", "weekly")
# # weekly tod
# df_scr_weekly_feats = df_scr.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_weekly_feats, "scr", "weekly_tod")
# # weekly dow
# df_scr_weekly_feats = df_scr.groupby(["device_id", "wknum_wrt_wk12", "dow_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_weekly_feats, "scr", "weekly_dow")
# # weekly tod, dow
# df_scr_weekly_feats = df_scr.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_scr(x))).reset_index()
# output_features(df_scr_weekly_feats, "scr", "weekly_tod_dow")


In [None]:
# display(df_scr_biweekly_feats.head(2))
# display(df_scr_biweekly_feats.isnull().sum(axis = 0))
# display(df_scr_monthly_feats.head(2))
# display(df_scr_monthly_feats.isnull().sum(axis = 0))

# # OUTPUT
# output_features(df_scr_biweekly_feats, "scr", "biweekly")
# output_features(df_scr_monthly_feats, "scr", "monthly")


# HR - Clean and add phase


In [None]:
# Load the daily heart-rate table, parse its "date" column, map each Fitbit id
# to the study device id, and order the rows per device by date.
df_hr = (
    pd.read_csv(HR_PATH)
    .assign(timestamp_dt=lambda d: pd.to_datetime(d["date"], format="%Y-%m-%d"))
    .assign(date_dt=lambda d: d["timestamp_dt"].dt.date)
    # Dict lookup (not .map) so an unmapped fitbit_id still raises KeyError.
    .assign(device_id=lambda d: d["fitbit_id"].apply(lambda fid: FB_MAPPING_DICT[fid]))
    .sort_values(by=["device_id", "date_dt"])
)

# Daily HR columns consumed by the (currently disabled) add_features_mean calls.
hr_input_feats = ["hr_val", "out_of_range", "fat_burn", "cardio", "peak"]

# The older per-label helpers (add_phase_label_wrapper, add_week_label) are no
# longer called here; add_all_epoch_labels attaches all labels in one pass.
df_hr = add_all_epoch_labels(df_hr, df_survey)

# Keep only rows flagged as inside the study window (helps exclude incomplete days).
df_hr_in_study = df_hr.loc[df_hr["is_during_study"] == True]


In [None]:
# Sanity check: preview the first two rows of df_hr and list its columns.
display(df_hr.head(2))
print(df_hr.columns)


### HR - intraday features


In [None]:

def get_hr_feats_per_day(df_grp):
    """Summarise one group of intraday heart-rate rows.

    Parameters
    ----------
    df_grp : pd.DataFrame
        Rows with a "value" column (heart-rate reading) and a
        "value_hr_zone" column (zone name as a string).

    Returns
    -------
    pd.Series
        Row counts for the zones "Out of Range", "Fat Burn", "Cardio" and
        "Peak" (0 when a zone is absent), followed by "mean_hr" (mean of
        all readings) and "mean_hr_out_of_range" (mean of the readings in
        the "Out of Range" zone; NaN when there are none).
        NOTE(review): the counts are presumably minutes, i.e. one row per
        minute -- confirm against the intraday export.
    """
    zone_counts = df_grp["value_hr_zone"].value_counts().to_dict()
    # Zones missing from this group must still appear, with a count of 0.
    feats = {
        zone: zone_counts.get(zone, 0)
        for zone in ("Out of Range", "Fat Burn", "Cardio", "Peak")
    }
    feats["mean_hr"] = df_grp["value"].mean()
    in_out_of_range = df_grp["value_hr_zone"] == "Out of Range"
    feats["mean_hr_out_of_range"] = df_grp.loc[in_out_of_range, "value"].mean()
    return pd.Series(feats)



def get_hr_intraday_feats_for_grp(df_grp):
    """Average the per-day heart-rate features over every date in a group.

    Parameters
    ----------
    df_grp : pd.DataFrame
        Intraday heart-rate rows carrying a "date_dt" column plus the
        columns get_hr_feats_per_day reads.

    Returns
    -------
    pd.Series
        Mean across days of "mean_hr", "mean_hr_out_of_range" and the
        four zone counts, keyed by those names in that order.
    """
    # One row of features per calendar date.
    daily_feats = df_grp.groupby("date_dt").apply(get_hr_feats_per_day)
    feature_names = ("mean_hr", "mean_hr_out_of_range", "Out of Range", "Fat Burn", "Cardio", "Peak")
    return pd.Series({name: daily_feats[name].mean() for name in feature_names})
    

def get_hr_intraday_feats_for_did(did, df_survey, grpbycols, only_incld_during_study):
    """Extract grouped intraday heart-rate features for a single device.

    Parameters
    ----------
    did : device id; translated to a Fitbit id through FB_MAPPING_DICT_INV.
    df_survey : survey data forwarded to add_all_epoch_labels.
    grpbycols : list of column names to group the labelled rows by.
    only_incld_during_study : bool
        When truthy, rows whose "is_during_study" is not True are dropped
        before grouping (helps exclude incomplete days).

    Returns
    -------
    pd.DataFrame
        One row per group, with the group keys as ordinary columns and
        the features from get_hr_intraday_feats_for_grp.
    """
    ## (1) did to fitbit id, then load that participant's parsed csv
    intraday_csv = HR_INTRADAY_PATH_LIKE.format(FB_MAPPING_DICT_INV[did])
    intraday = pd.read_csv(intraday_csv).assign(
        device_id=did,
        timestamp_dt=lambda d: pd.to_datetime(d["datetime"], format="%Y-%m-%d %H:%M:%S"),
        date_dt=lambda d: d["timestamp_dt"].dt.date,
    )
    ## (2) add phase, tod, and dow lbls
    labelled = add_all_epoch_labels(intraday, df_survey, print_out=False)
    if only_incld_during_study:
        labelled = labelled[labelled["is_during_study"] == True]
    ## (3) extract features per requested group and flatten the group index
    return labelled.groupby(grpbycols).apply(get_hr_intraday_feats_for_grp).reset_index()
    
    
def get_hr_intraday_feats(did_list, df_survey, grpbycols, only_incld_during_study):
    """Extract grouped intraday heart-rate features for many devices.

    Parameters
    ----------
    did_list : iterable of device ids, processed in order.
    df_survey, grpbycols, only_incld_during_study :
        Forwarded unchanged to get_hr_intraday_feats_for_did.

    Returns
    -------
    pd.DataFrame
        The per-device feature frames concatenated in processing order
        (each device's own row index is kept as-is).
    """
    print ("Processing did cnts intraday...")
    per_device_frames = []
    for position, did in enumerate(did_list, start=1):
        # Running 1-based counter on a single line as a progress indicator.
        print ("{0}, ".format(position), end='')
        per_device_frames.append(
            get_hr_intraday_feats_for_did(did, df_survey, grpbycols, only_incld_during_study)
        )
    return pd.concat(per_device_frames)
    
    

In [None]:
# tmp = get_hr_intraday_feats(list(df_hr["device_id"].unique()), df_survey, ["device_id", "phase_wrapper"])

# display (tmp.head(5))


### HR - Get per-phase/week features
- We'll use ONLY intraday features. Ignore daily features completely!

In [None]:
# df_hr_phase_feats = df_hr.groupby(["device_id", "phase_wrapper"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# display(df_hr_phase_feats.head(2))
# display(df_hr_phase_feats.isnull().sum(axis = 0))
# # OUTPUT
# output_features(df_hr_phase_feats, "hr", "per_phase")


# # per-phase
# df_hr_phase_feats = df_hr.groupby(["device_id", "phase_wrapper"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_phase_feats, "hr", "per_phase")
# # per-phase tod
# df_hr_phase_feats = df_hr.groupby(["device_id", "phase_wrapper", "tod_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_phase_feats, "hr", "per_phase_tod")
# # per-phase dow
# df_hr_phase_feats = df_hr.groupby(["device_id", "phase_wrapper", "dow_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_phase_feats, "hr", "per_phase_dow")
# # per-phase tod, dow
# df_hr_phase_feats = df_hr.groupby(["device_id", "phase_wrapper", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_phase_feats, "hr", "per_phase_tod_dow")

# Per-phase HR features from the intraday data, restricted to in-study rows.
# Every variant groups by device and phase; the extra columns add the
# time-of-day and/or day-of-week split. Each entry is (output label, extra columns).
hr_in_study_dids = list(df_hr_in_study["device_id"].unique())
hr_phase_groupings = [
    ("per_phase", []),
    ("per_phase_tod", ["tod_lbl"]),
    ("per_phase_dow", ["dow_lbl"]),
    ("per_phase_tod_dow", ["tod_lbl", "dow_lbl"]),
]
for hr_feat_label, hr_extra_cols in hr_phase_groupings:
    df_hr_phase_feats = get_hr_intraday_feats(
        hr_in_study_dids,
        df_survey,
        ["device_id", "phase_wrapper"] + hr_extra_cols,
        only_incld_during_study=True,
    )
    output_features(df_hr_phase_feats, "hr", hr_feat_label)



In [None]:
# df_hr_biweekly_feats = df_hr.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# df_hr_monthly_feats = df_hr.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()

# # biweekly
# df_hr_biweekly_feats = df_hr.groupby(["device_id", "biweekly_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_biweekly_feats, "hr", "biweekly")
# # biweekly tod
# df_hr_biweekly_feats = df_hr.groupby(["device_id", "biweekly_lbl", "tod_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_biweekly_feats, "hr", "biweekly_tod")
# # biweekly dow
# df_hr_biweekly_feats = df_hr.groupby(["device_id", "biweekly_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_biweekly_feats, "hr", "biweekly_dow")
# # biweekly tod, dow
# df_hr_biweekly_feats = df_hr.groupby(["device_id", "biweekly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_biweekly_feats, "hr", "biweekly_tod_dow")

# # biweekly
# df_hr_biweekly_feats = get_hr_intraday_feats(list(df_hr["device_id"].unique()), df_survey, ["device_id", "biweekly_lbl"])
# output_features(df_hr_biweekly_feats, "hr", "biweekly")
# # biweekly tod
# df_hr_biweekly_feats = get_hr_intraday_feats(list(df_hr["device_id"].unique()), df_survey, ["device_id", "biweekly_lbl", "tod_lbl"])
# output_features(df_hr_biweekly_feats, "hr", "biweekly_tod")
# # biweekly dow
# df_hr_biweekly_feats = get_hr_intraday_feats(list(df_hr["device_id"].unique()), df_survey, ["device_id", "biweekly_lbl", "dow_lbl"])
# output_features(df_hr_biweekly_feats, "hr", "biweekly_dow")
# # biweekly tod dow
# df_hr_biweekly_feats = get_hr_intraday_feats(list(df_hr["device_id"].unique()), df_survey, ["device_id", "biweekly_lbl", "tod_lbl", "dow_lbl"])
# output_features(df_hr_biweekly_feats, "hr", "biweekly_tod_dow")



In [None]:
# # monthly
# df_hr_monthly_feats = df_hr.groupby(["device_id", "monthly_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_monthly_feats, "hr", "monthly")
# # monthly tod
# df_hr_monthly_feats = df_hr.groupby(["device_id", "monthly_lbl", "tod_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_monthly_feats, "hr", "monthly_tod")
# # monthly dow
# df_hr_monthly_feats = df_hr.groupby(["device_id", "monthly_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_monthly_feats, "hr", "monthly_dow")
# # monthly tod, dow
# df_hr_monthly_feats = df_hr.groupby(["device_id", "monthly_lbl", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_monthly_feats, "hr", "monthly_tod_dow")

# # monthly
# df_hr_monthly_feats = get_hr_intraday_feats(list(df_hr["device_id"].unique()), df_survey, ["device_id", "monthly_lbl"])
# output_features(df_hr_monthly_feats, "hr", "monthly")
# # monthly tod
# df_hr_monthly_feats = get_hr_intraday_feats(list(df_hr["device_id"].unique()), df_survey, ["device_id", "monthly_lbl", "tod_lbl"])
# output_features(df_hr_monthly_feats, "hr", "monthly_tod")
# # monthly dow
# df_hr_monthly_feats = get_hr_intraday_feats(list(df_hr["device_id"].unique()), df_survey, ["device_id", "monthly_lbl", "dow_lbl"])
# output_features(df_hr_monthly_feats, "hr", "monthly_dow")
# # monthly tod, dow
# df_hr_monthly_feats = get_hr_intraday_feats(list(df_hr["device_id"].unique()), df_survey, ["device_id", "monthly_lbl", "tod_lbl", "dow_lbl"])
# output_features(df_hr_monthly_feats, "hr", "monthly_tod_dow")




In [None]:
# display(df_hr_biweekly_feats.head(2))
# display(df_hr_biweekly_feats.isnull().sum(axis = 0))
# display(df_hr_monthly_feats.head(2))
# display(df_hr_monthly_feats.isnull().sum(axis = 0))

# # # OUTPUT
# # output_features(df_hr_biweekly_feats, "hr", "biweekly")
# output_features(df_hr_monthly_feats, "hr", "monthly")


# # weekly
# df_hr_weekly_feats = df_hr.groupby(["device_id", "wknum_wrt_wk12"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_weekly_feats, "hr", "weekly")
# # weekly tod
# df_hr_weekly_feats = df_hr.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_weekly_feats, "hr", "weekly_tod")
# # weekly dow
# df_hr_weekly_feats = df_hr.groupby(["device_id", "wknum_wrt_wk12", "dow_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_weekly_feats, "hr", "weekly_dow")
# # weekly tod, dow
# df_hr_weekly_feats = df_hr.groupby(["device_id", "wknum_wrt_wk12", "tod_lbl", "dow_lbl"]).apply((lambda x: add_features_mean(x, hr_input_feats))).reset_index()
# output_features(df_hr_weekly_feats, "hr", "weekly_tod_dow")


# # weekly
# df_hr_weekly_feats = get_hr_intraday_feats(list(df_hr["device_id"].unique()), df_survey, ["device_id", "wknum_wrt_wk12"])
# output_features(df_hr_weekly_feats, "hr", "weekly")
# # weekly tod
# df_hr_weekly_feats = get_hr_intraday_feats(list(df_hr["device_id"].unique()), df_survey, ["device_id", "wknum_wrt_wk12", "tod_lbl"])
# output_features(df_hr_weekly_feats, "hr", "weekly_tod")
# # weekly dow
# df_hr_weekly_feats = get_hr_intraday_feats(list(df_hr["device_id"].unique()), df_survey, ["device_id", "wknum_wrt_wk12", "dow_lbl"])
# output_features(df_hr_weekly_feats, "hr", "weekly_dow")
# # weekly tod, dow
# df_hr_weekly_feats = get_hr_intraday_feats(list(df_hr["device_id"].unique()), df_survey, ["device_id", "wknum_wrt_wk12", "tod_lbl", "dow_lbl"])
# output_features(df_hr_weekly_feats, "hr", "weekly_tod_dow")
