In [18]:
from src.utils import data_load
import pandas as pd
import matplotlib.pyplot as plt
from src.s3_utils import pandas_from_csv_s3
from sklearn.model_selection import KFold
import re
import datetime
import seaborn as sns
import numpy as np
from collections import defaultdict
import os
import pickle
import json
import math
import random
# Seed both global RNGs (NumPy and the stdlib `random` module) with the same
# fixed value so that the random window sampling below is repeatable.
np.random.seed(0)
random.seed(0)

In [19]:
# Load only the three tables this notebook works with.
# NOTE(review): `wave=5` presumably selects a study wave -- confirm against
# src.utils.data_load.
keys = set(('oura_sleep', 'oura_activity', 'birth'))
data = data_load(wave=5, data_keys=keys)

In [20]:
# Columns taken from the sleep table as per-night features.
oura_sleep_list = [
    'hr_lowest',
    'hr_average',
    'rmssd',
    'deep',
    'light',
    'awake',
    'rem',
]
# No activity columns are used as features; the activity table is still joined
# on (user_id, date) further down.
oura_activity_list = list()
# Columns kept from the birth table.
birth_list = [
    'user_id',
    'birth_date',
    'birth_scheduled',
    'birth_gestage',
]

In [21]:
# Keep the join keys plus the selected feature columns from each Oura table.
oura_df = data['oura_sleep'][['user_id', 'date'] + oura_sleep_list]
oura_activity = data['oura_activity'][['user_id', 'date'] + oura_activity_list]
# Inner join: only (user_id, date) pairs present in BOTH tables survive.
oura_df = pd.merge(oura_df, oura_activity, on=['user_id', 'date'], how='inner')
oura_df['date'] = pd.to_datetime(oura_df['date'])
# Take an explicit copy before adding/overwriting a column: assigning into a
# column slice of data['birth'] triggers SettingWithCopyWarning and leaves it
# ambiguous whether the loaded table is modified as well.
birth_df = data['birth'][birth_list].copy()
birth_df['birth_date'] = pd.to_datetime(birth_df['birth_date'])

In [22]:
# Inspect which columns the loaded sleep table provides.
data['oura_sleep'].columns

Index(['id', 'user_id', 'identity_id', 'created_at', 'updated_at',
       'retrieved_at', 'subsource', 'event_date', 'awake', 'bedtime_end',
       'bedtime_end_delta', 'bedtime_start', 'bedtime_start_delta',
       'breath_average', 'deep', 'duration', 'efficiency', 'hr_5min',
       'hr_average', 'hr_lowest', 'hypnogram_5min', 'is_longest', 'light',
       'midpoint_at_delta', 'midpoint_time', 'onset_latency', 'period_id',
       'rem', 'restless', 'rmssd', 'rmssd_5min', 'score', 'score_alignment',
       'score_deep', 'score_disturbances', 'score_efficiency', 'score_latency',
       'score_rem', 'score_total', 'temperature_delta',
       'temperature_deviation', 'temperature_trend_deviation', 'timezone',
       'total', 'date'],
      dtype='object')

In [39]:
# Build the positive windows: runs of `delta` consecutive days shortly before
# the recorded birth date.
#
# With start=5 and delta=3 there are (start - delta + 1) = 3 slides per user,
# beginning 4, 3 and 2 days before birth_date.  Each window is the half-open
# date range [start_date, start_date + delta), so the last slide includes the
# birth date itself.
delta = 3  # window length in days
start = 5  # the earliest window begins (start - 1) days before birth_date
clean_positive_data = defaultdict(
    list,
    {k: [] for k in oura_sleep_list + oura_activity_list + ['user_id', 'start_date']},
)
for uid in birth_df['user_id'].unique():
    df = oura_df.loc[oura_df['user_id'] == uid].sort_values(by='date')
    if len(df) == 0:
        continue

    # The birth record is the same for every slide, so look it up once per
    # user instead of once per slide.
    birth_rows = birth_df.loc[birth_df['user_id'] == uid]
    birth_date = birth_rows['birth_date'].tolist()[0]
    birth_scheduled = birth_rows['birth_scheduled'].tolist()[0]
    # NOTE(review): code 2 presumably marks a non-scheduled birth (the output
    # directory is called "non_schedule") -- confirm against the codebook.
    if birth_scheduled != 2:
        continue

    for slide_i in range(start - delta + 1):
        start_date = birth_date + pd.to_timedelta(-start + slide_i + 1, unit='d')
        end_date = start_date + pd.to_timedelta(delta, unit='d')
        each_df = df[(df['date'] >= start_date) & (df['date'] < end_date)]
        # Keep the window only if it has exactly `delta` rows ...
        if len(each_df) != delta:
            continue
        # ... and none of those rows has a missing value.
        each_df = each_df.dropna()
        if len(each_df) != delta:
            continue
        clean_positive_data['user_id'].append(int(uid))
        # Stored as "days before birth" at which the window starts.
        clean_positive_data['start_date'].append((birth_date - start_date).days)
        for col in oura_sleep_list + oura_activity_list:
            clean_positive_data[col].append(np.array(each_df[col].tolist()))

In [40]:
# Number of positive windows, and number of distinct users they come from.
print(len(clean_positive_data['user_id']), len(set(clean_positive_data['user_id'])))

133 65


In [41]:
# Spot-check the hr_average values stored for the first positive window.
clean_positive_data['hr_average'][0]

array([72.79, 73.41, 68.65])

In [42]:
def random_pick(start_date, end_date, duration):
    """Pick a random window of `duration` days between two dates.

    The window start is drawn uniformly (whole-day steps) so that the window
    ``[window_start, window_start + duration)`` stays inside
    ``[start_date, end_date)``.  Previously the start was drawn from the whole
    range, so the window could run up to ``duration - 1`` days past
    ``end_date``.

    Args:
        start_date: earliest allowed window start (datetime-like).
        end_date: exclusive upper bound of the sampling range (datetime-like).
        duration: window length in days.

    Returns:
        ``[window_start, window_end]`` with
        ``window_end == window_start + duration`` days.

    Raises:
        ValueError: if the range spans less than one whole day.

    If the range is at least one day long but shorter than `duration`, no
    window can fit; the window then starts at `start_date` (and necessarily
    overruns `end_date`) so that callers which only guard on
    ``start_date < end_date`` keep working.
    """
    days_between_dates = (end_date - start_date).days
    if days_between_dates <= 0:
        raise ValueError(
            f"end_date ({end_date}) must be at least one day after start_date ({start_date})"
        )

    # Number of admissible start offsets that keep the window inside the range.
    start_offsets = max(days_between_dates - duration + 1, 1)
    random_number_of_days = random.randrange(start_offsets)
    random_date = start_date + datetime.timedelta(days=random_number_of_days)

    return [random_date, random_date + datetime.timedelta(days=duration)]

In [43]:
# Full list of feature columns collected for every window.
oura_sleep_list + oura_activity_list

['hr_lowest', 'hr_average', 'rmssd', 'deep', 'light', 'awake', 'rem']

In [53]:
# Negative windows for users who also have positive windows: one random
# `delta`-day window per user, with its start drawn from the user's history
# before (birth_date - start days).
cut_off_day = 30
clean_negative_data_same = defaultdict(
    list,
    {k: [] for k in oura_sleep_list + oura_activity_list + ['user_id', 'start_date']},
)
for uid in set(clean_positive_data['user_id']):
    df = oura_df.loc[oura_df['user_id'] == uid].sort_values(by='date')
    if len(df) == 0:
        continue

    birth_rows = birth_df.loc[birth_df['user_id'] == uid]
    birth_date = birth_rows['birth_date'].tolist()[0]
    birth_scheduled = birth_rows['birth_scheduled'].tolist()[0]
    if birth_scheduled != 2:
        continue

    end_date = birth_date + pd.to_timedelta(-start, unit='d')
    # Sample from the user's whole recorded history.  To restrict sampling to
    # the last `cut_off_day` days before end_date, use instead:
    # start_date = end_date + pd.to_timedelta(-cut_off_day, unit='d')
    start_date = df['date'].min()

    # Same guard as the cell for users without positive windows: a user whose
    # data only begins after the cut-off has nothing to sample from, and
    # random_pick raises on an empty range.
    if not start_date < end_date:
        print(start_date, end_date, birth_date, uid)
        continue

    # Try up to 1000 random windows and keep the first complete one.
    for count in range(1000):
        random_days = random_pick(start_date, end_date, delta)
        each_df = df[(df['date'] >= random_days[0]) & (df['date'] < random_days[1])]
        if len(each_df) < delta:
            continue
        each_df = each_df.dropna()
        if len(each_df) != delta:
            continue
        clean_negative_data_same['user_id'].append(int(uid))
        clean_negative_data_same['start_date'].append((birth_date - random_days[0]).days)
        for col in oura_sleep_list + oura_activity_list:
            clean_negative_data_same[col].append(np.array(each_df[col].tolist()))
        break

In [54]:
# Negative windows for users who contributed no positive window: one random
# `delta`-day window per user, with its start drawn from the history before
# (birth_date - start days).
clean_negative_data_diff = defaultdict(
    list,
    {name: list() for name in oura_sleep_list + oura_activity_list + ['user_id']},
)
for uid in birth_df['user_id'].unique():
    if uid in clean_positive_data['user_id']:
        continue

    df = oura_df.loc[oura_df['user_id'] == uid].sort_values(by='date')
    if len(df) == 0:
        continue

    birth_rows = birth_df.loc[birth_df['user_id'] == uid]
    birth_date = birth_rows['birth_date'].tolist()[0]
    birth_scheduled = birth_rows['birth_scheduled'].tolist()[0]
    if birth_scheduled != 2:
        continue

    end_date = birth_date + pd.to_timedelta(-start, unit='d')
    start_date = df['date'].min()
    # start_date = end_date + pd.to_timedelta(-cut_off_day, unit='d')

    if not start_date < end_date:
        # Nothing to sample from: report the user and move on.
        print(start_date, end_date, birth_date, uid)
        continue

    # Up to 1000 attempts; the first window with `delta` complete rows wins.
    for count in range(1000):
        random_days = random_pick(start_date, end_date, delta)
        in_window = (df['date'] >= random_days[0]) & (df['date'] < random_days[1])
        each_df = df[in_window]
        if len(each_df) < delta:
            continue
        each_df = each_df.dropna()
        if len(each_df) != delta:
            continue
        clean_negative_data_diff['user_id'].append(int(uid))
        clean_negative_data_diff['start_date'].append((birth_date - random_days[0]).days)
        for col in oura_sleep_list + oura_activity_list:
            clean_negative_data_diff[col].append(np.array(each_df[col].tolist()))
        break

In [55]:
# Print the length of the first hr_average window in each of the three sets.
print(len(clean_negative_data_same['hr_average'][0]))
print(len(clean_negative_data_diff['hr_average'][0]))
print(len(clean_positive_data['hr_average'][0]))

3
3
3


In [56]:
# Turn each set's user_id list into a NumPy array so that the fold-building
# cells can select windows with element-wise `== user_id` comparisons.
for _window_set in (clean_positive_data, clean_negative_data_diff, clean_negative_data_same):
    _window_set['user_id'] = np.array(_window_set['user_id'])

In [52]:
# Total number of negative windows across both negative sets.
len(clean_negative_data_same['user_id'].tolist() + clean_negative_data_diff['user_id'].tolist())

110

In [32]:
# k-fold users that have both pos and neg windows
train = {'X': [], 'y': [], 'uid': [], 'start_date': [], 'feature_name': []}
test = {'X': [], 'y': [], 'uid': [], 'start_date': [], 'feature_name': []}


def _window_indices(window_set, user_ids):
    """Return the positions of all windows in `window_set` owned by `user_ids`.

    Positions are grouped in the order of `user_ids`.  `np.asarray` makes the
    element-wise comparison work even if `window_set['user_id']` is still a
    plain list; comparing a list with `==` would silently match nothing.
    """
    owners = np.asarray(window_set['user_id'])
    positions = []
    for user in user_ids:
        positions.extend(np.where(owners == user)[0].tolist())
    return positions


def _append_windows(fold, window_set, positions, label):
    """Append the windows at `positions` to the per-fold lists in `fold`.

    Each window becomes one 2-D array with one row per feature (activity
    features first, then sleep features) and is given the class `label`.
    """
    for i in positions:
        rows = [window_set[feature][i] for feature in oura_activity_list + oura_sleep_list]
        fold['X'].append(np.vstack(rows))
        fold['y'].append(label)
        fold['uid'].append(window_set['user_id'][i])
        fold['start_date'].append(window_set['start_date'][i])


both_users = list(set(clean_positive_data['user_id']))
kf_both = KFold(n_splits=10, random_state=0, shuffle=True)
for fold_i, (train_index, test_index) in enumerate(kf_both.split(both_users)):
    print(f"Fold {fold_i}")
    train_ids = list(np.array(both_users)[train_index]) # user ids of training
    test_ids = list(np.array(both_users)[test_index]) # user ids of testing

    fold_train = {'X': [], 'y': [], 'uid': [], 'start_date': []}
    fold_test = {'X': [], 'y': [], 'uid': [], 'start_date': []}

    # training users: negatives first, then positives
    _append_windows(fold_train, clean_negative_data_same,
                    _window_indices(clean_negative_data_same, train_ids), 0)
    _append_windows(fold_train, clean_positive_data,
                    _window_indices(clean_positive_data, train_ids), 1)

    # held-out users: their negative windows are added to TRAINING, so the
    # test split of a fold contains positive windows only.
    # NOTE(review): this puts data from test users into the training split --
    # confirm this is intended.
    _append_windows(fold_train, clean_negative_data_same,
                    _window_indices(clean_negative_data_same, test_ids), 0)
    _append_windows(fold_test, clean_positive_data,
                    _window_indices(clean_positive_data, test_ids), 1)

    # append both train and test data to the main data dict
    for key in ('X', 'y', 'uid', 'start_date'):
        train[key].append(fold_train[key])
        test[key].append(fold_test[key])


Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9


In [33]:
# k-fold users that only have neg windows
#
# Both halves of every split are appended to the TRAINING data of the matching
# fold (label 0); nothing is added to `test` here.  The split therefore only
# determines the order in which the windows are appended.
#
# This cell extends the per-fold lists in `train` in place: running it twice
# appends the same windows twice, so rebuild `train` first when re-running.


def _neg_diff_indices(user_ids):
    """Positions in clean_negative_data_diff owned by `user_ids`, in that order."""
    # np.asarray keeps the element-wise comparison valid even if the user_id
    # column is still a plain list (list == scalar would match nothing).
    owners = np.asarray(clean_negative_data_diff['user_id'])
    positions = []
    for user in user_ids:
        positions.extend(np.where(owners == user)[0].tolist())
    return positions


only_neg_users = clean_negative_data_diff['user_id']
kf_only = KFold(n_splits=10, random_state=0, shuffle=True)
# Fail up front, before any fold is modified, if the per-fold lists are missing.
if len(train['X']) < kf_only.get_n_splits():
    raise RuntimeError(
        f"train holds {len(train['X'])} folds but {kf_only.get_n_splits()} are needed; "
        "build the folds for users with positive windows first"
    )
for fold_i, (train_index, test_index) in enumerate(kf_only.split(only_neg_users)):
    print(f"Fold {fold_i}")
    train_ids = list(np.array(only_neg_users)[train_index]) # user ids of training
    test_ids = list(np.array(only_neg_users)[test_index]) # user ids of testing

    # get window positions for both halves of the split
    train_indices_neg_diff = _neg_diff_indices(train_ids)
    test_indices_neg_diff = _neg_diff_indices(test_ids)

    # append the windows of both halves to the training data of this fold
    for i in train_indices_neg_diff + test_indices_neg_diff:
        x_processed = [clean_negative_data_diff[feature][i]
                       for feature in oura_activity_list + oura_sleep_list]
        train['X'][fold_i].append(np.vstack(x_processed))
        train['y'][fold_i].append(0)
        train['uid'][fold_i].append(clean_negative_data_diff['user_id'][i])
        train['start_date'][fold_i].append(clean_negative_data_diff['start_date'][i])

Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9


In [34]:
# Stack every fold's windows into one array and pickle one file per split and
# fold: train_<fold>.pickle and test_<fold>.pickle.
#
# NOTE(review): the directory name embeds `cut_off_day`, but the lines that
# would apply that cut-off when sampling negative windows are commented out in
# the sampling cells -- confirm that the directory matches how the data was
# actually built.  The directory used for the unrestricted history is
# /repos/Delivery-Readiness/data/daily_new_split_non_schedule/largest_window
out_dir = f'/repos/Delivery-Readiness/data/daily_new_split_non_schedule/{cut_off_day}days'
# Create the target directory if needed instead of failing inside open().
os.makedirs(out_dir, exist_ok=True)

for split_name, split in (('train', train), ('test', test)):
    for fold_i in range(len(split['X'])):
        # Replace the list of per-window arrays by a single stacked array.
        split['X'][fold_i] = np.stack(split['X'][fold_i])
        fold_payload = {
            'X': split['X'][fold_i],
            'y': split['y'][fold_i],
            'uid': split['uid'][fold_i],
            'start_date': split['start_date'][fold_i],
            'feature_name': oura_activity_list + oura_sleep_list,
        }
        with open(f'{out_dir}/{split_name}_{fold_i}.pickle', 'wb') as handle:
            pickle.dump(fold_payload, handle)