In [3]:
from src.utils import data_load
import pandas as pd
import matplotlib.pyplot as plt
from src.s3_utils import pandas_from_csv_s3
import re
import datetime
import seaborn as sns
import numpy as np
from collections import defaultdict
import os
import pickle
import json
import math
import random
random.seed(0)
np.random.seed(0)

In [4]:
keys = {'oura_sleep', 'oura_activity', 'birth'}
data = data_load(data_keys=keys, wave=5)

In [7]:
oura_sleep_list = ['hr_lowest', 'hr_average', 'rmssd', 'deep', 'light', 'awake', 'rem']
oura_activity_list = ['high', 'medium', 'low', 'inactive', 'rest', 'met_min_inactive', 'met_min_low', 'met_min_medium', 'met_min_high', 'average_met']
birth_list = ['user_id', 'birth_date', 'birth_scheduled', 'birth_gestage']

In [8]:
oura_df = data['oura_sleep'][['user_id', 'date'] + oura_sleep_list]
oura_activity = data['oura_activity'][['user_id', 'date'] + oura_activity_list]
oura_df = pd.merge(oura_df, oura_activity, on=['user_id', 'date'], how='inner')
oura_df['date'] = pd.to_datetime(oura_df['date'])
birth_df = data['birth'][birth_list]
birth_df['birth_date'] = pd.to_datetime(birth_df['birth_date'])

In [9]:
data['oura_sleep'].columns

Index(['id', 'user_id', 'identity_id', 'created_at', 'updated_at',
       'retrieved_at', 'subsource', 'event_date', 'awake', 'bedtime_end',
       'bedtime_end_delta', 'bedtime_start', 'bedtime_start_delta',
       'breath_average', 'deep', 'duration', 'efficiency', 'hr_5min',
       'hr_average', 'hr_lowest', 'hypnogram_5min', 'is_longest', 'light',
       'midpoint_at_delta', 'midpoint_time', 'onset_latency', 'period_id',
       'rem', 'restless', 'rmssd', 'rmssd_5min', 'score', 'score_alignment',
       'score_deep', 'score_disturbances', 'score_efficiency', 'score_latency',
       'score_rem', 'score_total', 'temperature_delta',
       'temperature_deviation', 'temperature_trend_deviation', 'timezone',
       'total', 'date'],
      dtype='object')

In [31]:
clean_positive_data = defaultdict(list,{ k:[] for k in oura_sleep_list + oura_activity_list + ['user_id', 'start_date'] })
delta = 3
start = 5
for uid in birth_df['user_id'].unique():
    df = oura_df.loc[oura_df['user_id'] == uid].sort_values(by='date')
    if len(df) > 0:
        for slide_i in range(start - delta + 1):
            birth_date = birth_df.loc[birth_df['user_id'] == uid]['birth_date'].tolist()[0]
            start_date = birth_date + pd.to_timedelta(-start + slide_i + 1, unit='d')
            end_date = start_date + pd.to_timedelta(delta, unit='d')
            each_df = df[(df['date'] >= start_date) & (df['date'] < end_date)]
            if len(each_df) == delta:   
                each_df = each_df.dropna()
                if len(each_df) == delta:
                    clean_positive_data['user_id'].append(int(uid))   
                    clean_positive_data['start_date'].append((birth_date - start_date).days)
                    for col in oura_sleep_list + oura_activity_list:
                        clean_positive_data[col].append(np.array(each_df[col].tolist()))

In [32]:
print(len(clean_positive_data['user_id']), len(set(clean_positive_data['user_id'])))

282 136


In [35]:
clean_positive_data['hr_average'][0]

array([72.79, 73.41, 68.65])

In [36]:
def random_pick(start_date, end_date, duration):
    time_between_dates = end_date - start_date
    days_between_dates = time_between_dates.days
    random_number_of_days = random.randrange(days_between_dates)
    random_date = start_date + datetime.timedelta(days=random_number_of_days)

    return [random_date, random_date + datetime.timedelta(days=duration)]

In [37]:
oura_sleep_list + oura_activity_list

['hr_lowest',
 'hr_average',
 'rmssd',
 'deep',
 'light',
 'awake',
 'rem',
 'high',
 'medium',
 'low',
 'inactive',
 'rest',
 'met_min_inactive',
 'met_min_low',
 'met_min_medium',
 'met_min_high',
 'average_met']

In [64]:
clean_negative_data_same = defaultdict(list,{ k:[] for k in oura_sleep_list + oura_activity_list + ['user_id'] })
for uid in set(clean_positive_data['user_id']):
    df = oura_df.loc[oura_df['user_id'] == uid].sort_values(by='date')
    if len(df) > 0:
        birth_date = birth_df.loc[birth_df['user_id'] == uid]['birth_date'].tolist()[0]
        end_date = birth_date + pd.to_timedelta(-start, unit='d')

        # start_date = df['date'].min()
        start_date = end_date + pd.to_timedelta(-30, unit='d')
        
        for count in range(1000):
            random_days = random_pick(start_date, end_date, delta)
            each_df = df[(df['date'] >= random_days[0]) & (df['date'] < random_days[1])]
            if len(each_df) >= delta:
                each_df = each_df.dropna()
                if len(each_df) == delta:
                    clean_negative_data_same['user_id'].append(int(uid))   
                    clean_negative_data_same['start_date'].append((birth_date - random_days[0]).days)
                    for col in oura_sleep_list + oura_activity_list:
                        clean_negative_data_same[col].append(np.array(each_df[col].tolist()))
                    break
            else:
                continue

In [65]:
clean_negative_data_diff = defaultdict(list,{ k:[] for k in oura_sleep_list + oura_activity_list + ['user_id'] })
for uid in birth_df['user_id'].unique():
    if uid not in clean_positive_data['user_id']:
        df = oura_df.loc[oura_df['user_id'] == uid].sort_values(by='date')
        if len(df) > 0:
            birth_date = birth_df.loc[birth_df['user_id'] == uid]['birth_date'].tolist()[0]
            end_date = birth_date + pd.to_timedelta(-start, unit='d')

            # start_date = df['date'].min()
            start_date = end_date + pd.to_timedelta(-30, unit='d')

            if start_date < end_date:
                for count in range(1000):  
                    random_days = random_pick(start_date, end_date, delta)
                    each_df = df[(df['date'] >= random_days[0]) & (df['date'] < random_days[1])]
                    if len(each_df) >= delta:
                        each_df = each_df.dropna()
                        if len(each_df) == delta:
                            clean_negative_data_diff['user_id'].append(int(uid))   
                            clean_negative_data_diff['start_date'].append((birth_date - random_days[0]).days)
                            for col in oura_sleep_list + oura_activity_list:
                                clean_negative_data_diff[col].append(np.array(each_df[col].tolist()))
                            break
                    else:
                        continue
            else:
                print(start_date, end_date, birth_date, uid)

In [66]:
print(len(clean_negative_data_same['hr_average'][0]))
print(len(clean_negative_data_diff['hr_average'][0]))
print(len(clean_positive_data['hr_average'][0]))

3
3
3


In [67]:
clean_positive_data['user_id'] = np.array(clean_positive_data['user_id'])
clean_negative_data_diff['user_id'] = np.array(clean_negative_data_diff['user_id'])
clean_negative_data_same['user_id'] = np.array(clean_negative_data_same['user_id'])

In [68]:
clean_positive_data['user_id']

array([  29,   30,   30,   30,   37,   39,   39,   39,   42,   42,   45,
         53,   55,   55,   62,   62,   64,   64,   66,   66,   67,   67,
         67,   74,   74,   95,   95,   99,   99,  122,  122,  137,  137,
        158,  159,  168,  168, 1000, 1000, 1000, 1400, 1400,  173,  173,
        174,  185,  186,  186, 1035,  190,  192,  192,  192,  193,  199,
        200,  200, 1021, 1021,  976,  976,  972,  972, 1724, 1724, 1724,
       1004, 1004,  234,  975,  975,  975,  289,  289,  290,  404,  404,
        404,  407,  407,  407,  408,  408,  410,  977,  977,  977, 1047,
       1047,  428,  428,  428,  429,  980,  980,  980,  581,  581,  603,
        603,  603,  604,  604,  604, 1032, 1032,  615,  615,  615,  734,
       1658, 1658, 1723, 1723,  983,  983,  983, 1659,  969,  989,  989,
        991,  991,  991,  992,  992, 1374, 1374,  997, 1005, 1005, 1023,
       1023, 1023, 1024, 1024, 1041, 1041, 1403, 1403, 1037, 1037, 1038,
       1038, 1044, 1044, 1369, 1370, 1370, 1759, 13

In [69]:
np.where(clean_positive_data['user_id']==30)

(array([1, 2, 3]),)

In [70]:
train = {'X': [], 'y': [], 'uid': [], 'start_date': [], 'feature_name': []}
test = {'X': [], 'y': [], 'uid': [], 'start_date': [], 'feature_name': []}

both_users = list(set(clean_positive_data['user_id']))
random_seeds = list(range(len(both_users)))
positive_len = int(len(both_users) * 0.8)
random.shuffle(random_seeds)
train_ids = list(np.array(both_users)[random_seeds[:positive_len]]) # user ids of training 
test_ids = list(np.array(both_users)[random_seeds[positive_len:]]) # user ids of testing
train_indices_pos = []
train_indices_neg_same = []
train_indices_neg_diff = []
test_indices_pos = []
test_indices_neg_same = []
test_indices_neg_diff = []
for x in train_ids:
    train_indices_pos.extend(np.where(clean_positive_data['user_id']==x)[0].tolist())
    train_indices_neg_same.extend(np.where(clean_negative_data_same['user_id']==x)[0].tolist())
for x in test_ids:
    test_indices_pos.extend(np.where(clean_positive_data['user_id']==x)[0].tolist())
    test_indices_neg_same.extend(np.where(clean_negative_data_same['user_id']==x)[0].tolist())

for i in range(len(train_indices_neg_same)):
    x_processed = []
    for feature in oura_activity_list + oura_sleep_list:
        x_processed.append(clean_negative_data_same[feature][i])
    train['X'].append(np.vstack(x_processed))
    train['y'].append(0)
    train['uid'].append(clean_negative_data_same['user_id'][i])
    train['start_date'].append(clean_negative_data_same['start_date'][i])

for i in range(len(train_indices_pos)):
    x_processed = []
    for feature in oura_activity_list + oura_sleep_list:
        x_processed.append(clean_positive_data[feature][i])
    train['X'].append(np.vstack(x_processed))
    train['y'].append(1)
    train['uid'].append(clean_positive_data['user_id'][i])
    train['start_date'].append(clean_positive_data['start_date'][i])

for i in range(len(test_indices_neg_same)):
    x_processed = []
    for feature in oura_activity_list + oura_sleep_list:
        x_processed.append(clean_negative_data_same[feature][i])
    test['X'].append(np.vstack(x_processed))
    if np.isnan(test['X'][-1]).any():
        print(np.isnan(test['X'][-1]).any(), clean_negative_data_same['user_id'][i])
    test['y'].append(0)
    test['uid'].append(clean_negative_data_same['user_id'][i])
    test['start_date'].append(clean_negative_data_same['start_date'][i])

for i in range(len(test_indices_pos)):
    x_processed = []
    for feature in oura_activity_list + oura_sleep_list:
        x_processed.append(clean_positive_data[feature][i])
    test['X'].append(np.vstack(x_processed))
    test['y'].append(1)
    test['uid'].append(clean_positive_data['user_id'][i])
    test['start_date'].append(clean_positive_data['start_date'][i])

In [71]:
only_neg_users = clean_negative_data_diff['user_id']
negative_len = int(len(only_neg_users) * 0.8)
random.shuffle(only_neg_users)
for i in range(len(only_neg_users)):
    x_processed = []
    for feature in oura_activity_list + oura_sleep_list:
        x_processed.append(clean_negative_data_diff[feature][i])
    if i < negative_len:
        train['X'].append(x_processed)
        train['y'].append(0)
        train['uid'].append(only_neg_users[i])
        train['start_date'].append(clean_negative_data_diff['start_date'][i])
    else:
        test['X'].append(x_processed)
        test['y'].append(0)
        test['uid'].append(only_neg_users[i])
        test['start_date'].append(clean_negative_data_diff['start_date'][i])

train['feature_name'] = ['hr_5min', 'rmssd_5min', 'hypnogram_5min', 'class_5min', 'met_5min']
test['feature_name'] = ['hr_5min', 'rmssd_5min', 'hypnogram_5min', 'class_5min', 'met_5min']

In [72]:
train['X'] = np.stack(train['X'])
test['X'] = np.stack(test['X'])

In [73]:
len(train['start_date'])

395

In [74]:
train['X'].shape

(395, 17, 3)

In [75]:
with open('data/daily/train_month.pickle', 'wb') as handle:
    pickle.dump(train, handle)
with open('data/daily/test_month.pickle', 'wb') as handle:
    pickle.dump(test, handle)