Registration information:

- [The relationship between sleep and academic performance in first-year college students: a longitudinal, multi-university analysis](https://osf.io/5xngv?view_only=bf5d479c7e4a409dbd72b15165bfd560)
- [The relationship between sleep and academic performance in first-year college students: a longitudinal, multi-university analysis (NetHealth continuation)](https://osf.io/x76b4?view_only=3d8f8c27841d4a5e90fce711c936c0ed)

In [1]:
import os
import pandas as pd
import numpy as np
from numpy.polynomial.polynomial import polyfit
import csv, json, sys
import math
import matplotlib.pyplot as plt
import matplotlib.dates as dates

from shutil import copyfile
from pandas.io.json import json_normalize
from tqdm import tqdm
from datetime import datetime, timedelta, time, date
from sklearn.linear_model import LinearRegression
from scipy import stats
from statistics import median
import statsmodels.api as sm
import warnings
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from matplotlib.transforms import offset_copy
import seaborn as sns


#warnings.simplefilter("ignore")


**Start Of UW-Specific Work**

The raw UWEXP Fitbit data was provided as JSON files. We converted these to csv files in the same format as the Life@CMU datasets to run this data through the same pipeline. The following section converts the raw UWEXP data (given in "Daily Sleep/") to "sleep_raw_data/" and "steps_raw_data/"

In [None]:
# copy all json files from raw data to json folder
def copyAllJsonFilesFromRawToJsonFolder(phase):

    if phase == 'uw1':
        uw = 'Fitbit UWEXPI/'
    elif phase == 'uw2':
        uw = 'Fitbit UWEXPII/'
    else:
        raise('Invalid phase argument')

    raw_dir = os.path.join(uw, 'Daily Sleep')
    csv_dir = os.path.join(uw, 'daily_sleep_raw_csv')
    json_dir = os.path.join(uw, 'daily_sleep_raw_json')

    # create directories if they don't yet exist
    dir_list = [csv_dir, json_dir]
    for d in dir_list:
        if not os.path.exists(d): 
            os.makedirs(d)

    for f in os.listdir(raw_dir):
        if f[-4:] == 'json': 
            copyfile(os.path.join(raw_dir,f),
                     os.path.join(json_dir,f))

In [None]:
# UNCOMMENT TO RUN
copyAllJsonFilesFromRawToJsonFolder('uw1')
copyAllJsonFilesFromRawToJsonFolder('uw2')

In [None]:
# Converts one section of sleep episode data to a pandas dataframe
# input: json_data[0]['sleep'][0] = one sleep episode data in JSON
# output: dataframe with a dateTime = '%Y-%m-%d %H:%M:%S' and 'value' column
def getDF(json_sleep_episode):
    
    # get the JSON values as dataframe
    json_df = json_normalize(json_sleep_episode['minuteData']) # YSS: could just use pd.DataFrame(json_sleep_episode['minuteData'])
    
    # create a minute index with all the minutes in the sleep episode
    start = json_sleep_episode['startTime']
    end = json_sleep_episode['endTime']
    TIME_INDEX = pd.date_range(start=start,end=end,freq='min')[:-1] # [:-1] b/c json excludes end point
    
    #YSS
    if len(TIME_INDEX) != json_df.shape[0]:
        # it is expected that there is a one-to-one matching between dateTime values in minuteData 
        # and the time index generated between start and end times
        print('\t\tinvestigate mismatch in timing of sleep data')
    
    # build final dataframe
    df = pd.DataFrame(index=TIME_INDEX)
    df['dt'] = df.index.strftime('%Y-%m-%d %H:%M:%S')
    df['time'] = df['dt'].apply(lambda x: x.split()[1])
    df = df.merge(json_df, how='left', left_on='time', right_on='dateTime')
    df = df[['dt', 'value']]
    df = df.rename(columns={'dt':'dateTime'})
    
    #YSS
    if len(TIME_INDEX) != df.shape[0]:
        # it is expected that there is a one-to-one matching between dateTime values in minuteData 
        # and the time index generated between start and end times
        print('\t\tinvestigate issue in timestamping sleep data')
    
    return df

In [None]:
# input: json file
# output: raw sleep dataframe
def getParticipantSleepData(filename):

    with open(filename, 'r') as f:
        data = f.readlines()

    # fix JSON formatting errors
    count = data[0].count("}}") - 1
    data_val = '[' + data[0].replace('}}','}},', count) + ']'

    # load JSON data
    json_data = json.loads(data_val)
    
    # initialize empty dataframe
    all_df = pd.DataFrame()
    #all_df = [] #YSS
    
    # append all sleep episodes to empty dataframe
    for idx, sleep_day in enumerate(json_data):
        print('\tprocessing sleep day {:03}...'.format(idx)) #YSS
        for episode in sleep_day['sleep']:
            episode_df = getDF(episode)
            all_df = all_df.append(episode_df) #YSS use of concatenation is more efficient
            
    return all_df
    #return pd.concat(all_df) #YSS

In [None]:
def convertAllJSONToCSV(phase):
    
    print('Beginning directory conversion...')

    if phase == 'uw1':
        uw = 'Fitbit UWEXPI/'
    elif phase == 'uw2':
        uw = 'Fitbit UWEXPII/'
    else:
        raise('Invalid phase argument')

    csv_dir = os.path.join(uw, 'daily_sleep_raw_csv') #YSS sleep_raw_csv -> daily_sleep_raw_csv
    json_dir = os.path.join(uw, 'daily_sleep_raw_json')        
        
    for fn in tqdm(os.listdir(json_dir)):
        print('converting {}...'.format(fn)) #YSS
        df = getParticipantSleepData(os.path.join(json_dir, fn))
        new_fn = os.path.join(csv_dir,fn.replace('.json', '.csv'))
        df.to_csv(new_fn, index=False)

In [None]:
# UNCOMMENT TO RUN
convertAllJSONToCSV('uw1')
convertAllJSONToCSV('uw2')
#YSS no further investigation was reported (good!)

In [None]:
# Convert steps csv to correct format
# columns: datetime (%Y-%m-%d %H:%M:%S) and steps
def processStepsFile(src, dest):
    df = pd.read_csv(src)
    df['datetime'] = df.date.str.cat(df.time, sep=" ")
    new_df = df[['datetime', 'steps']]
    new_df.to_csv(dest, index=False)

In [None]:
# Get entire steps raw folder
def processAllSteps(phase):
    
    print('Beginning directory conversion...')
    
    if phase == 'uw1':
        uw = 'Fitbit UWEXPI/'
    elif phase == 'uw2':
        uw = 'Fitbit UWEXPII/'
    else:
        raise('Invalid phase argument')


    steps_all_dir = os.path.join(uw, 'Daily Step Details')
    steps_dir = os.path.join(uw, 'daily_steps_raw_data') #YSS steps_raw_data -> daily_steps_raw_data
    
    if not os.path.exists(steps_dir): 
        os.makedirs(steps_dir)    
    
    for fn in tqdm(os.listdir(steps_all_dir)):
        if fn[-3:] == 'csv': 
            src = os.path.join(steps_all_dir, fn)
            dest = os.path.join(steps_dir, fn)
            processStepsFile(src, dest)

In [None]:
# UNCOMMENT TO RUN
processAllSteps('uw1')
processAllSteps('uw2')
#YSS NOTE use of detailed step data is potentially problematic because of missing data otherwise available in daily step data

In [None]:
# renames files from 'PID001_step.csv' -> 'PID001.csv'
def convertAllFilenamesToID(phase):
    if phase == 'uw1':
        uw = 'Fitbit UWEXPI/'
    elif phase == 'uw2':
        uw = 'Fitbit UWEXPII/'
    else:
        raise('Invalid phase argument')
    
    sleep_dir = os.path.join(uw,'daily_sleep_raw_csv/') #YSS sleep_raw_data --> daily_sleep_raw_csv
    steps_dir = os.path.join(uw,'daily_steps_raw_data/') #YSS steps_raw_data -> daily_steps_raw_data
    
    sleep_files = os.listdir(sleep_dir)
    steps_files = os.listdir(steps_dir)
    
    #YSS
    sleep_fns_unique = set([fn[:6] for fn in sleep_files])
    if len(sleep_fns_unique) != len(sleep_files):
        print('investigate split sleep data being overwritten')
    step_fns_unique = set([fn[:6] for fn in steps_files])
    if len(step_fns_unique) != len(steps_files):
        print('investigate split step data being overwritten')
    
    for f in sleep_files:
        src = os.path.join(sleep_dir,f)
        new_f = f[:6] +'.csv'
        dest = os.path.join(sleep_dir, new_f)
        os.rename(src,dest)
        
    for f in steps_files:
        src = os.path.join(steps_dir,f)
        new_f = f[:6] +'.csv'
        dest = os.path.join(steps_dir, new_f)
        os.rename(src,dest)

In [None]:
# UNCOMMENT TO RUN
convertAllFilenamesToID('uw1')
convertAllFilenamesToID('uw2')
#YSS PID136 data is split and the second one overwrited the first one in both sleep and step data

**End Of UW-Specific Work**

We have multiple datasets, and getPath allows us to grab the path of whatever dataset we're working with. Each dataset folder assumes a similar underlying directory structure:
* steps_raw_data
* sleep_raw_data
* sleep_steps_data
* sleep_episodes
* computed_features

The steps_raw_data is optional, and for the NetHealth dataset, sleep_steps_data is missing steps. We don't need steps to help us compute sleep, rather steps is useful in differentiating missing data from all-nighters.

In [2]:
def getPath(phase):
    if phase == 'uw1':
        path = 'Fitbit UWEXPI/'    
    elif phase == 'uw2':
        path = 'Fitbit UWEXPII/'
    elif phase == 'lac1':
        path = 'LifeAtCMU_Phase1/'
    elif phase == 'lac2':
        path = 'LifeAtCMU_Phase2/'
    elif phase == 'nh':
        path = 'NetHealth/'
    else:
        raise('Invalid phase argument')
    return path

In [None]:
def createSleepStepsDir():
    for d in ['uw1','uw2','lac1','lac2']:
        new_d = os.path.join(getPath(d),'sleep_steps_data/')
        if not os.path.exists(new_d): 
            os.makedirs(new_d)

In [None]:
# UNCOMMENT TO RUN
createSleepStepsDir()

The following combines all sleep and steps into a single combined dataframe for the given date range in the sleep_steps_data folder. This step is optional for these sleep analyses, and is done for NetHealth in the NetHealth cleaning code.

In [None]:
# combines all raw sleep and raw steps 
# input: "uw1", "uw2", "lac1", "lac2"
def saveAllCombined(phase):

    if phase == 'uw1':
        TIME_INDEX = pd.date_range(start='1/1/2018',
                                   end='6/13/2018',
                                   freq='min')
    elif phase == 'uw2': 
        TIME_INDEX = pd.date_range(start='4/1/2019',
                                   end='6/14/2019',
                                   freq='min')
    elif phase == 'lac1':
        TIME_INDEX = pd.date_range(start='1/16/2017',
                                   end='5/15/2017',
                                   freq='min')
    elif phase == 'lac2':
        TIME_INDEX = pd.date_range(start='1/16/2018',
                                   end='5/15/2018',
                                   freq='min')
    else:
        raise('Invalid phase argument')
        
    path = getPath(phase)
    
    
    
    sleep_dir = os.path.join(path,'daily_sleep_raw_csv/') #YSS sleep_raw_data --> daily_sleep_raw_data
    steps_dir = os.path.join(path,'daily_steps_raw_data/') #YSS steps_raw_data -> daily_steps_raw_data
    combined_dir = os.path.join(path,'sleep_steps_data/')

    sleep_files = os.listdir(sleep_dir)
    steps_files = os.listdir(steps_dir)
    both = list(set(sleep_files) & set(steps_files))
    both.sort() #YSS

    DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
    
    bad_ids = []
    for idx in tqdm(range(len(both))):
        
        #print('check if {} exists ...'.format(os.path.join(combined_dir, both[idx])), end=' ') #YSS

        if os.path.exists(os.path.join(combined_dir, both[idx])):
            #print('YES; continue to the next file') #YSS
            continue
        else:
            #print('NO; combine sleep and step files') #YSS
            try:
                df_sleep = pd.read_csv(os.path.join(sleep_dir, both[idx]))
                df_steps = pd.read_csv(os.path.join(steps_dir, both[idx]))

                #YSS - check if all rows have consistent second values
                secs = df_sleep['dateTime'].apply(lambda time: time[-2:]).unique()
                if len(secs) > 1:
                    print('inconsistent sleep second values for {}: {}'.format(both[idx], list(secs)))
                secs = df_steps['datetime'].apply(lambda time: time[-2:]).unique()
                if len(secs) > 1:
                    print('inconsistent step second values for {}: {}'.format(both[idx], list(secs)))
                # NOTE should there be inconsistentsies, there is a chance that replacing seconds with 0 results in 
                #      duplicate timestamps
                #print('finished 1st check')


                # Convert times to datetime objects
                df_sleep['dt_typed'] = df_sleep['dateTime'].apply(lambda time: datetime.strptime(time,DATE_FORMAT).replace(second=0))
                df_steps['dt_typed'] = df_steps['datetime'].apply(lambda time: datetime.strptime(time,DATE_FORMAT).replace(second=0))
                df_sleep = df_sleep.rename(columns={'value' : 'sleep_value'})

                #YSS - check if any rows have date&time values that do not appear in TIME_INDEX
                dt_index = df_sleep['dt_typed'].isin(TIME_INDEX)
                if (~dt_index).sum() > 0:
                    print('inconsistent sleep timestamps for {}: {}'.format(both[idx], df_sleep[~dt_index]['dt_typed']))
                dt_index = df_steps['dt_typed'].isin(TIME_INDEX)
                if (~dt_index).sum() > 0:
                    print('inconsistent step timestamps for {}: {}'.format(both[idx], df_steps[~dt_index]['dt_typed']))
                # NOTE if all timestamps exist in TIME_INDEX there should not be any duplicates
                #print('finished 2nd check')

                #YSS - check if there are duplicate timestamps
                duplicates = df_sleep.groupby(by=['dt_typed']).size()
                if duplicates[duplicates > 1].shape[0] > 0:
                    print('duplicates in sleep dataframe for {}: {}'.format(both[idx], duplicates[duplicates > 1]))
                duplicates = df_steps.groupby(by=['dt_typed']).size()
                if duplicates[duplicates > 1].shape[0] > 0:
                    print('duplicates in step dataframe for {}'.format(both[idx], duplicates[duplicates > 1]))
                # NOTE duplciates in combination can only exist if there are duplciates in the original timestamps
                #print('finished 3rd check')

                df = pd.DataFrame(index=TIME_INDEX)
                df = df.merge(df_sleep[['sleep_value','dt_typed']],how='left',left_index=True,right_on='dt_typed')
                df = df.merge(df_steps[['steps','dt_typed']],how='left',left_on='dt_typed',right_on='dt_typed')

                #YSS
                duplicates = df.groupby(by=['dt_typed']).size()
                if duplicates[duplicates > 1].shape[0] > 0:
                    print('duplicates in combined dataframe for {}'.format(both[idx]))
                    duplicates[duplicates > 1].to_csv('debug{}-combine_sleep_step.csv'.format(both[idx][:-4]))
                #print('finished 4th check')

                df = df.drop_duplicates(subset='dt_typed', keep='first') #YSS why should there be any duplicates?
                df = df.set_index('dt_typed')

                # lac1 only gives steps every 5 minutes, so we interpolate
                if phase == 'lac1':
                    df['steps'] = df['steps'].interpolate()

                df.index.names = ['time']
                df.sort_index(inplace=True)
                df.to_csv(os.path.join(combined_dir, both[idx]))
            except:
                print('exception happened in processing', sys.exc_info()) #YSS
                bad_ids.append(both[idx])
    print(phase, bad_ids)

In [None]:
# UNCOMMENT TO RUN - uw1/2 both have a few empty participants
# these will be ignored and their PIDs will be printed

saveAllCombined('uw1')
saveAllCombined('uw2')
#YSS no duplicates for UW-I or UW-II
saveAllCombined('lac1')
#YSS no duplicates for CMU-I
saveAllCombined('lac2')
#YSS it seems that data appears twice under '202.csv', '224.csv', '245.csv', '277.csv'

In [None]:
# calculates the number of minutes that separates two datetimes
def diff_min(dt1,dt2):
    
    DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
    
    td = datetime.strptime(dt1, DATE_FORMAT) - datetime.strptime(dt2, DATE_FORMAT)
    return int(td.total_seconds() / 60)

Once we have all the combined sleep-steps files, we then want to extract the sleep episodes. To do this, we have two parameters: MIN_CONSEC_NON_AWAKE and MAX_CONSEC_AWAKE.

MIN_CONSEC_NON_AWAKE is the number of consecutive minutes that a student has to be asleep/restless for an episode to be recognized, and MAX_CONSEC_AWAKE is the number of consecutive awake minutes that have to be recognized for the termination of an episode. Any timepoint where steps are positive is considered awake.


In [None]:
MIN_CONSEC_NON_AWAKE = 20
MAX_CONSEC_AWAKE = 5

In [None]:
def getSubjectSleepEpisodes(phase, 
                            filename,
                            min_consec_non_awake = MIN_CONSEC_NON_AWAKE,
                            max_consec_awake = MAX_CONSEC_AWAKE):
        
    path = getPath(phase)

    COMBINED_DIR = os.path.join(path, 'sleep_steps_data/')
    EPISODES_DIR = os.path.join(path, 'sleep_episodes/')
    
    # Read combined data
    filename = os.path.join(COMBINED_DIR, filename)
    print('episodes for', filename) #YSS
    combined_df = pd.read_csv(filename, index_col=0)
    
    #YSS
    duplicates = combined_df.groupby(combined_df.index).size()
    if duplicates[duplicates > 1].shape[0] > 0:
        print('duplicates in combined dataframe for {}'.format(filename))
        duplicates[duplicates > 1].to_csv('debug{}-episode_sleep_step.csv'.formt(filename[:-4]))
    
    # Define fields
    columns = ['start_time', 'end_time', 'length', 'time_asleep', 'time_restless', 'time_awake']
    
    # Make dictionary
    episode_dict = {} # start_time -> ['end_time', 'length', 'time_asleep', 'time_restless', 'time_awake']
    
    # tracks the last time point the subject was awake
    last_awake = combined_df.index[0]
    
    # used for tracking when in or not in and episode
    consec_non_awake = 0
    consec_awake = 0
    on_episode = False
    start_index = None
    combined_df = combined_df[~combined_df.index.duplicated(keep='first')] #YSS what is this about duplicated   
    
    # iterate through the subject's combined dataframe
    for index, row in combined_df.iterrows():
        sleep = row['sleep_value']

        # update counts
        if math.isnan(sleep) or sleep == 3 or sleep == 0: # 3 = awake, 0 = missing in lac1
            consec_awake += 1
            consec_non_awake = 0
            last_awake = index
        elif sleep == 1: # 1 = asleep
            consec_awake = 0
            consec_non_awake += 1
        elif sleep == 2: # 2 = restless
            consec_awake = 0
            consec_non_awake += 1
        else:
            raise Exception('sleep value is not NaN, 1, 2, or 3')

        # update episode status
        # starting new episode
        if not on_episode and consec_non_awake >= min_consec_non_awake:
            on_episode = True
            start_index = last_awake
            start_loc = combined_df.index.get_loc(index)
        # ending episode
        elif on_episode and consec_awake >= max_consec_awake:
            index_loc = combined_df.index.get_loc(index)
            end_index = combined_df.index[index_loc-max_consec_awake+1]
            length = diff_min(end_index,start_index)
            temp_df = combined_df.iloc[start_loc:index_loc]
            time_awake = len(temp_df[temp_df['sleep_value']==3])
            time_restless = len(temp_df[temp_df['sleep_value']==2])
            time_asleep = length - time_restless - time_awake
            episode_dict[start_index] = [end_index, length, time_asleep, time_restless, time_awake]
            start_index = index
            on_episode = False

    # Make dataframe from dictionary
    episode_df = pd.DataFrame.from_dict(episode_dict, orient='index', columns=columns[1:])
    episode_df.index.name = columns[0]
    
    return episode_df

In [None]:
#MIN_CONSEC_NON_AWAKE = 4
#MAX_CONSEC_AWAKE = 2
#combined_df.loc[combined_df.iloc[:9].index, 'sleep_value'] = pd.Series([1, 1, 3, 1, 1, 1, 3, 1, 1], index=combined_df.iloc[:9].index) # no episode
#combined_df.loc[combined_df.iloc[:9].index, 'sleep_value'] = pd.Series([3, 3, 1, 1, 1, 1, 3, 1, 1], index=combined_df.iloc[:9].index) # no episode
#combined_df.loc[combined_df.iloc[:9].index, 'sleep_value'] = pd.Series([1, 3, 1, 1, 1, 1, 3, 3, 1], index=combined_df.iloc[:9].index)
#combined_df.loc[combined_df.iloc[:9].index, 'sleep_value'] = pd.Series([3, 1, 1, 1, 1, 3, 1, 3, 3], index=combined_df.iloc[:9].index)
#YSS issues in episode calculations (all minor):
# - sensitive to awake oscillation at the start of an episode but not in the middle
# - neither start_time nor end_time are inclusive 
#   (the episode actually starts after start_time and ends before end_time)
# - actual length of time is smaller by one minute
# - cannot find episodes if there are not enough awake minutes at the end of the timeseries; 
#   can be fixed by padding MAX_CONSEC_AWAKE awake minutes at the end

In [None]:
def computeAllEpisodesForParams(phase,
                                min_consec_non_awake, 
                                max_consec_awake):

    path = getPath(phase)

    COMBINED_DIR = os.path.join(path, 'sleep_steps_data/')
    EPISODES_DIR = os.path.join(path, 'sleep_episodes/')    
        
    if not os.path.exists(EPISODES_DIR): 
        os.makedirs(EPISODES_DIR) 
    
    for src in tqdm(os.listdir(COMBINED_DIR)):

        dest = 'EPI_' + src.split('.')[0] + '.csv'

        dest = os.path.join(EPISODES_DIR,dest)

        if not os.path.exists(dest):    # remove to writeover existing files
            df = getSubjectSleepEpisodes(
                     phase,
                     src,
                     min_consec_non_awake,
                     max_consec_awake)
            df.to_csv(dest)
        

In [None]:
# UNCOMMENT TO RUN - took 3-5 hours on my personal computer 
# for phase in ['uw1', 'uw2', 'lac1', 'lac2', 'nh']:
#     computeAllEpisodesForParams(phase,
#                             MIN_CONSEC_NON_AWAKE,
#                             MAX_CONSEC_AWAKE)
computeAllEpisodesForParams('uw1', MIN_CONSEC_NON_AWAKE, MAX_CONSEC_AWAKE) #YSS
computeAllEpisodesForParams('uw2', MIN_CONSEC_NON_AWAKE, MAX_CONSEC_AWAKE) #YSS
computeAllEpisodesForParams('lac1', MIN_CONSEC_NON_AWAKE, MAX_CONSEC_AWAKE) #YSS
computeAllEpisodesForParams('lac2', MIN_CONSEC_NON_AWAKE, MAX_CONSEC_AWAKE) #YSS
#computeAllEpisodesForParams('nh', MIN_CONSEC_NON_AWAKE, MAX_CONSEC_AWAKE) #YSS

In [None]:
computeAllEpisodesForParams('uw2', MIN_CONSEC_NON_AWAKE, MAX_CONSEC_AWAKE)

In [None]:
computeAllEpisodesForParams('lac1', MIN_CONSEC_NON_AWAKE, MAX_CONSEC_AWAKE)

In [None]:
computeAllEpisodesForParams('lac2', MIN_CONSEC_NON_AWAKE, MAX_CONSEC_AWAKE)

Once we've extracted the sleep episodes (which are in the sleep_episodes/ folder under a subdirectory titled "MIN_CONSEC_NONAWAKE""MAX_CONSEC_AWAKE"), we then have to compute the sleep features of interest.

To do so, we need to convert the bedtime and waketime into minutes after a given zero (e.g. minutes after 6 pm) since time is modulo 24 hr. For bedtime, students generally go to bed after 6 pm, and for waketime, students generally wake up after 4 am (these numbers were not chosen formally). Thus, we set BED_ZERO = 18 (i.e. 6 pm), and WAKE_ZERO = 4 (i.e. 4 am), and convert bedtime and waketime to the number of minutes after these respective zeros.

Additionally, we identified the main sleep episode of Day n as the longest sleep episode which started between noon of Day n and noon of Day (n+1).

In [3]:
BED_ZERO = 18
WAKE_ZERO = 4

In [4]:
# returns the main episode date for given time
def getWindow(time):
    if time.hour < 12: #YSS use of 6 is more appropriate; see UW-I PID155 on 2018-06-04
        time = time - timedelta(days=1)
    return time.date()

In [5]:
def getEpisodeData(phase,
                   filename): # ex: 'PID001.csv'
    
    DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
    
    path = getPath(phase)
    
    EPISODES_DIR = os.path.join(path, 'sleep_episodes/')
    file = os.path.join(EPISODES_DIR, 'EPI_'+ filename.split('.')[0] + '.csv')
    
    df = pd.read_csv(file, header=0) #YSS consider using pd.read_csv(file, header=0, parse_dates=['start_time', 'end_time'])
    
    df['start_time'] = df['start_time'].apply(lambda x: datetime.strptime(x,DATE_FORMAT)) #YSS not needed if using parse_dates in read_csv
    df['end_time'] = df['end_time'].apply(lambda x: datetime.strptime(x,DATE_FORMAT)) #YSS not needed if using parse_dates in read_csv
    
    # define a main sleep episode as the longest sleep episode
    # in sleep window
    df['subject_id'] = filename.split('.')[0]
    df['main_episode_of'] = df['start_time'].apply(lambda x: getWindow(x))
    #df = df.sort_values(by='length', ascending=False) #YSS not needed
    #df = df.sort_values(by='main_episode_of') #YSS not needed
    
    #YSS
    df.index.name = 'inds'
    main_index = df.reset_index().groupby(by=['main_episode_of']).apply(lambda x: 
                                                                        x.sort_values(by=['length'], 
                                                                                      ascending=False).iloc[0]['inds'])
    df['main_sleep'] = False
    df.loc[df.index.isin(main_index), 'main_sleep'] = True
    
    return df

In [6]:
#YSS
getEpisodeData('uw1', 'PID001.csv')

Unnamed: 0_level_0,start_time,end_time,length,time_asleep,time_restless,time_awake,subject_id,main_episode_of,main_sleep
inds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,2018-01-05 04:20:00,2018-01-05 09:58:00,338,335,3,0,PID001,2018-01-04,True
1,2018-01-06 02:57:00,2018-01-06 08:34:00,337,319,16,2,PID001,2018-01-05,True
2,2018-01-07 03:16:00,2018-01-07 06:34:00,198,189,5,4,PID001,2018-01-06,True
3,2018-01-07 06:40:00,2018-01-07 08:48:00,128,126,2,0,PID001,2018-01-06,False
4,2018-01-08 00:38:00,2018-01-08 08:30:00,472,460,12,0,PID001,2018-01-07,True
...,...,...,...,...,...,...,...,...,...
169,2018-06-10 00:26:00,2018-06-10 01:08:00,42,35,3,4,PID001,2018-06-09,False
170,2018-06-10 01:13:00,2018-06-10 10:22:00,549,519,29,1,PID001,2018-06-09,True
171,2018-06-10 23:20:00,2018-06-11 07:48:00,508,486,12,10,PID001,2018-06-10,True
172,2018-06-11 08:22:00,2018-06-11 10:10:00,108,107,0,1,PID001,2018-06-10,False


In [7]:
def getMainEpisodeData(phase, filename):
    
    df = getEpisodeData(phase, filename)
    df = df.sort_values(by='length', ascending=False)
    df = df.drop_duplicates(subset='main_episode_of',keep='first')
    df = df.sort_values(by='main_episode_of') 
    
    return df

In [8]:
def timeToMin(time, zero_hour):
    
    hour = time.hour
    minute = time.minute
    
    if hour < zero_hour:
        hour += 24
        
    return (hour-zero_hour)*60 + minute

Here, we compute various sleep features of interest for a given time window.

MSSD refers to mean successive squared difference. This is a measure of variability that also takes into account the temporal nature of the data. For instance, the MSSD of [3,4,10] is ((4-3)^2 + (10-4)^2)/2 = 37/2

In [9]:
def getMainEpisodeMSSD(main_episode_df):
    
    main_episode_df = main_episode_df.sort_values(by='main_episode_of')
    
    main_episode_df['bedtime'] = main_episode_df['start_time'].apply(lambda x: timeToMin(x,BED_ZERO))
    main_episode_df['waketime'] = main_episode_df['end_time'].apply(lambda x: timeToMin(x,WAKE_ZERO))
    main_episode_df['midpoint_sleep'] = (main_episode_df['waketime'] + main_episode_df['bedtime']) / 2.0

    
    count = 0
    bt_total = []
    wt_total = []
    mp_total = []
    
    for i, idx1 in enumerate(main_episode_df.index): #YSS this does not handle data dicontinuity (e.g. missing days)
        row1 = main_episode_df.loc[idx1]
        
        if i < len(main_episode_df.index)-1:
            idx2 = main_episode_df.index[i+1]
            row2 = main_episode_df.loc[idx2]
        else:
            row2 = None
        
        if row2 is not None:
            count += 1
            bt_total.append((row2['bedtime']-row1['bedtime'])**2)
            wt_total.append((row2['waketime']-row1['waketime'])**2)
            mp_total.append((row2['midpoint_sleep']-row1['midpoint_sleep'])**2)
        
    
    if count == 0:
        return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
    
    bt_mssd = np.array(bt_total).mean() / (float(count) * 3600) #YR: why deviding by float(count) SP: there should not be
    wt_mssd = np.array(wt_total).mean() / (float(count) * 3600)
    mp_mssd = np.array(mp_total).mean() / (float(count) * 3600)
        
    bt_mssd_median = median(bt_total)
    wt_mssd_median = median(wt_total)
    mp_mssd_median = median(mp_total)
        
    return bt_mssd, wt_mssd, mp_mssd, bt_mssd_median, wt_mssd_median, mp_mssd_median

In [None]:
#YSS
def mssd(s:pd.Series)->float:
    """
    returns the mean of squared difference of consecutive elements of s.
    returns NaN if s is None, empty, or has a single element
    """
    if s is None:
        return np.nan
    if len(s) == 0:
        return np.nan
    delta_squared = (s - s.shift(-1))**2
    return delta_squared.mean()

In [None]:
#YSS
def getMainEpisodeMSSD(main_episode_df):
    main_episode_df = main_episode_df.sort_values(by='main_episode_of', ascending=True)

    main_episode_df['day'] = main_episode_df['main_episode_of'].apply(lambda x: x.timetuple().tm_yday) # day of year
    main_episode_df['diff'] = main_episode_df['day'].shift(-1) - main_episode_df['day'] # number of days to the next row
    main_episode_df['consec'] = False # am I the consecutive day of my previous row?
    main_episode_df.loc[main_episode_df['diff'] == 1, 'consec'] = True
    main_episode_df['consec'] = main_episode_df['consec'].shift(1).replace({None:True})
    main_episode_df['stretch'] = (~main_episode_df['consec']).cumsum() # episodes are streteches of consecutive days

    main_episode_df['bedtime'] = main_episode_df['start_time'].apply(lambda x: timeToMin(x,BED_ZERO))
    main_episode_df['waketime'] = main_episode_df['end_time'].apply(lambda x: timeToMin(x,WAKE_ZERO))
    main_episode_df['midpoint_sleep'] = (main_episode_df['waketime'] + main_episode_df['bedtime']) / 2.0

    bt_mssds = main_episode_df.groupby(by=['stretch']).apply(lambda x: mssd(x['bedtime']))
    wt_mssds = main_episode_df.groupby(by=['stretch']).apply(lambda x: mssd(x['waketime']))
    mp_mssds = main_episode_df.groupby(by=['stretch']).apply(lambda x: mssd(x['midpoint_sleep']))
    # TO-DO further test (1) consecutive day episode construction (single episode-DONE, one-day episode-DONE, every-other days)
    #                    (2) mssd calculation (None, empty, single-day episode-DONE)
    # outstanding questions: how to compute the total mssd based on mssd in each episode
    #                        what's the deal with median measures; a separate function (e.g. mssd_median) should be implemented

In [None]:
#YSS
file = '/Users/yasaman/UWEXP/cmu-sleep-gpa/Fitbit UWEXPI/sleep_episodes/EPI_PID155.csv'
df = pd.read_csv(file, header=0, parse_dates=['start_time', 'end_time'])

df['main_episode_of'] = df['start_time'].apply(lambda x: getWindow(x))
df.index.name = 'inds'
main_index = df.reset_index().groupby(by=['main_episode_of']).apply(lambda x: 
                                                                    x.sort_values(by=['length'], 
                                                                                  ascending=False).iloc[0]['inds'])
df['main_sleep'] = False
df.loc[df.index.isin(main_index), 'main_sleep'] = True

main_episode_df = df[df['main_sleep']]

For a given time window, computeSummaryStats computes all the sleep features for a single participant, while computeAllSummaryStats computes all the sleep features for **all** participants for the given time window.

In [10]:
# start_window and end_window are date objects
# extracts start to end inclusive
def computeSummaryStats(phase, 
                        filename, # ex: 'PID001.csv'
                        start_window, 
                        end_window):
    
        
        DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
        
        summary = dict()
        
        subject_id = filename.split('.')[0]
        
        summary['subject_id'] = subject_id
        
 
        all_df = getEpisodeData(phase, subject_id)
        main_df = getMainEpisodeData(phase, subject_id)

        start = pd.Timestamp(start_window)
        end = pd.Timestamp(end_window)
        main_df = main_df[main_df['main_episode_of'].between(start,end)]
        
        #all_df['start_time'] = all_df['start_time'].apply(lambda x: datetime.strptime(str(x),DATE_FORMAT)) #YSS this is already done in getEpisodeData
        #all_df['main_episode_of'] = all_df['start_time'].apply(lambda x: getWindow(x)) #YSS this s already done in getEpisodeData
        all_df = all_df[all_df['main_episode_of'].between(start,end)] 
        
        nap_df = all_df[~all_df.start_time.isin(main_df.start_time)]
        
        #YSS
        if all_df.shape[0] != (main_df.shape[0] + nap_df.shape[0]):
            print('investigate construction of nap_df in {} (inequality)'.format(filename))
        if all_df[~all_df['main_sleep']].shape[0] != nap_df.shape[0]:
            print('investigate construction of nap_df in {} (alternative construction)'.format(filename))
        
        summary['num_naps'] = len(nap_df)
        summary['avg_nap_length'] = nap_df['length'].mean()
        
        #YSS
        if all_df.shape[0] > 0:
            if all_df[~all_df['main_sleep']]['length'].mean() != summary['avg_nap_length']:
                print('investigate construction of nap_df {} (alternative construction - mean)'.format(filename))
            if all_df[~all_df['main_sleep']]['length'].sum() != nap_df['length'].sum():
                print('investigate construction of nap_df {} (alternative construction - sum)'.format(filename))
            if abs((all_df.groupby(all_df.set_index('start_time').index.dayofyear)['length'].sum()).mean() - 
                all_df['length'].sum() / len(all_df['main_episode_of'].unique())) > 0.01:
                print('investigate construction of nap_df {} (alternative construction - sum)'.format(filename))
        
        try:
            summary['frac_napped_of_total_sleep'] = nap_df['length'].sum() / all_df['length'].sum()
            summary['avg_24_hr_sleep'] = all_df['length'].sum() / len(all_df['main_episode_of'].unique()) 
            summary['frac_sleep_episodes_as_naps'] = len(nap_df) / len(all_df)
            
        except: # all_df is empty
            #YSS suggest filling in all the other feilds and returning summary
            summary['frac_napped_of_total_sleep'] = np.nan
            summary['avg_24_hr_sleep'] = np.nan
            summary['frac_sleep_episodes_as_naps'] = np.nan
        
        summary['frac_nights_with_data'] = len(main_df) / ((end_window-start_window).days + 1)
 

        if len(main_df) == 0:
            #YSS this is not consistent with try-catch above
            return None

        
        main_df['bedtime'] = main_df['start_time'].apply(lambda x: timeToMin(x,BED_ZERO))
        main_df['waketime'] = main_df['end_time'].apply(lambda x: timeToMin(x,WAKE_ZERO))
        main_df['midpoint_sleep'] = (main_df['waketime'] + main_df['bedtime']) / 2.0 #YSS waketime and bedtime are calculated wrt different reference points; what does it mean to find their average? what reference point to use to interpret this average?
        #YSS alterantive approach find mid_time based on 'start_time' and length of the main episode then similarly remap it wrt a reference (e.g. BED_ZERO)

            
        main_df['proportion_awake'] = main_df['time_awake'] / main_df['length']
        main_df['proportion_restless'] = main_df['time_restless'] / main_df['length']
        main_df['weekday'] = main_df['main_episode_of'].apply(lambda x: x.weekday())
        
        # 4 = friday night, 5 = saturday night
        main_df['is_weekend'] = main_df['weekday'].apply(lambda x: x in [4,5])
        
        # bedtime measures
        summary['bedtime'] = main_df['bedtime'].mean()
        summary['bedtime_std'] = main_df['bedtime'].std()
    
        # waketime measures
        summary['waketime'] = main_df['waketime'].mean()
        
        # midpoint sleep measures
        summary['midpoint_sleep'] = main_df['midpoint_sleep'].mean()

        # weekday measures
        try:  
            summary['bedtime_weekday'] = main_df.groupby('is_weekend')['bedtime'].mean()[False]        
            summary['waketime_weekday'] = main_df.groupby('is_weekend')['waketime'].mean()[False]
            summary['midpoint_sleep_weekday'] = main_df.groupby('is_weekend')['midpoint_sleep'].mean()[False]            
        except KeyError: #YSS if an empty summary (all fields are np.nan) is returned in the first try-catch this won't be needed
            summary['bedtime_weekday'] = np.nan     
            summary['waketime_weekday'] = np.nan
            summary['midpoint_sleep_weekday'] = np.nan                
        
        # weekend measures
        try:
            summary['bedtime_weekend'] = main_df.groupby('is_weekend')['bedtime'].mean()[True]  
            summary['waketime_weekend'] = main_df.groupby('is_weekend')['waketime'].mean()[True]      
            summary['midpoint_sleep_weekend'] = main_df.groupby('is_weekend')['midpoint_sleep'].mean()[True]      
        except KeyError: #YSS if an empty summary (all fields are np.nan) is returned in the first try-catch this won't be needed
            summary['bedtime_weekend'] = np.nan
            summary['waketime_weekend'] = np.nan
            summary['midpoint_sleep_weekend'] = np.nan

            
        try:
            summary['social_jetlag'] = summary['bedtime_weekend'] - summary['bedtime_weekday'] #YSS this is incorrect; midpoint sleep is used for calculating social jetlag
        except:
            summary['social_jetlag'] = np.nan
            
        summary['time_in_bed'] = main_df['length'].mean()
                
        bt_mssd, wt_mssd, mp_mssd, bt_mssd_median, wt_mssd_median, mp_mssd_median = getMainEpisodeMSSD(main_df)
        
        summary['bedtime_mssd'] = bt_mssd
        summary['waketime_mssd'] = wt_mssd
        summary['midpoint_sleep_mssd'] = mp_mssd
        
        summary['bedtime_mssd_median'] = bt_mssd_median
        summary['waketime_mssd_median'] = wt_mssd_median
        summary['midpoint_sleep_mssd_median'] = mp_mssd_median
        
        summary['WASO_fraction'] = main_df['proportion_awake'].mean() #YSS what is WASO?
        summary['restless_fraction'] = main_df['proportion_restless'].mean()
        summary['TST'] = main_df['time_asleep'].mean()
        summary['TST_std'] = main_df['time_asleep'].std()
        
        
        return summary

In [11]:
#YSS
computeSummaryStats('uw1', 'PID001.csv', date(2018,4,1), date(2018,4,28))

investigate construction of nap_df PID001.csv (alternative construction - sum)


{'subject_id': 'PID001',
 'num_naps': 9,
 'avg_nap_length': 122.11111111111111,
 'frac_napped_of_total_sleep': 0.08573880480574192,
 'avg_24_hr_sleep': 512.72,
 'frac_sleep_episodes_as_naps': 0.2647058823529412,
 'frac_nights_with_data': 0.8928571428571429,
 'bedtime': 491.4,
 'bedtime_std': 128.23091150992673,
 'waketime': 360.16,
 'midpoint_sleep': 425.78,
 'bedtime_weekday': 453.8888888888889,
 'waketime_weekday': 352.05555555555554,
 'midpoint_sleep_weekday': 402.97222222222223,
 'bedtime_weekend': 587.8571428571429,
 'waketime_weekend': 381.0,
 'midpoint_sleep_weekend': 484.42857142857144,
 'social_jetlag': 133.96825396825398,
 'time_in_bed': 468.76,
 'bedtime_mssd': 0.3211829668209876,
 'waketime_mssd': 0.1348070987654321,
 'midpoint_sleep_mssd': 0.14321168499228396,
 'bedtime_mssd_median': 12902.5,
 'waketime_mssd_median': 3662.5,
 'midpoint_sleep_mssd_median': 4949.125,
 'WASO_fraction': 0.004501156732832872,
 'restless_fraction': 0.044838352985259074,
 'TST': 445.24,
 'TST_std

In [12]:
# start_window and end_window are date objects
def computeAllSummaryStats(phase,
                           start_window, 
                           end_window,
                           overwrite=False):
    
    path = getPath(phase)

    FEATURES_FOLDER = os.path.join(path, 'computed_features/')
    EPISODES_DIR = os.path.join(path, 'sleep_episodes/')
    feature_file = str(start_window) + '_' + str(end_window) + '.csv'       
    feature_file = os.path.join(FEATURES_FOLDER,feature_file)    
   
    if os.path.exists(feature_file) and not overwrite:
        return 'Already written'
        
    ids = os.listdir(os.path.join(path, 'sleep_steps_data/'))

    
    with open(feature_file, 'w') as f:
    
        header = ['subject_id',
                  'num_naps',
                  'avg_nap_length',
                  'frac_napped_of_total_sleep',
                  'avg_24_hr_sleep',
                  'frac_sleep_episodes_as_naps',
                  'frac_nights_with_data',
                  'bedtime',
                  'waketime',
                  'midpoint_sleep',
                  'time_in_bed',
                  'bedtime_mssd',
                  'bedtime_mssd_median',
                  'bedtime_std',
                  'bedtime_weekend',
                  'bedtime_weekday',
                  'waketime_mssd',
                  'waketime_mssd_median',
                  'waketime_weekend',
                  'waketime_weekday',
                  'midpoint_sleep_mssd',
                  'midpoint_sleep_mssd_median',
                  'midpoint_sleep_weekend',
                  'midpoint_sleep_weekday',
                  'WASO_fraction',
                  'restless_fraction',
                  'social_jetlag',
                  'TST',
                  'TST_std']
        
        w = csv.DictWriter(f, header)
        w.writeheader()
        
        for filename in ids:
    
            summary = computeSummaryStats(phase, 
                                          filename,
                                          start_window, 
                                          end_window)
                
            if not summary:
                summary = dict()
                for val in header:
                    summary[val] = np.nan
                summary['subject_id'] = filename.split('.')[0]
                summary['frac_nights_with_data'] = 0
                
            w.writerow(summary)

**Regression Analyses**

In [13]:
# UNCOMMENT TO RUN
for d in ['uw1','uw2','lac1','lac2']:#YSS['uw1','uw2','lac1','lac2', 'nh']:
    new_d = os.path.join(getPath(d),'computed_features/')
    if not os.path.exists(new_d): 
        os.makedirs(new_d)

In [20]:
REGRESSION_PERIODS = [('uw1',date(2018,4,1),date(2018,4,28)),
                      ('uw2',date(2019,4,7),date(2019,5,4)),
                      ('lac1',date(2017,2,7),date(2017,3,7)),
                      ('lac2',date(2018,2,7),date(2018,3,7)),
                      #YSS('nh', date(2016,2,4), date(2016,3,4))
                     ]

PREDICTORS = ['bedtime_mssd',
              'TST',
              'midpoint_sleep']

In [16]:
#YSS
for phase, start, end in REGRESSION_PERIODS:
    print('calculating features for...', phase)
    computeAllSummaryStats(phase, start, end, overwrite=True) # ADD overwrite=True
    print('complete!')

calculating features for... uw1
investigate construction of nap_df PID076.csv (alternative construction - sum)
investigate construction of nap_df PID062.csv (alternative construction - sum)
investigate construction of nap_df PID089.csv (alternative construction - sum)
investigate construction of nap_df PID117.csv (alternative construction - sum)
investigate construction of nap_df PID103.csv (alternative construction - sum)
investigate construction of nap_df PID088.csv (alternative construction - sum)


  summary['frac_napped_of_total_sleep'] = nap_df['length'].sum() / all_df['length'].sum()
  summary['avg_24_hr_sleep'] = all_df['length'].sum() / len(all_df['main_episode_of'].unique())


investigate construction of nap_df PID061.csv (alternative construction - sum)
investigate construction of nap_df PID075.csv (alternative construction - sum)
investigate construction of nap_df PID115.csv (alternative construction - mean)
investigate construction of nap_df PID115.csv (alternative construction - sum)
investigate construction of nap_df PID128.csv (alternative construction - mean)
investigate construction of nap_df PID048.csv (alternative construction - sum)
investigate construction of nap_df PID060.csv (alternative construction - sum)
investigate construction of nap_df PID058.csv (alternative construction - sum)
investigate construction of nap_df PID064.csv (alternative construction - sum)
investigate construction of nap_df PID138.csv (alternative construction - mean)
investigate construction of nap_df PID138.csv (alternative construction - sum)
investigate construction of nap_df PID104.csv (alternative construction - sum)
investigate construction of nap_df PID111.csv (al

investigate construction of nap_df PID068.csv (alternative construction - sum)
investigate construction of nap_df PID097.csv (alternative construction - sum)
investigate construction of nap_df PID120.csv (alternative construction - mean)
investigate construction of nap_df PID109.csv (alternative construction - sum)
investigate construction of nap_df PID082.csv (alternative construction - sum)
investigate construction of nap_df PID055.csv (alternative construction - sum)
investigate construction of nap_df PID045.csv (alternative construction - sum)
investigate construction of nap_df PID092.csv (alternative construction - sum)
investigate construction of nap_df PID125.csv (alternative construction - sum)
investigate construction of nap_df PID130.csv (alternative construction - sum)
investigate construction of nap_df PID118.csv (alternative construction - sum)
investigate construction of nap_df PID093.csv (alternative construction - sum)
investigate construction of nap_df PID087.csv (alte

investigate construction of nap_df PID415.csv (alternative construction - sum)
investigate construction of nap_df PID373.csv (alternative construction - sum)
investigate construction of nap_df PID367.csv (alternative construction - sum)
investigate construction of nap_df PID429.csv (alternative construction - sum)
investigate construction of nap_df PID561.csv (alternative construction - sum)
investigate construction of nap_df PID549.csv (alternative construction - sum)
investigate construction of nap_df PID424.csv (alternative construction - sum)
investigate construction of nap_df PID342.csv (alternative construction - sum)
investigate construction of nap_df PID356.csv (alternative construction - sum)
investigate construction of nap_df PID430.csv (alternative construction - sum)
investigate construction of nap_df PID395.csv (alternative construction - sum)
investigate construction of nap_df PID394.csv (alternative construction - sum)
investigate construction of nap_df PID380.csv (alter

investigate construction of nap_df PID319.csv (alternative construction - sum)
investigate construction of nap_df PID523.csv (alternative construction - sum)
investigate construction of nap_df PID537.csv (alternative construction - sum)
complete!
calculating features for... lac1
investigate construction of nap_df 223.csv (alternative construction - sum)
investigate construction of nap_df 196.csv (alternative construction - sum)
investigate construction of nap_df 357.csv (alternative construction - sum)
investigate construction of nap_df 343.csv (alternative construction - sum)
investigate construction of nap_df 356.csv (alternative construction - mean)
investigate construction of nap_df 197.csv (alternative construction - sum)
investigate construction of nap_df 168.csv (alternative construction - mean)
investigate construction of nap_df 236.csv (alternative construction - sum)
investigate construction of nap_df 222.csv (alternative construction - mean)
investigate construction of nap_d

investigate construction of nap_df 349.csv (alternative construction - mean)
investigate construction of nap_df 189.csv (alternative construction - sum)
investigate construction of nap_df 176.csv (alternative construction - sum)
investigate construction of nap_df 200.csv (alternative construction - sum)
investigate construction of nap_df 214.csv (alternative construction - sum)
investigate construction of nap_df 199.csv (alternative construction - mean)
investigate construction of nap_df 198.csv (alternative construction - sum)
investigate construction of nap_df 173.csv (alternative construction - mean)
investigate construction of nap_df 167.csv (alternative construction - sum)
investigate construction of nap_df 211.csv (alternative construction - sum)
investigate construction of nap_df 171.csv (alternative construction - mean)
investigate construction of nap_df 170.csv (alternative construction - sum)
complete!
calculating features for... lac2
investigate construction of nap_df 551.cs

investigate construction of nap_df 662.csv (alternative construction - sum)
investigate construction of nap_df 266.csv (alternative construction - mean)
investigate construction of nap_df 266.csv (alternative construction - sum)
investigate construction of nap_df 264.csv (alternative construction - sum)
investigate construction of nap_df 338.csv (alternative construction - sum)
investigate construction of nap_df 476.csv (alternative construction - sum)
investigate construction of nap_df 304.csv (alternative construction - sum)
investigate construction of nap_df 305.csv (alternative construction - sum)
investigate construction of nap_df 477.csv (alternative construction - mean)
investigate construction of nap_df 477.csv (alternative construction - sum)
investigate construction of nap_df 271.csv (alternative construction - sum)
investigate construction of nap_df 202.csv (alternative construction - sum)
investigate construction of nap_df 559.csv (alternative construction - sum)
investigat

FileNotFoundError: [Errno 2] No such file or directory: 'NetHealth/sleep_steps_data/'

In [None]:
def computeAllFeatureWindows():
    for period in REGRESSION_PERIODS:

        phase = period[0]
        start_window = period[1]
        end_window = period[2]

        computeAllSummaryStats(phase, start_window, end_window, overwrite=True)    

In [17]:
def getSummaryStats(phase, start, end):

    start_window = start
    end_window = end
    
    path = getPath(phase)
            
    # get entire semester participant data   
    filename = str(start_window) + '_' + str(end_window) + '.csv'
    filename = os.path.join(path, 'computed_features/', filename)
    summary_stats_df = pd.read_csv(filename)
    summary_stats_df.set_index('subject_id', inplace=True)
    
    return summary_stats_df[summary_stats_df['frac_nights_with_data']>=.2]

In [18]:
# used to extract first-year student label from EMA for lac2
def getLAC2Freshmen():
    phase = 'lac2'
    path = os.path.join('GPA_data/raw_gpa/',
                        phase + '_pre_post_ema.csv')
    df = pd.read_csv(path, index_col='ID', low_memory=False)
    return list(df[df['IRA_YearOfStudy']==1].index)

In [19]:
def getGPAData(phase):
    path = os.path.join('GPA_data/cleaned_gpa/', 
                        phase + '_freshmen_gpa.csv')
    return pd.read_csv(path, index_col='subject_id')

In [21]:
# all predictors of interest, GPA features, no nan rows,
# and participants with >=20% fraction of nights data 
def getRegDF(phase, start, end, predictors, control=None, thresh=0.2):
        
        # compute summary statistics
        #computeAllSummaryStats(phase, start, end, overwrite=True) # ADD overwrite=True #YSS
        stats_df = getSummaryStats(phase, start, end)
        
        # filter by freshmen
        if phase == 'lac2':
            stats_df = stats_df[stats_df.index.isin(getLAC2Freshmen())]

        # filter by completeness threshold
        stats_df = stats_df[stats_df['frac_nights_with_data'] >= thresh] #YSS how is this different from what is alreay done in getSummaryStats?

        # get GPA data
        gpa_df = getGPAData(phase)
        
        # combine summary statistics with GPA data
        combined_df = stats_df.merge(gpa_df, on='subject_id', how='outer')

        # choose only columns of variables of interest
        columns = predictors + ['cum_gpa', 'term_gpa', 'frac_nights_with_data']
                
        if control:
            columns += [control]        
        
        combined_df = combined_df[columns].dropna()
        
        return combined_df[columns]

In [22]:
def saveRegressionDF():

    dfs = []
    for period in REGRESSION_PERIODS:
        df = getRegDF(period[0], period[1], period[2], PREDICTORS, thresh=0.2)
        df['cohort'] = period[0]
        dfs.append(df)
    all_reg_dfs = pd.concat(dfs)
    all_reg_dfs.to_csv('regression_df.csv')


In [23]:
# UNCOMMENT TO RUN
saveRegressionDF()