The NetHealth dataset was provided in a very different format from the Fitbit and UWEXP datasets. Specifically, the file Sleep contains all the raw sleep data, but its rows carry neither the date nor the subject. Both can be recovered by following the sleepMetaID column, which points to idSleepMeta in sleep_meta.csv, and then following StudyID in sleep_meta.csv to the idStudy column in StudyEgo.csv. StudyEgo.csv contains the information about both the date and subject. The relationships are summarized below:

* sleepMetaID in Sleep points to idSleepMeta in sleep_meta.csv
* StudyID in sleep_meta.csv points to idStudy in StudyEgo.csv

The NetHealth_cleaning directory is structured as follows:
* Sleep_split_by_sleepMetaID/ is the raw Sleep file split by the unique values in the sleepMetaID column. This folder was created by placing the Sleep file into the folder and running the awk command in the following cell in the folder's working directory.
* StudyEgo_split_by_egoid/ is the StudyEgo.csv file split by the unique values in the EgoID column. This folder was created by placing the StudyEgo.csv file into the folder and running the awk command in the following cell in the folder's working directory.
* StudyEgo_split_by_egoid_grades_only/ is a subset of StudyEgo_split_by_egoid/ which only contains the participants who agreed to share institutional grade data (i.e. those participants that are listed in the CourseGrades(3-6-20).csv file)

Sleep -> Sleep_split_by_sleepMetaID/
```
foo@bar:~/Sleep_split_by_sleepMetaID$ awk -F, 'NR==1 {h=$0; next} {f=$2".csv"} !($2 in p) {p[$2]; print h > f} {print >> f;close(f);}' Sleep
```

StudyEgo.csv -> StudyEgo_split_by_egoid/
```
foo@bar:~/StudyEgo_split_by_egoid$ awk -F, 'NR==1 {h=$0; next} {f=$2".csv"} !($2 in p) {p[$2]; print h > f} {print >> f}' StudyEgo.csv
```

In [4]:
import pandas as pd
import numpy as np
import os
from shutil import copy
from tqdm import tqdm
import csv
from datetime import datetime, timedelta, time, date


In [5]:
# Folder holding StudyEgo.csv split into one CSV per EgoID
EGOID_DIR = 'StudyEgo_split_by_egoid/'
# Subset of EGOID_DIR: only the participants whose egoid appears in GRADES
EGOID_GRADES_DIR = 'StudyEgo_split_by_egoid_grades_only/'
# Folder holding the raw Sleep file split into one CSV per sleepMetaID
SPLIT_BY_METAID_DIR = 'Sleep_split_by_sleepMetaID/'
# Output folder of saveAllSleepRawFiles(): one timestamped sleep CSV per egoid
SLEEP_RAW_DIR = 'sleep_raw_data/'
# Output folder of saveAllCombined(): the per-egoid files placed on a shared minute index
COMBINED_DIR = '../sleep_steps_data/'

# Institutional grade data; this notebook only reads its 'egoid' column
GRADES = pd.read_csv('CourseGrades(3-6-20).csv', low_memory=False)
# CSV whose first two columns are read as idSleepMeta and idStudy (StudyID)
SLEEP_META_PATH = 'sleep_meta.csv'
# sleepMetaID_paths = set([int(x.split('.')[0]) for x in os.listdir(split_by_metaid_dir)])

Here, we create the folder StudyEgo_split_by_egoid_grades_only/ as a subset of StudyEgo_split_by_egoid/ by copying over the participants whose ID is available in CourseGrades(3-6-20).csv

In [6]:
def copyAllStudentsWithGradesData():
    """Copy the per-egoid StudyEgo files of participants with grade data.

    Every file in EGOID_DIR is named ``<egoid>.csv``. A file is copied into
    EGOID_GRADES_DIR when its egoid occurs in the 'egoid' column of GRADES.

    Raises:
        ValueError: if a file name in EGOID_DIR does not start with an
            integer egoid.
    """
    # A set gives O(1) membership tests inside the loop.
    egoids_with_grades = set(GRADES['egoid'].unique())

    # shutil.copy needs an existing destination folder.
    os.makedirs(EGOID_GRADES_DIR, exist_ok=True)

    for file in tqdm(os.listdir(EGOID_DIR)):
        file_egoid = int(file.split('.')[0])
        if file_egoid in egoids_with_grades:
            # The source folder is the module constant EGOID_DIR; the
            # previous lowercase name was undefined and raised NameError.
            copy(os.path.join(EGOID_DIR, file), EGOID_GRADES_DIR)

In [7]:
# copyAllStudentsWithGradesData()

The functions below build the lookup tables (idSleepMeta to idStudy, and idStudy to date) that let us map the date and egoid back onto the original raw sleep data.

In [8]:
def get_egoid_list():
    """Return the egoids (as ints) of all participants with grade data.

    The egoid is the part of each file name in EGOID_GRADES_DIR that
    precedes the first '.'.
    """
    egoid_list = []
    for filename in os.listdir(EGOID_GRADES_DIR):
        stem, _, _ = filename.partition('.')
        egoid_list.append(int(stem))
    return egoid_list

In [9]:
def get_map_idSleepMeta_to_idStudy():
    """Return a dict mapping each idSleepMeta to its idStudy.

    Reads SLEEP_META_PATH, skipping the header row; column 0 is taken as
    idSleepMeta and column 1 as idStudy, both converted to int.
    """
    lookup = {}
    with open(SLEEP_META_PATH, 'r') as meta_file:
        records = csv.reader(meta_file)
        next(records)  # discard the header row
        for record in records:
            study_id = int(record[1])
            lookup[int(record[0])] = study_id
    return lookup

In [10]:
def get_idSleepMeta_list(idStudy_list):
    """Return the idSleepMeta values whose idStudy is in ``idStudy_list``.

    Rows of SLEEP_META_PATH are scanned in file order (header skipped);
    column 1 is compared against the requested idStudy values and column 0
    is collected for every match.
    """
    wanted = set(idStudy_list)
    with open(SLEEP_META_PATH, 'r') as meta_file:
        records = csv.reader(meta_file)
        next(records)  # discard the header row
        return [int(record[0]) for record in records
                if int(record[1]) in wanted]

In [11]:
def get_map_idStudy_to_dataDate(egoid):
    """Return a dict mapping each idStudy of ``egoid`` to its date string.

    Reads ``<egoid>.csv`` from EGOID_GRADES_DIR, skipping the header row;
    column 0 is the idStudy (converted to int) and column 2 is stored
    unchanged as the date.
    """
    path = os.path.join(EGOID_GRADES_DIR, str(egoid) + '.csv')
    dates_by_study = {}
    with open(path, 'r') as ego_file:
        records = csv.reader(ego_file)
        next(records)  # discard the header row
        for record in records:
            data_date = record[2]
            dates_by_study[int(record[0])] = data_date
    return dates_by_study

In [12]:
def get_idStudy_list(egoid):
    """Return every idStudy value recorded for ``egoid`` as a list."""
    # Iterating a dict yields its keys, i.e. the idStudy values.
    return list(get_map_idStudy_to_dataDate(egoid))

The updateDataFrame function below takes a dataframe read from the Sleep_split_by_sleepMetaID/ folder, the corresponding egoid, a map from idStudy to date, and a map from idSleepMeta to idStudy. It returns a dataframe holding the full timestamp and the corresponding sleep value for every minute in that dataframe; these dataframes are later combined into the participant's file in sleep_raw_data/.

In [13]:
def updateDataFrame(df, egoid, study_to_date, meta_to_study):
    """Build the timestamped sleep rows for one Sleep_split_by_sleepMetaID/ file.

    Args:
        df: dataframe with the columns 'sleepMetaID', 'time' and 'state'.
            It is left unmodified.
        egoid: unused; kept so existing callers keep working.
        study_to_date: dict mapping idStudy to a date string.
        meta_to_study: dict mapping idSleepMeta to idStudy.

    Returns:
        A new dataframe with two columns: 'time' (``"<date> <time>"``) and
        'sleep_value' (the integer that follows the first space in 'state').

    Raises:
        KeyError: if a sleepMetaID or its idStudy is missing from the maps.
    """
    # Resolve each row's date through sleepMetaID -> idStudy -> date.
    dates = df['sleepMetaID'].apply(
        lambda meta_id: study_to_date[meta_to_study[meta_id]])

    # Build the result in a fresh frame so the caller's dataframe does not
    # gain helper columns as a side effect.
    return pd.DataFrame({
        'time': dates + ' ' + df['time'],
        'sleep_value': df['state'].apply(
            lambda state: int(state.split(' ')[1])),
    })

Now we take all the files split by egoid with grades (StudyEgo_split_by_egoid_grades_only/) and convert them to their corresponding sleep raw data files in sleep_raw_data/

In [14]:
def getRawSleepFile(egoid):
    """Assemble the raw sleep dataframe for one participant.

    Looks up every idSleepMeta that belongs to ``egoid``, loads the matching
    file from SPLIT_BY_METAID_DIR, converts it with updateDataFrame and
    concatenates the results.

    Args:
        egoid: participant id; ``<egoid>.csv`` must exist in EGOID_GRADES_DIR.

    Returns:
        A dataframe with the columns 'time' and 'sleep_value'. It is empty
        when no sleep episode could be loaded for the participant.
    """
    study_to_date = get_map_idStudy_to_dataDate(egoid)
    meta_to_study = get_map_idSleepMeta_to_idStudy()

    # The keys of study_to_date are exactly the participant's idStudy values,
    # so the StudyEgo file does not need to be read a second time.
    idSleepMeta_list = get_idSleepMeta_list(list(study_to_date))

    df_list = []
    for sleepMetaID in idSleepMeta_list:
        sleep_episode_path = os.path.join(SPLIT_BY_METAID_DIR,
                                          str(sleepMetaID) + '.csv')
        try:
            sleep_episode_df = pd.read_csv(sleep_episode_path)
            sleep_df = updateDataFrame(sleep_episode_df, egoid,
                                       study_to_date, meta_to_study)
        except Exception as exc:
            # Best effort: report the episode and why it failed, then go on.
            # Catching Exception (not a bare except) lets Ctrl-C through.
            print('ERROR:', sleep_episode_path, repr(exc))
        else:
            df_list.append(sleep_df)

    if not df_list:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(columns=['time', 'sleep_value'])
    return pd.concat(df_list, axis=0)

In [15]:
def saveAllSleepRawFiles():
    """Write one raw sleep CSV per participant with grade data.

    For every egoid returned by get_egoid_list(), the dataframe produced by
    getRawSleepFile() is saved as ``<egoid>.csv`` in SLEEP_RAW_DIR, without
    the index column.
    """
    for egoid in tqdm(get_egoid_list()):
        raw_sleep = getRawSleepFile(egoid)
        out_path = os.path.join(SLEEP_RAW_DIR, str(egoid) + '.csv')
        raw_sleep.to_csv(out_path, index=False)

In [2]:
# saveAllSleepRawFiles()

We do not use steps data from the NetHealth dataset in our raw data files, but to make the folder names consistent with those from Life@CMU and UWEXP, we call the folder which contains the minute-by-minute sleep/wake data "sleep_steps_data/".

Below, we position each of the sleep_raw_data/ files onto a minute-by-minute index for the first Spring semester of the study (when all the participating students are first-years), and save these results to the combined sleep_steps_data/ folder.

In [22]:
def saveAllCombined():
    """Put every raw sleep file on the same minute-by-minute index.

    NetHealth Jan 12 - May 6, 2016 (Spring Break = Mar 5-13).

    Each file in SLEEP_RAW_DIR is left-merged onto the shared minute index
    and written under the same name to COMBINED_DIR. Files that already
    exist in COMBINED_DIR are skipped; a file that fails is reported and
    the loop continues.
    """
    # NOTE(review): with freq='min' the range stops at 2016-05-06 00:00, so
    # the rest of May 6 is not part of the index - confirm this is intended.
    TIME_INDEX = pd.date_range(start='1/12/2016', end='5/6/2016', freq='min')
    DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

    for sleep_file in tqdm(os.listdir(SLEEP_RAW_DIR)):
        # NOTE(review): results go to the module-level COMBINED_DIR
        # ('../sleep_steps_data/'); confirm that is the intended folder.
        out_path = os.path.join(COMBINED_DIR, sleep_file)
        if os.path.isfile(out_path):
            continue

        try:
            df_sleep = pd.read_csv(os.path.join(SLEEP_RAW_DIR, sleep_file))
            # Drop the seconds so each sample lands on a whole minute.
            df_sleep['time'] = df_sleep['time'].apply(
                lambda stamp: datetime.strptime(stamp, DATE_FORMAT).replace(second=0))

            df = pd.DataFrame(index=TIME_INDEX)
            df = df.merge(df_sleep[['sleep_value', 'time']], how='left',
                          left_index=True, right_on='time')
            df = df.set_index('time')
            df.sort_index(inplace=True)
            df.to_csv(out_path)
        except Exception as exc:
            # Best effort: name the failing file and the reason, then go on.
            # Catching Exception (not a bare except) lets Ctrl-C through.
            print(sleep_file, repr(exc))

In [23]:
# saveAllCombined()