### Health Tracker 2.0

In [1]:
import pandas as pd
import xmltodict
import calendar
from datetime import *

from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive

import re
import os.path
import zipfile

#### Download and extract data

In [2]:
# Authenticate into Google Drive
# LocalWebserverAuth runs an interactive OAuth flow: it opens a browser window and
# waits for the redirect on localhost, so this cell needs a human at the keyboard.

gauth = GoogleAuth()
gauth.LocalWebserverAuth()

# `drive` is the authenticated client used by the listing/download cells below.
drive = GoogleDrive(gauth)

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=414641206861-31vsg5oe1qgiisil1vam0pk61mlkqjsl.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


In [3]:
# Download the most recent Apple Health export file
#
# Lists every non-trashed file in the Drive folder, prints the ones whose title
# looks like an export archive, and leaves the id of the newest one (by
# createdDate) in `selection_id` for the download cell.

# use the folder id where health data is stored
file_list = drive.ListFile({'q': "'1TWmwU17n7cgaPUGX8xmiqM3VoCF6A04v' in parents and trashed=false"}).GetList()

# Raw string so `\d` is a regex digit class instead of an invalid string escape,
# and the dot is escaped so it only matches a literal "." before "zip".
export_name_pattern = re.compile(r"^export-*\d*\.zip")

# Reset on every run so a stale id from an earlier execution can never be reused.
selection_id = None
# Sentinel older than any real export; the first matching file always replaces it.
selection_dt = datetime.strptime("2000-01-01T01:01:01.001Z","%Y-%m-%dT%H:%M:%S.%fZ")
print("Matching Files")
for file1 in file_list:
    if export_name_pattern.search(file1['title']):
        dt = datetime.strptime(file1['createdDate'],"%Y-%m-%dT%H:%M:%S.%fZ")
        if dt > selection_dt:
            selection_id = file1['id']
            selection_dt = dt
        print('    title: %s, id: %s createDate: %s' % (file1['title'], file1['id'], file1['createdDate']))

# Fail here with a clear message rather than later with a NameError or a silent no-op.
if selection_id is None:
    raise FileNotFoundError("No export*.zip file found in the Google Drive health data folder")

Matching Files
    title: export.zip, id: 1kJRIku6u7H1dIfsiz8tMxFEqGDVVS-sk createDate: 2020-12-10T00:48:12.297Z
    title: export.zip, id: 16nDgNcaKH525oTiOsOfxotaUOuOX4ZLW createDate: 2020-12-07T00:54:55.013Z


In [4]:
# Make sure the working directories exist before anything is written to them.
# 'healthextract' receives the downloaded archive and its extracted contents;
# 'data' is where the CSV files are written further down in this notebook.
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs('healthextract', exist_ok=True)
os.makedirs('data', exist_ok=True)

In [5]:
# Fetch the archive whose id was picked in the selection cell and save it locally.
for file1 in file_list:
    if file1['id'] != selection_id:
        continue
    download_details = (file1['title'], file1['id'], file1['createdDate'])
    print('Downloading this file: %s, id: %s createDate: %s' % download_details)
    file1.GetContentFile("healthextract/export.zip")

Downloading this file: export.zip, id: 1kJRIku6u7H1dIfsiz8tMxFEqGDVVS-sk createDate: 2020-12-10T00:48:12.297Z


In [6]:
# Unzip the downloaded export archive into the working directory.
# The context manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile('healthextract/export.zip', 'r') as zip_ref:
    zip_ref.extractall('healthextract')

#### Create separate data files

In [7]:
# Parse the extracted Apple Health export into a nested dict, then build the
# record table from the <Record> elements.
input_path = 'healthextract/apple_health_export/export.xml'
# Open in binary mode and hand the file object straight to the parser: the XML
# parser then decodes using the encoding declared in the file instead of the
# platform default, and the document is not first copied into one large string.
with open(input_path, 'rb') as xml_file:
    input_data = xmltodict.parse(xml_file)

records_list = input_data['HealthData']['Record']
health_data = pd.DataFrame(records_list)

In [8]:
# (rows, columns) of the full record table built from the export
health_data.shape

(580590, 11)

In [9]:
# Distinct record type identifiers present in the export
health_data['@type'].unique()

array(['HKQuantityTypeIdentifierHeight',
       'HKQuantityTypeIdentifierBodyMass',
       'HKQuantityTypeIdentifierHeartRate',
       'HKQuantityTypeIdentifierOxygenSaturation',
       'HKQuantityTypeIdentifierStepCount',
       'HKQuantityTypeIdentifierDistanceWalkingRunning',
       'HKQuantityTypeIdentifierBasalEnergyBurned',
       'HKQuantityTypeIdentifierActiveEnergyBurned',
       'HKQuantityTypeIdentifierFlightsClimbed',
       'HKQuantityTypeIdentifierAppleExerciseTime',
       'HKQuantityTypeIdentifierRestingHeartRate',
       'HKQuantityTypeIdentifierWalkingHeartRateAverage',
       'HKQuantityTypeIdentifierEnvironmentalAudioExposure',
       'HKQuantityTypeIdentifierHeadphoneAudioExposure',
       'HKQuantityTypeIdentifierWalkingDoubleSupportPercentage',
       'HKQuantityTypeIdentifierAppleStandTime',
       'HKQuantityTypeIdentifierWalkingSpeed',
       'HKQuantityTypeIdentifierWalkingStepLength',
       'HKQuantityTypeIdentifierWalkingAsymmetryPercentage',
       'HKQua

#### Workout Data

In [10]:
# Workout entries sit under the top-level HealthData element of the parsed export.
health_root = input_data['HealthData']
workout_list = health_root['Workout']
health_data_workout = pd.DataFrame(data=workout_list)

In [11]:
# Column names available on the workout table
health_data_workout.columns

Index(['@workoutActivityType', '@duration', '@durationUnit', '@totalDistance',
       '@totalDistanceUnit', '@totalEnergyBurned', '@totalEnergyBurnedUnit',
       '@sourceName', '@sourceVersion', '@creationDate', '@startDate',
       '@endDate', 'MetadataEntry', '@device', 'WorkoutEvent'],
      dtype='object')

In [12]:
# Distinct workout activity types recorded
health_data_workout['@workoutActivityType'].unique()

array(['HKWorkoutActivityTypeHighIntensityIntervalTraining',
       'HKWorkoutActivityTypeRunning', 'HKWorkoutActivityTypeOther',
       'HKWorkoutActivityTypeHiking',
       'HKWorkoutActivityTypeFunctionalStrengthTraining',
       'HKWorkoutActivityTypeYoga', 'HKWorkoutActivityTypeCrossTraining',
       'HKWorkoutActivityTypeWalking'], dtype=object)

In [13]:
# Distinct apps/devices that logged workouts
health_data_workout['@sourceName'].unique()

array(['Seven', 'Runkeeper', 'Pratik’s Apple\xa0Watch', 'Nike Training'],
      dtype=object)

In [14]:
# Distinct duration units used by the workout rows
health_data_workout['@durationUnit'].unique()

array(['min'], dtype=object)

In [15]:
# (rows, columns) of the workout table
health_data_workout.shape

(163, 15)

In [16]:
# Write the workout table to disk as a comma-separated UTF-8 file, without the index column.
health_data_workout.to_csv(
    './data/workoutData.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)

#### Activity Data

In [17]:
# Activity summaries sit under the top-level HealthData element of the parsed export.
health_root = input_data['HealthData']
activity_summary_list = health_root['ActivitySummary']
health_data_activity = pd.DataFrame(data=activity_summary_list)

In [18]:
# Column names available on the activity summary table
health_data_activity.columns

Index(['@dateComponents', '@activeEnergyBurned', '@activeEnergyBurnedGoal',
       '@activeEnergyBurnedUnit', '@appleMoveTime', '@appleMoveTimeGoal',
       '@appleExerciseTime', '@appleExerciseTimeGoal', '@appleStandHours',
       '@appleStandHoursGoal'],
      dtype='object')

In [19]:
# format timestamp data
#
# NOTE(review): the conversion below is disabled, so '@dateComponents' keeps
# whatever form the XML parse produced. Before re-enabling it, check that the
# format string matches the actual column values (TODO confirm), and pick a
# name other than `format`, which shadows the Python builtin.

# format = '%Y-%m-%d %H:%M:%S'
# health_data_activity['@dateComponents'] = pd.to_datetime(health_data_activity['@dateComponents'],
#                                      format=format)

In [20]:
# Convert the activity metric columns to numeric dtypes so they can be used in arithmetic.
metrics = ['@activeEnergyBurned', '@activeEnergyBurnedGoal', '@appleExerciseTime',
           '@appleExerciseTimeGoal', '@appleStandHours', '@appleStandHoursGoal']
for metric in metrics:
    # Assign with df[col] so the column is replaced outright. On recent pandas,
    # `df.loc[:, col] = ...` writes into the existing column in place and can leave
    # it as object dtype. One conversion per column is sufficient.
    health_data_activity[metric] = pd.to_numeric(health_data_activity[metric])

In [21]:
# Keep the date plus the energy / exercise / stand columns, then write the
# trimmed activity summary to disk.
cols = [
    '@dateComponents',
    '@activeEnergyBurned',
    '@activeEnergyBurnedGoal',
    '@activeEnergyBurnedUnit',
    '@appleExerciseTime',
    '@appleExerciseTimeGoal',
    '@appleStandHours',
    '@appleStandHoursGoal',
]
health_data_activity = health_data_activity.loc[:, cols]
health_data_activity.to_csv(
    './data/activityData.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)

#### Record Data  
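1) Sleep Analysis  
2) Oxygen Saturation  
3) Step Count  
4) Distance Walking/Running  
5) Basal Energy Burned  
6) Active Energy Burned  
7) Flights Climbed  

Note: the export cell below also keeps Heart Rate and Environmental/Headphone Audio Exposure records. Step Count, Distance Walking/Running and Flights Climbed are not filtered here; they are written out in the Walking Data section instead.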

In [22]:
# Record types exported to recordData.csv.
data_record_columns = [
    'HKCategoryTypeIdentifierSleepAnalysis',
    'HKQuantityTypeIdentifierEnvironmentalAudioExposure',
    'HKQuantityTypeIdentifierHeadphoneAudioExposure',
    'HKQuantityTypeIdentifierHeartRate',
    'HKQuantityTypeIdentifierOxygenSaturation',
    'HKQuantityTypeIdentifierBasalEnergyBurned',
    'HKQuantityTypeIdentifierActiveEnergyBurned',
]
# Fields kept for each exported record.
cols = ['@type', '@unit', '@creationDate', '@startDate', '@endDate', '@value']

# Select the matching rows and the kept fields in a single indexing step.
is_selected_type = health_data['@type'].isin(data_record_columns)
health_data_record = health_data.loc[is_selected_type, cols]

In [23]:
# Write the filtered record table to disk as a comma-separated UTF-8 file, without the index column.
health_data_record.to_csv(
    './data/recordData.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)

#### Walking Data

In [24]:
# Record types exported to walkingData.csv.
walking_data_columns = [
    'HKQuantityTypeIdentifierStepCount',
    'HKQuantityTypeIdentifierDistanceWalkingRunning',
    'HKQuantityTypeIdentifierWalkingDoubleSupportPercentage',
    'HKQuantityTypeIdentifierWalkingSpeed',
    'HKQuantityTypeIdentifierWalkingStepLength',
    'HKQuantityTypeIdentifierWalkingAsymmetryPercentage',
    'HKQuantityTypeIdentifierFlightsClimbed',
]

# Keep every column of the rows whose type is one of the walking metrics.
is_walking_type = health_data['@type'].isin(walking_data_columns)
walking_data_record = health_data.loc[is_walking_type]

In [25]:
# Print the (rows, columns) of each walking metric, in order of first appearance.
for ctype in walking_data_record['@type'].unique():
    rows_of_type = walking_data_record.loc[walking_data_record['@type'] == ctype]
    print(rows_of_type.shape)

(66443, 11)
(71374, 11)
(3882, 11)
(845, 11)
(868, 11)
(878, 11)
(530, 11)


In [26]:
# Write the walking metrics to disk as a comma-separated UTF-8 file, without the index column.
walking_data_record.to_csv(
    './data/walkingData.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)

In [43]:
# (rows, columns) of the activity summary table
health_data_activity.shape

(623, 8)