# Load Data from apple_health_export/ 
The data is originally exported as an XML file; this notebook converts it to CSV.

In [3]:
# Load needed packages
import pandas as pd
import xmltodict

In [4]:
# Use your own directory; this is where I exported my data.
data_path = 'apple_health_export/export.xml'

# Read the XML export and convert it to a nested dictionary.
# The file is opened in binary mode so decoding does not depend on the
# platform's default text encoding (xmltodict.parse decodes as UTF-8 by
# default). The open file object is handed to the parser directly, so the raw
# XML text is not held in memory in addition to the parsed dictionary.
with open(data_path, 'rb') as xml_file:
    all_data = xmltodict.parse(xml_file)

In [6]:
# list the top-level sections of the export (all_data is a nested dictionary)
health_data = all_data['HealthData']
health_data.keys()

odict_keys(['@locale', 'ExportDate', 'Me', 'Record', 'Workout', 'ActivitySummary'])

For now, let's focus on my daily activity summary. I will put all health Records and the Activity Summary entries into lists, and then load each list into a pandas DataFrame.

In [8]:
# gather every health Record entry from the export and load them into a
# single pandas dataframe (one row per Record)
health_section = all_data['HealthData']
records_list = health_section['Record']
all_records = pd.DataFrame(data=records_list)

In [109]:
# activity summary (acts): one entry per day
acts_list = all_data['HealthData']['ActivitySummary']

# xmltodict returns a single mapping instead of a list when the export holds
# only one ActivitySummary element; wrap it so the DataFrame always gets a
# list of row dictionaries (a lone dict of scalars would raise in pandas).
if isinstance(acts_list, dict):
    acts_list = [acts_list]

acts_df = pd.DataFrame(acts_list)

In [110]:
# preview the first five rows of the raw activity summary
acts_df.head(n=5)

Unnamed: 0,@dateComponents,@activeEnergyBurned,@activeEnergyBurnedGoal,@activeEnergyBurnedUnit,@appleMoveTime,@appleMoveTimeGoal,@appleExerciseTime,@appleExerciseTimeGoal,@appleStandHours,@appleStandHoursGoal
0,1969-12-30,0.0,0,Cal,0,0,0,30,0,12
1,1969-12-31,0.0,0,Cal,0,0,0,30,0,12
2,2015-08-29,0.0,0,Cal,0,0,0,30,0,12
3,2015-08-30,0.467,0,Cal,0,0,0,30,0,12
4,2015-10-01,0.0,0,Cal,0,0,0,30,0,12


# Now that the data is imported, I will clean it for easier use

In [111]:
## Cleaning

# Strip the '@' prefix from the column names. regex=False makes this a plain
# literal replacement, independent of the pandas version's default.
acts_df.columns = acts_df.columns.str.replace('@', '', regex=False)

# Convert the date column from strings to datetimes.
acts_df['dateComponents'] = pd.to_datetime(acts_df['dateComponents'])

# Keep only dates from 1/2016 to recent (4/2022): this is when I got the
# Apple Watch, so the data is not just from the phone.
# NOTE(review): the lower bound is exclusive (2016-01-01 itself is dropped) and
# the upper bound stops at 2022-04-01 -- confirm both match the intended range.
in_range = (acts_df['dateComponents'] > '2016-1-1') & (acts_df['dateComponents'] <= '2022-4-1')

# .copy() makes the filtered result an independent frame, so later column
# assignments do not write into a slice of the unfiltered data
# (which would raise SettingWithCopyWarning).
acts_df = acts_df[in_range].copy()


In [118]:
# preview the first five rows of the cleaned activity summary
acts_df.head(5)

Unnamed: 0,dateComponents,activeEnergyBurned,activeEnergyBurnedGoal,activeEnergyBurnedUnit,appleMoveTime,appleMoveTimeGoal,appleExerciseTime,appleExerciseTimeGoal,appleStandHours,appleStandHoursGoal
97,2016-01-02,565.0,320,Cal,0,0,49,30,12,12
98,2016-01-03,805.0,320,Cal,0,0,73,30,12,12
99,2016-01-04,791.0,350,Cal,0,0,73,30,14,12
100,2016-01-05,207.0,350,Cal,0,0,5,30,12,12
101,2016-01-06,617.0,350,Cal,0,0,54,30,13,12


Now that the dates and column names are cleaned up, let's look at the data types.

In [113]:
# show the dtype of every column of the activity summary
acts_df.dtypes

dateComponents            datetime64[ns]
activeEnergyBurned                object
activeEnergyBurnedGoal            object
activeEnergyBurnedUnit            object
appleMoveTime                     object
appleMoveTimeGoal                 object
appleExerciseTime                 object
appleExerciseTimeGoal             object
appleStandHours                   object
appleStandHoursGoal               object
dtype: object

In [117]:
# Convert the rest of the columns to numeric dtypes (everything except the
# date and the text Unit column). Columns are picked by name rather than by
# position, so the conversion does not depend on the column order of the
# export. Values that cannot be parsed become NaN (errors='coerce').
cols = acts_df.columns
non_numeric_cols = ['dateComponents', 'activeEnergyBurnedUnit']
numeric_cols = [col for col in cols if col not in non_numeric_cols]

# assign() builds a new frame instead of writing into acts_df, which at this
# point is a filtered selection of the full data (avoids SettingWithCopyWarning).
acts_df = acts_df.assign(
    **{col: pd.to_numeric(acts_df[col], errors='coerce') for col in numeric_cols}
)

# Round active energy to whole numbers so it is consistent with the other
# columns. Float columns remain float (e.g. 565.0); they are not cast to int.
acts_df = acts_df.round()

# Now that the data is consistent, I will write it to CSV and use that file from here on out

In [119]:
# save the cleaned activity summary next to the original export
# (the row index is written as the first column)
acts_df.to_csv(path_or_buf="apple_health_export/activitysummary.csv")

In [120]:
# also save every health record to csv; note that this table has NOT been
# cleaned (the row index is written as the first column)
all_records.to_csv(path_or_buf="apple_health_export/all_records.csv")