# Load Data from apple_health_export/ 
The export is originally an XML file; this notebook converts it to CSV.

In [4]:
# Load needed packages
import pandas as pd
import xmltodict

In [None]:
# use your own directory, this is where I exported my data
data_path = 'apple_health_export/export.xml'

# Read the XML file and convert it to a (nested) dictionary.
# The file is opened in binary mode and handed to xmltodict as a file object:
# the XML parser then takes the encoding from the file itself instead of the
# platform's default text encoding, and the whole export is not first loaded
# into one big Python string.
with open(data_path, 'rb') as xml_file:
    all_data = xmltodict.parse(xml_file)

In [None]:
# List the top-level sections available in the export
# (all_data is a nested dictionary, so we look at the keys under 'HealthData').
health_data = all_data['HealthData']
health_data.keys()

For now, I will only show the steps for cleaning my daily activity summary (acts) and my workouts: each is first pulled out as a list and then put into a pandas dataframe. I will do the same for the other records (all), etc. in the future.

In [None]:
# Overall health data: pull the list stored under 'Record' out of the export
# and turn it into a pandas dataframe.
health_data = all_data['HealthData']
records_list = health_data['Record']
all_records = pd.DataFrame(data=records_list)

In [None]:
# Activity summary (acts): pull the list stored under 'ActivitySummary'
# out of the export and turn it into a pandas dataframe.
health_data = all_data['HealthData']
acts_list = health_data['ActivitySummary']
acts_df = pd.DataFrame(data=acts_list)

In [None]:
# quick look at the first five rows of the raw activity summary
acts_df.head(n=5)

# Now that the data is imported, I will clean it for easier use

In [12]:
## Cleaning

# remove special characters from column names
# (the '@' is matched literally, so regex handling is switched off explicitly)
acts_df.columns = acts_df.columns.str.replace('@', '', regex=False)

# convert date column to date format
acts_df['dateComponents'] = pd.to_datetime(acts_df['dateComponents'])

# make sure dates are only within range from 1/2016-recent (4/2022),
# this is when I got the apple watch, data is not just from phone.
# NOTE: the lower bound is exclusive (2016-01-01 itself is dropped) and the
# upper bound is inclusive (2022-04-01 is kept).
in_range = (acts_df['dateComponents'] > '2016-1-1') & (acts_df['dateComponents'] <= '2022-4-1')

# .copy() makes the filtered result an independent dataframe, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
acts_df = acts_df[in_range].copy()


In [13]:
# first five rows after cleaning the column names and dates
acts_df.head(5)

Unnamed: 0,dateComponents,activeEnergyBurned,activeEnergyBurnedGoal,activeEnergyBurnedUnit,appleMoveTime,appleMoveTimeGoal,appleExerciseTime,appleExerciseTimeGoal,appleStandHours,appleStandHoursGoal
0,2016-01-02,565.0,320,Cal,0,0,49,30,12,12
1,2016-01-03,805.0,320,Cal,0,0,73,30,12,12
2,2016-01-04,791.0,350,Cal,0,0,73,30,14,12
3,2016-01-05,207.0,350,Cal,0,0,5,30,12,12
4,2016-01-06,617.0,350,Cal,0,0,54,30,13,12


Now that the dates and column names are done, let's look at the data types

In [14]:
# show the data type of every column in the activity summary
acts_df.dtypes

dateComponents            datetime64[ns]
activeEnergyBurned               float64
activeEnergyBurnedGoal             int64
activeEnergyBurnedUnit            object
appleMoveTime                      int64
appleMoveTimeGoal                  int64
appleExerciseTime                  int64
appleExerciseTimeGoal              int64
appleStandHours                    int64
appleStandHoursGoal                int64
dtype: object

In [None]:
# convert the rest of the columns to numbers (everything except the date and Unit).
# The columns are picked by name instead of by position, so the conversion
# still hits the right columns if the export lists them in a different order.
non_numeric = ['dateComponents', 'activeEnergyBurnedUnit']
numeric_cols = [col for col in acts_df.columns if col not in non_numeric]

# work on an independent copy so the assignment below never targets a
# filtered slice of an earlier dataframe (avoids SettingWithCopyWarning)
acts_df = acts_df.copy()

# values that cannot be parsed as a number become NaN (errors='coerce')
acts_df[numeric_cols] = acts_df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# round active energy so it is consistent
acts_df = acts_df.round()

# For more analyses and visualizations, I will also clean the workouts and put them into a dataframe

In [None]:
# Workouts: pull the list stored under 'Workout' out of the export
# and turn it into a pandas dataframe.
health_data = all_data['HealthData']
workouts_list = health_data['Workout']
workout_df = pd.DataFrame(data=workouts_list)

In [None]:
# list the variables (column labels) available for the workouts
workout_df.columns

In [None]:
# strip the '@' character from the column names of both dataframes
for frame in (all_records, workout_df):
    frame.columns = frame.columns.str.replace('@', '')

In [None]:
# raw activity labels before renaming
# (not displayed when the whole cell runs, because it is not the last line)
activity_types = workout_df['workoutActivityType']
activity_types.unique()

# drop the 'HKWorkoutActivityType' text from every label for easier naming
workout_df['workoutActivityType'] = activity_types.str.replace('HKWorkoutActivityType', '')

In [None]:
# What columns do we have to work with? Can decide how to parse data 
# distinct record types in the overall records
# (a notebook only displays the cell's last expression, so run this line on its own to see it)
all_records['type'].unique()
# distinct workout activity types; this is the value the cell displays
workout_df['workoutActivityType'].unique()

In [None]:
# convert the three columns containing 'date' to datetime and
# make sure dates are only within range from 1/2016-recent (4/2022).
# NOTE: the lower bound is exclusive and the upper bound inclusive.
dates = ['startDate', 'endDate', 'creationDate']  # list of cols with date in them

# Build one combined mask and filter once at the end. Reassigning workout_df
# to a filtered slice inside the loop and then writing a column into that
# slice on the next pass raises SettingWithCopyWarning.
in_range = pd.Series(True, index=workout_df.index)
for date_col in dates:
    workout_df[date_col] = pd.to_datetime(workout_df[date_col])
    in_range &= (workout_df[date_col] > '2016-1-1') & (workout_df[date_col] <= '2022-4-1')

# keep only rows where every date column is in range; .copy() makes the
# result an independent dataframe for the assignments in later cells
workout_df = workout_df[in_range].copy()

In [17]:
# reduce creationDate to just the calendar date (no time of day) to use for date info
creation_ts = pd.to_datetime(workout_df['creationDate'])
workout_df['creationDate'] = creation_ts.dt.date

In [None]:
# drop columns we do not need
# errors='ignore' skips any listed column that is not present, so the cell can
# be re-run and also works for an export that lacks one of these columns
# (without it, drop raises KeyError).
unused_cols = ['MetadataEntry', 'WorkoutEvent', 'device', 'WorkoutRoute']
workout_df = workout_df.drop(columns=unused_cols, errors='ignore')

In [None]:
# look at data types
workout_df.dtypes

# parse the measurement columns as numbers; anything unparseable becomes NaN
measure_cols = ['duration', 'totalDistance', 'totalEnergyBurned']
workout_df[measure_cols] = workout_df[measure_cols].apply(pd.to_numeric, errors='coerce')

# round numeric values so it is consistent
workout_df = workout_df.round()

In [18]:
# quickly view five randomly chosen rows (a different draw on every run)
workout_df.sample(5)

Unnamed: 0,workoutActivityType,duration,durationUnit,totalDistance,totalDistanceUnit,totalEnergyBurned,totalEnergyBurnedUnit,sourceName,sourceVersion,creationDate,startDate,endDate
57,Walking,10.0,min,1.0,mi,49.0,Cal,Rebecca’s Apple Watch,5.1.2,2018-12-14,2018-12-14 21:20:03-04:00,2018-12-14 21:30:27-04:00
192,Walking,51.0,min,2.0,mi,224.0,Cal,Rebecca’s Apple Watch,5.2.1,2019-08-16,2019-08-16 18:03:36-04:00,2019-08-16 18:54:47-04:00
323,Cycling,45.0,min,11.0,mi,353.0,Cal,Peloton,19746,2020-09-22,2020-09-19 19:40:52-04:00,2020-09-19 20:25:52-04:00
37,Elliptical,2.0,min,0.0,mi,19.0,Cal,Rebecca’s Apple Watch,2.1,2016-04-11,2016-04-11 11:09:06-04:00,2016-04-11 11:11:34-04:00
625,FunctionalStrengthTraining,30.0,min,0.0,mi,193.0,Cal,Peloton,112957,2021-04-09,2021-04-06 21:03:06-04:00,2021-04-06 21:33:35-04:00


# Now that the data is consistent, I will write it to CSV and use those files from here on out

In [None]:
# activity summary: write the cleaned dataframe to CSV without the row index
acts_df.to_csv(path_or_buf="data/activitysummary.csv", index=False)

In [19]:
# workout: write the cleaned dataframe to CSV without the row index
workout_df.to_csv(path_or_buf="data/workouts.csv", index=False)

In [None]:
# all records
# NOTE not all data is cleaned, will need to re-format if using this in the future
# for now will put with all export
# (unlike the two files above, this one is written with the row index included)
all_records.to_csv(path_or_buf="apple_health_export/all_records.csv")

Note: the steps data is cleaned in the "get_steps.py" script, added on 4.25