### Setup

In [None]:
import pandas as pd
import numpy as np
import xmltodict
import inflection

# Widen the notebook display so long cell values and wide frames are shown in full
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 500

### Load data

TODO: Later, convert this section into a Python script that cleans the data automatically, and keep the Jupyter notebook for analysis only.

In [None]:
# Load Apple Health file as a mega-dictionary
# The encoding is stated explicitly so decoding does not depend on the platform's
# default text encoding (assumes the export is UTF-8, as its XML declaration
# normally states - confirm if a file fails to load)
file_path = 'data/export.xml'
with open(file_path, 'r', encoding='utf-8') as apple_health_file:
    apple_health_dict = xmltodict.parse(apple_health_file.read())

# Extract relevant part of mega-dictionary as another (smaller) dictionary
# The dict called HealthData contains sub-dictionaries for @locale, ExportDate, Me, Record
# We are most interested in Record - contains Health records
records_dict = apple_health_dict['HealthData']['Record']

# Convert to a dataframe (one row per Record element, one column per attribute/child key)
apple_health_raw = pd.DataFrame(records_dict)

## Thanks to Guido Casiraghi and his Medium article for helping read in the file
## https://medium.com/better-programming/analyze-your-icloud-health-data-with-pandas-dd5e963e902f

### Clean data

In [None]:
## Define cleaning procedures
# 1. Drop unwanted columns
# 2. Rename remaining columns
# 3. Clean each column

# Columns removed before any cleaning takes place
dropped_cols = [
    '@sourceVersion',
    '@device',
    '@creationDate',
    'MetadataEntry',
    'HeartRateVariabilityMetadataList',
]

# Map every raw column name to its snake_case equivalent with the '@' marker removed
current_cols = apple_health_raw.columns.tolist()
new_cols = [inflection.underscore(raw_name).replace('@', '') for raw_name in current_cols]
dict_col_names = {old_name: new_name for old_name, new_name in zip(current_cols, new_cols)}

# Function to clean 'type' column
def clean_type(df):
    """Shorten the identifiers in the 'type' column and drop unwanted record types.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'type' column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without the unwanted types and
        with 'type' stored as a category dtype.
    """
    # Strip only the known identifier prefixes. Slicing off a fixed number of
    # characters works for these two prefixes (they have the same length) but
    # silently mangles any value that starts with something else.
    prefix_pattern = r'^HK(?:Quantity|Category)TypeIdentifier'
    short_types = df['type'].str.replace(prefix_pattern, '', regex=True)
    # Drop unwanted types
    unwanted_types = ['VO2Max', 'HighHeartRateEvent', 'MenstrualFlow', 'WaistCircumference',
                      'MindfulSession', 'BodyMass', 'Height']
    keep = ~short_types.isin(unwanted_types)
    # Copy the filtered rows explicitly so the column assignment below writes to
    # an independent frame rather than to a slice (avoids SettingWithCopyWarning)
    df = df.loc[keep].copy()
    # Convert to category dtype
    df['type'] = short_types[keep].astype('category')
    return df

# Function to clean 'source_name' column
def clean_source_name(df):
    """Standardise device names in the 'source_name' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'source_name' column; missing values are allowed.

    Returns
    -------
    pandas.DataFrame
        A copy of the input in which every name containing a device keyword is
        replaced by that keyword, with 'source_name' stored as a category dtype.
    """
    df = df.copy()
    # Replace commonly customised device names with standard keywords (e.g. Rachel's iPhone to iPhone)
    custom_device_name_keywords = ['Watch', 'iPhone']
    for keyword in custom_device_name_keywords:
        # regex=False: the keywords are literal text, not patterns
        # na=False: a missing name counts as "no match" so the mask stays purely
        # boolean (a mask containing NaN cannot be used with .loc)
        matches = df['source_name'].str.contains(keyword, regex=False, na=False)
        df.loc[matches, 'source_name'] = keyword
    # Convert to category dtype
    df['source_name'] = df['source_name'].astype('category')
    return df

# Function to clean 'unit' column
def clean_unit(df):
    """Return a copy of df whose 'unit' column is stored as a category dtype."""
    unit_as_category = df['unit'].astype('category')
    return df.assign(unit=unit_as_category)

# Function to clean 'start_date' and 'end_date' columns
# All times appear to be in local time of location where data was extracted - no value to keeping
def clean_dates(df):
    """Return a copy of df with 'start_date' and 'end_date' as timezone-naive datetimes.

    Each column is parsed with pd.to_datetime and then has its time zone
    removed, so the resulting values keep the wall-clock time without an offset.
    """
    naive_columns = {
        col: pd.to_datetime(df[col]).dt.tz_localize(tz=None)
        for col in ('start_date', 'end_date')
    }
    return df.assign(**naive_columns)

# Function to clean 'value' column
def clean_value(df):
    """Return a copy of df whose 'value' column is numeric (float dtype).

    The four string categories listed below are first replaced by their
    numerical equivalents; the whole column is then cast to float.
    """
    # String category -> numerical equivalent
    category_codes = {
        'HKCategoryValueAppleStandHourIdle': 0,
        'HKCategoryValueAppleStandHourStood': 1,
        'HKCategoryValueSleepAnalysisAsleep': 2,
        'HKCategoryValueSleepAnalysisInBed': 1,
    }
    numeric_values = df['value'].replace(category_codes).astype('float')
    return df.assign(value=numeric_values)

## Run cleaning procedures
# Each step returns a new dataframe, so apple_health_raw is left untouched
ap = (
    apple_health_raw
    # errors='ignore' skips any listed column that is not present in this export
    # instead of raising KeyError; drop() already returns a new frame, so no
    # explicit copy is needed beforehand
    .drop(columns = dropped_cols, errors = 'ignore')
    .rename(columns = dict_col_names)
    .pipe(clean_type)
    .pipe(clean_source_name)
    .pipe(clean_unit)
    .pipe(clean_dates)
    .pipe(clean_value)
)

# Note - dropped columns that may be of interest for future deep dives
# MetadataEntry - heart rate analysis and sleep analysis
# HeartRateVariabilityMetadataList - heart rate analysis

# Thanks to Tom Augspurger and his wonderful series on Modern Pandas for teaching me method chaining
# https://tomaugspurger.github.io/method-chaining.html