### Setup

In [76]:
import pandas as pd
import numpy as np
import xmltodict
import inflection

# Widen pandas' display limits so long cell values and wide frames are shown in full
for _option_name, _option_value in (
    ('display.max_colwidth', 1000),
    ('display.max_columns', 500),
):
    pd.set_option(_option_name, _option_value)

### Load data

TODO: Later, move this loading and cleaning code into a standalone Python script so the data is cleaned automatically, and keep this Jupyter notebook for analysis only.

In [2]:
# Load the Apple Health export and flatten its records into a dataframe

file_path = 'data/export.xml'

# Open in binary mode so the XML parser uses the encoding declared in the file
# rather than the platform's default text encoding, and pass the file object
# straight to xmltodict instead of first reading it into one large string.
# force_list keeps 'Record' a list even when the export holds a single record,
# so the DataFrame constructor below always receives a list of dicts.
with open(file_path, 'rb') as apple_health_file:
    apple_health_dict = xmltodict.parse(apple_health_file, force_list=('Record',))

# Extract relevant part of dictionary as another (smaller) dictionary
# The dict called HealthData contains sub-dictionaries for @locale, ExportDate, Me, Record
# We are most interested in Record - contains Health records

records_dict = apple_health_dict['HealthData']['Record']

# Convert to a dataframe (one row per health record)

apple_health_raw = pd.DataFrame(records_dict)

## Thanks to Guido Casiraghi and his Medium article for helping read in the file
## https://medium.com/better-programming/analyze-your-icloud-health-data-with-pandas-dd5e963e902f

In [72]:
# Clean up file


# apple_health_raw


# Drop unwanted columns
# Rename all the columns
# Clean each column



def clean_type(df):
    """Return a copy of `df` with a cleaned 'type' column derived from '@type'.

    The 'HKQuantityTypeIdentifier' / 'HKCategoryTypeIdentifier' prefix is
    removed from each '@type' value; values that start with neither prefix
    are copied across unchanged. The '@type' column itself is kept, and the
    input frame is not modified.
    """
    df = df.copy()
    # Match the prefix by pattern (anchored at the start) rather than slicing a
    # fixed number of characters, so unprefixed values are never truncated
    type_prefix_pattern = r'^HK(?:Quantity|Category)TypeIdentifier'
    df['type'] = df['@type'].str.replace(type_prefix_pattern, '', regex=True)
    return df
    


# Plan for each column
# @type - strip the HKQuantityTypeIdentifier prefix from each value, convert camel case to underscore casing, convert to category, drop rare or sensitive categories I don't want to share (HighHeartRate, MenstrualFlow, WaistCircumference, MindfulSession, BodyMass, Height)
# @sourceName - rename to standardised source names (e.g. 'Watch' and 'iPhone' rather than personalised device names such as "Rachel's Apple Watch"), convert to category
# @sourceVersion - drop
# @device - drop
# @unit - keep, rename column, convert to category
# @creationDate - drop - seems to be the same as either startDate or endDate
# @startDate and @endDate - convert to datetime, keep in time zone recorded, get rid of time zone signifier (+1000)
# @value- convert to integer
# MetadataEntry - drop from main dataframe but preserve for separate 'deep dives' into heart rate analysis and sleep analysis
# HeartRateVariabilityMetadataList - drop from main dataframe but preserve for separate 'deep dive' into heart rate analysis

# Thanks to Tom Augspurger and his wonderful series on Modern Pandas for teaching me method chaining
# https://tomaugspurger.github.io/method-chaining.html

In [236]:
# Columns that are removed from the main dataframe
dropped_cols = [
    '@sourceVersion',
    '@device',
    '@creationDate',
    'MetadataEntry',
    'HeartRateVariabilityMetadataList',
]

# Map every raw column name to its underscore-cased form with '@' removed
current_cols = apple_health_raw.columns.tolist()
new_cols = [inflection.underscore(col_name).replace('@', '') for col_name in current_cols]
dict_col_names = {old_name: new_name for old_name, new_name in zip(current_cols, new_cols)}

# Function to clean 'type' column
def clean_type(df):
    """Strip the Apple Health prefix from 'type', drop unwanted types, categorise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'type' column (the renamed '@type' column).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which:
        - the 'HKQuantityTypeIdentifier' / 'HKCategoryTypeIdentifier' prefix
          has been removed from 'type' (values without it are left as-is),
        - rows whose cleaned type is in the unwanted list are dropped,
        - 'type' has the category dtype.
        The original index labels of the remaining rows are preserved.
    """
    df = df.copy()
    # Remove the identifier prefix by pattern rather than by character count, so
    # values that do not start with it stay intact instead of being truncated
    type_prefix_pattern = r'^HK(?:Quantity|Category)TypeIdentifier'
    df['type'] = df['type'].str.replace(type_prefix_pattern, '', regex=True)
    # Drop unwanted types
    unwanted_types = ['VO2Max', 'HighHeartRateEvent', 'MenstrualFlow', 'WaistCircumference',
                      'MindfulSession', 'BodyMass', 'Height']
    # .copy() makes the filtered result an independent frame, so the column
    # assignment below does not trigger SettingWithCopyWarning
    df = df.loc[~df['type'].isin(unwanted_types)].copy()
    # Convert to category dtype
    df['type'] = df['type'].astype('category')
    return df

# Function to clean 'source_name' column
def clean_source_name(df, device_keywords=('Watch', 'iPhone')):
    """Standardise personalised device names in 'source_name' and categorise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'source_name' column.
    device_keywords : iterable of str, optional
        Any source name containing one of these keywords is replaced by the
        keyword itself. Keywords are applied in order, so a name already
        replaced by an earlier keyword is only replaced again if that keyword
        itself contains the later one.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) whose 'source_name' column
        holds the standardised names with the category dtype. Missing source
        names stay missing.
    """
    df = df.copy()
    # Replace common device key names
    for keyword in device_keywords:
        # regex=False: keywords are literal text, not patterns.
        # na=False: a missing source name counts as "no match"; without it the
        # mask contains NaN and .loc refuses to index with it.
        matches = df['source_name'].str.contains(keyword, regex=False, na=False)
        df.loc[matches, 'source_name'] = keyword
    # Convert to category dtype
    df['source_name'] = df['source_name'].astype('category')
    return df

# Function to clean 'unit' column
def clean_unit(df):
    """Return a copy of `df` whose 'unit' column has the category dtype.

    The input frame is left untouched; all other columns are carried over
    unchanged and the column order is preserved.
    """
    unit_as_category = df['unit'].astype('category')
    return df.assign(unit=unit_as_category)

# Function to clean 'value' column
# def clean_value(df):
#     df = df.copy()
    # Replace 'HKCategoryValueAppleStandHourStood' with 1 to indicate a standing hour
#     df['value'] = df['value'].replace('HKCategoryValueAppleStandHourStood', 1)
    
    # Convert to float dtype
#     df['value'] = df['value'].astype('float')
#     return df


# Build the cleaned frame in a single chain. Every step returns a new frame,
# so apple_health_raw itself is never modified.
ap = (
    apple_health_raw
    # errors='ignore' skips any listed column that this export does not contain
    # instead of raising KeyError
    # NOTE(review): assumes some exports may lack e.g. the metadata columns - confirm
    .drop(columns=dropped_cols, errors='ignore')
    .rename(columns=dict_col_names)
    .pipe(clean_type)
    .pipe(clean_source_name)
    # 'unit' is kept and converted to category, as laid out in the column plan
    .pipe(clean_unit)
)

In [235]:
# list(ap['source_name'].unique())

# If a source name contains one of the device keywords, replace it with that keyword


# ap['source_name'].replace('Rachel’s Apple\xa0Watch', 'Watch')

# Collapse any source name that contains a device keyword down to the keyword itself
# (this updates `ap` in place)
custom_device_name_keywords = ['Watch', 'iPhone']
for keyword in custom_device_name_keywords:
    # regex=False: match the keyword as literal text.
    # na=False: rows with a missing source name count as non-matching, so the
    # boolean mask never contains NaN (which .loc would reject).
    ap.loc[ap['source_name'].str.contains(keyword, regex=False, na=False), 'source_name'] = keyword

# ap.loc[ap['source_name'].str.contains('Watch'), 'source_name'] = 'Watch'
# ap.loc[ap['source_name'].str.contains('iPhone'), 'source_name'] = 'iPhone'

# replace('(Watch)', 'Watch2', regex = True).value_counts()

# ap['source_name'].value_counts()

# .value_counts()

In [234]:
# Display the distinct values currently present in the source_name column
ap['source_name'].unique()

array(['Streaks', 'Watch', 'iPhone', 'Clock', 'AutoSleep'], dtype=object)

In [237]:
# ap[ap['value'] == 'HKCategoryValueAppleStandHourIdle']

# ap.pipe(clean_value)

# ap['value'].value_counts().sort_index()

# HKCategoryValueAppleStandHourIdle      744
# HKCategoryValueAppleStandHourStood    1465
# HKCategoryValueSleepAnalysisAsleep     129
# HKCategoryValueSleepAnalysisInBed 




In [238]:
# ap['source_name'].unique()

In [170]:
# old_source_name = list(ap['source_name'].unique())
# new_source_name = ['Streaks', 'Watch', 'iPhone', 'Clock', 'AutoSleep']

# ap['source_name'].replace(old_source_name, new_source_name).unique()

# # ap

array(['Streaks', 'Watch', 'iPhone', 'Clock', 'AutoSleep'], dtype=object)

In [239]:
# ap['type'].value_counts().index


# # Unwanted rows
# unwanted_types = ['VO2Max','HighHeartRateEvent', 'MenstrualFlow', 'WaistCircumference', 'MindfulSession', 'BodyMass', 'Height']

# ap[~ap['type'].isin(unwanted_types)]

In [97]:
# current_cols = list(ap.columns)
# new_cols = [inflection.underscore(i).replace('@', '') for i in current_cols]

# dict(zip(current_cols, new_cols))

{'@type': 'type',
 '@sourceName': 'source_name',
 '@unit': 'unit',
 '@startDate': 'start_date',
 '@endDate': 'end_date',
 '@value': 'value'}

In [78]:
# a = 'DietaryWater'
# inflection.underscore(a)

'dietary_water'

In [240]:
# string_to_remove = 'HKQuantityTypeIdentifier'
# apple_health_raw['@type'].str[len(string_to_remove):]

# # len('HKQuantityTypeIdentifier')
