In [2]:
# Import all the frameworks
import xml.etree.ElementTree as ET
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import tensorflow as tf
from sklearn.preprocessing import StandardScaler




# Getting Data

In [3]:
# First to get all the healthkit data from dataset, healthkit data is exported in xml file
root_list = []

path = 'C:/Users/lmm20/Downloads/healthkit_data'
for file in os.listdir(path):
    if file.endswith('.xml'):
        file_name = path + '/' +file
        tree = ET.parse(file_name)
        root = tree.getroot()
        root_list.append(root)
print(root_list)

[<Element 'HealthData' at 0x000001BDC482F1F0>, <Element 'HealthData' at 0x000001BDC482F100>, <Element 'HealthData' at 0x000001BDC482F2E0>, <Element 'HealthData' at 0x000001BE4E2C0630>, <Element 'HealthData' at 0x000001BDC438AE30>, <Element 'HealthData' at 0x000001BE00DF0CC0>, <Element 'HealthData' at 0x000001BDC482F330>, <Element 'HealthData' at 0x000001BE6FACFEC0>]


In [4]:
# In this part, we are loading all the stepcount data and store it in pd.dataframe with three columns(user_id, date, steps)

stepcount = []
count = 0

for i in root_list:
    for record in i.findall(".//Record[@type='HKQuantityTypeIdentifierStepCount']"):
        small = []
        user_id = "user"+str(count)
        small.append(user_id)
        stepcount.append(small)
        small.append(record.get('startDate'))
        small.append(record.get('value'))
    count += 1

# set the format of datetime
date_format = "%Y-%m-%d %H:%M:%S"

df = pd.DataFrame(stepcount, columns=['user_id', 'date', 'steps'])

# Convert 'date' string to datetime with specified format
df['date'] = pd.to_datetime(df['date'], format=date_format, errors='coerce')

# Check for any rows where 'date' conversion resulted in NaN
if df['date'].isna().any():
    print("Warning: Some 'date' values failed to convert and are NaN.")
    
# Only proceed if there are no NaT values, or handle them accordingly
if not df['date'].isna().any():
    # Safely extract the date component
    df['date'] = df['date'].apply(lambda x: x.date())
else:
    # Handle NaT values as needed
    # For example, you might choose to drop these rows or fill them with a placeholder
    df = df.dropna(subset=['date'])

# Ensure 'steps' is numeric for summation
df['steps'] = pd.to_numeric(df['steps'], errors='coerce')

# Group by 'user_id' and 'date', then sum steps
df_steps = df.groupby(['user_id', 'date'], as_index=False)['steps'].sum()


print(df_steps)


      user_id        date  steps
0       user0  2017-02-23    550
1       user0  2017-02-24   6819
2       user0  2017-02-25    133
3       user0  2017-02-26    320
4       user0  2017-02-27    676
...       ...         ...    ...
15898   user7  2024-02-15   7872
15899   user7  2024-02-16   7772
15900   user7  2024-02-17   4646
15901   user7  2024-02-18   4868
15902   user7  2024-02-19    209

[15903 rows x 3 columns]


In [5]:
# In this part, we are loading all the weights data and store it in pd.dataframe with three columns(user_id, date, weight, unit)


weights = []
user_count = 0 
# Loop through the XML to find weight records
for i in root_list:
    for record in i.findall(".//Record[@type='HKQuantityTypeIdentifierBodyMass']"):
        # Extract the relevant attributes for each weight record
        date = record.get('startDate')  # The date and time when the weight was recorded
        #value = 0
        value = int(float(record.get('value')))  # The weight value
        unit = record.get('unit') 
        
        # We need to make sure all the weight values are stored with the same unit. If the weight unit is lb, we convert it to
        # kg by multiply lb by 0.453592
        if unit == 'lb':
            value = value * 0.453592
            unit = 'kg'
        user_id = 'user'+str(user_count)
        # Store the extracted data in the list
        weights.append({'user_id':str(user_id), 'date': date, 'weight': value, 'unit': unit})
    user_count += 1
date_format = "%Y-%m-%d"

df_weights = pd.DataFrame(weights)

df_weights['date'] = pd.to_datetime(df_weights['date'], utc=True).dt.tz_convert(None).dt.date

print(df_weights)

   user_id        date    weight unit
0    user0  2019-04-21  65.00000   kg
1    user0  2017-06-30  59.00000   kg
2    user0  2020-08-23  70.00000   kg
3    user0  2021-01-11  67.00000   kg
4    user0  2023-11-01  72.00000   kg
..     ...         ...       ...  ...
89   user5  2021-08-11  67.00000   kg
90   user5  2023-01-18  69.00000   kg
91   user6  2023-12-28  76.00000   kg
92   user7  2023-09-28  90.71840   kg
93   user7  2019-09-11  86.18248   kg

[94 rows x 4 columns]


In [6]:
# Mergde the dataframes of steps and weights according to the user_id and date, but there are many rows with no weight because 
# users do not store the change of weight everyday. 

merged_df = pd.merge(df_steps, df_weights, on=['user_id', 'date'], how='outer')

print(merged_df)

      user_id        date  steps  weight unit
0       user0  2017-02-23    550     NaN  NaN
1       user0  2017-02-24   6819     NaN  NaN
2       user0  2017-02-25    133     NaN  NaN
3       user0  2017-02-26    320     NaN  NaN
4       user0  2017-02-27    676     NaN  NaN
...       ...         ...    ...     ...  ...
15927   user7  2024-02-15   7872     NaN  NaN
15928   user7  2024-02-16   7772     NaN  NaN
15929   user7  2024-02-17   4646     NaN  NaN
15930   user7  2024-02-18   4868     NaN  NaN
15931   user7  2024-02-19    209     NaN  NaN

[15932 rows x 5 columns]


In [7]:
# This function aims to fill in the blank of weight data. Suppose one user do not have the data of weight on 2018-5-7, but the
# system read that the weight on 2018-5-30 is 70kg so the algorithm automatically fill the weight on 2018-5-7 with 70kg. 


for i, row in merged_df.iterrows():
    if pd.isna(row['unit']):  # Check if 'unit' is NaN
        # Initialize variables to track the closest weight and its date difference
        temp_weight = None
        temp_date_difference = pd.Timedelta.max  # Use max timedelta for initial comparison

        # Find the closest weight record in 'df_weights' for the same 'user_id'
        for _, weight_row in df_weights[df_weights['user_id'] == row['user_id']].iterrows():
            weight_date = pd.to_datetime(weight_row['date'])
            steps_date = pd.to_datetime(row['date'])

            # Calculate the absolute difference in days (or any other suitable measure)
            diff = abs(weight_date - steps_date)

            # Update if this record is closer to the 'steps' record date
            if diff < temp_date_difference:
                temp_weight = weight_row['weight']
                temp_date_difference = diff

        # Update the 'merged_df' DataFrame directly
        if temp_weight is not None:
            merged_df.loc[i, 'weight'] = temp_weight
            merged_df.loc[i, 'unit'] = 'kg'


In [8]:
merged_df

Unnamed: 0,user_id,date,steps,weight,unit
0,user0,2017-02-23,550,59.0000,kg
1,user0,2017-02-24,6819,59.0000,kg
2,user0,2017-02-25,133,59.0000,kg
3,user0,2017-02-26,320,59.0000,kg
4,user0,2017-02-27,676,59.0000,kg
...,...,...,...,...,...
15927,user7,2024-02-15,7872,90.7184,kg
15928,user7,2024-02-16,7772,90.7184,kg
15929,user7,2024-02-17,4646,90.7184,kg
15930,user7,2024-02-18,4868,90.7184,kg


In [9]:
# In this part, we are loading all the calorie consumption data and store it in pd.dataframe with 
# three columns(user_id, date, weight, unit)


calorie_data = []
user_id = 0
# Loop through the XML to find calorie consumption records
for i in root_list:
    for record in i.findall(".//Record[@type='HKQuantityTypeIdentifierActiveEnergyBurned']"):
        # Extract the relevant attributes
        date = record.get('startDate')  # The date and time the calories were burned
        calories_burned = float(record.get('value'))  # The amount of calories burned
        unit = record.get('unit')  # The unit of measurement for the calories burned
        user = 'user' + str(user_id)
        # Append the data to the list
        calorie_data.append({'user_id':user, 'date': date, 'calories_burned': calories_burned, 'unit': unit})
    user_id += 1
df_calories = pd.DataFrame(calorie_data)
df_calories['date'] = pd.to_datetime(df_calories['date'], utc=True).dt.tz_convert(None).dt.date
df_calories = df_calories.groupby(['user_id', 'date'], as_index=False)['calories_burned'].sum()
print(df_calories)

     user_id        date  calories_burned
0      user0  2021-01-11           34.000
1      user0  2021-01-12          239.000
2      user0  2021-01-23            4.000
3      user0  2022-11-28           61.564
4      user0  2022-11-29           37.894
...      ...         ...              ...
5055   user7  2024-02-18           75.383
5056   user7  2024-02-19          239.965
5057   user7  2024-02-20          165.445
5058   user7  2024-02-21          299.957
5059   user7  2024-02-22           33.190

[5060 rows x 3 columns]


In [10]:
# Merging the dataframe of calorie consumption with weight and steps
merged_df = pd.merge(merged_df, df_calories, on=['user_id', 'date'], how='outer')

In [11]:
merged_df

Unnamed: 0,user_id,date,steps,weight,unit,calories_burned
0,user0,2017-02-23,550.0,59.0,kg,
1,user0,2017-02-24,6819.0,59.0,kg,
2,user0,2017-02-25,133.0,59.0,kg,
3,user0,2017-02-26,320.0,59.0,kg,
4,user0,2017-02-27,676.0,59.0,kg,
...,...,...,...,...,...,...
15957,user7,2023-11-02,,,,436.823
15958,user7,2023-11-03,,,,222.403
15959,user7,2024-02-20,,,,165.445
15960,user7,2024-02-21,,,,299.957


In [12]:
count = 0 
for i, j in merged_df.iterrows():
    if pd.isna(j['calories_burned']):
        count += 1
print(count)

10874


In [13]:
# Some users do not have the data of calories burned on a specific date, so we just delete the row with no data

df_not_nan = merged_df[merged_df['calories_burned'].notna()]
df_not_nan = df_not_nan[merged_df['steps'].notna()]
df_not_nan = df_not_nan.reset_index()
df_not_nan

  df_not_nan = df_not_nan[merged_df['steps'].notna()]


Unnamed: 0,index,user_id,date,steps,weight,unit,calories_burned
0,979,user0,2021-01-11,10065.0,67.0000,kg,34.000
1,980,user0,2021-01-12,1464.0,67.0000,kg,239.000
2,991,user0,2021-01-23,2588.0,67.0000,kg,4.000
3,1665,user0,2022-11-28,3364.0,72.0000,kg,61.564
4,1666,user0,2022-11-29,3824.0,72.0000,kg,37.894
...,...,...,...,...,...,...,...
5053,15927,user7,2024-02-15,7872.0,90.7184,kg,342.828
5054,15928,user7,2024-02-16,7772.0,90.7184,kg,277.667
5055,15929,user7,2024-02-17,4646.0,90.7184,kg,349.917
5056,15930,user7,2024-02-18,4868.0,90.7184,kg,75.383


In [14]:
# add two more columns with the difference of steps and calorie consumption between two days

df_not_nan['steps_difference'] = df_not_nan['steps'].diff()
df_not_nan['calories_difference'] = df_not_nan['calories_burned'].diff()
df_not_nan['calories_difference'][0] = 0
df_not_nan['steps_difference'][0] = 0
df_not_nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_not_nan['calories_difference'][0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_not_nan['steps_difference'][0] = 0


Unnamed: 0,index,user_id,date,steps,weight,unit,calories_burned,steps_difference,calories_difference
0,979,user0,2021-01-11,10065.0,67.0000,kg,34.000,0.0,0.000
1,980,user0,2021-01-12,1464.0,67.0000,kg,239.000,-8601.0,205.000
2,991,user0,2021-01-23,2588.0,67.0000,kg,4.000,1124.0,-235.000
3,1665,user0,2022-11-28,3364.0,72.0000,kg,61.564,776.0,57.564
4,1666,user0,2022-11-29,3824.0,72.0000,kg,37.894,460.0,-23.670
...,...,...,...,...,...,...,...,...,...
5053,15927,user7,2024-02-15,7872.0,90.7184,kg,342.828,-3842.0,-103.408
5054,15928,user7,2024-02-16,7772.0,90.7184,kg,277.667,-100.0,-65.161
5055,15929,user7,2024-02-17,4646.0,90.7184,kg,349.917,-3126.0,72.250
5056,15930,user7,2024-02-18,4868.0,90.7184,kg,75.383,222.0,-274.534


In [15]:
# drop the column of unit because it does not help with training 
df_not_nan.drop('unit',  axis=1, inplace=True)
df_not_nan

Unnamed: 0,index,user_id,date,steps,weight,calories_burned,steps_difference,calories_difference
0,979,user0,2021-01-11,10065.0,67.0000,34.000,0.0,0.000
1,980,user0,2021-01-12,1464.0,67.0000,239.000,-8601.0,205.000
2,991,user0,2021-01-23,2588.0,67.0000,4.000,1124.0,-235.000
3,1665,user0,2022-11-28,3364.0,72.0000,61.564,776.0,57.564
4,1666,user0,2022-11-29,3824.0,72.0000,37.894,460.0,-23.670
...,...,...,...,...,...,...,...,...
5053,15927,user7,2024-02-15,7872.0,90.7184,342.828,-3842.0,-103.408
5054,15928,user7,2024-02-16,7772.0,90.7184,277.667,-100.0,-65.161
5055,15929,user7,2024-02-17,4646.0,90.7184,349.917,-3126.0,72.250
5056,15930,user7,2024-02-18,4868.0,90.7184,75.383,222.0,-274.534


In [16]:
df_not_nan.drop('index',  axis=1, inplace=True)
df_not_nan

Unnamed: 0,user_id,date,steps,weight,calories_burned,steps_difference,calories_difference
0,user0,2021-01-11,10065.0,67.0000,34.000,0.0,0.000
1,user0,2021-01-12,1464.0,67.0000,239.000,-8601.0,205.000
2,user0,2021-01-23,2588.0,67.0000,4.000,1124.0,-235.000
3,user0,2022-11-28,3364.0,72.0000,61.564,776.0,57.564
4,user0,2022-11-29,3824.0,72.0000,37.894,460.0,-23.670
...,...,...,...,...,...,...,...
5053,user7,2024-02-15,7872.0,90.7184,342.828,-3842.0,-103.408
5054,user7,2024-02-16,7772.0,90.7184,277.667,-100.0,-65.161
5055,user7,2024-02-17,4646.0,90.7184,349.917,-3126.0,72.250
5056,user7,2024-02-18,4868.0,90.7184,75.383,222.0,-274.534


In [17]:
df = df_not_nan

In [23]:
# Implement my own accuracy function, if (predicted-true_value) <= 1000, counts as correct 
def calculate_accuracy(real_value, predicted):
    count = 0
    for i in range(len(predicted)):
        if abs(real_value[i] - predicted[i])<=1000:
            count +=1
    return count/len(predicted)

In [20]:
# Training the model of KNN anb Logistic Regression 

# Define step range categories
# bins = [0, 2000, 4000, 6000, 8000, 10000, float('inf')]
# labels = ['0-2000', '2001-4000', '4001-6000', '6001-8000', '8001-10000', '10000+']
# df['step_category'] = pd.cut(df['steps'], bins=bins, labels=labels)

# Feature and target variable
X = df[['calories_burned', 'weight']]  # Using only calories_burned and weight as the features
y = df['steps']

# Encoding the target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# KNN model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Logistic Regression model
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train, y_train)

# Example prediction with 200 calorie consumption and 70kg weight
calorie_goal = [[200, 70]]  # Calorie goal as input
predicted_step_category_knn = knn.predict(calorie_goal)
predicted_step_category_log_reg = log_reg.predict(calorie_goal)

# Translate prediction back to label
print("KNN predicted step category:", le.inverse_transform(predicted_step_category_knn)[0])
print("Logistic Regression predicted step category:", le.inverse_transform(predicted_step_category_log_reg)[0])


KNN predicted step category: 2169.0
Logistic Regression predicted step category: 4483.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# Calculate the accuracy of two models
y_pred_knn = knn.predict(X_test)
y_pred_log_reg = log_reg.predict(X_test)

# Calculating accuracy
accuracy_knn = calculate_accuracy(y_test, y_pred_knn)
accuracy_log_reg = calculate_accuracy(y_test, y_pred_log_reg)

# Displaying accuracy
print("Accuracy of KNN model:", accuracy_knn)
print("Accuracy of Logistic Regression model:", accuracy_log_reg)

Accuracy of KNN model: 0.48122529644268774
Accuracy of Logistic Regression model: 0.541501976284585


In [32]:
# Conclusion: The accuracy right now is not satisfactory enough. I think there are two reasons. First, we need more factors
# that having impact to stepcounts other than weigth and calorie consumption. Second, the data loaded from healthkit is not 
# good enough. For example, user0 with 67kg had 10065 steps on 2021-01-11, but only 34 calorie consumption. Hence, we have 
# reason to say the data is not convincing. 