In [None]:
import pandas as pd
import numpy as np

import diffprivlib.tools as dp

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Location of the XML file to analyse. Left empty here: set it to a real file
# path before running the later cells, which open this path and parse its contents.
pathname = ''

In [None]:
# Read the whole file into memory as text.
# The encoding is pinned explicitly: without it, open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system and would
# mis-decode any non-ASCII text in the file.
# NOTE(review): assumes the file is UTF-8 (the XML default) — confirm against
# the encoding named in the file's XML declaration.
with open(pathname, 'r', encoding='utf-8') as f:
    data = f.read()

In [None]:
# Parse the file contents as XML (features='xml').
soup = BeautifulSoup(data, features='xml')

In [None]:
# Collect the elements named "Record" from the parsed document.
records = soup.find_all("Record")

In [None]:
def convert_record_to_dict(record):
    """Flatten one record's attributes into a plain dict with snake_case keys.

    Each attribute is looked up with ``record.get`` and whatever that lookup
    returns is stored unchanged under the corresponding output key.

    Parameters
    ----------
    record
        Any object exposing ``get(attribute_name)``.

    Returns
    -------
    dict
        Keys, in order: type, source_name, source_version, unit, value,
        creation_date, start_date, end_date.
    """
    # (output key, attribute name on the record), listed in output order
    attribute_map = (
        ('type', 'type'),
        ('source_name', 'sourceName'),
        ('source_version', 'sourceVersion'),
        ('unit', 'unit'),
        ('value', 'value'),
        ('creation_date', 'creationDate'),
        ('start_date', 'startDate'),
        ('end_date', 'endDate'),
    )
    return {key: record.get(attribute) for key, attribute in attribute_map}
    

In [None]:
# Record type to keep; records whose "type" attribute differs are skipped.
type_parameter = "HKQuantityTypeIdentifierStepCount"

# Flatten each matching record into a dict, in the order they occur in `records`.
data_list = [
    convert_record_to_dict(entry)
    for entry in records
    if entry.get("type") == type_parameter
]

In [None]:
# One row per kept record; the columns are the keys of the per-record dicts.
df = pd.DataFrame(data_list)

In [None]:
# Column groups, named after the dtype each group is coerced to.
float_columns = ['value']  # extend with further numeric columns as required
datetime_columns = ['creation_date', 'start_date', 'end_date']

# Only columns that actually exist in the frame are converted.
present_numeric = [name for name in float_columns if name in df.columns]
present_datetime = [name for name in datetime_columns if name in df.columns]

# errors='coerce': entries that cannot be parsed become NaN / NaT instead of raising.
for name in present_numeric:
    df[name] = pd.to_numeric(df[name], errors='coerce')

for name in present_datetime:
    df[name] = pd.to_datetime(df[name], errors='coerce')

# Each record is assigned to a day via its end timestamp, formatted as YYYY-MM-DD.
end_timestamps = df['end_date']
df['date'] = end_timestamps.dt.strftime("%Y-%m-%d")

In [None]:
# Per-day aggregates of 'value': its sum and the number of entries counted,
# with 'date' moved back from the index into a regular column.
summary_df = df.groupby('date')['value'].agg(['sum', 'count']).reset_index()

In [None]:
# Give the columns descriptive names (replacing, in order: date, sum, count).
summary_df.columns = ['date', 'step_count', 'step_entries']

In [None]:
# Show the per-day summary table.
summary_df

In [None]:
# Summary statistics of the non-private per-day figures.
summary_df.describe()

# Create the differentially private daily step sums and entry counts

In [None]:
# Privacy budget handed to EACH mechanism call below. Every date gets one
# dp.sum and one dp.count_nonzero over the same records, so (by sequential
# composition) releasing one day's pair of figures spends 2 * epsilon.
epsilon = 0.5

# Fixed, data-independent clipping range for a single record's value.
# The bounds given to dp.sum must not be computed from the data being
# protected: using record_values.max() as the upper bound reveals the true
# per-day maximum and voids the differential-privacy guarantee.
# NOTE(review): 2000 is an assumed cap on the steps held by one record —
# confirm it against the recording sources. Values outside the range are
# clipped to it, and a wider range means more noise in the released sum.
step_bounds = (1, 2000)

dp_rows = []

# Single pass over the frame instead of re-filtering it once per date.
# sort=False keeps the dates in first-appearance order (the order that
# df['date'].unique() yields); rows with a missing date are left out, as they
# are in the non-private groupby summary.
for date, record_values in df.groupby('date', sort=False)['value']:
    # Values that failed numeric parsing are NaN; drop them so they neither
    # turn the sum into NaN nor get counted as non-zero entries.
    record_values = record_values.dropna()
    dp_rows.append({
        'date': date,
        'dp_step_count': dp.sum(
            record_values,
            epsilon=epsilon,
            bounds=step_bounds),
        'dp_step_entries': dp.count_nonzero(
            record_values,
            epsilon=epsilon
        )
    })

dp_df = pd.DataFrame(dp_rows)

In [None]:
# Summary statistics of the differentially private per-day figures.
dp_df.describe()

In [None]:
# The same statistics for the non-private figures, for comparison with dp_df.
summary_df.describe()   

In [None]:
# Write both per-day tables to JSON files in the working directory:
# the non-private summary first, then its differentially private counterpart.
for frame, out_path in (
    (summary_df, "daily_steps.json"),
    (dp_df, "dp_daily_steps.json"),
):
    frame.to_json(out_path)