In [3]:
from config import * 


In [42]:
# Imports
from io import BytesIO
import yaml
import logging
import os
import hashlib
import datetime
import json
import zipfile

import pandas as pd
import numpy as np
import diffprivlib.tools as dp

from bs4 import BeautifulSoup
from pathlib import Path
from syftbox.lib import Client, SyftPermission


def calculate_file_hash(filepath: str) -> str:
    """
    Calculate SHA-256 hash of a file

    Args:
        filepath (str): Path to the file to hash

    Returns:
        str: Hexadecimal string of the hash
    """
    sha256_hash = hashlib.sha256()

    with open(filepath, 'rb') as f:
        # Read the file in chunks to handle large files efficiently
        for chunk in iter(lambda: f.read(4096), b''):
            sha256_hash.update(chunk)

    return sha256_hash.hexdigest()


def should_run(filepath: str) -> bool:
    '''
    Check whether current file on filepath hash is the same with the one last recorded

    Args:
        filepath (str): Path to the file to check

    Returns:
        bool: True if file has changed or no previous hash exists, False otherwise
    '''
    hashes_file = f"./hashes/{API_NAME}_last_run"

    # Calculate current file hash
    current_hash = calculate_file_hash(filepath)

    # If hashes directory or file doesn't exist, we should run
    if not os.path.exists(hashes_file):
        return True

    try:
        with open(hashes_file, 'r') as f:
            stored_hash = json.load(f).get('hash')

        # Return True if hashes are different (file has changed)
        return current_hash != stored_hash

    except (json.JSONDecodeError, KeyError):
        # If there's any error reading the hash, we should run to be safe
        return True


def record_filehash(filepath: str) -> None:
    '''
    Store the current filepath hash 

    Args:
        filepath (str): Path to the file whose hash should be stored
    '''
    hashes_file = f"./hashes/{API_NAME}_last_run"
    current_hash = calculate_file_hash(filepath)

    # Create hashes directory if it doesn't exist
    os.makedirs(os.path.dirname(hashes_file), exist_ok=True)

    # Store hash in JSON format with timestamp for debugging purposes
    hash_data = {
        'hash': current_hash,
        'timestamp': datetime.datetime.now().isoformat()
    }

    with open(hashes_file, 'w') as f:
        json.dump(hash_data, f, indent=2)


def validate_config(config):
    required_keys = ['filepath', 'parameters']
    required_params = ['type', 'epsilon', 'bounds']

    if not all(key in config for key in required_keys):
        raise ValueError(f"Missing required config keys: {required_keys}")
    if not all(param in config['parameters'] for param in required_params):
        raise ValueError(f"Missing required parameters: {required_params}")
    if config['parameters']['epsilon'] <= 0:
        raise ValueError("Epsilon must be positive")


# Following code is from https://github.com/OpenMined/cpu_tracker_member/blob/main/main.py
def create_restricted_public_folder(filepath: Path) -> None:
    """
    Create an output folder for Health Steps data within the specified path.

    This function creates a directory structure for storing Health Steps data under `api_data`. If the directory
    already exists, it will not be recreated. Additionally, default permissions for accessing the created folder are set using the
    `SyftPermission` mechanism to allow the data to be read by an aggregator.

    Args:
        path (Path): The base path where the output folder should be created.

    """
    os.makedirs(filepath, exist_ok=True)

    # Set default permissions for the created folder
    permissions = SyftPermission.datasite_default(email=client.email)
    permissions.read.append(AGGREGATOR_DATASITE)
    permissions.save(filepath)


def create_private_folder(filepath: Path) -> Path:
    """
    Create a private folder for Health Steps data within the specified path.

    This function creates a directory structure for storing Health Steps data under `private/filepath`.
    If the directory already exists, it will not be recreated. Additionally, default permissions for
    accessing the created folder are set using the `SyftPermission` mechanism, allowing the data to be
    accessible only by the owner's email.

    Args:
        path (Path): The base path where the output folder should be created.

    Returns:
        Path: The path to the created directory.
    """
    path: Path = filepath / "private" / "health_steps_counter"
    os.makedirs(path, exist_ok=True)

    # Set default permissions for the created folder
    permissions = SyftPermission.datasite_default(email=client.email)
    permissions.save(path)

    return path


# Following code is from https://github.com/OpenMined/cpu_tracker_member/blob/main/main.py
def convert_record_to_dict(record):
    data = {
        'type': record.get('type'),
        'source_name': record.get('sourceName'),
        'source_version': record.get('sourceVersion'),
        'unit': record.get('unit'),
        'value': record.get('value'),
        'creation_date': record.get('creationDate'),
        'start_date': record.get('startDate'),
        'end_date': record.get('endDate')
    }

    return data

def read_apple_health(filepath, type_parameter=None):
    logger.info("Loading health records ...")

    if filepath.endswith('.zip'):
        logger.info("Unzipping the file")
        with open(filepath, 'rb') as f:
            data = f.read()

        with zipfile.ZipFile(BytesIO(data)) as zip_ref:
            with zip_ref.open('apple_health_export/export.xml') as f:
                file_content = f.read()
    else:
        with open(filepath, 'r') as f:
            file_content = f.read()

    if not file_content:
        logger.error("No export file found named")

    soup = BeautifulSoup(file_content, features='xml')
    records = soup.find_all("Record")

    data_list = []
    if type_parameter: 
        for record in records:
            if record.get("type") == type_parameter:
                data_list.append(convert_record_to_dict(record))
    else:
        for record in records:
            data_list.append(convert_record_to_dict(record))

    logger.info("Corresponding health records loaded and parsed")
    # Create the initial dataframe from the XML file, and perform cleansing / preparation

    return pd.DataFrame(data_list)


def clean_up_df(df):

    # Columns that should be converted to float
    float_columns = ['value']
    # Columns that should be converted to datetime
    datetime_columns = ['creation_date', 'start_date', 'end_date']

    df = df.copy()

    df[float_columns] = df[float_columns].apply(
        pd.to_numeric, errors='coerce')
    df[datetime_columns] = df[datetime_columns].apply(
        pd.to_datetime, errors='coerce')

    # Use end date as the comparison date
    df['date'] = df['end_date'].dt.strftime("%Y-%m-%d")

    return df



logger = logging.getLogger(__name__)
logger.info("Started health steps counter")

try:
    with open('config.yaml', 'r') as file:
        config = yaml.safe_load(file)
    validate_config(config)
except ValueError as e:
    logger.error(str(e))
    exit()
except FileNotFoundError as e:
    logger.error(
        "config.yaml not found. Please create config.yaml (see readme for details)!")
    exit()

epsilon = PARAMETERS['epsilon']
bounds_config = PARAMETERS['bounds']

df = read_apple_health(FILEPATH)
# df = clean_up_df(df)

2024-12-10 22:51:52 - INFO: Started health steps counter
2024-12-10 22:51:52 - INFO: Loading health records ...
2024-12-10 22:52:38 - INFO: Corresponding health records loaded and parsed


In [43]:
df['creation_date'] = df['creation_date'].dt.tz_localize(None)
df['start_date'] = df['start_date'].dt.tz_localize(None)
df['end_date'] = df['end_date'].dt.tz_localize(None)


AttributeError: Can only use .dt accessor with datetimelike values

In [44]:
df.to_excel("data.xlsx")

In [None]:
df['type'].unique()

array(['HKQuantityTypeIdentifierForcedVitalCapacity',
       'HKQuantityTypeIdentifierForcedExpiratoryVolume1',
       'HKQuantityTypeIdentifierBodyMassIndex',
       'HKQuantityTypeIdentifierHeight',
       'HKQuantityTypeIdentifierBodyMass',
       'HKQuantityTypeIdentifierHeartRate',
       'HKQuantityTypeIdentifierOxygenSaturation',
       'HKQuantityTypeIdentifierRespiratoryRate',
       'HKQuantityTypeIdentifierBodyFatPercentage',
       'HKQuantityTypeIdentifierLeanBodyMass',
       'HKQuantityTypeIdentifierStepCount',
       'HKQuantityTypeIdentifierDistanceWalkingRunning',
       'HKQuantityTypeIdentifierBasalEnergyBurned',
       'HKQuantityTypeIdentifierActiveEnergyBurned',
       'HKQuantityTypeIdentifierFlightsClimbed',
       'HKQuantityTypeIdentifierAppleExerciseTime',
       'HKQuantityTypeIdentifierDistanceCycling',
       'HKQuantityTypeIdentifierDistanceSwimming',
       'HKQuantityTypeIdentifierSwimmingStrokeCount',
       'HKQuantityTypeIdentifierWaistCircumference

In [37]:
dp.sum(
                    [1,2,3,5,10,22,50],
                    epsilon=0.5,
                    bounds=(1, 50),
                ),

(7.0,)

In [2]:
import tenseal as ts
import base64
import json 

In [3]:
def create_he_context():
    # Setup TenSEAL context
    context = ts.context(
        ts.SCHEME_TYPE.CKKS,
        poly_modulus_degree=8192,
        coeff_mod_bit_sizes=[60, 40, 40, 60]
    )
    context.generate_galois_keys()
    context.global_scale = 2**40

    serialized_context = context.serialize()

    with open('context.bin', 'wb') as f:
        serialized_context = base64.b64encode(serialized_context)
        f.write(serialized_context)
        logger.info(type(serialized_context))
        # Check if we actually got any data
        logger.info(len(serialized_context))
        

In [7]:
def create_bfvhe_context():
    # Setup TenSEAL context
    context = ts.context(
        ts.SCHEME_TYPE.BFV,
        poly_modulus_degree=8192,
        plain_modulus=1032193,  # This is specific to BFV
        coeff_mod_bit_sizes=[60, 40, 40, 60]
    )
    return context

In [8]:
create_bfvhe_context()

<tenseal.enc_context.Context at 0x1071f85b0>

In [60]:
create_he_context()

with open('context.bin', 'rb') as f:
    serialized_data = bytes(f.read())
    a = ts.context_from(base64.b64decode(serialized_data))

2024-12-11 16:07:10 - INFO: <class 'bytes'>
2024-12-11 16:07:10 - INFO: 47305292


In [56]:
a

<tenseal.enc_context.Context at 0x3b06ef670>

In [55]:
!python --version

Python 3.10.15


In [61]:
ts.__version__

'0.3.15'