In [6]:
import os
import boto3
import pandas as pd
from fitparse import FitFile
from io import BytesIO
import xml.etree.ElementTree as ET
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment
# before they are read below.
load_dotenv()

# Name of the storage bucket that every later cell reads from and writes to.
bucket = os.environ.get("SUPABASE_BUCKET")

# S3 client pointed at the endpoint given by SUPABASE_S3_ENDPOINT.
# Any setting that is missing from the environment is passed through as None.
s3 = boto3.client(
    's3',
    aws_secret_access_key=os.environ.get("SUPABASE_S3_SECRET"),
    aws_access_key_id=os.environ.get("SUPABASE_S3_KEY_ID"),
    endpoint_url=os.environ.get("SUPABASE_S3_ENDPOINT"),
)


In [7]:
def parse_fit_to_df(file_bytes):
    """Convert the raw bytes of a FIT file into a DataFrame.

    Every "record" message in the file becomes one row; each field of the
    message becomes a column named after ``field.name`` holding ``field.value``.

    Args:
        file_bytes: Complete content of the FIT file as ``bytes``.

    Returns:
        pandas.DataFrame with one row per "record" message (empty when the
        file contains none).
    """
    fit = FitFile(BytesIO(file_bytes))
    rows = [
        {field.name: field.value for field in message}
        for message in fit.get_messages("record")
    ]
    return pd.DataFrame(rows)

def _tcx_text(elem):
    """Return the stripped text of ``elem``, or None if it is missing or blank."""
    if elem is None or elem.text is None:
        return None
    text = elem.text.strip()
    return text or None


def parse_tcx_to_df(file_bytes):
    """Convert the raw bytes of a TCX file into a DataFrame of trackpoints.

    One row is produced per ``Trackpoint`` that carries a non-empty ``Time``.
    Columns are only present when at least one trackpoint supplies them:

    * ``timestamp``  - ``Time`` parsed with ``pd.to_datetime``
    * ``speed``      - ``Speed`` as float; a ``Speed`` found under
      ``Extensions`` overrides the trackpoint-level one
    * ``cadence``    - ``Cadence`` as float
    * ``heart_rate`` - ``HeartRateBpm/Value`` as int
    * ``power``      - ``Watts`` found under ``Extensions`` as float

    Elements that are present but empty (e.g. ``<Cadence/>``) are treated as
    missing instead of aborting the whole file.

    Args:
        file_bytes: Complete content of the TCX file (``bytes``; ``str`` is
            accepted too).

    Returns:
        pandas.DataFrame with one row per timestamped trackpoint (empty when
        there are none).

    Raises:
        xml.etree.ElementTree.ParseError: The document is not well-formed XML.
        ValueError: A numeric element contains non-numeric text.
    """
    ns = {'tcx': 'http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2'}

    # Give the parser the raw bytes so the encoding declared in the XML prolog
    # is honoured rather than assuming UTF-8.
    root = ET.fromstring(file_bytes)

    records = []
    for tp in root.findall('.//tcx:Trackpoint', ns):
        # 1. Timestamp (required): trackpoints without one are dropped.
        time_text = _tcx_text(tp.find('tcx:Time', ns))
        if time_text is None:
            continue
        record = {'timestamp': pd.to_datetime(time_text)}

        # 2. Speed (basic, directly under the trackpoint)
        speed_text = _tcx_text(tp.find('tcx:Speed', ns))
        if speed_text is not None:
            record['speed'] = float(speed_text)

        # 3. Cadence
        cadence_text = _tcx_text(tp.find('tcx:Cadence', ns))
        if cadence_text is not None:
            record['cadence'] = float(cadence_text)

        # 4. Heart rate
        hr_text = _tcx_text(tp.find('.//tcx:HeartRateBpm/tcx:Value', ns))
        if hr_text is not None:
            record['heart_rate'] = int(hr_text)

        # 5. Power (and speed) inside Extensions. Extension elements live in
        #    their own namespace, so match on the local tag name. Every
        #    descendant is scanned, so the values are found wherever the
        #    wrapper element sits among its siblings.
        extensions_elem = tp.find('tcx:Extensions', ns)
        if extensions_elem is not None:
            for elem in extensions_elem.iter():
                value_text = _tcx_text(elem)
                if value_text is None:
                    continue
                tag = elem.tag.rsplit('}', 1)[-1].lower()
                if tag == 'watts':
                    record['power'] = float(value_text)
                elif tag == 'speed':
                    record['speed'] = float(value_text)  # Override if found here

        records.append(record)
    return pd.DataFrame(records)


In [10]:
# List .fit and .tcx files in bronze/original/.
# A single list_objects_v2 call returns at most 1,000 keys, so walk every page
# to avoid silently dropping files once the prefix grows.
paginator = s3.get_paginator("list_objects_v2")
keys = []
for response in paginator.paginate(Bucket=bucket, Prefix="bronze/original/"):
    keys.extend(
        obj["Key"]
        for obj in response.get("Contents", [])
        # Case-insensitive match so ".FIT" / ".TCX" are picked up too.
        if obj["Key"].lower().endswith((".fit", ".tcx"))
    )

# Local staging directory for the Parquet files; only needs creating once.
os.makedirs("bronze_local/", exist_ok=True)

for key in keys:
    print(f"Processing: {key}")
    # Split the file name once: `stem` names the output, `extension` picks the parser.
    stem, raw_extension = os.path.splitext(os.path.basename(key))
    extension = raw_extension.lower()

    obj = s3.get_object(Bucket=bucket, Key=key)
    file_bytes = obj["Body"].read()

    # Parse. A file that cannot be parsed is reported and skipped so one bad
    # upload does not stop the rest of the batch.
    try:
        if extension == ".fit":
            df = parse_fit_to_df(file_bytes)
        elif extension == ".tcx":
            df = parse_tcx_to_df(file_bytes)
        else:
            print(f"Unknown extension: {extension}")
            continue
    except Exception as e:
        print(f"Failed to parse {key}: {e}")
        continue

    if df.empty:
        print(f"Empty DataFrame for {key}, skipping.")
        continue

    # Save to Parquet locally.
    # Build the name from the stem: str.replace would rewrite every occurrence
    # of the extension text inside the file name, not just the suffix.
    # NOTE(review): "x.fit" and "x.tcx" both map to "x.parquet"; the later one
    # overwrites the earlier - confirm source names are unique per activity.
    bronze_name = f"{stem}.parquet"
    local_path = f"bronze_local/{bronze_name}"
    df.to_parquet(local_path, index=False)

    # Upload .parquet to bronze/parquet/ in Supabase Storage
    s3_key = f"bronze/parquet/{bronze_name}"
    with open(local_path, "rb") as f:
        s3.upload_fileobj(f, bucket, s3_key)

    print(f"Saved to Supabase: {s3_key}")

Processing: bronze/original/bronze_activity_2025-04-14_493c8809.tcx
Saved to Supabase: bronze/parquet/bronze_activity_2025-04-14_493c8809.parquet
Processing: bronze/original/bronze_activity_2025-04-14_6a1e0cfe.fit
Saved to Supabase: bronze/parquet/bronze_activity_2025-04-14_6a1e0cfe.parquet
Processing: bronze/original/bronze_activity_2025-04-26_1f6a6649.fit
Saved to Supabase: bronze/parquet/bronze_activity_2025-04-26_1f6a6649.parquet
