In [None]:
import sys, os
# Resolve the directory two levels above the current working directory and
# make sure it is the first entry on sys.path (presumably so the project-local
# `dataspaces_utils` package imported below can be found — confirm layout).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)
from dataspaces_utils.common import get_s3_client, bucket, pd
import xml.etree.ElementTree as ET
import pandas as pd
from fitparse import FitFile
import os
from io import BytesIO  
import uuid

In [None]:
#Create S3 client for Supabase
s3 = get_s3_client()

#List files in "raw/" folder
# A single list_objects_v2 call returns only one page of keys, so keep asking
# for the next page until the response is no longer marked as truncated.
list_kwargs = {"Bucket": bucket, "Prefix": "raw/"}
while True:
    response = s3.list_objects_v2(**list_kwargs)
    for obj in response.get("Contents", []):
        print(obj["Key"])
    next_token = response.get("NextContinuationToken")
    # Stop on the last page; also stop if a truncated page carries no token,
    # otherwise the same page would be requested forever.
    if not response.get("IsTruncated") or not next_token:
        break
    list_kwargs["ContinuationToken"] = next_token

raw/!!DO_NOT_DELETE!!.txt
raw/Evening_Ride.fit
raw/Evening_Ride.tcx
raw/Solo_Ride.fit


In [None]:
#Function to extract date from activity data
def get_activity_date(file_bytes, extension):
    """Return the activity date found in the raw bytes of a .fit or .tcx file.

    Parameters
    ----------
    file_bytes : bytes
        Complete contents of the activity file.
    extension : str
        File extension including the leading dot (".fit" or ".tcx").
        Matching is case-insensitive, so ".FIT" and ".TCX" are accepted.

    Returns
    -------
    str
        The date as "YYYY-MM-DD", taken from the first usable `timestamp`
        field of a FIT `record` message or from the first
        `Trackpoint/Time` element of a TCX document. Returns "unknown" when
        the extension is not supported, no date is present, or parsing fails.

    Notes
    -----
    Parse errors are not raised; they are printed and reported as "unknown".
    """
    ext = (extension or "").lower()

    if ext == ".fit":
        try:
            fitfile = FitFile(BytesIO(file_bytes))
            for record in fitfile.get_messages('record'):
                for field in record:
                    # A timestamp whose value is missing or not datetime-like
                    # is skipped so that a later record can still supply the
                    # date, instead of aborting the whole file.
                    if field.name == "timestamp" and hasattr(field.value, "date"):
                        return field.value.date().isoformat()
        except Exception as e:
            print(f"Failed to parse .fit file: {e}")
    elif ext == ".tcx":
        try:
            # Hand the parser raw bytes so the encoding declared in the XML
            # prolog is honoured rather than assuming UTF-8. Leading
            # whitespace is dropped first because an XML declaration that is
            # not at the very start of the document is a parse error.
            root = ET.fromstring(file_bytes.lstrip())
            namespaces = {'tcx': 'http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2'}
            time_elem = root.find('.//tcx:Trackpoint/tcx:Time', namespaces)
            time_text = (time_elem.text or "").strip() if time_elem is not None else ""
            # An empty <Time/> element carries no date; fall through to "unknown".
            if time_text:
                return pd.to_datetime(time_text).date().isoformat()
        except Exception as e:
            print(f"Failed to parse .tcx file: {e}")
    return "unknown"


In [None]:
# Collect every key under raw/ before moving anything. One list_objects_v2 call
# returns only a single page, so follow the continuation token until the
# listing is complete; gathering the keys up front also means the prefix is not
# being modified while it is still being paged through.
raw_keys = []
list_kwargs = {"Bucket": bucket, "Prefix": "raw/"}
while True:
    response = s3.list_objects_v2(**list_kwargs)
    for obj in response.get("Contents", []):
        raw_keys.append(obj["Key"])
    next_token = response.get("NextContinuationToken")
    # Stop on the last page, or if a truncated page unexpectedly has no token.
    if not response.get("IsTruncated") or not next_token:
        break
    list_kwargs["ContinuationToken"] = next_token

for raw_key in raw_keys:
    # Only activity files are moved; match the extension case-insensitively so
    # uploads such as "Ride.FIT" are not silently left behind in raw/.
    if not raw_key.lower().endswith((".fit", ".tcx")):
        continue

    # Download raw bytes only for parsing (safe, read-only)
    raw_obj = s3.get_object(Bucket=bucket, Key=raw_key)
    file_bytes = raw_obj["Body"].read()
    # Lower-case the extension so the parser dispatch and the bronze key are
    # consistent regardless of how the uploaded file was named.
    extension = os.path.splitext(raw_key)[-1].lower()

    # Extract activity date
    activity_date = get_activity_date(file_bytes, extension)
    if activity_date == "unknown":
        print(f"Could not extract date from {raw_key}")
        continue

    # Random 8-character suffix keeps two activities from the same day apart.
    short_id = str(uuid.uuid4())[:8]
    new_key = f"bronze/original/bronze_activity_{activity_date}_{short_id}{extension}"

    # Rename file, copy to bronze/original and delete from raw folder.
    # The delete runs only after copy_object has returned; this assumes the
    # client raises on a failed copy (as a boto3 client does) — TODO confirm
    # for the client returned by get_s3_client().
    s3.copy_object(
        Bucket=bucket,
        CopySource={'Bucket': bucket, 'Key': raw_key},
        Key=new_key
    )
    s3.delete_object(Bucket=bucket, Key=raw_key)

    print(f"Moved: {raw_key} ➜ {new_key}")


Moved: raw/Evening_Ride.fit ➜ bronze/original/bronze_activity_2025-04-14_bf81db1b.fit
Moved: raw/Evening_Ride.tcx ➜ bronze/original/bronze_activity_2025-04-14_f5a06ce9.tcx
Moved: raw/Solo_Ride.fit ➜ bronze/original/bronze_activity_2025-04-26_2ff07ffe.fit
