In [54]:
from pathlib import Path
import polars as pl


def collect_csv_by_topic(root: Path) -> dict[str, list[Path]]:
    """Group the CSV files found one level below ``root`` by sub-directory name.

    Args:
        root: Directory whose immediate sub-directories are treated as topics.

    Returns:
        Mapping of sub-directory name to the ``*.csv`` paths directly inside it.
        Sub-directories without CSV files are omitted. Topics and the files
        within each topic are sorted by path so that repeated runs produce the
        same ordering (``glob`` itself makes no ordering guarantee).
    """
    collection: dict[str, list[Path]] = {}
    for subdir in sorted(root.glob("*")):
        # Plain files sitting next to the topic folders cannot contain CSVs.
        if not subdir.is_dir():
            continue
        for file in sorted(subdir.glob("*.csv")):
            collection.setdefault(subdir.name, []).append(file)
    return collection


# The project root is the parent of the current working directory.
# (`cwd` is a classmethod, so the former `Path().home().cwd()` chain never
# actually used the home directory.)
project = Path.cwd().parent
data = project / "data"
fitbitdata = data / "Takeout" / "Fitbit"

data_collection: dict[str, list[Path]] = collect_csv_by_topic(fitbitdata)



In [61]:
# Topic folder (as keyed in `data_collection`) that holds the step-count exports.
STEP_TOPIC = "Physical Activity_GoogleData"

# Materialise the matches as a list: a bare `filter(...)` iterator is consumed
# by the first pass over it and would look empty to any later cell that reuses
# `step_files`.
step_files = [
    p for p in data_collection.get(STEP_TOPIC, []) if "steps" in p.name
]

# Fail early with a readable message instead of a bare KeyError or an opaque
# error from concatenating an empty list.
if not step_files:
    raise FileNotFoundError(
        f"No CSV files with 'steps' in their name found for topic {STEP_TOPIC!r}"
    )

# Build one lazy scan per file, parsing `timestamp` as a datetime.
query_buf = [
    pl.scan_csv(step_file, schema_overrides={"timestamp": pl.Datetime})
    for step_file in step_files
]

# Stack the per-file scans into a single lazy query and show its schema.
query = pl.concat(query_buf)
query.collect_schema()

Schema([('timestamp', Datetime(time_unit='us', time_zone=None)),
        ('steps', Int64)])

In [69]:
# Hourly step totals.
# `group_by_dynamic` requires its index column to be sorted in ascending order,
# and the per-file scans are concatenated in file-discovery order, which is not
# guaranteed to be chronological -- so sort explicitly before windowing.
result = (
    query.sort("timestamp")
    .group_by_dynamic("timestamp", every="1h", closed="left")
    .agg(pl.col("steps").sum().alias("total_steps"))
)

# Execute the lazy query and materialise the aggregated frame.
df = result.collect()

In [71]:
# Summary statistics for the collected hourly frame (`timestamp`, `total_steps`).
df.describe()

statistic,timestamp,total_steps
str,str,f64
"""count""","""318""",318.0
"""null_count""","""0""",0.0
"""mean""","""2024-12-01 19:13:35.094339""",580.647799
"""std""",,761.311998
"""min""","""2024-11-22 22:00:00""",4.0
"""25%""","""2024-11-27 10:00:00""",128.0
"""50%""","""2024-12-01 17:00:00""",271.0
"""75%""","""2024-12-06 09:00:00""",717.0
"""max""","""2024-12-10 19:00:00""",3949.0
