## Create Spark Session
This sets up the PySpark environment for local processing.

In [None]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook: reuse one if it already exists,
# otherwise create it under the application name "AppleHealthAnalysis".
spark = (
    SparkSession.builder
    .appName("AppleHealthAnalysis")
    .getOrCreate()
)


Read the transformed data and show only the first 5 rows for verification.

In [None]:
# Load the exported CSV: the first line supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv(
    "../data/export.csv",
    header=True,
    inferSchema=True,
)

# Print a 5-row preview to verify the load.
df.show(n=5)


Keep only the step count, sleep analysis, and active energy burned records.

In [None]:
# Record types this analysis keeps: step count, active energy burned, sleep.
focus_types = [
    "HKQuantityTypeIdentifierStepCount",
    "HKQuantityTypeIdentifierActiveEnergyBurned",
    "HKCategoryTypeIdentifierSleepAnalysis",
]

# Keep only the rows whose `type` is one of the focus types, then preview
# the first 5 of them.
is_focus_type = df["type"].isin(focus_types)
df_filtered = df.filter(is_focus_type)
df_filtered.show(5)


Filter to the last 30 days

In [None]:
from pyspark.sql.functions import to_date, col
from datetime import datetime, timedelta

# Derive a `date` column from each record's `startDate`.
df_with_date = df_filtered.withColumn("date", to_date(col("startDate")))

# Cutoff: the day 30 days before now, formatted as YYYY-MM-DD.
today = datetime.now()
thirty_days_ago = today - timedelta(days=30)
cutoff = thirty_days_ago.strftime("%Y-%m-%d")

# Keep only records dated on or after the cutoff.
df_30days = df_with_date.filter(col("date") >= cutoff)


Then I group by record type and date and sum the values:

Convert the summary to Pandas for plotting.

In [None]:
from pyspark.sql.functions import sum as _sum

# Sum the `value` column for every (type, date) pair.
total_per_day = _sum("value").alias("total_value")
df_summary = df_30days.groupBy("type", "date").agg(total_per_day)

# Convert the aggregated result to a Pandas DataFrame for plotting.
summary_pd = df_summary.toPandas()


In [None]:
import matplotlib.pyplot as plt

# Plot Step Count
# Select the step-count rows from the per-day summary.
steps = summary_pd[summary_pd['type'] == "HKQuantityTypeIdentifierStepCount"]
# Ensure one value per date
steps = steps.groupby("date")["total_value"].sum().reset_index()

# Keep a handle on the figure so it can be saved explicitly.
fig = plt.figure(figsize=(10, 4))
plt.plot(steps["date"], steps["total_value"], marker='o')
plt.title("Daily Step Count")
plt.xlabel("Date")
plt.ylabel("Steps")
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
# Save BEFORE show(): once show() returns the figure may already be closed
# (it is with the notebook inline backend), and saving afterwards would write
# a blank image.
fig.savefig("../notebooks/step_trend.png")
plt.show()


Plot Sleep

In [None]:
# Select the sleep-analysis rows from the per-day summary and make sure there
# is exactly one total per date.
sleep = summary_pd[summary_pd['type'] == "HKCategoryTypeIdentifierSleepAnalysis"]
sleep = sleep.groupby("date")["total_value"].sum().reset_index()

# Keep a handle on the figure so it can be saved explicitly.
fig = plt.figure(figsize=(10, 4))
# NOTE(review): dividing by 60 assumes total_value is in minutes; confirm
# against the transformed export.
plt.plot(sleep["date"], sleep["total_value"] / 60, marker='o')  # convert minutes to hours
plt.title("Daily Sleep Duration (Hours)")
plt.xlabel("Date")
plt.ylabel("Sleep (hrs)")
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
# Save BEFORE show(): once show() returns the figure may already be closed
# (it is with the notebook inline backend), and saving afterwards would write
# a blank image.
fig.savefig("../notebooks/sleep_trend.png")
plt.show()


Calories / Active Energy

In [None]:
# Select the active-energy rows from the per-day summary and make sure there
# is exactly one total per date.
energy = summary_pd[summary_pd['type'] == "HKQuantityTypeIdentifierActiveEnergyBurned"]
energy = energy.groupby("date")["total_value"].sum().reset_index()

# Keep a handle on the figure so it can be saved explicitly.
fig = plt.figure(figsize=(10, 4))
plt.plot(energy["date"], energy["total_value"], marker='o', color='orange')
plt.title("Active Energy Burned")
plt.xlabel("Date")
plt.ylabel("Calories")
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
# Save BEFORE show(): once show() returns the figure may already be closed
# (it is with the notebook inline backend), and saving afterwards would write
# a blank image.
fig.savefig("../notebooks/energy_trend.png")
plt.show()
