# 03 · EDA — Cardiovascular (Enhanced)

Interactive + weekly aggregation + consolidated Excel summary.

**Inputs**: `features_cardiovascular.csv`, `features_daily_updated.csv`, `extract_manifest.json`, `cardio_manifest.json`
**Outputs**: PNG/HTML figures in `eda_outputs/`, JSON + Excel summaries.


In [13]:
# Plotly renderer robusto para VSCode/Jupyter/Browser
import os
import plotly.io as pio; pio.renderers.default = "browser"; pio.renderers.default

def _pick_renderer():
    # VSCode
    if os.environ.get("VSCODE_PID"):
        return "vscode"
    # JupyterLab/Notebook recentes
    for r in ("notebook_connected", "jupyterlab"):
        if r in pio.renderers:
            return r
    # Fallback universal (salva <iframe>)
    return "browser"

pio.renderers.default = _pick_renderer()
print("Plotly renderer:", pio.renderers.default)


Plotly renderer: notebook_connected


In [2]:
import os, json, re
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
from datetime import datetime

PID = 'P000001'
SNAP = '2025-09-29'  # <- altere aqui se quiser outro snapshot
AGG = 'median'       # 'median' ou 'mean' para weekly aggregation

SNAPDIR = Path('data_ai')/PID/'snapshots'/SNAP
OUTDIR  = SNAPDIR/'eda_outputs'
OUTDIR.mkdir(parents=True, exist_ok=True)
print('SNAPDIR =', SNAPDIR)
pio.renderers.default = 'notebook'


SNAPDIR = data_ai\P000001\snapshots\2025-09-29


## 01 | Read manifests (reproducibility)

In [3]:
extract_manifest = {}
cardio_manifest = {}
em = SNAPDIR/'extract_manifest.json'
cm = SNAPDIR/'cardio_manifest.json'
if em.exists(): extract_manifest = json.loads(em.read_text(encoding='utf-8'))
if cm.exists(): cardio_manifest  = json.loads(cm.read_text(encoding='utf-8'))
print('extract_manifest keys:', list(extract_manifest.keys()))
print('cardio_manifest keys:', list(cardio_manifest.keys()))


extract_manifest keys: []
cardio_manifest keys: []


## 02 | Load data & QC

In [4]:
def _read_csv(path: Path, dtcols=('date',)):
    if not path.exists(): return pd.DataFrame()
    return pd.read_csv(path, parse_dates=[c for c in dtcols if c in pd.read_csv(path, nrows=0).columns])

feat_cardio = _read_csv(SNAPDIR/'features_cardiovascular.csv', ('date',))
feat_dailyu = _read_csv(SNAPDIR/'features_daily_updated.csv', ('date',))

def qc_report(df: pd.DataFrame, name: str):
    if df.empty:
        return {'name': name, 'empty': True}
    cols_num = [c for c in df.columns if c!='date' and pd.api.types.is_numeric_dtype(df[c])]
    rep = {
        'name': name,
        'empty': False,
        'n_rows': int(len(df)),
        'date_min': str(df['date'].min().date()) if 'date' in df else None,
        'date_max': str(df['date'].max().date()) if 'date' in df else None,
        'n_na_total': int(df.isna().sum().sum()),
        'n_num_cols': len(cols_num)
    }
    return rep

qc_cardio = qc_report(feat_cardio, 'features_cardiovascular')
qc_dailyu = qc_report(feat_dailyu, 'features_daily_updated')
print(qc_cardio); print(qc_dailyu)

(SNAPDIR/'eda_outputs'/'eda_qc.json').write_text(json.dumps({'cardio': qc_cardio, 'dailyu': qc_dailyu}, indent=2), encoding='utf-8')


{'name': 'features_cardiovascular', 'empty': True}
{'name': 'features_daily_updated', 'empty': True}


153

## 03 | Temporal trends (HR / HRV) — static + interactive

In [5]:
if feat_cardio.empty:
    print('features_cardiovascular.csv is empty — skipping temporal trends')
else:
    df = feat_cardio.copy().sort_values('date')
    # heuristics to detect columns
    hr_cols  = [c for c in df.columns if re.search(r'(^|_)hr_mean(_|$)', c)]
    hrv_cols = [c for c in df.columns if re.search(r'hrv.*(sdnn|sdnn_ms).*mean', c)]
    # ---- matplotlib (static)
    if hr_cols:
        plt.figure(figsize=(11,3))
        plt.plot(df['date'], df[hr_cols[0]])
        plt.title(f'Daily HR mean — {hr_cols[0]}'); plt.xlabel('date'); plt.ylabel('bpm'); plt.tight_layout()
        plt.savefig(OUTDIR/'trend_hr_mean.png'); plt.show()
    if hrv_cols:
        plt.figure(figsize=(11,3))
        plt.plot(df['date'], df[hrv_cols[0]])
        plt.title(f'Daily HRV (SDNN) mean — {hrv_cols[0]}'); plt.xlabel('date'); plt.ylabel('ms'); plt.tight_layout()
        plt.savefig(OUTDIR/'trend_hrv_sdnn_mean.png'); plt.show()
    # ---- plotly (interactive)
    if hr_cols:
        fig = px.line(df, x='date', y=hr_cols[0], title=f'HR mean (interactive) — {hr_cols[0]}')
        fig.write_html(str(OUTDIR/'trend_hr_mean.html'))
        fig.show()
    if hrv_cols:
        fig = px.line(df, x='date', y=hrv_cols[0], title=f'HRV SDNN mean (interactive) — {hrv_cols[0]}')
        fig.write_html(str(OUTDIR/'trend_hrv_sdnn_mean.html'))
        fig.show()


features_cardiovascular.csv is empty — skipping temporal trends


## 04 | Weekly aggregation (median/mean)

In [6]:
weekly = pd.DataFrame()
if not feat_cardio.empty:
    df = feat_cardio.copy().set_index('date').sort_index()
    num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    aggfn = np.median if AGG=='median' else np.mean
    weekly = df[num_cols].resample('W').apply(aggfn)
    weekly.index.name = 'week'
    weekly.to_csv(OUTDIR/'weekly_cardio.csv')
    display(weekly.head())
    # interactive pair: HR vs HRV weekly
    hr_cols  = [c for c in weekly.columns if re.search(r'(^|_)hr_mean(_|$)', c)]
    hrv_cols = [c for c in weekly.columns if re.search(r'hrv.*(sdnn|sdnn_ms).*mean', c)]
    if hr_cols and hrv_cols:
        wdf = weekly.reset_index()
        fig = px.scatter(wdf, x=hr_cols[0], y=hrv_cols[0], trendline='ols', title=f'Weekly {AGG}: HR vs HRV')
        fig.write_html(str(OUTDIR/'weekly_hr_vs_hrv.html'))
        fig.show()


## 05 | Correlations (daily + weekly)

In [7]:
def corr_plot(df: pd.DataFrame, title: str, outpng: Path):
    if df.empty: return
    corr = df.corr(method='spearman')
    plt.figure(figsize=(7,6))
    plt.imshow(corr.values, aspect='auto')
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.index)), corr.index)
    plt.colorbar(); plt.title(title); plt.tight_layout(); plt.savefig(outpng); plt.show()

if not feat_cardio.empty:
    dfnum = feat_cardio.drop(columns=['date']).select_dtypes(include=[np.number])
    corr_plot(dfnum, 'Spearman correlation (daily cardio)', OUTDIR/'corr_daily.png')
if not weekly.empty:
    corr_plot(weekly, 'Spearman correlation (weekly cardio)', OUTDIR/'corr_weekly.png')

# plotly interactive corr (daily)
if not feat_cardio.empty:
    dfnum = feat_cardio.drop(columns=['date']).select_dtypes(include=[np.number])
    corr = dfnum.corr(method='spearman')
    fig = px.imshow(corr, aspect='auto', title='Spearman correlation (daily cardio, interactive)')
    fig.write_html(str(OUTDIR/'corr_daily.html'))
    fig.show()


## 06 | Segment analysis (S1–S6)

In [8]:
seg_stats = pd.DataFrame()
if not feat_cardio.empty and 'segment_id' in feat_cardio.columns:
    seg_counts = feat_cardio['segment_id'].value_counts(dropna=False).sort_index()
    print('segment counts:\n', seg_counts)
    seg_counts.to_csv(OUTDIR/'segments_counts.csv')
    # primeira métrica de HR para boxplot
    hrcols = [c for c in feat_cardio.columns if re.search(r'(^|_)hr_mean(_|$)', c)]
    if hrcols:
        data = [feat_cardio.loc[feat_cardio['segment_id']==sid, hrcols[0]].dropna().values for sid in sorted(feat_cardio['segment_id'].dropna().unique())]
        plt.figure(figsize=(8,3))
        plt.boxplot(data)
        plt.title(f'HR mean by segment ({hrcols[0]})'); plt.tight_layout(); plt.savefig(OUTDIR/'box_hr_by_segment.png'); plt.show()
    # estatísticas agregadas por segmento
    numcols = [c for c in feat_cardio.columns if c!='date' and pd.api.types.is_numeric_dtype(feat_cardio[c])]
    seg_stats = feat_cardio.groupby('segment_id')[numcols].agg(['median','mean','std','count'])
    seg_stats.to_csv(OUTDIR/'segment_stats.csv')
    display(seg_stats.head())


## 07 | Label preview (if present)

In [9]:
label_cols = [c for c in feat_dailyu.columns if c.lower()=='label'] if not feat_dailyu.empty else []
if label_cols:
    df = feat_dailyu[['date', label_cols[0]]].merge(feat_cardio, on='date', how='left')
    lab = label_cols[0]
    print('label distribution:\n', df[lab].value_counts(dropna=False))
    hrcols = [c for c in feat_cardio.columns if re.search(r'(^|_)hr_mean(_|$)', c)]
    if hrcols:
        groups = [g.dropna().values for _, g in df.groupby(lab)[hrcols[0]]]
        plt.figure(figsize=(6,3))
        plt.boxplot(groups)
        plt.title(f'{hrcols[0]} by label'); plt.tight_layout(); plt.savefig(OUTDIR/'box_hr_by_label.png'); plt.show()
else:
    print('No label column found; skipping label preview.')


No label column found; skipping label preview.


## 08 | Export consolidated summary (JSON + XLSX)

In [10]:
summ = {
    'pid': PID,
    'snapshot': SNAP,
    'manifests': {
        'extract': (extract_manifest.get('export_sha256','') if extract_manifest else ''),
        'cardio_outputs': list((cardio_manifest.get('outputs') or {}).keys()) if cardio_manifest else [],
    },
    'qc': {'cardio': qc_cardio, 'dailyu': qc_dailyu},
}
(OUTDIR/'eda_summary.json').write_text(json.dumps(summ, indent=2), encoding='utf-8')

# XLSX with multiple sheets
xlsx_path = OUTDIR/'eda_summary.xlsx'
with pd.ExcelWriter(xlsx_path, engine='openpyxl') as xw:
    # QC
    pd.DataFrame([qc_cardio]).to_excel(xw, index=False, sheet_name='qc_cardio')
    pd.DataFrame([qc_dailyu]).to_excel(xw, index=False, sheet_name='qc_dailyu')
    # Daily describe
    if not feat_cardio.empty:
        num = feat_cardio.drop(columns=['date']).select_dtypes(include=[np.number])
        num.describe().T.to_excel(xw, sheet_name='daily_describe')
    # Weekly
    if not weekly.empty:
        weekly.to_excel(xw, sheet_name='weekly')
    # Segment stats
    if 'seg_stats' in globals() and isinstance(seg_stats, pd.DataFrame) and not seg_stats.empty:
        seg_stats.to_excel(xw, sheet_name='segment_stats')

print('Wrote:', xlsx_path)


Wrote: data_ai\P000001\snapshots\2025-09-29\eda_outputs\eda_summary.xlsx


In [11]:
from pathlib import Path
import pandas as pd, re

print("SNAPDIR =", SNAPDIR)
files = [
    SNAPDIR/"features_cardiovascular.csv",
    SNAPDIR/"features_daily_updated.csv",
    SNAPDIR/"per-metric"/"apple_heart_rate.csv",
    SNAPDIR/"per-metric"/"apple_hrv_sdnn.csv",
    SNAPDIR/"per-metric"/"apple_sleep_intervals.csv",
]
for f in files:
    print(f.name, "→", f.exists(), (f.stat().st_size if f.exists() else 0))

fc = pd.read_csv(SNAPDIR/"features_cardiovascular.csv", parse_dates=["date"])
print("features_cardiovascular shape:", fc.shape)
hr_cols  = [c for c in fc.columns if re.search(r'(^|_)hr_mean(_|$)', c)]
hrv_cols = [c for c in fc.columns if re.search(r'hrv.*(sdnn|sdnn_ms).*mean', c)]
print("hr_cols:", hr_cols)
print("hrv_cols:", hrv_cols)


SNAPDIR = data_ai\P000001\snapshots\2025-09-29
features_cardiovascular.csv → False 0
features_daily_updated.csv → False 0
apple_heart_rate.csv → False 0
apple_hrv_sdnn.csv → False 0
apple_sleep_intervals.csv → False 0


FileNotFoundError: [Errno 2] No such file or directory: 'data_ai\\P000001\\snapshots\\2025-09-29\\features_cardiovascular.csv'

In [14]:
import pandas as pd, re
fc = pd.read_csv(SNAPDIR/"features_cardiovascular.csv", parse_dates=["date"])
print("shape:", fc.shape)
print("cols:", list(fc.columns)[:20])
hr_cols  = [c for c in fc.columns if re.search(r'(^|_)hr_mean(_|$)', c)]
hrv_cols = [c for c in fc.columns if re.search(r'hrv.*(sdnn|sdnn_ms).*mean', c)]
print("hr_cols:", hr_cols)
print("hrv_cols:", hrv_cols)


FileNotFoundError: [Errno 2] No such file or directory: 'data_ai\\P000001\\snapshots\\2025-09-29\\features_cardiovascular.csv'