# NB-01_EDA-ANY-DAILY-overview

Purpose: Run EDA on the public daily subset only (`features_daily_public.csv`).

In [None]:
# Create the output tree (figures / tables / manifests) below notebooks/outputs.
from pathlib import Path

OUT = Path('notebooks') / 'outputs'
for subdir in ('figures', 'tables', 'manifests'):
    # parents/exist_ok keep this cell safe to re-run.
    (OUT / subdir).mkdir(parents=True, exist_ok=True)
print('out', OUT)

In [None]:
# Locate the public daily subset CSV and fail fast when none is available.
from glob import glob

# glob() returns matches in filesystem-dependent order; sorting makes the
# selected file reproducible when several participants/snapshots exist.
c = sorted(glob('data/ai/*/snapshots/*/public_subset/features_daily_public.csv'))
print('candidates', c)
assert c, 'no public subset'
INPUT = Path(c[0])
print('INPUT', INPUT)

In [None]:
# Load the public subset and verify the minimal schema.
import pandas as pd

df = pd.read_csv(INPUT)
print('loaded', df.shape)

# date_utc is mandatory; at least one of the two signal columns must be present.
assert 'date_utc' in df.columns
assert any(col in df.columns for col in ('hr_mean', 'sleep_total_minutes'))

In [None]:
import hashlib

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# Descriptive statistics of the numeric columns, one row per column.
numeric_summary = df.select_dtypes(include=[np.number]).describe().transpose()
numeric_summary.to_csv('notebooks/outputs/tables/summary_stats.csv')

# First 20 rows as a quick preview table.
preview = df.head(20)
preview.to_csv('notebooks/outputs/tables/head.csv', index=False)

print('wrote tables')

In [None]:
import hashlib
import json
import platform

# Minimal provenance record: environment label, interpreter version,
# and the input file's path together with its SHA-256 digest.
input_digest = hashlib.sha256(INPUT.read_bytes()).hexdigest()
manifest = {
    'env': 'LOCAL',
    'python': platform.python_version(),
    'input': {'path': str(INPUT), 'sha256': input_digest},
}

manifest_file = Path('notebooks') / 'outputs' / 'manifests' / 'run_manifest.json'
manifest_file.write_text(json.dumps(manifest, indent=2, sort_keys=True))
print('manifest written')

# NB-01_EDA-ANY-DAILY-overview

Purpose: Run EDA on the public daily subset only (`features_daily_public.csv`).
- Environment detection: LOCAL / KAGGLE / COLAB
- Hard block: notebook must not read `data/raw/**` or `data/etl/**` (AI-only guard)
- Save non-sensitive figures/tables to `notebooks/outputs/{figures,tables,manifests}`


# NB-01_EDA-ANY-DAILY-overview

Purpose: Run EDA on the public daily subset only (`features_daily_public.csv`).
- Environment detection: LOCAL / KAGGLE / COLAB
- Hard block: notebook must not read `data/raw/**` or `data/etl/**` (AI-only guard)
- Save non-sensitive figures/tables to `notebooks/outputs/{figures,tables,manifests}`

Instructions:
1. Run all cells top-to-bottom. The notebook bootstraps minimal dependencies if missing (idempotent).
2. It will locate the public subset CSV under `data/ai/*/snapshots/*/public_subset/features_daily_public.csv`.
3. The notebook enforces a schema guard: `date_utc` AND at least one of (`hr_mean`, `sleep_total_minutes`).

In [None]:
# Standard imports and output dirs
import sys
from pathlib import Path
import json
import hashlib
import platform

# Every artifact is written below notebooks/outputs/{figures,tables,manifests}.
OUT_BASE = Path('notebooks') / 'outputs'
FIG_DIR, TAB_DIR, MAN_DIR = (OUT_BASE / name for name in ('figures', 'tables', 'manifests'))
for d in (FIG_DIR, TAB_DIR, MAN_DIR):
    # Re-running is harmless: existing directories are left untouched.
    d.mkdir(parents=True, exist_ok=True)
print('Outputs ->', OUT_BASE)

In [None]:
# Environment detection: LOCAL / KAGGLE / COLAB
def detect_env():
    """Return 'KAGGLE', 'COLAB' or 'LOCAL' for the current runtime.

    Kaggle is recognised by the KAGGLE_URL_BASE environment variable being
    set, Colab by the ``google.colab`` package being importable. Anything
    else is reported as 'LOCAL'. The Kaggle check takes precedence.
    """
    import os

    if 'KAGGLE_URL_BASE' in os.environ:
        return 'KAGGLE'

    try:
        import google.colab  # type: ignore  # noqa: F401
    except Exception:
        # Any failure while importing means this is not a usable Colab runtime.
        return 'LOCAL'
    return 'COLAB'


ENV = detect_env()
print('Environment:', ENV)

In [None]:
# Bootstrap minimal deps idempotently: pandas, matplotlib, seaborn, numpy
import importlib
import importlib.util  # `import importlib` alone does not guarantee the `util` submodule is loaded
import subprocess
import sys

REQS = ['pandas', 'numpy', 'matplotlib', 'seaborn']
# A package counts as missing when the import system cannot find a spec for it.
missing = [r for r in REQS if importlib.util.find_spec(r) is None]
if missing:
    print('Installing missing packages:', missing)
    # Install into the interpreter that runs this kernel, not whatever `pip` is on PATH.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', *missing])
    # Packages installed after interpreter start may be invisible to cached
    # finders; invalidate the caches so the imports below can see them.
    importlib.invalidate_caches()
else:
    print('All required packages present')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

In [None]:
# Locate public CSV (search under data/ai/*/snapshots/*/public_subset)
from glob import glob

# glob() returns matches in filesystem-dependent order; sort so that the chosen
# file is reproducible when several participants/snapshots are present.
candidates = sorted(glob('data/ai/*/snapshots/*/public_subset/features_daily_public.csv'))
if not candidates:
    raise FileNotFoundError('No public subset found under data/ai/*/snapshots/*/public_subset')
if len(candidates) > 1:
    print('Multiple public subsets found; using the first in sorted order:', candidates, file=sys.stderr)
INPUT_CSV = Path(candidates[0])
print('Using public CSV:', INPUT_CSV)

# Hard AI-only guard: refuse to continue if the selected file actually resolves
# to a location inside data/raw or data/etl (for example through a symlink).
_resolved_input = INPUT_CSV.resolve()
for _forbidden in (Path('data/raw'), Path('data/etl')):
    if _forbidden.resolve() in _resolved_input.parents:
        raise PermissionError(f'AI-only guard: {INPUT_CSV} resolves inside {_forbidden}; refusing to read it')

if Path('data/raw').exists() or Path('data/etl').exists():
    # The public_subset read is still allowed; this only flags that restricted
    # directories are present next to it.
    print('WARNING: data/raw or data/etl exists in repository root. The notebook will not read them.', file=sys.stderr)

# Compute input SHA256
def sha256_file(p: Path, chunk_size: int = 1 << 20) -> str:
    """Return the hex-encoded SHA-256 digest of the file at ``p``.

    The file is read in blocks of ``chunk_size`` bytes so that large inputs
    are never held in memory all at once.

    Args:
        p: Path of the file to hash.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    h = hashlib.sha256()
    with p.open('rb') as fh:
        # iter(callable, sentinel) keeps reading until read() returns b'' at EOF.
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

# Hash the selected input once; the digest is kept in INPUT_SHA256 for the run manifest.
INPUT_SHA256 = sha256_file(INPUT_CSV)
print('Input SHA256:', INPUT_SHA256)

In [None]:
# Load dataframe and schema guard
df = pd.read_csv(INPUT_CSV)
print('rows, cols =', df.shape)

# Ensure date_utc present
if 'date_utc' not in df.columns:
    raise ValueError('Schema guard failed: date_utc column is required in features_daily_public.csv')

# At least one of hr_mean OR sleep_total_minutes must be present
if not (('hr_mean' in df.columns) or ('sleep_total_minutes' in df.columns)):
    raise ValueError('Schema guard failed: one of hr_mean or sleep_total_minutes must be present')

# Parse date_utc to datetime where possible. This stays best-effort (the raw
# column is kept when parsing fails), but the failure is reported rather than
# silently ignored so that a non-datetime date_utc does not go unnoticed.
try:
    df['date_utc'] = pd.to_datetime(df['date_utc'])
except Exception as exc:
    print(f'WARNING: could not parse date_utc as datetime; keeping raw values ({exc})', file=sys.stderr)

# Tolerate optional columns' absence; record available cols
available_cols = list(df.columns)
print('Available columns:', available_cols)

In [None]:
# EDA: summary stats, missingness, correlations, and simple plots
def save_histogram(frame, column, out_dir, bins=30):
    """Save a histogram of ``frame[column]`` (missing values dropped).

    The figure is written to ``out_dir / '<column>_hist.png'`` at 150 dpi and
    closed afterwards.

    Args:
        frame: DataFrame that contains ``column``.
        column: Name of the column to plot.
        out_dir: Directory (``pathlib.Path``) that receives the PNG.
        bins: Number of histogram bins.

    Returns:
        Path of the written PNG file.
    """
    fig, ax = plt.subplots()
    sns.histplot(frame[column].dropna(), kde=False, bins=bins, ax=ax)
    ax.set_title(f'{column} distribution')
    target = out_dir / f'{column}_hist.png'
    fig.savefig(target, dpi=150)
    plt.close(fig)  # release the figure so repeated runs do not accumulate open figures
    return target


numeric = df.select_dtypes(include=[np.number])
# DataFrame.describe() raises ValueError on a frame without columns, so fall
# back to an empty table when the input has no numeric columns at all.
summary = numeric.describe().transpose() if numeric.shape[1] else pd.DataFrame()
summary.to_csv(TAB_DIR / 'summary_stats.csv', index=True)
print('Wrote summary stats ->', TAB_DIR / 'summary_stats.csv')

# Head (non-sensitive)
df.head(20).to_csv(TAB_DIR / 'head.csv', index=False)
print('Wrote head ->', TAB_DIR / 'head.csv')

# Missingness (percent)
miss = pd.DataFrame({'missing_pct': df.isna().mean() * 100})
miss.to_csv(TAB_DIR / 'missingness_pct.csv')
print('Wrote missingness ->', TAB_DIR / 'missingness_pct.csv')

# Correlations if enough numeric columns
if numeric.shape[1] >= 2:
    corr = numeric.corr(method='pearson')
    corr.to_csv(TAB_DIR / 'correlations.csv')
    print('Wrote correlations ->', TAB_DIR / 'correlations.csv')
    # save a heatmap
    fig, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='vlag', ax=ax)
    ax.set_title('Numeric Correlations')
    fig.tight_layout()
    fig.savefig(FIG_DIR / 'correlations_heatmap.png', dpi=150)
    plt.close(fig)
    print('Wrote heatmap ->', FIG_DIR / 'correlations_heatmap.png')
else:
    print('Not enough numeric columns for correlations')

# Histograms for hr_mean and sleep_total_minutes if present
for col in ('hr_mean', 'sleep_total_minutes'):
    if col in df.columns:
        hist_path = save_histogram(df, col, FIG_DIR)
        print(f'Wrote {col}_hist ->', hist_path)

In [None]:
# Write run manifest with env, params, input hash, and produced artifact checksums
import pkgutil


def lib_version(name: str) -> str:
    """Return the installed version of package ``name``, or 'unknown'.

    The version is looked up through the standard library's
    ``importlib.metadata`` first. If that fails for any reason, the package
    is imported and its ``__version__`` attribute is used instead. This
    avoids ``pkg_resources``, which is deprecated and is not importable in
    environments where setuptools is not installed.

    Args:
        name: Distribution / top-level module name, e.g. 'pandas'.

    Returns:
        The version string, or 'unknown' when it cannot be determined.
    """
    try:
        # Imported here so that a missing importlib.metadata (very old Python)
        # falls through to the __version__ fallback instead of raising.
        from importlib import metadata
        return metadata.version(name)
    except Exception:
        try:
            import importlib
            m = importlib.import_module(name)
            return getattr(m, '__version__', 'unknown')
        except Exception:
            return 'unknown'

# Files currently present in the tables and figures directories
# (tables first, then figures; each group sorted by path).
artifact_files = [
    candidate
    for folder in (TAB_DIR, FIG_DIR)
    for candidate in sorted(folder.glob('*'))
    if candidate.is_file()
]

# Versions of the libraries this notebook relies on.
lib_versions = {lib: lib_version(lib) for lib in ('pandas', 'numpy', 'matplotlib', 'seaborn')}

# Provenance record: runtime, library versions, input identity, and a
# SHA-256 checksum for every artifact found above.
manifest = {
    'env': ENV,
    'python': platform.python_version(),
    'libs': lib_versions,
    'input': {
        'path': str(INPUT_CSV),
        'sha256': INPUT_SHA256,
    },
    'params': {
        'found_columns': available_cols
    },
    'artifacts': [
        {'path': str(item), 'sha256': hashlib.sha256(item.read_bytes()).hexdigest()}
        for item in artifact_files
    ],
}

manifest_path = MAN_DIR / 'run_manifest.json'
with open(manifest_path, 'w', encoding='utf-8') as out:
    json.dump(manifest, out, indent=2, sort_keys=True)
print('Wrote run manifest ->', manifest_path)

Notes:
- All produced artifacts are under `notebooks/outputs/**`.
- The notebook intentionally searches only for `public_subset/features_daily_public.csv` and never reads anything from `data/raw` or `data/etl`; it only inspects those paths to guard against their use.
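- If you need to run on a different participant/snapshot, place the public subset under `data/ai/<PID>/snapshots/<SNAP>/public_subset/` and rerun. Only one matching CSV is used per run, so when several public subsets are present, keep only the one you want to analyze in place.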