# NB-01_EDA-ANY-DAILY-overview

Compact EDA for daily features. Each run writes its outputs to `notebooks/outputs/NB1/<TIMESTAMP>/` and then mirrors them to `notebooks/outputs/NB1/latest/`.
The `date` column is preferred; if it is missing, `date_utc` is used instead.

In [1]:
# Standard imports and run-scoped output dirs for NB1
import sys
from pathlib import Path
import json
import hashlib
import platform
from datetime import datetime

# Root for NB1 outputs (immutable per run under a timestamped folder)
from datetime import timezone

OUT_BASE_ROOT = Path('notebooks') / 'outputs' / 'NB1'
# datetime.utcnow() is deprecated; an aware UTC "now" yields the same
# YYYYMMDD_HHMMSS folder name without the DeprecationWarning.
TIMESTAMP = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
RUN_DIR = OUT_BASE_ROOT / TIMESTAMP
FIG_DIR = RUN_DIR / 'figures'
TAB_DIR = RUN_DIR / 'tables'
MAN_DIR = RUN_DIR / 'manifests'
LOG_DIR = RUN_DIR / 'logs'
# parents=True also creates RUN_DIR and OUT_BASE_ROOT on the way down.
for d in (FIG_DIR, TAB_DIR, MAN_DIR, LOG_DIR):
    d.mkdir(parents=True, exist_ok=True)
# 'latest' mirror - will be refreshed at the end of the run
LATEST_DIR = OUT_BASE_ROOT / 'latest'
OUT_BASE_ROOT.mkdir(parents=True, exist_ok=True)
print('NB1-BOOT Outputs ->', RUN_DIR)
print('NB1-BOOT Latest mirror ->', LATEST_DIR)

NB1-BOOT Outputs -> notebooks\outputs\NB1\20251026_044407
NB1-BOOT Latest mirror -> notebooks\outputs\NB1\latest


  TIMESTAMP = datetime.utcnow().strftime('%Y%m%d_%H%M%S')


In [2]:
# Environment detection: LOCAL / KAGGLE / COLAB
import os
def detect_env():
    """Classify the runtime as 'KAGGLE', 'COLAB' or 'LOCAL'.

    Kaggle is recognised by either of its environment variables; Colab by
    the ``google.colab`` package being importable; anything else is local.
    """
    kaggle_markers = ('KAGGLE_KERNEL_RUN_TYPE', 'KAGGLE_URL_BASE')
    if any(marker in os.environ for marker in kaggle_markers):
        return 'KAGGLE'
    try:
        import google.colab  # type: ignore
    except Exception:
        # Any import failure means this is not a Colab runtime.
        return 'LOCAL'
    return 'COLAB'

ENV = detect_env()
print('NB1-BOOT Environment ->', ENV)

NB1-BOOT Environment -> LOCAL


In [3]:
# NB1-BOOT: check required packages and instruct (no auto install)
# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded, so import it explicitly before calling find_spec().
import importlib.util
import warnings

# Silence the setuptools notice about pkg_resources.
warnings.filterwarnings('ignore', message='pkg_resources is deprecated as an API')
REQUIRED = ['pandas', 'numpy', 'matplotlib', 'seaborn']
# find_spec() returns None for a top-level package that cannot be located.
missing = [name for name in REQUIRED if importlib.util.find_spec(name) is None]
if missing:
    print('NB1-BOOT Missing packages:', missing)
    print('Install with: ', sys.executable + ' -m pip install ' + ' '.join(missing))
else:
    print('NB1-BOOT All required packages present')

# Safe imports (user may have installed them)
# These imports are unconditional: if a package was reported missing by the
# check earlier in this cell, the matching import below raises ImportError.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Select seaborn's 'whitegrid' style for the figures produced later.
sns.set(style='whitegrid')

NB1-BOOT All required packages present


In [None]:
# NB1-PATHS: locate features CSV. LOCAL: data/etl/*/snapshots/*/joined/features_daily.csv
# KAGGLE: search under /kaggle/input/**/features_daily.csv
import os, re
from glob import glob
# Resolve INPUT_CSV. Precedence: a FEATURES_CSV notebook variable, then the
# FEATURES_CSV environment variable, then auto-discovery for the detected ENV.
if 'FEATURES_CSV' in globals() and FEATURES_CSV:
    INPUT_CSV = Path(FEATURES_CSV)
    print('NB1-PATHS Using FEATURES_CSV override ->', INPUT_CSV)
elif os.environ.get('FEATURES_CSV'):
    INPUT_CSV = Path(os.environ['FEATURES_CSV'])
    print('NB1-PATHS Using FEATURES_CSV env override ->', INPUT_CSV)
else:
    if ENV == 'KAGGLE':
        # In Kaggle kernels dataset files are mounted under /kaggle/input/<dataset>/
        pattern = '/kaggle/input/**/features_daily.csv'
        # glob() yields matches in arbitrary order; sort so the pick is reproducible.
        candidates = sorted(glob(pattern, recursive=True))
        print('NB1-PATHS (KAGGLE) candidate count ->', len(candidates))
        chosen = candidates[0] if candidates else None
    else:
        pattern = str(Path.cwd() / 'data' / 'etl' / '*' / 'snapshots' / '*' / 'joined' / 'features_daily.csv')
        candidates = sorted(glob(pattern))
        print('NB1-PATHS (LOCAL) candidate count ->', len(candidates))
        # Snapshot folders whose name starts with YYYY-MM-DD carry their date
        # in the path; ISO dates order correctly as plain strings.
        snapshot_re = re.compile(r'[\\/](?:snapshots)[\\/]([0-9]{4}-[0-9]{2}-[0-9]{2})')
        dated = []
        for p in candidates:
            m = snapshot_re.search(p)
            if m:
                dated.append((m.group(1), p))
        if dated:
            # Latest snapshot date wins; ties are broken by path so the
            # result does not depend on directory listing order.
            chosen = max(dated)[1]
        elif candidates:
            # No dated snapshot folder: fall back to the first path in sorted order.
            chosen = candidates[0]
        else:
            chosen = None
    if not chosen:
        # follow the contract for blocked response (the message text is fixed,
        # so it is raised unchanged for every environment)
        raise FileNotFoundError('<FEATURES_CSV não encontrado em LOCAL>')
    INPUT_CSV = Path(chosen)
    print('NB1-PATHS Chosen INPUT_CSV ->', INPUT_CSV)
# compute sha256
def sha256_file(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *p*.

    The file is read in fixed-size chunks, so a large CSV is hashed without
    loading it into memory in one piece. An empty file yields the digest of
    zero bytes.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    import hashlib

    digest = hashlib.sha256()
    with p.open('rb') as fh:
        # iter(callable, sentinel) stops once read() returns b'' at EOF.
        for chunk in iter(lambda: fh.read(1024 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest()
# Hash the chosen input. A path that does not exist (e.g. a stale
# FEATURES_CSV override) is reported with the "not found" block message,
# matching the load cell; any other failure keeps the "corrupted or empty"
# message.
try:
    INPUT_SHA256 = sha256_file(INPUT_CSV)
except FileNotFoundError as e:
    raise FileNotFoundError('<FEATURES_CSV não encontrado em LOCAL>') from e
except Exception as e:
    raise ValueError('<arquivo corrompido ou vazio>') from e
else:
    print('NB1-PATHS INPUT_SHA256 ->', INPUT_SHA256)

NB1-PATHS candidate count -> 0


FileNotFoundError: <FEATURES_CSV não encontrado em LOCAL>

In [None]:
# DIAGNOSTIC: show first lines and sample columns
print('NB1-PATHS INPUT_CSV ->', INPUT_CSV)
input_exists = INPUT_CSV.exists()
print('Exists ->', input_exists)
# stat() raises on a missing path, so only ask for the size when the file exists.
print('Size ->', INPUT_CSV.stat().st_size if input_exists else 'n/a')
# The leading newline is an escape inside the literal, which keeps it on one line.
print('\n--- first 10 raw lines ---')
# errors='replace' keeps the preview going even if the file is not valid UTF-8.
with INPUT_CSV.open('r', encoding='utf-8', errors='replace') as fh:
    for lineno, line in enumerate(fh, start=1):
        print(f'{lineno}: {line.rstrip()}')
        if lineno >= 10:
            break
# pandas sample
try:
    sample = pd.read_csv(INPUT_CSV, nrows=5)
    print('Detected columns:', list(sample.columns))
except Exception as e:
    # Diagnostic only: report the parsing problem instead of stopping the cell.
    print('pandas.sample failed:', e)

In [None]:
# NB1-LOAD: read the features CSV and normalise its date column (prefer 'date')
try:
    df = pd.read_csv(INPUT_CSV)
except FileNotFoundError:
    raise FileNotFoundError('<FEATURES_CSV não encontrado em LOCAL>')
except Exception as read_err:
    raise ValueError('<arquivo corrompido ou vazio>') from read_err
print('NB1-LOAD rows, cols =', df.shape)

# Settle on the date column: 'date' wins; otherwise 'date_utc' is copied into 'date'.
if 'date' not in df.columns and 'date_utc' not in df.columns:
    raise ValueError("<coluna de data ausente: 'date' ou 'date_utc'>")
DATE_COL = 'date'
if DATE_COL in df.columns:
    print("NB1-LOAD Using date column: 'date'")
else:
    df[DATE_COL] = df['date_utc']
    print("NB1-LOAD Mapped 'date_utc' -> 'date'")

# Parse the dates (unparseable values become NaT), discard those rows, then sort by date.
df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors='coerce')
valid_dates = df[DATE_COL].notna()
before_drop = len(df)
dropped = before_drop - int(valid_dates.sum())
df = df[valid_dates]
if dropped > 0:
    print(f'NB1-LOAD Dropped {dropped} rows with invalid {DATE_COL}')
df = df.sort_values(by=[DATE_COL]).reset_index(drop=True)

# Column inventory: only the first 20 names are printed.
available_cols = df.columns.tolist()
print('NB1-LOAD Available columns (sample):', available_cols[:20])

# Optional per-segment row counts, only when a 'segment_id' column is present.
has_segment = 'segment_id' in df.columns
segment_counts = df['segment_id'].value_counts().to_dict() if has_segment else None
if has_segment:
    print('NB1-LOAD segment counts (sample):', dict(list(segment_counts.items())[:10]))

In [None]:
# NB1-EDA: summary stats, missingness, correlations, and light figures
# These columns are never treated as features, even when they are numeric.
exclude_cols = {'date', 'segment_id', 'version_id', 'participant_id'}
numeric_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c not in exclude_cols]
print('NB1-EDA Detected numeric features count ->', len(numeric_cols))
# Summary stats (an empty table is written when there are no numeric features)
summary = df[numeric_cols].describe().transpose() if numeric_cols else pd.DataFrame()
summary.to_csv(TAB_DIR / 'summary_stats.csv', index=True)
print('NB1-OUT Wrote summary stats ->', TAB_DIR / 'summary_stats.csv')
# Head (first 10 rows)
df.head(10).to_csv(TAB_DIR / 'head.csv', index=False)
print('NB1-OUT Wrote head ->', TAB_DIR / 'head.csv')
# Missingness (percent of missing values per column, all columns)
miss = pd.DataFrame({'missing_pct': df.isna().mean() * 100})
miss.to_csv(TAB_DIR / 'missingness_pct.csv')
print('NB1-OUT Wrote missingness ->', TAB_DIR / 'missingness_pct.csv')
# Correlations limited to top-variance 50 features
if numeric_cols:
    variances = df[numeric_cols].var(skipna=True).sort_values(ascending=False)
    top = list(variances.index[:50])
    corr = df[top].corr(method='pearson')
    corr.to_csv(TAB_DIR / 'correlations.csv')
    print('NB1-OUT Wrote correlations ->', TAB_DIR / 'correlations.csv')
    # Matplotlib heatmap, drawn on explicit axes rather than pyplot's implicit
    # "current axes"; the figure is closed even if drawing or saving fails.
    fig, ax = plt.subplots(figsize=(6, 6))
    try:
        cax = ax.imshow(corr.values, cmap='bwr', vmin=-1, vmax=1)
        ax.set_xticks(range(len(corr.columns)))
        ax.set_yticks(range(len(corr.index)))
        ax.set_xticklabels(corr.columns, rotation=90, fontsize=6)
        ax.set_yticklabels(corr.index, fontsize=6)
        fig.colorbar(cax, ax=ax, fraction=0.046, pad=0.04)
        ax.set_title('Numeric Correlations')
        fig.tight_layout()
        fig.savefig(FIG_DIR / 'correlations_heatmap.png', dpi=150)
    finally:
        plt.close(fig)
    print('NB1-OUT Wrote heatmap ->', FIG_DIR / 'correlations_heatmap.png')
else:
    print('NB1-EDA Not enough numeric columns for correlations')
# Optional histograms for common columns
for col in ['hr_mean', 'sleep_total_minutes']:
    if col not in df.columns:
        continue
    fig = None
    try:
        fig, ax = plt.subplots()
        df[col].dropna().hist(bins=30, ax=ax)
        ax.set_title(f'{col} distribution')
        fig.tight_layout()
        fig.savefig(FIG_DIR / f'{col}_hist.png', dpi=150)
        print('NB1-OUT Wrote', col, 'hist ->', FIG_DIR / f'{col}_hist.png')
    except Exception as e:
        # Histograms are best-effort: report the problem and carry on.
        print('NB1-EDA could not write histogram for', col, e)
    finally:
        # Close the figure on failure too, so open figures do not accumulate.
        if fig is not None:
            plt.close(fig)

In [None]:
# NB1-OUT: write run manifest, logs, and refresh latest mirror
import shutil
def lib_version(name: str) -> str:
    """Return the installed version of *name*, or 'unknown'.

    The distribution metadata is consulted first through
    ``importlib.metadata`` (the standard-library replacement for the
    deprecated ``pkg_resources``). If that gives nothing, the module is
    imported and its ``__version__`` attribute is used. The function never
    raises: every failure ends in 'unknown'.
    """
    import importlib

    try:
        from importlib import metadata
        found = metadata.version(name)
        if found:
            return str(found)
    except Exception:
        # Best effort: no metadata for this name (or no importlib.metadata
        # on this interpreter) - fall through to the module attribute.
        pass
    try:
        module = importlib.import_module(name)
    except Exception:
        return 'unknown'
    return getattr(module, '__version__', 'unknown')
# Describe the run: environment, input file, frame shape, date span and features.
manifest = dict(
    env=ENV,
    python_executable=sys.executable,
    input=dict(path=str(INPUT_CSV), sha256=INPUT_SHA256),
    rows=int(df.shape[0]),
    cols=int(df.shape[1]),
    date_min=str(df[DATE_COL].min()),
    date_max=str(df[DATE_COL].max()),
    numeric_features=list(numeric_cols),
    has_segment_id=bool(has_segment),
)
if has_segment:
    manifest['segment_counts'] = segment_counts

# Inventory every file written so far: tables first, then figures, each sorted.
manifest['artifacts'] = []
for artifact in [*sorted(TAB_DIR.glob('*')), *sorted(FIG_DIR.glob('*'))]:
    if not artifact.is_file():
        continue
    try:
        digest = hashlib.sha256(artifact.read_bytes()).hexdigest()
    except Exception:
        # An unreadable artifact is still listed, with a placeholder digest.
        digest = 'error'
    manifest['artifacts'].append({'path': str(artifact), 'sha256': digest})

manifest_path = MAN_DIR / 'run_manifest.json'
manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True), encoding='utf-8')
print('NB1-OUT Wrote run manifest ->', manifest_path)
# Append one line per notebook stage to the run log
log_path = LOG_DIR / 'run.log'
log_lines = [
    f'NB1-BOOT TIMESTAMP={TIMESTAMP}\n',
    f'NB1-PATHS INPUT={INPUT_CSV} SHA256={INPUT_SHA256}\n',
    f'NB1-LOAD rows={df.shape[0]} cols={df.shape[1]}\n',
    f'NB1-EDA num_features={len(numeric_cols)}\n',
]
with log_path.open('a', encoding='utf-8') as log_file:
    log_file.writelines(log_lines)
print('NB1-OUT Wrote log ->', log_path)
# Replace the 'latest' mirror with a copy of this run's folder (best effort)
try:
    if LATEST_DIR.exists():
        # copytree() needs a destination that does not exist yet.
        shutil.rmtree(LATEST_DIR)
    shutil.copytree(RUN_DIR, LATEST_DIR)
except Exception as mirror_err:
    print('NB1-OUT Could not refresh latest mirror:', mirror_err)
else:
    print('NB1-OUT Refreshed latest mirror ->', LATEST_DIR)
# Closing console report for the run
missing_pct = df.isna().mean() * 100
# Columns with more than 20% missing values.
num_missing_over_20 = int((missing_pct > 20).sum())
# Columns holding at most one distinct value (NaN counts as a value).
num_const_cols = int((df.nunique(dropna=False) <= 1).sum())
date_lo = str(df[DATE_COL].min())
date_hi = str(df[DATE_COL].max())
print('NB1-RESULT ENV=', ENV)
print('NB1-RESULT INPUT=', INPUT_CSV, 'SHA256=', INPUT_SHA256)
print('NB1-RESULT rows,cols=', df.shape)
print('NB1-RESULT date_range=', date_lo, '->', date_hi)
print(
    'NB1-RESULT #num_features=', len(numeric_cols),
    '#missing>20%=', num_missing_over_20,
    '#const_cols=', num_const_cols,
)
print('NB1-RESULT run_dir=', RUN_DIR)
print('NB1-RESULT latest_mirror=', LATEST_DIR)
print('READY for NB2: true')

Notes:
- Outputs are saved under `notebooks/outputs/NB1/<TIMESTAMP>/` with `tables`, `figures`, `manifests`, and `logs`.
- `latest/` is a mirror of the most recent run.
- If the notebook raises one of the special block messages, act as follows:
  - `<FEATURES_CSV não encontrado em LOCAL>`: place the CSV under `data/etl/<PID>/snapshots/<SNAP>/joined/features_daily.csv`, or set `FEATURES_CSV` as an environment variable or notebook variable.
  - `<coluna de data ausente: 'date' ou 'date_utc'>`: ensure the dataset has a `date` or `date_utc` column.
  - `<arquivo corrompido ou vazio>`: inspect the CSV file for corruption or empty content.