In [2]:
import os
import json
import joblib
import warnings
import numpy as np
import pandas as pd

from typing import List, Optional, Dict, Iterable

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

warnings.filterwarnings("ignore")

# ==============================
#          CONFIG
# ==============================
# Root directory: the dataset archive is read from here and every output
# artifact below is written here.
BASE_DIR = r"C:\Users\sagni\Downloads\Med Assist"
# The two Fitabase export folders that main() loads and concatenates.
FOLDER1  = os.path.join(BASE_DIR, r"archive\mturkfitbit_export_4.12.16-5.12.16\Fitabase Data 4.12.16-5.12.16")
FOLDER2  = os.path.join(BASE_DIR, r"archive\mturkfitbit_export_3.12.16-4.11.16\Fitabase Data 3.12.16-4.11.16")

# Output artifacts: joblib preprocess bundle, Keras model, optional YAML
# architecture dump, and a JSON file with run metadata.
OUT_PKL  = os.path.join(BASE_DIR, "medassist_preprocess.pkl")
OUT_H5   = os.path.join(BASE_DIR, "medassist_model.h5")
OUT_YAML = os.path.join(BASE_DIR, "medassist_model.yaml")
OUT_JSON = os.path.join(BASE_DIR, "medassist_metadata.json")

# Memory controls
INCLUDE_MINUTE_FILES = False        # False: skip files matching EXCLUDE_PATTERNS; True: _want_file accepts every CSV
ROW_LIMIT_PER_CSV    = None         # e.g., 150_000 to cap rows read per CSV (passed to pandas as nrows); None reads whole files
SAMPLE_FRAC_PER_CSV  = 0.15         # fraction of rows kept when sampling; used only if ROW_LIMIT_PER_CSV is None, and only for "big" files
BIG_FILE_MIN_ROWS    = 200_000      # treat files with >= this many rows as "big"

# File-name substrings that mark a CSV for loading (daily/summary data).
# _want_file lower-cases both the pattern and the file name before matching,
# so the mixed-case duplicates below are redundant. "minuteSleep" contains the
# excluded substring "minute", so while INCLUDE_MINUTE_FILES is False the
# exclusion wins for such files.
INCLUDE_PATTERNS = [
    "daily", "day", "Sleep", "sleep", "weight", "Weight", "minuteSleep", "summary",
    "calories", "Calories", "activities", "heart", "resting", "steps", "Steps", "distance", "Distance"
]

# File-name substrings to **exclude** (intraday/minute/second data). They are
# only consulted when INCLUDE_MINUTE_FILES is False, and are checked before
# INCLUDE_PATTERNS.
EXCLUDE_PATTERNS = [
    "minute", "Minute", "seconds", "second", "intraday", "Intraday"
]

# Candidate target column names. pick_target tries them in this order using an
# exact, case-sensitive match against the DataFrame columns.
TARGET_CANDIDATES = [
    "Calories", "calories", "TotalCalories", "Calories Burned", "Calories_Burned"
]

RANDOM_STATE = 42   # seed for row subsampling and for the train/test split
EPOCHS       = 20   # passed to model.fit
BATCH_SIZE   = 32   # passed to model.fit

# ==============================
#       LOADING HELPERS
# ==============================
def _want_file(fname: str) -> bool:
    """Return whether the CSV called *fname* should be loaded.

    Matching is a case-insensitive substring test against the file name:

    1. While ``INCLUDE_MINUTE_FILES`` is off, any ``EXCLUDE_PATTERNS`` hit
       rejects the file outright (exclusion beats inclusion).
    2. Otherwise an ``INCLUDE_PATTERNS`` hit accepts the file.
    3. A file matching neither list is accepted only when
       ``INCLUDE_MINUTE_FILES`` is on.
    """
    lowered = fname.lower()

    def _matches_any(patterns) -> bool:
        # Patterns are lower-cased too, so their configured case is irrelevant.
        return any(pattern.lower() in lowered for pattern in patterns)

    if not INCLUDE_MINUTE_FILES and _matches_any(EXCLUDE_PATTERNS):
        return False
    if _matches_any(INCLUDE_PATTERNS):
        return True
    # No include pattern matched: fall back to the global switch.
    return INCLUDE_MINUTE_FILES

def _downcast_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Shrink 64-bit numeric columns to the smallest dtype pandas will pick.

    ``float64`` columns are passed through ``pd.to_numeric(downcast="float")``
    and ``int64`` columns through ``pd.to_numeric(downcast="integer")``.
    Columns of any other dtype are left alone.

    The frame is modified **in place**; the same object is returned so the
    call can be chained.
    """
    # (dtype to look for, pandas downcast mode); floats first, then ints.
    plan = (("float64", "float"), ("int64", "integer"))
    for wide_dtype, mode in plan:
        for name in df.select_dtypes(include=[wide_dtype]).columns:
            df[name] = pd.to_numeric(df[name], downcast=mode)
    return df

def _safe_union_concat(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Stack frames row-wise over the union of their columns.

    Every frame is reindexed to the full column union (missing columns are
    filled with NaN) and the results are concatenated with a fresh
    0..n-1 index.

    The union keeps columns in first-seen order. Building it from a ``set``
    would make the column order depend on string hashing, i.e. differ from
    run to run, which in turn changes any "last numeric column" fallback and
    the order of the model's input features.

    Args:
        dfs: Frames to combine; may be empty.

    Returns:
        The combined frame, or an empty ``DataFrame`` when *dfs* is empty.
    """
    if not dfs:
        return pd.DataFrame()

    # dict keys give an insertion-ordered, de-duplicated column union.
    ordered_union: dict = {}
    for frame in dfs:
        ordered_union.update(dict.fromkeys(frame.columns.tolist()))
    cols = list(ordered_union)

    aligned = [frame.reindex(columns=cols) for frame in dfs]
    return pd.concat(aligned, ignore_index=True)

def _read_csv_safely(path: str, row_limit: Optional[int]) -> pd.DataFrame:
    """Load one CSV, downcasting numeric columns, without ever raising.

    Args:
        path: CSV file to read.
        row_limit: Maximum number of rows to read. Any falsy value
            (``None`` or ``0``) means "read the whole file".

    Returns:
        The loaded frame, or an empty ``DataFrame`` if reading or
        downcasting failed for any reason (a warning is printed).
    """
    # nrows=None is pandas' default, i.e. no row cap.
    nrows = row_limit if row_limit else None
    try:
        return _downcast_numeric(pd.read_csv(path, nrows=nrows))
    except Exception as err:
        # Best-effort loader: one unreadable file must not abort the run.
        print(f"[WARN] Failed to read {path}: {err}")
        return pd.DataFrame()

def _maybe_sample(df: pd.DataFrame, frac: float) -> pd.DataFrame:
    """Randomly subsample *df* to *frac* of its rows, but only if it is "big".

    The frame is returned untouched when *frac* is ``None`` or >= 1.0, when
    the frame is empty, or when it has fewer than ``BIG_FILE_MIN_ROWS`` rows.
    Otherwise a sample drawn with ``RANDOM_STATE`` as seed is returned.
    """
    row_count = len(df)
    sampling_disabled = frac is None or frac >= 1.0 or row_count == 0
    if sampling_disabled or row_count < BIG_FILE_MIN_ROWS:
        return df
    return df.sample(frac=frac, random_state=RANDOM_STATE)

def load_fitbit_folder(folder: str) -> pd.DataFrame:
    """Load and combine the selected CSVs of one Fitabase export folder.

    Files are filtered with ``_want_file``, read with ``_read_csv_safely``
    (capped at ``ROW_LIMIT_PER_CSV`` rows when that is set), optionally
    subsampled with ``_maybe_sample`` when no row cap is configured, tagged
    with a ``__source_file`` column holding the file name, and finally
    combined over the union of their columns.

    Files are processed in sorted name order: ``os.listdir`` returns entries
    in arbitrary order, which would otherwise make the row order of the
    result (and anything seeded that depends on it) vary between machines.

    Args:
        folder: Directory containing the exported ``*.csv`` files.

    Returns:
        The combined frame. An empty ``DataFrame`` is returned (after a
        printed warning) when the folder does not exist or no CSV passes the
        filter.
    """
    if not os.path.isdir(folder):
        print(f"[WARN] Folder not found: {folder}")
        return pd.DataFrame()

    csv_files = sorted(f for f in os.listdir(folder) if f.lower().endswith(".csv"))
    kept = [f for f in csv_files if _want_file(f)]
    kept_lookup = set(kept)
    # A list (not a set) so the "first 5" preview is stable across runs.
    skipped = [f for f in csv_files if f not in kept_lookup]
    if skipped:
        print(f"[INFO] Skipping {len(skipped)} CSVs (likely minute/intraday): first 5 -> {skipped[:5]}")
    if not kept:
        print("[WARN] No CSVs selected to load after filtering.")
        return pd.DataFrame()

    row_cap = ROW_LIMIT_PER_CSV
    dfs: List[pd.DataFrame] = []
    total_rows = 0
    for i, fname in enumerate(kept, 1):
        # Each file is read exactly once; an earlier 1000-row "probe" read
        # was never used and only doubled the parsing work.
        df = _read_csv_safely(os.path.join(folder, fname), row_cap)
        if df.empty:
            continue
        if row_cap is None:
            # Without a hard row cap, big files are thinned by sampling.
            df = _maybe_sample(df, SAMPLE_FRAC_PER_CSV)
        df["__source_file"] = fname
        dfs.append(df)
        total_rows += len(df)
        # concat in small batches to keep memory in check
        if len(dfs) >= 8:
            dfs = [_safe_union_concat(dfs)]
        if i % 10 == 0:
            print(f"[INFO] Loaded ~{i} files; current rows: {total_rows:,}")

    combined = _safe_union_concat(dfs)
    print(f"[INFO] Loaded folder: {folder}")
    print(f"[INFO] Combined shape: {combined.shape}")
    return combined

# ==============================
#   TARGET SELECTION & CLEAN
# ==============================
def pick_target(df: pd.DataFrame) -> str:
    """Choose the name of the regression target column.

    The first entry of ``TARGET_CANDIDATES`` that is present in *df* (exact,
    case-sensitive match) wins. If none is present, the last numeric column
    of *df* is used instead.

    Raises:
        ValueError: If no candidate is present and *df* has no numeric column.
    """
    present = [name for name in TARGET_CANDIDATES if name in df.columns]
    if present:
        return present[0]

    # fallback: last numeric column
    numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_names:
        return numeric_names[-1]
    raise ValueError("No numeric target column detected.")

def drop_obvious_ids_and_dates(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without columns whose name looks like an id or timestamp.

    A column is dropped when its lower-cased name contains any of the
    substrings ``id``, ``date``, ``time`` or ``datetime``. This is a plain
    substring test, so it is deliberately broad: ``TotalTimeInBed`` is
    removed as well, because it contains ``time``.

    The input frame is not modified.
    """
    markers = ("id", "date", "time", "datetime")
    doomed = [
        name
        for name in df.columns
        if any(marker in name.lower() for marker in markers)
    ]
    return df.drop(columns=doomed, errors="ignore")

# ==============================
#     PREPROCESS & TRAIN
# ==============================
def make_preprocessor(X: pd.DataFrame, target_col: str) -> ColumnTransformer:
    """Build an (unfitted) preprocessing transformer for the columns of *X*.

    Numeric columns get median imputation followed by standard scaling;
    all other columns get most-frequent imputation followed by dense one-hot
    encoding that ignores categories unseen during fitting.

    Args:
        X: Feature frame; only its column names and dtypes are inspected.
        target_col: Accepted for call-site symmetry; not used here.

    Returns:
        A ``ColumnTransformer`` with a ``"num"`` and a ``"cat"`` branch.
    """
    numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

    # sklearn compatibility: the dense-output flag is `sparse_output` in
    # >=1.2 and `sparse` in older releases.
    shared_kwargs = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=False, **shared_kwargs)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **shared_kwargs)

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", encoder),
    ]

    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), numeric_features),
            ("cat", Pipeline(steps=categorical_steps), categorical_features),
        ]
    )

def build_model(input_dim: int) -> keras.Model:
    """Create and compile a small fully connected regression network.

    Architecture: input of width *input_dim*, two ReLU hidden layers with
    128 and 64 units, and a single linear output unit. The model is compiled
    with the Adam optimizer, mean-squared-error loss and MAE as a metric.
    """
    net = keras.Sequential()
    net.add(layers.Input(shape=(input_dim,)))
    for width in (128, 64):
        net.add(layers.Dense(width, activation="relu"))
    net.add(layers.Dense(1))

    net.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return net

# ==============================
#           MAIN
# ==============================
def main() -> None:
    """Run the full pipeline: load data, train the model, save artifacts.

    Steps:
      1. Load FOLDER1 and FOLDER2 and stack them over the union of columns.
      2. Drop id/date/time-like columns and columns that are entirely NaN.
      3. Pick the target column and drop rows where it is missing.
      4. Split 80/20, fit the preprocessor on the training split only, and
         transform both splits.
      5. Train the Keras model.
      6. Write the preprocess bundle (OUT_PKL), the model (OUT_H5), an
         optional YAML architecture dump (OUT_YAML) and run metadata
         (OUT_JSON).

    Raises:
        RuntimeError: If no rows were loaded from either folder.
    """
    print("[INFO] Memory-aware loading...")
    df1 = load_fitbit_folder(FOLDER1)
    df2 = load_fitbit_folder(FOLDER2)
    df  = _safe_union_concat([df1, df2])
    if df.empty:
        raise RuntimeError("No data loaded. Adjust INCLUDE_MINUTE_FILES / patterns or verify folders.")

    # Clean
    df = drop_obvious_ids_and_dates(df)
    # Drop entirely empty columns (after union)
    empty_cols = [c for c in df.columns if df[c].isna().all()]
    if empty_cols:
        df = df.drop(columns=empty_cols)

    # Target: rows without a target value cannot be used for supervised training.
    target_col = pick_target(df)
    df = df.dropna(subset=[target_col]).copy()

    # Prepare X/y
    y = df[target_col].astype("float32").values
    X = df.drop(columns=[target_col])

    # Build preprocess & transform.
    # The preprocessor is constructed from the full X only to derive the
    # numeric/categorical column lists; it is fitted on the training split alone.
    # NOTE(review): every non-numeric column that survived cleaning (e.g. the
    # "__source_file" tag added by the loader) is one-hot encoded here.
    pre = make_preprocessor(X, target_col)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )
    Xtr = pre.fit_transform(X_train)
    Xte = pre.transform(X_test)

    # Ensure float32 for Keras and reduce memory
    # (densify first if the transformer produced a sparse matrix).
    if hasattr(Xtr, "toarray"):
        Xtr = Xtr.toarray()
        Xte = Xte.toarray()
    Xtr = Xtr.astype("float32", copy=False)
    Xte = Xte.astype("float32", copy=False)

    print(f"[INFO] Feature matrix: train={Xtr.shape}, test={Xte.shape}")
    model = build_model(Xtr.shape[1])

    # Train.
    # The held-out test split doubles as the validation set, so the val_*
    # values recorded in the metadata below are measured on that split.
    hist = model.fit(
        Xtr, y_train,
        validation_data=(Xte, y_test),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=1
    )

    # ==========================
    #       SAVE ARTIFACTS
    # ==========================
    # Everything needed to preprocess new data the same way: the fitted
    # transformer plus the target name and the column lists (taken from the
    # full, pre-split X).
    bundle = {
        "preprocess": pre,
        "target_col": target_col,
        "numeric_cols": X.select_dtypes(include=[np.number]).columns.tolist(),
        "cat_cols": X.select_dtypes(exclude=[np.number]).columns.tolist(),
    }
    joblib.dump(bundle, OUT_PKL)
    print("[OK] Saved preprocess bundle ->", OUT_PKL)

    model.save(OUT_H5)
    print("[OK] Saved model (.h5) ->", OUT_H5)

    # YAML (optional): best effort only. If the model object has no
    # `to_yaml` method, the resulting error is caught and the export is
    # skipped with a warning instead of failing the run.
    try:
        yaml_txt = model.to_yaml()
        with open(OUT_YAML, "w", encoding="utf-8") as f:
            f.write(yaml_txt)
        print("[OK] Saved model (.yaml) ->", OUT_YAML)
    except Exception as e:
        print(f"[WARN] Skipped YAML export: {e}")

    # Run metadata: configuration, data sizes, last-epoch validation metrics
    # and library versions.
    meta = {
        "base_dir": BASE_DIR,
        "folders": [FOLDER1, FOLDER2],
        "include_minute_files": INCLUDE_MINUTE_FILES,
        "row_limit_per_csv": ROW_LIMIT_PER_CSV,
        "sample_frac_per_csv": SAMPLE_FRAC_PER_CSV,
        "target_col": target_col,
        "train_rows": int(len(y_train)),
        "test_rows": int(len(y_test)),
        "input_dim": int(Xtr.shape[1]),
        "final_val_mae": float(hist.history["val_mae"][-1]),
        "final_val_loss": float(hist.history["val_loss"][-1]),
        "sklearn_version": __import__("sklearn").__version__,
        "tensorflow_version": tf.__version__,
    }
    with open(OUT_JSON, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
    print("[OK] Saved metadata (.json) ->", OUT_JSON)

    print("\n[DONE] Artifacts in:", BASE_DIR)

# Run the pipeline only when executed as a script (or notebook cell),
# not when this module is imported.
if __name__ == "__main__":
    main()


[INFO] Memory-aware loading...
[INFO] Skipping 10 CSVs (likely minute/intraday): first 5 -> ['minuteStepsWide_merged.csv', 'minuteCaloriesNarrow_merged.csv', 'heartrate_seconds_merged.csv', 'minuteCaloriesWide_merged.csv', 'minuteIntensitiesNarrow_merged.csv']
[INFO] Loaded folder: C:\Users\sagni\Downloads\Med Assist\archive\mturkfitbit_export_4.12.16-5.12.16\Fitabase Data 4.12.16-5.12.16
[INFO] Combined shape: (48438, 30)
[INFO] Skipping 7 CSVs (likely minute/intraday): first 5 -> ['minuteCaloriesNarrow_merged.csv', 'heartrate_seconds_merged.csv', 'minuteIntensitiesNarrow_merged.csv', 'minuteSleep_merged.csv', 'hourlyIntensities_merged.csv']
[INFO] Loaded folder: C:\Users\sagni\Downloads\Med Assist\archive\mturkfitbit_export_3.12.16-4.11.16\Fitabase Data 3.12.16-4.11.16
[INFO] Combined shape: (48658, 25)
[INFO] Feature matrix: train=(38816, 1526), test=(9704, 1526)
Epoch 1/20
1213/1213 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 174836.2656 - mae: 135.34



[OK] Saved preprocess bundle -> C:\Users\sagni\Downloads\Med Assist\medassist_preprocess.pkl
[OK] Saved model (.h5) -> C:\Users\sagni\Downloads\Med Assist\medassist_model.h5
[WARN] Skipped YAML export: 'Sequential' object has no attribute 'to_yaml'
[OK] Saved metadata (.json) -> C:\Users\sagni\Downloads\Med Assist\medassist_metadata.json

[DONE] Artifacts in: C:\Users\sagni\Downloads\Med Assist
