In [1]:
import os
import warnings
import json
import numpy as np
import pandas as pd
import joblib

warnings.filterwarnings("ignore")

# ========= EDIT YOUR PATHS =========
# Root directory holding the raw data folders and the trained artifacts.
# This is a machine-specific absolute path; change it before running elsewhere.
BASE_DIR = r"C:\Users\sagni\Downloads\Med Assist"

# The two data folders scanned for CSV files by load_fitbit_folder().
FOLDER1  = os.path.join(BASE_DIR, r"archive\mturkfitbit_export_4.12.16-5.12.16\Fitabase Data 4.12.16-5.12.16")
FOLDER2  = os.path.join(BASE_DIR, r"archive\mturkfitbit_export_3.12.16-4.11.16\Fitabase Data 3.12.16-4.11.16")

# Inputs: joblib bundle (read for the keys "preprocess" and, optionally,
# "target_col") and the Keras model file.
PKL_PATH = os.path.join(BASE_DIR, "medassist_preprocess.pkl")
H5_PATH  = os.path.join(BASE_DIR, "medassist_model.h5")

# Outputs: per-row predictions (CSV) and the run summary / metrics (JSON).
OUT_PRED = os.path.join(BASE_DIR, "medassist_predictions.csv")
OUT_SUMM = os.path.join(BASE_DIR, "medassist_prediction_summary.json")

# ========= RUNTIME TUNING =========
# When False, files matching EXCLUDE_PATTERNS are rejected by _want_file().
# When True, the exclusion list is skipped and every CSV is accepted.
INCLUDE_MINUTE_FILES = False
# Passed as `nrows` to pandas.read_csv; a falsy value (None or 0) reads the whole file.
ROW_LIMIT_PER_CSV    = None          # or set to a number like 100000
# Fraction of rows kept when a file is sampled. Sampling only happens when
# ROW_LIMIT_PER_CSV is None and the file has at least BIG_FILE_MIN_ROWS rows.
SAMPLE_FRAC_PER_CSV  = 0.12
BIG_FILE_MIN_ROWS    = 200_000
# Seed used for every DataFrame.sample() call in this script.
RANDOM_STATE         = 42

# number of rows to score (RAM-safe); larger inputs are randomly sampled down to this
MAX_ROWS_FOR_PREDICT = 10_000
# number of rows densified and sent to model.predict() per batch
BATCH_SIZE           = 1024

# candidate ground-truth columns (if present we'll print metrics)
# Checked in order with an exact, case-sensitive column-name match; the first
# hit wins. The bundle's own "target_col", when present in the data, takes
# priority over this list (see pick_target()).
TARGET_CANDIDATES = [
    "Calories", "calories", "TotalCalories",
    "Calories Burned", "Calories_Burned", "target", "y"
]

# ========= FILE FILTERS =========
# Substrings matched against the lower-cased file name by _want_file().
# Both the pattern and the file name are lower-cased before comparison, so the
# capitalised duplicates below are redundant but harmless.
INCLUDE_PATTERNS = [
    "daily", "day", "sleep", "Sleep", "weight", "Weight",
    "summary", "calories", "Calories", "activities",
    "heart", "resting", "steps", "Steps", "distance", "Distance"
]
# Exclusions are checked before inclusions, so a file matching both lists is rejected.
EXCLUDE_PATTERNS = ["minute", "Minute", "seconds", "second", "intraday", "Intraday"]


# ========= HELPERS =========
def ensure_exists(p: str, name: str):
    """Fail fast when a required file or directory is missing.

    Args:
        p: Filesystem path to check.
        name: Human-readable label for the path, used in the error message.

    Raises:
        FileNotFoundError: If ``p`` does not exist.
    """
    if os.path.exists(p):
        return
    raise FileNotFoundError(f"{name} not found at: {p}")

def _downcast_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Shrink 64-bit numeric columns to the smallest dtype pandas will allow.

    float64 columns are processed first, then int64 columns. The frame is
    modified in place and the same object is returned.

    Args:
        df: Frame whose numeric columns should be downcast.

    Returns:
        The same ``df`` object, with float64/int64 columns downcast.
    """
    # (source dtype, `downcast=` argument for pandas.to_numeric)
    plan = (("float64", "float"), ("int64", "integer"))
    for source_dtype, downcast_to in plan:
        for col in df.select_dtypes(include=[source_dtype]).columns:
            df[col] = pd.to_numeric(df[col], downcast=downcast_to)
    return df

def _want_file(fname: str) -> bool:
    """Decide whether a CSV file name passes the include/exclude filters.

    Matching is a case-insensitive substring test. Exclusions are evaluated
    first and only when ``INCLUDE_MINUTE_FILES`` is False. A name that matches
    no include pattern is accepted only when ``INCLUDE_MINUTE_FILES`` is True.

    Args:
        fname: File name (not a full path) to test.

    Returns:
        True if the file should be loaded.
    """
    name = fname.lower()

    def _hits(patterns) -> bool:
        return any(pattern.lower() in name for pattern in patterns)

    if not INCLUDE_MINUTE_FILES and _hits(EXCLUDE_PATTERNS):
        return False
    if _hits(INCLUDE_PATTERNS):
        return True
    return INCLUDE_MINUTE_FILES

def _read_csv_safely(path: str, row_limit):
    """Read one CSV, returning an empty frame instead of raising on failure.

    Args:
        path: CSV file to read.
        row_limit: Maximum number of rows to read; any falsy value
            (``None`` or ``0``) reads the whole file.

    Returns:
        The downcast DataFrame, or an empty DataFrame if reading or
        downcasting failed (a warning is printed in that case).
    """
    read_kwargs = {"nrows": row_limit} if row_limit else {}
    try:
        frame = pd.read_csv(path, **read_kwargs)
        return _downcast_numeric(frame)
    except Exception as e:
        # Best effort by design: one unreadable file must not abort the whole load.
        print(f"[WARN] Failed to read {path}: {e}")
        return pd.DataFrame()

def _maybe_sample(df: pd.DataFrame, frac: float) -> pd.DataFrame:
    """Randomly subsample a frame, but only when it is considered big.

    Args:
        df: Frame to (possibly) subsample.
        frac: Fraction of rows to keep; ``None`` or a value >= 1.0 disables
            sampling.

    Returns:
        ``df`` itself when sampling is disabled, the frame is empty, or it has
        fewer than ``BIG_FILE_MIN_ROWS`` rows; otherwise a random sample drawn
        with ``RANDOM_STATE`` as the seed.
    """
    sampling_disabled = frac is None or frac >= 1.0 or len(df) == 0
    if sampling_disabled or len(df) < BIG_FILE_MIN_ROWS:
        return df
    return df.sample(frac=frac, random_state=RANDOM_STATE)

def _safe_union_concat(dfs):
    """Stack frames vertically over the union of their columns.

    Every frame is reindexed to the full column union (missing columns are
    filled with NaN) and the frames are concatenated with a fresh 0..n-1 index.

    The union keeps first-seen column order. Building it with a ``set`` would
    make the column order depend on string hashing, so the saved CSV layout
    could differ from one interpreter run to the next.

    Args:
        dfs: List of DataFrames to combine; may be empty.

    Returns:
        The combined DataFrame, or an empty DataFrame when ``dfs`` is empty.
    """
    if not dfs:
        return pd.DataFrame()
    # dict.fromkeys de-duplicates while preserving insertion order.
    ordered_cols = list(
        dict.fromkeys(col for frame in dfs for col in frame.columns)
    )
    aligned = [frame.reindex(columns=ordered_cols) for frame in dfs]
    return pd.concat(aligned, ignore_index=True)

def load_fitbit_folder(folder: str) -> pd.DataFrame:
    """Load every accepted CSV in ``folder`` into one combined DataFrame.

    Files are filtered with ``_want_file``, read with ``_read_csv_safely``,
    optionally subsampled (only when ``ROW_LIMIT_PER_CSV`` is None), tagged
    with a ``__source_file`` column holding the file name, and stacked over
    the union of their columns.

    The directory listing is sorted because ``os.listdir`` returns entries in
    arbitrary order. Without sorting, the row order of the result would depend
    on the filesystem, and so would any later seeded ``sample``.

    Args:
        folder: Directory to scan (non-recursive).

    Returns:
        The combined DataFrame, or an empty DataFrame when the folder is
        missing, nothing passes the filter, or every file failed to load.
    """
    if not os.path.isdir(folder):
        print(f"[WARN] Folder not found: {folder}")
        return pd.DataFrame()
    csvs = sorted(f for f in os.listdir(folder) if f.lower().endswith(".csv"))
    kept = [f for f in csvs if _want_file(f)]
    if not kept:
        print("[WARN] No CSVs selected to load after filtering.")
        return pd.DataFrame()

    dfs, total = [], 0
    for i, fname in enumerate(kept, 1):
        fpath = os.path.join(folder, fname)
        df = _read_csv_safely(fpath, ROW_LIMIT_PER_CSV)
        if df.empty:
            # Unreadable or empty file; a warning was already printed if it failed.
            continue
        if ROW_LIMIT_PER_CSV is None:
            df = _maybe_sample(df, SAMPLE_FRAC_PER_CSV)
        df["__source_file"] = fname
        dfs.append(df)
        total += len(df)
        if len(dfs) >= 8:  # keep memory in check
            # Collapse pending frames into one so at most 8 are held at a time.
            dfs = [_safe_union_concat(dfs)]
        if i % 10 == 0:
            print(f"[INFO] Loaded ~{i} files; rows so far: {total:,}")
    out = _safe_union_concat(dfs)
    print(f"[INFO] Loaded {folder} -> shape={out.shape}")
    return out

def drop_ids_dates(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without identifier-like and date/time-like columns.

    A column is dropped when its lower-cased name contains any of the
    substrings "id", "date", "time" or "datetime". This is a plain substring
    test, so names such as "Width" or "TotalTimeInBed" are dropped too.

    Args:
        df: Frame to filter; it is not modified.

    Returns:
        A new DataFrame with the matching columns removed.
    """
    markers = ("id", "date", "time", "datetime")
    doomed = [
        col for col in df.columns
        if any(marker in col.lower() for marker in markers)
    ]
    return df.drop(columns=doomed, errors="ignore")

def pick_target(df: pd.DataFrame, bundle_target: str | None) -> str | None:
    if bundle_target and bundle_target in df.columns:
        return bundle_target
    for c in TARGET_CANDIDATES:
        if c in df.columns:
            return c
    # no target found is OK (pure inference)
    return None

# RAM-safe batch predict from (possibly sparse) matrix
def sparse_batch_predict(model, X, batch_size=1024, dtype=np.float32):
    """Score ``X`` in row batches, densifying only one batch at a time.

    Args:
        model: Object exposing ``predict(array, verbose=0)``; each call's
            result is flattened with ``ravel()``, so it must yield exactly one
            value per input row.
        X: Row-sliceable 2-D matrix. Batches that expose ``toarray()`` (e.g.
            SciPy sparse matrices) are converted to dense arrays first.
        batch_size: Number of rows per ``predict`` call.
        dtype: dtype the batch is cast to before it is handed to the model.

    Returns:
        1-D float32 array with one prediction per row of ``X``.
    """
    total = X.shape[0]
    preds = np.empty(total, dtype=np.float32)
    for lo in range(0, total, batch_size):
        hi = min(lo + batch_size, total)
        chunk = X[lo:hi]
        dense = (
            chunk.toarray().astype(dtype, copy=False)  # sparse batch
            if hasattr(chunk, "toarray")
            else np.asarray(chunk, dtype=dtype)
        )
        preds[lo:hi] = model.predict(dense, verbose=0).ravel().astype(np.float32)
    return preds

# Force all OneHotEncoders to be sparse_output=True (and sparse=True if present)
def force_sparse_onehot(preprocessor):
    """Switch every OneHotEncoder inside ``preprocessor`` to sparse output.

    Walks nested ``Pipeline`` and ``ColumnTransformer`` objects and sets
    ``sparse_output = True`` (and the legacy ``sparse = True`` where that
    attribute exists) on each ``OneHotEncoder`` found.

    For a ``ColumnTransformer`` both the ``transformers`` specification and,
    when present, the fitted ``transformers_`` list are visited. A fitted
    ColumnTransformer keeps its fitted estimators in ``transformers_``, so
    walking only ``transformers`` would leave the encoders that ``transform``
    actually uses untouched.

    NOTE(review): a ColumnTransformer may still densify its stacked output
    depending on its fitted state / ``sparse_threshold``. Confirm that
    ``preprocessor.transform`` really returns a sparse matrix if memory matters.

    Args:
        preprocessor: Estimator (possibly nested) to patch in place.

    Returns:
        The same ``preprocessor`` object.
    """
    from itertools import chain

    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    def _patch_ohe(ohe):
        # `sparse_output` is the current name, `sparse` the legacy one; set
        # whichever the installed scikit-learn version exposes.
        for attr in ("sparse_output", "sparse"):
            if not hasattr(ohe, attr):
                continue
            try:
                setattr(ohe, attr, True)
            except Exception as exc:
                # Still best effort, but no longer silent.
                print(f"[WARN] Could not set {attr}=True on {type(ohe).__name__}: {exc}")
        return ohe

    def _recurse(est):
        if isinstance(est, Pipeline):
            for _, step in est.steps:
                _recurse(step)
        elif isinstance(est, ColumnTransformer):
            # `transformers_` only exists after fit; fall back to nothing extra.
            fitted = getattr(est, "transformers_", [])
            for _, trans, _ in chain(est.transformers, fitted):
                # Entries may be the strings "drop"/"passthrough"; those fall
                # through the checks below without effect.
                _recurse(trans)
        elif isinstance(est, OneHotEncoder) or est.__class__.__name__ == "OneHotEncoder":
            _patch_ohe(est)

    _recurse(preprocessor)
    return preprocessor


# ========= MAIN: PREDICT & SHOW =========
def _regression_metrics(y_true, y_pred) -> dict:
    """Compute MAE, MSE, RMSE and R2 for two equal-length, non-empty arrays.

    Sums are accumulated in float64 so that squared errors do not lose
    precision when the inputs are float32. R2 is NaN when it is undefined
    (fewer than two samples, or zero variance in ``y_true``).
    """
    truth = np.asarray(y_true, dtype=np.float64)
    diff = np.asarray(y_pred, dtype=np.float64) - truth
    sq_err = diff ** 2

    mse = float(np.mean(sq_err))
    ss_res = float(np.sum(sq_err))
    ss_tot = float(np.sum((truth - truth.mean()) ** 2)) if truth.size > 1 else float("nan")
    # NaN never compares equal to anything, so test finiteness explicitly
    # rather than checking membership in a tuple containing NaN.
    r2 = 1.0 - ss_res / ss_tot if np.isfinite(ss_tot) and ss_tot > 0.0 else float("nan")

    return {
        "mae": float(np.mean(np.abs(diff))),
        "mse": mse,
        "rmse": float(np.sqrt(mse)),
        "r2": float(r2),
    }


def main():
    """Score the Fitbit CSV data with the saved preprocessor and Keras model.

    Steps: verify the artifacts exist, load the joblib bundle and patch its
    encoders to sparse output, load and combine both data folders, drop
    id/date/time columns, cap the row count, transform, predict in batches,
    print a preview, and write the predictions CSV and the summary JSON.
    When a ground-truth column is present, rows with a missing target are
    dropped and regression metrics are added to the summary and printed.

    Raises:
        FileNotFoundError: If the preprocess bundle or the model file is missing.
    """
    # sanity checks
    ensure_exists(PKL_PATH, "preprocess PKL")
    ensure_exists(H5_PATH,  "model H5")

    # load bundle + preprocessor
    # NOTE(review): joblib.load unpickles arbitrary objects; only load bundles
    # from a trusted source.
    bundle = joblib.load(PKL_PATH)
    pre = bundle["preprocess"]
    bundle_target = bundle.get("target_col", None)

    # force OHE sparse output to avoid giant dense matrices
    pre = force_sparse_onehot(pre)

    # load data from both folders
    d1 = load_fitbit_folder(FOLDER1)
    d2 = load_fitbit_folder(FOLDER2)
    df = _safe_union_concat([d1, d2])

    if df.empty:
        print("[ERROR] No rows to predict on.")
        return

    # keep only useful columns and limit rows (RAM-safe)
    df = drop_ids_dates(df)
    if len(df) > MAX_ROWS_FOR_PREDICT:
        df = df.sample(n=MAX_ROWS_FOR_PREDICT, random_state=RANDOM_STATE).reset_index(drop=True)

    # detect target if present; rows without a target value are not scored
    target_col = pick_target(df, bundle_target)
    has_target = bool(target_col) and target_col in df.columns
    if has_target:
        df_infer = df.dropna(subset=[target_col]).copy()
    else:
        df_infer = df.copy()

    # Dropping rows with a missing target can leave nothing to score;
    # transforming / predicting on an empty frame would fail further down.
    if df_infer.empty:
        print("[ERROR] No rows left to predict on after dropping rows with a missing target.")
        return

    # build X / y (if available)
    y_true = None
    if has_target:
        y_true = df_infer[target_col].astype("float32").values
        X = df_infer.drop(columns=[target_col])
    else:
        X = df_infer

    # transform features (intended to stay sparse)
    Xp = pre.transform(X)

    # load Keras model without compiling (avoids 'mse' deserialization mismatch)
    from tensorflow import keras
    model = keras.models.load_model(H5_PATH, compile=False)

    # batched predictions
    y_pred = sparse_batch_predict(model, Xp, batch_size=BATCH_SIZE, dtype=np.float32)

    # assemble output frame (y_pred is assigned positionally, row for row)
    out = df_infer.copy()
    out["prediction"] = y_pred

    # if ground truth exists, compute quick metrics
    summary = {"rows_scored": int(len(out))}
    if y_true is not None and len(y_true) == len(y_pred):
        summary.update(_regression_metrics(y_true, y_pred))

    # show a small preview in console
    # Prefer a couple of common signal columns if present
    show_cols = [
        c for c in ["Steps", "Distance", "Calories", "HeartRate", "SleepMinutes"]
        if c in out.columns
    ]
    # Always include prediction (and target if exists)
    if target_col and target_col in out.columns:
        show_cols = [target_col] + show_cols
    show_cols = list(dict.fromkeys(show_cols))  # dedupe
    if "prediction" not in show_cols:
        show_cols.append("prediction")

    print("\n=== Prediction Preview (first 10 rows) ===")
    print(out[show_cols].head(10).to_string(index=False))

    # save outputs
    out.to_csv(OUT_PRED, index=False)
    # NaN/Infinity are not valid JSON; store undefined metrics as null so the
    # summary file can be read by any JSON parser.
    json_summary = {
        key: (None if isinstance(value, float) and not np.isfinite(value) else value)
        for key, value in summary.items()
    }
    with open(OUT_SUMM, "w", encoding="utf-8") as f:
        json.dump(json_summary, f, indent=2, allow_nan=False)

    print(f"\n[OK] Saved predictions -> {OUT_PRED}")
    print(f"[OK] Saved summary     -> {OUT_SUMM}")
    if "mae" in summary:
        print("\nQuick metrics on scored rows:")
        for k in ["mae", "rmse", "r2"]:
            print(f"  {k.upper():>4}: {summary[k]:,.4f}")


if __name__ == "__main__":
    main()


[INFO] Loaded C:\Users\sagni\Downloads\Med Assist\archive\mturkfitbit_export_4.12.16-5.12.16\Fitabase Data 4.12.16-5.12.16 -> shape=(48438, 30)
[INFO] Loaded C:\Users\sagni\Downloads\Med Assist\archive\mturkfitbit_export_3.12.16-4.11.16\Fitabase Data 3.12.16-4.11.16 -> shape=(48658, 25)

=== Prediction Preview (first 10 rows) ===
 Calories  prediction
     55.0   74.901382
    137.0  135.074036
    184.0  102.015648
     56.0  104.622215
    103.0  124.953102
     82.0  119.581207
     84.0  116.181900
     82.0  124.672668
     83.0   75.677811
     46.0  101.693863

[OK] Saved predictions -> C:\Users\sagni\Downloads\Med Assist\medassist_predictions.csv
[OK] Saved summary     -> C:\Users\sagni\Downloads\Med Assist\medassist_prediction_summary.json

Quick metrics on scored rows:
   MAE: 52.3179
  RMSE: 126.0931
    R2: 0.9367
