In [1]:
# === predict_mindbreath.py ===
# MindBreath: Prediction script (CLI + Jupyter safe), CSV or single-text.

import os, sys, argparse
import numpy as np
import pandas as pd
import joblib

from sklearn.base import BaseEstimator, TransformerMixin
from tensorflow.keras.models import load_model
import tensorflow as tf

# -----------------------------
# Paths
# -----------------------------
# Directory that holds the trained artifacts and receives the default output.
# Set the MINDBREATH_DIR environment variable to point somewhere else without
# editing this file; when it is unset (or empty) the original location is used.
OUT_DIR     = os.environ.get("MINDBREATH_DIR") or r"C:\Users\sagni\Downloads\Mind Breath"
# Pickled bundle read with joblib (preprocess pipeline, label encoder, column lists).
PKL_PATH    = os.path.join(OUT_DIR, "mindbreath_preprocess.pkl")
# Saved Keras model read with load_model().
H5_PATH     = os.path.join(OUT_DIR, "mindbreath_model.h5")
# Where predictions are written when no --out path is given.
DEFAULT_OUT = os.path.join(OUT_DIR, "predictions_mindbreath.csv")

# -----------------------------
# Helper classes (must match training for safe unpickling)
# -----------------------------
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps a single named column of its input.

    The class name and the ``column`` attribute are kept as-is: per the
    section note in this file, these helpers must match the training-time
    definitions for the pickled bundle to load.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Learn nothing; return the instance unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` indexed with a one-item list holding ``self.column``.

        For a pandas DataFrame this list-style selection yields a one-column
        frame rather than a Series.
        """
        wanted = [self.column]
        return X[wanted]

class To1DString(BaseEstimator, TransformerMixin):
    """Transformer that turns its input into a flat array of strings."""

    def fit(self, X, y=None):
        """Learn nothing; return the instance unchanged."""
        return self

    def transform(self, X):
        """Return string values as a 1-D array.

        A DataFrame contributes only its first column; anything else is
        converted with ``np.asarray``, cast to ``str`` and flattened.
        """
        if not isinstance(X, pd.DataFrame):
            as_text = np.asarray(X).astype(str)
            return as_text.ravel()
        first_column = X.iloc[:, 0]
        return first_column.astype(str).values

class DateTimeExpand(BaseEstimator, TransformerMixin):
    """Expand datetime-like columns into integer year/month/day/dow/hour parts.

    Each configured column ``c`` produces ``c_year``, ``c_month``, ``c_day``,
    ``c_dow`` (taken from ``.dt.dayofweek``) and ``c_hour``.
    """

    def __init__(self, columns):
        self.columns = columns
        self.out_cols = []

    def fit(self, X, y=None):
        """Record the output column names in ``out_cols``; nothing is learned from ``X``."""
        suffixes = ("year", "month", "day", "dow", "hour")
        self.out_cols = [f"{col}_{suffix}" for col in self.columns for suffix in suffixes]
        return self

    def transform(self, X):
        """Return the expanded parts for every configured column.

        Values are parsed with ``errors="coerce"``, so unparseable entries
        become missing and are then filled with 0 before the integer cast.
        With no configured columns an empty ``(len(X), 0)`` array is returned.
        """
        # (output suffix, name of the .dt accessor attribute to read)
        fields = (
            ("year", "year"),
            ("month", "month"),
            ("day", "day"),
            ("dow", "dayofweek"),
            ("hour", "hour"),
        )
        frames = []
        for col in self.columns:
            parsed = pd.to_datetime(X[col], errors="coerce")
            parts = {}
            for suffix, attr in fields:
                parts[f"{col}_{suffix}"] = getattr(parsed.dt, attr).fillna(0).astype(int)
            frames.append(pd.DataFrame(parts))
        if not frames:
            return np.empty((len(X), 0))
        return pd.concat(frames, axis=1)

# -----------------------------
# Utilities
# -----------------------------
def ensure_dense_if_small(X, max_feats=50000):
    """Densify ``X`` when it offers ``toarray()`` and is narrow enough.

    Args:
        X: Feature matrix; only objects exposing ``toarray`` are converted.
        max_feats: Largest ``X.shape[1]`` for which conversion is performed.

    Returns:
        ``X.toarray()`` when both conditions hold, otherwise ``X`` itself.
    """
    can_densify = hasattr(X, "toarray")
    # shape is only consulted for objects that can actually be densified
    if not can_densify or X.shape[1] > max_feats:
        return X
    return X.toarray()

def compile_loaded_model(model, n_classes: int):
    """Compile ``model`` in place with a loss chosen from the class count.

    Uses ``binary_crossentropy`` for two or fewer classes and
    ``sparse_categorical_crossentropy`` otherwise, always with the ``adam``
    optimizer and an ``accuracy`` metric. Per the original author's note this
    is not needed for ``predict()`` and exists to silence TF warnings.
    """
    if n_classes > 2:
        loss_name = "sparse_categorical_crossentropy"
    else:
        loss_name = "binary_crossentropy"
    model.compile(optimizer="adam", loss=loss_name, metrics=["accuracy"])

def ensure_schema(df_in: pd.DataFrame, bundle: dict) -> pd.DataFrame:
    """
    Return a copy of ``df_in`` that contains every column named in the bundle.

    The bundle's ``numeric_cols``, ``cat_cols``, ``text_cols`` and
    ``datetime_cols`` lists are read; any listed column missing from the
    frame is added with a per-group default (0, "unknown", "" and the current
    timestamp respectively). A group that is absent or stored as ``None`` is
    treated as empty.

    Listed columns are moved to the front in group order, each appearing once
    even if it is named in several groups; all remaining input columns follow
    in their original order. Extra columns are kept (presumably the
    downstream preprocessor ignores columns it does not reference).

    Args:
        df_in: Incoming data; it is not modified.
        bundle: Mapping that may hold the four column-list keys above.

    Returns:
        The aligned copy, or a plain copy when the bundle lists no columns.
    """
    df = df_in.copy()

    # One timestamp for the whole call so all filled datetime columns agree.
    group_defaults = (
        ("numeric_cols", 0),
        ("cat_cols", "unknown"),
        ("text_cols", ""),
        ("datetime_cols", pd.Timestamp.today()),
    )

    preferred = []
    for key, default in group_defaults:
        # `or ()` also covers a bundle that stores None for an unused group.
        for col in bundle.get(key) or ():
            if col not in df.columns:
                df[col] = default
            preferred.append(col)

    if not preferred:
        return df

    # Every preferred column now exists in df. De-duplicate while keeping
    # first-seen order so a column listed in two groups is not selected twice.
    front = list(dict.fromkeys(preferred))
    front_set = set(front)
    tail = [c for c in df.columns if c not in front_set]
    return df[front + tail]

def build_df_for_text(text: str, bundle: dict) -> pd.DataFrame:
    """
    Build a one-row DataFrame for a single piece of text.

    The column set comes from running ``ensure_schema`` on an empty frame.
    Each column gets a default by group (0 for numeric, "unknown" for
    categorical, the current timestamp for datetime, "" otherwise). The text
    is then written into the first entry of ``text_cols``; if the bundle
    lists no text columns, a new ``text`` column is added instead.
    """
    numeric_cols = bundle.get("numeric_cols", [])
    categorical_cols = bundle.get("cat_cols", [])
    datetime_cols = bundle.get("datetime_cols", [])

    def _default_for(col):
        # Same precedence as the group checks: numeric, categorical, datetime.
        if col in numeric_cols:
            return 0
        if col in categorical_cols:
            return "unknown"
        if col in datetime_cols:
            return pd.Timestamp.today()
        return ""

    # The zero-row template only supplies the column names and their order.
    template = ensure_schema(pd.DataFrame(), bundle)
    defaults = {col: _default_for(col) for col in template.columns}
    frame = pd.DataFrame([defaults])

    text_cols = bundle.get("text_cols", [])
    if not text_cols:
        frame["text"] = text
        return frame
    frame.loc[0, text_cols[0]] = text
    return frame

def predict_dataframe(df_in: pd.DataFrame, bundle: dict, model, out_csv: str = None, print_out: bool = False) -> pd.DataFrame:
    """
    Run the preprocess pipeline and model over ``df_in`` and attach predictions.

    Args:
        df_in: Input rows; not modified.
        bundle: Must provide ``"preprocess"`` (object with ``transform``) and
            ``"label_encoder"`` (object with ``classes_`` and
            ``inverse_transform``); also passed to ``ensure_schema``.
        model: Object whose ``predict(X, verbose=0)`` returns an array of
            probabilities, either one column / 1-D (treated as the positive
            class probability) or one column per class.
        out_csv: If truthy, the result is written there as UTF-8 CSV.
        print_out: If true, print the label and probability columns for the
            first 20 rows.

    Returns:
        A copy of ``df_in`` with ``pred_label`` as the first column and the
        probability columns appended (``prob_neg``/``prob_pos`` for a
        two-class encoder, ``prob_<class>`` otherwise).
    """
    preprocess    = bundle["preprocess"]
    label_encoder = bundle["label_encoder"]
    classes       = [str(c) for c in label_encoder.classes_]
    n_classes     = len(classes)

    df_aligned = ensure_schema(df_in, bundle)
    X = ensure_dense_if_small(preprocess.transform(df_aligned))

    probs = model.predict(X, verbose=0)

    # A single output column is read as P(positive); build the two-column
    # [P(neg), P(pos)] matrix and threshold at 0.5.
    if probs.ndim == 1 or probs.shape[1] == 1:
        pos = probs.ravel()
        prob_mat = np.vstack([1.0 - pos, pos]).T
        pred_idx = (pos >= 0.5).astype(int)
    else:
        prob_mat = probs
        pred_idx = np.argmax(prob_mat, axis=1)

    pred_labels = label_encoder.inverse_transform(pred_idx)

    out = df_in.copy()
    # DataFrame.insert raises ValueError if the column already exists, which
    # happens when a previous predictions file is fed back in as input.
    # Replace the stale column instead of failing.
    if "pred_label" in out.columns:
        out = out.drop(columns="pred_label")
    out.insert(0, "pred_label", pred_labels)

    # Probability columns produced by this call, in output order.
    if prob_mat.shape[1] == 2 and n_classes == 2:
        prob_cols = {"prob_neg": prob_mat[:, 0], "prob_pos": prob_mat[:, 1]}
    else:
        # Align prob columns with label order (truncate/min-match just in case)
        k = min(prob_mat.shape[1], n_classes)
        prob_cols = {f"prob_{classes[j]}": prob_mat[:, j] for j in range(k)}
    for name, values in prob_cols.items():
        out[name] = values

    if out_csv:
        out.to_csv(out_csv, index=False, encoding="utf-8")
        print(f"[SAVE] Predictions -> {out_csv}")

    if print_out:
        # Show only what this call produced, not unrelated input columns that
        # merely happen to start with "prob_".
        cols = ["pred_label", *prob_cols]
        print(out[cols].head(20).to_string(index=False))

    return out

# -----------------------------
# CLI / Entry
# -----------------------------
def parse_args(argv=None):
    """Parse the predictor's command-line options.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Namespace with ``in_csv``, ``out_csv``, ``single_text`` (each a string
        or ``None``) and the boolean ``do_print``.
    """
    parser = argparse.ArgumentParser(description="MindBreath Predictor")

    # (flag, destination attribute, help text) for the optional string options
    string_options = (
        ("--in", "in_csv", "Input CSV (same schema as training)"),
        ("--out", "out_csv", "Output predictions CSV path"),
        ("--text", "single_text", "Single text to predict (if text columns existed)"),
    )
    for flag, dest, help_text in string_options:
        parser.add_argument(flag, dest=dest, type=str, default=None, help=help_text)

    parser.add_argument("--print", dest="do_print", action="store_true", help="Print predictions table")
    return parser.parse_args(argv)

def main(argv=None):
    """
    Command-line / notebook entry point.

    Parses options, falls back to interactive prompts when neither ``--text``
    nor ``--in`` was given, loads the preprocess bundle and model from
    ``PKL_PATH`` / ``H5_PATH``, and writes predictions to ``--out`` or
    ``DEFAULT_OUT``.

    Args:
        argv: Argument list; ``None`` means use ``sys.argv`` unless a Jupyter
            kernel launch is detected, in which case no arguments are parsed.

    Raises:
        ValueError: No text and no CSV path were supplied.
        FileNotFoundError: The input CSV, bundle or model file is missing.
    """
    # Make Jupyter safe: strip injected args like "-f kernel-*.json"
    if argv is None:
        launcher = sys.argv[0] if sys.argv else ""
        if "ipykernel_launcher" in launcher or "-f" in sys.argv:
            argv = []

    args = parse_args(argv)

    # Interactive fallback when no args provided
    if args.single_text is None and args.in_csv is None:
        try:
            s = input("Enter a sentence to predict (or leave blank to use a CSV): ").strip()
        except EOFError:
            s = ""
        if s:
            args.single_text = s
        else:
            # Treat a closed stdin like an empty answer, same as the first prompt,
            # so non-interactive runs get the usage error instead of a bare EOFError.
            try:
                csv_path = input("Enter path to input CSV (or press Enter to cancel): ").strip()
            except EOFError:
                csv_path = ""
            if not csv_path:
                raise ValueError("Provide either --text 'some sentence' or --in path\\to\\file.csv")
            args.in_csv = csv_path

    # Check all required files up front, before any artifact is loaded.
    # When --text is given it takes precedence and the CSV path is not used.
    if args.single_text is None and not os.path.exists(args.in_csv):
        raise FileNotFoundError(f"Input CSV not found: {args.in_csv}")
    if not os.path.exists(PKL_PATH):
        raise FileNotFoundError(f"Missing preprocess bundle: {PKL_PATH}")
    if not os.path.exists(H5_PATH):
        raise FileNotFoundError(f"Missing model file: {H5_PATH}")

    # NOTE: joblib.load unpickles arbitrary objects; only load a bundle that
    # comes from a trusted source.
    bundle = joblib.load(PKL_PATH)
    model  = load_model(H5_PATH)
    compile_loaded_model(model, len(bundle["label_encoder"].classes_))

    # Build input dataframe
    if args.single_text is not None:
        df_in = build_df_for_text(args.single_text, bundle)
    else:
        df_in = pd.read_csv(args.in_csv)

    out_csv = args.out_csv or DEFAULT_OUT
    predict_dataframe(df_in, bundle, model, out_csv=out_csv, print_out=args.do_print)

# -----------------------------
# Optional notebook helpers
# -----------------------------
def predict_text(text: str, save_to: str = None, show=True):
    """Notebook helper: predict for a single piece of text.

    Loads the bundle and model from ``PKL_PATH`` / ``H5_PATH`` on every call,
    builds a one-row frame with ``build_df_for_text`` and returns the result
    of ``predict_dataframe``.

    Args:
        text: The text to classify.
        save_to: Optional CSV path forwarded as ``out_csv``.
        show: Forwarded as ``print_out``.
    """
    artifacts = joblib.load(PKL_PATH)
    net = load_model(H5_PATH)
    class_count = len(artifacts["label_encoder"].classes_)
    compile_loaded_model(net, class_count)

    single_row = build_df_for_text(text, artifacts)
    return predict_dataframe(single_row, artifacts, net, out_csv=save_to, print_out=show)

def predict_csv(csv_path: str, save_to: str = None, show=True):
    """Notebook helper: predict for every row of a CSV file.

    Loads the bundle and model from ``PKL_PATH`` / ``H5_PATH`` on every call,
    reads ``csv_path`` with ``pd.read_csv`` and returns the result of
    ``predict_dataframe``.

    Args:
        csv_path: Path of the input CSV.
        save_to: Optional CSV path forwarded as ``out_csv``.
        show: Forwarded as ``print_out``.
    """
    artifacts = joblib.load(PKL_PATH)
    net = load_model(H5_PATH)
    class_count = len(artifacts["label_encoder"].classes_)
    compile_loaded_model(net, class_count)

    # The CSV is read only after the artifacts have loaded successfully.
    rows = pd.read_csv(csv_path)
    return predict_dataframe(rows, artifacts, net, out_csv=save_to, print_out=show)

# -----------------------------
# Entry point: call main() only when this file is executed directly, so that
# importing it (e.g. to use predict_text / predict_csv) has no side effects.
if __name__ == "__main__":
    main()


Enter a sentence to predict (or leave blank to use a CSV):  
Enter path to input CSV (or press Enter to cancel):  C:\Users\sagni\Downloads\Mind Breath\archive\Stress_Dataset.csv




[SAVE] Predictions -> C:\Users\sagni\Downloads\Mind Breath\predictions_mindbreath.csv
