In [1]:
# === predict_mood.py ===
# Predict mood/emotion labels on new data using previously trained MindPal artifacts.
# Outputs: predictions_mood.csv with columns:
#   id, top1_label, top1_prob, [prob_<class_1>, prob_<class_2>, ...]
#
# Usage:
#   python predict_mood.py --in "C:\Users\sagni\Downloads\Mind Pal\archive\new_data.csv" --out "C:\Users\sagni\Downloads\Mind Pal\predictions_mood.csv"
#   python predict_mood.py --text "I felt calm and focused during study."
#
# Notes:
# - The input CSV should have the SAME feature columns as training (except the target).
# - If you use --text, we’ll build a single-row DataFrame and put the text into the first text column
#   we detect from the preprocess bundle; the rest will be left missing (imputers handle it).

import os
import argparse
import json
import numpy as np
import pandas as pd
import joblib

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import load_model

# -----------------------------
# Default artifact paths
# -----------------------------
OUT_DIR   = r"C:\Users\sagni\Downloads\Mind Pal"
PKL_PATH  = os.path.join(OUT_DIR, "mindpal_preprocess.pkl")
H5_PATH   = os.path.join(OUT_DIR, "mindpal_model.h5")

# -----------------------------
# Helper classes (must match training script)
# -----------------------------
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Select a single column as a 2D DataFrame."""
    def __init__(self, column):
        self.column = column
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.column]]

class To1DString(BaseEstimator, TransformerMixin):
    """Convert a 2D array/DataFrame (n,1) to 1D array[str] for text vectorizers."""
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            arr = X.iloc[:, 0].astype(str).values
        else:
            arr = np.asarray(X).astype(str).ravel()
        return arr

class DateTimeExpand(BaseEstimator, TransformerMixin):
    """Expand datetime columns into year/month/day/dow numeric features."""
    def __init__(self, columns):
        self.columns = columns
        self.out_cols = []
    def fit(self, X, y=None):
        self.out_cols = []
        for c in self.columns:
            self.out_cols += [f"{c}_year", f"{c}_month", f"{c}_day", f"{c}_dow"]
        return self
    def transform(self, X):
        outs = []
        for c in self.columns:
            s = pd.to_datetime(X[c], errors="coerce")
            outs.append(pd.DataFrame({
                f"{c}_year":  s.dt.year.fillna(0).astype(int),
                f"{c}_month": s.dt.month.fillna(0).astype(int),
                f"{c}_day":   s.dt.day.fillna(0).astype(int),
                f"{c}_dow":   s.dt.dayofweek.fillna(0).astype(int),
            }))
        return pd.concat(outs, axis=1) if outs else np.empty((len(X), 0))

# -----------------------------
# Utility
# -----------------------------
def load_bundle_and_model(pkl_path: str, h5_path: str):
    if not os.path.exists(pkl_path):
        raise FileNotFoundError(f"Missing preprocess bundle: {pkl_path}")
    if not os.path.exists(h5_path):
        raise FileNotFoundError(f"Missing model file: {h5_path}")

    bundle = joblib.load(pkl_path)
    model  = load_model(h5_path)
    return bundle, model

def build_df_for_text(single_text: str, bundle: dict) -> pd.DataFrame:
    """
    Create a single-row DataFrame compatible with training columns.
    Will place the provided text into the FIRST available text column.
    """
    # Try to reconstruct the full input schema:
    numeric_cols   = bundle.get("numeric_cols", [])
    cat_cols       = bundle.get("cat_cols", [])
    text_cols      = bundle.get("text_cols", [])
    datetime_cols  = bundle.get("datetime_cols", [])

    all_cols = list(dict.fromkeys(numeric_cols + cat_cols + text_cols + datetime_cols))

    if not all_cols:
        # Fallback: single 'text' column
        all_cols = ["text"]

    df = pd.DataFrame({c: [np.nan] for c in all_cols})

    # Place text into the first text column (or 'text' if that was created)
    if text_cols:
        df[text_cols[0]] = single_text
    else:
        # Ensure a 'text' column exists
        if "text" not in df.columns:
            df["text"] = [single_text]
        else:
            df.loc[0, "text"] = single_text

    return df

def ensure_dense_if_small(X):
    if hasattr(X, "toarray"):
        # Heuristic threshold to avoid RAM blow-ups
        return X.toarray() if X.shape[1] <= 50000 else X
    return X

def predict_dataframe(df_in: pd.DataFrame, bundle: dict, model) -> pd.DataFrame:
    preprocess     = bundle["preprocess"]
    label_encoder  = bundle["label_encoder"]
    target_col     = bundle.get("target_col", None)

    X = df_in.copy()
    if target_col and target_col in X.columns:
        X = X.drop(columns=[target_col])

    X_t = preprocess.transform(X)
    X_t = ensure_dense_if_small(X_t)

    probs = model.predict(X_t, verbose=0)
    classes = [str(c) for c in label_encoder.classes_]

    if probs.ndim == 1 or probs.shape[1] == 1:
        # Binary case: probs is (N,1); construct 2-class distribution (class_1 is "positive")
        pos = probs.ravel()
        neg = 1.0 - pos
        probs_mat = np.vstack([neg, pos]).T
        # label_encoder.classes_ defines order; assume positive prob aligns with the second class
        top_idx = (pos >= 0.5).astype(int)
    else:
        probs_mat = probs
        top_idx = np.argmax(probs_mat, axis=1)

    top_label = [classes[i] for i in top_idx]
    top_prob  = probs_mat[np.arange(len(probs_mat)), top_idx]

    # Build output frame
    out = pd.DataFrame({
        "id": np.arange(len(df_in)),
        "top1_label": top_label,
        "top1_prob": top_prob
    })

    # Add per-class probability columns
    for j, cls in enumerate(classes):
        out[f"prob_{cls}"] = probs_mat[:, j]

    return out

# -----------------------------
# Main
# -----------------------------
def main():
    parser = argparse.ArgumentParser(description="Predict mood/emotion labels with MindPal model.")
    parser.add_argument("--in",  dest="in_csv",  type=str, default=None,
                        help="Path to input CSV with same columns as training (except target).")
    parser.add_argument("--out", dest="out_csv", type=str, default=None,
                        help="Where to save predictions CSV. Defaults to 'predictions_mood.csv' in OUT_DIR.")
    parser.add_argument("--text", dest="single_text", type=str, default=None,
                        help="Predict on a single text string (optional).")
    parser.add_argument("--print", dest="do_print", action="store_true",
                        help="Print top predictions to console.")
    args = parser.parse_args()

    bundle, model = load_bundle_and_model(PKL_PATH, H5_PATH)

    # Build input DataFrame
    if args.single_text is not None:
        df_in = build_df_for_text(args.single_text, bundle)
    elif args.in_csv is not None:
        if not os.path.exists(args.in_csv):
            raise FileNotFoundError(f"Input CSV not found: {args.in_csv}")
        df_in = pd.read_csv(args.in_csv)
    else:
        raise ValueError("Provide either --text 'your sentence' or --in path\\to\\file.csv")

    preds = predict_dataframe(df_in, bundle, model)

    # Default output path
    out_csv = args.out_csv or os.path.join(OUT_DIR, "predictions_mood.csv")
    preds.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"[SAVE] Predictions -> {out_csv}")

    if args.do_print:
        # Print first few rows
        cols_to_show = ["id", "top1_label", "top1_prob"] + \
            [c for c in preds.columns if c.startswith("prob_")]
        print(preds[cols_to_show].head(10).to_string(index=False))

if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--in IN_CSV] [--out OUT_CSV] [--text SINGLE_TEXT] [--print]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\sagni\AppData\Roaming\jupyter\runtime\kernel-df4e8e00-2b12-430b-9490-6a2edfb25a2e.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
