In [2]:
# predict.py
import os
import re
import json
import argparse
import warnings
import joblib
import numpy as np
import pandas as pd
from typing import Any, Dict, List, Optional, Tuple, Union

# Silence TF/keras info logs if desired
# setdefault only writes the variable when it is not already present, so a
# value exported by the caller wins. It is placed ahead of the Keras import
# attempts below, presumably so it is in effect when TensorFlow initialises.
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")
# NOTE: this filter is process-wide and hides every UserWarning, not only the
# ones raised by Keras/TensorFlow.
warnings.filterwarnings("ignore", category=UserWarning)

# -----------------------------
# 1) Keras safe loader
# -----------------------------
try:
    # TF/Keras >= 3
    # Both imports must succeed for this branch to be used; if either one
    # raises, control falls through to the 2.x import below.
    # NOTE(review): `keras.src` looks like Keras' internal package and
    # `keras_saving_api` is not referenced by any code visible in this file —
    # confirm whether that first import is still needed.
    from keras.src.saving import saving_api as keras_saving_api
    from keras.src.models import load_model as keras_load_model  # type: ignore
except Exception:
    try:
        # TF/Keras 2.x
        from keras.models import load_model as keras_load_model  # type: ignore
    except Exception as e:
        # No usable loader: keep a None sentinel so that
        # load_keras_model_safely() can raise a clear RuntimeError when it is
        # called, instead of the module failing at import time.
        keras_load_model = None

def load_keras_model_safely(path: str):
    """
    Load a saved Keras model for inference only.

    The model is loaded with ``compile=False`` so that training-time
    losses/metrics (for example 'mse') are never deserialized.

    Args:
        path: Location of the saved model (.h5 or .keras).

    Returns:
        Whatever the imported Keras ``load_model`` returns for ``path``.

    Raises:
        RuntimeError: If no Keras ``load_model`` could be imported.
    """
    loader = keras_load_model
    if loader is not None:
        # Skipping compilation means losses/metrics are not looked up by name.
        return loader(path, compile=False)
    raise RuntimeError("Could not import keras load_model.")

# -----------------------------
# 2) Helpers & transformers
# -----------------------------

# Suffixes that identify an expanded datetime feature column.
_dt_part = r"(year|month|day|dow|hour)"
# Matches "<base>__<part>": group 1 is the base name, group 2 the datetime part.
_double_dt_re = re.compile(rf"^(.*)__{_dt_part}$")

def sanitize_datetime_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize accidental double-underscore datetime feature names.

    'col__year' becomes 'col_year'. When the single-underscore column already
    exists, the double-underscore duplicate is dropped instead of renamed.
    Column labels that are not strings (for example the integer labels of a
    frame built from an array) are left untouched rather than raising.

    Args:
        df: Input frame, or anything ``pd.DataFrame`` can wrap.

    Returns:
        A DataFrame with normalized column names. The input frame itself is
        returned when nothing needs to change; it is never modified in place.
    """
    if not isinstance(df, pd.DataFrame):
        df = pd.DataFrame(df)

    existing = set(df.columns)
    rename_map, drop_cols = {}, []
    for col in df.columns:
        # The regex only applies to text labels; re.match would raise
        # TypeError on ints or other non-string labels.
        if not isinstance(col, str):
            continue
        m = _double_dt_re.match(col)
        if m is None:
            continue
        fixed = f"{m.group(1)}_{m.group(2)}"
        if fixed in existing:
            drop_cols.append(col)
        else:
            rename_map[col] = fixed

    if rename_map:
        df = df.rename(columns=rename_map)
    if drop_cols:
        df = df.drop(columns=drop_cols)
    return df

# Minimal transformer class used by the fitted ColumnTransformer
# (must be present to unpickle the preprocess pipeline).
from sklearn.base import BaseEstimator, TransformerMixin

class DateTimeExpand(BaseEstimator, TransformerMixin):
    """
    Expand datetime columns into integer calendar features.

    For every column seen during ``fit`` the transformer emits
    ``<col>_year``, ``<col>_month``, ``<col>_day``, ``<col>_dow`` and
    ``<col>_hour`` (single underscore), restricted to the parts listed in
    ``features``. The output order is always year, month, day, dow, hour,
    regardless of the order of ``features``.

    Attributes:
        features: Names of the parts to emit.
        cols_: Column labels recorded by ``fit``.
    """

    # (feature key / column suffix, pandas ``.dt`` attribute) in output order.
    _PARTS = (
        ("year", "year"),
        ("month", "month"),
        ("day", "day"),
        ("dow", "dayofweek"),
        ("hour", "hour"),
    )

    def __init__(self, features: Optional[List[str]] = None):
        if features is None:
            features = ["year", "month", "day", "dow", "hour"]
        self.features = features

    def _active_parts(self):
        """Return the (suffix, accessor) pairs enabled by ``self.features``."""
        return [(suffix, attr) for suffix, attr in self._PARTS if suffix in self.features]

    def fit(self, X, y=None):
        """Record the column labels of ``X``; ``y`` is ignored."""
        self.cols_ = list(pd.DataFrame(X).columns)
        return self

    def transform(self, X):
        """
        Return a 2-D array with one column per (fitted column, enabled part).

        Values that cannot be parsed as datetimes are coerced to NaT and end
        up as -1 in every derived feature.
        """
        frame = pd.DataFrame(X)
        active = self._active_parts()
        expanded = {}
        for col in self.cols_:
            stamps = pd.to_datetime(frame[col], errors="coerce", utc=False)
            for suffix, attr in active:
                expanded[f"{col}_{suffix}"] = getattr(stamps.dt, attr).astype("Int64")
        # NOTE(review): the columns are nullable Int64 at this point; depending
        # on the pandas version `.values` may come back as an object-dtype
        # array — confirm the downstream steps accept that.
        return pd.DataFrame(expanded).fillna(-1).values

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, in the order ``transform`` emits them."""
        active = self._active_parts()
        names = [
            f"{col}_{suffix}"
            for col in getattr(self, "cols_", [])
            for suffix, _ in active
        ]
        return np.array(names, dtype=object)

# -----------------------------
# 3) I/O & prediction routines
# -----------------------------

def _ensure_dense_small(X, max_dense_features: int = 200_000):
    """
    If X is sparse and tiny, convert to dense to avoid Keras sparse issues.
    """
    try:
        import scipy.sparse as sp
        if sp.issparse(X):
            nfeat = X.shape[1]
            if nfeat <= max_dense_features:
                return X.toarray()
        return X
    except Exception:
        return X

def _align_to_fit_columns(df: pd.DataFrame, preprocess) -> pd.DataFrame:
    """
    Ensure df has exactly the raw columns (feature_names_in_) the fitted
    ColumnTransformer expects: re-order, add missing as NaN, drop extras.
    """
    if not hasattr(preprocess, "feature_names_in_"):
        # Older scikit-learn may not set this; in that case assume current df columns are fine.
        return df

    expected = list(preprocess.feature_names_in_)
    cur = set(df.columns)
    # Add missing columns as NaN
    for c in expected:
        if c not in cur:
            df[c] = np.nan
    # Drop unexpected columns
    df = df[expected]
    return df

def _topk_from_probs(probs: np.ndarray, topk: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Return (topk_indices, topk_probs) for each row.
    """
    if probs.ndim == 1:
        probs = probs[:, None]
    k = min(topk, probs.shape[1])
    top_idx = np.argpartition(-probs, kth=k-1, axis=1)[:, :k]
    # sort those top-k
    row_indices = np.arange(probs.shape[0])[:, None]
    sorted_order = np.argsort(-probs[row_indices, top_idx], axis=1)
    top_sorted_idx = top_idx[row_indices, sorted_order]
    top_sorted_probs = probs[row_indices, top_sorted_idx]
    return top_sorted_idx, top_sorted_probs

def load_bundle(pkl_path: str) -> Dict[str, Any]:
    """
    Load the joblib-serialized preprocessing bundle.

    Args:
        pkl_path: Path to the bundle file.

    Returns:
        The loaded bundle, which is guaranteed to contain a 'preprocess' entry.

    Raises:
        FileNotFoundError: If ``pkl_path`` does not exist.
        KeyError: If the loaded bundle has no 'preprocess' entry.
    """
    if os.path.exists(pkl_path):
        # DateTimeExpand is defined earlier in this module so that the fitted
        # pipeline can be unpickled.
        # NOTE(review): joblib.load is pickle-based — only point this at
        # bundle files from a trusted source.
        loaded = joblib.load(pkl_path)
        if "preprocess" in loaded:
            return loaded
        raise KeyError("Bundle missing 'preprocess'")
    raise FileNotFoundError(pkl_path)

def _as_class_probabilities(raw: np.ndarray) -> np.ndarray:
    """Convert raw model output into a per-class probability matrix when possible."""
    if raw.ndim == 2 and raw.shape[1] > 1:
        # A value outside [0, 1] cannot be a probability, so the rows are
        # treated as logits and passed through a numerically stable softmax.
        # Checking only for values > 1 would miss rows of negative logits.
        if np.any(raw > 1.0) or np.any(raw < 0.0):
            shifted = np.exp(raw - raw.max(axis=1, keepdims=True))
            return shifted / shifted.sum(axis=1, keepdims=True)
        return raw
    if raw.ndim == 2:
        # Single output column: read it as P(class 1) and derive P(class 0).
        return np.hstack([1 - raw, raw])
    # Any other rank is passed through unchanged (binary/regression fallback).
    return raw


def _labels_for_indices(indices: np.ndarray, label_encoder, class_names) -> np.ndarray:
    """Map a 2-D array of class indices to labels (same shape, object dtype)."""
    labels = np.empty(indices.shape, dtype=object)
    if label_encoder is not None and hasattr(label_encoder, "inverse_transform"):
        # One batched call instead of one inverse_transform call per cell.
        decoded = label_encoder.inverse_transform(indices.ravel())
        # np.ndindex walks the shape in the same C order that ravel() uses.
        for pos, value in zip(np.ndindex(indices.shape), decoded):
            labels[pos] = value
        return labels
    for pos, raw_idx in np.ndenumerate(indices):
        i = int(raw_idx)
        if class_names is not None and 0 <= i < len(class_names):
            labels[pos] = class_names[i]
        else:
            labels[pos] = i  # fallback to class index
    return labels


def predict_file(
    input_csv: str,
    pkl_path: str,
    model_path: str,
    out_csv: str,
    topk: int = 5,
    id_cols: Optional[List[str]] = None,
    target_col: Optional[str] = None,
    print_out: bool = True,
):
    """
    Score a CSV with the fitted preprocessor and Keras model, writing top-k predictions.

    The output CSV contains the requested id columns followed by
    ``pred_top<k>_label`` / ``pred_top<k>_prob`` pairs for k = 1..topk.

    Args:
        input_csv: CSV file to score.
        pkl_path: Path to the preprocessing bundle (see ``load_bundle``).
        model_path: Path to the saved Keras model.
        out_csv: Destination CSV for the predictions.
        topk: Number of classes to report per row (capped at the number of
            model outputs).
        id_cols: Input columns to copy into the output. They are captured
            before the frame is aligned to the preprocessor's feature list,
            so id columns that are not model features are kept too. Names
            that are not present in the input are skipped with a warning.
        target_col: Column to drop before scoring when present; defaults to
            the bundle's 'target_col' entry.
        print_out: When true, print the first rows (at most 10) of the result.

    Raises:
        ValueError: If ``topk`` is smaller than 1.
        FileNotFoundError, KeyError: Propagated from ``load_bundle``.
        RuntimeError: Propagated from ``load_keras_model_safely``.
    """
    if topk < 1:
        raise ValueError(f"topk must be >= 1, got {topk}")

    # Load bundle and model
    print("[INFO] Loading preprocess bundle...")
    bundle = load_bundle(pkl_path)
    preprocess = bundle["preprocess"]
    label_encoder = bundle.get("label_encoder", None)
    class_names = bundle.get("class_names", None)
    if target_col is None:
        target_col = bundle.get("target_col", None)

    print("[INFO] Loading model...")
    model = load_keras_model_safely(model_path)

    # Read input
    print(f"[INFO] Reading input: {input_csv}")
    df = pd.read_csv(input_csv)

    # Drop target if present
    if target_col and target_col in df.columns:
        df = df.drop(columns=[target_col])

    # Normalize accidental datetime double-underscore artifacts
    df = sanitize_datetime_columns(df)

    # Capture id columns now: the alignment step below keeps only the
    # preprocessor's feature columns, which would otherwise discard them.
    id_values = {}
    for c in id_cols or []:
        if c in df.columns:
            id_values[c] = df[c].to_numpy()
        else:
            print(f"[WARN] id column not found in input, skipping: {c}")

    # Align raw columns to what the pipeline expects
    df = _align_to_fit_columns(df, preprocess)

    # Transform
    print("[INFO] Transforming inputs...")
    X_proc = _ensure_dense_small(preprocess.transform(df))

    # Predict probabilities or logits
    print("[INFO] Predicting...")
    raw = model.predict(X_proc, verbose=0)
    probs = _as_class_probabilities(raw)

    # Map top-k to labels
    top_idx, top_p = _topk_from_probs(probs, topk=topk)
    labels = _labels_for_indices(top_idx, label_encoder, class_names)

    # Build the output column by column: ids first, then label/prob pairs.
    columns = dict(id_values)
    for k in range(top_idx.shape[1]):
        columns[f"pred_top{k+1}_label"] = labels[:, k]
        columns[f"pred_top{k+1}_prob"] = top_p[:, k].astype(float)

    out_df = pd.DataFrame(columns)
    out_df.to_csv(out_csv, index=False)
    print(f"[INFO] Wrote predictions to: {out_csv}")

    if print_out:
        print(out_df.head(min(10, len(out_df))).to_string(index=False))

# -----------------------------
# 4) CLI
# -----------------------------
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """
    Parse the command-line options for the prediction script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]`` as before. Passing an explicit list lets
            the parser be used where ``sys.argv`` belongs to the host
            process, e.g. inside a Jupyter kernel, where parsing
            ``sys.argv`` exits with "the following arguments are required".

    Returns:
        Namespace with ``pkl``, ``model``, ``input``, ``out``, ``topk``,
        ``ids`` and ``target``.
    """
    ap = argparse.ArgumentParser(description="Predict with fitted preprocess + Keras model.")
    ap.add_argument("--pkl",   required=True, help="Path to preprocess_bundle.pkl")
    ap.add_argument("--model", required=True, help="Path to Keras model (.h5 or .keras)")
    ap.add_argument("--input", required=True, help="CSV to score")
    ap.add_argument("--out",   required=True, help="Output CSV with predictions")
    ap.add_argument("--topk",  type=int, default=5, help="Top-K classes to output")
    ap.add_argument("--ids",   default="", help="Comma-separated id columns to copy into output (optional)")
    ap.add_argument("--target", default="", help="Target column name if present in input (will be dropped)")
    return ap.parse_args(argv)

if __name__ == "__main__":
    # Script entry point: parse the CLI flags and run one scoring pass.
    args = parse_args()

    # "--ids a, b" -> ["a", "b"]; an empty flag means no id columns are copied.
    id_cols = None
    if args.ids:
        id_cols = [name for name in map(str.strip, args.ids.split(",")) if name]

    # A blank or whitespace-only --target counts as "not given".
    target_col = args.target.strip() or None

    predict_file(
        pkl_path=args.pkl,
        model_path=args.model,
        input_csv=args.input,
        out_csv=args.out,
        topk=args.topk,
        id_cols=id_cols,
        target_col=target_col,
        print_out=True,
    )


usage: ipykernel_launcher.py [-h] --pkl PKL --model MODEL --input INPUT --out OUT [--topk TOPK] [--ids IDS] [--target TARGET]
ipykernel_launcher.py: error: the following arguments are required: --pkl, --model, --input, --out


SystemExit: 2