In [1]:
import os
import sys
import logging
import json
import numpy as np
import pandas as pd

import joblib
from tensorflow.keras.models import load_model

# -----------------------------------------------------------------------------
# User-config for Notebook use
# -----------------------------------------------------------------------------
# These three values are only consulted by main() when it detects IPython.
# INPUT_PATH: CSV to score; main() raises ValueError if it is left empty while
# run_main_in_ipynb is True.
INPUT_PATH  = r""  # e.g., r"C:\Users\sagni\Downloads\Agri Vision\new_batch.csv"
# OUTPUT_PATH: destination CSV; an empty value makes main() fall back to DEFAULT_OUT.
OUTPUT_PATH = r""  # e.g., r"C:\Users\sagni\Downloads\Agri Vision\predictions.csv"
# When False, main() only prints a hint in notebooks and does not predict.
run_main_in_ipynb = False  # set True in notebooks to run immediately

# -----------------------------------------------------------------------------
# Fixed artifact directory (from your training scripts)
# -----------------------------------------------------------------------------
# All four artifact files below must exist; _load_artifacts() raises
# FileNotFoundError otherwise.
ARTIFACT_DIR = r"C:\Users\sagni\Downloads\Agri Vision"
# Loaded with joblib; its .predict() is called on the aligned raw features.
PKL_PATH     = os.path.join(ARTIFACT_DIR, "yield_sklearn_pipeline.pkl")
# Loaded with joblib; its .transform() output is fed to the Keras model.
PREPROC_PKL  = os.path.join(ARTIFACT_DIR, "preprocessor_only.pkl")
# Loaded with tensorflow.keras load_model.
H5_PATH      = os.path.join(ARTIFACT_DIR, "yield_mlp.h5")
# Its normalized column names, minus "yield", define the feature columns.
TRAIN_FRAME  = os.path.join(ARTIFACT_DIR, "training_frame.csv")  # schema alignment
# Default output location for predict_file() and for the CLI --out flag.
DEFAULT_OUT  = os.path.join(ARTIFACT_DIR, "predictions.csv")

# Keep some ID columns for readability if present
# (matched against the normalized, lower-cased input column names).
ID_KEYS = ["year","state","district","crop","season","region","block"]

# Configures the root logger as a side effect of importing/running this module.
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")


def _normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    df2 = df.copy()
    df2.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
    return df2


def _load_artifacts():
    """Load the trained models and the training feature schema from disk.

    Returns:
        tuple: ``(pipe, pre, mlp, feature_cols)`` where ``pipe`` is the object
        unpickled from ``PKL_PATH``, ``pre`` the object unpickled from
        ``PREPROC_PKL``, ``mlp`` the Keras model read from ``H5_PATH``, and
        ``feature_cols`` the normalized column names of ``TRAIN_FRAME`` with
        the ``"yield"`` column removed (original order preserved).

    Raises:
        FileNotFoundError: If any of the four artifact files does not exist.
    """
    for p in (PKL_PATH, PREPROC_PKL, H5_PATH, TRAIN_FRAME):
        if not os.path.exists(p):
            raise FileNotFoundError(f"Required artifact not found: {p}")

    logging.info("Loading artifacts...")
    # NOTE: joblib.load unpickles arbitrary Python objects; only point these
    # paths at artifact files you trust.
    pipe = joblib.load(PKL_PATH)       # sklearn pipeline (pre + RF)
    pre = joblib.load(PREPROC_PKL)     # ColumnTransformer only
    mlp = load_model(H5_PATH)          # Keras model

    # Only the column names are needed for schema alignment, so read just the
    # header row instead of parsing the entire training frame.
    header_df = _normalize_cols(pd.read_csv(TRAIN_FRAME, nrows=0))
    feature_cols = [c for c in header_df.columns if c != "yield"]
    return pipe, pre, mlp, feature_cols


def _prepare_input_df(new_df: pd.DataFrame, feature_cols):
    """Align a raw input frame with the training feature schema.

    Column names are normalized, training columns absent from the input are
    added as NaN, extra columns are dropped, and the result is ordered exactly
    like ``feature_cols``. String (object) columns are stripped of surrounding
    whitespace.

    Args:
        new_df: Raw input data; it is not modified.
        feature_cols: Normalized feature column names in training order.

    Returns:
        A new DataFrame containing exactly ``feature_cols``.
    """
    df = _normalize_cols(new_df)

    # Add any missing training columns as NaN
    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        logging.warning(f"Adding {len(missing)} missing columns with NaN: {missing[:10]}{' ...' if len(missing)>10 else ''}")
        # Attach all filler columns in one concat rather than inserting them
        # one at a time, which fragments the frame when many are missing.
        filler = pd.DataFrame(np.nan, index=df.index, columns=missing)
        df = pd.concat([df, filler], axis=1)

    # Keep only the training features, preserve order. The explicit copy
    # makes the column assignments below operate on an independent frame
    # (no SettingWithCopyWarning).
    df = df[feature_cols].copy()

    # Clean strings, but leave missing values missing: a plain astype(str)
    # would turn NaN/None into the literal strings "nan"/"None".
    # NOTE(review): if the training script stringified missing values before
    # fitting, confirm the fitted encoders do not rely on a "nan" category.
    for c in df.columns:
        if df[c].dtype == object:
            col = df[c]
            df[c] = col.where(col.isna(), col.astype(str).str.strip())

    return df


def predict_file(input_csv: str, output_csv: str = DEFAULT_OUT, include_ids=True):
    """Score a CSV with both saved models and write the predictions to disk.

    The output CSV has the columns ``pred_rf`` (sklearn pipeline),
    ``pred_mlp`` (Keras model on the preprocessed features) and ``pred_avg``
    (their arithmetic mean), optionally preceded by whichever ``ID_KEYS``
    columns are present in the input.

    Args:
        input_csv: Path of the CSV file to score.
        output_csv: Destination path; parent directories are created.
        include_ids: When true, copy present ``ID_KEYS`` columns to the output.

    Returns:
        The ``output_csv`` path that was written.

    Raises:
        FileNotFoundError: If ``input_csv`` or a required artifact is missing.
        ValueError: If the input CSV contains no data rows.
    """
    # Fail fast on a bad input path before paying for the model loads.
    if not os.path.exists(input_csv):
        raise FileNotFoundError(f"Input CSV not found: {input_csv}")

    # Load models and schema
    pipe, pre, mlp, feature_cols = _load_artifacts()

    logging.info(f"Reading new data: {input_csv}")
    new_df = pd.read_csv(input_csv, engine="python")
    new_df = _normalize_cols(new_df)

    # An empty frame would only fail later inside the estimators with a less
    # helpful message, so reject it here.
    if new_df.empty:
        raise ValueError(f"Input CSV contains no rows to predict: {input_csv}")

    id_cols_present = [k for k in ID_KEYS if k in new_df.columns] if include_ids else []

    X = _prepare_input_df(new_df, feature_cols)

    # 1) RandomForest predictions
    logging.info("Predicting with RandomForest pipeline (.pkl)...")
    rf_pred = pipe.predict(X)

    # 2) Keras predictions
    logging.info("Predicting with Keras MLP (.h5)...")
    Xt = pre.transform(X)
    if hasattr(Xt, "toarray"):
        # Sparse transformer output must be densified before Keras sees it.
        Xt = Xt.toarray()
    mlp_pred = mlp.predict(Xt, verbose=0).ravel()

    # 3) Simple average (ensemble)
    avg_pred = (rf_pred + mlp_pred) / 2.0

    # Build output
    pred_df = pd.DataFrame({
        "pred_rf": rf_pred,
        "pred_mlp": mlp_pred,
        "pred_avg": avg_pred
    })

    # id_cols_present is already empty when include_ids is false.
    if id_cols_present:
        result = pd.concat([new_df[id_cols_present].reset_index(drop=True),
                            pred_df.reset_index(drop=True)], axis=1)
    else:
        result = pred_df

    # dirname is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(output_csv) or ".", exist_ok=True)
    result.to_csv(output_csv, index=False, encoding="utf-8")
    logging.info(f"Saved predictions → {output_csv}")
    logging.info("Preview (first 5 rows):\n" + result.head(5).to_string(index=False))
    return output_csv


def _running_in_ipython() -> bool:
    try:
        from IPython import get_ipython  # noqa
        return get_ipython() is not None
    except Exception:
        return False


def _maybe_parse_cli_args(argv):
    """
    Minimal command-line parser that tolerates arguments injected by ipykernel.

    Recognized: --input <path> [--out <path>] [--no_ids]

    ``argv[0]`` (the script name) is skipped. Unknown tokens, and a trailing
    ``--input``/``--out`` with no value after it, are silently ignored.
    Returns a dict with the keys ``"input"``, ``"out"`` and ``"include_ids"``.
    """
    options = {"input": None, "out": DEFAULT_OUT, "include_ids": True}
    tokens = list(argv[1:])
    count = len(tokens)

    cursor = 0
    while cursor < count:
        flag = tokens[cursor]
        value_follows = cursor + 1 < count

        # Value-taking flags consume the following token as well.
        if value_follows and flag == "--input":
            options["input"] = tokens[cursor + 1]
            cursor += 2
            continue
        if value_follows and flag == "--out":
            options["out"] = tokens[cursor + 1]
            cursor += 2
            continue

        if flag == "--no_ids":
            options["include_ids"] = False
        # Anything else (e.g. arguments jupyter passes) is skipped.
        cursor += 1

    return options


def main():
    """Entry point: run predictions from the terminal or from a notebook.

    Terminal: parse ``sys.argv``; without ``--input`` print usage and exit
    with status 2, otherwise call ``predict_file`` with the parsed options.

    IPython: act only when ``run_main_in_ipynb`` is true, using the
    ``INPUT_PATH`` / ``OUTPUT_PATH`` constants; otherwise print a hint.

    Raises:
        ValueError: In notebook mode when ``INPUT_PATH`` is empty.
    """
    if not _running_in_ipython():
        # Terminal mode: accept simple flags without argparse
        cli = _maybe_parse_cli_args(sys.argv)
        if not cli["input"]:
            print(f"Usage (terminal):\n  python {os.path.basename(__file__)} --input <CSV> [--out <CSV>] [--no_ids]")
            print(f"Default artifacts dir: {ARTIFACT_DIR}")
            sys.exit(2)

        predict_file(cli["input"], cli["out"], include_ids=cli["include_ids"])
        return

    # Notebook mode from here on: driven by the constants at the top.
    if not run_main_in_ipynb:
        print("Notebook detected. Set `INPUT_PATH`, optional `OUTPUT_PATH`, and `run_main_in_ipynb = True` to run.")
        return

    if not INPUT_PATH:
        raise ValueError("Please set INPUT_PATH at the top of this file before running in notebooks.")

    destination = OUTPUT_PATH if OUTPUT_PATH else DEFAULT_OUT
    predict_file(INPUT_PATH, destination, include_ids=True)


# Run main() when this file is executed directly. main() itself decides
# between terminal and notebook behavior via _running_in_ipython().
if __name__ == "__main__":
    main()


Notebook detected. Set `INPUT_PATH`, optional `OUTPUT_PATH`, and `run_main_in_ipynb = True` to run.
