# In [1]:
import os
import argparse
import logging
import json
import numpy as np
import pandas as pd

import joblib
from tensorflow.keras.models import load_model

# ---------- CONFIG: artifact directory ----------
# Directory holding the trained artifacts. The original hard-coded location is
# kept as the default; set the AGRIVISION_ARTIFACT_DIR environment variable to
# run against artifacts stored elsewhere without editing this file.
ARTIFACT_DIR = os.environ.get(
    "AGRIVISION_ARTIFACT_DIR",
    r"C:\Users\sagni\Downloads\Agri Vision",
)

# Files produced by the training script
PKL_PATH       = os.path.join(ARTIFACT_DIR, "yield_sklearn_pipeline.pkl")
PREPROC_PKL    = os.path.join(ARTIFACT_DIR, "preprocessor_only.pkl")
H5_PATH        = os.path.join(ARTIFACT_DIR, "yield_mlp.h5")
TRAIN_FRAME    = os.path.join(ARTIFACT_DIR, "training_frame.csv")  # used to align feature columns
METRICS_JSON   = os.path.join(ARTIFACT_DIR, "metrics.json")        # optional (info only)

# Default output
DEFAULT_OUT    = os.path.join(ARTIFACT_DIR, "predictions.csv")

# Keys (if present) will be carried through to the output for easier tracking
ID_KEYS = ["year","state","district","crop","season","region","block"]

# Root logger configuration: timestamp | level | message, INFO and above.
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")


def _normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    df2 = df.copy()
    df2.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
    return df2


def _load_artifacts():
    """Load the trained models and derive the training feature schema.

    Returns:
        A 4-tuple ``(pipe, pre, mlp, feature_cols)``:
        the object unpickled from ``PKL_PATH``, the object unpickled from
        ``PREPROC_PKL``, the Keras model loaded from ``H5_PATH``, and the
        list of normalized column names of ``TRAIN_FRAME`` excluding
        ``"yield"``.

    Raises:
        FileNotFoundError: if any of the four required files is missing.
    """
    # Verify everything up front so nothing is deserialized when a file is missing.
    required_files = (
        (PKL_PATH, "Missing model"),
        (PREPROC_PKL, "Missing preprocessor"),
        (H5_PATH, "Missing Keras model"),
        (TRAIN_FRAME, "Missing training frame for schema alignment"),
    )
    for path, problem in required_files:
        if not os.path.exists(path):
            raise FileNotFoundError(f"{problem}: {path}")

    logging.info("Loading artifacts...")
    # NOTE(security): joblib.load unpickles its input and can execute arbitrary
    # code; only point ARTIFACT_DIR at files you produced yourself.
    pipe = joblib.load(PKL_PATH)             # sklearn pipeline (pre + RF)
    pre  = joblib.load(PREPROC_PKL)          # ColumnTransformer only
    mlp  = load_model(H5_PATH)               # Keras

    # Only the header row is needed to recover the schema, so skip the data rows
    # instead of parsing the whole training frame.
    schema_df = _normalize_cols(pd.read_csv(TRAIN_FRAME, nrows=0))

    # Training features are training_frame minus 'yield' (if present)
    feature_cols = [c for c in schema_df.columns if c != "yield"]
    return pipe, pre, mlp, feature_cols


def _prepare_input_df(new_df: pd.DataFrame, feature_cols):
    """
    Align new_df to the training feature set:
    - normalize column names
    - add missing columns with NaN
    - drop extra columns not used in training
    - strip surrounding whitespace from values in object columns

    Args:
        new_df: raw frame of rows to predict on.
        feature_cols: ordered list of normalized training feature names.

    Returns:
        A new DataFrame with exactly ``feature_cols`` as columns, in that
        order. ``new_df`` itself is not modified.
    """
    df = _normalize_cols(new_df)

    # Report columns the model expects but the input does not provide
    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        logging.warning(f"Adding {len(missing)} missing columns with NaN: {missing[:10]}{' ...' if len(missing)>10 else ''}")

    # reindex adds the absent columns as NaN, drops extras and enforces the
    # training order in one step. It also returns a fresh frame, so the
    # assignments below neither trigger SettingWithCopyWarning nor fragment
    # the frame with one-at-a-time column inserts.
    df = df.reindex(columns=feature_cols)

    # Strip whitespace for object columns (prevents unseen categories due to stray spaces).
    # Missing values are left as NaN: a blanket astype(str) would turn them into
    # the literal string "nan", which downstream steps would see as a real category.
    for c in df.columns:
        if df[c].dtype == object:
            col = df[c]
            df[c] = col.where(col.isna(), col.astype(str).str.strip())
    return df


def predict_file(input_csv: str, output_csv: str = DEFAULT_OUT, include_ids=True):
    """Predict on a CSV of new rows and write the predictions to a CSV file.

    Three predictions are produced per row: ``pred_rf`` (from the pickled
    pipeline), ``pred_mlp`` (from the Keras model fed by the pickled
    preprocessor) and ``pred_avg`` (their arithmetic mean).

    Args:
        input_csv: path to the CSV containing the rows to predict.
        output_csv: destination CSV path; parent directories are created
            when the path contains any.
        include_ids: when true, columns listed in ``ID_KEYS`` that exist in
            the input are copied in front of the prediction columns.

    Returns:
        ``output_csv``, the path that was written.

    Raises:
        FileNotFoundError: if ``input_csv`` or a required artifact is missing.
    """
    # Fail fast on a bad input path before paying for model loading.
    if not os.path.exists(input_csv):
        raise FileNotFoundError(f"Input CSV not found: {input_csv}")

    # Load artifacts and schema
    pipe, pre, mlp, feature_cols = _load_artifacts()

    # Read new batch
    logging.info(f"Reading new data: {input_csv}")
    new_df = pd.read_csv(input_csv, engine="python")
    new_df = _normalize_cols(new_df)

    # Keep ID cols for reference if present
    id_cols_present = [k for k in ID_KEYS if k in new_df.columns] if include_ids else []

    # Align to training features
    X = _prepare_input_df(new_df, feature_cols)

    # 1) RandomForest predictions via sklearn pipeline
    logging.info("Predicting with RandomForest pipeline (.pkl)...")
    rf_pred = pipe.predict(X)

    # 2) Keras predictions via saved preprocessor
    logging.info("Predicting with Keras MLP (.h5)...")
    Xt = pre.transform(X)
    if hasattr(Xt, "toarray"):
        # The transform may return a sparse matrix; densify it before prediction.
        Xt = Xt.toarray()
    mlp_pred = mlp.predict(Xt, verbose=0).ravel()

    # 3) Simple average (optional ensemble)
    avg_pred = (rf_pred + mlp_pred) / 2.0

    # Build output frame
    out_df = pd.DataFrame({
        "pred_rf": rf_pred,
        "pred_mlp": mlp_pred,
        "pred_avg": avg_pred
    })

    # Concatenate IDs (if any) with predictions; indexes are reset so the
    # column-wise concat pairs rows by position.
    if id_cols_present:
        result = pd.concat([new_df[id_cols_present].reset_index(drop=True), out_df.reset_index(drop=True)], axis=1)
    else:
        result = out_df

    # Save. A bare filename has an empty dirname, and os.makedirs("") raises,
    # so only create the directory when there is one to create.
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    result.to_csv(output_csv, index=False, encoding="utf-8")
    logging.info(f"Saved predictions → {output_csv}")

    # Optional: print a quick preview
    logging.info("Preview (first 5 rows):")
    logging.info("\n" + result.head(5).to_string(index=False))

    return output_csv


def main(argv=None):
    """Command-line entry point: parse arguments and run ``predict_file``.

    Args:
        argv: optional list of argument strings, e.g.
            ``["--input", "new_rows.csv"]``. When ``None`` (the default)
            the arguments are taken from ``sys.argv``, as before. Passing a
            list lets the script be driven from a notebook or other code,
            where ``sys.argv`` belongs to the host process.
    """
    parser = argparse.ArgumentParser(description="AgriVision Prediction Script")
    parser.add_argument(
        "--input",
        required=True,
        help="Path to CSV with NEW rows to predict (no 'yield' column required)."
    )
    parser.add_argument(
        "--out",
        default=DEFAULT_OUT,
        # argparse %-formats help strings, so let it substitute the default
        # itself; interpolating the path directly would break on a '%' in it.
        help="Output predictions CSV path (default: %(default)s)"
    )
    parser.add_argument(
        "--no_ids",
        action="store_true",
        help="Do not include ID columns (year/state/district/crop/season/region/block) in the output even if present."
    )
    args = parser.parse_args(argv)

    predict_file(args.input, args.out, include_ids=not args.no_ids)


# Script entry point: run the command-line interface when executed directly.
# NOTE: --input is a required argument, so running this without command-line
# arguments makes argparse exit with status 2.
if __name__ == "__main__":
    main()


# --- Recorded notebook output (cell was run without command-line arguments) ---
# usage: ipykernel_launcher.py [-h] --input INPUT [--out OUT] [--no_ids]
# ipykernel_launcher.py: error: the following arguments are required: --input
#
#
# SystemExit: 2
#
#   warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
