In [1]:
import os
import json
import math
import warnings
from typing import Optional, Dict, Any, List

import numpy as np
import pandas as pd
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import tensorflow as tf
from tensorflow import keras

# Silence FutureWarning noise for the whole session.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# ----------------------------
# USER PATHS (EDIT THESE)
# ----------------------------
# Folder that holds the preprocess bundle, the model and the data.
BASE_DIR = r"C:\Users\sagni\Downloads\Agri Mind"

# Preprocess bundle (loaded with joblib).
PKL_PATH = os.path.join(BASE_DIR, "neuro_preprocess.pkl")

# Trained Keras model.
MODEL_PATH = os.path.join(BASE_DIR, "neuro_model.h5")

# CSV to score.
INPUT_CSV = os.path.join(BASE_DIR, "archive", "yield_df.csv")

# Where the predictions CSV is written.
OUT_CSV = os.path.join(BASE_DIR, "predictions.csv")

# Column names tried, in this order, when the bundle carries no 'target_col'.
POSSIBLE_TARGETS = [
    "hg/ha_yield",
    "yield",
    "Yield",
    "target",
    "y",
]


# ----------------------------
# Helpers
# ----------------------------
def _ensure_dir_for_file(path: str) -> None:
    """Make sure the folder that will contain ``path`` exists.

    A bare filename has no directory component, so nothing is created
    in that case. An already existing folder is left untouched.
    """
    parent = os.path.dirname(path)
    if not parent:
        return
    os.makedirs(parent, exist_ok=True)

def _to_numpy(X):
    """Return ``X`` as a dense numpy array.

    Objects that expose a ``toarray`` attribute (for example scipy sparse
    matrices) are densified through it; everything else is handed to
    ``np.asarray``.
    """
    if hasattr(X, "toarray"):
        return X.toarray()
    return np.asarray(X)

def _detect_target(df: pd.DataFrame) -> Optional[str]:
    """Return the first entry of POSSIBLE_TARGETS that is a column of ``df``.

    Candidates are tried in list order; ``None`` is returned when none of
    them is present.
    """
    return next(
        (name for name in POSSIBLE_TARGETS if name in df.columns),
        None,
    )

def _align_to_preprocess_columns(df: pd.DataFrame, preprocess) -> pd.DataFrame:
    """
    Reindex ``df`` to the columns the preprocessor saw at fit time.

    Many sklearn pipelines (esp. ColumnTransformer) expect the same columns
    seen at fit time. When ``preprocess`` exposes ``feature_names_in_`` the
    frame is reindexed to exactly those columns, in that order: extra columns
    are dropped and missing ones are added filled with NaN.

    Args:
        df: Incoming feature frame.
        preprocess: Fitted preprocessor; only its optional
            ``feature_names_in_`` attribute is read.

    Returns:
        The aligned frame, or ``df`` itself (unchanged, same object) when the
        preprocessor does not expose ``feature_names_in_``.

    Warns:
        UserWarning: if one or more fit-time columns are absent from ``df``.
            They are still created as all-NaN columns, but predictions built
            on them are likely unreliable, so the caller should know.
    """
    cols_fit = getattr(preprocess, "feature_names_in_", None)
    if cols_fit is None:
        # Nothing to align against; hand the frame back untouched.
        return df

    expected = list(cols_fit)

    # Surface absent inputs instead of silently fabricating NaN columns.
    missing = [c for c in expected if c not in df.columns]
    if missing:
        warnings.warn(
            f"Input is missing {len(missing)} column(s) seen at fit time; "
            f"filling with NaN: {', '.join(map(str, missing))}",
            UserWarning,
            stacklevel=2,
        )

    # reindex both drops extras and creates the missing columns as NaN.
    return df.reindex(columns=expected)

def _load_bundle_and_model(pkl_path: str, model_path: str) -> Dict[str, Any]:
    """Load the preprocess bundle and the Keras model from disk.

    Both paths are checked for existence (bundle first, then model) before
    anything is loaded.

    Returns:
        A dict with keys ``"preprocess"``, ``"target_col"`` (``None`` when
        the bundle has no such key) and ``"model"``.

    Raises:
        FileNotFoundError: if either path does not exist.
        ValueError: if the loaded bundle is not a dict containing the key
            ``"preprocess"``.
    """
    required_files = (
        (pkl_path, "Preprocess bundle not found"),
        (model_path, "Model file not found"),
    )
    for location, problem in required_files:
        if not os.path.exists(location):
            raise FileNotFoundError(f"{problem}: {location}")

    # NOTE(review): joblib.load unpickles the file, so only point this at
    # bundles from a trusted source.
    bundle = joblib.load(pkl_path)
    is_valid_bundle = isinstance(bundle, dict) and "preprocess" in bundle
    if not is_valid_bundle:
        raise ValueError("Invalid preprocess bundle: expected dict with key 'preprocess'.")

    return {
        "preprocess": bundle["preprocess"],
        "target_col": bundle.get("target_col"),  # may be None
        # IMPORTANT: load without compiling to avoid legacy metric ('mse') issues
        "model": keras.models.load_model(model_path, compile=False),
    }

def predict_file(
    input_csv: str,
    pkl_path: str,
    model_path: str,
    out_csv: str,
    print_out: bool = True,
    id_cols: Optional[List[str]] = None,
    target_col: Optional[str] = None,
) -> pd.DataFrame:
    """
    Load preprocess + model, score the CSV, and write predictions to out_csv.

    Args:
        input_csv: CSV file to score.
        pkl_path: Preprocess bundle, loaded via ``_load_bundle_and_model``.
        model_path: Keras model file, loaded via ``_load_bundle_and_model``.
        out_csv: Destination CSV; its parent directory is created if needed.
        print_out: When true, print a JSON summary (metrics + row count),
            the output path and the first 10 output rows.
        id_cols: Columns copied through to the output. When empty/None, a
            built-in list of common identifier names is used. Names that are
            not in the CSV are ignored.
        target_col: Name of the ground-truth column. Falls back to the
            bundle's ``target_col`` and then to auto-detection. The target is
            only used for the ``actual`` column and the metrics.

    Returns:
        The output DataFrame: pass-through id columns, ``actual`` (only when
        the target column is present) and ``prediction``.

    Raises:
        FileNotFoundError: if ``input_csv`` (or a file checked by the loader)
            does not exist.
        ValueError: if the model does not return exactly one value per row.

    Warns:
        UserWarning: if some rows had a non-finite actual or predicted value
            and were therefore left out of the metrics.
    """
    # 1) Load
    info = _load_bundle_and_model(pkl_path, model_path)
    preprocess = info["preprocess"]
    model = info["model"]
    bundle_target = info.get("target_col")

    # 2) Read input
    if not os.path.exists(input_csv):
        raise FileNotFoundError(f"Input CSV not found: {input_csv}")
    df = pd.read_csv(input_csv)

    # 3) Determine target (if present; used only for metrics/cleanup)
    tgt = target_col or bundle_target or _detect_target(df)
    has_target = bool(tgt) and tgt in df.columns

    # 4) Optionally keep ID cols to pass through in the output
    id_cols = id_cols or ["Area", "Item", "Year", "Country", "State", "id", "ID"]
    keep_cols = [c for c in id_cols if c in df.columns]

    # 5) If target is present, drop it before transform (but keep for metrics)
    y_true = None
    if has_target:
        y_true = df[tgt].astype(float).values
        X_df = df.drop(columns=[tgt])
    else:
        X_df = df

    # 6) Align columns to what the pipeline expects
    X_df = _align_to_preprocess_columns(X_df, preprocess)

    # 7) Transform & predict
    X_proc = preprocess.transform(X_df)
    y_pred = model.predict(_to_numpy(X_proc), verbose=0).ravel()
    if len(y_pred) != len(df):
        # A multi-output model would otherwise fail later with an opaque
        # pandas length-mismatch error.
        raise ValueError(
            f"Model returned {len(y_pred)} values for {len(df)} input rows; "
            "expected exactly one prediction per row."
        )

    # 8) Build output frame
    out = pd.DataFrame(index=df.index)
    for c in keep_cols:
        out[c] = df[c]
    if has_target:
        out["actual"] = df[tgt].values
    out["prediction"] = y_pred

    # 9) Save
    _ensure_dir_for_file(out_csv)
    out.to_csv(out_csv, index=False)

    # 10) Quick metrics if actuals available
    metrics = None
    if y_true is not None:
        # The sklearn metric functions reject NaN/inf inputs. Predictions are
        # already saved at this point, so score only the rows where both
        # values are usable instead of failing the whole run.
        y_hat = np.asarray(y_pred, dtype=float)
        usable = np.isfinite(y_true) & np.isfinite(y_hat)
        n_usable = int(usable.sum())
        if n_usable < len(y_true):
            warnings.warn(
                f"{len(y_true) - n_usable} row(s) with a non-finite actual or "
                "predicted value were excluded from the metrics.",
                UserWarning,
                stacklevel=2,
            )
        if n_usable:
            y_ref, y_est = y_true[usable], y_pred[usable]
            mae = float(mean_absolute_error(y_ref, y_est))
            mse = float(mean_squared_error(y_ref, y_est))
            rmse = float(math.sqrt(mse))   # avoid sklearn `squared=False` arg for compatibility
            r2 = float(r2_score(y_ref, y_est))
            metrics = {"MAE": mae, "MSE": mse, "RMSE": rmse, "R2": r2}

    if print_out:
        print(json.dumps({"metrics": metrics, "rows_scored": int(len(out))}, indent=2))
        print(f"[OK] Wrote predictions to: {out_csv}")
        print(out.head(10).to_string(index=False))

    return out


# ----------------------------
# Run directly (Option 2 style)
# ----------------------------
if __name__ == "__main__":
    # Paths are passed in signature order:
    # input_csv, pkl_path, model_path, out_csv.
    _ = predict_file(
        INPUT_CSV,
        PKL_PATH,
        MODEL_PATH,
        OUT_CSV,
        print_out=True,
        id_cols=None,       # or like ["Area","Item","Year"]
        target_col=None,    # set if you want to override the bundle/auto-detected target
    )


{
  "metrics": {
    "MAE": 7277.987452027177,
    "MSE": 141142785.69759548,
    "RMSE": 11880.35292815813,
    "R2": 0.9804440155211175
  },
  "rows_scored": 28242
}
[OK] Wrote predictions to: C:\Users\sagni\Downloads\Agri Mind\predictions.csv
   Area        Item  Year  actual   prediction
Albania       Maize  1990   36613 20763.816406
Albania    Potatoes  1990   66667 77022.804688
Albania Rice, paddy  1990   23333 29063.724609
Albania     Sorghum  1990   12500 13657.093750
Albania    Soybeans  1990    7000  9684.549805
Albania       Wheat  1990   30197 18972.654297
Albania       Maize  1991   29068 22150.837891
Albania    Potatoes  1991   77818 80981.031250
Albania Rice, paddy  1991   28538 31464.589844
Albania     Sorghum  1991    6667 16279.347656
