In [3]:
# -*- coding: utf-8 -*-
"""
AgriMind training script (yield regression) with robust metrics (RMSE-safe)
and sklearn compatibility fixes.

Artifacts saved to BASE_DIR:
- neuro_preprocess.pkl
- neuro_model.h5
- neuro_metrics.json
- neuro_config.yaml
- neuro_feature_names.json
- neuro_history.csv
"""
import os, json, math, csv, sys, time, random, pathlib, warnings
from typing import Tuple, List, Dict, Any, Optional

import numpy as np
import pandas as pd

# ---- Paths (edit if needed) ----
BASE_DIR = r"C:\Users\sagni\Downloads\Agri Mind"
ARCHIVE  = os.path.join(BASE_DIR, "archive")

PTH_PEST = os.path.join(ARCHIVE, "pesticides.csv")
PTH_RAIN = os.path.join(ARCHIVE, "rainfall.csv")
PTH_TEMP = os.path.join(ARCHIVE, "temp.csv")
PTH_YLD  = os.path.join(ARCHIVE, "yield.csv")
PTH_PREMERGED = os.path.join(ARCHIVE, "yield_df.csv")  # if present, uses this

# ---- Artifact filenames ----
PKL_OUT   = os.path.join(BASE_DIR, "neuro_preprocess.pkl")
H5_OUT    = os.path.join(BASE_DIR, "neuro_model.h5")
METRICSJS = os.path.join(BASE_DIR, "neuro_metrics.json")
CONFYAML  = os.path.join(BASE_DIR, "neuro_config.yaml")
FEATJSON  = os.path.join(BASE_DIR, "neuro_feature_names.json")
HISTCSV   = os.path.join(BASE_DIR, "neuro_history.csv")

# ---- ML stack ----
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import joblib
import yaml

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks

warnings.filterwarnings("ignore", category=FutureWarning)
pd.options.display.width = 180

# -----------------------------
# Helpers
# -----------------------------
def rmse_safe(y_true, y_pred) -> float:
    """RMSE that works with old/new sklearn versions."""
    try:
        # sklearn >= 0.22
        return float(mean_squared_error(y_true, y_pred, squared=False))
    except TypeError:
        # very old sklearn
        return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def to_numpy(X):
    """Convert possibly-sparse matrix to a dense numpy array for Keras."""
    if hasattr(X, "toarray"):
        return X.toarray()
    return np.asarray(X)

def seed_everything(seed: int = 42):
    random.seed(seed); np.random.seed(seed); tf.random.set_seed(seed)

def print_info(msg: str):
    print(f"[INFO] {msg}")

def guess_key_cols(df: pd.DataFrame) -> List[str]:
    """Drop obvious row/id columns present in some Kaggle CSVs."""
    cands = []
    for c in df.columns:
        low = c.lower()
        if low in {"unnamed: 0", "index", "id"} or low.startswith("unnamed:"):
            cands.append(c)
    return cands

def load_data() -> Tuple[pd.DataFrame, str]:
    """Load and return dataframe + target column name."""
    print_info("Loading CSVs...")
    if os.path.exists(PTH_PREMERGED):
        df = pd.read_csv(PTH_PREMERGED)
        print_info(f"Using pre-merged dataset: {PTH_PREMERGED} (shape={df.shape})")
    else:
        # Minimal defensive merge (country / item / year)
        if not all(os.path.exists(p) for p in [PTH_PEST, PTH_RAIN, PTH_TEMP, PTH_YLD]):
            raise FileNotFoundError("Some required CSVs are missing in the archive folder.")
        pest = pd.read_csv(PTH_PEST)
        rain = pd.read_csv(PTH_RAIN)
        temp = pd.read_csv(PTH_TEMP)
        yld  = pd.read_csv(PTH_YLD)

        # Standardize keys (best-effort—adapt if your headers differ)
        for d in (pest, rain, temp, yld):
            # normalize column names
            d.columns = [c.strip() for c in d.columns]

        # pick likely keys
        key_cols = []
        for k in ["Area", "area", "Country", "country"]:
            if k in yld.columns: key_cols.append(k)
        for k in ["Item", "Crop", "item"]:
            if k in yld.columns and k not in key_cols: key_cols.append(k)
        for k in ["Year", "year"]:
            if k in yld.columns and k not in key_cols: key_cols.append(k)

        if not key_cols:
            raise ValueError("Could not infer merge keys; please ensure shared keys (Area/Item/Year).")

        df = yld.copy()
        for m, name in [(pest, "pest"), (rain, "rain"), (temp, "temp")]:
            join_keys = [k for k in key_cols if k in m.columns]
            df = df.merge(m, on=join_keys, how="left", suffixes=("", f"_{name}"))

        print_info(f"Merged dataset shape = {df.shape}")

    # Drop obvious index columns
    dropc = guess_key_cols(df)
    if dropc:
        print_info(f"Dropping likely row-id columns: {dropc}")
        df = df.drop(columns=dropc)

    # Detect target
    possible_targets = ["hg/ha_yield", "yield", "Yield", "target", "y"]
    target_col = None
    for c in possible_targets:
        if c in df.columns:
            target_col = c
            break
    if target_col is None:
        # default to last numeric column if nothing found (not ideal, but useful fallback)
        num_only = df.select_dtypes(include=[np.number]).columns.tolist()
        if not num_only:
            raise ValueError("No numeric target found and no numeric columns to choose from.")
        target_col = num_only[-1]
    print_info(f"Target column detected: {target_col}")
    # Remove NA target rows
    df = df[~df[target_col].isna()].copy()

    return df, target_col

def split_features(df: pd.DataFrame, target_col: str) -> Tuple[List[str], List[str]]:
    X = df.drop(columns=[target_col])
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = [c for c in X.columns if c not in num_cols]
    return num_cols, cat_cols

def build_preprocessor(df: pd.DataFrame, target_col: str) -> Tuple[Pipeline, List[str], List[str]]:
    X = df.drop(columns=[target_col])
    num_cols, cat_cols = split_features(df, target_col)

    # scikit-learn >=1.2 uses 'sparse_output'; older uses 'sparse'
    ohe_kwargs = dict(handle_unknown="ignore")
    try:
        OneHotEncoder(sparse_output=True, **ohe_kwargs)
        ohe = OneHotEncoder(sparse_output=True, **ohe_kwargs)
    except TypeError:
        # very old sklearn
        ohe = OneHotEncoder(sparse=True, **ohe_kwargs)

    num_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    cat_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", ohe)
    ])

    pre = ColumnTransformer(
        transformers=[
            ("num", num_pipe, num_cols),
            ("cat", cat_pipe, cat_cols),
        ],
        remainder="drop",
        sparse_threshold=0.3,
    )
    pipe = Pipeline(steps=[("preprocess", pre)])
    return pipe, num_cols, cat_cols

def get_feature_names(pre: ColumnTransformer, num_cols: List[str], cat_cols: List[str]) -> List[str]:
    names = []
    # numeric features pass-through after scaler
    names.extend(num_cols)
    # get OHE names
    try:
        ohe = pre.named_transformers_["cat"].named_steps["ohe"]
        ohe_names = ohe.get_feature_names_out(cat_cols).tolist()
    except Exception:
        # fallback if not available
        ohe_names = [f"{c}_ohe" for c in cat_cols]
    names.extend(ohe_names)
    return names

def build_model(input_dim: int) -> keras.Model:
    inp = keras.Input(shape=(input_dim,), name="features")
    x = layers.Dense(256, activation="relu")(inp)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.25)(x)
    x = layers.Dense(128, activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.25)(x)
    x = layers.Dense(64, activation="relu")(x)
    out = layers.Dense(1, activation="linear", name="yield")(x)

    model = keras.Model(inputs=inp, outputs=out, name="AgriMindYieldNet")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss="mse",
        metrics=[keras.metrics.MeanAbsoluteError(name="mae")]
    )
    return model

def main():
    seed_everything(42)
    os.makedirs(BASE_DIR, exist_ok=True)

    # 1) Load data
    df, target_col = load_data()

    # 2) Build preprocessor & split
    print_info("Fitting preprocessing pipeline...")
    pre_pipe, num_cols, cat_cols = build_preprocessor(df, target_col)

    X = df.drop(columns=[target_col])
    y = df[target_col].astype(float)

    Xtr_raw, Xte_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # fit only on train
    pre_pipe.fit(Xtr_raw)
    Xtr = pre_pipe.transform(Xtr_raw)
    Xte = pre_pipe.transform(Xte_raw)

    input_dim = (Xtr.shape[1] if not hasattr(Xtr, "toarray") else Xtr.shape[1])
    print_info(f"Building Keras model with input_dim={input_dim} ...")
    model = build_model(input_dim=input_dim)

    # 3) Train
    es = callbacks.EarlyStopping(monitor="val_loss", patience=8, restore_best_weights=True)
    rlrop = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=4, min_lr=1e-6)
    csvlog = callbacks.CSVLogger(HISTCSV, append=False)

    print_info("Training...")
    hist = model.fit(
        to_numpy(Xtr), y_train.values,
        validation_data=(to_numpy(Xte), y_test.values),
        epochs=25,
        batch_size=256,
        verbose=1,
        callbacks=[es, rlrop, csvlog],
    )

    # 4) Evaluate (with RMSE-safe)
    y_pred_tr = model.predict(to_numpy(Xtr), verbose=0).ravel()
    y_pred_te = model.predict(to_numpy(Xte), verbose=0).ravel()

    metrics = {
        "train": {
            "mae": float(mean_absolute_error(y_train, y_pred_tr)),
            "mse": float(mean_squared_error(y_train, y_pred_tr)),
            "rmse": rmse_safe(y_train, y_pred_tr),
            "r2":  float(r2_score(y_train, y_pred_tr)),
        },
        "test": {
            "mae": float(mean_absolute_error(y_test, y_pred_te)),
            "mse": float(mean_squared_error(y_test, y_pred_te)),
            "rmse": rmse_safe(y_test, y_pred_te),
            "r2":  float(r2_score(y_test, y_pred_te)),
        },
        "sklearn_version": sklearn.__version__,
        "tensorflow_version": tf.__version__,
        "rows": int(len(df)),
        "train_rows": int(len(Xtr_raw)),
        "test_rows": int(len(Xte_raw)),
        "target": target_col,
    }
    print_info("Metrics:\n" + json.dumps(metrics, indent=2))

    # 5) Persist artifacts
    # 5a) Feature names after fit
    feat_names = get_feature_names(pre_pipe.named_steps["preprocess"], num_cols, cat_cols)
    with open(FEATJSON, "w", encoding="utf-8") as f:
        json.dump({"feature_names": feat_names}, f, indent=2)

    # 5b) Save preprocess bundle
    bundle = {
        "preprocess": pre_pipe,
        "target_col": target_col,
        "numeric_cols": num_cols,
        "cat_cols": cat_cols,
        "feature_names": feat_names,
        "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
        "sklearn_version": sklearn.__version__,
    }
    joblib.dump(bundle, PKL_OUT)

    # 5c) Save model
    model.save(H5_OUT)

    # 5d) Save metrics
    with open(METRICSJS, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    # 5e) Save lightweight YAML config
    cfg = {
        "paths": {
            "base_dir": BASE_DIR,
            "archive": ARCHIVE,
            "preprocess_pkl": PKL_OUT,
            "model_h5": H5_OUT,
            "metrics_json": METRICSJS,
            "feature_names_json": FEATJSON,
            "history_csv": HISTCSV,
        },
        "training": {
            "random_seed": 42,
            "test_size": 0.2,
            "batch_size": 256,
            "epochs": 25,
            "optimizer": "adam",
            "loss": "mse",
            "callbacks": ["EarlyStopping(patience=8)", "ReduceLROnPlateau(patience=4)", "CSVLogger"],
        },
        "model": {
            "name": "AgriMindYieldNet",
            "layers": ["Dense(256,relu)", "BN", "Dropout(0.25)", "Dense(128,relu)", "BN", "Dropout(0.25)", "Dense(64,relu)", "Dense(1,linear)"],
            "input_dim": int(input_dim),
        },
        "target": target_col,
        "columns": {
            "numeric": num_cols,
            "categorical": cat_cols
        },
    }
    with open(CONFYAML, "w", encoding="utf-8") as f:
        yaml.safe_dump(cfg, f, sort_keys=False)

    print_info("Saved:")
    for p in [PKL_OUT, H5_OUT, METRICSJS, CONFYAML, FEATJSON, HISTCSV]:
        print("  ", p)
    print_info("DONE.")

if __name__ == "__main__":
    main()


[INFO] Loading CSVs...
[INFO] Using pre-merged dataset: C:\Users\sagni\Downloads\Agri Mind\archive\yield_df.csv (shape=(28242, 8))
[INFO] Dropping likely row-id columns: ['Unnamed: 0']
[INFO] Target column detected: hg/ha_yield
[INFO] Fitting preprocessing pipeline...
[INFO] Building Keras model with input_dim=115 ...
[INFO] Training...
Epoch 1/25
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - loss: 12881128448.0000 - mae: 76141.7031 - val_loss: 13188225024.0000 - val_mae: 77052.6875 - learning_rate: 0.0010
Epoch 2/25
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 12832051200.0000 - mae: 76007.4531 - val_loss: 13164433408.0000 - val_mae: 76995.9297 - learning_rate: 0.0010
Epoch 3/25
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 12685656064.0000 - mae: 75646.3281 - val_loss: 13134403584.0000 - val_mae: 76925.4062 - learning_rate: 0.0010
Epoch 4/25
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m



[INFO] Metrics:
{
  "train": {
    "mae": 7155.47090473934,
    "mse": 135786991.2866709,
    "rmse": 11652.767537656919,
    "r2": 0.9811623953156794
  },
  "test": {
    "mae": 7767.988619101578,
    "mse": 162563116.4044613,
    "rmse": 12750.024172701058,
    "r2": 0.977588851955616
  },
  "sklearn_version": "1.7.1",
  "tensorflow_version": "2.18.0",
  "rows": 28242,
  "train_rows": 22593,
  "test_rows": 5649,
  "target": "hg/ha_yield"
}
[INFO] Saved:
   C:\Users\sagni\Downloads\Agri Mind\neuro_preprocess.pkl
   C:\Users\sagni\Downloads\Agri Mind\neuro_model.h5
   C:\Users\sagni\Downloads\Agri Mind\neuro_metrics.json
   C:\Users\sagni\Downloads\Agri Mind\neuro_config.yaml
   C:\Users\sagni\Downloads\Agri Mind\neuro_feature_names.json
   C:\Users\sagni\Downloads\Agri Mind\neuro_history.csv
[INFO] DONE.
