In [1]:
# eco_forest_predict.py
import os
import json
import logging
import argparse
from typing import Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ---------------------------
# Default paths (match training)
# ---------------------------
DATA_CSV_DEFAULT = r"C:\Users\sagni\Downloads\Eco Detect\archive\goal15.forest_shares.csv"
OUT_DIR          = r"C:\Users\sagni\Downloads\Eco Detect"

PKL_PATH     = os.path.join(OUT_DIR, "eco_forest_rf.pkl")
HISTORY_CSV  = os.path.join(OUT_DIR, "history.csv")  # optional

# Plot outputs (written next to OUT_DIR)
PRED_CSV_DEFAULT = os.path.join(OUT_DIR, "predictions.csv")
PRED_HIST_PNG    = os.path.join(OUT_DIR, "pred_hist.png")
SCATTER_PNG      = os.path.join(OUT_DIR, "pred_vs_actual.png")
HEAT_PNG         = os.path.join(OUT_DIR, "residual_heatmap.png")

# Logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")


# ---------------------------
# Helpers (same heuristics as training/show scripts)
# ---------------------------
def pick_target_column(df: pd.DataFrame) -> Optional[str]:
    lower = {c.lower(): c for c in df.columns}
    for name in ["forest_share","forest_share_percent","forest_area_pct",
                 "forest_area_percent","share","value","trend"]:
        if name in lower:
            return lower[name]
    # else: if there’s a single numeric column, you probably don’t want it auto-picked here.
    return None

def best_country_col(df: pd.DataFrame) -> Optional[str]:
    for cand in ["country","country_name","Country","Country Name","Entity","entity"]:
        if cand in df.columns:
            return cand
    objs = [c for c in df.columns if df[c].dtype == "object"]
    return objs[0] if objs else None

def best_year_col(df: pd.DataFrame) -> Optional[str]:
    for cand in ["year","Year","Time","time"]:
        if cand in df.columns: return cand
    for cand in ["date","Date"]:
        if cand in df.columns: return cand
    return None

def coerce_numeric_like(df: pd.DataFrame) -> pd.DataFrame:
    df2 = df.copy()
    for c in df2.columns:
        if df2[c].dtype == "object":
            # try to coerce (strings that are numbers will become numeric; true text stays object)
            df2[c] = pd.to_numeric(df2[c], errors="ignore")
    return df2


# ---------------------------
# Plotters
# ---------------------------
def plot_pred_hist(y_pred: np.ndarray, out_png: str, show: bool = False):
    plt.figure(figsize=(8,5))
    plt.hist(y_pred, bins=30, alpha=0.8)
    plt.title("Prediction Distribution")
    plt.xlabel("Predicted value"); plt.ylabel("Count")
    plt.grid(True, linestyle="--", linewidth=0.5)
    plt.tight_layout(); plt.savefig(out_png, dpi=160)
    if show:
        try: plt.show()
        except Exception: pass
    plt.close()
    logging.info(f"[PLOT] Saved → {out_png}")

def plot_pred_vs_actual(y_true: np.ndarray, y_pred: np.ndarray, out_png: str, show: bool = False):
    lim_min = float(min(y_true.min(), y_pred.min()))
    lim_max = float(max(y_true.max(), y_pred.max()))
    plt.figure(figsize=(7,7))
    plt.scatter(y_true, y_pred, s=18, alpha=0.7)
    plt.plot([lim_min, lim_max], [lim_min, lim_max], "r--", linewidth=1.5)
    plt.xlabel("True"); plt.ylabel("Predicted")
    plt.title("Predicted vs. True")
    plt.grid(True, linestyle="--", linewidth=0.5)
    plt.tight_layout(); plt.savefig(out_png, dpi=160)
    if show:
        try: plt.show()
        except Exception: pass
    plt.close()
    logging.info(f"[PLOT] Saved → {out_png}")

def plot_residual_heatmap(df_pred: pd.DataFrame, out_png: str, show: bool = False):
    """
    df_pred should contain: residual + some country and optionally year columns.
    """
    ccol = best_country_col(df_pred)
    ycol = best_year_col(df_pred)

    if ccol is None:
        logging.warning("[HEATMAP] No country-like column found; skipping residual heatmap.")
        return

    # If year is parseable to year number, use it
    if ycol is not None and not np.issubdtype(df_pred[ycol].dtype, np.number):
        try:
            yr = pd.to_datetime(df_pred[ycol], errors="coerce").dt.year
            if yr.notna().any():
                df_pred[ycol] = yr
        except Exception:
            pass

    if ycol is not None and ycol in df_pred.columns and np.issubdtype(df_pred[ycol].dtype, np.number):
        # mean residual by Country × Year
        pivot = df_pred.pivot_table(index=ccol, columns=ycol, values="residual", aggfunc="mean")
    else:
        pivot = df_pred.groupby(ccol)["residual"].mean().to_frame("mean_residual")

    # show top-N countries by count
    counts = df_pred.groupby(ccol).size().sort_values(ascending=False)
    keep = counts.head(35).index
    pivot = pivot.loc[pivot.index.intersection(keep)]

    if pivot.empty:
        logging.warning("[HEATMAP] Nothing to plot after filtering; skipping.")
        return

    plt.figure(figsize=(12,9))
    im = plt.imshow(pivot.values, aspect="auto", interpolation="nearest")
    plt.colorbar(im, fraction=0.046, pad=0.04, label="Residual (pred − true)")
    plt.yticks(range(len(pivot.index)), pivot.index)
    # x ticks (years or single col)
    try:
        plt.xticks(range(len(pivot.columns)), pivot.columns, rotation=90)
    except Exception:
        pass
    plt.title("Residual Heatmap (mean error by Country × Year)")
    plt.tight_layout(); plt.savefig(out_png, dpi=160)
    if show:
        try: plt.show()
        except Exception: pass
    plt.close()
    logging.info(f"[PLOT] Saved → {out_png}")


# ---------------------------
# Core
# ---------------------------
def predict_csv(
    input_csv: str = DATA_CSV_DEFAULT,
    out_csv: str = PRED_CSV_DEFAULT,
    show_plots: bool = True
) -> str:
    """
    Run predictions on an input CSV (defaults to training CSV).
    Saves predictions CSV and plots. Returns out_csv path.
    If the CSV contains the target column, also computes metrics and residual plots.
    """
    if not os.path.exists(PKL_PATH):
        raise FileNotFoundError(f"Missing sklearn pipeline: {PKL_PATH}. Train first.")

    os.makedirs(os.path.dirname(out_csv) or ".", exist_ok=True)

    # Load pipeline
    bundle = joblib.load(PKL_PATH)
    sk_pipe = bundle if not isinstance(bundle, dict) else bundle.get("pipeline", bundle)

    # Read input CSV
    df = pd.read_csv(input_csv)
    df = coerce_numeric_like(df)

    # Detect target (if present)
    target = pick_target_column(df)
    has_target = target in df.columns if target else False
    if has_target:
        logging.info(f"[PRED] Detected target column in input: '{target}'. Will compute metrics & residual plots.")
        X = df.drop(columns=[target])
        y_true = pd.to_numeric(df[target], errors="coerce")
    else:
        logging.info("[PRED] No target column detected in input — making predictions only.")
        X = df
        y_true = None

    # Predict
    y_pred = sk_pipe.predict(X)

    # Build prediction frame
    pred_df = df.copy()
    pred_df["y_pred"] = y_pred

    # Save predictions CSV
    pred_df.to_csv(out_csv, index=False, encoding="utf-8")
    logging.info(f"[SAVE] Predictions CSV → {out_csv}")

    # Always: prediction histogram
    plot_pred_hist(y_pred, PRED_HIST_PNG, show=show_plots)

    # If ground truth exists: metrics + plots
    if has_target and y_true is not None and y_true.notna().any():
        # Align to valid rows (avoid NaNs)
        mask = y_true.notna()
        y_true_v = y_true[mask].values
        y_pred_v = y_pred[mask]

        if len(y_true_v) > 0:
            r2  = r2_score(y_true_v, y_pred_v)
            rmse = np.sqrt(mean_squared_error(y_true_v, y_pred_v))
            mae  = mean_absolute_error(y_true_v, y_pred_v)
            logging.info(f"[METRICS] R2={r2:.4f} | RMSE={rmse:.4f} | MAE={mae:.4f}")

            # Scatter plot
            plot_pred_vs_actual(y_true_v, y_pred_v, SCATTER_PNG, show=show_plots)

            # Residual heatmap if we can
            df_resid = df.loc[mask].copy()
            df_resid["y_true"] = y_true_v
            df_resid["y_pred"] = y_pred_v
            df_resid["residual"] = df_resid["y_pred"] - df_resid["y_true"]
            plot_residual_heatmap(df_resid, HEAT_PNG, show=show_plots)
        else:
            logging.warning("[METRICS] After dropping NaNs, no rows left to compute metrics/plots.")
    else:
        logging.info("[PRED] No ground truth → skipping metrics, scatter, and residual heatmap.")

    return out_csv


# ---------------------------
# CLI
# ---------------------------
def _parse_args(argv=None):
    p = argparse.ArgumentParser(description="EcoDetect prediction with plots.")
    p.add_argument("--input", default=DATA_CSV_DEFAULT, help="Input CSV path. Defaults to training CSV.")
    p.add_argument("--out",   default=PRED_CSV_DEFAULT, help="Output predictions CSV path.")
    p.add_argument("--no_show", action="store_true", help="Do not display plots (still saved to disk).")
    return p.parse_args(argv)

def main():
    args = _parse_args()
    predict_csv(input_csv=args.input, out_csv=args.out, show_plots=not args.no_show)

if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--input INPUT] [--out OUT] [--no_show]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\sagni\AppData\Roaming\jupyter\runtime\kernel-8d3d415c-51ce-4bc4-9c82-f5093e2fd094.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
