In [2]:
import os
import json
import yaml
import logging
import time
from typing import List, Optional

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# =======================
# Paths
# =======================
# The defaults are the original author's local Windows folders. Set the
# ECO_DETECT_DATA_CSV / ECO_DETECT_OUT_DIR environment variables to run on
# another machine without editing this file; unset or empty variables fall
# back to the defaults below.
DATA_CSV = (
    os.environ.get("ECO_DETECT_DATA_CSV")
    or r"C:\Users\sagni\Downloads\Eco Detect\archive\goal15.forest_shares.csv"
)
OUT_DIR = (
    os.environ.get("ECO_DETECT_OUT_DIR")
    or r"C:\Users\sagni\Downloads\Eco Detect"
)

# Every artifact is written directly under OUT_DIR.
H5_PATH = os.path.join(OUT_DIR, "eco_forest_mlp.h5")              # Keras MLP weights/model
PKL_PATH = os.path.join(OUT_DIR, "eco_forest_rf.pkl")             # fitted sklearn pipeline
CONFIG_YAML = os.path.join(OUT_DIR, "eco_forest_config.yaml")     # run configuration summary
METRICS_JSON = os.path.join(OUT_DIR, "eco_forest_metrics.json")   # validation metrics
HISTORY_CSV = os.path.join(OUT_DIR, "history.csv")                # per-epoch Keras history

# =======================
# Train config
# =======================
TEST_SIZE = 0.2      # fraction of rows held out for validation
RANDOM_STATE = 42    # seed shared by the stochastic steps in main()
BATCH_SIZE = 32      # Keras mini-batch size
EPOCHS = 60          # upper bound; early stopping may end training sooner
PATIENCE = 6         # early-stopping patience, in epochs

logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")

# =======================
# Helpers
# =======================
def pick_target_column(df: pd.DataFrame) -> str:
    """Choose the column to use as the regression target.

    Selection order:

    1. The first entry of a fixed list of forest-share-like names
       (``forest_share`` ... ``value``, ``trend``) that matches a column
       label, ignoring case and surrounding whitespace.
    2. Otherwise, the last numeric column of ``df``.

    Args:
        df: Frame whose columns are inspected; it is not modified.

    Returns:
        The chosen column label exactly as it appears in ``df.columns``.

    Raises:
        ValueError: If no preferred name matches and ``df`` has no numeric
            column.
    """
    # Normalised name -> original label. ``str()`` keeps non-string labels
    # (e.g. integer headers) from crashing the lookup, and ``strip()`` lets
    # headers with stray whitespace match. If two labels normalise to the
    # same key, the right-most column wins.
    by_normalised = {str(c).strip().lower(): c for c in df.columns}

    preferred_order = (
        "forest_share", "forest_share_percent", "forest_area_pct", "forest_area_percent",
        "share", "value", "trend",
    )
    for name in preferred_order:
        if name in by_normalised:
            return by_normalised[name]

    # No preferred name present: fall back to the last numeric column.
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if num_cols:
        return num_cols[-1]

    # Nothing usable: make the caller choose explicitly.
    raise ValueError("Could not determine target column. Please rename/choose one explicitly.")

def best_country_col(df: pd.DataFrame) -> Optional[str]:
    """Return the label of the column most likely to identify a country.

    A fixed list of well-known names is tried first, in priority order
    (exact, case-sensitive match). If none is present, the first column
    whose dtype compares equal to ``"object"`` is used instead.

    Returns:
        The column label, or ``None`` when neither rule finds a column.
    """
    known_names = ("country", "country_name", "Country", "Country Name", "Entity", "entity")
    match = next((name for name in known_names if name in df.columns), None)
    if match is not None:
        return match

    # Fallback: first object-dtype column, in the frame's column order.
    text_columns = [col for col in df.columns if df[col].dtype == "object"]
    if not text_columns:
        return None
    return text_columns[0]

def best_year_col(df: pd.DataFrame) -> Optional[str]:
    """Return the label of the column most likely to hold the year.

    Year/time names take priority over date names; within each family the
    order below is the lookup order. Matching is exact and case-sensitive.
    Only the label is returned: no parsing or conversion happens here.

    Returns:
        The column label, or ``None`` when no known name is present.
    """
    lookup_order = ("year", "Year", "Time", "time", "date", "Date")
    present = [name for name in lookup_order if name in df.columns]
    return present[0] if present else None

def interpolate_target(df: pd.DataFrame, target: str) -> pd.DataFrame:
    """Fill missing target values by linear interpolation, then drop leftovers.

    When both a country-like column and a numeric year column are found (via
    ``best_country_col`` / ``best_year_col``), rows are sorted by country and
    year and the target is interpolated separately within each country.
    Otherwise the target is interpolated once over the existing row order.

    Interpolation uses ``method="linear"``, which treats neighbouring rows as
    equally spaced (the year values are only used for sorting, not as
    distances). ``limit_direction="both"`` also fills leading and trailing
    gaps. Rows whose country is missing are not part of any group and keep
    their original target value.

    Args:
        df: Input frame; it is copied, never modified in place.
        target: Label of the column to repair.

    Returns:
        A copy of ``df`` without missing values in ``target``. In the grouped
        case the rows come back sorted by country and year. The index is
        reset only when rows had to be dropped.
    """
    df2 = df.copy()
    country_col = best_country_col(df2)
    year_col = best_year_col(df2)

    # A non-numeric year/date column is converted to a calendar year so it
    # can serve as a sort key. ``is_numeric_dtype`` is used instead of
    # ``np.issubdtype`` because the latter raises on pandas extension dtypes.
    if year_col is not None and not pd.api.types.is_numeric_dtype(df2[year_col]):
        try:
            parsed_year = pd.to_datetime(df2[year_col], errors="coerce").dt.year
        except (AttributeError, TypeError, ValueError, OverflowError) as exc:
            # Best effort only: keep the column as-is and fall through to the
            # global interpolation below.
            logging.debug("Could not parse '%s' as dates: %s", year_col, exc)
        else:
            # Only overwrite when at least one value parsed successfully.
            if parsed_year.notna().any():
                df2[year_col] = parsed_year

    before_missing = int(df2[target].isna().sum())

    if country_col and year_col and pd.api.types.is_numeric_dtype(df2[year_col]):
        # Sort then interpolate within groups
        df2 = df2.sort_values([country_col, year_col])
        filled = df2.groupby(country_col, group_keys=False)[target].apply(
            lambda s: s.interpolate(method="linear", limit_direction="both")
        )
        # groupby() skips rows whose country is missing, so ``filled`` can
        # cover fewer rows than ``df2``. Write back only the processed rows;
        # assigning the whole column would blank valid targets on the
        # skipped rows through index alignment.
        df2.loc[filled.index, target] = filled
    else:
        # global interpolation on index order
        df2[target] = df2[target].interpolate(method="linear", limit_direction="both")

    after_missing = int(df2[target].isna().sum())
    logging.info(f"[NaN repair] Target '{target}': {before_missing} NaNs → {after_missing} after interpolation.")

    # Drop any remaining missing target rows (e.g. groups with no value at all)
    if after_missing > 0:
        df2 = df2.dropna(subset=[target]).reset_index(drop=True)
        logging.info(f"[NaN repair] Dropped remaining {after_missing} rows with missing '{target}'.")

    return df2

# =======================
# Main
# =======================
def main():
    """Train, evaluate and persist two regressors for the forest-share CSV.

    Steps: load ``DATA_CSV``; convert number-only text columns to numeric
    dtypes; pick and repair the target column; split into train/validation;
    fit a random forest inside a scikit-learn preprocessing pipeline; fit a
    Keras MLP on the output of that same fitted preprocessor; write both
    models, the Keras training history, the validation metrics and a config
    summary to the module-level artifact paths.

    Raises:
        ValueError: If no target column can be determined, or if no rows
            remain after the target has been cleaned.
    """
    t0 = time.time()
    os.makedirs(OUT_DIR, exist_ok=True)

    # 1) Load
    df = pd.read_csv(DATA_CSV)
    logging.info(f"Loaded {DATA_CSV} with shape {df.shape}")

    # 2) Coerce text columns that contain only numbers (some datasets store
    #    numbers as strings). Done before target selection so the numeric
    #    fallback in pick_target_column can see these columns.
    #    pd.to_numeric(errors="ignore") is deprecated, hence the try/except.
    for c in df.columns:
        if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c]):
            try:
                df[c] = pd.to_numeric(df[c])
            except (ValueError, TypeError):
                # Genuine text column: leave it untouched.
                continue

    # 3) Pick target & repair NaNs
    target_col = pick_target_column(df)
    logging.info(f"Using target column: {target_col}")

    # Unparseable target entries become NaN so they are interpolated or
    # dropped below instead of crashing the float conversion later on.
    df[target_col] = pd.to_numeric(df[target_col], errors="coerce")

    df = interpolate_target(df, target_col)
    if df.empty:
        raise ValueError("Dataset became empty after cleaning target NaNs.")

    # 4) Split features/target
    X = df.drop(columns=[target_col])
    y = df[target_col].astype(float)

    # Identify column types by dtype: anything non-numeric (object, string,
    # category, ...) is one-hot encoded, the rest is imputed and scaled.
    cat_cols = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]
    num_cols = [c for c in X.columns if c not in cat_cols]

    # 5) Train/val split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # 6) Preprocessing (with imputers to handle any remaining NaNs)
    num_tf = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    cat_tf = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ])

    pre = ColumnTransformer(
        transformers=[
            ("num", num_tf, num_cols),
            ("cat", cat_tf, cat_cols),
        ],
        remainder="drop"
    )

    # 7) Sklearn model
    rf = RandomForestRegressor(
        n_estimators=400,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )
    sk_pipe = Pipeline([("pre", pre), ("rf", rf)])
    sk_pipe.fit(X_train, y_train)

    y_pred_rf = sk_pipe.predict(X_val)
    rmse_rf = float(np.sqrt(mean_squared_error(y_val, y_pred_rf)))
    mae_rf  = float(mean_absolute_error(y_val, y_pred_rf))
    r2_rf   = float(r2_score(y_val, y_pred_rf))

    joblib.dump(sk_pipe, PKL_PATH)
    logging.info(f"[SKLEARN] Saved → {PKL_PATH}")

    # 8) Keras model — reuse the preprocessor fitted on the training split
    #    only, so no validation statistics leak into the features.
    preproc = sk_pipe.named_steps["pre"]

    # Keras needs dense arrays; the one-hot encoder may produce a sparse
    # matrix. Both splits end up as float32 regardless of the branch taken.
    Xtr_proc, Xva_proc = (
        np.asarray(m.toarray() if hasattr(m, "toarray") else m, dtype=np.float32)
        for m in (preproc.transform(X_train), preproc.transform(X_val))
    )

    # Seed Python, NumPy and the backend so weight initialisation, dropout
    # and shuffling are repeatable between runs.
    keras.utils.set_random_seed(RANDOM_STATE)

    input_dim = Xtr_proc.shape[1]
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(192, activation="relu"),
        layers.Dropout(0.25),
        layers.Dense(96, activation="relu"),
        layers.Dense(1, activation="linear"),
    ])
    model.compile(optimizer="adam", loss="mse", metrics=["mae"])

    # NOTE: the validation split drives early stopping and is also the set
    # the metrics below are reported on, so those figures are optimistic.
    es = keras.callbacks.EarlyStopping(monitor="val_loss", patience=PATIENCE, restore_best_weights=True)
    hist = model.fit(
        Xtr_proc, y_train.values,
        validation_data=(Xva_proc, y_val.values),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=0,
        callbacks=[es]
    )

    # Save train history for optional plotting
    hist_df = pd.DataFrame(hist.history)
    hist_df.insert(0, "epoch", np.arange(1, len(hist_df) + 1))
    hist_df.to_csv(HISTORY_CSV, index=False)

    # Evaluate (the compiled loss is MSE, so its square root is the RMSE)
    loss, mae_k = model.evaluate(Xva_proc, y_val.values, verbose=0)
    rmse_k = float(np.sqrt(loss))
    mae_k  = float(mae_k)
    # R² is reported for the MLP as well, for parity with the random forest.
    y_pred_k = model.predict(Xva_proc, verbose=0).ravel()
    r2_k = float(r2_score(y_val, y_pred_k))

    model.save(H5_PATH)
    logging.info(f"[KERAS] Saved → {H5_PATH}")

    # 9) Metrics + Config
    metrics = {
        "dataset": {
            "rows": int(df.shape[0]),
            "cols": int(df.shape[1]),
        },
        "target": target_col,
        "sklearn_random_forest": {
            "rmse": rmse_rf,
            "mae": mae_rf,
            "r2": r2_rf
        },
        "keras_mlp": {
            "rmse": rmse_k,
            "mae": mae_k,
            "r2": r2_k,
            "epochs_trained": int(len(hist.history.get("loss", [])))
        }
    }
    with open(METRICS_JSON, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
    logging.info(f"[META] Saved → {METRICS_JSON}")

    cfg = {
        "data": {
            "csv": DATA_CSV,
            "target": target_col,
            "numeric_features": num_cols,
            "categorical_features": cat_cols
        },
        "preprocessing": {
            "numeric_imputer": "median",
            "categorical_imputer": "most_frequent",
            "scaler": "StandardScaler",
            "ohe_handle_unknown": "ignore"
        },
        "training": {
            "test_size": TEST_SIZE,
            "random_state": RANDOM_STATE,
            "batch_size": BATCH_SIZE,
            "epochs": EPOCHS,
            "patience": PATIENCE
        },
        "models": {
            "sklearn_random_forest": os.path.basename(PKL_PATH),
            "keras_mlp_h5": os.path.basename(H5_PATH)
        },
        "artifacts": {
            "metrics_json": os.path.basename(METRICS_JSON),
            "history_csv": os.path.basename(HISTORY_CSV),
            "config_yaml": os.path.basename(CONFIG_YAML)
        }
    }
    with open(CONFIG_YAML, "w", encoding="utf-8") as f:
        yaml.safe_dump(cfg, f, sort_keys=False, allow_unicode=True)
    logging.info(f"[CFG] Saved → {CONFIG_YAML}")

    logging.info(f"=== DONE in {time.time() - t0:.1f}s ===")
    logging.info(f"H5  -> {H5_PATH}")
    logging.info(f"PKL -> {PKL_PATH}")
    logging.info(f"CFG -> {CONFIG_YAML}")
    logging.info(f"MET -> {METRICS_JSON}")
    logging.info(f"HIS -> {HISTORY_CSV}")

if __name__ == "__main__":
    main()


2025-09-02 19:52:30,010 | INFO | Loaded C:\Users\sagni\Downloads\Eco Detect\archive\goal15.forest_shares.csv with shape (237, 4)
2025-09-02 19:52:30,012 | INFO | Using target column: trend
2025-09-02 19:52:30,018 | INFO | [NaN repair] Target 'trend': 10 NaNs → 0 after interpolation.
2025-09-02 19:52:30,783 | INFO | [SKLEARN] Saved → C:\Users\sagni\Downloads\Eco Detect\eco_forest_rf.pkl
2025-09-02 19:52:38,386 | INFO | [KERAS] Saved → C:\Users\sagni\Downloads\Eco Detect\eco_forest_mlp.h5
2025-09-02 19:52:38,388 | INFO | [META] Saved → C:\Users\sagni\Downloads\Eco Detect\eco_forest_metrics.json
2025-09-02 19:52:38,390 | INFO | [CFG] Saved → C:\Users\sagni\Downloads\Eco Detect\eco_forest_config.yaml
2025-09-02 19:52:38,390 | INFO | === DONE in 8.4s ===
2025-09-02 19:52:38,391 | INFO | H5  -> C:\Users\sagni\Downloads\Eco Detect\eco_forest_mlp.h5
2025-09-02 19:52:38,391 | INFO | PKL -> C:\Users\sagni\Downloads\Eco Detect\eco_forest_rf.pkl
2025-09-02 19:52:38,392 | INFO | CFG -> C:\Users\sag