In [None]:
# cai_normalisation_entropy.py
# Normalisation + reverse normalisation + ENTROPY weighting (Step-1 proportions are unweighted
# by default and weighted when a weight column is supplied) + CAI aggregation
# + optional RICA-weighted population summaries (for reporting only)

from __future__ import annotations
import pandas as pd, numpy as np
from typing import List, Dict, Optional, Iterable
from pathlib import Path


# ------------------- 3.7.1.1 Normalisation -------------------
class MinMaxCAINormalizer:
    """
    Min-max normalisation of indicator columns, optionally within groups.

    fit() stores, per feature, the min, max, range and an is-constant flag in
    ``params_`` (a single row, or one row per ``groupby`` combination).
    transform() adds ``<feature><suffix>`` columns holding
    ``(x - min) / range`` clipped to [0, 1]; features listed in
    ``reverse_features`` are flipped to ``1 - z``. Constant features
    (|range| < 1e-15) are set to 0 before the optional flip.
    inverse_transform() maps normalised columns back to the original scale and
    writes them to ``<feature>__recovered``.

    Parameters:
        features: columns to normalise (duplicates are dropped, order kept).
        reverse_features: features whose normalised value is flipped; any
            iterable of names is accepted (list, set, pandas Index, ...).
        groupby: column name(s) defining the groups; empty/None = whole frame.
        suffix: appended to each feature name to build the output column.
    """

    def __init__(self, features: List[str], reverse_features: Optional[Iterable[str]] = None,
                 groupby: Optional[Iterable[str]] = None, suffix: str = "_n"):
        self.features = list(dict.fromkeys(features))
        # Compare with None explicitly: ``x or []`` raises ValueError when x is
        # a pandas Index or a NumPy array (ambiguous truth value).
        self.reverse_features = set(reverse_features) if reverse_features is not None else set()
        if groupby is None:
            self.groupby = []
        elif isinstance(groupby, str):
            # A bare column name is treated as a one-element list.
            self.groupby = [groupby]
        else:
            self.groupby = list(groupby)
        self.suffix = suffix
        self.params_: Optional[pd.DataFrame] = None

    def _compute_params(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the parameter table (min/max/range/is_constant per feature).

        Raises:
            KeyError: if any configured feature is absent from ``df``.
        """
        miss = [c for c in self.features if c not in df.columns]
        if miss:
            raise KeyError(f"Missing features: {miss}")
        if self.groupby:
            # dropna=False keeps rows whose group key is NaN as their own group.
            g = df[self.groupby + self.features].groupby(self.groupby, dropna=False)
            mins = g.min(numeric_only=True).rename(columns={c: f"{c}__min" for c in self.features})
            maxs = g.max(numeric_only=True).rename(columns={c: f"{c}__max" for c in self.features})
            params = mins.join(maxs, how="outer").reset_index()
        else:
            mins = df[self.features].min(numeric_only=True).rename(lambda c: f"{c}__min")
            maxs = df[self.features].max(numeric_only=True).rename(lambda c: f"{c}__max")
            # One-row frame so grouped and ungrouped parameters share a layout.
            params = pd.concat([mins, maxs], axis=0).to_frame("value").T
        for f in self.features:
            params[f"{f}__range"] = params[f"{f}__max"] - params[f"{f}__min"]
            params[f"{f}__is_constant"] = params[f"{f}__range"].abs() < 1e-15
        return params

    def _attach_params(self, frame: pd.DataFrame) -> pd.DataFrame:
        """Return ``frame`` joined row-by-row with the fitted parameters.

        The result keeps the row order of ``frame`` but has a fresh
        RangeIndex, so values taken from it must be written back positionally.
        ``frame`` itself is not modified.

        Raises:
            RuntimeError: if called before fit().
        """
        if self.params_ is None:
            raise RuntimeError("fit() first")
        if self.groupby:
            return frame.merge(self.params_, on=self.groupby, how="left", validate="m:1")
        # Single parameter row: a cross join broadcasts it to every data row
        # without needing a temporary join key on the caller's frame.
        return frame.merge(self.params_, how="cross")

    @staticmethod
    def _constant_mask(flags: pd.Series) -> np.ndarray:
        """Boolean array that is True only where the fitted range is ~0.

        After a left merge, rows from groups unseen during fit carry NaN in
        the flag column; NaN is truthy for ``np.where``, so compare with True
        explicitly to keep those rows out of the constant branch.
        """
        return flags.eq(True).to_numpy()

    def fit(self, df: pd.DataFrame):
        """Compute and store the normalisation parameters; returns ``self``."""
        self.params_ = self._compute_params(df)
        return self

    def transform(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame:
        """Add the normalised ``<feature><suffix>`` columns.

        Rows whose group was not seen during fit() get NaN (no parameters are
        available for them). With ``inplace=True`` the columns are added to
        ``df`` itself; otherwise a copy is returned.

        Raises:
            RuntimeError: if called before fit().
        """
        if self.params_ is None:
            raise RuntimeError("fit() first")
        out = df if inplace else df.copy()
        merged = self._attach_params(out)
        for f in self.features:
            x = merged[f].astype(float)
            lo = merged[f"{f}__min"]
            span = merged[f"{f}__range"]
            is_const = self._constant_mask(merged[f"{f}__is_constant"])
            z = np.where(is_const, 0.0, (x - lo) / span)
            # Values outside the fitted [min, max] are clamped to the bounds.
            z = np.clip(z, 0.0, 1.0)
            if f in self.reverse_features:
                z = 1.0 - z
            # z is a plain ndarray, so this assignment is positional.
            out[f"{f}{self.suffix}"] = z
        return out

    def inverse_transform(self, df_norm: pd.DataFrame, inplace: bool = False) -> pd.DataFrame:
        """Add ``<feature>__recovered`` columns on the original scale.

        Raises:
            RuntimeError: if called before fit().
            KeyError: if a normalised column is missing from ``df_norm``.
        """
        if self.params_ is None:
            raise RuntimeError("fit() first")
        out = df_norm if inplace else df_norm.copy()
        merged = self._attach_params(out)
        for f in self.features:
            zcol = f"{f}{self.suffix}"
            if zcol not in merged.columns:
                raise KeyError(f"Missing {zcol}")
            z = merged[zcol].astype(float)
            if f in self.reverse_features:
                # Undo the flip applied by transform().
                z = 1.0 - z
            lo = merged[f"{f}__min"]
            span = merged[f"{f}__range"]
            is_const = self._constant_mask(merged[f"{f}__is_constant"])
            # Constant features carry no information in z; return their min.
            out[f"{f}__recovered"] = np.where(is_const, lo, z * span + lo)
        return out

    def report(self) -> pd.DataFrame:
        """Long-format table of the fitted parameters, one row per
        (group, variable).

        Raises:
            RuntimeError: if called before fit().
        """
        if self.params_ is None:
            raise RuntimeError("fit() first")
        rows = []
        for _, r in self.params_.iterrows():
            gvals = {g: r[g] for g in self.groupby} if self.groupby else {}
            for f in self.features:
                rows.append({
                    **gvals, "variable": f,
                    "min": r[f"{f}__min"], "max": r[f"{f}__max"], "range": r[f"{f}__range"],
                    "is_constant": bool(r[f"{f}__is_constant"]), "reversed": (f in self.reverse_features)
                })
        return pd.DataFrame(rows)


# --------------- 3.7.1.3 Entropy (Step-1) + 3.7.1.4 Aggregation ----------------
class EntropyAggregator:
    """
    Entropy weighting per eco-scheme (block) + aggregation.
    If weight_col is provided, Step-1 proportions are WEIGHTED:
        p_ij = w_i * z_ij / sum_i w_i * z_ij
    Otherwise defaults to unweighted.

    Parameters:
        blocks_raw: mapping block name -> raw indicator names; the normalised
            columns are looked up as ``<raw><norm_suffix>``.
        norm_suffix: suffix of the normalised columns.
        groupby: column name(s); when given, weights are estimated per group.
        eps: lower clip for proportions before taking the log, and threshold
            below which the total divergence is treated as zero.
        weight_col: optional column of observation weights for Step 1.
    """
    def __init__(self, blocks_raw: Dict[str, List[str]], norm_suffix: str = "_n",
                 groupby: Optional[List[str]] = None, eps: float = 1e-12, weight_col: Optional[str] = None):
        self.blocks_raw = {k: list(dict.fromkeys(v)) for k, v in blocks_raw.items()}
        self.norm_suffix = norm_suffix
        # Explicit None check so that non-list iterables (e.g. a pandas Index)
        # do not trip over an ambiguous truth value.
        self.groupby = list(groupby) if groupby is not None else []
        self.eps = eps
        self.weights_: Optional[pd.DataFrame] = None
        self.weight_col = weight_col

    def _norm_cols(self, raw_cols: Iterable[str]) -> List[str]:
        """Map raw indicator names to their normalised column names."""
        return [f"{c}{self.norm_suffix}" for c in raw_cols]

    def _entropy_weights_block(self, dfg: pd.DataFrame, block: str, cols_norm: List[str]) -> pd.DataFrame:
        """Entropy weights of one block on one (sub)frame.

        Returns a frame with columns construct / indicator / H / d / w, one
        row per normalised indicator. Rows where every indicator is NaN are
        ignored; remaining NaN are treated as 0.
        """
        Z = dfg[cols_norm].copy()
        # Positional boolean mask: unlike a label lookup it stays correct
        # when the frame's index contains duplicate labels.
        keep = Z.notna().any(axis=1).to_numpy()
        Z = Z.loc[keep].fillna(0.0)

        # If all rows dropped → return uniform
        n_eff = len(Z)
        if n_eff == 0:
            return pd.DataFrame({
                "construct": block, "indicator": cols_norm, "H": np.nan, "d": 0.0,
                "w": 1.0 / max(len(cols_norm), 1)
            })

        # ----- Step 1: proportions (weighted if weight_col is provided) -----
        if self.weight_col and self.weight_col in dfg.columns:
            # Missing weights count as 0; negative weights are clipped to 0.
            w = dfg.loc[keep, self.weight_col].fillna(0.0).clip(lower=0.0).to_numpy()
            numer = Z.mul(w, axis=0)
        else:
            numer = Z
        col_sums = numer.sum(axis=0).replace(0.0, np.nan)
        P = numer.divide(col_sums, axis=1)

        for c in P.columns:
            if pd.isna(col_sums[c]):
                P[c] = 1.0 / n_eff  # uniform if column sum == 0

        # ----- Step 2: entropy -----
        # max(n_eff, 2) avoids dividing by log(1) = 0 for a single row.
        k = 1.0 / np.log(max(n_eff, 2))
        P_safe = P.clip(self.eps, 1.0)
        H = (-k * (P_safe * np.log(P_safe)).sum(axis=0)).to_frame(name="H")

        # ----- Step 3: normalized weights -----
        d = (1.0 - H["H"]).clip(lower=0.0)
        denom = d.sum()
        # No indicator carries information -> fall back to equal weights.
        wj = (d / denom) if denom > self.eps else pd.Series(np.full(len(d), 1.0 / len(d)), index=d.index)

        return pd.DataFrame({
            "construct": block,
            "indicator": wj.index,
            "H": H["H"].values,
            "d": d.values,
            "w": wj.values
        })

    def fit(self, df_norm: pd.DataFrame):
        """Estimate entropy weights for every block (and group); returns self.

        Raises:
            KeyError: if a required normalised column is missing.
        """
        needed = []
        for raws in self.blocks_raw.values():
            needed += self._norm_cols(raws)
        miss = [c for c in set(needed) if c not in df_norm.columns]
        if miss:
            raise KeyError(f"Missing normalized columns: {miss}")

        rows = []
        if self.groupby:
            for gkey, dfg in df_norm.groupby(self.groupby, dropna=False):
                # Depending on the pandas version a single-key group label is
                # either a scalar or a 1-tuple; normalise to a tuple.
                gdict = dict(zip(self.groupby, gkey if isinstance(gkey, tuple) else (gkey,)))
                for block, raws in self.blocks_raw.items():
                    cols = self._norm_cols(raws)
                    wtbl = self._entropy_weights_block(dfg, block, cols)
                    for gcol, gval in gdict.items():
                        wtbl[gcol] = gval
                    rows.append(wtbl)
        else:
            for block, raws in self.blocks_raw.items():
                cols = self._norm_cols(raws)
                rows.append(self._entropy_weights_block(df_norm, block, cols))
        self.weights_ = pd.concat(rows, ignore_index=True)
        return self

    def transform(self, df_norm: pd.DataFrame, out_prefix: str = "CAI_") -> pd.DataFrame:
        """Return a copy of ``df_norm`` with one ``<out_prefix><block>`` column
        per block: the weighted sum of the block's normalised indicators
        (NaN indicators and missing weights contribute 0).

        Raises:
            RuntimeError: if called before fit().
        """
        if self.weights_ is None:
            raise RuntimeError("fit() first")
        out = df_norm.copy()
        gcols = self.groupby

        if gcols:
            for block, raws in self.blocks_raw.items():
                cols = self._norm_cols(raws)
                wsub = self.weights_[self.weights_["construct"] == block]
                wwide = wsub.pivot_table(index=gcols, columns="indicator", values="w")
                wwide = wwide.add_prefix("W::").reset_index()
                merged = out.merge(wwide, on=gcols, how="left")
                # merge() returns a fresh RangeIndex, so accumulate plain
                # arrays and assign positionally; assigning a Series would
                # realign on the index and misplace or blank the scores
                # whenever df_norm does not have a default 0..n-1 index.
                total = np.zeros(len(merged), dtype=float)
                for col in cols:
                    wcol = "W::" + col
                    if wcol not in merged.columns:
                        continue  # no weight fitted -> contributes nothing
                    total = total + (merged[col].fillna(0.0).to_numpy(dtype=float)
                                     * merged[wcol].fillna(0.0).to_numpy(dtype=float))
                out[f"{out_prefix}{block}"] = total
        else:
            for block, raws in self.blocks_raw.items():
                cols = self._norm_cols(raws)
                z = out[cols].copy().fillna(0.0)
                w = (self.weights_[self.weights_["construct"] == block]
                     .set_index("indicator")["w"]).reindex(cols).fillna(0.0).values
                out[f"{out_prefix}{block}"] = (z * w).sum(axis=1).to_numpy()
        return out

    def report_weights(self) -> pd.DataFrame:
        """Copy of the fitted weights with the raw variable name and a flag
        telling whether a Step-1 weight column was configured.

        Raises:
            RuntimeError: if called before fit().
        """
        if self.weights_ is None:
            raise RuntimeError("fit() first")
        rep = self.weights_.copy()
        suffix = self.norm_suffix
        # Strip the suffix literally; using it as a regex pattern breaks for
        # suffixes containing metacharacters such as "[", "+" or "(".
        rep["raw_variable"] = rep["indicator"].map(
            lambda name: name[:len(name) - len(suffix)] if suffix and name.endswith(suffix) else name
        )
        rep["step1_unweighted"] = self.weight_col is None
        return rep


# ----------------------- Weighted summaries (RICA) -----------------------
def weighted_mean(x: pd.Series, w: pd.Series) -> float:
    """Weighted mean of ``x`` with weights ``w`` (paired by position).

    Missing weights count as 0 and negative weights are clipped to 0.
    Observations whose value is NaN are excluded together with their weight,
    so a missing value no longer turns the whole result into NaN (the
    unweighted fallback already skipped NaN). If the usable weights sum to
    zero, the plain mean of the non-missing values is returned; if there are
    no non-missing values the result is NaN.
    """
    values = x.astype(float).to_numpy()
    weights = w.fillna(0.0).clip(lower=0.0).astype(float).to_numpy()
    valid = ~np.isnan(values)
    values, weights = values[valid], weights[valid]
    if values.size == 0:
        return float("nan")
    if weights.sum() <= 0:
        return float(values.mean())
    return float(np.average(values, weights=weights))


def weighted_group_mean(df: pd.DataFrame, value_col: str, weight_col: str, groupby: List[str]) -> pd.DataFrame:
    """Per-group summary of ``value_col``.

    For every ``groupby`` combination (NaN keys kept as their own group) the
    result holds the weighted mean of ``value_col``, the number of rows ``n``
    and the sum of the weights with missing weights counted as 0.
    """
    def _summarise(part: pd.DataFrame) -> pd.Series:
        part_weights = part[weight_col]
        stats = {
            "weighted_mean": weighted_mean(part[value_col], part_weights),
            "n": len(part),
            "sum_weights": part_weights.fillna(0).sum(),
        }
        return pd.Series(stats)

    grouped = df.groupby(groupby, dropna=False)
    summary = grouped.apply(_summarise)
    return summary.reset_index()


# -------------------
# Inputs
# -------------------
INFILE = "data/processed/01_preprocessed_knn_pmm_ohe_te.csv"
REV_FEATURES = "data/processed/reverse_features.csv"
OUT_NORM = "data/processed/02_normalized.csv"
OUT_WEIGHTS = "data/processed/entropy_weights.csv"
OUT_CAI = "data/processed/final_cai_scores.csv"
OUT_FINAL = "data/processed/normalised_with_cai.csv"

PESO_COL = "PESO"
ID_COLS = ["Cod_Azienda", "Anno"]

# Name of the single indicator block and of the composite column written out.
BLOCK_NAME = "ALL"
CAI_PREFIX = "CAI_"
CAI_COL = "CAI_farm"

# -------------------
# Load
# -------------------
df = pd.read_csv(INFILE)
df_rev_feat = pd.read_csv(REV_FEATURES)

# Indicators = everything except IDs and PESO
INDICATORS = [c for c in df.columns if c not in ID_COLS + [PESO_COL]]

# Identifier (and weight, when present) columns carried into every output.
META_COLS = ID_COLS + ([PESO_COL] if PESO_COL in df.columns else [])

# -------------------
# 1. Normalisation
# -------------------
# NOTE(review): the reversed feature names are taken from the CSV header -
# confirm this matches how reverse_features.csv is laid out.
# Passed as a plain list: a pandas Index has no unambiguous truth value.
normalizer = MinMaxCAINormalizer(features=INDICATORS, reverse_features=list(df_rev_feat.columns))
df_norm = normalizer.fit(df).transform(df)
NORM_COLS = [f"{c}{normalizer.suffix}" for c in INDICATORS]

# -------------------
# 2. Entropy weights + CAI
# -------------------
# For now, treat all indicators as one block ("ALL")
blocks = {BLOCK_NAME: INDICATORS}

aggregator = EntropyAggregator(blocks_raw=blocks, norm_suffix=normalizer.suffix, weight_col=PESO_COL)
# Estimate the entropy weights
aggregator.fit(df_norm)
# Aggregate the indicators into the composite score. transform() names the
# column "<prefix><block>" (here "CAI_ALL"); rename it to the column the
# outputs below expect, otherwise selecting CAI_farm raises KeyError.
df_cai = aggregator.transform(df_norm, out_prefix=CAI_PREFIX).rename(
    columns={f"{CAI_PREFIX}{BLOCK_NAME}": CAI_COL}
)

# -------------------
# Save results
# -------------------
# Normalized indicators
df_norm_out = pd.concat([df[META_COLS], df_norm[NORM_COLS]], axis=1)
df_norm_out.to_csv(OUT_NORM, index=False)

# Weights
weights_df = aggregator.report_weights()
weights_df.to_csv(OUT_WEIGHTS, index=False)

# Final CAI scores (composite index)
df_cai_out = pd.concat([df[META_COLS], df_cai[[CAI_COL]]], axis=1)
df_cai_out.to_csv(OUT_CAI, index=False)

# Also save the CAI scores alongside the original and normalised columns
df_final = df_norm.copy()  # Start with normalised data
df_final[CAI_COL] = df_cai[CAI_COL]  # Add CAI scores

# Save complete dataset with CAI scores
df_final.to_csv(OUT_FINAL, index=False)

print("✅ Saved normalized data, entropy weights, and CAI scores")