# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
# feature_engineering_knn_pmm.py
# ---------------------------------------------------------------
# 3.7.4 Feature Engineering
# - Merge many raw CSVs on keys
# - KNN imputation for CATEGORICALS (mode of neighbours by numeric distance)
# - PMM imputation for CONTINUOUS (Ridge-based predictive mean matching)
# - One-Hot + leakage-safe K-fold Target Encoding for categoricals
# - Saves a single modelling table with ID_COLS preserved
# ---------------------------------------------------------------

from __future__ import annotations
import glob, math, warnings
from functools import reduce
from pathlib import Path
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import Ridge
from sklearn.model_selection import StratifiedKFold

# ---------------------- CONFIG ----------------------
# Glob pattern for the raw input CSVs; every match is read and merged on ID_COLS.
RAW_GLOB = "data/raw/*.csv"
# Output directory for the modelling table.
OUT_DIR  = Path("data/processed")
# NOTE: runs at import time, so merely importing this module creates the directory.
OUT_DIR.mkdir(parents=True, exist_ok=True)

ID_COLS   = ["Cod_Azienda", "Anno"]   # merge keys; also carried through to the output table
TARGET    = "target_var"              # binary 0/1 target used for target encoding

# Variable types.
# Leave a list empty to have the main pipeline infer it from column dtypes
# via infer_types(); a non-empty list is used as given.
CATEGORICALS: List[str] = []
NUMERICALS:   List[str] = []

# Missing values
# KNN for categorical imputation: number of neighbours whose mode is taken.
KNN_K = 5

# PMM for continuous imputation: number of candidate donors per missing value.
PMM_K = 5
PMM_ALPHA = 1.0     # Ridge regularization for the predictive model
# Seed passed explicitly to the PMM and target-encoding steps in the main pipeline.
RANDOM_SEED = 42
# Also seeds NumPy's global RNG (import-time side effect).
np.random.seed(RANDOM_SEED)

# ---------------------------------------------------

def merge_on_keys(dfs: List[pd.DataFrame], on: List[str]) -> pd.DataFrame:
    """Outer-join all frames in *dfs* on the key columns *on*, left to right.

    Returns an empty frame that only has the key columns when *dfs* is empty,
    and the single input frame itself (not a copy) when *dfs* has one element.
    """
    if not dfs:
        return pd.DataFrame(columns=on)

    remaining = iter(dfs)
    merged = next(remaining)
    for frame in remaining:
        merged = pd.merge(merged, frame, on=on, how="outer")
    return merged

def infer_types(df: pd.DataFrame, id_cols: List[str]) -> Tuple[List[str], List[str]]:
    """Split the non-ID columns of *df* into categorical and numeric names.

    A column is numeric when pandas reports a numeric dtype (this includes
    bool). It is categorical when it holds text, i.e. it has ``object`` dtype
    or one of pandas' dedicated string dtypes. Comparing the dtype to
    ``"object"`` alone would miss the latter and silently drop text columns
    from both lists. Columns of any other dtype (datetime, category, ...) are
    left out of both lists.

    Parameters
    ----------
    df : frame whose columns are classified.
    id_cols : column names to exclude from both lists.

    Returns
    -------
    (categoricals, numericals), each in the column order of *df*.
    """
    excluded = set(id_cols)
    cats: List[str] = []
    nums: List[str] = []
    for name in df.columns:
        if name in excluded:
            continue
        column = df[name]
        if pd.api.types.is_numeric_dtype(column):
            nums.append(name)
        elif pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            cats.append(name)
    return cats, nums

# ---------- KNN imputation for CATEGORICALS (mode of NN) ----------
def knn_impute_categoricals(df: pd.DataFrame, categorical_cols: List[str], numeric_cols: List[str], k: int=5) -> pd.DataFrame:
    """
    Impute missing categories with the mode of the K nearest neighbours in numeric space.

    - A cell is missing when it is NaN or a blank / whitespace-only string.
    - Distances are Euclidean on the median-filled numeric predictors (unscaled).
    - Each categorical column is handled independently; its donors are the rows
      where that column is observed (neither NaN nor blank).
    - Ties between equally frequent categories go to the category of the
      closest neighbour (neighbours are returned nearest-first).
    - Without usable numeric predictors the column falls back to the mode of
      its observed values; a column with no observed value gets "MISSING".

    Parameters
    ----------
    df : input frame; it is not modified.
    categorical_cols : columns to impute.
    numeric_cols : columns used to measure distance between rows.
    k : number of neighbours (capped at the number of donors).

    Returns
    -------
    A copy of *df* with the categorical columns imputed.
    """
    # Local import: the module does not import Counter at the top level.
    from collections import Counter

    out = df.copy()

    # Numeric matrix for distances (median imputed). Skipped for an empty frame,
    # which the imputer cannot fit.
    X_num = None
    if numeric_cols and len(out) > 0:
        X_num = SimpleImputer(strategy="median").fit_transform(out[numeric_cols])
        if X_num.shape[1] == 0:
            # The imputer drops all-NaN columns; with none left there is no
            # space to search neighbours in.
            X_num = None

    for col in categorical_cols:
        series = out[col]
        miss_mask = (series.isna() | series.astype(str).str.strip().eq("")).to_numpy()
        if not miss_mask.any():
            continue

        donor_mask = ~miss_mask
        donor_vals = series[donor_mask].to_numpy(dtype=object)
        if donor_vals.size == 0:
            out.loc[miss_mask, col] = "MISSING"
            continue

        if X_num is None:
            # Mode over donors only, so a blank string can never be "imputed".
            out.loc[miss_mask, col] = series[donor_mask].mode(dropna=True).iloc[0]
            continue

        # Fit on donors only, then query all missing rows at once.
        nn = NearestNeighbors(n_neighbors=min(k, donor_vals.size), metric="euclidean")
        nn.fit(X_num[donor_mask])
        neighbour_idx = nn.kneighbors(X_num[miss_mask], return_distance=False)

        # Counter keeps first-seen order among equal counts, and neighbours
        # arrive nearest-first, so ties resolve to the closest donor's category.
        out.loc[miss_mask, col] = [
            Counter(donor_vals[row]).most_common(1)[0][0] for row in neighbour_idx
        ]

    return out

# ---------- PMM imputation for CONTINUOUS (single pass) ----------
def _standardize_pmm_features(feature_matrix: pd.DataFrame) -> Tuple[np.ndarray, list]:
    """Median-impute and z-score the predictors; return (array, labels of kept columns)."""
    has_data = feature_matrix.notna().any(axis=0).to_numpy()
    labels = [name for name, keep in zip(feature_matrix.columns, has_data) if keep]
    if not labels:
        return np.empty((len(feature_matrix), 0)), labels
    # All-NaN columns are removed up front: the imputer would drop them anyway
    # and the labels would then no longer line up with the array columns.
    X = feature_matrix.iloc[:, np.flatnonzero(has_data)].to_numpy()
    X = SimpleImputer(strategy="median").fit_transform(X)
    X = StandardScaler().fit_transform(X)
    return X, labels


def pmm_impute_continuous(
    df: pd.DataFrame,
    cont_cols: List[str],
    feature_matrix: pd.DataFrame,
    k: int = 5,
    alpha: float = 1.0,
    random_state: int = 42
) -> pd.DataFrame:
    """
    Predictive Mean Matching (single imputation) per continuous variable:
      1) Fit Ridge on observed rows: y ~ features
      2) Compute predictive means for observed (donor) and missing cases
      3) For each missing case, find k donors with closest predicted means
         and randomly draw a donor's ACTUAL observed y as the imputed value.

    The column being imputed is removed from its own predictors. If it stayed
    in (median-filled on the missing rows), the model would learn y ~ y and
    every missing row would be matched to donors near the median.

    Parameters
    ----------
    df : input frame; it is not modified.
    cont_cols : columns to impute.
    feature_matrix : predictors, one row per row of *df* in the same order.
        Missing entries are median-filled and all columns standardized.
    k : number of candidate donors per missing value.
    alpha : Ridge regularization strength.
    random_state : seed for the donor draw and for Ridge.

    Returns
    -------
    A copy of *df* with *cont_cols* imputed. A column with fewer than
    ``max(10, k + 1)`` observed values, or with no predictor left, is filled
    with its observed median (0.0 when nothing is observed).

    Raises
    ------
    ValueError : if *k* < 1 or *feature_matrix* and *df* differ in length.
    """
    if k < 1:
        raise ValueError("k must be >= 1")
    if len(feature_matrix) != len(df):
        raise ValueError("feature_matrix must have exactly one row per row of df")

    rng = np.random.RandomState(random_state)
    out = df.copy()
    prepared = None  # built lazily: only needed once a column requires a model

    for col in cont_cols:
        miss_mask = out[col].isna().to_numpy()
        if not miss_mask.any():
            continue

        obs_mask = ~miss_mask
        y_obs = out.loc[obs_mask, col].astype(float).to_numpy()
        if len(y_obs) < max(10, k + 1):
            # too few donors: fill with median
            out.loc[miss_mask, col] = np.nanmedian(y_obs) if len(y_obs) else 0.0
            continue

        if prepared is None:
            prepared = _standardize_pmm_features(feature_matrix)
        X_std, labels = prepared

        predictor_idx = [j for j, name in enumerate(labels) if name != col]
        if not predictor_idx:
            # nothing to predict from: fall back to the median as above
            out.loc[miss_mask, col] = np.nanmedian(y_obs)
            continue

        # Plain arrays, not DataFrames: scikit-learn rejects frames whose
        # column labels mix str and non-str types.
        X_obs = X_std[obs_mask][:, predictor_idx]
        X_mis = X_std[miss_mask][:, predictor_idx]

        model = Ridge(alpha=alpha, random_state=random_state)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            model.fit(X_obs, y_obs)

        mu_obs = model.predict(X_obs)
        mu_mis = model.predict(X_mis)

        # For each missing row, draw one of the k donors closest in prediction space.
        n_donors = min(k, len(y_obs))
        imputed_vals = np.empty(len(mu_mis), dtype=float)
        for i, mu in enumerate(mu_mis):
            dist = np.abs(mu_obs - mu)
            nearest = np.argpartition(dist, kth=n_donors - 1)[:n_donors]
            imputed_vals[i] = y_obs[rng.choice(nearest)]

        out.loc[miss_mask, col] = imputed_vals

    return out

# ---------- K-fold Target Encoding (leakage-safe) ----------
def _smoothed_category_means(keys: np.ndarray, y: np.ndarray, smoothing: float) -> Tuple[pd.Series, float]:
    """Return (smoothed target mean per category, prior) fitted on the given rows only."""
    prior = float(np.mean(y))
    stats = pd.Series(y).groupby(keys).agg(["sum", "count"])
    table = (stats["sum"] + prior * smoothing) / (stats["count"] + smoothing)
    return table, prior


def kfold_target_encode(
    df: pd.DataFrame,
    cat_cols: List[str],
    target_col: str,
    n_splits: int = 5,
    smoothing: float = 10.0,
    random_state: int = 42
) -> pd.DataFrame:
    """
    K-fold target mean with smoothing (per category):
      TE = (cat_mean * count + prior * smoothing) / (count + smoothing)
    Computed out-of-fold: each row is encoded from statistics fitted on the
    other folds. The prior is the target mean of those same fitting rows, so
    no held-out target value enters a row's encoding.

    Missing categories are treated as their own level "MISSING". Categories
    unseen in the fitting rows receive the prior. Rows whose target is NaN
    take no part in fitting and are encoded from all labelled rows.

    Parameters
    ----------
    df : frame holding the categorical columns and the target.
    cat_cols : columns to encode.
    target_col : name of the target column (cast to float).
    n_splits : number of stratified folds.
    smoothing : pseudo-count pulling small categories towards the prior.
    random_state : seed for the fold shuffle.

    Returns
    -------
    Frame indexed like *df* with one ``<col>__te`` column per encoded column.

    Raises
    ------
    KeyError : if *target_col* is not a column of *df*.
    ValueError : if columns are requested but no row has an observed target.
    """
    if target_col not in df.columns:
        raise KeyError(f"Target column '{target_col}' not in DataFrame.")

    out = pd.DataFrame(index=df.index)
    if not cat_cols:
        return out

    y = df[target_col].astype(float).to_numpy()
    labelled_pos = np.flatnonzero(~np.isnan(y))
    unlabelled_pos = np.flatnonzero(np.isnan(y))
    if labelled_pos.size == 0:
        raise ValueError(f"Target column '{target_col}' has no observed values.")
    y_lab = y[labelled_pos]

    # For stratification, split a non-binary target at 0.5 into two classes.
    strat_y = (y_lab > 0.5).astype(int) if len(np.unique(y_lab)) > 2 else y_lab

    # With an integer seed every split() call yields the same folds, so they
    # are computed once and shared by all columns.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    folds = list(skf.split(np.zeros(len(y_lab)), strat_y))

    for col in cat_cols:
        raw = df[col]
        # Fill before the string cast: after astype(str) NaN is the text
        # "nan" and a later fillna would never match.
        keys = raw.astype(object).where(raw.notna(), "MISSING").astype(str).to_numpy()

        enc = np.full(len(df), np.nan, dtype=float)
        for tr_idx, va_idx in folds:
            tr_pos, va_pos = labelled_pos[tr_idx], labelled_pos[va_idx]
            table, prior = _smoothed_category_means(keys[tr_pos], y[tr_pos], smoothing)
            enc[va_pos] = pd.Series(keys[va_pos]).map(table).fillna(prior).to_numpy()

        if unlabelled_pos.size:
            # No own target to leak, so every labelled row may be used.
            table, prior = _smoothed_category_means(keys[labelled_pos], y_lab, smoothing)
            enc[unlabelled_pos] = pd.Series(keys[unlabelled_pos]).map(table).fillna(prior).to_numpy()

        out[f"{col}__te"] = enc

    return out

# ======================= MAIN PIPELINE =======================
if __name__ == "__main__":
    # ---- load & merge ----
    # Sorted: glob order depends on the filesystem, and the merge order
    # determines the column order of the output table.
    frames = []
    for f in sorted(glob.glob(RAW_GLOB)):
        df0 = pd.read_csv(f, low_memory=False)
        df0.columns = [c.strip() for c in df0.columns]
        absent_keys = [c for c in ID_COLS if c not in df0.columns]
        if absent_keys:
            # Fail here with the file name rather than deep inside the merge.
            raise KeyError(f"{f}: missing merge key column(s) {absent_keys}")
        frames.append(df0)

    if not frames:
        raise SystemExit(f"No input files matched '{RAW_GLOB}'; nothing to process.")

    df = merge_on_keys(frames, ID_COLS).copy()

    # ---- infer types ----
    if not CATEGORICALS or not NUMERICALS:
        cats, nums = infer_types(df, ID_COLS)
        if not CATEGORICALS: CATEGORICALS = cats
        if not NUMERICALS:   NUMERICALS   = nums

    # The target is an outcome, not a feature. A numeric target would otherwise
    # be inferred as a NUMERICAL, used as a predictor when imputing features,
    # imputed itself, and written twice to the output table.
    CATEGORICALS = [c for c in CATEGORICALS if c != TARGET]
    NUMERICALS   = [c for c in NUMERICALS if c != TARGET]

    # ---- 1) KNN imputation for categorical features ----
    # Work on a copy to avoid modifying the original
    df_cats_imputed = knn_impute_categoricals(df, CATEGORICALS, NUMERICALS, k=KNN_K)

    # ---- 2) PMM imputation for continuous features ----
    # Build a feature matrix for PMM using: numeric (median-filled) + *temporary* OHE of imputed categoricals
    #   (OHE here is only to supply predictive signals for PMM; the final OHE is built again later)
    if CATEGORICALS:
        ohe_tmp = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        # Named columns: unnamed (integer) labels next to the string-named
        # numeric columns make scikit-learn reject the frame at fit time.
        X_cats_tmp = pd.DataFrame(
            ohe_tmp.fit_transform(df_cats_imputed[CATEGORICALS]),
            index=df_cats_imputed.index,
            columns=ohe_tmp.get_feature_names_out(CATEGORICALS)
        )
    else:
        X_cats_tmp = pd.DataFrame(index=df_cats_imputed.index)

    # numeric block (median for PMM predictors)
    if NUMERICALS:
        # keep_empty_features: an all-NaN column is kept (filled with 0) instead
        # of being dropped, so the result still matches `columns=NUMERICALS`.
        num_imp_for_feat = SimpleImputer(strategy="median", keep_empty_features=True)
        X_nums_tmp = pd.DataFrame(num_imp_for_feat.fit_transform(df_cats_imputed[NUMERICALS]),
                                  columns=NUMERICALS, index=df_cats_imputed.index)
    else:
        X_nums_tmp = pd.DataFrame(index=df_cats_imputed.index)

    feat_for_pmm = pd.concat([X_nums_tmp, X_cats_tmp], axis=1)

    df_cont_imputed = pmm_impute_continuous(
        df_cats_imputed,
        cont_cols=NUMERICALS,
        feature_matrix=feat_for_pmm,
        k=PMM_K,
        alpha=PMM_ALPHA,
        random_state=RANDOM_SEED
    )

    # ---- 3) Final encodings: One-Hot + Target Encoding ----
    # One-Hot (final, machine-readable; avoids ordinal distortion)
    if CATEGORICALS:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        X_ohe = pd.DataFrame(
            ohe.fit_transform(df_cont_imputed[CATEGORICALS]),
            index=df_cont_imputed.index,
            columns=ohe.get_feature_names_out(CATEGORICALS)
        )
    else:
        X_ohe = pd.DataFrame(index=df_cont_imputed.index)

    # Target encoding (K-fold, smoothed). Only if TARGET present. The target is
    # no longer imputed, so it is fitted on the labelled rows only; rows
    # without a target keep NaN in the encoded columns.
    has_target = TARGET in df_cont_imputed.columns
    te_frame = pd.DataFrame(index=df_cont_imputed.index)
    if has_target:
        labelled = df_cont_imputed[TARGET].notna()
        if labelled.any():
            te_frame = kfold_target_encode(
                df_cont_imputed.loc[labelled], CATEGORICALS, TARGET,
                n_splits=5, smoothing=10.0, random_state=RANDOM_SEED
            ).reindex(df_cont_imputed.index)
        n_unlabelled = int((~labelled).sum())
        if n_unlabelled:
            print(f"Note: {n_unlabelled} row(s) have no '{TARGET}'; their target and target-encoded columns stay missing.")

    # Assemble final table
    keep_cols = ID_COLS + ([TARGET] if has_target else [])
    # Skip any numeric already kept as a key so no column is written twice.
    feature_nums = [c for c in NUMERICALS if c not in keep_cols]
    base = df_cont_imputed[keep_cols + feature_nums].reset_index(drop=True)
    X_final = pd.concat([base, X_ohe.reset_index(drop=True), te_frame.reset_index(drop=True)], axis=1)

    # Persist
    out_path = OUT_DIR / "01_preprocessed_knn_pmm_ohe_te.csv"
    X_final.to_csv(out_path, index=False)
    print(f"Saved -> {out_path.resolve()}")

    # Minimal audit
    n_before = len(df)
    n_after  = len(X_final)
    miss_report = X_final.isna().sum().sort_values(ascending=False).head(10)
    print(f"Rows: {n_before} -> {n_after}")
    print("Top remaining missing (should be 0):\n", miss_report)
