In [12]:

import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import inspect

from fairlearn.datasets import fetch_diabetes_hospital

# =========================
# 1) Load dataset
# =========================
bundle = fetch_diabetes_hospital()

X_raw = bundle.data
X = X_raw.copy() if isinstance(X_raw, pd.DataFrame) else pd.DataFrame(X_raw, columns=getattr(bundle, "feature_names", None))

y_raw = bundle.target
y = y_raw.copy() if isinstance(y_raw, pd.Series) else pd.Series(y_raw)

# =========================
# 2) Encode binary target
# =========================
def _clean_lab(v):
    if pd.isna(v):
        return np.nan
    s = str(v).strip().lower().replace(".", "")
    s = re.sub(r"\s+", "", s)
    return s

y_clean = y.apply(_clean_lab)
positive_aliases = {"<30", "yes", "y", "1", "true", "readmitted", "readmit30", "pos", "positive"}
negative_aliases = {">30", "no", "n", "0", "false", "notreadmitted", "neg", "negative"}

if set(y_clean.dropna().unique()) <= {"0", "1"}:
    y_encoded = y_clean.astype(int).values
elif set(y_clean.dropna().unique()).issubset(positive_aliases | negative_aliases):
    y_encoded = y_clean.apply(lambda s: 1 if s in positive_aliases else 0).astype(int).values
else:
    y_encoded = y_clean.apply(lambda s: 1 if s in positive_aliases else 0).astype(int).values

print("Target encoded to 0/1 (1 = readmitted within 30 days).")

# =========================
# 3) FAIRNESS COLUMNS (FIXED)
#    Use existing medicare/medicaid if present; only derive if missing
# =========================
def _find_col(df, candidates):
    cols_lower = {c.lower(): c for c in df.columns}
    for c in candidates:
        if c in cols_lower:
            return cols_lower[c]
    return None

def _coerce_bool(series):
    """Coerce a column to boolean without flipping existing True/False."""
    if pd.api.types.is_bool_dtype(series):
        return series.astype(bool)
    if pd.api.types.is_numeric_dtype(series):
        # treat nonzero as True
        return series.astype(bool)
    # string-like
    s = series.astype(str).str.strip().str.lower()
    true_set  = {"true", "t", "1", "yes", "y"}
    false_set = {"false", "f", "0", "no", "n"}
    mapped = s.map(lambda v: True if v in true_set else (False if v in false_set else np.nan))
    # keep NaN if ambiguous (don’t silently flip)
    return mapped.astype("boolean").astype(object).where(~mapped.isna(), np.nan)

# --- medicare ---
medicare_col = _find_col(X, ["medicare"])
if medicare_col is not None:
    medicare = _coerce_bool(X[medicare_col])
else:
    # derive from payer/insurance ONLY IF column missing
    payer_col = _find_col(X, [
        "payer", "payer_code", "insurance", "primary_payer", "payer_type",
        "payor", "payor_type", "payment_typology", "payertype", "coverage", "insurance_type"
    ])
    if payer_col is not None:
        payer = X[payer_col].astype(str).str.strip().str.lower()
        medicare = payer.str.contains("medicare", na=False)
    else:
        raise ValueError("Could not find 'medicare' column or a payer/insurance column to derive it from.")

# --- medicaid ---
medicaid_col = _find_col(X, ["medicaid"])
if medicaid_col is not None:
    medicaid = _coerce_bool(X[medicaid_col])
else:
    if 'payer' in locals() or (payer_col := _find_col(X, [
        "payer", "payer_code", "insurance", "primary_payer", "payer_type",
        "payor", "payor_type", "payment_typology", "payertype", "coverage", "insurance_type"
    ])) is not None:
        payer = X[payer_col].astype(str).str.strip().str.lower() if 'payer' not in locals() else payer
        medicaid = payer.str.contains("medicaid", na=False)
    else:
        raise ValueError("Could not find 'medicaid' column or a payer/insurance column to derive it from.")

# --- age ---
# --- AGE (robust & prefers bundle.sensitive_features) ---
def _find_col(df, candidates):
    cols_lower = {c.lower(): c for c in df.columns}
    for c in candidates:
        if c in cols_lower:
            return cols_lower[c]
    return None

def _parse_age_series(s: pd.Series) -> pd.Series:
    """Parse numeric age from strings like '[60-70)', '60-70', '70+', '45'.
    Returns numeric where parsed; otherwise NaN."""
    import re
    s_str = s.astype(str).str.strip()

    range_pat = re.compile(r"^\[?\s*(\d+)\s*[-–]\s*(\d+)\s*\)?$")   # [60-70) or 60-70
    plus_pat  = re.compile(r"^\[?\s*(\d+)\s*\+\)?$")                # 70+ or [70+)
    num_pat   = re.compile(r"^\d+$")                                # 45

    def _to_num(x):
        if pd.isna(x) or x == "" or x.lower() in {"nan", "none"}:
            return np.nan
        m = range_pat.match(x)
        if m:
            a, b = int(m.group(1)), int(m.group(2))
            return (a + b) / 2.0
        m = plus_pat.match(x)
        if m:
            a = int(m.group(1))
            # choose a lower-bound or midpoint heuristic; lower-bound is safer
            return float(a)
        m = num_pat.match(x)
        if m:
            return float(m.group(0))
        return np.nan

    return s_str.map(_to_num)

# 1) Try bundle.sensitive_features first
age_series = None
sf = getattr(bundle, "sensitive_features", None)
if sf is not None:
    if isinstance(sf, pd.Series):
        if (sf.name or "").lower() in {"age", "age_years", "patient_age"}:
            age_series = sf.copy()
    elif isinstance(sf, pd.DataFrame):
        age_sf_col = _find_col(sf, ["age", "age_years", "patient_age"])
        if age_sf_col is not None:
            age_series = sf[age_sf_col].copy()

# 2) Fall back to X if not found
if age_series is None:
    age_col = _find_col(X, ["age", "age_years", "patient_age"])
    if age_col is None:
        raise ValueError("Could not find an age column in bundle.sensitive_features or X.")
    age_series = X[age_col].copy()

# 3) Make age numeric when possible; otherwise keep original for grouping
if pd.api.types.is_numeric_dtype(age_series):
    age_numeric = pd.to_numeric(age_series, errors="coerce")
else:
    parsed = _parse_age_series(age_series)
    if parsed.notna().mean() >= 0.8:
        age_numeric = parsed
    else:
        # keep original strings if we can't confidently parse
        age_numeric = age_series.astype(str).str.strip()
X["age"] = age_numeric

# DIAGNOSTICS (you can keep these prints)
print("Age diagnostics:")
print("  dtype:", X["age"].dtype)
print("  NaN rate:", float(pd.isna(X["age"]).mean()))
print("  head values:", X["age"].head().tolist())

X["medicare"] = medicare.astype(bool)
X["medicaid"] = medicaid.astype(bool)


# checking if it has both true and false values
print("Sanity check value counts (after fix):")
print("medicare:\n", pd.Series(X["medicare"]).value_counts(dropna=False))
print("medicaid:\n", pd.Series(X["medicaid"]).value_counts(dropna=False))

# =========================
# 4) Exclude sensitive cols from model features
# =========================
sensitive_cols = ["medicare", "medicaid", "age"]
feature_X = X.drop(columns=sensitive_cols, errors="ignore")

cat_cols = feature_X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
num_cols = feature_X.select_dtypes(exclude=["object", "category", "bool"]).columns.tolist()

print(f"Categorical columns ({len(cat_cols)}): {cat_cols}")
print(f"Numeric columns ({len(num_cols)}): {num_cols}")

# =========================
# 5) Pipelines (version-robust)
# =========================
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

ohe_params = {}
if "sparse_output" in inspect.signature(OneHotEncoder).parameters:
    ohe_params["sparse_output"] = False  # sklearn >= 1.2
else:
    ohe_params["sparse"] = False         # sklearn < 1.2

categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", **ohe_params)),
])

def make_preprocessor():
    return ColumnTransformer(
        transformers=[
            ("num", numeric_pipe, num_cols),
            ("cat", categorical_pipe, cat_cols),
        ],
        remainder="drop"
    )

# =========================
# 6) Split before preprocess; save 5 splits
# =========================
os.makedirs("data", exist_ok=True)
seeds = [42, 202, 777, 1234, 9001]

for i, seed in enumerate(seeds, start=1):
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        feature_X, y_encoded, test_size=0.20, random_state=seed, stratify=y_encoded
    )

    sens_train = X.loc[X_train_raw.index, sensitive_cols].copy()
    sens_test  = X.loc[X_test_raw.index, sensitive_cols].copy()

    preprocessor = make_preprocessor()
    X_train_processed = preprocessor.fit_transform(X_train_raw)
    X_test_processed  = preprocessor.transform(X_test_raw)

    try:
        processed_feature_names = preprocessor.get_feature_names_out()
    except AttributeError:
        ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
        try:
            ohe_feature_names = ohe.get_feature_names_out(cat_cols)
        except AttributeError:
            ohe_feature_names = ohe.get_feature_names(cat_cols)
        processed_feature_names = np.array(list(num_cols) + list(ohe_feature_names))

    df_train = pd.DataFrame(X_train_processed, columns=processed_feature_names, index=X_train_raw.index)
    df_test  = pd.DataFrame(X_test_processed,  columns=processed_feature_names, index=X_test_raw.index)

    df_train["target"] = y_train
    df_test["target"] = y_test

    for col in sensitive_cols:
        df_train[col] = sens_train[col].values
        df_test[col]  = sens_test[col].values

    train_path = f"data/diab_hosp_split{i}_train.csv"
    test_path  = f"data/diab_hosp_split{i}_test.csv"
    df_train.to_csv(train_path, index=False)
    df_test.to_csv(test_path, index=False)

    print(f"[Split {i}] seed={seed} saved:")
    print(f"  {train_path} -> {df_train.shape}")
    print(f"  {test_path}  -> {df_test.shape}")

print("\nDone. True/False values for medicare/medicaid are preserved; 5 splits saved.")


Target encoded to 0/1 (1 = readmitted within 30 days).
Age diagnostics:
  dtype: object
  NaN rate: 0.0
  head values: ["'30 years or younger'", "'30 years or younger'", "'30 years or younger'", "'30-60 years'", "'30-60 years'"]
Sanity check value counts (after fix):
medicare:
 medicare
False    69327
True     32439
Name: count, dtype: int64
medicaid:
 medicaid
False    98234
True      3532
Name: count, dtype: int64
Categorical columns (15): ['race', 'gender', 'discharge_disposition_id', 'admission_source_id', 'medical_specialty', 'primary_diagnosis', 'max_glu_serum', 'A1Cresult', 'insulin', 'change', 'diabetesMed', 'had_emergency', 'had_inpatient_days', 'had_outpatient_days', 'readmitted']
Numeric columns (6): ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_diagnoses', 'readmit_binary']
[Split 1] seed=42 saved:
  data/diab_hosp_split1_train.csv -> (81412, 60)
  data/diab_hosp_split1_test.csv  -> (20354, 60)
[Split 2] seed=202 saved:
  data/diab_