In [2]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import inspect

from ucimlrepo import fetch_ucirepo

# =========================
# 1) Load Adult dataset
# =========================
adult = fetch_ucirepo(id=2)  # UCI Adult/Census Income
X = adult.data.features.copy()
y = adult.data.targets

# If y is a DataFrame, squeeze to Series
if isinstance(y, pd.DataFrame):
    y = y.iloc[:, 0]

# =========================
# 2) Clean + encode target
# =========================
def _clean_label(val):
    if pd.isna(val):
        return np.nan
    s = str(val).strip().replace(".", "")
    s = re.sub(r"\s+", "", s)
    return s

y_clean = y.apply(_clean_label)

# Binary encoding: >50K -> 1, <=50K -> 0 (fallback prints mapping if unexpected labels)
if set(y_clean.dropna().unique()) <= {">50K", "<=50K"}:
    y_encoded = (y_clean == ">50K").astype(int).values
else:
    le = LabelEncoder()
    y_encoded = le.fit_transform(y_clean.fillna("missing").values)
    print("Label mapping (fallback):", dict(zip(le.classes_, le.transform(le.classes_))))
print("Target encoding: 1 => >50K, 0 => <=50K (or fallback mapping printed above if used).")

# ============================================
# 3) Build explicit fairness/sensitive columns
#    (race, gender, education) — NOT age
# ============================================
# Normalize column names (handle minor naming differences)
X_cols_lower = {c.lower(): c for c in X.columns}

# gender from 'sex'
sex_col = X_cols_lower.get("sex", None)
if sex_col is None:
    raise KeyError("Could not find 'sex' column in Adult features.")
X["gender"] = (
    X[sex_col].astype(str).str.strip().str.lower()
      .map({"male": "male", "female": "female"})
      .fillna("unknown")
)

# race from 'race'
race_col = X_cols_lower.get("race", None)
if race_col is None:
    raise KeyError("Could not find 'race' column in Adult features.")
X["race"] = X[race_col].astype(str).str.strip()

# education from 'education'
edu_col = X_cols_lower.get("education", None)
if edu_col is None:
    raise KeyError("Could not find 'education' column in Adult features.")
X["education"] = X[edu_col].astype(str).str.strip()

# ============================================
# 4) Exclude fairness cols from model features
# ============================================
# We will compute fairness later using these three columns
sensitive_cols = ["race", "gender", "education"]

# Keep age and other features for modeling (age is NOT used for fairness later)
feature_X = X.drop(columns=sensitive_cols, errors="ignore")

# Identify column types on model features
cat_cols = feature_X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
num_cols = feature_X.select_dtypes(exclude=["object", "category", "bool"]).columns.tolist()

print(f"Categorical columns ({len(cat_cols)}):", cat_cols)
print(f"Numeric columns ({len(num_cols)}):", num_cols)

# ============================================
# 5) Preprocessing pipelines (version-robust)
# ============================================
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Handle OneHotEncoder sparse/sparse_output param across sklearn versions
ohe_params = {}
if "sparse_output" in inspect.signature(OneHotEncoder).parameters:
    ohe_params["sparse_output"] = False  # sklearn >= 1.2
else:
    ohe_params["sparse"] = False         # sklearn < 1.2

categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", **ohe_params))
])

def make_preprocessor():
    return ColumnTransformer(
        transformers=[
            ("num", numeric_pipe, num_cols),
            ("cat", categorical_pipe, cat_cols),
        ],
        remainder="drop"
    )

# ======================================================
# 6) Split BEFORE preprocessing (avoid train->test leak)
#    Make 5 stratified 80/20 splits and save to disk
# ======================================================
os.makedirs("data", exist_ok=True)
seeds = [42, 202, 777, 1234, 9001]

for i, seed in enumerate(seeds, start=1):
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        feature_X, y_encoded, test_size=0.20, random_state=seed, stratify=y_encoded
    )

    # Keep fairness columns (unaltered) aligned with indices for fairness evaluation later
    sens_train = X.loc[X_train_raw.index, sensitive_cols].copy()
    sens_test  = X.loc[X_test_raw.index, sensitive_cols].copy()

    preprocessor = make_preprocessor()

    # Fit only on training data
    X_train_processed = preprocessor.fit_transform(X_train_raw)
    X_test_processed  = preprocessor.transform(X_test_raw)

    # Robust feature names
    try:
        processed_feature_names = preprocessor.get_feature_names_out()
    except AttributeError:
        # Older sklearn fallback
        ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
        try:
            ohe_feature_names = ohe.get_feature_names_out(cat_cols)
        except AttributeError:
            ohe_feature_names = ohe.get_feature_names(cat_cols)
        processed_feature_names = np.array(list(num_cols) + list(ohe_feature_names))

    # Build DataFrames (model features only)
    df_train = pd.DataFrame(X_train_processed, columns=processed_feature_names, index=X_train_raw.index)
    df_test  = pd.DataFrame(X_test_processed,  columns=processed_feature_names, index=X_test_raw.index)

    # Add target
    df_train["target"] = y_train
    df_test["target"] = y_test

    # Append fairness columns (raw, unencoded) for later fairness analysis
    for col in sensitive_cols:
        df_train[col] = sens_train[col].values
        df_test[col]  = sens_test[col].values

    # Save files
    train_path = f"data/adult_split{i}_train.csv"
    test_path  = f"data/adult_split{i}_test.csv"
    df_train.to_csv(train_path, index=False)
    df_test.to_csv(test_path, index=False)

    print(f"[Split {i}] seed={seed} saved:")
    print(f"  {train_path}  -> {df_train.shape}")
    print(f"  {test_path}   -> {df_test.shape}")

print("\nDone. Created 5 train/test splits in 'data/' with model features + target + (race, gender, education).")


Target encoding: 1 => >50K, 0 => <=50K (or fallback mapping printed above if used).
Categorical columns (6): ['workclass', 'marital-status', 'occupation', 'relationship', 'sex', 'native-country']
Numeric columns (6): ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
[Split 1] seed=42 saved:
  data/adult_split1_train.csv  -> (39073, 94)
  data/adult_split1_test.csv   -> (9769, 94)
[Split 2] seed=202 saved:
  data/adult_split2_train.csv  -> (39073, 94)
  data/adult_split2_test.csv   -> (9769, 94)
[Split 3] seed=777 saved:
  data/adult_split3_train.csv  -> (39073, 94)
  data/adult_split3_test.csv   -> (9769, 94)
[Split 4] seed=1234 saved:
  data/adult_split4_train.csv  -> (39073, 93)
  data/adult_split4_test.csv   -> (9769, 93)
[Split 5] seed=9001 saved:
  data/adult_split5_train.csv  -> (39073, 93)
  data/adult_split5_test.csv   -> (9769, 93)

Done. Created 5 train/test splits in 'data/' with model features + target + (race, gender, education).
