In [2]:
# Pin the release this notebook was run with so a fresh kernel reproduces the same
# environment; the %pip magic installs into the environment of the running kernel.
%pip install ucimlrepo==0.0.7

Collecting ucimlrepo
  Using cached ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Using cached ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [3]:
# -------------------------------
# importing libraries
# -------------------------------
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import inspect

from ucimlrepo import fetch_ucirepo

# -------------------------------
# fetching the dataset
# -------------------------------
# Download dataset id 144 through ucimlrepo and keep a handle on its data bundle.
statlog_german_credit_data = fetch_ucirepo(id=144)
_uci_frames = statlog_german_credit_data.data

# Work on a copy of the features: derived columns are added to X further down.
X = _uci_frames.features.copy()

# The targets may arrive as a one-column DataFrame; reduce that case to a Series.
_raw_targets = _uci_frames.targets
y = _raw_targets.iloc[:, 0] if isinstance(_raw_targets, pd.DataFrame) else _raw_targets

# -------------------------------
# derive sensitive attributes
# -------------------------------
# UCI Attribute 9: Personal status and sex
if "Attribute9" not in X.columns:
    raise KeyError("Expected 'Attribute9' for gender derivation.")
gender_map = {"A91": "male", "A92": "female", "A93": "male", "A94": "male", "A95": "female"}
_gender = X["Attribute9"].map(gender_map)

# Series.map turns every code that is missing from gender_map into NaN. Fail loudly
# instead of letting an unlabeled group slip into the fairness data unnoticed.
# Rows where Attribute9 itself is missing are left as NaN and do not trigger the error.
_unmapped = _gender.isna() & X["Attribute9"].notna()
if _unmapped.any():
    _unknown_codes = sorted(X.loc[_unmapped, "Attribute9"].astype(str).unique())
    raise ValueError(f"Unmapped 'Attribute9' codes for gender derivation: {_unknown_codes}")
X["gender"] = _gender

# Age is Attribute13 in this schema
if "Attribute13" not in X.columns:
    raise KeyError("Expected 'Attribute13' for age.")
# errors="coerce": entries that cannot be parsed as numbers become NaN rather than raising.
X["age"] = pd.to_numeric(X["Attribute13"], errors="coerce")

# -------------------------------
# encode target as integers 0/1
# -------------------------------
# LabelEncoder numbers the sorted distinct labels starting at 0, so the mapping printed
# below is the authoritative record of which original label became 0 and which became 1.
# NOTE(review): the UCI description presumably codes 1 = good and 2 = bad credit, which
# would make the encoded value 1 the "bad" class - confirm before picking a positive label.
le = LabelEncoder()
y_encoded = le.fit_transform(np.asarray(y))
_class_codes = le.transform(le.classes_)
_label_mapping = dict(zip(le.classes_, _class_codes))
print("Label mapping:", _label_mapping)

# -------------------------------
# build model feature frame (derived sensitive columns are kept out of preprocessing)
# -------------------------------
# Only the derived columns are removed here; they are re-attached unencoded when the
# splits are saved.
# NOTE(review): the source columns Attribute9 and Attribute13 are not listed in
# sensitive_cols, so they stay in feature_X and are encoded as model features -
# confirm that this is intended.
sensitive_cols = ["gender", "age"]  # race removed
feature_X = X.drop(columns=sensitive_cols, errors="ignore")

# Partition the model features by dtype: object/category/bool columns are treated as
# categorical, every other dtype as numeric.
_categorical_dtypes = ["object", "category", "bool"]
cat_cols = list(feature_X.select_dtypes(include=_categorical_dtypes).columns)
num_cols = list(feature_X.select_dtypes(exclude=_categorical_dtypes).columns)
print("Categorical columns:", cat_cols)
print("Numeric columns:", num_cols)

# -------------------------------
# pipelines
# -------------------------------
# Numeric branch: fill missing values with the column median, then standardize.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# The dense-output switch of OneHotEncoder is spelled `sparse_output` on sklearn >= 1.2
# and `sparse` on sklearn < 1.2; inspect the constructor to pick the accepted spelling.
_accepts_sparse_output = "sparse_output" in inspect.signature(OneHotEncoder).parameters
ohe_params = {("sparse_output" if _accepts_sparse_output else "sparse"): False}

# Categorical branch: replace missing values with the literal "missing", then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", **ohe_params)),
])

def make_preprocessor():
    """Return a new, unfitted ColumnTransformer for the model features.

    The numeric pipeline is applied to ``num_cols`` and the categorical pipeline
    to ``cat_cols`` (all module-level names); columns that appear in neither
    list are dropped.
    """
    branches = [
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ]
    return ColumnTransformer(transformers=branches, remainder="drop")

# -------------------------------
# split test and training data first before preprocessing
# -------------------------------
# Every seed yields one stratified 80/20 split. A fresh preprocessor is built and fitted on
# the training part only, so no test-set statistics reach the imputers, scaler or encoder.
os.makedirs("data", exist_ok=True)
seeds = [42, 202, 777, 1234, 9001]

for i, seed in enumerate(seeds, start=1):
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        feature_X, y_encoded, test_size=0.20, random_state=seed, stratify=y_encoded
    )

    # keep sensitive columns aligned (raw, unencoded): look them up by the split's row labels
    sens_train = X.loc[X_train_raw.index, sensitive_cols].copy()
    sens_test  = X.loc[X_test_raw.index, sensitive_cols].copy()

    preprocessor = make_preprocessor()

    # fit on training only
    X_train_processed = preprocessor.fit_transform(X_train_raw)
    X_test_processed  = preprocessor.transform(X_test_raw)

    # robust feature names: fall back to assembling them by hand when the transformer
    # (or the encoder) does not provide get_feature_names_out
    try:
        processed_feature_names = preprocessor.get_feature_names_out()
    except AttributeError:
        ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
        try:
            ohe_feature_names = ohe.get_feature_names_out(cat_cols)
        except AttributeError:
            ohe_feature_names = ohe.get_feature_names(cat_cols)
        processed_feature_names = np.array(list(num_cols) + list(ohe_feature_names))

    # build DataFrames, keeping the original row labels of each split
    df_train = pd.DataFrame(X_train_processed, columns=processed_feature_names, index=X_train_raw.index)
    df_test  = pd.DataFrame(X_test_processed,  columns=processed_feature_names, index=X_test_raw.index)

    # add target
    df_train["target"] = y_train
    df_test["target"] = y_test

    # append sensitive columns (for fairness eval later); .values assigns by position,
    # which matches because sens_* were selected with the same row labels as the split
    for col in sensitive_cols:
        df_train[col] = sens_train[col].values
        df_test[col]  = sens_test[col].values

    # save (row labels are not written to the CSV files)
    train_path = f"data/germancredit_split{i}_train.csv"
    test_path  = f"data/germancredit_split{i}_test.csv"
    df_train.to_csv(train_path, index=False)
    df_test.to_csv(test_path, index=False)

    print(f"[Split {i}] seed={seed} saved:")
    print(f"  {train_path}  -> {df_train.shape}")
    print(f"  {test_path}   -> {df_test.shape}")

# Report the real number of splits and the real sensitive columns so the summary cannot
# drift from `seeds` / `sensitive_cols` when either list is edited.
print(
    f"\nDone. Created {len(seeds)} train/test splits in 'data/' "
    f"with model features + target + ({', '.join(sensitive_cols)})."
)

Label mapping: {1: 0, 2: 1}
Categorical columns: ['Attribute1', 'Attribute3', 'Attribute4', 'Attribute6', 'Attribute7', 'Attribute9', 'Attribute10', 'Attribute12', 'Attribute14', 'Attribute15', 'Attribute17', 'Attribute19', 'Attribute20']
Numeric columns: ['Attribute2', 'Attribute5', 'Attribute8', 'Attribute11', 'Attribute13', 'Attribute16', 'Attribute18']
[Split 1] seed=42 saved:
  data/germancredit_split1_train.csv  -> (800, 64)
  data/germancredit_split1_test.csv   -> (200, 64)
[Split 2] seed=202 saved:
  data/germancredit_split2_train.csv  -> (800, 64)
  data/germancredit_split2_test.csv   -> (200, 64)
[Split 3] seed=777 saved:
  data/germancredit_split3_train.csv  -> (800, 64)
  data/germancredit_split3_test.csv   -> (200, 64)
[Split 4] seed=1234 saved:
  data/germancredit_split4_train.csv  -> (800, 64)
  data/germancredit_split4_test.csv   -> (200, 64)
[Split 5] seed=9001 saved:
  data/germancredit_split5_train.csv  -> (800, 64)
  data/germancredit_split5_test.csv   -> (200, 64)
