- dutch(2): sex_bin, age_high

민감속성 이진화:
- sex: 1=male→0, 2=female→1
- age: 중앙값 초과 → 1 (age_high)

In [1]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.preprocessing import StandardScaler, LabelEncoder
from pathlib import Path
import os

In [2]:
# Read the Dutch Census 2001 data (ARFF format) into a DataFrame.
raw_path = Path('raw/dutch/dutch_census_2001.arff')
data, meta = arff.loadarff(raw_path)
df = pd.DataFrame(data)

# Object columns come back from the ARFF loader as bytes; decode them all to str in one pass.
bytes_cols = df.select_dtypes([object]).columns
df = df.assign(**{name: df[name].str.decode('utf-8') for name in bytes_cols})

# Quick sanity check of what was loaded.
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.head()

Shape: (60420, 12)
Columns: ['sex', 'age', 'household_position', 'household_size', 'prev_residence_place', 'citizenship', 'country_birth', 'edu_level', 'economic_status', 'cur_eco_activity', 'Marital_status', 'occupation']


Unnamed: 0,sex,age,household_position,household_size,prev_residence_place,citizenship,country_birth,edu_level,economic_status,cur_eco_activity,Marital_status,occupation
0,1,6,1131,112,1,1,1,5,111,135,1,2_1
1,2,10,1122,113,1,1,1,2,111,122,2,5_4_9
2,1,8,1122,113,1,1,1,2,111,122,2,2_1
3,1,12,1121,112,1,1,1,1,111,137,2,5_4_9
4,2,4,1110,114,1,1,1,2,111,138,1,5_4_9


In [3]:
# Define label column.
# NOTE(review): the occupation codes look like ISCO major-group bundles, which would make
# '2_1' the high-level class and '5_4_9' the low-level one (the reverse of what this comment
# previously stated) — confirm against the dataset documentation.
label_col = 'occupation'

# Binarize label
y_col = df[label_col]
uniq = y_col.unique()
if len(uniq) == 2:
    # Already binary: LabelEncoder assigns codes in sorted class order, so for the
    # codes seen in this data that means '2_1' -> 0 and '5_4_9' -> 1.
    le = LabelEncoder()
    y = le.fit_transform(y_col)
else:
    # Fallback for non-binary labels: most frequent value -> 1, everything else -> 0.
    top = y_col.value_counts().idxmax()
    y = (y_col == top).astype(int).values
print(f"Label distribution: {np.bincount(y)}")


def _numeric_or_raise(frame, col):
    """Return ``frame[col]`` converted to numeric, refusing missing/unparseable values.

    A plain ``errors='coerce'`` conversion turns bad values into NaN, and the
    comparisons used for binarization below evaluate to False for NaN, which
    would silently file those rows under group 0.

    Raises:
        ValueError: if any value in the column is missing or not numeric.
    """
    values = pd.to_numeric(frame[col], errors='coerce')
    n_bad = int(values.isna().sum())
    if n_bad:
        raise ValueError(
            f"Column '{col}' has {n_bad} missing or non-numeric value(s); "
            "cannot binarize it without silently assigning them to group 0."
        )
    return values


# ============================================================
# Binarize the sensitive attributes
# ============================================================

# sex: binary code (1=male, 2=female) -> female becomes 1
sex_col = _numeric_or_raise(df, 'sex')
unexpected_sex = sorted(set(sex_col.unique()) - {1, 2})
if unexpected_sex:
    # Any code other than 1/2 would otherwise be counted as male (0).
    raise ValueError(f"Unexpected codes in 'sex': {unexpected_sex} (expected 1 or 2)")
sex_bin = (sex_col == 2).astype(int)  # female=1
print(f"sex: {df['sex'].unique()} → sex_bin: {sex_bin.value_counts().to_dict()}")

# age: convert to numeric, then strictly above the median -> 1
age_col = _numeric_or_raise(df, 'age')
age_thr = age_col.quantile(0.5)
age_high = (age_col > age_thr).astype(int)
print(f"age: median={age_thr} → age_high: {age_high.value_counts().to_dict()}")

# Assemble both indicators in one step; column order defines the layout of S.
S_df = pd.DataFrame({'sex_bin': sex_bin, 'age_high': age_high})

print(f"\nS shape: {S_df.shape}")
print(f"S columns: {list(S_df.columns)}")

Label distribution: [28763 31657]
sex: ['1' '2'] → sex_bin: {1: 30273, 0: 30147}
age: median=8.0 → age_high: {0: 31900, 1: 28520}

S shape: (60420, 2)
S columns: ['sex_bin', 'age_high']


In [4]:
# Everything except the label and the raw sensitive attributes is a feature.
sens_raw = ['sex', 'age']
excluded = {label_col, *sens_raw}
feature_cols = [name for name in df.columns if name not in excluded]
print(f"Feature columns: {feature_cols}")

# Integer-encode each object (categorical) feature column.
X_raw = df.loc[:, feature_cols].copy()
object_cols = X_raw.select_dtypes([object]).columns
for col in object_cols:
    le_col = LabelEncoder()
    le_col.fit(X_raw[col])
    X_raw[col] = le_col.transform(X_raw[col])

# Coerce everything to numeric and replace any NaN with the column median.
X_raw = X_raw.apply(pd.to_numeric, errors='coerce').pipe(
    lambda frame: frame.fillna(frame.median())
)

# Standardize the features.
scaler = StandardScaler().fit(X_raw)
X = scaler.transform(X_raw)

# Report the shapes of the three outputs.
for tag, obj in (("X", X), ("y", y), ("S", S_df)):
    print(f"{tag} shape: {obj.shape}")

Feature columns: ['household_position', 'household_size', 'prev_residence_place', 'citizenship', 'country_birth', 'edu_level', 'economic_status', 'cur_eco_activity', 'Marital_status']
X shape: (60420, 9)
y shape: (60420,)
S shape: (60420, 2)
X shape: (60420, 9)
y shape: (60420,)
S shape: (60420, 2)


In [5]:
# Write the dutch(2) arrays (features, labels, 2 binarized sensitive attributes) to disk.
save_dir = Path('.') / 'dutch(2)'
save_dir.mkdir(parents=True, exist_ok=True)

S = S_df.to_numpy()

# One .npy file per array, all inside save_dir.
for filename, array in (('X.npy', X), ('y.npy', y), ('S.npy', S)):
    np.save(save_dir / filename, array)

# Summarize what was written.
print("Saved dutch(2):")
print(f"  X: {X.shape}")
print(f"  y: {y.shape}")
print(f"  S: {S.shape} (columns: {list(S_df.columns)})")
print("\nDone!")

Saved dutch(2):
  X: (60420, 9)
  y: (60420,)
  S: (60420, 2) (columns: ['sex_bin', 'age_high'])

Done!
