Create single version with 18 sensitive attributes:
- communities(18): 18 binarized sensitive attributes

Sensitive attributes (paper18 preset):
- Race percentages (4): racepctwhite, racepctblack, racepctasian, racepcthisp
- Per-capita income by race (6): whitepercap, blackpercap, indianpercap, asianpercap, otherpercap, hisppercap
- Language/immigration (8): pctnotspeakenglwell, pctforeignborn, pctimmigrecent, pctimmigrec5, pctimmigrec8, pctimmigrec10, pctrecentimmig, pctrecimmig5

**filtering**: Remove rows with missing values in sensitive attributes

In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from pathlib import Path
import re

In [14]:
def normalize_colname(name):
    """Normalize column name: lowercase, remove non-alphanumeric"""
    name = name.lower().strip().replace('\t', ' ')
    name = re.sub(r'[^a-z0-9_]', '', name)
    return name

def parse_names_file(names_path, ncols):
    """Parse .names file to get column names"""
    if not names_path.exists():
        return []
    allowed = ('continuous', 'integer', 'real', 'numeric', 'binary', 'nominal')
    cand = []
    with open(names_path, 'r', encoding='utf-8', errors='ignore') as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith('|') or line.startswith('#'):
                continue
            if ':' not in line:
                continue
            left, right = line.split(':', 1)
            if any(tok in right.lower() for tok in allowed):
                name = normalize_colname(left)
                if name:
                    cand.append(name)
    if len(cand) == ncols:
        return cand
    if len(cand) > ncols:
        return cand[-ncols:]
    return []

# Load Communities data
raw_dir = Path('raw/communities')
data_path = raw_dir / 'communities.data'
names_path = raw_dir / 'communities.names'

df = pd.read_csv(data_path, header=None, na_values=['?'], skipinitialspace=True)
print(f"Shape: {df.shape}")

# Parse column names from .names file
header = parse_names_file(names_path, df.shape[1])
if header:
    df.columns = header
else:
    df.columns = [f'col{i}' for i in range(df.shape[1])]
    print("[WARN] Failed to parse names, using generic column names")

print(f"Columns (first 10): {list(df.columns[:10])}")
df.head()

Shape: (1994, 128)
Columns (first 10): ['attributecharacteristics', 'state', 'county', 'community', 'fold', 'population', 'householdsize', 'racepctblack', 'racepctwhite', 'racepctasian']


Unnamed: 0,attributecharacteristics,state,county,community,fold,population,householdsize,racepctblack,racepctwhite,racepctasian,...,landarea,popdens,pctusepubtrans,policcars,policoperbudg,lemaspctpoliconpatr,lemasgangunitdeploy,lemaspctofficdrugun,policbudgperpop,violentcrimesperpop
0,8,,,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,,,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,,,,,0.0,,0.67
2,24,,,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,,,,,0.0,,0.43
3,34,5.0,81440.0,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,,,,,0.0,,0.12
4,42,95.0,6096.0,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,,,,,0.0,,0.03


In [15]:
# Define sensitive attributes
sens_attrs_ordered = [
    # Race percentages (4)
    'racepctwhite', 'racepctblack', 'racepctasian', 'racepcthisp',
    # Per-capita income by race (6)
    'whitepercap', 'blackpercap', 'indianpercap', 'asianpercap', 'otherpercap', 'hisppercap',
    # Language/immigration related (8)
    'pctnotspeakenglwell', 'pctforeignborn',
    'pctimmigrecent', 'pctimmigrec5', 'pctimmigrec8', 'pctimmigrec10',
    'pctrecentimmig', 'pctrecimmig5',
]

# Check which columns exist
existing_sens = [c for c in sens_attrs_ordered if c in df.columns]
missing_sens = [c for c in sens_attrs_ordered if c not in df.columns]
print(f"Existing sensitive attrs: {len(existing_sens)}/{len(sens_attrs_ordered)}")
if missing_sens:
    print(f"Missing: {missing_sens}")

# Filter: remove rows with missing values in sensitive attributes
sens_df = df[existing_sens].copy()
mask_valid = sens_df.notna().all(axis=1)
print(f"\nRows with complete sensitive attrs: {mask_valid.sum()}/{len(df)}")

Existing sensitive attrs: 18/18

Rows with complete sensitive attrs: 1993/1994


In [16]:
# Apply filter
df_filtered = df.loc[mask_valid].reset_index(drop=True)
print(f"Filtered shape: {df_filtered.shape}")

def binarize_quantile(series, q=0.5):
    """Binarize by quantile threshold"""
    v = pd.to_numeric(series, errors='coerce')
    finite = v.dropna()
    if finite.empty:
        return pd.Series(0, index=v.index, dtype=int)
    
    thr = finite.quantile(q)
    result = (v > thr).astype(int)
    return result

# Build sensitive attribute matrix (binarized)
S_df = pd.DataFrame()
for attr in existing_sens:
    S_df[attr] = binarize_quantile(df_filtered[attr], q=0.5)

print(f"S shape: {S_df.shape}")
print("\nSensitive attribute value counts:")
for attr in S_df.columns[:5]:  # Show first 5 only
    counts = S_df[attr].value_counts().to_dict()
    print(f"  {attr}: {counts}")

Filtered shape: (1993, 128)
S shape: (1993, 18)

Sensitive attribute value counts:
  racepctwhite: {0: 1031, 1: 962}
  racepctblack: {0: 1024, 1: 969}
  racepctasian: {0: 997, 1: 996}
  racepcthisp: {0: 1040, 1: 953}
  whitepercap: {0: 1016, 1: 977}


In [17]:
# Define label column (ViolentCrimesPerPop)
target_col = 'violentcrimesperpop' if 'violentcrimesperpop' in df_filtered.columns else df_filtered.columns[-1]
print(f"Target column: {target_col}")

# Binarize label (top 30% = 1, as in paper)
y = binarize_quantile(df_filtered[target_col], q=0.7).values
print(f"Label distribution: {np.bincount(y)}")

# Feature columns (exclude target, sensitive attrs, and non-predictive cols)
non_feature_cols = (
    [target_col, 'state', 'county', 'community', 'communityname', 'fold']
    + list(existing_sens)
)
feature_cols = [c for c in df_filtered.columns if c not in non_feature_cols]
print(f"Feature columns: {len(feature_cols)}")

# Build feature matrix
X_raw = df_filtered[feature_cols].copy()
X_raw = X_raw.apply(pd.to_numeric, errors='coerce')
X_raw = X_raw.fillna(X_raw.median())
X_raw = X_raw.fillna(0)

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X_raw)

print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"S shape: {S_df.shape}")

Target column: violentcrimesperpop
Label distribution: [1410  583]
Feature columns: 105

X shape: (1993, 105)
y shape: (1993,)
S shape: (1993, 18)

X shape: (1993, 105)
y shape: (1993,)
S shape: (1993, 18)


In [18]:
# Save communities(18): q=18 sensitive attributes
save_dir = Path('.') / 'communities(18)'
save_dir.mkdir(parents=True, exist_ok=True)

S = S_df.values

np.save(save_dir / 'X.npy', X)
np.save(save_dir / 'y.npy', y)
np.save(save_dir / 'S.npy', S)

print(f"Saved communities(18): X{X.shape}, y{y.shape}, S{S.shape}")
print(f"  Sensitive attrs: {list(S_df.columns)}")
print("\nDone!")

Saved communities(18): X(1993, 105), y(1993,), S(1993, 18)
  Sensitive attrs: ['racepctwhite', 'racepctblack', 'racepctasian', 'racepcthisp', 'whitepercap', 'blackpercap', 'indianpercap', 'asianpercap', 'otherpercap', 'hisppercap', 'pctnotspeakenglwell', 'pctforeignborn', 'pctimmigrecent', 'pctimmigrec5', 'pctimmigrec8', 'pctimmigrec10', 'pctrecentimmig', 'pctrecimmig5']

Done!
