In [None]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Configuration
# Shared seed: default `seed` of apply_randomized_response and the
# `random_state` of the train/test split and of both random forests below.
SEED = 42
# Passed as `p` to apply_randomized_response: probability that the true value
# is kept. Otherwise a uniform random bit is reported, which can still match
# the truth, so a reported bit is correct with probability p + (1 - p) / 2.
PRIVACY_PARAM = 0.7

# ==============================================================================
# HELPER FUNCTIONS (OPTIMIZED)
# ==============================================================================

def load_and_preprocess_data(filepath):
    """
    Load the Adult census CSV and derive the binary columns used downstream.

    The file is read as header-less (``header=None``) and the 15 Adult column
    names are assigned explicitly. Every text column is whitespace-stripped so
    that padded fields such as ``' Male'`` still match the comparisons below,
    then three columns are added:

    * ``target``     -- 1 when ``income`` equals ``'>50K'``, else 0.
    * ``age_binary`` -- 1 when ``age`` is strictly above the median age, else 0.
    * ``sex_binary`` -- 1 for ``'Male'``, 0 for ``'Female'``; any other value
      maps to NaN.

    Args:
        filepath: Location of the CSV, forwarded to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame holding the original columns plus the three above.

    Raises:
        FileNotFoundError: Re-raised (after printing a message) when the file
            does not exist.
    """
    cols = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
            'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
            'hours-per-week', 'native-country', 'income']

    try:
        df = pd.read_csv(filepath, header=None, names=cols)
    except FileNotFoundError:
        print(f"ERROR: No se encuentra el archivo '{filepath}'.")
        raise

    # Clean strings. Text columns are detected by dtype *kind* rather than by
    # `object` alone: pandas may read text as a dedicated string dtype, and
    # skipping those columns would leave padded values that silently fail
    # every equality check / mapping below.
    for col in df.columns:
        values = df[col]
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            df[col] = values.str.strip()

    # Binarization
    df['target'] = (df['income'] == '>50K').astype(int)
    df['age_binary'] = (df['age'] > df['age'].median()).astype(int)  # 1 = Older
    df['sex_binary'] = df['sex'].map({'Male': 1, 'Female': 0})       # 1 = Male

    return df

def apply_randomized_response(series, p=0.7, seed=SEED):
    """
    Apply randomized response (local differential privacy) to a binary series.

    For each element an independent uniform draw decides whether the true
    value is kept (draw < ``p``) or replaced by a uniformly random bit in
    {0, 1}. Because the random bit can coincide with the truth, a reported
    bit is correct with probability ``p + (1 - p) / 2``; for a binary
    attribute this corresponds to epsilon = ln((1 + p) / (1 - p)).

    Args:
        series: 1-D sequence of 0/1 values (e.g. a pandas Series).
        p: Probability of keeping the true value.
        seed: Seed for the random draws; the same seed reproduces the same
            output.

    Returns:
        numpy.ndarray of the same length as ``series`` with the privatized
        values.
    """
    # A private RandomState yields the same legacy (MT19937) stream as
    # `np.random.seed(seed)` followed by the module-level functions, but
    # without reseeding the process-wide generator as a hidden side effect.
    rng = np.random.RandomState(seed)
    n = len(series)
    keep_true = rng.random_sample(n) < p
    random_bits = rng.randint(0, 2, n)
    return np.where(keep_true, series, random_bits)

def compute_reweighing_weights(df_sens, y, protected_cols=None):
    """
    Compute Reweighing sample weights.

    Each row receives ``P(group) * P(label) / P(group, label)``, where
    ``group`` is the combination of the protected attribute values of that row
    and all probabilities are empirical frequencies over the given data. The
    weights equal 1.0 when group and label are statistically independent.

    Args:
        df_sens: DataFrame containing the protected attribute columns.
        y: Labels aligned positionally with ``df_sens`` (Series, array or
            list).
        protected_cols: Column name or sequence of column names that define a
            group. Defaults to ``['age_binary', 'sex_binary']``.

    Returns:
        numpy.ndarray of float weights, one per row of ``df_sens``.

    Raises:
        ValueError: If ``y`` and ``df_sens`` differ in length.
    """
    # Resolve the default here instead of using a mutable default argument.
    if protected_cols is None:
        protected_cols = ['age_binary', 'sex_binary']
    elif isinstance(protected_cols, str):
        protected_cols = [protected_cols]
    else:
        protected_cols = list(protected_cols)

    n = len(df_sens)
    if n == 0:
        # No rows: nothing to weight (and avoids dividing by zero below).
        return np.ones(0, dtype=float)

    # Composite group key per row (e.g. "0_1"). The keys live in a separate
    # frame so that caller columns named 'label'/'group'/'weight' are never
    # overwritten before the key has been derived from them.
    group_key = df_sens[protected_cols].astype(str).agg('_'.join, axis=1).to_numpy()
    keys = pd.DataFrame({'group': group_key, 'label': np.asarray(y)})

    # Empirical probabilities over the whole dataset.
    prob_group = keys['group'].value_counts() / n
    prob_label = keys['label'].value_counts() / n
    prob_joint = keys.groupby(['group', 'label']).size() / n

    weights = np.ones(n, dtype=float)
    # Only observed (group, label) combinations appear in prob_joint, so this
    # loops over a handful of cells rather than over rows.
    for (g, l), p_observed in prob_joint.items():
        p_expected = prob_group.loc[g] * prob_label.loc[l]
        cell = ((keys['group'] == g) & (keys['label'] == l)).to_numpy()
        weights[cell] = p_expected / p_observed

    return weights

def calculate_spd(y_pred, df_sens, sensitive_col='sex_binary'):
    """
    Compute the Statistical Parity Difference of a set of predictions.

    The result is the mean prediction among rows whose sensitive value is 0
    (unprivileged) minus the mean prediction among rows whose sensitive value
    is 1 (privileged). Predictions and sensitive values are paired by
    position.

    Args:
        y_pred: Predictions, one per row of ``df_sens``.
        df_sens: DataFrame holding the sensitive attribute column.
        sensitive_col: Name of the sensitive attribute column to audit.

    Returns:
        The difference ``mean(pred | sens == 0) - mean(pred | sens == 1)``.
    """
    table = pd.DataFrame({'pred': y_pred, 'sens': df_sens[sensitive_col].values})

    def mean_prediction(group_value):
        # Average prediction over the rows belonging to one sensitive group.
        return table.loc[table['sens'] == group_value, 'pred'].mean()

    return mean_prediction(0) - mean_prediction(1)

# ==============================================================================
# MAIN 
# ==============================================================================

print("1. Loading Data...")
df = load_and_preprocess_data('adult.data.csv')

# Sensitive attributes, one-hot encoded model features, and the label.
X_sens = df[['age_binary', 'sex_binary']]
X_feats = pd.get_dummies(
    df.drop(columns=['age', 'sex', 'income', 'target', 'fnlwgt', 'age_binary', 'sex_binary']),
    drop_first=True,
)
y = df['target']

print("2. Applying Privacy (LDP)...")
# Each sensitive attribute is randomized with its own seed; keyword arguments
# are evaluated left to right, so age is privatized before sex.
X_sens_private = X_sens.assign(
    age_binary=apply_randomized_response(X_sens['age_binary'], p=PRIVACY_PARAM, seed=10),
    sex_binary=apply_randomized_response(X_sens['sex_binary'], p=PRIVACY_PARAM, seed=20),
)

# Feature matrices: identical features, real vs. privatized sensitive columns.
X_real = X_feats.join(X_sens)
X_priv = X_feats.join(X_sens_private)

# One call splits all three objects with the same row partition.
(X_train_real, X_test_real,
 X_train_priv, X_test_priv,
 y_train, y_test) = train_test_split(X_real, X_priv, y, test_size=0.2, random_state=SEED)

print("3. Training Models (this might take a few seconds)...")

# Model A: benchmark -- reweighing computed from the real sensitive attributes.
# n_jobs is left at None (no parallelism) to prevent freezing on some systems.
weights_real = compute_reweighing_weights(X_train_real[['age_binary', 'sex_binary']], y_train)
clf_fair = RandomForestClassifier(n_estimators=50, random_state=SEED, n_jobs=None).fit(
    X_train_real, y_train, sample_weight=weights_real
)
pred_fair = clf_fair.predict(X_test_real)

# Model B: private + fair -- reweighing computed from the privatized attributes.
weights_priv = compute_reweighing_weights(X_train_priv[['age_binary', 'sex_binary']], y_train)
clf_priv = RandomForestClassifier(n_estimators=50, random_state=SEED, n_jobs=None).fit(
    X_train_priv, y_train, sample_weight=weights_priv
)
pred_priv = clf_priv.predict(X_test_priv)

# ==============================================================================
# RESULTS
# ==============================================================================

# Auditor evaluation: both models are scored against the REAL sensitive
# attributes of the test rows.
test_sens_real = X_test_real[['age_binary', 'sex_binary']]
spd_benchmark = calculate_spd(pred_fair, test_sens_real)
spd_private = calculate_spd(pred_priv, test_sens_real)

# The private model is flagged when its SPD magnitude exceeds the benchmark's.
if abs(spd_private) > abs(spd_benchmark):
    private_interpretation = 'High Bias (Privacy Interference)'
else:
    private_interpretation = 'Maintained'

results_df = pd.DataFrame({
    'Model Scenario': ['Fair Classifier (Benchmark)', 'Private + Fair Classifier'],
    'Data Access': ['Full Real Data', 'Private (Noisy) Data'],
    'SPD (Sex)': [spd_benchmark, spd_private],
    'Bias Interpretation': ['Baseline Fairness', private_interpretation],
})

print("\n--- Fairness vs. Privacy Evaluation Results ---\n")
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(results_df.to_string(index=False, float_format="%.4f"))
print("\n" + "=" * 60)

1. Loading Data...
2. Applying Privacy (LDP)...
3. Training Models (this might take a few seconds)...

--- Fairness vs. Privacy Evaluation Results ---

             Model Scenario          Data Access  SPD (Sex)              Bias Interpretation
Fair Classifier (Benchmark)       Full Real Data    -0.1416                Baseline Fairness
  Private + Fair Classifier Private (Noisy) Data    -0.1575 High Bias (Privacy Interference)

