## This notebook contains sample code for the Adult data experiment in Section 5.3.

Before running the code, please check README.md and install LEMON.

* Please use an appropriate machine to run this notebook.
    * This notebook runs the min-cost flow solver in 10 parallel processes.

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
import stealth_sampling

### Functions

In [2]:
# partition the rows into the four bins (s, y) = (1, 1), (1, 0), (0, 1), (0, 0)
def split_to_four(X, S, Y):
    """Stack X, S and Y column-wise and split the rows by the truthiness of (S, Y).

    Returns a list of four arrays ordered (s, y) = (1, 1), (1, 0), (0, 1), (0, 0).
    Every array holds the columns of X followed by the S column and the Y column.
    """
    stacked = np.c_[X, S, Y]
    s_on = np.asarray(S, dtype=bool)
    y_on = np.asarray(Y, dtype=bool)
    masks = (s_on & y_on, s_on & ~y_on, ~s_on & y_on, ~s_on & ~y_on)
    return [stacked[mask, :] for mask in masks]

# demographic parity: gap between the mean decision of the two sensitive groups
def demographic_parity(W):
    """Return |mean(W[0], W[1]) - mean(W[2], W[3])|.

    W is a sequence of arrays of decisions; the first two entries are pooled
    into one group and the remaining entries into the other.
    """
    rate_first, rate_second = (
        np.mean(np.concatenate(group)) for group in (W[:2], W[2:])
    )
    return np.abs(rate_first - rate_second)

# compute the sampling size from each bin
def computeK(Z, Nsample, sampled_spos, sampled_ypos):
    """Compute how many points to draw from each of the four (s, y) bins.

    Parameters
    ----------
    Z : sequence of four arrays
        The bins in the order (s, y) = (1, 1), (1, 0), (0, 1), (0, 0); only
        ``z.shape[0]`` (the number of available rows) is used.
    Nsample : int
        Requested total sample size.
    sampled_spos : float
        Target fraction of s = 1 in the sample.
    sampled_ypos : sequence of two floats
        Target fraction of y = 1 within s = 1 (index 0) and within s = 0 (index 1).

    Returns
    -------
    list of int
        ``[Kpp, Kpn, Knp, Knn]``. If a bin holds fewer rows than its target, all
        four sizes are scaled down by a common factor so that no bin is
        over-drawn. Every entry is at least 1.
    """
    # (fraction of s, fraction of y given s) for each bin, in bin order
    fractions = [
        (sampled_spos, sampled_ypos[0]),
        (sampled_spos, 1 - sampled_ypos[0]),
        (1 - sampled_spos, sampled_ypos[1]),
        (1 - sampled_spos, 1 - sampled_ypos[1]),
    ]
    targets = [Nsample * frac_s * frac_y for frac_s, frac_y in fractions]

    # Shrink factor so that no bin is asked for more rows than it has.
    # A bin with a non-positive target imposes no constraint; skipping it also
    # avoids a ZeroDivisionError when a target fraction is exactly 0.
    kratio = min(
        [min(1, z.shape[0] / k) for z, k in zip(Z, targets) if k > 0],
        default=1,
    )

    # The multiplication order is kept so the floor() results are unchanged.
    sizes = [int(np.floor(Nsample * kratio * frac_s * frac_y)) for frac_s, frac_y in fractions]
    return [max(k, 1) for k in sizes]

# case-control sampling
def case_control_sampling(X, K):
    """Return per-bin sampling probabilities that are uniform within each bin.

    Bin i receives total mass K[i] / sum(K), spread evenly over its x.shape[0] rows.
    The result is a list with one 1-D array per bin.
    """
    total = sum(K)
    q = []
    for i, x in enumerate(X):
        n_rows = x.shape[0]
        q.append((K[i] / total) * np.ones(n_rows) / n_rows)
    return q

# compute wasserstein distance w/ bootstrap
def compute_wasserstein(X1, S1, X2, S2, n, num_sample=5, num_process=10, seed=0):
    """Bootstrap Wasserstein distance between X1 and X2, overall and per sensitive group.

    Calls ``stealth_sampling.compute_wasserstein_bootstrap`` three times: on all
    rows, on the rows with S > 0.5, and on the rows with S < 0.5, using seeds
    ``seed``, ``seed + 1`` and ``seed + 2`` respectively.

    Returns the tuple (dx, dx_s1, dx_s0) of the three results.
    """
    fixed = dict(path='./', prefix='adult', num_sample=num_sample, num_process=num_process, timeout=60)
    bootstrap = stealth_sampling.compute_wasserstein_bootstrap

    # all rows
    dist_all = bootstrap(X1, X2, n, seed=seed, **fixed)

    # rows with s = 1
    dist_s1 = bootstrap(X1[S1 > 0.5, :], X2[S2 > 0.5, :], n, seed=seed + 1, **fixed)

    # rows with s = 0
    dist_s0 = bootstrap(X1[S1 < 0.5, :], X2[S2 < 0.5, :], n, seed=seed + 2, **fixed)

    return dist_all, dist_s1, dist_s0

### Fetch data and preprocess
We modified the preprocessing from [this Kaggle notebook](https://www.kaggle.com/kost13/us-income-logistic-regression/notebook).

In [3]:
# fetch data: the UCI Adult training file and test file, stacked into one frame
url1 = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
url2 = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
columns = [
    'Age', 'Workclass', 'fnlgwt', 'Education', 'Education num',
    'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex',
    'Capital Gain', 'Capital Loss', 'Hours/Week', 'Native country', 'Income',
]
# both files are read without a header row; the first line of adult.test is skipped
df1 = pd.read_csv(url1, sep=',', names=columns, header=None)
df2 = pd.read_csv(url2, sep=',', names=columns, header=None, skiprows=1)
df = pd.concat([df1, df2], axis=0, sort=True, ignore_index=True)

In [4]:
# preprocessing
def primary(x):
    """Merge the grade-level education values (' 1st-4th' through ' 12th') into ' Primary'.

    Any other value is returned unchanged.
    """
    grade_levels = (' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th', ' 12th')
    return ' Primary' if x in grade_levels else x
    
def native(country):
    """Map a raw 'Native country' value to a coarse group label.

    The groups are checked in order and the first match wins; a value that is
    in none of the groups is returned unchanged.
    """
    groups = (
        ('US', (' United-States', ' Cuba', ' 0')),
        ('Western', (' England', ' Germany', ' Canada', ' Italy', ' France', ' Greece', ' Philippines')),
        ('Poor',  # no offence
         (' Mexico', ' Puerto-Rico', ' Honduras', ' Jamaica', ' Columbia', ' Laos', ' Portugal', ' Haiti',
          ' Dominican-Republic', ' El-Salvador', ' Guatemala', ' Peru',
          ' Trinadad&Tobago', ' Outlying-US(Guam-USVI-etc)', ' Nicaragua', ' Vietnam', ' Holand-Netherlands')),
        ('Eastern', (' India', ' Iran', ' Cambodia', ' Taiwan', ' Japan', ' Yugoslavia', ' China', ' Hong')),
        ('Poland team', (' South', ' Poland', ' Ireland', ' Hungary', ' Scotland', ' Thailand', ' Ecuador')),
    )
    for label, members in groups:
        if country in members:
            return label
    return country

# NOTE: this cell rewrites `df` and is not idempotent. On a second run the Income
# mapping sends the already-encoded 0/1 values to 0, and the categorical columns
# it reads have been dropped. Re-run the data-fetching cell before re-running it.

# ' ?' marks a missing value in the raw files
df = df.replace(' ?', np.nan)

# binary target: 1 for '>50K' (the test file spells it with a trailing '.'), else 0
df['Income'] = df['Income'].apply(lambda x: 1 if x in (' >50K', ' >50K.') else 0)

# Column-level clean-up. Each result is assigned back explicitly instead of calling
# `df[col].method(..., inplace=True)`: that form acts on a temporary Series, triggers
# a chained-assignment warning in recent pandas and has no effect under Copy-on-Write.
df['Workclass'] = df['Workclass'].fillna(' 0').replace(' Without-pay', ' Never-worked')
df['fnlgwt'] = np.log1p(df['fnlgwt'])
df['Education'] = df['Education'].apply(primary)
df['Marital Status'] = df['Marital Status'].replace(' Married-AF-spouse', ' Married-civ-spouse')
df['Occupation'] = df['Occupation'].fillna(' 0').replace(' Armed-Forces', ' 0')
df['Native country'] = df['Native country'].fillna(' 0').apply(native)

# one-hot encode every remaining object column; dummy columns are named '<column>:<value>'
categorical_features = df.select_dtypes(include=['object']).columns
for col in categorical_features:
    dummies = pd.get_dummies(df[col], prefix=col, prefix_sep=':')
    df = pd.concat([df, dummies], axis=1).drop(columns=col)

### Experiment

In [5]:
# parameter settings for data
seed = 0                    # random seed used for the data splits and the sampling
Ntr = 10000                 # number of samples for training
Nte = 20000                 # number of samples for testing (the rows left over form the reference set)

# parameter settings for model
# 'LogReg' selects LogisticRegressionCV, 'Forest' selects RandomForestClassifier
classifier = 'LogReg'
#classifier = 'Forest'

# parameter settings for sampling
Nsample = 2000              # number of data to sample
sampled_ypos = [0.2, 0.2]   # the ratio of positive decisions '\alpha' in sampling, given as [for s=1, for s=0]

In [6]:
def sample_and_evaluate(df, Ntr=10000, Nte=20000, Nsample=2000, classifier='LogReg', sampled_ypos=[0.2, 0.2], seed=0):
    """Train a classifier, then compare the test set with two sampled subsets of it.

    The data is split into train / test / reference parts. A model is fitted on
    the training part and its decisions on the test part are measured for
    accuracy and demographic parity. Test points are then drawn by case-control
    sampling and by stealth sampling, and each sample is measured for
    demographic parity and for its Wasserstein distance to the reference part.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed data containing the columns 'Income', 'Sex: Male' and
        'Sex: Female'; all other columns are used as features.
    Ntr, Nte : int
        Requested number of training and test rows; the rows left over form
        the reference set.
    Nsample : int
        Number of test points to sample.
    classifier : {'LogReg', 'Forest'}
        'LogReg' uses LogisticRegressionCV, 'Forest' uses RandomForestClassifier.
    sampled_ypos : sequence of two floats
        Target ratio of positive decisions in the sample for s=1 and for s=0.
        The default list is only read, never modified.
    seed : int
        Seed for the splits, the forest and the NumPy global random state.

    Returns
    -------
    list of list
        Three rows ``[accuracy, parity, dx, dx_s1, dx_s0]`` for the baseline,
        case-control and stealth settings, in that order. Accuracy is NaN for
        the two sampled rows.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    """
    # split data: train / test / reference (whatever is left after the first two)
    df_train, df_rest = train_test_split(df, test_size=1.0-Ntr/df.shape[0], random_state=seed)
    df_test, df_ref = train_test_split(df_rest, test_size=1.0-Nte/df_rest.shape[0], random_state=seed)

    # df to numpy array: features, sensitive attribute ('Sex: Male'), label ('Income')
    non_feature_cols = ['Income', 'Sex: Male', 'Sex: Female']
    Xtr = df_train.drop(non_feature_cols, axis=1).values
    Str = df_train['Sex: Male'].values
    Ytr = df_train['Income'].values
    Xte = df_test.drop(non_feature_cols, axis=1).values
    Ste = df_test['Sex: Male'].values
    Yte = df_test['Income'].values
    Xref = df_ref.drop(non_feature_cols, axis=1).values
    Sref = df_ref['Sex: Male'].values

    # normalize with statistics of the training features only
    scaler = StandardScaler()
    scaler.fit(Xtr)
    Xtr = scaler.transform(Xtr)
    Xte = scaler.transform(Xte)
    Xref = scaler.transform(Xref)

    # fit model (the sensitive attribute is appended as the last input column)
    if classifier == 'LogReg':
        model = LogisticRegressionCV(cv=3)
    elif classifier == 'Forest':
        model = RandomForestClassifier(n_estimators=100, random_state=seed)
    else:
        # fail with a clear message rather than an unbound `model` below
        raise ValueError("classifier must be 'LogReg' or 'Forest', got %r" % (classifier,))
    model.fit(np.c_[Xtr, Str], Ytr)
    Tte = model.predict(np.c_[Xte, Ste])
    acc = 1.0 - np.mean(np.abs(Yte - Tte))
    Z = split_to_four(Xte, Ste, Tte)
    parity = demographic_parity([z[:, -1] for z in Z])

    # wasserstein distance between te and ref
    # NOTE(review): the size argument is fixed at 2000, which equals the default
    # Nsample - confirm whether it should follow Nsample.
    np.random.seed(seed)
    idx = np.random.permutation(Xte.shape[0])[:Nsample]
    dx, dx_s1, dx_s0 = compute_wasserstein(Xte[idx, :], Ste[idx], Xref, Sref, 2000, num_sample=3, num_process=10, seed=seed)

    # sampling
    results = [[acc, parity, dx, dx_s1, dx_s0]]
    sampled_spos = np.mean(Ste)
    K = computeK(Z, Nsample, sampled_spos, sampled_ypos)

    # Loop-invariant views of the binned test data. Each row of Z is
    # [features..., s, decision]; the samplers receive features plus s.
    bins_xs = [z[:, :-1] for z in Z]
    X_pool = np.concatenate([z[:, :-2] for z in Z], axis=0)
    S_pool = np.concatenate([z[:, -2] for z in Z], axis=0)
    T_pool = np.concatenate([z[:, -1] for z in Z], axis=0)
    # Draw indices over the rows actually present in the test split. The float
    # `test_size` above is rounded by train_test_split, so this count is not
    # guaranteed to equal the requested Nte.
    n_pool = X_pool.shape[0]

    for i, sampling in enumerate(['case-control', 'stealth']):
        print('%s: sampling ...' % (sampling,), end='')
        np.random.seed(seed+i)
        if sampling == 'case-control':
            p = case_control_sampling(bins_xs, K)
        elif sampling == 'stealth':
            p = stealth_sampling.stealth_sampling_bootstrap(bins_xs, K, path='./', prefix='adult', ratio=0.20, num_sample=3, num_process=10, timeout=60.0)
        idx = np.random.choice(n_pool, sum(K), p=np.concatenate(p), replace=False)
        Xs = X_pool[idx, :]
        Ss = S_pool[idx]
        Ts = T_pool[idx]
        print('done.')

        # demographic parity of the sampled data
        print('%s: evaluating ...' % (sampling,), end='')
        Zs = split_to_four(Xs, Ss, Ts)
        parity = demographic_parity([z[:, -1] for z in Zs])

        # wasserstein distance between the sample and ref
        dx, dx_s1, dx_s0 = compute_wasserstein(Xs, Ss, Xref, Sref, 2000, num_sample=3, num_process=10, seed=seed)
        print('done.')

        # accuracy is only reported for the baseline row
        results.append([np.nan, parity, dx, dx_s1, dx_s0])
    return results

#### Experiment (One Run)

In [7]:
# one run of the experiment with the parameter settings chosen above
result = sample_and_evaluate(
    df,
    Ntr=Ntr,
    Nte=Nte,
    Nsample=Nsample,
    classifier=classifier,
    sampled_ypos=sampled_ypos,
    seed=seed,
)

case-control: sampling ...done.
case-control: evaluating ...done.
stealth: sampling ...done.
stealth: evaluating ...done.


In [8]:
# Tabulate the results under their own name. Rebinding `df` here would replace
# the preprocessed dataset, so re-running the experiment cell would fail.
result_table = pd.DataFrame(
    result,
    index=['Baseline', 'Case-control', 'Stealth'],
    columns=['Accuracy', 'DP', 'WD on Pr[x]', 'WD on Pr[x|s=1]', 'WD on Pr[x|s=0]'],
)
print('Result (alpha = %.2f, seed=%d)' % (sampled_ypos[0], seed))
result_table

Result (alpha = 0.20, seed=0)


Unnamed: 0,Accuracy,DP,WD on Pr[x],WD on Pr[x|s=1],WD on Pr[x|s=0]
Baseline,0.851,0.182413,22.163767,25.6454,35.042133
Case-control,,0.025077,23.906033,22.585533,37.954267
Stealth,,0.071191,23.639633,24.2404,36.1657
