## Magics

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and switch it to mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Imports

In [None]:
import json
import os
import re
import shutil
import subprocess
from collections import namedtuple
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
plt.style.use('ggplot')
%matplotlib inline

In [None]:
# Copy the post-hoc result files from every remote machine into
# nas_results_base/<machine index>/.
# The prompt is a safety latch: the copy only runs when the answer starts
# with "y" (the answer used to be discarded, so the prompt guarded nothing).
confirmed = input('are you sure?').strip().lower().startswith('y')

# One hostname per line between the triple quotes; the empty first and last
# entries produced by the split are dropped.
machines = """
""".split('\n')[1:-1]


model_path = Path('nas_results_base/')
location = '~/fairness/deco/src/deco/post_hoc/results'

if confirmed:
    model_path.mkdir(exist_ok=True)
    # No local shell is involved any more, so '~' in the key path has to be
    # expanded explicitly.
    ssh_key = os.path.expanduser('~/.ssh/ec2-key')
    for i, machine in enumerate(machines):
        target_dir = model_path / str(i)
        target_dir.mkdir(exist_ok=True)
        # Argument list instead of a shell string: nothing is re-parsed locally,
        # and the remote path (with its wildcard) reaches scp unchanged.
        completed = subprocess.run(
            ['scp', '-i', ssh_key, f'ubuntu@{machine}:{location}/*', f'{target_dir}/']
        )
        if completed.returncode != 0:
            # Report the failure instead of silently continuing with missing files.
            print(f'scp from {machine} failed with exit code {completed.returncode}')
else:
    print('download skipped')

In [None]:
# Point model_path at the 'nas_results_bm/' directory; the cells below glob
# their input files from here.
model_path = Path('nas_results_bm/')

In [None]:
# How many '*test_output.json' files exist anywhere below model_path.
sum(1 for _ in model_path.glob('**/*test_output.json'))

In [None]:
# Read every '*test_output.json' file below model_path and summarise the
# 'objective' entries (count / mean / std) into `plotdf`.
# Expected file location: <model_path>/<digit>/<w>_<w>_<digit>_<digit>_baselines_test_output.json
# The three captured groups become the key of each entry in `data`.
result_pattern = re.compile(
    re.escape(model_path.as_posix())
    + r'/(\d)/(\w+_\w+_\d)_(\d)_baselines_test_output\.json'
)

data = {}
for file_results in model_path.glob('**/*test_output.json'):
    keys = result_pattern.match(file_results.as_posix())
    if keys is None:
        # A non-matching name used to surface as AttributeError on `None.groups()`.
        print(f'skipping {file_results}: file name does not match the expected pattern')
        continue
    with open(file_results) as fh:
        datum = json.load(fh)
    # Flatten the two-level JSON mapping into (outer key, inner key) -> value.
    data[keys.groups()] = {(k, kk): vv for k, v in datum.items() for kk, vv in v.items()}

# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally
# in sort_index.
plotdf = (
    pd.DataFrame(data).T
    .unstack(0).unstack(0)
    .describe().loc[['count', 'mean', 'std']].T
    .unstack(1)
    .reorder_levels([2, 1, 0]).sort_index()
    .swaplevel(0, 1, axis=1).sort_index(axis=1)
)
plotdf = plotdf['objective'].droplevel(0).unstack(1)

In [None]:
# Display the current contents of plotdf.
plotdf

In [None]:
# Grouped bar chart: one group per row of plotdf, one bar per entry of
# column_order, bar height = 'mean', error bar = 'std'.
# NOTE(review): the labels below assume plotdf has exactly four rows in this
# order — confirm against the result directories.
plotdf.index = ['base', 'more dropout', 'more width', 'more layers']

column_order = ['default', 'ROC', 'EqOdds', 'CalibEqOdds', 'Random', 'adversarial', 'layerwiseOpt']
col_rename = {'default': 'Default', 'adversarial': 'Adversarial', 'layerwiseOpt': 'LayerwiseOpt'}

plotmean = plotdf['mean'].reindex(columns=column_order).rename(columns=col_rename)
plotstd = plotdf['std'].reindex(columns=column_order).rename(columns=col_rename)
plotmean.plot(kind='bar', yerr=plotstd, figsize=(10,5), rot=0)
# Raw string: '\l' is not a valid Python escape sequence (the rendered text is unchanged).
plt.ylabel(r'Objective: $\lambda$|SPD| + $(1-\lambda)(1-$accuracy$)$')
plt.xlabel('Architecture BM (sex)')
plt.gca().legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=7)
plt.tight_layout()
# savefig does not create missing directories.
Path('images').mkdir(exist_ok=True)
plt.savefig('images/multinet_bm_results.png')
plt.savefig('images/multinet_bm_results.pdf')

In [None]:
from post_hoc.posthoc import *

In [None]:
def _roc_auc_and_accuracy(y_valid, yhat_valid, y_test, yhat_test):
    """Score one set of predictions on the test split.

    The decision threshold is the value of np.linspace(0, 1, 1001) with the
    highest accuracy on the validation split (np.argmax keeps the first
    maximum); accuracy is then measured on the test split at that threshold.

    Args:
        y_valid, yhat_valid: validation labels and predicted scores.
        y_test, yhat_test: test labels and predicted scores.

    Returns:
        Tuple (roc_auc, accuracy), both computed on the test split.
    """
    threshs = np.linspace(0, 1, 1001)
    valid_accs = [accuracy_score(y_valid, yhat_valid > thresh) for thresh in threshs]
    bthresh = threshs[np.argmax(valid_accs)]
    return roc_auc_score(y_test, yhat_test), accuracy_score(y_test, yhat_test > bthresh)


def _predict_network(model, X):
    """Return the first output column of `model` on `X` as an (n, 1) numpy array."""
    model.eval()
    with torch.no_grad():
        return model(X)[:, 0].reshape(-1, 1).numpy()


# For every dataset, collect test ROC-AUC and accuracy for
#   * each saved network under model_path whose path contains the dataset name,
#   * n_repeats refits of each scikit-learn baseline,
# keyed by (dataset, model name, metric) in `results`.
datasets = ['adult', 'bank', 'compas']
n_repeats = 10
sklearn_baselines = [
    ('logistic regression', LogisticRegressionCV),
    ('random forest', RandomForestClassifier),
]
results = {}

for dataset in datasets:
    train, valid, test, priv, unpriv = get_data(dataset, 1)
    priv_index = train.protected_attribute_names.index(list(priv[0].keys())[0])

    # The scaler is fitted on the training split only and reused for valid/test.
    scale_orig = StandardScaler()
    X_train = torch.tensor(scale_orig.fit_transform(train.features), dtype=torch.float32)
    y_train = torch.tensor(train.labels.ravel(), dtype=torch.float32)

    X_valid = torch.tensor(scale_orig.transform(valid.features), dtype=torch.float32)
    y_valid = torch.tensor(valid.labels.ravel(), dtype=torch.float32)
    p_valid = valid.protected_attributes[:, priv_index]

    X_test = torch.tensor(scale_orig.transform(test.features), dtype=torch.float32)
    y_test = torch.tensor(test.labels.ravel(), dtype=torch.float32)
    p_test = test.protected_attributes[:, priv_index]

    model = Model(X_train.size(1))

    # Saved networks: one score pair per matching checkpoint.
    roc_auc_scores, accs = [], []
    for path in model_path.glob('*'):
        if dataset not in str(path):
            continue
        model.load_state_dict(torch.load(path))
        yhat_test = _predict_network(model, X_test)
        yhat_valid = _predict_network(model, X_valid)
        roc_auc, acc = _roc_auc_and_accuracy(y_valid, yhat_valid, y_test, yhat_test)
        roc_auc_scores.append(roc_auc)
        accs.append(acc)
    results[(dataset, 'neural network', 'roc_auc')] = roc_auc_scores
    results[(dataset, 'neural network', 'accuracy')] = accs

    # scikit-learn baselines: a fresh default-configured estimator per repeat.
    for name, make_classifier in sklearn_baselines:
        roc_auc_scores, accs = [], []
        for _ in range(n_repeats):
            clf = make_classifier()
            clf.fit(X_train, y_train)
            yhat_test = clf.predict_proba(X_test)[:, 1]
            yhat_valid = clf.predict_proba(X_valid)[:, 1]
            roc_auc, acc = _roc_auc_and_accuracy(y_valid, yhat_valid, y_test, yhat_test)
            roc_auc_scores.append(roc_auc)
            accs.append(acc)
        results[(dataset, name, 'roc_auc')] = roc_auc_scores
        results[(dataset, name, 'accuracy')] = accs
    

In [None]:
# Per-series count / mean / std of `results`, then "mean $\pm$ std" strings
# (three decimals) with index level 1 moved to the columns.
tmp = pd.DataFrame(results).describe().loc[['count', 'mean', 'std']].T
# Raw string: '\p' is not a valid Python escape sequence (the text is unchanged).
df = (tmp['mean'].map('{:.3f}'.format) + r' $\pm$ ' + tmp['std'].map('{:.3f}'.format)).unstack(1)
df

In [None]:
# Emit the formatted table as LaTeX source (plain text, ready to paste).
print(df.to_latex())

In [None]:
# Show the value left in the `dataset` loop variable by the evaluation loop.
dataset

In [None]:
# Weight-perturbation experiment on the 'adult' checkpoints: for every
# checkpoint, draw n_perturbations random multiplicative perturbations of all
# parameters and record the perturbation vector together with the 'spd' bias
# of the perturbed model on the test split.
n_perturbations = 1000
noise_scale = 0.1  # std of the multiplicative noise around 1

train, valid, test, priv, unpriv = get_data('adult', 1)
priv_index = train.protected_attribute_names.index(list(priv[0].keys())[0])

# The scaler is fitted on the training split only and reused for valid/test.
scale_orig = StandardScaler()
X_train = torch.tensor(scale_orig.fit_transform(train.features), dtype=torch.float32)
y_train = torch.tensor(train.labels.ravel(), dtype=torch.float32)

X_valid = torch.tensor(scale_orig.transform(valid.features), dtype=torch.float32)
y_valid = torch.tensor(valid.labels.ravel(), dtype=torch.float32)
p_valid = valid.protected_attributes[:, priv_index]

X_test = torch.tensor(scale_orig.transform(test.features), dtype=torch.float32)
y_test = torch.tensor(test.labels.ravel(), dtype=torch.float32)
p_test = test.protected_attributes[:, priv_index]

model = Model(X_train.size(1))
model.eval()
deltas = []
biases = []
for path in model_path.glob('*'):
    if 'adult' not in str(path):
        continue
    # Read the checkpoint from disk once per file rather than once per
    # perturbation; load_state_dict copies the values into the model, so the
    # loaded tensors are not touched by the perturbation below.
    state_dict = torch.load(path)
    deltas.append([])
    biases.append([])
    for i in range(n_perturbations):
        # Reset to the unperturbed weights before every draw.
        model.load_state_dict(state_dict)
        delta = []
        for param in model.parameters():
            delta.append(torch.randn_like(param) * noise_scale + 1)
            param.data = param.data * delta[-1]
        # One flat vector holding the multiplier of every parameter.
        delta = torch.cat([x.reshape(-1) for x in delta])
        deltas[-1].append(delta)

        with torch.no_grad():
            yhat_test = model(X_test)[:, 0].reshape(-1, 1).numpy()
        bias = compute_bias(yhat_test, y_test.numpy(), p_test, metric='spd')
        biases[-1].append(bias)

In [None]:
# 80/20 split of the perturbation indices.
# Both halves are cut from ONE permutation so they are disjoint; two
# independent permutations would let test indices reappear in the training
# set and inflate the held-out score.
shape = len(deltas[0])
n_train = int(shape * 0.8)
perm = torch.randperm(shape)
train = perm[:n_train]
test = perm[n_train:]

In [None]:
from sklearn.linear_model import LinearRegression

# One linear regression per checkpoint: perturbation vectors -> bias values.
# `scores` collects the score on the `test` indices, `coefs` stacks the fitted
# coefficient vectors (one row per checkpoint).
scores = []
coef_rows = []
for d, b in zip(deltas, biases):
    perturbations = torch.stack(d).numpy()
    bias_values = np.array(b)
    lr = LinearRegression()
    lr.fit(perturbations[train], bias_values[train])
    scores.append(lr.score(perturbations[test], bias_values[test]))
    coef_rows.append(lr.coef_)
coefs = np.array(coef_rows)

In [None]:
# Mean held-out score and 2.2 x its standard deviation, formatted for LaTeX.
# Raw f-string: '\p' is not a valid Python escape sequence (the output is unchanged).
print(rf'{np.array(scores).mean():.3f} $\pm$ {2.2*np.array(scores).std():.3f}')

In [None]:
# Absolute coefficients, sorted within each row; the line is the column-wise
# mean and the band is mean +/- 2.2 * std across rows.
plt.figure(figsize=(10,5))
pdf = pd.DataFrame(np.sort(np.abs(coefs)))
coef_mean = pdf.mean()
coef_std = pdf.std()
band = 2.2*coef_std
coef_mean.plot()
plt.fill_between(coef_std.index, coef_mean - band, coef_mean + band, alpha=0.4)
plt.xlabel('index of sorted coefficients')
plt.ylabel('coefficient value')
plt.savefig('coefs_sort.pdf')

In [None]:
# Shape of the stacked coefficient array.
coefs.shape

In [None]:
from scipy.linalg import svd

In [None]:
from sklearn.preprocessing import normalize

In [None]:
# Singular value decomposition of the coefficient matrix after passing it
# through sklearn's normalize() with its default settings.
coefs_normalized = normalize(coefs)
U, s, Vh = svd(coefs_normalized)

In [None]:
# Shapes of the three SVD factors.
U.shape, s.shape, Vh.shape

In [None]:
# Singular values of the normalised coefficient matrix against their index.
plt.figure(figsize=(10,5))
singular_index = np.arange(s.size)
plt.plot(singular_index, s)
plt.xlabel('singular value index')
plt.ylabel('singular value')
plt.savefig('coefs_svd.pdf')

In [None]:
# Copy the post-hoc result files from each listed host into NAS/<host index>/.
yash1 = 'ec2-3-233-221-255.compute-1.amazonaws.com'
yash2 = 'ec2-3-230-154-171.compute-1.amazonaws.com'
yash3 = 'ec2-3-235-41-184.compute-1.amazonaws.com'
yash4 = 'ec2-3-223-3-236.compute-1.amazonaws.com'
yashs = [yash1, yash2, yash3, yash4]

results_path = Path('NAS/')
results_path.mkdir(exist_ok=True)

location = '~/fairness/deco/src/deco/post_hoc/results'
# No local shell is involved, so '~' in the key path has to be expanded explicitly.
ssh_key = os.path.expanduser('~/.ssh/ec2-key')

for i, yash in enumerate(yashs):
    results_dirpath = results_path / str(i)
    results_dirpath.mkdir(exist_ok=True)
    # Argument list instead of a shell string: nothing is re-parsed locally,
    # and the remote path (with its wildcard) reaches scp unchanged.
    completed = subprocess.run(
        ['scp', '-i', ssh_key, f'ubuntu@{yash}:{location}/*', f'{results_dirpath}/']
    )
    if completed.returncode != 0:
        # Report the failure instead of silently continuing with missing files.
        print(f'scp from {yash} failed with exit code {completed.returncode}')

In [None]:
# Load every non-validation result file below results_path into `data`,
# keyed by (network index, iteration, top-level JSON key).
# Expected location: NAS/<nn_type>/<dataset>_<bias>_<protected>_<iter>_baselines_<fold>_output.json
result_name = re.compile(
    r'NAS/(?P<nn_type>\d+)/(?P<dataset>\w+)_(?P<bias>\w+)_(?P<protected>\d+)_(?P<iter>\d+)'
    r'_baselines_(?P<fold>\w+)_output\.json'
)

data = {}
for result_file in results_path.glob('**/*.json'):
    matches = result_name.match(result_file.as_posix())
    if matches is None:
        # The glob accepts any .json file; an unrelated one used to surface as
        # AttributeError on `None.group(...)`.
        print(f'skipping {result_file}: file name does not match the expected pattern')
        continue
    if matches.group('fold') == 'valid':
        continue
    with open(result_file, 'r') as fh:
        datum = json.load(fh)
    datum = {(matches.group('nn_type'), matches.group('iter'), k): v for k, v in datum.items()}
    data.update(datum)
#     data = dict(data, **datum)

In [None]:
# Summarise the 'objective' entries of `data` (count / mean / std) and draw
# them as a grouped bar chart: one group per row of plotdf, one bar per entry
# of column_order, bar height = 'mean', error bar = 'std'.
plotdf = (
    pd.DataFrame(data)
    .stack(1).unstack(0)
    .describe().loc[['count', 'mean', 'std']].T
    .unstack(-1)
    .loc[:, (slice(None, None), 'objective')]
    .droplevel(1, axis=1)
    .unstack(1)
)
# NOTE(review): assumes exactly four rows, in network-index order — confirm.
plotdf.index = ['neural network 1', 'neural network 2', 'neural network 3', 'neural network 4']

column_order = ['default', 'ROC', 'EqOdds', 'CalibEqOdds', 'Random', 'adversarial', 'layerwiseOpt']
col_rename = {'default': 'Default', 'adversarial': 'Adversarial', 'layerwiseOpt': 'LayerwiseOpt'}

plotmean = plotdf['mean'].reindex(columns=column_order).rename(columns=col_rename)
plotstd = plotdf['std'].reindex(columns=column_order).rename(columns=col_rename)
plotmean.plot(kind='bar', yerr=plotstd, figsize=(10,5), rot=0)
plt.ylabel('Objective')
# NOTE(review): the x axis shows the rows labelled 'neural network N', so
# 'Dataset' looks like a leftover label — confirm before publishing the figure.
plt.xlabel('Dataset')
plt.gca().legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=7)
plt.tight_layout()
# savefig does not create missing directories.
Path('images').mkdir(exist_ok=True)
plt.savefig('images/multinet_results.png')
plt.savefig('images/multinet_results.pdf')