In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import functools
import pickle
import os
import csv
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import model_selection
from tqdm.notebook import tqdm

BASE_DIR = '../../../'
import sys
# Make the project root importable. Guarded so that re-running this cell does
# not keep appending duplicate entries to sys.path.
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

# custom code
# Import every project submodule this notebook touches explicitly. Importing a
# package (or one of its submodules) does not import its siblings, so relying on
# `utils.metrics` / `utils.record` being reachable as attributes only works when
# some other module happens to import them first.
import utils.utils
import utils.fweg
import utils.metrics
import utils.record
CONFIG = utils.utils.load_config("../../config.json")
from utils.fweg import FWEG

Using TensorFlow backend.


In [3]:
# The dataset name is the name of the directory this notebook lives in.
DATASET = os.path.basename(os.getcwd())
RANDOM_SEED = CONFIG['random_seed']

# Per-dataset section of the shared config; looked up once and reused.
_experiment_cfg = CONFIG["experiment_configs"][DATASET]
BATCH_SIZE = _experiment_cfg["batch_size"]
EVAL_GROUPS = _experiment_cfg['eval_groups']

print(RANDOM_SEED)

# Directories are keyed by dataset and random seed.
PROCESSED_DIR = os.path.join(BASE_DIR, f'processed/{DATASET}/rs={RANDOM_SEED}')
MODELS_DIR = os.path.join(BASE_DIR, f'models/{DATASET}/rs={RANDOM_SEED}')
RESULTS_DIR = os.path.join(BASE_DIR, "results")

PROCESSED_SAVEPATH = utils.utils.get_savepath(PROCESSED_DIR, DATASET, ".pkl")
# mt = model_type
BASE_MODEL_SAVEPATH = utils.utils.get_savepath(
    MODELS_DIR, DATASET, ".h5", mt="base"
)


55


In [4]:
# Create the results directory up front; exist_ok=True makes this a no-op when
# the directory is already there, so the cell is safe to re-run.
os.makedirs(RESULTS_DIR, exist_ok=True)

In [5]:
def _read_split(filename):
    """Read one processed split (a CSV inside PROCESSED_DIR) into a DataFrame."""
    return pd.read_csv(os.path.join(PROCESSED_DIR, filename))


train_df = _read_split("train.csv")
hyper_train_df = _read_split("hyper_train.csv")
val_df = _read_split("val.csv")
# In this setting val = hyper_val. They are kept as separate frames purely for
# consistency with the other experiments and with how FWEG is used.
hyper_val_df = _read_split("hyper_val.csv")
test_df = _read_split("test.csv")

# Stack the two training splits into one frame. pd.concat keeps each frame's
# own row labels here (ignore_index is left at its default).
train_full_df = pd.concat([train_df, hyper_train_df])

# The individual training splits are not used again.
del train_df, hyper_train_df

In [6]:
# Columns that are not model inputs: the evaluation-group columns and the target.
_non_feature_cols = [*EVAL_GROUPS, 'label']

# Features are every remaining column; labels are the 'label' column.
x_train_full = train_full_df.drop(columns=_non_feature_cols).values
y_train_full = train_full_df['label'].values

x_val = val_df.drop(columns=_non_feature_cols).values
y_val = val_df['label'].values

x_hyper_val = hyper_val_df.drop(columns=_non_feature_cols).values
y_hyper_val = hyper_val_df['label'].values

x_test = test_df.drop(columns=_non_feature_cols).values
y_test = test_df['label'].values

In [7]:
# Rebuild the base model's architecture (one 2-unit softmax layer over the
# input features) so that the saved weights can be loaded into it.
model = tf.keras.models.Sequential([
    # `shape` is the per-sample shape (batch dimension excluded) and is given as
    # a tuple; a bare int is not accepted by every Keras version.
    tf.keras.Input(shape=(x_train_full.shape[1],)),
    tf.keras.layers.Dense(2, activation=tf.nn.softmax),
])
model.load_weights(BASE_MODEL_SAVEPATH)

In [8]:
# Base-model softmax outputs for every split, computed in the order
# train_full, val, hyper_val, test.
preds_train_full, preds_val, preds_hyper_val, preds_test = (
    model.predict(features)
    for features in (x_train_full, x_val, x_hyper_val, x_test)
)

In [9]:
def get_basis_fns(
    groups,
    train_full_df,
    val_df,
    hyper_val_df,
    test_df,
    add_all
    ):
    """
    Build the basis-function frames for the four data splits.

    Each returned frame has one column per entry of `groups` (copied from the
    corresponding input frame) and, when `add_all` is truthy, an extra 'All'
    column filled with 1.0. When `groups` is empty the frame consists of the
    single 'All' column of ones, on a fresh default index.

    Args:
        groups: list of column names; must be exactly one of the supported
            combinations listed in `all_groups` below (may be empty).
        train_full_df, val_df, hyper_val_df, test_df: DataFrames that contain
            the `groups` columns. They are not modified.
        add_all: whether to append the constant 'All' column. Must be `True`
            when `groups` is empty, since that is then the only basis function.

    Returns:
        Tuple `(basis_train_full, basis_val, basis_hyper_val, basis_test)` of
        new DataFrames, one per input frame, in that order.

    Raises:
        AssertionError: if `groups` is not a supported combination, or if
            `groups` is empty and `add_all` is not `True`.

    Side effects:
        Re-seeds NumPy's global RNG with RANDOM_SEED on every call.
    """
    all_groups = [
        [],
        ['relationship_Husband', 'relationship_Wife'],
        ['private_workforce', 'non_private_workforce'],
        ['relationship_Husband', 'relationship_Wife', 'private_workforce', 'non_private_workforce']
    ]
    assert groups in all_groups, (
        f"unsupported groups {groups!r}; expected one of {all_groups!r}"
    )
    single_group = len(groups) == 0
    if single_group:
        assert add_all is True, "add_all must be True when groups is empty"

    np.random.seed(RANDOM_SEED)

    def _make_basis(df):
        # Build the basis frame for one split.
        if single_group:
            return pd.DataFrame(np.ones(len(df)), columns=["All"])
        # .copy() so that adding 'All' never touches the caller's frame.
        basis = df[groups].copy()
        if add_all:
            basis['All'] = 1.0
        return basis

    return tuple(
        _make_basis(df)
        for df in (train_full_df, val_df, hyper_val_df, test_df)
    )


In [10]:
def _make_adult_bb_metric(df):
    """
    Build an AdultBBMetric for one split.

    Selects the EVAL_GROUPS columns of `df` and passes the 'gender_Male' and
    'gender_Female' columns, as arrays and in that order, to
    utils.metrics.AdultBBMetric.
    """
    protected = df[EVAL_GROUPS]
    return utils.metrics.AdultBBMetric(
        protected['gender_Male'].values,
        protected['gender_Female'].values,
    )


# One metric object per split, each built from that split's own rows.
METRIC_VAL = _make_adult_bb_metric(val_df)
METRIC_HYPER_VAL = _make_adult_bb_metric(hyper_val_df)
METRIC_TEST = _make_adult_bb_metric(test_df)

In [11]:
# Number of classes handed to FWEG; matches the 2-unit softmax of the base model.
CLASSES = 2

# Basis groups handed to get_basis_fns, which asserts membership in this list.
# must be one of:
# []
# ['relationship_Husband', 'relationship_Wife']
# ['private_workforce', 'non_private_workforce'],
# ['relationship_Husband', 'relationship_Wife', 'private_workforce', 'non_private_workforce']
GROUPS = ['relationship_Husband', 'relationship_Wife', 'private_workforce', 'non_private_workforce']

# When True, get_basis_fns appends a constant 1.0 'All' column to every basis frame.
ADD_ALL = True
# Forwarded to FWEG as its epsilon argument; exact meaning is defined in
# utils.fweg -- TODO(review): document it here.
EPSILON = 1e-4

# Forwarded to FWEG; the fit progress bar below runs for this many steps.
NUM_ITERS = 100

# Forwarded unchanged to FWEG; semantics are defined in utils.fweg.
USE_LINEAR_VAL_METRIC = False

In [12]:
# Basis frames for the single-run configuration chosen in the constants cell.
basis_train_full, basis_val, basis_hyper_val, basis_test = get_basis_fns(
    groups=GROUPS,
    train_full_df=train_full_df,
    val_df=val_df,
    hyper_val_df=hyper_val_df,
    test_df=test_df,
    add_all=ADD_ALL,
)

In [13]:
# Single FWEG run with the hyperparameters from the constants cell, scored with
# the validation-split metric.
fweg = utils.fweg.FWEG(
    METRIC_VAL,
    NUM_ITERS,
    EPSILON,
    CLASSES,
    USE_LINEAR_VAL_METRIC,
    RANDOM_SEED,
)

# Fit on the base model's predictions, labels and basis frames for the train
# and validation splits. The three returned lists are not used further in this
# notebook; presumably per-iteration diagnostics -- TODO confirm in utils.fweg.
val_train_list, grad_norm_list, cond_list = fweg.fit(
    preds_train_full,
    y_train_full,
    basis_train_full,
    preds_val,
    y_val,
    basis_val,
)

Initialization complete!


Val : 0.827: 100%|██████████| 100/100 [00:06<00:00, 16.65it/s]


In [14]:
# apply to hyper val set
# Scored with the hyper-val metric; the second return value is indexed like a
# sequence of metric values below (presumably one per iterate -- TODO confirm).
preds_hyper_val_list, mval_hyper_val_list = fweg.predict(
    preds_hyper_val,
    y_hyper_val,
    basis_hyper_val,
    deterministic=False,
    metric=METRIC_HYPER_VAL,
)

# we check the latter half for better convergence estimates
# best_idx is the position, within the full list, of the highest hyper-val
# metric value found in the second half; it is reused for the test report.
start = len(mval_hyper_val_list)//2
best_idx = start + np.argmax(mval_hyper_val_list[start:])
print(f"Hyper Val: {mval_hyper_val_list[best_idx]}")

Hyper Val: 0.827409029882225


In [15]:
# apply to test set
preds_test_list, mval_test_list = fweg.predict(
    preds_test,
    y_test,
    basis_test,
    deterministic=False,
    metric=METRIC_TEST,
)
# Report the test metric at best_idx, i.e. at the entry selected on the
# hyper-val split, so the test split plays no part in the selection.
print(f"Test: {mval_test_list[best_idx]}")

Test: 0.8229357961064586


In [16]:
# Make sure the results directory exists (no-op if it already does), then open
# the per-dataset results CSV through the project's recorder. The recorder is
# closed in the last cell of the notebook.
os.makedirs(RESULTS_DIR, exist_ok=True)
savepath = os.path.join(RESULTS_DIR, f"results_{DATASET}.csv")
saver = utils.record.Results_Recorder(savepath, DATASET)

Results file exists, appending to it...


In [17]:
# Search-space definition for the FWEG hyperparameter search.
CLASSES = 2
NUM_ITERS = 100

# Candidate basis-group sets. Every entry must be a combination accepted by
# get_basis_fns; uncomment entries to widen the search.
groups_list = [
#     [],
#     ['relationship_Husband', 'relationship_Wife'],
#     ['private_workforce', 'non_private_workforce'],
    ['relationship_Husband', 'relationship_Wife', 'private_workforce', 'non_private_workforce']
]
# One description per entry of groups_list, derived from it so the two lists
# cannot drift apart when entries are (un)commented: the column names joined by
# ", ", or "single_group" for the empty group set.
groups_descr_list = [
    ", ".join(groups) if groups else "single_group"
    for groups in groups_list
]
add_all_list = [True]
epsilon_list = [0.0001, 0.001, 0.01, 0.1]
use_linear_val_metric_list = [False]

# this fills in most of the arguments for our basis function creator
# it is missing the `groups` arg and `add_all`. FWEG_Hyperparameter_Search
# is given basis_fn_generator and will fill these in as it iterates over
# the hyperparameters.
basis_fn_generator = functools.partial(
    get_basis_fns,
    train_full_df = train_full_df,
    val_df = val_df,
    hyper_val_df = hyper_val_df,
    test_df = test_df,
)

In [18]:
# Configure the hyperparameter search over the lists defined in the previous
# cell. It receives the results recorder (`saver`) and the partially-applied
# basis-function builder.
fweg_hp_s = utils.fweg.FWEG_Hyperparameter_Search(
    saver,
    CLASSES,
    NUM_ITERS,
    METRIC_VAL,
    basis_fn_generator,
    groups_list,
    groups_descr_list,
    add_all_list,
    epsilon_list,
    use_linear_val_metric_list,
    RANDOM_SEED,
    # G-mean black-box metric benefits from more convergence-sensitive
    # pickings of the best hyperparameters
    use_convergence=True
)

# Run the search on the base model's predictions for all four splits. The
# hyper-val and test splits are scored with their own metric objects; the
# return value is unpacked into the best setting of each searched hyperparameter.
(best_groups, best_add_all, best_epsilon, best_use_linear_val_metric) = fweg_hp_s.search(
    preds_train_full,
    y_train_full,
    preds_val,
    y_val,
    preds_hyper_val,
    y_hyper_val,
    preds_test,
    y_test,
    metric_hyper_val=METRIC_HYPER_VAL,
    metric_test=METRIC_TEST,
)

best hyper val: 0.8274, test: 0.8229: 100%|██████████| 4/4 [00:22<00:00,  5.72s/it]


In [19]:
# Close the results recorder now that the search has finished.
# NOTE(review): if the search cell raises, this cell is never reached and the
# recorder stays open until the kernel is restarted.
saver.close()