In [2]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from sklearn.decomposition import PCA
from KDEpy import NaiveKDE
import scipy

import json
import os

from bin.dataset import Dataset
from bin.experiment import Experiment
from bin.metrics import Metrics

from collections import defaultdict

from models.LR import Lr
from models.reduction import Reduction
from models.reweight import Reweight
from models.fair_reduction import FairReduction

from scipy.special import xlog1py
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, f1_score, precision_score, accuracy_score, recall_score
from fairlearn.metrics import (
    MetricFrame, plot_model_comparison,
    selection_rate, demographic_parity_difference, demographic_parity_ratio,
    false_positive_rate, false_negative_rate,
    false_positive_rate_difference, false_negative_rate_difference,true_positive_rate, 
    equalized_odds_difference)

import warnings
warnings.filterwarnings('ignore')


In [3]:
# Choose the largest available float on the system.
# The 128-bit type is looked up on NumPy directly: the `scipy.float128`
# spelling is a deprecated re-export of the NumPy name and is not guaranteed
# to exist in the installed SciPy, in which case the lookup would silently
# fall back to float64 even on platforms that do provide float128.
FLOAT = getattr(np, "float128", np.float64)

In [4]:
def read_config(path):
    """Parse the JSON config file at `path`.

    Returns the decoded JSON content, or None (after printing a short
    message) when the file does not exist.
    """
    try:
        with open(path) as handle:
            return json.load(handle)
    except FileNotFoundError:
        print("Config file not found.")
        return None

def load_csv(path):
    """Read the CSV file at `path` into a pandas DataFrame.

    Returns the DataFrame, or None (after printing a short message) when the
    file does not exist.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        print("CSV file not found.")
        return None

In [5]:
# Experiment configuration to load. Other configs that have been used with
# this notebook (swap the path below to switch):
#   configs/adult_noisy.json
#   configs/COMPAS_noisy.json
#   configs/synthetic_20_noisy.json
#   configs/baseline_config.json
exp_conf = "configs/income_noisy.json"

EXP = read_config(exp_conf)

# read_config() prints a message and returns None when the file is missing.
# Stop here with a clear error instead of failing in a later cell on an
# attribute lookup against None.
if EXP is None:
    raise FileNotFoundError(f"Experiment config not found: {exp_conf}")

In [6]:
# Show the top-level keys of the loaded experiment config.
EXP.keys()

dict_keys(['income_bias_0.1', 'income_bias_0.3', 'income_flip_0.1', 'income_flip_0.3', 'income_balanced_0.1', 'income_balanced_0.3'])

In [7]:
# Build one Dataset per entry of the experiment config, keyed by entry name.
# The previous try/except around this loop only wrapped a bare `continue`,
# so it could never catch anything; errors raised by Dataset(...) propagate,
# exactly as before.
# The optional per-dataset `calculate_probabilities()` step (and the
# name-based filters that used to guard it) remain disabled.
datasets = {}
for name, value in EXP.items():
    datasets[name] = Dataset(value)

In [8]:
# NOTE(review): 'income_balanced_0.1_cov' is not one of the keys displayed for
# EXP / `datasets`; in the visible cells the name is reassigned before it is
# ever used for a lookup.
dset_name = 'income_balanced_0.1_cov'
# Display the loaded Dataset objects, keyed by config entry name.
datasets

{'income_bias_0.1': <bin.dataset.Dataset at 0x7f3dce4da110>,
 'income_bias_0.3': <bin.dataset.Dataset at 0x7f3dcd720b10>,
 'income_flip_0.1': <bin.dataset.Dataset at 0x7f3dcd61a550>,
 'income_flip_0.3': <bin.dataset.Dataset at 0x7f3dcd48fbd0>,
 'income_balanced_0.1': <bin.dataset.Dataset at 0x7f3dcd51b990>,
 'income_balanced_0.3': <bin.dataset.Dataset at 0x7f3dcd3a77d0>}

In [38]:
# Generate covariate-shifted train/valid/test row positions for one dataset
# and write them to CSV, one column per fold (x1..x10).
# NOTE(review): create_shift() is defined in a cell further down this
# notebook; that cell has to be executed before this one.
# NOTE(review): sampling relies on NumPy's global random state and no seed is
# set here, so the written splits differ from run to run.
dset_name = 'income_flip_0.3'
splits_path = 'data/covariate_shifted_data/0.25_1.1/income/flip/0.3/'

train_ids = []
test_ids = []
valid_ids = []

# Loop-invariant inputs: the feature matrix without the probability columns,
# the sensitive attribute and the label. Original column order is preserved
# (a set difference would yield an arbitrary order).
exclude = ['prob', 'emp_prob', EXP[dset_name]['sensitive_attribute'], EXP[dset_name]['label']]
dataX = datasets[dset_name].data
dataX = dataX[[col for col in dataX.columns if col not in exclude]]
all_positions = np.arange(len(dataX))

for fold_no in range(1, 11):
    fold = 'x' + str(fold_no)
    fold_data = datasets[dset_name].foldwise_data[fold]

    len_train = len(fold_data['train'])
    len_test = len(fold_data['test'])
    len_valid = len(fold_data['valid'])

    # Source side = train + valid, in the same proportion as the existing fold.
    tr_idxs, test_idxs = create_shift(
        dataX,
        (len_train + len_valid) / (len_train + len_test + len_valid),
        holdout=0.4,
        alpha=0.25,
        beta=1.1,
    )

    # Row positions sampled into neither side. Computed over every row of
    # dataX (the earlier scan stopped just below the largest sampled position
    # and so missed unselected rows above it) and without a per-row
    # membership test against the index arrays.
    omitted = np.setdiff1d(all_positions, np.concatenate([tr_idxs, test_idxs])).tolist()

    # Kept as notebook-level variables (values of the last fold) in case
    # cells outside this one read them.
    tr_data = dataX.iloc[tr_idxs]
    test_data = dataX.iloc[test_idxs]
    unused_data = dataX.iloc[omitted]

    # Split the source indices into train / valid using the fold's own ratio.
    n_train = int(len(tr_idxs) * len_train / (len_train + len_valid))
    train_ids.append(tr_idxs[:n_train])
    test_ids.append(test_idxs)
    valid_ids.append(tr_idxs[n_train:])

fold_names = ['x' + str(k) for k in range(1, 11)]


def _save_fold_ids(ids_per_fold, file_name):
    """Write one id column per fold to `file_name` under splits_path; return the frame."""
    frame = pd.DataFrame(ids_per_fold).transpose()
    frame.columns = fold_names
    frame.to_csv(os.path.join(splits_path, file_name), index=False)
    return frame


# Make sure the target directory exists before writing.
os.makedirs(splits_path, exist_ok=True)

new_train_ids = _save_fold_ids(train_ids, 'train_ids.csv')
new_test_ids = _save_fold_ids(test_ids, 'test_ids.csv')
new_valid_ids = _save_fold_ids(valid_ids, 'valid_ids.csv')

print("Done!")

Done!


In [21]:
# Largest row position that appears in any fold's training split.
max(max(fold_ids) for fold_ids in train_ids)

195659

In [10]:
def dataset_distance(data1, data2):
    """Summarise two datasets along their shared first principal component.

    PCA is fitted on the row-wise concatenation of `data1` and `data2`. The
    scores on the first component are then split back by row position.

    Returns:
        (mean1, std1, mean2, std2): mean and standard deviation of the
        first-component scores for the rows of `data1` and of `data2`.
    """
    n_first = len(data1)
    n_second = len(data2)

    stacked = pd.concat([data1, data2], axis=0)
    scores = PCA(n_components=2).fit_transform(stacked)
    first_pc = scores[:, 0].reshape(-1, 1)

    part1 = first_pc[:n_first]
    part2 = first_pc[n_first:n_first + n_second]

    return np.mean(part1), np.std(part1), np.mean(part2), np.std(part2)

def create_shift(
    data,
    src_split=0.8,
    holdout=0.2,
    alpha=1,
    beta=2,
    kdebw=0.3,
    eps=0.001,
    random_state=None,
):
    r"""
    Creates covariate shift sampling of data into disjoint source and target sets.

    Let \mu and \sigma be the mean and the standard deviation of the first
    principal component retrieved by PCA on the whole data.
    The target is randomly sampled based on a Gaussian with mean = \mu and
    standard deviation = \sigma.
    The source is randomly sampled based on a Gaussian with mean = \mu + alpha
    and standard deviation = \sigma / beta.

    Parameters
    ----------
    data : array-like of shape [m, n]
        Feature matrix; rows are the items being sampled.
    src_split : float
        Fraction of the retained rows assigned to the source set; the target
        set receives ``1 - src_split``.
    holdout : float
        Fraction of the m rows left out of both sets.
    alpha, beta : float
        Parameters that distort the Gaussian used for source sampling along
        the first principal component (mean shift and std divisor).
    kdebw, eps : float
        Accepted for interface compatibility; not used by this function.
    random_state : int or None
        When None (default) sampling uses NumPy's global random state. When
        given, it seeds a dedicated ``np.random.RandomState`` so that the
        two draws are reproducible.

    Returns
    -------
    (source_ind, target_ind) : tuple of arrays
        Row positions of the source and target samples. The target is drawn
        only from rows not already chosen for the source. No density ratios
        are returned.
    """
    m = np.shape(data)[0]
    source_size = int(m * src_split * (1 - holdout))
    target_size = int(m * (1 - src_split) * (1 - holdout))

    # Two components are requested, but only the first one drives sampling.
    pc = PCA(n_components=2).fit_transform(data)[:, 0]

    pc_mean = np.mean(pc)
    pc_std = np.std(pc)

    sample_mean = pc_mean + alpha
    sample_std = pc_std / beta

    # Per-row sampling probabilities: shifted/narrowed Gaussian for the
    # source, the unshifted Gaussian for the target.
    prob_s = norm.pdf(pc, loc=sample_mean, scale=sample_std)
    prob_s = prob_s / np.sum(prob_s)
    prob_t = norm.pdf(pc, loc=pc_mean, scale=pc_std)
    prob_t = prob_t / np.sum(prob_t)

    # The np.random module and a RandomState instance both expose choice().
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    source_ind = rng.choice(m, size=source_size, replace=False, p=prob_s)

    # Zero out rows already taken by the source, then renormalise, so the
    # two index sets are disjoint.
    pt_proxy = np.copy(prob_t)
    pt_proxy[source_ind] = 0
    pt_proxy = pt_proxy / np.sum(pt_proxy)
    target_ind = rng.choice(m, size=target_size, replace=False, p=pt_proxy)

    return source_ind, target_ind