In [4]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from sklearn.decomposition import PCA
from KDEpy import NaiveKDE
import scipy

import json
import os

from bin.dataset import Dataset
from bin.experiment import Experiment
#from bin.metrics import Metrics

from collections import defaultdict

#from models.LR import Lr
#from models.reduction import Reduction
#from models.reweight import Reweight
#from models.fair_reduction import FairReduction

from scipy.special import xlog1py
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, f1_score, precision_score, accuracy_score, recall_score
#from fairlearn.metrics import (
#    MetricFrame, plot_model_comparison,
#    selection_rate, demographic_parity_difference, demographic_parity_ratio,
#    false_positive_rate, false_negative_rate,
#    false_positive_rate_difference, false_negative_rate_difference,true_positive_rate, 
#    equalized_odds_difference)

import warnings
warnings.filterwarnings('ignore')


In [5]:
# Choose the widest float type available on this system.
# The type is read from NumPy directly rather than through SciPy's re-exported
# NumPy aliases (which SciPy has deprecated). `np.float128` only exists on
# platforms with extended precision, so fall back to float64 elsewhere.
FLOAT = getattr(np, "float128", np.float64)

In [6]:
def read_config(path):
    """Parse the JSON file at `path`.

    Returns the decoded JSON object. If the file does not exist, a notice is
    printed and None is returned instead of raising.
    """
    parsed = None
    try:
        with open(path) as handle:
            parsed = json.load(handle)
    except FileNotFoundError:
        print("Config file not found.")
    return parsed

def load_csv(path):
    """Read the CSV file at `path` into a pandas DataFrame.

    If the file does not exist, a notice is printed and None is returned
    instead of raising.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        print("CSV file not found.")
        return None

In [80]:
# Experiment config to load. Other configs that were used with this notebook:
#   configs/adult_noisy.json, configs/COMPAS_noisy.json,
#   configs/synthetic_20_noisy.json, configs/baseline_config.json
exp_conf = "configs/income_noisy.json"

EXP = read_config(exp_conf)
# read_config returns None when the file is missing; stop here with a clear
# message instead of failing later with an AttributeError on `EXP.copy()`.
if EXP is None:
    raise FileNotFoundError("Experiment config not found: " + exp_conf)

In [81]:
# Keep a shallow snapshot of the full config in EXP2, then remove from EXP
# every entry whose key contains 'cov'. Iterating over the snapshot's keys
# makes it safe to delete from EXP inside the loop.
EXP2 = EXP.copy()
for key in tuple(EXP2):
    if 'cov' in key:
        EXP.pop(key)
EXP.keys()

dict_keys(['income_bias_0.1', 'income_bias_0.3', 'income_flip_0.1', 'income_flip_0.3', 'income_balanced_0.1', 'income_balanced_0.3'])

In [82]:
# Build one Dataset per experiment entry, keyed by experiment name.
# The previous version wrapped a bare `continue` in try/except, so the handler
# could never fire; that dead guard is removed and behaviour is unchanged.
# The optional `calculate_probabilities()` step was commented out in the
# original cell and is still not run here.
datasets = {}
for name, value in EXP.items():
    datasets[name] = Dataset(value)

In [83]:
# Dataset names in config order.
dsets = [name for name in datasets]
# Shift settings; each pair is passed to create_shift as (alpha, beta).
configs = [
    (1, 2),
    (0.5, 1.5),
    (0.25, 1.1),
]

In [87]:
# Names used for the output folders. For synthetic datasets the first 12
# characters (e.g. "synthetic_20") are replaced by plain "synthetic";
# all other dataset names are kept unchanged.
if dsets[0].split("_")[0] == "synthetic":
    d2 = ["synthetic" + name[12:] for name in dsets]
else:
    d2 = list(dsets)
d2

['income_bias_0.1',
 'income_bias_0.3',
 'income_flip_0.1',
 'income_flip_0.3',
 'income_balanced_0.1',
 'income_balanced_0.3']

In [88]:
# Generate covariate-shifted train/valid/test id files for every (alpha, beta)
# setting in `configs` and write them once per dataset name in `d2`.
#
# NOTE: this cell calls `create_shift` and `convert_to_int`; both must already
# be defined in the kernel when it runs.
# NOTE(review): `create_shift` samples with NumPy's global RNG, so call
# `np.random.seed(...)` beforehand if the splits need to be reproducible.


def _fold_ids_to_frame(fold_ids):
    """Turn a list of per-fold index arrays into a frame with one column per fold (x1..x10)."""
    frame = pd.DataFrame(fold_ids).transpose()
    frame.columns = ['x' + str(k) for k in range(1, 11)]
    # Same post-processing as before: drop the final row, then shift 0-based
    # positions to 1-based ids. (Presumably the last row is dropped because
    # folds of unequal length leave NaN padding there -- TODO confirm.)
    return frame.iloc[:-1, :] + 1


# The first dataset supplies the feature matrix and the fold sizes for every
# split; none of this depends on the fold or the shift setting, so it is
# computed once. Columns keep their original order (deterministic across runs).
ref_name = dsets[0]
exclude = ['prob', 'emp_prob', EXP[ref_name]['sensitive_attribute'], EXP[ref_name]['label']]
dataX = datasets[ref_name].data
dataX = dataX[[col for col in dataX.columns if col not in exclude]]

for c in configs:

    train_ids = []
    test_ids = []
    valid_ids = []

    for fold_no in range(1, 11):
        fold = 'x' + str(fold_no)
        fold_data = datasets[ref_name].foldwise_data[fold]

        len_train = len(fold_data['train'])
        len_test = len(fold_data['test'])
        len_valid = len(fold_data['valid'])

        # Source side = train + valid; its share of the fold sets src_split.
        tr_idxs, test_idxs = create_shift(dataX, (len_train + len_valid) / (len_train + len_test + len_valid), holdout=0.4, alpha=c[0], beta=c[1])

        # Rows sampled into neither side. Computed over the whole frame, so
        # unsampled rows above the largest sampled index are included too.
        omitted = np.setdiff1d(np.arange(len(dataX)), np.concatenate([tr_idxs, test_idxs])).tolist()

        tr_data = dataX.iloc[tr_idxs]
        test_data = dataX.iloc[test_idxs]
        unused_data = dataX.iloc[omitted]

        # Split the source indices into train / valid in the fold's original proportion.
        n_train = int(len(tr_idxs) * len_train / (len_train + len_valid))
        train_ids.append(tr_idxs[:n_train])
        test_ids.append(test_idxs)
        valid_ids.append(tr_idxs[n_train:])

    new_train_ids = _fold_ids_to_frame(train_ids)
    new_test_ids = _fold_ids_to_frame(test_ids)
    new_valid_ids = _fold_ids_to_frame(valid_ids)

    # Check if there is data leakage between train, test and valid
    print("Checking intersections for data leakage...")
    for column in new_test_ids.columns:
        train_set = set(new_train_ids[column].dropna())
        test_set = set(new_test_ids[column].dropna())
        valid_set = set(new_valid_ids[column].dropna())
        if (train_set & test_set) or (valid_set & test_set) or (train_set & valid_set):
            raise Exception("ERROR! Intersection between train, test, and valid is not zero!")

    # The id frames are identical for every dataset name, so convert them once.
    train_out = convert_to_int(new_train_ids)
    test_out = convert_to_int(new_test_ids)
    valid_out = convert_to_int(new_valid_ids)

    for d in d2:
        print(d)
        dset_name = d
        name_parts = d.split('_')
        splits_path = 'data/covariate_shifted_data/semi_synthetic/' + str(c[0]) + '_' + str(c[1]) + '/' + name_parts[0] + '/' + name_parts[1] + '/' + name_parts[2] + '/'
        print(splits_path)

        # to_csv does not create missing directories.
        os.makedirs(splits_path, exist_ok=True)

        train_out.to_csv(splits_path + 'train_ids.csv', index=False)
        test_out.to_csv(splits_path + 'test_ids.csv', index=False)
        valid_out.to_csv(splits_path + 'valid_ids.csv', index=False)

        print("Done!")

Checking intersections for data leakage...
income_bias_0.1
data/covariate_shifted_data/semi_synthetic/1_2/income/bias/0.1/
Done!
income_bias_0.3
data/covariate_shifted_data/semi_synthetic/1_2/income/bias/0.3/
Done!
income_flip_0.1
data/covariate_shifted_data/semi_synthetic/1_2/income/flip/0.1/
Done!
income_flip_0.3
data/covariate_shifted_data/semi_synthetic/1_2/income/flip/0.3/
Done!
income_balanced_0.1
data/covariate_shifted_data/semi_synthetic/1_2/income/balanced/0.1/
Done!
income_balanced_0.3
data/covariate_shifted_data/semi_synthetic/1_2/income/balanced/0.3/
Done!
Checking intersections for data leakage...
income_bias_0.1
data/covariate_shifted_data/semi_synthetic/0.5_1.5/income/bias/0.1/
Done!
income_bias_0.3
data/covariate_shifted_data/semi_synthetic/0.5_1.5/income/bias/0.3/
Done!
income_flip_0.1
data/covariate_shifted_data/semi_synthetic/0.5_1.5/income/flip/0.1/
Done!
income_flip_0.3
data/covariate_shifted_data/semi_synthetic/0.5_1.5/income/flip/0.3/
Done!
income_balanced_0.1
da

In [None]:
# Scratch cell: checks that `+` concatenates two lists. The result is not assigned.
["a"] + ["b"]

In [36]:
# Inspect the smallest test id in each fold column.
new_test_ids.min()

x1     1
x2     0
x3     1
x4     0
x5     1
x6     1
x7     0
x8     1
x9     0
x10    1
dtype: int64

In [37]:
# Display the test-id frame (one column per fold, x1..x10).
new_test_ids

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,14490.0,34636.0,46133.0,7892.0,3020.0,40450.0,27403.0,2111.0,18180.0,40727.0
1,36751.0,11648.0,25231.0,45818.0,1625.0,28913.0,16572.0,43765.0,11159.0,47609.0
2,42807.0,5272.0,5502.0,8485.0,44802.0,25460.0,1551.0,7953.0,4178.0,38793.0
3,11302.0,5292.0,4721.0,34500.0,42161.0,42448.0,1976.0,19914.0,25368.0,365.0
4,6879.0,9458.0,33680.0,14285.0,36016.0,9945.0,1351.0,28680.0,15119.0,32634.0
...,...,...,...,...,...,...,...,...,...,...
2683,22719.0,34401.0,14548.0,15828.0,47356.0,30002.0,13501.0,18145.0,17365.0,10114.0
2684,19118.0,14974.0,43653.0,20265.0,29725.0,7819.0,24080.0,39502.0,10687.0,35245.0
2685,9943.0,20145.0,5483.0,29681.0,29641.0,37720.0,13451.0,9937.0,35328.0,46772.0
2686,42561.0,16755.0,40716.0,16525.0,25566.0,6520.0,7142.0,38984.0,16401.0,44795.0


In [50]:
# Drop the last row, add 1 to every id, cast float columns to int, then show each fold's smallest id.
convert_to_int(new_test_ids.iloc[:-1, :] + 1).min()

x1      6
x2      2
x3      4
x4     30
x5     14
x6      4
x7      6
x8      1
x9      3
x10    22
dtype: int32

In [46]:
def convert_to_int(df):
    """Return a copy of `df` with its float columns cast to int where possible.

    Args:
      df: The pandas dataframe to convert. It is not modified.

    Returns:
      A new dataframe. Each float column is replaced by its `astype(int)`
      version; a column whose cast raises ValueError (for example because it
      contains NaN) is left as float. Non-float columns are kept as they are.
    """
    result = df.copy()
    float_columns = list(result.select_dtypes(include=['float']).columns)
    for name in float_columns:
        try:
            as_int = result[name].astype(int)
        except ValueError:
            # Cast not possible: keep the original float column.
            continue
        result[name] = as_int
    return result

def dataset_distance(data1, data2):
    """Summarise two datasets along their shared first principal component.

    The two frames are stacked row-wise, a 2-component PCA is fitted on the
    stack, and only the first component is kept.

    Returns:
      (mean1, std1, mean2, std2): mean and standard deviation of the first
      component over the rows of `data1`, then over the rows of `data2`.
    """
    n_first, n_second = len(data1), len(data2)
    stacked = pd.concat([data1, data2], axis=0)

    first_component = PCA(n_components=2).fit_transform(stacked)[:, 0].reshape(-1, 1)

    # Rows keep the concat order: data1 first, then data2.
    first_part = first_component[:n_first]
    second_part = first_component[n_first:n_first + n_second]

    return np.mean(first_part), np.std(first_part), np.mean(second_part), np.std(second_part)

def create_shift(
    data,
    src_split=0.8,
    holdout=0.2,
    alpha=1,
    beta=2,
    kdebw=0.3,
    eps=0.001,
    random_state=None,
):
    r"""
    Creates covariate shift sampling of data into disjoint source and target sets.

    Let \mu and \sigma be the mean and the standard deviation of the first principal
    component retrieved by PCA on the whole data.
    The target is randomly sampled based on a Gaussian with mean = \mu and
    standard deviation = \sigma.
    The source is randomly sampled based on a Gaussian with mean = \mu + alpha and
    standard deviation = \sigma / beta.

    Args:
      data: [m, n] feature matrix (PCA is fitted with 2 components).
      src_split: share of the sampled rows that goes to the source set.
      holdout: share of all rows that is left out of both sets.
      alpha, beta: the parameters that distort the Gaussian used for source
        sampling along the first principal component.
      kdebw, eps: accepted for backward compatibility; not used by this
        implementation (no density ratios are computed).
      random_state: None (default) samples from NumPy's global RNG, exactly as
        before. An int seed or a `np.random.RandomState` makes both the sampling
        and the PCA reproducible.

    Returns:
      (source_ind, target_ind): two disjoint arrays of row positions into `data`,
      of sizes int(m * src_split * (1 - holdout)) and
      int(m * (1 - src_split) * (1 - holdout)).
    """
    m = np.shape(data)[0]
    source_size = int(m * src_split * (1 - holdout))
    target_size = int(m * (1 - src_split) * (1 - holdout))

    # Unseeded calls keep using the global NumPy RNG so existing behaviour is unchanged.
    if random_state is None:
        rng = np.random
        pca_random_state = None
    else:
        if isinstance(random_state, np.random.RandomState):
            rng = random_state
        else:
            rng = np.random.RandomState(random_state)
        pca_random_state = rng

    pca = PCA(n_components=2, random_state=pca_random_state)
    pc2 = pca.fit_transform(data)
    pc = pc2[:, 0]
    pc = pc.reshape(-1, 1)

    pc_mean = np.mean(pc)
    pc_std = np.std(pc)

    # Source distribution: shifted by alpha and narrowed by beta.
    sample_mean = pc_mean + alpha
    sample_std = pc_std / beta

    # Per-row sampling probabilities, normalised to sum to 1.
    prob_s = norm.pdf(pc, loc=sample_mean, scale=sample_std)
    sum_s = np.sum(prob_s)
    prob_s = prob_s / sum_s
    prob_t = norm.pdf(pc, loc=pc_mean, scale=pc_std)
    sum_t = np.sum(prob_t)
    prob_t = prob_t / sum_t

    source_ind = rng.choice(
        range(m), size=source_size, replace=False, p=np.reshape(prob_s, (m))
    )

    # Zero out rows already taken by the source so the two sets stay disjoint,
    # then renormalise before sampling the target.
    pt_proxy = np.copy(prob_t)
    pt_proxy[source_ind] = 0
    pt_proxy = pt_proxy / np.sum(pt_proxy)
    target_ind = rng.choice(
        range(m), size=target_size, replace=False, p=np.reshape(pt_proxy, (m))
    )

    return source_ind, target_ind