In [36]:
import json
import os

import numpy as np
import pandas as pd 

from bin.dataset import Dataset
from bin.experiment import Experiment
from bin.metrics import Metrics

from collections import defaultdict

from models.LR import Lr
from models.reduction import Reduction
from models.reweight import Reweight
from models.fair_reduction import FairReduction

from scipy.special import xlog1py
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, f1_score, precision_score, accuracy_score, recall_score
from fairlearn.metrics import (
    MetricFrame, plot_model_comparison,
    selection_rate, demographic_parity_difference, demographic_parity_ratio,
    false_positive_rate, false_negative_rate,
    false_positive_rate_difference, false_negative_rate_difference,true_positive_rate, 
    equalized_odds_difference)

import warnings
warnings.filterwarnings('ignore')


In [37]:
# Registry of the classifiers trained in the clean-up run. The key is the
# short model name used in the prediction output path and in the "model"
# field of each metrics row.
model = dict(
    LR=Lr,
    Reduction=Reduction,
    Reweight=Reweight,
)

In [38]:
def read_config(path):
    """Parse the JSON config file at ``path``.

    Returns the decoded JSON object. If the file does not exist, a notice
    is printed and ``None`` is returned instead of raising. Any other
    error (e.g. malformed JSON) propagates to the caller.
    """
    try:
        with open(path) as handle:
            return json.load(handle)
    except FileNotFoundError:
        print("Config file not found.")
    return None

def load_csv(path):
    """Read the CSV file at ``path`` into a pandas DataFrame.

    A missing file is not fatal: a notice is printed and ``None`` is
    returned. Any other error raised by ``pd.read_csv`` propagates.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        print("CSV file not found.")
        return None

## Main

In [39]:
# Experiment configuration to run. Exactly one `exp_conf` assignment should be
# active; the commented-out lines are the alternative config paths.
# exp_conf = "configs/adult_noisy.json"
# exp_conf = "configs/compas_noisy.json"
exp_conf = "configs/synthetic_20_noisy.json"
# exp_conf = "configs/income_noisy.json"
# exp_conf = "configs/baseline_config.json"

# Parsed config as returned by read_config: the decoded JSON, or None when the
# file was not found (read_config prints a notice in that case).
EXP = read_config(exp_conf)

In [40]:
# Build a Dataset for each selected experiment in EXP and pre-compute its
# "fair" and "emp" probabilities.

# Experiment names (keys of EXP) to load in this run. Edit this list to switch
# experiments, e.g. ['adult_bias_0.1', 'adult_bias_0.3'] or ['COMPAS_bias_0.1'].
selected_experiments = ['synthetic_20_balanced_0.1']

datasets = {}
if EXP is None:
    # read_config returns None when the config file is missing; say so
    # explicitly instead of failing on EXP.items().
    print("No experiment config loaded; no datasets created.")
else:
    for name, value in EXP.items():
        if name not in selected_experiments:
            continue
        try:
            dataset = Dataset(value)
            dataset.calculate_probabilities("fair")
            dataset.calculate_probabilities("emp")
        except Exception as e:
            # Best-effort loading: report which experiment failed and carry on
            # with the remaining ones rather than aborting the whole loop.
            print(f"Skipping {name}: {type(e).__name__}: {e}")
            continue
        # Register only fully prepared datasets, so later cells never see a
        # Dataset whose probabilities were not computed.
        datasets[name] = dataset

In [41]:
# Display the loaded experiments (experiment name -> Dataset object).
datasets

{'synthetic_20_balanced_0.1': <bin.dataset.Dataset at 0x1674b8b90>}

In [42]:
# Clean-up run: for every loaded dataset and every fold, fit each classifier in
# `model` on the "fair" and "emp" training labels, save its test-set
# predictions, and collect performance + fairness metrics against every
# evaluation label set.

base_path = "results"
threshold = 0.5  # cut-off handed to process_preds and to the Metrics helpers

for exp_name, data_obj in datasets.items():
    print(exp_name)
    eval_labels = data_obj.eval_labels()
    meta = {"name": data_obj.name, "noise": data_obj.noise_type, "level": data_obj.noise_level}
    res = []  # every metrics row of this dataset, in evaluation order
    # Rows grouped by training label type. Each "<label_type>_cleanup.csv" is
    # written from its own group, so a file never contains rows that belong
    # to the other label type.
    res_by_label = defaultdict(list)
    pred_dict = defaultdict(dict)  # not used in this cell; kept in case later cells read it
    save_path = os.path.join(base_path, data_obj.name, data_obj.exp_name)

    for fold, _data in data_obj.foldwise_data.items():
        print(fold)
        train_data, test_data = _data['train'], _data['test']
        x_train = train_data.drop(data_obj.cols_to_drop, axis=1, errors='ignore')
        x_test = test_data.drop(data_obj.cols_to_drop, axis=1, errors='ignore')
        # Not used for fitting below: the models are trained on y_map instead.
        y_train, y_test = train_data[data_obj.label], test_data[data_obj.label]
        # Training targets per label type, derived from the probability columns.
        y_map = {
            "fair": data_obj.process_preds(train_data['prob'], threshold),
            "emp": data_obj.process_preds(train_data['emp_prob'], threshold)
        }
        sv_train, sv_test = train_data[data_obj.sensitive_attribute], test_data[data_obj.sensitive_attribute]

        for label_type in y_map.keys():
            for m_name, clf in model.items():
                _model = clf().fit(x_train, y_map[label_type], sv_train)
                y_pred = _model.predict(x_test)

                pred_path = os.path.join(base_path, "cleanup", data_obj.name, data_obj.exp_name, m_name, fold, label_type)
                os.makedirs(pred_path, exist_ok=True)
                pd.DataFrame(y_pred).to_csv(os.path.join(pred_path, "preds.csv"))

                for eval_type, _label in eval_labels.items():
                    # Build a fresh metadata dict per row (same key order as
                    # before) so rows never share one mutable `meta` object.
                    row_meta = {
                        **meta,
                        'fold': fold,
                        'model': m_name,
                        'label_type': label_type,
                        'eval_type': eval_type,
                    }
                    perf_dict = Metrics().performance_metrics(_label[fold], y_pred, threshold, row_meta)
                    fair_dict = Metrics().fairness_metrics(_label[fold], y_pred, sv_test, threshold=threshold, meta=perf_dict)
                    print(fair_dict)
                    res.append(fair_dict)
                    res_by_label[label_type].append(fair_dict)

            # Rewritten after every fold so partial results survive an
            # interrupted run; the final write holds all folds for this
            # label type.
            os.makedirs(save_path, exist_ok=True)
            pd.DataFrame(res_by_label[label_type]).to_csv(os.path.join(save_path, f"{label_type}_cleanup.csv"))

synthetic_20_balanced_0.1
x1
{'FNR_diff': 0.14527133216233706, 'FPR_diff': 0.025159489633173826, 'equalized_odds': 0.14527133216233712, 'demographic_parity': 0.08613877034929673, 'demographic_parity_ratio': 0.8376526660708965, 'Equal_Opportunity': 0.14527133216233712, 'accuracy': 0.807, 'f1_score': 0.8113391984359726, 'precision': 0.8217821782178217, 'recall': 0.8011583011583011, 'FNR': 0.19884169884169883, 'FPR': 0.18672199170124482, 'name': 'synthetic_20', 'noise': 'balanced', 'level': '0.1', 'fold': 'x1', 'model': 'LR', 'label_type': 'fair', 'eval_type': 'fair_clean'}
{'FNR_diff': 0.05529391243676951, 'FPR_diff': 0.11681818181818177, 'equalized_odds': 0.11681818181818177, 'demographic_parity': 0.08613877034929673, 'demographic_parity_ratio': 0.8376526660708965, 'Equal_Opportunity': 0.055293912436769566, 'accuracy': 0.483, 'f1_score': 0.48454636091724823, 'precision': 0.48118811881188117, 'recall': 0.4879518072289157, 'FNR': 0.5120481927710844, 'FPR': 0.5219123505976095, 'name': 'syn