In [1]:
import warnings; warnings.simplefilter('ignore')
import logging, sys
logging.disable(sys.maxsize)
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
import pickle
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from multi_freq_ldpy.pure_frequency_oracles.GRR import GRR_Client
from diffprivlib.mechanisms import Geometric
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.preprocessing import Reweighing

# Privacy budget (total epsilon) from 0.1 to 15

In [2]:
# ==========================================================
# CONFIGURATIONS
# ==========================================================

# How many times the experiment is repeated for each privacy budget.
nb_iter = 20

# Random seed handed to the classifier.
SEED = 42

# Output folder (relative path) where datasets and results are pickled.
folder_name = "RF_PreGEOGRR_Eps_0.01_15"

# Total privacy budgets: 0.1 .. 0.9 in steps of 0.1, then 1 .. 15 in steps of 1.
eps_values = [
    *np.round(np.arange(0.1, 1, 0.1), 2),
    *np.round(np.arange(1, 16, 1), 2),
]

# Name of the column to predict.
target = "ind_avg"

# Averaged results over all iterations, keyed by total privacy budget.
dico_result_all = dict()


In [3]:
# ==========================================================
# ITERATIONS
# ==========================================================
# For every total privacy budget in eps_values, repeat nb_iter times:
#   1. perturb the training split (Geometric mechanism on nb_spp / nb_spv,
#      GRR on motif) and recompute ind_prof from the perturbed counts,
#   2. pickle the resulting train/test split,
#   3. fit a random forest and score it on the untouched test split,
#   4. record overall, per-group and fairness metrics.
# The per-epsilon averages are stored in dico_result_all and pickled at the end.

# reading dataset
# -------------------------------------------------------
# The raw data and the test split do not depend on epsilon or on the
# iteration, so they are prepared once instead of on every pass.
df = pd.read_csv("dataset_classification.csv")
df["alert"] = pd.to_datetime(df["alert"])
df = df.sort_values(by="alert", ascending=True)

# divide train test
# -------------------------------------------------------
split_date = datetime(2021,7,1,0,0,0)
# .copy() gives independent frames, so the column assignments below never
# write into a view of df.
learn_clean = df.loc[df["alert"]<split_date].drop(columns="alert").copy()
test = df.loc[df["alert"]>=split_date].drop(columns="alert").copy()
X_test = test.drop(target, axis=1)
y_test = test[target]

# One "feat_<column>" result key per non-target column. The key set matches
# the original result schema; a key whose column is never a model feature
# (e.g. "alert", dropped above) stays empty and averages to NaN.
feature_keys = ["feat_"+f for f in set(df.columns)-{target}]

for eps_total in eps_values:

    # eps: the total budget is divided evenly between the three perturbed
    # variables (nb_spp, nb_spv, motif)
    eps_div = eps_total/3
    print(eps_total, eps_div)

    # dico results: one list per metric, filled with one value per iteration.
    # Keys that are never appended to below average to NaN in the summary.
    dico_result = {"acc":[],
                   "c0":[],
                   "c1":[],
                   "f1":[],
                   "acc-more_vol-0":[],
                   "acc-more_vol-0-0":[],
                   "acc-more_vol-0-1":[],
                   "acc-more_prof-1":[],
                   "acc-more_prof-1-0":[],
                   "acc-more_prof-1-1":[],
                   "disparate":[],
                   "smoothed":[],
                   "average_odds":[],
                   "equal_opportunity":[],
                   "mean_difference":[],
                   "consistency":[],
                  }
    # Feature-importance lists are created once per epsilon so that they
    # accumulate over ALL iterations (creating them inside the iteration
    # loop would discard every run but the last).
    dico_result.update({key: [] for key in feature_keys})

    for niter in range(nb_iter):

        name_iter = "ITER_"+str(niter)+"_"

        # fresh, un-perturbed copy of the training split for this run
        learn = learn_clean.copy()

        # applying DP
        # -------------------------------------------------------
        # Geometric: integer noise on the two count columns, clipped at 0
        sensitive_vars = [(eps_div, "nb_spp"), # eps , var
                          (eps_div, "nb_spv"), # eps , var
                         ]
        for eps, var in sensitive_vars:
            GEO_MECH = Geometric(epsilon=eps, sensitivity=1)
            learn[var] = [np.array(GEO_MECH.randomise(input_data)).clip(0) for input_data in learn[var]]

        # GRR
        # NOTE(review): k is the number of distinct values seen in the training
        # split; GRR_Client presumably expects inputs encoded as 0..k-1 - confirm.
        sensitive_vars = [(eps_div, "motif"), # eps , var # motif
                         ]
        for eps, var in sensitive_vars:
            k = len(learn[var].unique())
            learn[var] = [GRR_Client(input_data, k, eps) for input_data in learn[var]]

        # updating ind_prof from the perturbed counts (1 when nb_spv > nb_spp)
        learn["ind_prof"] = (learn["nb_spv"] > learn["nb_spp"]).astype(int)

        # saving train test xy dataset
        # -------------------------------------------------------
        X_train = learn.drop(target, axis=1)
        y_train = learn[target]
        best_data = {"X_train": X_train,
                     "y_train": y_train,
                     "X_test": X_test,
                     "y_test": y_test}
        dataset_path = folder_name + "/" + name_iter+"eps"+str(eps_total)+"_train_test_xy.dat"
        # context manager closes the handle; the data is used straight from
        # memory afterwards instead of being re-read from the file just written
        with open(dataset_path, "wb") as dataset_file:
            pickle.dump(best_data, dataset_file)

        # model statistics
        # -------------------------------------------------------
        # general
        params = {'n_estimators': 200, 'max_depth': 14, 'max_features': 0.8, 'max_samples': 0.95, 'class_weight': {0: 1, 1: 2}}
        model = RandomForestClassifier(random_state=SEED,
                                       n_jobs=-1)
        model.set_params(**params)
        model.fit(best_data["X_train"], best_data["y_train"])
        y_pred = model.predict(best_data["X_test"])

        # feature importance
        for f, importance in zip(best_data["X_train"].columns, model.feature_importances_):
            dico_result["feat_"+f].append(importance)

        # performance metrics of the model in general
        # c0 / c1: percentage of class-0 / class-1 samples predicted correctly
        acc = round(accuracy_score(best_data["y_test"],y_pred)*100, 2)
        cm = confusion_matrix(best_data["y_test"], y_pred)
        c0 = round(cm[0,0]/np.sum(cm[0])*100, 2)
        c1 = round(cm[1,1]/np.sum(cm[1])*100, 2)
        report = classification_report(best_data["y_test"], y_pred, output_dict=True)
        f1 = report["macro avg"]["f1-score"]
        dico_result["acc"].append(acc)
        dico_result["c0"].append(c0)
        dico_result["c1"].append(c1)
        dico_result["f1"].append(f1)

        # performance metrics of the model by group
        dfout = pd.DataFrame()
        dfout["ind_prof"] = best_data["X_test"]["ind_prof"]
        dfout["y_test"] = best_data["y_test"].values
        dfout["y_pred"] = y_pred
        for cat_name, group_value in [("more_vol", 0),
                                      ("more_prof", 1)]:
            dfaux = dfout.loc[dfout["ind_prof"]==group_value]
            acc = round(accuracy_score(dfaux["y_test"].values, dfaux["y_pred"].values)*100, 2)
            # labels=[0, 1] keeps the matrix 2x2 even when a group contains a
            # single class, so cm[1,1] cannot raise IndexError
            cm = confusion_matrix(dfaux["y_test"].values, dfaux["y_pred"].values, labels=[0, 1])
            c0 = round(cm[0,0]/np.sum(cm[0])*100, 2)
            c1 = round(cm[1,1]/np.sum(cm[1])*100, 2)
            dico_result["acc-"+cat_name+"-"+str(group_value)].append(acc)
            dico_result["acc-"+cat_name+"-"+str(group_value)+"-0"].append(c0)
            dico_result["acc-"+cat_name+"-"+str(group_value)+"-1"].append(c1)

        # disparate impact: share of y_pred == 1 in the ind_prof == 0 group
        # divided by the same share in the ind_prof == 1 group.
        # Series.mean() yields numpy floats, so an empty group or a zero
        # denominator gives nan/inf instead of raising ZeroDivisionError.
        unpriv_ratio = (dfout.loc[dfout["ind_prof"]==0, "y_pred"]==1).mean()
        priv_ratio = (dfout.loc[dfout["ind_prof"]==1, "y_pred"]==1).mean()
        disparate_impact = unpriv_ratio/priv_ratio
        dico_result["disparate"].append(disparate_impact)

        # SEDF (smoothed empirical differential fairness)
        # NOTE(review): favorable_label is 0 here whereas the disparate impact
        # above counts y_pred == 1 - confirm this is intended.
        privileged_groups = [{'ind_prof': 1}]
        unprivileged_groups = [{'ind_prof': 0}]
        ds = best_data['X_test'].copy()
        ds["y_pred"] = y_pred
        binaryLabelDataset = BinaryLabelDataset(favorable_label=0,
                                                unfavorable_label=1,
                                                df=ds,
                                                label_names=['y_pred'],
                                                protected_attribute_names=['ind_prof'])
        metric = BinaryLabelDatasetMetric(binaryLabelDataset,
                                         unprivileged_groups=unprivileged_groups,
                                         privileged_groups=privileged_groups)
        smoothed = metric.smoothed_empirical_differential_fairness()
        dico_result["smoothed"].append(smoothed)

    # saving final results per epsilon: mean over iterations, 2 decimals
    dico_result_all[eps_total] = {key: round(np.mean(values), 2) for key, values in dico_result.items()}

# saving all results
with open(folder_name + "/" + "dico_result_all.dat", "wb") as result_file:
    pickle.dump(dico_result_all, result_file)

0.1 0.03333333333333333
0.2 0.06666666666666667
0.3 0.09999999999999999
0.4 0.13333333333333333
0.5 0.16666666666666666
0.6 0.19999999999999998
0.7 0.2333333333333333
0.8 0.26666666666666666
0.9 0.3
1 0.3333333333333333
2 0.6666666666666666
3 1.0
4 1.3333333333333333
5 1.6666666666666667
6 2.0
7 2.3333333333333335
8 2.6666666666666665
9 3.0
10 3.3333333333333335
11 3.6666666666666665
12 4.0
13 4.333333333333333
14 4.666666666666667
15 5.0
