In [19]:
import warnings; warnings.simplefilter('ignore')
import logging, sys
logging.disable(sys.maxsize)
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
import pickle
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from multi_freq_ldpy.pure_frequency_oracles.GRR import GRR_Client
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.preprocessing import Reweighing

# Parameters

In [20]:
# Directory prefix for the pickled artifacts this notebook writes and reads back.
# NOTE(review): no visible cell creates this directory — presumably it must already exist; confirm.
folder_name = "RF_Base"
# Name of the label column; it is separated from the feature columns before training.
target = "ind_avg"
# Seed handed to the random forest as random_state.
SEED = 42

# Preprocessing data

## Reading data

In [21]:
# Load the raw table, parse the "alert" column as timestamps and order the rows
# chronologically; the learn/test split relies on the "alert" values.
df = (
    pd.read_csv("dataset_classification.csv")
    .assign(alert=lambda frame: pd.to_datetime(frame["alert"]))
    .sort_values(by="alert", ascending=True)
)
print("shape:", df.shape)

shape: (106520, 34)


## Dividing into learn and test

In [22]:
# Temporal hold-out: rows alerted strictly before the cut-off form the learning
# set, rows at or after it form the test set. The timestamp column itself is
# dropped from both partitions once it has served as the split key.
split_date = datetime(2021, 7, 1, 0, 0, 0)
learn = df.loc[df["alert"] < split_date].drop(columns="alert")
test = df.loc[df["alert"] >= split_date].drop(columns="alert")
print(learn.shape, test.shape)

(90759, 33) (15761, 33)


## Applying Reweighing

In [23]:
# Wrap the learning set as an AIF360 dataset: label column is `target`
# (0 declared favorable, 1 unfavorable) and 'ind_prof' is the protected attribute.
binaryLabelDataset = BinaryLabelDataset(favorable_label=0,
                                        unfavorable_label=1,
                                        df=learn,
                                        label_names=[target],
                                        protected_attribute_names=['ind_prof'])
# ind_prof == 1 is treated as the privileged group, ind_prof == 0 as unprivileged.
privileged_groups_params = dict(
    privileged_groups=[{"ind_prof": 1}],
    unprivileged_groups=[{"ind_prof": 0}]
)
rew = Reweighing(**privileged_groups_params)
# The transformed dataset's `instance_weights` are later passed to the model
# as `sample_weight`.
ds_transf_train = rew.fit_transform(binaryLabelDataset)
# Context manager guarantees the file is flushed and closed before any later
# cell reads it back.
with open(folder_name + "/" + "ds_transf_train.dat", "wb") as fh:
    pickle.dump(ds_transf_train, fh)

## Separating features and target

In [24]:
# Split each partition into its feature columns (everything except `target`)
# and its label column.
X_train, y_train = learn.drop(columns=target), learn[target]
X_test, y_test = test.drop(columns=target), test[target]

In [25]:
# Bundle the four train/test pieces under the keys the analysis cell reads back.
dico_dataset = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
# Context manager closes (and therefore flushes) the file deterministically.
with open(folder_name + "/" + "train_test_xy.dat", "wb") as fh:
    pickle.dump(dico_dataset, fh)
del dico_dataset

# Analysis of the model

In [26]:
# Hyper-parameters applied to the random forest below.
# NOTE(review): presumably the outcome of an earlier tuning run — confirm provenance.
params = {'n_estimators': 200, 'max_depth': 14, 'max_features': 0.8, 'max_samples': 0.95, 'class_weight': {0: 1, 1: 2}}

# Reload the artifacts written by the earlier cells of this notebook.
# pickle.load executes arbitrary code from the file, so only ever point these
# paths at files this notebook produced itself.
best_data = {}
with open(folder_name + "/" + "train_test_xy.dat", "rb") as fh:
    best_data.update(pickle.load(fh))
with open(folder_name + "/" + "ds_transf_train.dat", "rb") as fh:
    best_data["ds_transf_train"] = pickle.load(fh)

model = RandomForestClassifier(random_state=SEED,
                               n_jobs=-1)
model.set_params(**params)
# The reweighing instance weights are supplied as per-sample weights; note that
# `class_weight` from `params` is in effect as well, so both weightings apply.
model.fit(best_data["X_train"], best_data["y_train"],
          sample_weight=best_data["ds_transf_train"].instance_weights)
y_pred = model.predict(best_data["X_test"])

# Global test-set metrics, expressed in percent and rounded to two decimals.
y_true = best_data["y_test"]
acc = round(accuracy_score(y_true, y_pred)*100, 2)

# "accuracy class k" is the share of rows whose true label is k that were
# predicted as k, i.e. the diagonal of the confusion matrix over its row total.
cm = confusion_matrix(y_true, y_pred)
row_totals = cm.sum(axis=1)
c0 = round(cm[0, 0]/row_totals[0]*100, 2)
c1 = round(cm[1, 1]/row_totals[1]*100, 2)

report = classification_report(y_true, y_pred, output_dict=True)
f1_macro = report["macro avg"]["f1-score"]

print("accuracy:", acc)
print("accuracy class 0:", c0, "accuracy class 1:", c1)
print("f1 score:", f1_macro)

# Collected here; the fairness cell adds more entries and pickles the dict.
dico_metrics = {
    "acc": acc,
    "f1": f1_macro,
    "c0": c0,
    "c1": c1,
}

accuracy: 75.06
accuracy class 0: 70.68 accuracy class 1: 81.02
f1 score: 0.7495461388436626


In [27]:
# One row per test sample: protected attribute, true label and predicted label.
# The index is inherited from X_test; labels are attached positionally.
dfout = best_data["X_test"][["ind_prof"]].assign(
    y_test=best_data["y_test"].values,
    y_pred=y_pred,
)

print("\n# ================================================")
diff = 0  # not read anywhere in this cell; kept in case another cell relies on it
# Repeat the accuracy / per-class accuracy computation separately for each
# value of the protected attribute `ind_prof`.
for cat_name, group_value in [("More vol", 0),
                              ("More prof", 1)]:
    print("\nProtected:", cat_name)
    print("------------------------------------")
    dfaux = dfout.loc[dfout["ind_prof"]==group_value]
    acc = round(accuracy_score(dfaux["y_test"].values, dfaux["y_pred"].values)*100, 2)
    # Pin the label order so the matrix is always 2x2: without `labels`, a group
    # whose rows contain only one class yields a 1x1 matrix and cm[1,1] raises
    # IndexError.
    cm = confusion_matrix(dfaux["y_test"].values, dfaux["y_pred"].values, labels=[0, 1])
    c0 = round(cm[0,0]/np.sum(cm[0])*100, 2)
    c1 = round(cm[1,1]/np.sum(cm[1])*100, 2)
    print("accuracy : ", acc)
    print("accuracy class 0: ", c0)
    print("accuracy class 1: ", c1)
    dico_metrics[cat_name] = {"acc": acc,
                              "c0": c0,
                              "c1": c1,
                             }
    
print("\n# ================================================")
# Disparate impact as computed here: rate of y_pred == 1 in the ind_prof == 0
# group divided by the rate of y_pred == 1 in the ind_prof == 1 group.
# NOTE(review): the AIF360 datasets in this notebook declare 0 as the favorable
# label, whereas this ratio counts predictions equal to 1 — confirm which
# outcome the ratio is meant to track.
unpriv_df = dfout.loc[dfout["ind_prof"] == 0]
priv_df = dfout.loc[dfout["ind_prof"] == 1]
unpriv_total, priv_total = len(unpriv_df), len(priv_df)

unpriv_outcomes = int((unpriv_df["y_pred"] == 1).sum())
priv_outcomes = int((priv_df["y_pred"] == 1).sum())

unpriv_ratio = unpriv_outcomes / unpriv_total
priv_ratio = priv_outcomes / priv_total

disparate_impact = unpriv_ratio / priv_ratio
dico_metrics["disparate"] = disparate_impact

print("\nDisparate_impact", disparate_impact)

print("\n# ================================================")
privileged_groups = [{'ind_prof': 1}]
unprivileged_groups = [{'ind_prof': 0}]
# Test features with the model's predictions attached as the label column, so
# the fairness metric is evaluated on predicted (not true) outcomes.
ds = best_data['X_test'].copy()
ds["y_pred"] = y_pred
# This rebinds `binaryLabelDataset`, which the reweighing cell used for the
# learning set.
binaryLabelDataset = BinaryLabelDataset(favorable_label=0,
                                        unfavorable_label=1,
                                        df=ds,
                                        label_names=['y_pred'],
                                        protected_attribute_names=['ind_prof'])
metric = BinaryLabelDatasetMetric(binaryLabelDataset,
                                 unprivileged_groups=unprivileged_groups,
                                 privileged_groups=privileged_groups)
# Compute the metric once and reuse it for both the printout and the stored value.
sedf = metric.smoothed_empirical_differential_fairness()
print("\nSEDF", sedf)
dico_metrics["smoothed"] = sedf
# Context manager closes (and therefore flushes) the metrics file deterministically.
with open(folder_name + "/" + "dico_metrics.dat", "wb") as fh:
    pickle.dump(dico_metrics, fh)



Protected: More vol
------------------------------------
accuracy :  78.16
accuracy class 0:  69.55
accuracy class 1:  84.98

Protected: More prof
------------------------------------
accuracy :  70.6
accuracy class 0:  71.62
accuracy class 1:  67.18


Disparate_impact 1.6317883751240885


SEDF 0.48960472073187145
