In [5]:
import warnings; warnings.simplefilter('ignore')
import logging, sys
logging.disable(sys.maxsize)
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
import os
import pickle
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from multi_freq_ldpy.pure_frequency_oracles.GRR import GRR_Client
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric

# Parameters

In [6]:
# Seed shared by the random forest (random_state) and the hyperopt search (rstate).
SEED: int = 42
# Name of the label column that is split off from the feature matrix.
target: str = "ind_avg"
# Directory that receives the results CSV and every pickle written by this notebook.
folder_name: str = "RF_Base"

# Writing function

In [5]:
def write(folder_name, values):
    """Append one row to ``<folder_name>/<folder_name>_results.csv``.

    Parameters
    ----------
    folder_name : str
        Output directory; also used as the prefix of the CSV file name.
        The directory is created if it does not exist yet.
    values : iterable
        Cells of the row, handed unchanged to ``csv.writer.writerow``.
    """
    # Opening a file in append mode fails when its directory is missing,
    # so make sure the directory is there first.
    os.makedirs(folder_name, exist_ok=True)
    results_path = os.path.join(folder_name, folder_name + '_results.csv')
    # The context manager closes the file; no explicit close() is needed.
    with open(results_path, mode='a', newline='') as scores_file:
        scores_writer = csv.writer(scores_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        scores_writer.writerow(values)

# Objective function

In [5]:
def objective_function(space):
    """Train and evaluate one random forest for a single hyperopt trial.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test``,
    ``SEED`` and ``folder_name`` and increments the module-level ``ITER``.

    Parameters
    ----------
    space : dict
        Sampled hyperparameters with the keys ``n_estimators``, ``max_depth``,
        ``max_features`` and ``max_samples``.

    Returns
    -------
    dict
        ``{'loss': -loss, 'status': STATUS_OK}``. The score is negated so that
        minimising the returned value maximises ``loss``.

    Side effects
    ------------
    Appends one row to the results CSV through ``write`` and pickles the
    fitted model, its parameters and the test predictions to
    ``<folder_name>/ITER_<ITER>.dat``.
    """
    global SEED, ITER, X_train, y_train, X_test, y_test

    ITER += 1

    # hp.quniform yields floats: cast the integer-valued parameters and round
    # the fractions to the grid step. Class weights are fixed, not tuned.
    params = {'n_estimators': int(space["n_estimators"]),
              'max_depth': int(space["max_depth"]),
              'max_features': round(space["max_features"],2),
              'max_samples': round(space["max_samples"],2),
              'class_weight': {0:1, 1:2}
              }

    print("\n------------------------------------------")
    print("\n", ITER, ":: ", params)

    model = RandomForestClassifier(random_state=SEED,
                                   n_jobs=-1
                                )
    model.set_params(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = round(accuracy_score(y_test,y_pred)*100, 2)
    cm = confusion_matrix(y_test, y_pred)
    # Share (in %) of each true class that was predicted correctly:
    # diagonal entry divided by its row total.
    c0 = round(cm[0,0]/np.sum(cm[0])*100, 2)
    c1 = round(cm[1,1]/np.sum(cm[1])*100, 2)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Trials in which both classes exceed 70% are scored with the (much larger)
    # product; all others fall back to the sum.
    # NOTE(review): some recorded outputs (e.g. iteration 2: c0 73.59, c1 80.0,
    # loss 2943.6) do not match this rule, so they were presumably produced by
    # an earlier version of the scoring -- confirm before comparing runs.
    if c0>70 and c1>70:
        loss = c0*c1
    else:
        loss = c0+c1

    # One CSV row per trial; column order matches the header written before the search.
    write(folder_name, [str(ITER),
           loss,
           acc,
           report["macro avg"]["f1-score"],
           report["macro avg"]["precision"],
           report["macro avg"]["recall"],
           c0,
           c1,
           cm,
           classification_report(y_test, y_pred),
           params])

    # Persist everything needed to re-analyse this trial later. The context
    # manager guarantees the file is flushed and closed, and the path uses a
    # single separator so it matches the path used when the file is reloaded.
    dico_files = {"model": model,
                  "params": params,
                  "y_pred": y_pred}
    with open(folder_name + "/" + "ITER_"+str(ITER)+".dat", "wb") as model_file:
        pickle.dump(dico_files, model_file)
    del dico_files

    print("loss", loss)
    print("acc", acc)
    print("f1", report["macro avg"]["f1-score"])
    print("cm", cm)
    print("c0", c0, "c1", c1)

    return {'loss':-loss, 'status': STATUS_OK}

# Preprocessing data

## Reading data

In [4]:
# Load the raw table, parse the "alert" column as datetimes and order the rows by it.
df = (
    pd.read_csv("dataset_classification.csv")
    .assign(alert=lambda frame: pd.to_datetime(frame["alert"]))
    .sort_values(by="alert", ascending=True)
)
print("shape:", df.shape)

shape: (106520, 34)


## Splitting into learn and test sets

In [12]:
# Rows alerted before the cutoff form the learning set, rows from the cutoff on
# form the test set. The timestamp only drives the split and is not kept as a column.
split_date = datetime(2021,7,1,0,0,0)
learn = df.loc[df["alert"] < split_date].drop(columns="alert")
test = df.loc[df["alert"] >= split_date].drop(columns="alert")
print(learn.shape, test.shape)

(90759, 33) (15761, 33)


In [14]:
# Separate the label column (named by `target`) from the features in both sets.
X_train, y_train = learn.drop(columns=target), learn[target]
X_test, y_test = test.drop(columns=target), test[target]

In [16]:
# Persist the exact train/test split in the output folder so it can be reloaded
# together with a saved model for the analysis of the best trial.
dico_dataset = {"X_train": X_train,
                "y_train": y_train,
                "X_test": X_test,
                "y_test": y_test}
# This is the first write into the output folder, so create it if needed.
os.makedirs(folder_name, exist_ok=True)
# The context manager flushes and closes the file once the dump is done.
with open(folder_name + "/" + "train_test_xy.dat", "wb") as dataset_file:
    pickle.dump(dico_dataset, dataset_file)
del dico_dataset

# Searching best model

In [11]:
# Search space: each hyperparameter is sampled with hp.quniform(label, low, high, step).
# Class weights are not part of the search; objective_function sets them itself.
quniform_bounds = {
    'n_estimators': (200, 600, 20),
    'max_depth': (10, 15, 1),
    'max_features': (0.5, 1, 0.05),
    'max_samples': (0.5, 1, 0.05),
}
space = {
    label: hp.quniform(label, low, high, step)
    for label, (low, high, step) in quniform_bounds.items()
}

# Column names of the results CSV, in the order objective_function writes its rows.
header = [
    "iter", "loss", "acc",
    "f1-score", "precision", "recall",
    "c0", "c1", "cm",
    "report", "params",
]
write(folder_name, header)

# Evaluation counter that objective_function increments on every call.
ITER = 0
trials = Trials()
best = fmin(
    fn=objective_function,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(SEED),
    verbose=False,
)


------------------------------------------

 1 ::  {'n_estimators': 520, 'max_depth': 13, 'max_features': 0.95, 'max_samples': 0.8, 'class_weight': {0: 1, 1: 2}}
loss 5986.9130000000005
acc 77.2
f1 0.7693701815110343
cm [[6920 2167]
 [1427 5247]]
c0 76.15 c1 78.62

------------------------------------------

 2 ::  {'n_estimators': 320, 'max_depth': 10, 'max_features': 0.95, 'max_samples': 0.8, 'class_weight': {0: 1, 1: 2}}
loss 2943.6000000000004
acc 76.3
f1 0.7612763960643543
cm [[6687 2400]
 [1335 5339]]
c0 73.59 c1 80.0

------------------------------------------

 3 ::  {'n_estimators': 260, 'max_depth': 11, 'max_features': 0.8, 'max_samples': 0.55, 'class_weight': {0: 1, 1: 2}}
loss 2980.2324000000003
acc 76.86
f1 0.7665810733699183
cm [[6791 2296]
 [1351 5323]]
c0 74.73 c1 79.76

------------------------------------------

 4 ::  {'n_estimators': 360, 'max_depth': 14, 'max_features': 0.8, 'max_samples': 0.6, 'class_weight': {0: 1, 1: 2}}
loss 3004.7702000000004
acc 77.64
f1 0.7

loss 2984.4408000000003
acc 77.52
f1 0.7710784165721166
cm [[7167 1920]
 [1623 5051]]
c0 78.87 c1 75.68

------------------------------------------

 33 ::  {'n_estimators': 520, 'max_depth': 14, 'max_features': 0.8, 'max_samples': 0.65, 'class_weight': {0: 1, 1: 2}}
loss 3003.7376
acc 77.6
f1 0.7724469268345734
cm [[7095 1992]
 [1539 5135]]
c0 78.08 c1 76.94

------------------------------------------

 34 ::  {'n_estimators': 400, 'max_depth': 10, 'max_features': 0.65, 'max_samples': 0.8, 'class_weight': {0: 1, 1: 2}}
loss 2950.4230000000002
acc 76.31
f1 0.7616367410307743
cm [[6628 2459]
 [1275 5399]]
c0 72.94 c1 80.9

------------------------------------------

 35 ::  {'n_estimators': 260, 'max_depth': 13, 'max_features': 0.8, 'max_samples': 0.5, 'class_weight': {0: 1, 1: 2}}
loss 6000.0435
acc 77.45
f1 0.7713392277825436
cm [[7031 2056]
 [1498 5176]]
c0 77.37 c1 77.55

------------------------------------------

 36 ::  {'n_estimators': 360, 'max_depth': 12, 'max_features': 0.8, 

# Analysis of best model

In [15]:
# Trial whose saved artefacts are analysed below.
# NOTE(review): hard-coded; presumably picked by inspecting the results CSV -- confirm.
best_iter = "43"

# pickle.load can execute arbitrary code: only load files written by this notebook.
# Context managers make sure both files are closed after reading.
with open(folder_name + "/" + "ITER_"+str(best_iter)+".dat", "rb") as model_file:
    best_data = pickle.load(model_file)
with open(folder_name + "/" + "train_test_xy.dat", "rb") as dataset_file:
    best_data.update(pickle.load(dataset_file))

# Explicit copy: the columns added below must go into an independent frame,
# not into a slice of the stored X_test.
dfout = best_data["X_test"][["ind_prof"]].copy()
dfout["y_test"] = best_data["y_test"].values
dfout["y_pred"] = best_data["y_pred"]

## Accuracy

In [16]:
# Overall test-set metrics of the selected model.
y_true_all = dfout["y_test"]
y_pred_all = dfout["y_pred"]

acc = round(accuracy_score(y_true_all, y_pred_all)*100, 2)
cm = confusion_matrix(y_true_all, y_pred_all)
# Share (in %) of each true class predicted correctly: diagonal over row total.
c0 = round(cm[0,0]/np.sum(cm[0])*100, 2)
c1 = round(cm[1,1]/np.sum(cm[1])*100, 2)
report = classification_report(y_true_all, y_pred_all, output_dict=True)
macro_f1 = report["macro avg"]["f1-score"]

# Collected here; the cells further down add more entries and pickle the dict.
dico_metrics = {"acc": acc,
                "f1": macro_f1,
                "c0": c0,
                "c1": c1}

print("accuracy:", acc)
print("accuracy class 0:", c0, "accuracy class 1:", c1)
print("f1 score:", macro_f1)

accuracy: 77.52
accuracy class 0: 77.39 accuracy class 1: 77.7
f1 score: 0.7720779478795234


In [22]:
# Same metrics as above, computed separately for each value of the protected
# attribute "ind_prof" and stored under the group's display name.
protected_groups = {"More vol": 0, "More prof": 1}

for cat_name, ind_prof_value in protected_groups.items():
    print("Protected:", cat_name)
    print("------------------------------------")

    dfaux = dfout.loc[dfout["ind_prof"]==ind_prof_value]
    group_true = dfaux["y_test"].values
    group_pred = dfaux["y_pred"].values

    acc = round(accuracy_score(group_true, group_pred)*100, 2)
    cm = confusion_matrix(group_true, group_pred)
    # Share (in %) of each true class predicted correctly within the group.
    c0 = round(cm[0,0]/np.sum(cm[0])*100, 2)
    c1 = round(cm[1,1]/np.sum(cm[1])*100, 2)

    print("accuracy : ", acc)
    print("accuracy class 0: ", c0)
    print("accuracy class 1: ", c1)
    print("\n")

    dico_metrics[cat_name] = {"acc": acc, "c0": c0, "c1": c1}

Protected: More vol
------------------------------------
accuracy :  77.48
accuracy class 0:  64.63
accuracy class 1:  87.66


Protected: More prof
------------------------------------
accuracy :  77.58
accuracy class 0:  87.92
accuracy class 1:  42.97




## Disparate impact

In [23]:
# Disparate impact: rate of y_pred == 1 in the unprivileged group (ind_prof == 0)
# divided by the same rate in the privileged group (ind_prof == 1).
# NOTE(review): y_pred == 1 is the outcome counted here, whereas the
# BinaryLabelDataset cell declares favorable_label=0 -- confirm which label is
# meant to be the favourable one.
unpriv_df = dfout.loc[dfout["ind_prof"] == 0]
priv_df = dfout.loc[dfout["ind_prof"] == 1]
unpriv_total, priv_total = len(unpriv_df), len(priv_df)

# Plain Python ints so that an empty group still raises ZeroDivisionError below.
unpriv_outcomes = int((unpriv_df["y_pred"] == 1).sum())
priv_outcomes = int((priv_df["y_pred"] == 1).sum())

unpriv_ratio = unpriv_outcomes / unpriv_total
priv_ratio = priv_outcomes / priv_total

disparate_impact = unpriv_ratio / priv_ratio
dico_metrics["disparate"] = disparate_impact

print("Disparate_impact", disparate_impact)

Disparate_impact 3.363926945787856


## Smoothed empirical differential fairness (SEDF)

In [25]:
# Group definitions on the protected attribute, in the format aif360 expects.
privileged_groups = [{'ind_prof': 1}] 
unprivileged_groups = [{'ind_prof': 0}] 

# Work on a copy so the stored X_test is not altered by the added prediction column.
ds = best_data['X_test'].copy()
ds["y_pred"] = best_data["y_pred"]
binaryLabelDataset = BinaryLabelDataset(favorable_label=0,
                                        unfavorable_label=1,
                                        df=ds,
                                        label_names=['y_pred'],
                                        protected_attribute_names=['ind_prof'])
metric = BinaryLabelDatasetMetric(binaryLabelDataset, 
                                 unprivileged_groups=unprivileged_groups,
                                 privileged_groups=privileged_groups)

# Compute the metric once and reuse it for both the printout and the stored value.
sedf = metric.smoothed_empirical_differential_fairness()
print("SEDF", sedf)
dico_metrics["smoothed"] = sedf

# Save all collected metrics; the context manager flushes and closes the file.
with open(folder_name + "/" + "dico_metrics.dat", "wb") as metrics_file:
    pickle.dump(dico_metrics, metrics_file)

SEDF 1.2128363675562777
