<a href="https://colab.research.google.com/github/qahtanaa/OnSubGroupFairness/blob/main/Comparison_IR_EqualSize_OS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%reset -f

In [None]:
!pip install aif360



In [None]:
import numpy as np
import pandas as pd
from aif360.metrics import utils
from scipy.sparse import issparse
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import random
from sklearn.neighbors import NearestNeighbors
from sympy import Symbol
from sympy.solvers import solve
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from aif360.algorithms.preprocessing import *
from aif360.algorithms.preprocessing.optim_preproc_helpers import distortion_functions, opt_tools
from aif360.algorithms.inprocessing import *
from aif360.algorithms.postprocessing import *
import math
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier



---



---



---

DATASET

In [None]:
def preprocess_dataset(dataset_path, dataset_type, model):
    """
    Load one of the supported raw datasets together with its fairness configuration.

    dataset_path : (str) path of the raw CSV file to read
    dataset_type : (str) one of 'German', 'COMPAS' or 'Adult'
    model : (str) name of the classifier to use; returned unchanged

    Returns the tuple
    (df, sensitive_attributes, label, privileged, unprivileged,
     favorable_label, unfavorable_label, groups, model).

    Raises ValueError if dataset_type is not one of the supported names.
    """
    if dataset_type == 'German':
        df = pd.read_csv(dataset_path)
        # Binarize the two sensitive attributes directly on load.
        df['age'] = df['age'].apply(lambda age: 1 if age >= 25 else 0)
        df['personal_status'] = df['personal_status'].apply(lambda sex: 1 if sex == 'male' else 0)
        print("German dataset:")
        print(df.head())
        sensitive_attributes = ['personal_status','age']
        label = 'credit'
        privileged = [1, 1]
        unprivileged = [0, 0]
        favorable_label = 1
        unfavorable_label = 2
        groups = [
                  {'name': 'Male Adult', 'attributes': {'personal_status': 1, 'age': 1}},
                  {'name': 'Female Adult', 'attributes': {'personal_status': 0, 'age': 1}},
                  {'name': 'Male Young', 'attributes': {'personal_status': 1, 'age': 0}},
                  {'name': 'Female Young', 'attributes': {'personal_status': 0, 'age': 0}}
              ]

    elif dataset_type == 'COMPAS':
        df = pd.read_csv(dataset_path)
        selected_columns = ['sex', 'age_cat', 'race', 'juv_fel_count', 'juv_misd_count',
                            'juv_other_count', 'priors_count', 'c_charge_degree',
                            'c_charge_desc', 'two_year_recid']
        df = df[selected_columns]
        # Keep only the two race values used in the analysis.
        df = df[(df['race'] == 'Caucasian') | (df['race'] == 'African-American')].reset_index(drop=True)
        print("COMPAS dataset:")
        print(df.head())
        sensitive_attributes = ['race','sex']
        label = 'two_year_recid'
        privileged = ['Caucasian', 'Female']
        unprivileged = ['African-American', 'Male']
        favorable_label = 0
        unfavorable_label = 1
        groups = [
                  {'name': 'Caucasian Female', 'attributes': {'race': 1, 'sex': 1}},
                  {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 1}},
                  {'name': 'Causasian Male', 'attributes': {'race': 1, 'sex': 0}},
                  {'name': 'Black Male', 'attributes': {'race': 0, 'sex': 0}}
              ]

    elif dataset_type == 'Adult':
        # Read from the path supplied by the caller (previously a hard-coded file was used).
        df = pd.read_csv(dataset_path, delimiter=';')
        # Some rows carry a trailing '.' on the income value; normalize both spellings.
        df['income'] = df['income'].str.strip().replace({'>50K.': '>50K', '<=50K.': '<=50K'})
        # Strip surrounding whitespace from every string cell (column-wise element map).
        df = df.apply(lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x))
        # '?' marks a missing value in the raw file.
        df = df.replace('?', np.nan)
        df = df.drop(columns=['fnlwgt', 'education-num'])
        # Keep only the two race values used in the analysis.
        df = df[(df['race'] == 'White') | (df['race'] == 'Black')].reset_index(drop=True)
        print("Adult dataset:")
        print(df.head())
        sensitive_attributes = ['race','sex']
        label = 'income'
        privileged = ['White', 'Male']
        unprivileged = ['Black', 'Female']
        favorable_label = '>50K'
        unfavorable_label = '<=50K'
        groups = [
                  {'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}},
                  {'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}},
                  {'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}},
                  {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}}
              ]

    else:
        # Fail early with a clear message instead of an UnboundLocalError at the return.
        raise ValueError(
            f"Unsupported dataset_type {dataset_type!r}; expected 'German', 'COMPAS' or 'Adult'."
        )

    return df, sensitive_attributes, label, privileged, unprivileged, favorable_label, unfavorable_label, groups, model

In [None]:
##################################################################################
# Load the raw data and the matching fairness configuration.
# To run on another dataset, swap the first two arguments, e.g.:
#   preprocess_dataset('/content/raw_german_dataset.csv', 'German', <model>)
#   preprocess_dataset('/content/raw_compas_dataset.csv', 'COMPAS', <model>)
#   preprocess_dataset('/content/raw_adult_dataset.csv', 'Adult', <model>)
#
# Model names that appear in this notebook:
#   'Logistic Regression', 'Random Forest', 'Gradient Boosting'
(
    df,
    sensitive_attributes,
    label,
    privileged,
    unprivileged,
    favorable_label,
    unfavorable_label,
    groups,
    model,
) = preprocess_dataset('/content/raw_adult_dataset.csv', 'Adult', 'Logistic Regression')

Adult dataset:
   age         workclass  education      marital_status         occupation  \
0   39         State-gov  Bachelors       Never-married       Adm-clerical   
1   50  Self-emp-not-inc  Bachelors  Married-civ-spouse    Exec-managerial   
2   38           Private    HS-grad            Divorced  Handlers-cleaners   
3   53           Private       11th  Married-civ-spouse  Handlers-cleaners   
4   28           Private  Bachelors  Married-civ-spouse     Prof-specialty   

    relationship   race     sex  capital_gain  capital_loss  hours_per_week  \
0  Not-in-family  White    Male          2174             0              40   
1        Husband  White    Male             0             0              13   
2  Not-in-family  White    Male             0             0              40   
3        Husband  Black    Male             0             0              40   
4           Wife  Black  Female             0             0              40   

  native_country income  
0  United-State



---



---



---

DATA PREPARATION

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

class DataPreparation():
    """
    Prepare a tabular dataset for the subgroup oversampling experiments.

    `prepare` runs the full pipeline: drop rows with missing values, encode the
    label and the two sensitive attributes as 0/1, label-encode categorical
    features, assign every row to a (sensitive attributes, label) group, split
    into train/test stratified by group, and standardize numerical features.
    """
    def __init__(self, df, sensitive, label, priv, unpriv, fav, unfav, categorical=None):
        """
        Construct all necessary attributes for the data preparation.

        df : (pandas DataFrame) containing the data; a copy is stored so the caller's frame is never modified
        sensitive : (list(str)) specifying the column names of the two sensitive features
        label : (str) specifying the label column
        priv : (list) privileged value of each sensitive feature, in the same order as `sensitive`
        unpriv : (list) unprivileged value of each sensitive feature, in the same order as `sensitive`
        fav : (str/int/..) value representing the favorable label
        unfav : (str/int/..) value representing the unfavorable label
        categorical : (list(str)) (optional) specifying column names of categorical features
        """
        self.df = df.copy()
        self.sensitive = sensitive
        self.label = label
        self.priv = priv
        self.unpriv = unpriv
        self.fav = fav
        self.unfav = unfav
        # A fresh list per instance; a shared mutable default would leak between instances.
        self.categorical = [] if categorical is None else categorical

    @staticmethod
    def _encode_binary(series, mapping):
        """
        Map the values of `series` through `mapping`, leaving unmapped values untouched.

        When every resulting value is 0 or 1 the result is cast to int64, so the
        encoded column has a numeric dtype.
        """
        encoded = series.map(lambda value: mapping.get(value, value))
        if encoded.isin([0, 1]).all():
            encoded = encoded.astype('int64')
        return encoded

    def detect_missing_values(self):
        """
        Detect rows with missing values and remove them from the DataFrame.
        """
        initial_rows = len(self.df)
        self.df = self.df.dropna()
        removed_rows = initial_rows - len(self.df)

        if removed_rows > 0:
            print(f"Detected {removed_rows} rows with missing values. Removed them.")
        else:
            print("No missing values detected.")  # pass

    def binary_label(self):
        """
        Ensure the decision label and sensitive attributes are encoded as binary, where:
        - Favorable label and privileged groups are encoded as 1.
        - Unfavorable label and unprivileged groups are encoded as 0.

        Raises ValueError if `priv` or `unpriv` does not contain exactly two values.
        """
        if len(self.priv) != 2 or len(self.unpriv) != 2:
            raise ValueError("Both 'priv' and 'unpriv' must contain exactly two values.")

        number_label_values = self.df[self.label].nunique()
        if number_label_values == 2:
            print(f"The '{self.label}' column has only two unique values.")
            # Plain column assignment replaces the column, so the new integer dtype is adopted
            # (an in-place `.loc[:, col] = ...` write would keep the old dtype).
            self.df[self.label] = self._encode_binary(
                self.df[self.label], {self.unfav: 0, self.fav: 1})
        else:
            print(f"The '{self.label}' column does not have exactly two unique values, as it should.")

        # One privileged -> 1 / unprivileged -> 0 mapping per sensitive attribute.
        for position in (0, 1):
            column = self.sensitive[position]
            mapping = {self.priv[position]: 1, self.unpriv[position]: 0}
            self.df[column] = self._encode_binary(self.df[column], mapping)

    @staticmethod
    def _numeric_fraction(series):
        """Return the fraction of values in `series` that `float()` can parse (0.0 if empty)."""
        total = len(series)
        if total == 0:
            return 0.0
        parsable = 0
        for value in series:
            try:
                float(value)
                parsable += 1
            except (ValueError, TypeError):
                # Text that is not a number, or an object float() cannot handle.
                pass
        return parsable / total

    def find_categorical_attributes(self):
        """
        Identify categorical attributes and encode.

        A column is categorical when it is listed in `self.categorical`, has exactly
        two distinct values, or when at most 99% of its values parse as floats.
        Categorical columns are label-encoded; the remaining columns are cast to float.
        The 'Group' column, if present, is ignored.

        Returns (attribute_types, cat_features, numerical_features), where the two
        boolean lists follow the column order of the DataFrame without 'Group'.
        """
        thresh = 0.99
        # Work on column names so the result stays valid when 'Group' already exists.
        feature_columns = [column for column in self.df.columns if column != 'Group']

        self.attribute_types = {}
        for column in feature_columns:
            if column in self.categorical or self.df[column].nunique() == 2:
                self.attribute_types[column] = 'Categorical'
            elif self._numeric_fraction(self.df[column]) > thresh:
                self.attribute_types[column] = 'Numerical'
            else:
                self.attribute_types[column] = 'Categorical'

        # Boolean
        self.cat_features = [self.attribute_types[column] == 'Categorical' for column in feature_columns]
        self.numerical_features = [not feature for feature in self.cat_features]

        self.columns_categorical = pd.Index(
            [column for column, is_cat in zip(feature_columns, self.cat_features) if is_cat])
        self.columns_numerical = pd.Index(
            [column for column, is_cat in zip(feature_columns, self.cat_features) if not is_cat])

        encoder_dict = dict()
        for column in self.columns_categorical:
            le = LabelEncoder()
            # Replace the column so it takes the integer dtype of the encoded values.
            self.df[column] = le.fit_transform(self.df[column].values)
            encoder_dict[column] = dict(zip(le.classes_, range(len(le.classes_))))
        print(encoder_dict, 'encoder dict')
        self.encoder_dict = encoder_dict

        for column in self.columns_numerical:
            # Replace the column so the float dtype is actually applied.
            self.df[column] = self.df[column].astype(float)

        return self.attribute_types, self.cat_features, self.numerical_features

    def create_group_column(self):
        """
        Create a 'Group' column in the DataFrame based on the sensitive attributes and the label.

        Every combination of the observed sensitive-attribute values and label values
        gets an integer id, in the order produced by the cartesian product of the
        per-column unique values.

        Returns a dict mapping each group id to its (sensitive values..., label) tuple.
        """
        key_columns = self.sensitive + [self.label]
        group_combinations = pd.MultiIndex.from_product(
            [self.df[column].unique() for column in key_columns], names=key_columns)
        print(list(enumerate(group_combinations)))
        # Create a mapping between group combinations and their corresponding numbers
        group_mapping = {group: idx for idx, group in enumerate(group_combinations)}
        reverse_group_mapping = {idx: group for group, idx in group_mapping.items()}
        # Assign as a plain array: values are positional, no index alignment is intended.
        self.df['Group'] = pd.MultiIndex.from_frame(self.df[key_columns]).map(group_mapping).to_numpy()

        return reverse_group_mapping

    def train_test_split(self, test_size=0.3, random_state=None):
        """
        Split the data into train and test parts, stratified by the 'Group' column.

        test_size : (float) fraction of rows placed in the test set
        random_state : (int/None) seed for a reproducible shuffle; None keeps the split random

        Returns (X_train, y_train, X_test, y_test); X still contains the 'Group' column.
        """
        X = self.df.loc[:, self.df.columns != self.label]
        y = self.df[self.label]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, shuffle=True, stratify=self.df['Group'],
            random_state=random_state)

        return self.X_train, self.y_train, self.X_test, self.y_test

    def standardization_numerical(self):
        """
        Standardize the numerical features and append the label column.

        The scaler is fitted on the training split only and applied to both splits.
        Afterwards `X_train` and `X_test` each contain the label as their last column.

        Returns (X_train, X_test).
        """
        # Only scale columns that are present in the feature frames.
        numerical_columns = [column for column in self.columns_numerical
                             if column in self.X_train.columns]

        # Own copies: avoids writing into a slice of self.df.
        self.X_train = self.X_train.copy()
        self.X_test = self.X_test.copy()

        if numerical_columns:
            scaler = StandardScaler().fit(self.X_train[numerical_columns])
            self.X_train[numerical_columns] = scaler.transform(self.X_train[numerical_columns])
            self.X_test[numerical_columns] = scaler.transform(self.X_test[numerical_columns])

        self.X_train = pd.concat([self.X_train, self.y_train], axis=1)
        self.X_test = pd.concat([self.X_test, self.y_test], axis=1)
        return self.X_train, self.X_test

    def prepare(self):
        """
        Perform all preprocessing steps.

        The group-id mapping produced along the way is kept in `self.reverse_group_mapping`.
        Returns self.
        """
        self.detect_missing_values()
        self.binary_label()
        self.find_categorical_attributes()
        self.reverse_group_mapping = self.create_group_column()
        self.train_test_split()
        self.standardization_numerical()
        return self


In [None]:
##################################################################################
# Run the full preprocessing pipeline on the loaded dataframe.
data_prep = DataPreparation(
    df,
    sensitive_attributes,
    label,
    privileged,
    unprivileged,
    favorable_label,
    unfavorable_label,
)
data_prep.prepare()
data_prep.df = data_prep.df.reset_index(drop=True)

# Recompute the 'Group' column to obtain the group-id -> (sensitive values, label) mapping.
reverse_group_mapping = data_prep.create_group_column()
#theoretical_num_groups = len(reverse_group_mapping)

# Column-type information produced by the pipeline.
attribute_types = data_prep.attribute_types
cat_features = data_prep.cat_features
numerical_features = data_prep.numerical_features

# Train/test frames (label included); only the training frame gets a fresh index.
X_test = data_prep.X_test
X_train = data_prep.X_train.reset_index(drop=True)
print(X_train.head())

Detected 3316 rows with missing values. Removed them.
The 'income' column has only two unique values.
{'workclass': {'Federal-gov': 0, 'Local-gov': 1, 'Private': 2, 'Self-emp-inc': 3, 'Self-emp-not-inc': 4, 'State-gov': 5, 'Without-pay': 6}, 'education': {'10th': 0, '11th': 1, '12th': 2, '1st-4th': 3, '5th-6th': 4, '7th-8th': 5, '9th': 6, 'Assoc-acdm': 7, 'Assoc-voc': 8, 'Bachelors': 9, 'Doctorate': 10, 'HS-grad': 11, 'Masters': 12, 'Preschool': 13, 'Prof-school': 14, 'Some-college': 15}, 'marital_status': {'Divorced': 0, 'Married-AF-spouse': 1, 'Married-civ-spouse': 2, 'Married-spouse-absent': 3, 'Never-married': 4, 'Separated': 5, 'Widowed': 6}, 'occupation': {'Adm-clerical': 0, 'Armed-Forces': 1, 'Craft-repair': 2, 'Exec-managerial': 3, 'Farming-fishing': 4, 'Handlers-cleaners': 5, 'Machine-op-inspct': 6, 'Other-service': 7, 'Priv-house-serv': 8, 'Prof-specialty': 9, 'Protective-serv': 10, 'Sales': 11, 'Tech-support': 12, 'Transport-moving': 13}, 'relationship': {'Husband': 0, 'Not-

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Group'] = pd.MultiIndex.from_frame(self.df[self.sensitive + [self.label]]).map(group_mapping)


In [None]:
# Number of training rows per (sensitive attributes, label) combination.
group_keys = sensitive_attributes + [label]
group_sizes = X_train.groupby(group_keys).size()
grouped_counts = group_sizes.reset_index(name='count')

# Print the counts
print(grouped_counts)

   race  sex  income  count
0     0    0       0   1371
1     0    0       1     88
2     0    1       0   1215
3     0    1       1    286
4     1    0       0   7299
5     1    0       1   1019
6     1    1       0  12787
7     1    1       1   6126


In [None]:
# Names of the two sensitive attribute columns
sensitive_attr_0, sensitive_attr_1 = sensitive_attributes[0], sensitive_attributes[1]


def _subgroup_label_count(attr_0_value, attr_1_value, label_value):
    """Count the training rows with the given sensitive-attribute values and label value."""
    selection = (
        (X_train[sensitive_attr_0] == attr_0_value)
        & (X_train[sensitive_attr_1] == attr_1_value)
        & (X_train[label] == label_value)
    )
    return X_train[selection].shape[0]


# Rows with label 1 ("ones") and label 0 ("zeros") for each of the four subgroups
num_group_11_ones = _subgroup_label_count(1, 1, 1)
num_group_11_zeros = _subgroup_label_count(1, 1, 0)

num_group_10_ones = _subgroup_label_count(1, 0, 1)
num_group_10_zeros = _subgroup_label_count(1, 0, 0)

num_group_01_ones = _subgroup_label_count(0, 1, 1)
num_group_01_zeros = _subgroup_label_count(0, 1, 0)

num_group_00_ones = _subgroup_label_count(0, 0, 1)
num_group_00_zeros = _subgroup_label_count(0, 0, 0)

# Imbalance ratio = ones / zeros; infinite when a subgroup has no label-0 rows
imbalance_ratio_11 = float('inf') if num_group_11_zeros == 0 else num_group_11_ones / num_group_11_zeros
imbalance_ratio_10 = float('inf') if num_group_10_zeros == 0 else num_group_10_ones / num_group_10_zeros
imbalance_ratio_01 = float('inf') if num_group_01_zeros == 0 else num_group_01_ones / num_group_01_zeros
imbalance_ratio_00 = float('inf') if num_group_00_zeros == 0 else num_group_00_ones / num_group_00_zeros

# Report the ratios
print(f"Imbalance ratio for group (1, 1): {imbalance_ratio_11}")
print(f"Imbalance ratio for group (1, 0): {imbalance_ratio_10}")
print(f"Imbalance ratio for group (0, 1): {imbalance_ratio_01}")
print(f"Imbalance ratio for group (0, 0): {imbalance_ratio_00}")


Imbalance ratio for group (1, 1): 0.4790803159458825
Imbalance ratio for group (1, 0): 0.13960816550212357
Imbalance ratio for group (0, 1): 0.2353909465020576
Imbalance ratio for group (0, 0): 0.06418672501823487


In [None]:
# #################################################################################
# num_privileged_ones = X_train[(X_train[sensitive_attributes[0]] == 1) &
#                               (X_train[sensitive_attributes[1]] == 1) &
#                               (X_train[label] == 1)].shape[0]

# num_privileged_zeros = X_train[(X_train[sensitive_attributes[0]]) &
#                                (X_train[sensitive_attributes[1]] == 1) &
#                                (X_train[label] == 0)].shape[0]

# # Calculating the ratio of the most privileged class
# total_ratio = num_privileged_ones / num_privileged_zeros if num_privileged_zeros != 0 else float('inf')  # Avoid division by zero

# print(f"Ratio of most privileged class: {total_ratio}")

In [None]:
#################################################################################
# Keep the group id of every row before removing it from the feature frames.
subgroup_column_train, subgroup_column_test = X_train['Group'], X_test['Group']

# 'Group' is bookkeeping only, so take it out of both frames (new frames are created).
X_train = X_train.drop(columns='Group')
X_test = X_test.drop(columns='Group')

In [None]:
#distances = gower.gower_matrix(??, os_df, cat_features=cat_features)

In [None]:
!pip install gower





---

---



---

METHOD 3 - DBSCAN \\
For each group in the dataset: \\
- cluster the group with DBSCAN to find the points it labels as Core, Borderline, or Noise \\
- oversample the group using its Borderline points, to reach the imbalance ratio of the most privileged group



In [None]:
# find the eps
from gower import gower_matrix
from sklearn.cluster import DBSCAN
import math

# # Filter the dataset for one specific combination of sensitive attributes and labels
# filtered_data = X_train[(X_train[sensitive_attributes[0]] == 1) & (X_train[sensitive_attributes[1]] == 0) & (X_train[label] == 1)]

# # Calculate Gower distance matrix
# distance_matrix = gower_matrix(filtered_data, cat_features=cat_features)

# eps = 0.16
# # German: group (1,1,1), eps = 0.15 3 clusters
# #round(math.log(len(filtered_data)))
# # DBSCAN clustering
# dbscan = DBSCAN(eps=eps, min_samples=round(math.log(len(filtered_data))), metric='precomputed')  # Set appropriate values for eps and min_samples
# clusters = dbscan.fit_predict(distance_matrix)

# # Assign cluster labels to dataframe
# filtered_data['cluster'] = clusters

# print(round(math.log(len(filtered_data))))
# # Display the number of samples in each cluster
# cluster_counts = filtered_data.groupby(['cluster']).size()
# print("Number of samples in each cluster:")
# print(cluster_counts)

# # Get cluster labels
# labels = dbscan.labels_

# # Get core samples
# core_samples_mask = np.zeros_like(labels, dtype=bool)
# core_samples_mask[dbscan.core_sample_indices_] = True

# # Identify core, border, and noise points
# core_points = filtered_data[core_samples_mask]
# border_points = filtered_data[~core_samples_mask & (labels != -1)]
# noise_points = filtered_data[labels == -1]

# # For example, you can print the number of points in each category
# print("Number of core points:", len(core_points))
# print("Number of border points:", len(border_points))
# print("Number of noise points:", len(noise_points))


In [None]:
# # Function to find the optimal epsilon
# def find_optimal_epsilonn(filtered_data, cat_features, min_samples, eps_step=0.001, eps_min=0.01, eps_max=1.1):
#     distance_matrix = gower_matrix(filtered_data, cat_features=cat_features)

#     def cluster_count(eps):
#         dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
#         labels = dbscan.fit_predict(distance_matrix)
#         unique_labels = np.unique(labels)
#         n_clusters = len(unique_labels) #- (1 if -1 in unique_labels else 0)
#         return n_clusters, unique_labels

#     # Binary search for optimal epsilon
#     while eps_max - eps_min > eps_step:
#         eps_mid = (eps_min + eps_max) / 2
#         n_clusters_mid, labels_mid = cluster_count(eps_mid)
#         print(eps_mid, n_clusters_mid, labels_mid, 'mids')

#         if n_clusters_mid == 1:
#             if -1 in labels_mid:
#                 eps_min = eps_mid  # Only noise points, increase epsilon
#             else:
#                 eps_max = eps_mid  # Only core points, decrease epsilon
#         elif n_clusters_mid > 2:
#             eps_min = eps_mid  # More than two clusters, increase epsilon
#         else:
#             eps_max = eps_mid  # Exactly two clusters, continue search to fine-tune
#         print(eps_min, eps_max, 'min, max')
#     return eps_max

# # Filter the dataset for one specific combination of sensitive attributes and labels
# filtered_data = X_train[(X_train[sensitive_attributes[0]] == 0) &
#                         (X_train[sensitive_attributes[1]] == 1) &
#                         (X_train[label] == 1)]

# # Calculate min_samples
# min_samples = round(math.log(len(filtered_data)))

# # Find the optimal epsilon
# optimal_eps = find_optimal_epsilonn(filtered_data, cat_features, min_samples)

# # DBSCAN clustering with the optimal epsilon
# distance_matrix = gower_matrix(filtered_data, cat_features=cat_features)
# dbscan = DBSCAN(eps=optimal_eps, min_samples=min_samples, metric='precomputed')
# clusters = dbscan.fit_predict(distance_matrix)

# # Assign cluster labels to dataframe
# filtered_data['cluster'] = clusters

# # Display the number of samples in each cluster
# cluster_counts = filtered_data.groupby(['cluster']).size()
# print("Number of samples in each cluster:")
# print(cluster_counts)

# # Get cluster labels
# labels = dbscan.labels_

# # Get core samples
# core_samples_mask = np.zeros_like(labels, dtype=bool)
# core_samples_mask[dbscan.core_sample_indices_] = True

# # Identify core, border, and noise points
# core_points = filtered_data[core_samples_mask]
# border_points = filtered_data[~core_samples_mask & (labels != -1)]
# noise_points = filtered_data[labels == -1]

# # Print the number of points in each category
# print("Optimal epsilon:", optimal_eps)
# print("Number of core points:", len(core_points))
# print("Number of border points:", len(border_points))
# print("Number of noise points:", len(noise_points))


In [None]:
# from gower import gower_matrix
# from sklearn.cluster import DBSCAN
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.neighbors import NearestNeighbors

# # Compute distances to nearest neighbors
# k = round(math.log(len(filtered_data)))
# nbrs = NearestNeighbors(n_neighbors=k, metric='precomputed').fit(distance_matrix)
# distances, _ = nbrs.kneighbors(distance_matrix)

# # Compute reachability distances
# reachability_distances = np.mean(distances[:, 1:], axis=1)

# # Sort reachability distances in ascending order
# sorted_distances = np.sort(reachability_distances)

# # Plot reachability distances
# plt.figure(figsize=(6, 4))
# plt.plot(sorted_distances)
# plt.title('Reachability Plot')
# plt.xlabel('Data Points (Sorted)')
# plt.ylabel('Reachability Distance')
# plt.grid(True)
# plt.show()


In [None]:
# # Compute distances to K-nearest neighbors
# k = round(math.log(len(filtered_data))) # Choose the value of K
# nbrs = NearestNeighbors(n_neighbors=k, metric='precomputed').fit(distance_matrix)
# distances, _ = nbrs.kneighbors(distance_matrix)

# # Sort distances
# sorted_distances = np.sort(distances[:, -1])

# # Plot K-distance graph
# plt.plot(range(len(filtered_data)), sorted_distances)
# plt.xlabel('Data Points')
# plt.ylabel('Distance to Kth Nearest Neighbor')
# plt.title('K-distance Graph')
# plt.show()




---



---



---



In [None]:
def oversample_groups(X_train, cat_features, custom_smote, group_column_train,sensitive_attributes,label, reverse_group_mapping):
    """
    Oversample every group up to the size of the largest group that shares its label.

    Parameters:
    - X_train: Preprocessed training dataset (DataFrame containing the sensitive attributes and the label).
    - cat_features: List indicating categorical features; forwarded to `custom_smote`.
    - custom_smote: Callable `custom_smote(group_df, cat_features, group_column_train, total_ratio=...)`
      returning `(synthetic_points, synthetic_count)`.
    - group_column_train: Series containing the group id of each row of X_train.
    - sensitive_attributes: Names of the two sensitive columns (expected to hold 0/1 values).
    - label: Name of the label column (expected to hold 0/1 values).
    - reverse_group_mapping: Mapping from group id to a tuple whose third element is the group's label.

    Returns:
    - synthetic_samples_matrix: DataFrame containing all generated synthetic samples.
      When no group needs oversampling it is an empty DataFrame with the columns of X_train.
    """

    def _largest_subgroup_size(label_value):
        """Size of the biggest (sensitive_0, sensitive_1) subgroup among rows with this label."""
        return max(
            X_train[(X_train[sensitive_attributes[0]] == val0) &
                    (X_train[sensitive_attributes[1]] == val1) &
                    (X_train[label] == label_value)].shape[0]
            for val0 in (0, 1)
            for val1 in (0, 1)
        )

    # Target sizes are loop-invariant, so compute them once.
    largest_group_size_label_1 = _largest_subgroup_size(1)
    largest_group_size_label_0 = _largest_subgroup_size(0)

    synthetic_samples = []

    for group in sorted(group_column_train.unique()):
        # The third entry of the mapping tuple is the label of the group.
        if reverse_group_mapping[group][2] == 1:
            tot_ratio = largest_group_size_label_1
        else:
            tot_ratio = largest_group_size_label_0

        group_df = X_train[group_column_train == group]

        # Nothing to add for a group that already reached (or exceeds) the target size.
        if group_df.shape[0] >= tot_ratio:
            continue

        synthetic_points, synthetic_count = custom_smote(group_df, cat_features, group_column_train, total_ratio=tot_ratio)
        print(f"Oversampling for group  ({group}): Added {synthetic_count} synthetic samples in {group}.")

        # Ignore empty results (e.g. a bare list) so the final concat only sees DataFrames.
        if isinstance(synthetic_points, pd.DataFrame) and len(synthetic_points) > 0:
            synthetic_samples.append(synthetic_points)

    if not synthetic_samples:
        # pd.concat raises on an empty list; return an empty frame with the expected columns.
        return pd.DataFrame(columns=X_train.columns)

    synthetic_samples_matrix = pd.concat(synthetic_samples, ignore_index=True)

    return synthetic_samples_matrix




---



---



---



In [None]:
def find_optimal_epsilon(filtered_data, cat_features, min_samples, distance_matrix, eps_step=0.001, eps_min=0.01, eps_max=1.1):
    """
    Bisect the DBSCAN `eps` value on a precomputed distance matrix.

    At every step DBSCAN is run with the midpoint of the current interval and the
    distinct labels it produces are counted (the noise label -1 counts as one).
    The lower bound moves up when only noise is found or when more than two distinct
    labels appear; otherwise the upper bound moves down. The search stops once the
    interval is no wider than `eps_step`.

    filtered_data, cat_features : accepted but not used in this function
    min_samples : `min_samples` passed to DBSCAN
    distance_matrix : precomputed pairwise distances passed to DBSCAN
    eps_step : width of the interval at which the search stops
    eps_min, eps_max : initial bounds of the search interval

    Returns the final upper bound of the interval.
    """
    lower, upper = eps_min, eps_max

    # Binary search for optimal epsilon
    while upper - lower > eps_step:
        midpoint = (lower + upper) / 2

        clustering = DBSCAN(eps=midpoint, min_samples=min_samples, metric='precomputed')
        found_labels = np.unique(clustering.fit_predict(distance_matrix))
        label_count = len(found_labels)
        print(midpoint, label_count, found_labels, 'mids')

        # eps is too small when everything is noise or the data splits into too many labels.
        only_noise = label_count == 1 and -1 in found_labels
        if only_noise or label_count > 2:
            lower = midpoint
        else:
            # A single cluster without noise, or exactly two labels: tighten from above.
            upper = midpoint
        print(lower, upper, 'min, max')

    return upper

# Custom SMOTE-DBSCAN function
def custom_smote_dbscan(group_df, cat_features, group_column_train, total_ratio):
    """
    Oversample a single subgroup with a DBSCAN-guided SMOTE variant.

    The rows of ``group_df`` are clustered with DBSCAN on the distance matrix
    returned by ``gower_matrix``. Synthetic rows are then built around the
    border points of that clustering (or around the core points when there are
    no border points) until the group would hold ``total_ratio`` rows.

    Parameters:
    - group_df: DataFrame holding the rows of the subgroup to oversample.
    - cat_features: Sequence of flags, one per column of ``group_df``; a truthy
      entry marks that column as categorical.
    - group_column_train: Unused; kept so existing callers keep working.
    - total_ratio: Despite its name, this is used as the target number of rows
      for the group: ``total_ratio - len(group_df)`` synthetic rows are built.

    Returns:
    - (synthetic_df, synthetic_count): DataFrame of synthetic rows with the
      columns of ``group_df`` and its number of rows. The DataFrame is empty
      when the group is empty, already at or above the target, or has no
      usable seed point.
    """
    n_existing = len(group_df)
    no_samples = pd.DataFrame(columns=group_df.columns)

    # Nothing to add: return an empty frame instead of falling through with
    # undefined locals (group too large) or handing back a bare list.
    if n_existing > total_ratio:
        print("it shouldn't be :(")
        return no_samples, 0
    if n_existing == total_ratio:
        print("The ratio of PU to NU is within the acceptable range of total_ratio.")
        return no_samples, 0
    if n_existing == 0:
        print("Cannot oversample an empty group.")
        return no_samples, 0

    oversampling_target = total_ratio - n_existing
    # Positional and label-based indexing must agree with the distance matrix rows.
    os_df = group_df.reset_index(drop=True)

    # log-sized neighbourhood; DBSCAN needs at least 1 (log(1) rounds to 0).
    min_samples = max(1, round(math.log(len(os_df))))
    distance_matrix = gower_matrix(os_df, cat_features=cat_features)

    # Find the optimal epsilon for os_df
    optimal_eps = find_optimal_epsilon(os_df, cat_features, min_samples, distance_matrix)

    # DBSCAN clustering with the optimal epsilon
    dbscan = DBSCAN(eps=optimal_eps, min_samples=min_samples, metric='precomputed')
    dbscan.fit_predict(distance_matrix)
    labels = dbscan.labels_

    core_samples_mask = np.zeros_like(labels, dtype=bool)
    core_samples_mask[dbscan.core_sample_indices_] = True

    # Border points: assigned to a cluster but not core. Fall back to core points.
    border_points = os_df[~core_samples_mask & (labels != -1)]
    if len(border_points) == 0:
        border_points = os_df[core_samples_mask]

    border_indices = border_points.index.tolist()
    if not border_indices:
        print("No core or border points found; no synthetic samples generated.")
        return no_samples, 0
    random.shuffle(border_indices)

    synthetic_samples = []
    current_index = 0
    # Whether a seed has usable neighbours is deterministic, so one full cycle
    # of misses means no seed will ever produce a sample.
    consecutive_misses = 0

    while len(synthetic_samples) < oversampling_target:
        if consecutive_misses >= len(border_indices):
            print("No seed point has a non-noise neighbour; stopping early.")
            break

        idx_A = border_indices[current_index % len(border_indices)]
        current_index += 1
        point_A = os_df.loc[idx_A]

        # Nearest rows after the first entry of the sort (normally the point itself).
        neighbors = np.argsort(distance_matrix[idx_A])[1:min_samples + 1]
        valid_neighbors = [idx for idx in neighbors if labels[idx] != -1]  # Exclude noise points

        if not valid_neighbors:
            consecutive_misses += 1
            continue
        consecutive_misses = 0

        idx_B = np.random.choice(valid_neighbors)
        point_B = os_df.loc[idx_B]
        neighbor_rows = os_df.iloc[valid_neighbors]

        synthetic_point = {}
        for i, col in enumerate(os_df.columns):
            if cat_features[i]:
                # Categorical: most frequent value among the valid neighbours.
                neighbor_values = neighbor_rows[col].tolist()
                synthetic_point[col] = max(set(neighbor_values), key=neighbor_values.count)
            else:
                # Numerical: random point on the segment between A and B.
                alpha = np.random.rand()
                synthetic_point[col] = point_A[col] + alpha * (point_B[col] - point_A[col])

        synthetic_samples.append(synthetic_point)

    return pd.DataFrame(synthetic_samples, columns=os_df.columns), len(synthetic_samples)

# Example usage (assuming group_df, cat_features, etc. are already defined):
# synthetic_samples, num_samples = custom_smote_dbscan(group_df, cat_features, group_column_train, total_ratio)


In [None]:
# Build the synthetic rows by running oversample_groups with the custom_smote_dbscan sampler.
synthetic_samples_matrix_dbscan = oversample_groups(
    X_train,
    cat_features,
    custom_smote_dbscan,
    subgroup_column_train,
    sensitive_attributes,
    label,
    reverse_group_mapping,
)

# Append the synthetic rows below the original training rows, using X_train's column layout.
X_train_resampled_dbscan = pd.concat(
    [
        X_train,
        pd.DataFrame(synthetic_samples_matrix_dbscan, columns=X_train.columns),
    ],
    ignore_index=True,
)
#subgroup_column_resampled_tax = pd.concat([X_train[group_column_train], pd.Series(synthetic_samples_group_tax.flatten())], ignore_index=True)


0.555 1 [0] mids
0.01 0.555 min, max
0.28250000000000003 1 [0] mids
0.01 0.28250000000000003 min, max
0.14625000000000002 3 [-1  0  1] mids
0.14625000000000002 0.28250000000000003 min, max
0.21437500000000004 2 [-1  0] mids
0.14625000000000002 0.21437500000000004 min, max
0.18031250000000004 2 [-1  0] mids
0.14625000000000002 0.18031250000000004 min, max
0.16328125000000004 2 [-1  0] mids
0.14625000000000002 0.16328125000000004 min, max
0.15476562500000002 2 [-1  0] mids
0.14625000000000002 0.15476562500000002 min, max
0.1505078125 3 [-1  0  1] mids
0.1505078125 0.15476562500000002 min, max
0.15263671875 3 [-1  0  1] mids
0.15263671875 0.15476562500000002 min, max
0.15370117187500001 3 [-1  0  1] mids
0.15370117187500001 0.15476562500000002 min, max
0.1542333984375 2 [-1  0] mids
0.15370117187500001 0.1542333984375 min, max
Oversampling for group  (2): Added 5488 synthetic samples in 2.
0.555 1 [0] mids
0.01 0.555 min, max
0.28250000000000003 1 [0] mids
0.01 0.28250000000000003 min, ma



---



---



---



In [None]:
#################################################################################
# Rows of the most privileged subgroup: both sensitive attributes equal 1.
# Both attributes are compared explicitly so the mask is a genuine boolean
# mask regardless of the dtype of the sensitive columns.
most_privileged_mask = ((X_train[sensitive_attributes[0]] == 1) &
                        (X_train[sensitive_attributes[1]] == 1))

num_privileged_ones = X_train[most_privileged_mask & (X_train[label] == 1)].shape[0]

num_privileged_zeros = X_train[most_privileged_mask & (X_train[label] == 0)].shape[0]

# Positive-to-negative label ratio inside the most privileged subgroup
total_ratio = num_privileged_ones / num_privileged_zeros if num_privileged_zeros != 0 else float('inf')  # Avoid division by zero

print(f"Ratio of most privileged class: {total_ratio}")

Ratio of most privileged class: 0.4790803159458825


In [None]:
def find_optimal_epsilon_2(filtered_data, cat_features, min_samples, distance_matrix, eps_step=0.001, eps_min=0.01, eps_max=1.1, verbose=False):
    """
    Bisect the DBSCAN ``eps`` radius inside ``[eps_min, eps_max]``.

    Same search as ``find_optimal_epsilon``: DBSCAN is fitted on the
    precomputed ``distance_matrix`` at the midpoint of the current interval and
    the number of distinct labels (noise ``-1`` counts as a label) selects the
    half to keep:

    - a single label that is ``-1`` or more than two distinct labels -> raise
      the lower bound;
    - a single non-noise label or exactly two distinct labels -> lower the
      upper bound.

    Parameters:
    - filtered_data: Unused; kept so existing callers keep working.
    - cat_features: Unused; kept so existing callers keep working.
    - min_samples: ``min_samples`` forwarded to DBSCAN.
    - distance_matrix: Precomputed pairwise distances handed to DBSCAN.
    - eps_step: Width of the interval at which the search stops (must be > 0).
    - eps_min, eps_max: Initial search interval.
    - verbose: When True, print the search trace at every step.

    Returns:
    - The upper bound of the final interval.

    Raises:
    - ValueError: if ``eps_step`` is not positive (the bisection could never
      reach the stopping width).
    """
    if eps_step <= 0:
        raise ValueError(f"eps_step must be positive, got {eps_step!r}")

    def cluster_count(eps):
        # Distinct labels produced for this radius; noise (-1) is included in the count.
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
        labels = dbscan.fit_predict(distance_matrix)
        unique_labels = np.unique(labels)
        return len(unique_labels), unique_labels

    # Binary search for optimal epsilon
    while eps_max - eps_min > eps_step:
        eps_mid = (eps_min + eps_max) / 2
        n_labels_mid, labels_mid = cluster_count(eps_mid)
        if verbose:
            print(eps_mid, n_labels_mid, labels_mid, 'mids')

        only_noise = n_labels_mid == 1 and -1 in labels_mid
        if only_noise or n_labels_mid > 2:
            eps_min = eps_mid  # Radius too small: all noise, or too fragmented
        else:
            eps_max = eps_mid  # One cluster or exactly two labels: try a smaller radius
        if verbose:
            print(eps_min, eps_max, 'min, max')
    return eps_max

# Custom SMOTE-DBSCAN function
def custom_smote_dbscan_ratio(X_train, cat_features, pu_ix, nu_ix, group_column_train, total_ratio):
    """
    Oversample one of two paired groups until their size ratio hits ``total_ratio``.

    ``PU`` and ``NU`` are the numbers of rows of ``X_train`` whose entry in
    ``group_column_train`` equals ``pu_ix`` and ``nu_ix`` respectively. When
    ``PU / NU`` is above ``total_ratio`` the ``nu_ix`` group is oversampled,
    when it is below the ``pu_ix`` group is oversampled. Synthetic rows are
    built around the DBSCAN border points (or core points if there are no
    border points) of the oversampled group, using the distance matrix
    returned by ``gower_matrix``.

    Parameters:
    - X_train: Training DataFrame.
    - cat_features: Sequence of flags, one per column of ``X_train``; a truthy
      entry marks that column as categorical.
    - pu_ix, nu_ix: Group identifiers of the numerator and denominator groups.
    - group_column_train: Group identifier of every row of ``X_train``.
    - total_ratio: Desired ``PU / NU`` ratio; must be positive and finite.

    Returns:
    - (synthetic_df, synthetic_count, os_ix): DataFrame of synthetic rows with
      the columns of ``X_train``, its number of rows, and the identifier of the
      group those rows belong to (``pu_ix`` when nothing was generated because
      the ratio already matched or a group is empty).

    Raises:
    - ValueError: if ``total_ratio`` is not a positive, finite number (an
      infinite target would otherwise never be reached).
    """
    if not (total_ratio > 0 and math.isfinite(total_ratio)):
        raise ValueError(f"total_ratio must be a positive, finite number, got {total_ratio!r}")

    X2_df = X_train[group_column_train == pu_ix]
    X3_df = X_train[group_column_train == nu_ix]
    PU = len(X2_df)
    NU = len(X3_df)
    no_samples = pd.DataFrame(columns=X_train.columns)

    # An empty group can neither define the ratio nor seed new samples.
    if PU == 0 or NU == 0:
        print(f"Cannot rebalance groups ({pu_ix}, {nu_ix}): at least one of them is empty.")
        return no_samples, 0, pu_ix

    # Determine the oversampling target based on a given total_ratio
    if (PU / NU) > total_ratio:
        oversampling_target = (PU / total_ratio) - NU
        os_df = X3_df
        os_ix = nu_ix
    elif (PU / NU) == total_ratio:
        print("The ratio of PU to NU is within the acceptable range of total_ratio.")
        return no_samples, 0, pu_ix
    else:
        oversampling_target = (total_ratio * NU) - PU
        os_df = X2_df
        os_ix = pu_ix
    # Positional and label-based indexing must agree with the distance matrix rows.
    os_df = os_df.reset_index(drop=True)

    # log-sized neighbourhood; DBSCAN needs at least 1 (log(1) rounds to 0).
    min_samples = max(1, round(math.log(len(os_df))))
    distance_matrix = gower_matrix(os_df, cat_features=cat_features)

    # Find the optimal epsilon for os_df
    optimal_eps = find_optimal_epsilon_2(os_df, cat_features, min_samples, distance_matrix)

    # DBSCAN clustering with the optimal epsilon
    dbscan = DBSCAN(eps=optimal_eps, min_samples=min_samples, metric='precomputed')
    dbscan.fit_predict(distance_matrix)
    labels = dbscan.labels_

    core_samples_mask = np.zeros_like(labels, dtype=bool)
    core_samples_mask[dbscan.core_sample_indices_] = True

    # Border points: assigned to a cluster but not core. Fall back to core points.
    border_points = os_df[~core_samples_mask & (labels != -1)]
    if len(border_points) == 0:
        border_points = os_df[core_samples_mask]

    border_indices = border_points.index.tolist()
    if not border_indices:
        print("No core or border points found; no synthetic samples generated.")
        return no_samples, 0, os_ix
    random.shuffle(border_indices)

    synthetic_samples = []
    current_index = 0
    # Whether a seed has usable neighbours is deterministic, so one full cycle
    # of misses means no seed will ever produce a sample.
    consecutive_misses = 0

    while len(synthetic_samples) < oversampling_target:
        if consecutive_misses >= len(border_indices):
            print("No seed point has a non-noise neighbour; stopping early.")
            break

        idx_A = border_indices[current_index % len(border_indices)]
        current_index += 1
        point_A = os_df.loc[idx_A]

        # Nearest rows after the first entry of the sort (normally the point itself).
        neighbors = np.argsort(distance_matrix[idx_A])[1:min_samples + 1]
        valid_neighbors = [idx for idx in neighbors if labels[idx] != -1]  # Exclude noise points

        if not valid_neighbors:
            consecutive_misses += 1
            continue
        consecutive_misses = 0

        idx_B = np.random.choice(valid_neighbors)
        point_B = os_df.loc[idx_B]
        neighbor_rows = os_df.iloc[valid_neighbors]

        synthetic_point = {}
        for i, col in enumerate(os_df.columns):
            if cat_features[i]:
                # Categorical: most frequent value among the valid neighbours.
                neighbor_values = neighbor_rows[col].tolist()
                synthetic_point[col] = max(set(neighbor_values), key=neighbor_values.count)
            else:
                # Numerical: random point on the segment between A and B.
                alpha = np.random.rand()
                synthetic_point[col] = point_A[col] + alpha * (point_B[col] - point_A[col])

        synthetic_samples.append(synthetic_point)

    return pd.DataFrame(synthetic_samples, columns=os_df.columns), len(synthetic_samples), os_ix

# Example usage (assuming X_train, cat_features, etc. are already defined):
# synthetic_samples, num_samples, oversampled_index = custom_smote_dbscan_ratio(X_train, cat_features, pu_ix, nu_ix, group_column_train, total_ratio)


In [None]:
def oversample_groups_ratio(X_train, cat_features, custom_smote, group_column_train, total_ratio, reverse_group_mapping, label_name=None):
    """
    Function to oversample multiple groups automatically based on group labels.

    The sorted group identifiers are consumed two at a time. In each pair the
    group whose ``reverse_group_mapping`` entry has ``1`` at position 2 is
    treated as the positive group (``pu_ix``), the other as the negative group
    (``nu_ix``). Pairs with no positive or no negative rows, or whose
    positive/negative ratio already equals ``total_ratio``, are skipped.

    Parameters:
    - X_train: Preprocessed training dataset.
    - cat_features: List indicating categorical features.
    - custom_smote: Sampler called as
      ``custom_smote(X_train, cat_features, pu_ix, nu_ix, group_column_train, total_ratio=...)``
      and returning ``(synthetic_points, synthetic_count, os_ix)``.
    - group_column_train: Column containing the group label for each instance.
    - total_ratio: Desired ratio of positive to negative labels.
    - reverse_group_mapping: Mapping of groups to sensitive attributes and labels.
    - label_name: Name of the label column of ``X_train``. When omitted, the
      notebook-level variable ``label`` is used, as before.

    Returns:
    - synthetic_samples_matrix: DataFrame containing all generated synthetic
      samples (empty, with the columns of ``X_train``, if none were generated).
    - synthetic_samples_group: Array of shape (n_samples, 1) with the group
      label of every synthetic sample.
    """
    if label_name is None:
        label_name = label  # fall back to the notebook-level label column name

    synthetic_samples = []
    synthetic_samples_group = []

    groups = sorted(group_column_train.unique())
    # zip() ignores a trailing unpaired group instead of indexing past the end.
    paired_groups = list(zip(groups[0::2], groups[1::2]))

    for group1, group2 in paired_groups:
        ########## Determine pu_ix and nu_ix using reverse_group_mapping ##########
        if reverse_group_mapping[group1][2] == 1:
            pu_ix, nu_ix = group1, group2
        else:
            pu_ix, nu_ix = group2, group1
        ##########################################################################

        group_df_pu = X_train[group_column_train == pu_ix]
        group_df_nu = X_train[group_column_train == nu_ix]
        positive_count = group_df_pu[group_df_pu[label_name] == 1].shape[0]
        negative_count = group_df_nu[group_df_nu[label_name] == 0].shape[0]

        if positive_count == 0 or negative_count == 0:
            continue

        current_ratio = positive_count / negative_count

        if current_ratio == total_ratio:
            continue  # Skip the most privileged group

        synthetic_points, synthetic_count, os_ix = custom_smote(X_train, cat_features, pu_ix, nu_ix, group_column_train, total_ratio=total_ratio)
        print(f"Oversampling for group pair ({pu_ix}, {nu_ix}): Added {synthetic_count} synthetic samples in {os_ix}.")

        # A sampler may return nothing (e.g. an empty list); there is nothing to stack then.
        if len(synthetic_points) == 0:
            continue
        synthetic_samples.append(synthetic_points)
        synthetic_samples_group.append(np.full((len(synthetic_points), 1), os_ix))

    # pd.concat / np.concatenate reject empty lists, so handle "nothing generated" explicitly.
    if not synthetic_samples:
        return pd.DataFrame(columns=X_train.columns), np.empty((0, 1))

    synthetic_samples_matrix = pd.concat(synthetic_samples, ignore_index=True)
    synthetic_samples_group = np.concatenate(synthetic_samples_group)

    return synthetic_samples_matrix, synthetic_samples_group


In [None]:
# Build the synthetic rows (and their group ids) by running oversample_groups_ratio
# with the custom_smote_dbscan_ratio sampler.
(
    synthetic_samples_matrix_dbscan_ratio,
    synthetic_samples_group_dbscan_ratio,
) = oversample_groups_ratio(
    X_train,
    cat_features,
    custom_smote_dbscan_ratio,
    subgroup_column_train,
    total_ratio,
    reverse_group_mapping,
)

# Append the synthetic rows below the original training rows, using X_train's column layout.
X_train_resampled_dbscan_ratio = pd.concat(
    [
        X_train,
        pd.DataFrame(synthetic_samples_matrix_dbscan_ratio, columns=X_train.columns),
    ],
    ignore_index=True,
)
#subgroup_column_resampled_tax = pd.concat([X_train[group_column_train], pd.Series(synthetic_samples_group_tax.flatten())], ignore_index=True)


0.555 1 [0] mids
0.01 0.555 min, max
0.28250000000000003 1 [0] mids
0.01 0.28250000000000003 min, max
0.14625000000000002 3 [-1  0  1] mids
0.14625000000000002 0.28250000000000003 min, max
0.21437500000000004 2 [-1  0] mids
0.14625000000000002 0.21437500000000004 min, max
0.18031250000000004 2 [-1  0] mids
0.14625000000000002 0.18031250000000004 min, max
0.16328125000000004 2 [-1  0] mids
0.14625000000000002 0.16328125000000004 min, max
0.15476562500000002 2 [-1  0] mids
0.14625000000000002 0.15476562500000002 min, max
0.1505078125 3 [-1  0  1] mids
0.1505078125 0.15476562500000002 min, max
0.15263671875 3 [-1  0  1] mids
0.15263671875 0.15476562500000002 min, max
0.15370117187500001 3 [-1  0  1] mids
0.15370117187500001 0.15476562500000002 min, max
0.1542333984375 2 [-1  0] mids
0.15370117187500001 0.1542333984375 min, max
Oversampling for group pair (3, 2): Added 2478 synthetic samples in 3.
0.555 1 [0] mids
0.01 0.555 min, max
0.28250000000000003 1 [0] mids
0.01 0.28250000000000003 



---



---



---
CLASSIFICATION


In [None]:
def evaluate_model_performance(X_train, X_test, protected_attributes, label_name, groups, model, weights=None):
    """
    Train a classifier on ``X_train`` and score it on ``X_test`` for every pair of groups.

    The label column of both frames is cast to float on internal copies, so
    the frames passed in by the caller are left untouched. The classifier is
    fitted on every column except ``label_name``; its test predictions are
    wrapped in a ``BinaryLabelDataset`` and handed to ``evaluate`` once per
    unordered pair of entries of ``groups``.

    Parameters:
    - X_train, X_test: DataFrames containing the features and ``label_name``.
    - protected_attributes: Names of the protected attribute columns.
    - label_name: Name of the label column (favorable label 1.0, unfavorable 0.0).
    - groups: Sequence of dicts with a ``'name'`` and an ``'attributes'`` entry;
      the attributes dict is what gets passed to ``evaluate`` as a group.
    - model: One of 'Logistic Regression', 'Random Forest', 'Gradient Boosting'.
    - weights: Optional per-row sample weights for fitting; defaults to all ones.

    Returns:
    - (all_results, predicted_labels): dict keyed by ``"<name1> vs <name2>"``
      holding whatever ``evaluate`` returns for that pair, and the predictions
      for ``X_test``.

    Raises:
    - ValueError: if ``model`` is not one of the supported names.
    """
    favorable_label = 1.0
    unfavorable_label = 0.0

    # Validate the model name before doing any expensive work.
    classifier_factories = {
        'Logistic Regression': lambda: LogisticRegression(max_iter = 300),
        'Random Forest': lambda: RandomForestClassifier(n_estimators=100, random_state=42),
        'Gradient Boosting': lambda: GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    }
    if model not in classifier_factories:
        raise ValueError('Choose one classification algorithm between Logistic Regression, Random Forest, Gradient Boosting')
    classifier = classifier_factories[model]()

    # Copy before casting: assigning into the caller's frames would silently
    # change their label dtype (and trigger SettingWithCopyWarning on slices).
    X_train = X_train.copy()
    X_test = X_test.copy()
    X_train[label_name] = X_train[label_name].astype(float)
    X_test[label_name] = X_test[label_name].astype(float)

    # If weights is not provided, create an array of ones with the same length as X_train
    if weights is None:
        weights = np.ones(len(X_train))

    # Create BinaryLabelDatasets
    # The training dataset is not used further below; it is still built so that
    # any error its constructor raises for the training frame surfaces here.
    binary_ds_train = BinaryLabelDataset(df=X_train, label_names=[label_name],
                                         protected_attribute_names=protected_attributes,
                                         favorable_label=favorable_label, unfavorable_label=unfavorable_label)
    binary_ds_test = BinaryLabelDataset(df=X_test, label_names=[label_name],
                                        protected_attribute_names=protected_attributes,
                                        favorable_label=favorable_label, unfavorable_label=unfavorable_label)

    classifier.fit(X_train.drop(columns=[label_name]), X_train[label_name], sample_weight=weights)
    predicted_labels = classifier.predict(X_test.drop(columns=[label_name]))

    # Same features as the test set, with the true labels replaced by the predictions.
    X_test_with_predictions = pd.concat([X_test.drop(columns=[label_name]), pd.Series(predicted_labels, name=label_name, index=X_test.index)], axis=1)

    binary_ds_test_pred = BinaryLabelDataset(df=X_test_with_predictions, label_names=[label_name],
                                             protected_attribute_names=protected_attributes,
                                             favorable_label=favorable_label, unfavorable_label=unfavorable_label)

    all_results = {}
    for (group1, group2) in itertools.combinations(groups, 2):
        print(group1, group2, 'gruppi')
        pair_key = f"{group1['name']} vs {group2['name']}"
        # group1 is passed first and group2 second, i.e. as evaluate's
        # priv_group and unpriv_group arguments respectively.
        all_results[pair_key] = evaluate(
            binary_ds_test, binary_ds_test_pred,
            [group1['attributes']], [group2['attributes']])

    return all_results, predicted_labels

In [None]:
def evaluate(test_data, pred, priv_group, unpriv_group):
    """
    Collect performance and fairness scores for one privileged/unprivileged pair.

    Parameters:
    - test_data: Dataset with the true labels (its ``labels`` attribute is read).
    - pred: Dataset with the predicted labels (its ``labels`` attribute is read).
    - priv_group: Privileged group specification passed to the aif360 metrics.
    - unpriv_group: Unprivileged group specification passed to the aif360 metrics.

    Returns:
    - dict with the keys 'Balanced Accuracy', 'Accuracy', 'F1 Score',
      'Disparate Impact Ratio', 'Average Odds Difference',
      'Equal Opportunity Difference' and 'Consistency'. 'Consistency' is
      returned as a plain float.
    """
    cm = ClassificationMetric(test_data, pred,
                              unprivileged_groups=unpriv_group,
                              privileged_groups=priv_group)
    dm = BinaryLabelDatasetMetric(pred,
                                  unprivileged_groups=unpriv_group,
                                  privileged_groups=priv_group)

    # Flatten once so every sklearn metric receives 1-D label vectors.
    y_true = test_data.labels.ravel()
    y_pred = pred.labels.ravel()

    # consistency() comes back as a one-element array in this notebook's output;
    # unwrap it so result tables hold a number instead of "[0.95...]".
    consistency = float(np.ravel(dm.consistency())[0])

    measure_scores = {
        'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
        'Accuracy': cm.accuracy(),
        'F1 Score': f1_score(y_true, y_pred),
        'Disparate Impact Ratio': dm.disparate_impact(),
        'Average Odds Difference': cm.average_odds_difference(),
        'Equal Opportunity Difference': cm.equal_opportunity_difference(),
        'Consistency': consistency,
    }

    return measure_scores

In [None]:
def compute_metrics(df, actual_labels, predicted_labels):
    """
    Compute fairness and performance metrics.

    Labels are expected to use the notebook's 0/1 encoding (0 = negative,
    1 = positive). The confusion matrix is always built for exactly those two
    classes, so inputs in which only one class occurs still yield a 2x2 matrix.

    Parameters:
    - df: Unused; kept so existing callers keep working.
    - actual_labels: True labels.
    - predicted_labels: Predicted labels.

    Returns:
    - dict with 'Accuracy', 'Precision', 'Recall', 'F1 Score', the rates
      'TPR', 'FPR', 'TNR', 'FNR' (NaN when their denominator is zero) and the
      raw counts 'TP', 'FP', 'TN', 'FN'.
    """
    def _rate(numerator, denominator):
        # A rate over an empty class is undefined; report NaN explicitly.
        return numerator / denominator if denominator else float('nan')

    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpack cannot fail.
    cm = confusion_matrix(actual_labels, predicted_labels, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()
    metrics = {
        'Accuracy': accuracy_score(actual_labels, predicted_labels),
        'Precision': precision_score(actual_labels, predicted_labels),
        'Recall': recall_score(actual_labels, predicted_labels),
        'F1 Score': f1_score(actual_labels, predicted_labels),
        'TPR': _rate(TP, TP + FN),
        'FPR': _rate(FP, FP + TN),
        'TNR': _rate(TN, TN + FP),
        'FNR': _rate(FN, FN + TP),
        'TP': TP,
        'FP': FP,
        'TN': TN,
        'FN': FN
    }

    return metrics

In [None]:
##################################################################################
# Baseline run: train and score on the original (non-resampled) training set.
results_orig, pred_labels_orig = evaluate_model_performance(
    X_train, X_test, sensitive_attributes, label, groups, model=model
)
# Other accepted values for `model`: 'Random Forest', 'Gradient Boosting'

# One single-row DataFrame per group comparison, indexed by the comparison name
data_frames = [
    pd.DataFrame([metrics], index=[comparison])
    for comparison, metrics in results_orig.items()
]

# Stack the rows into one table and label its index
results_orig_df = pd.concat(data_frames).rename_axis('Comparison')

# Print the results DataFrame
print(results_orig_df)

{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} gruppi
{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} gruppi
{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
{'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} {'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} gruppi
{'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
{'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
                              Balanced Accuracy  Accuracy  F1 Score  \
Comparison                                                            
White Male vs Black Male               0.637464   0.79575  0.439924   
White Male vs

In [None]:
##################################################################################
# Train on the training set extended with the custom_smote_dbscan synthetic rows.
results_dbscan, pred_labels_dbscan = evaluate_model_performance(
    X_train_resampled_dbscan, X_test, sensitive_attributes, label, groups, model=model
)

# One single-row DataFrame per group comparison, indexed by the comparison name
data_frames = [
    pd.DataFrame([metrics], index=[comparison])
    for comparison, metrics in results_dbscan.items()
]

# Stack the rows into one table and label its index
results_dbscan_df = pd.concat(data_frames).rename_axis('Comparison')

# Print the results DataFrame
print(results_dbscan_df)

{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} gruppi
{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} gruppi
{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
{'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} {'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} gruppi
{'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
{'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
                              Balanced Accuracy  Accuracy  F1 Score  \
Comparison                                                            
White Male vs Black Male               0.646666  0.772488  0.464338   
White Male vs

In [None]:
##################################################################################
# Train on the training set extended with the custom_smote_dbscan_ratio synthetic rows.
results_dbscan_ratio, pred_labels_dbscan_ratio = evaluate_model_performance(
    X_train_resampled_dbscan_ratio, X_test, sensitive_attributes, label, groups, model=model
)

# One single-row DataFrame per group comparison, indexed by the comparison name
data_frames = [
    pd.DataFrame([metrics], index=[comparison])
    for comparison, metrics in results_dbscan_ratio.items()
]

# Stack the rows into one table and label its index
results_dbscan_ratio_df = pd.concat(data_frames).rename_axis('Comparison')

# Print the results DataFrame
print(results_dbscan_ratio_df)

{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} gruppi
{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} gruppi
{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
{'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} {'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} gruppi
{'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
{'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
                              Balanced Accuracy  Accuracy  F1 Score  \
Comparison                                                            
White Male vs Black Male               0.640086  0.780835  0.449748   
White Male vs

In [None]:
class ModelEvaluator:
    """
    Repeat the prepare / oversample / train / score pipeline and average the scores.

    Every iteration builds a fresh ``DataPreparation`` from the stored frame,
    then, for each requested oversampling method, trains a classifier through
    ``evaluate_model_performance`` and records the per-comparison metrics.
    """

    # Oversampling strategies evaluate_model_performance_mean knows how to run.
    SUPPORTED_METHODS = ('none', 'custom_smote_dbscan', 'custom_smote_dbscan_ratio')

    def __init__(self, df, protected_attributes, label_name, privileged, unprivileged, fav, unfav, groups, num_iterations=10, oversampling_methods=None, model=None):
        """
        Parameters:
        - df: Source DataFrame handed to ``DataPreparation``.
        - protected_attributes: Names of the protected attribute columns; the
          first two are used to locate the most privileged subgroup.
        - label_name: Name of the label column.
        - privileged, unprivileged, fav, unfav: Forwarded to ``DataPreparation``.
        - groups: Group descriptions forwarded to ``evaluate_model_performance``.
        - num_iterations: Number of repetitions to average over.
        - oversampling_methods: Methods to evaluate; defaults to ``['none']``.
        - model: Classifier name forwarded to ``evaluate_model_performance``.
          When omitted, the notebook-level variable ``model`` is used, as before.
        """
        self.df = df
        self.protected_attributes = protected_attributes
        self.label_name = label_name
        self.privileged = privileged
        self.unprivileged = unprivileged
        self.fav = fav
        self.unfav = unfav
        self.groups = groups
        self.num_iterations = num_iterations
        self.oversampling_methods = oversampling_methods if oversampling_methods is not None else ['none']
        self.model = model

    @staticmethod
    def _results_to_frame(results):
        """Turn a {comparison: metrics} dict into a DataFrame indexed by comparison."""
        data_frames = [pd.DataFrame([values], index=[key]) for key, values in results.items()]
        results_df = pd.concat(data_frames)
        results_df.index.name = 'Comparison'
        return results_df

    def evaluate_model_performance_mean(self):
        """
        Run every oversampling method ``num_iterations`` times and average the metrics.

        Returns:
        - dict mapping each method name to a DataFrame of metrics averaged per
          comparison over all iterations.

        Raises:
        - ValueError: if ``oversampling_methods`` contains a name that is not
          in ``SUPPORTED_METHODS``. Without this check an unknown name would
          silently be credited with the results of the previously run method.
        """
        unknown_methods = [m for m in self.oversampling_methods if m not in self.SUPPORTED_METHODS]
        if unknown_methods:
            raise ValueError(f"Unknown oversampling method(s) {unknown_methods}; "
                             f"choose among {list(self.SUPPORTED_METHODS)}")

        # Explicit classifier name if given, otherwise the notebook-level `model`.
        classifier_name = self.model if self.model is not None else model

        results_dict = {method: [] for method in self.oversampling_methods}

        for _ in range(self.num_iterations):
            data_prep = DataPreparation(self.df, self.protected_attributes, self.label_name,
                                        self.privileged, self.unprivileged, self.fav, self.unfav)
            data_prep.prepare()
            data_prep.df = data_prep.df.reset_index(drop=True)
            X_train, X_test = data_prep.X_train, data_prep.X_test
            X_train = X_train.reset_index(drop=True)
            cat_features = data_prep.cat_features
            reverse_group_mapping = data_prep.create_group_column()
            # The group ids drive the oversampling; they must not be used as a feature.
            subgroup_column_train = X_train['Group']
            X_train = X_train.drop(columns=['Group'])
            X_test = X_test.drop(columns=['Group'])

            # Positive/negative label ratio inside the most privileged subgroup.
            most_privileged = ((X_train[self.protected_attributes[0]] == 1) &
                               (X_train[self.protected_attributes[1]] == 1))
            num_privileged_ones = X_train[most_privileged & (X_train[self.label_name] == 1)].shape[0]
            num_privileged_zeros = X_train[most_privileged & (X_train[self.label_name] == 0)].shape[0]
            total_ratio = num_privileged_ones / num_privileged_zeros if num_privileged_zeros != 0 else float('inf')

            for method in self.oversampling_methods:
                if method == 'none':
                    train_frame = X_train
                elif method == 'custom_smote_dbscan':
                    synthetic_samples_matrix_dbscan = oversample_groups(X_train, cat_features, custom_smote_dbscan, subgroup_column_train, self.protected_attributes,
                                                                        self.label_name, reverse_group_mapping)
                    train_frame = pd.concat([X_train, pd.DataFrame(synthetic_samples_matrix_dbscan, columns=X_train.columns)], ignore_index=True)
                else:  # 'custom_smote_dbscan_ratio' (validated above)
                    synthetic_samples_matrix_dbscan_ratio, synthetic_samples_group_dbscan_ratio = oversample_groups_ratio(X_train, cat_features, custom_smote_dbscan_ratio, subgroup_column_train, total_ratio, reverse_group_mapping)
                    train_frame = pd.concat([X_train, pd.DataFrame(synthetic_samples_matrix_dbscan_ratio, columns=X_train.columns)], ignore_index=True)

                results, pred_labels = evaluate_model_performance(train_frame, X_test, self.protected_attributes,
                                                                  self.label_name, self.groups, model=classifier_name)
                results_dict[method].append(self._results_to_frame(results))

        combined_results = {method: pd.concat(results_dict[method]).groupby(level=0).mean() for method in self.oversampling_methods}
        return combined_results


In [None]:
# Evaluators to run, keyed by a display name.
evaluators = {
    'ModelEvaluator': ModelEvaluator(
        df, sensitive_attributes, label, privileged, unprivileged,
        favorable_label, unfavorable_label, groups,
        num_iterations=2,
        oversampling_methods=['none', 'custom_smote_dbscan', 'custom_smote_dbscan_ratio'],
    )
}

# Run every evaluator and keep the averaged metrics it returns, under the same key.
results = {
    name: evaluator.evaluate_model_performance_mean()
    for name, evaluator in evaluators.items()
}

# Show one averaged metrics table per oversampling method.
for method, result in results['ModelEvaluator'].items():
    print(f"Results for {method} method:")
    print(result)

Detected 3316 rows with missing values. Removed them.
The 'income' column has only two unique values.
{'workclass': {'Federal-gov': 0, 'Local-gov': 1, 'Private': 2, 'Self-emp-inc': 3, 'Self-emp-not-inc': 4, 'State-gov': 5, 'Without-pay': 6}, 'education': {'10th': 0, '11th': 1, '12th': 2, '1st-4th': 3, '5th-6th': 4, '7th-8th': 5, '9th': 6, 'Assoc-acdm': 7, 'Assoc-voc': 8, 'Bachelors': 9, 'Doctorate': 10, 'HS-grad': 11, 'Masters': 12, 'Preschool': 13, 'Prof-school': 14, 'Some-college': 15}, 'marital_status': {'Divorced': 0, 'Married-AF-spouse': 1, 'Married-civ-spouse': 2, 'Married-spouse-absent': 3, 'Never-married': 4, 'Separated': 5, 'Widowed': 6}, 'occupation': {'Adm-clerical': 0, 'Armed-Forces': 1, 'Craft-repair': 2, 'Exec-managerial': 3, 'Farming-fishing': 4, 'Handlers-cleaners': 5, 'Machine-op-inspct': 6, 'Other-service': 7, 'Priv-house-serv': 8, 'Prof-specialty': 9, 'Protective-serv': 10, 'Sales': 11, 'Tech-support': 12, 'Transport-moving': 13}, 'relationship': {'Husband': 0, 'Not-

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Group'] = pd.MultiIndex.from_frame(self.df[self.sensitive + [self.label]]).map(group_mapping)


[(0, (1, 1, 0)), (1, (1, 1, 1)), (2, (1, 0, 0)), (3, (1, 0, 1)), (4, (0, 1, 0)), (5, (0, 1, 1)), (6, (0, 0, 0)), (7, (0, 0, 1))]
{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} gruppi
{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} gruppi
{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
{'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} {'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} gruppi
{'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
{'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
0.555 1 [0] mids
0.01 0.555 min, max
0.28250000000000003 1 [0] mids
0.01 0.28250000000000003 min,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Group'] = pd.MultiIndex.from_frame(self.df[self.sensitive + [self.label]]).map(group_mapping)


[(0, (1, 1, 0)), (1, (1, 1, 1)), (2, (1, 0, 0)), (3, (1, 0, 1)), (4, (0, 1, 0)), (5, (0, 1, 1)), (6, (0, 0, 0)), (7, (0, 0, 1))]
{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} gruppi
{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} gruppi
{'name': 'White Male', 'attributes': {'race': 1, 'sex': 1}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
{'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} {'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} gruppi
{'name': 'Black Male', 'attributes': {'race': 0, 'sex': 1}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
{'name': 'White Female', 'attributes': {'race': 1, 'sex': 0}} {'name': 'Black Female', 'attributes': {'race': 0, 'sex': 0}} gruppi
0.555 1 [0] mids
0.01 0.555 min, max
0.28250000000000003 1 [0] mids
0.01 0.28250000000000003 min,

In [None]:
# Subgroup pair to report on. The key must match an index label of the
# per-technique results tables; the commented-out keys are alternatives
# (presumably for the other datasets -- confirm before switching).
#comparison_key = 'Male Adult vs Female Young'
#comparison_key = 'Caucasian Female vs Black Male'
comparison_key = 'White Male vs Black Female'


def _as_scalar(value):
    """Unwrap a one-element array/list into a plain Python scalar.

    Values that do not hold exactly one element are returned unchanged.
    """
    arr = np.asarray(value)
    return arr.item() if arr.size == 1 else value


# One row per technique that has results for the chosen comparison.
data = []

# Iterate over the per-technique result tables and extract the relevant metrics.
for method, result_dict in results['ModelEvaluator'].items():
    # Techniques whose table has no row for this comparison are skipped.
    if comparison_key in result_dict.index:
        metrics = result_dict.loc[comparison_key]
        row = {
            'Classifier': model,
            'Technique': method,
            'DI Ratio': metrics['Disparate Impact Ratio'],
            'AEO Diff.': metrics['Average Odds Difference'],
            'Equal Opportunity Difference': metrics['Equal Opportunity Difference'],
            # The stored consistency value prints as a one-element array
            # (e.g. "[0.95]"); unwrap it so the column is numeric like the others.
            'Consis.': _as_scalar(metrics['Consistency']),
            'Acc.': metrics['Accuracy'],
            'Bal. Acc.': metrics['Balanced Accuracy'],
            'F1 Score': metrics['F1 Score']
        }
        data.append(row)

# Convert the extracted rows into a DataFrame (empty if nothing matched).
df_results = pd.DataFrame(data)

# Print the DataFrame as a plain-text table without the positional index.
print(df_results.to_string(index=False))


         Classifier                 Technique  DI Ratio  AEO Diff.  Equal Opportunity Difference              Consis.     Acc.  Bal. Acc.  F1 Score
Logistic Regression                      none  0.080706  -0.121975                     -0.174530 [0.9529134466769706] 0.796368   0.636320  0.436990
Logistic Regression       custom_smote_dbscan  0.435917  -0.041930                     -0.047461 [0.9270865533230285] 0.775502   0.644419  0.459488
Logistic Regression custom_smote_dbscan_ratio  0.469799  -0.008673                      0.004369 [0.9335780525502313] 0.781646   0.640470  0.450290




---



---



In [None]:
def compute_group_ratios(X_train_resampled_tax, sensitive_attributes, label):
    """Compute the positive-to-negative label ratio of each sensitive group.

    Args:
        X_train_resampled_tax: DataFrame containing the sensitive-attribute
            columns and the label column.
        sensitive_attributes: Column name(s) passed to ``DataFrame.groupby``
            to define the groups.
        label: Name of the label column; rows equal to 1 count as positive,
            rows equal to 0 count as negative.

    Returns:
        A list with one dict per group, in groupby iteration order:
        ``{"group": <groupby key>, "current_ratio": positives / negatives}``.
        A group with no negative rows gets ``float('inf')``.

    Note:
        A "samples_to_add" field (positives needed to reach a target ratio)
        was sketched here previously but is not computed.
    """
    def _ratio(frame):
        # Count label matches via boolean masks instead of filtering the frame.
        labels = frame[label]
        n_pos = int((labels == 1).sum())
        n_neg = int((labels == 0).sum())
        # Guard the division: no negatives means the ratio is unbounded.
        return n_pos / n_neg if n_neg != 0 else float('inf')

    return [
        {"group": key, "current_ratio": _ratio(frame)}
        for key, frame in X_train_resampled_tax.groupby(sensitive_attributes)
    ]


# Positive-to-negative label ratio for every combination of sensitive-attribute
# values in X_train_resampled_dbscan (presumably the training data produced by
# the 'custom_smote_dbscan' technique -- confirm against the cell that builds it).
group_ratios = compute_group_ratios(X_train_resampled_dbscan, sensitive_attributes, label)

# One line per group: 'group' is the groupby key, 'current_ratio' is positives / negatives.
for result in group_ratios:
    print(f"Group: {result['group']}, Current Ratio: {result['current_ratio']}")


Group: (0, 0), Current Ratio: 0.4790803159458825
Group: (0, 1), Current Ratio: 0.4790803159458825
Group: (1, 0), Current Ratio: 0.4790803159458825
Group: (1, 1), Current Ratio: 0.4790803159458825


In [None]:
# Same per-group positive-to-negative ratio check, this time on
# X_train_resampled_dbscan_ratio (presumably the output of the
# 'custom_smote_dbscan_ratio' technique -- confirm against the cell that builds it).
group_ratios_ratio = compute_group_ratios(X_train_resampled_dbscan_ratio, sensitive_attributes, label)

# One line per group: 'group' is the groupby key, 'current_ratio' is positives / negatives.
for result in group_ratios_ratio:
    print(f"Group: {result['group']}, Current Ratio: {result['current_ratio']}")

Group: (0, 0), Current Ratio: 0.47921225382932164
Group: (0, 1), Current Ratio: 0.47983539094650207
Group: (1, 0), Current Ratio: 0.4791067269488971
Group: (1, 1), Current Ratio: 0.4790803159458825


In [None]:
# Number of rows for every (sensitive attributes..., label) combination in
# X_train_resampled_dbscan; reset_index turns the group keys back into
# ordinary columns next to a 'count' column.
grouped_counts = X_train_resampled_dbscan.groupby(sensitive_attributes + [label]).size().reset_index(name='count')

# Print the counts as a plain-text table
print(grouped_counts)

   race  sex  income  count
0     0    0     0.0  12787
1     0    0     1.0   6126
2     0    1     0.0  12787
3     0    1     1.0   6126
4     1    0     0.0  12787
5     1    0     1.0   6126
6     1    1     0.0  12787
7     1    1     1.0   6126


In [None]:
# Number of rows for every (sensitive attributes..., label) combination in
# X_train_resampled_dbscan_ratio; reset_index turns the group keys back into
# ordinary columns next to a 'count' column.
# NOTE: this rebinds `grouped_counts`, replacing the table built from
# X_train_resampled_dbscan in the earlier cell.
grouped_counts = X_train_resampled_dbscan_ratio.groupby(sensitive_attributes + [label]).size().reset_index(name='count')

# Print the counts as a plain-text table
print(grouped_counts)

   race  sex  income  count
0     0    0     0.0   1371
1     0    0     1.0    657
2     0    1     0.0   1215
3     0    1     1.0    583
4     1    0     0.0   7299
5     1    0     1.0   3497
6     1    1     0.0  12787
7     1    1     1.0   6126
