<a href="https://colab.research.google.com/github/qahtanaa/OnSubGroupFairness/blob/main/problems_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install aif360

Collecting aif360
  Downloading aif360-0.5.0-py3-none-any.whl (214 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/214.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m204.8/214.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.1/214.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: aif360
Successfully installed aif360-0.5.0


In [2]:
import numpy as np
import pandas as pd
from aif360.metrics import utils
from scipy.sparse import issparse
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

pip install 'aif360[LawSchoolGPA]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'


In [3]:
class DataPreparation():
  """
  ........
  """
  def __init__(self, df, sensitive, label, priv, unpriv, fav, unfav, categorical):
    """
    Construct all necessary attributes for the data preparation.

    df : (pandas DataFrame) containing the data
    sensitive : (list(str)) specifying the column names of all sensitive features
    label : (str) specifying the label column
    priv : (list(dicts)) representation of the privileged groups
    unpriv : (list(dicts)) representation of the unprivileged groups
    fav : (str/int/..) value representing the favorable label
    unfav : (str/int/..) value representing the unfavorable label
    categorical : (list(str)) (optional) specifying column names of categorical features
    """
    self.df = df
    self.sensitive = sensitive
    self.label = label
    self.priv = priv
    self.unpriv = unpriv
    self.fav = fav
    self.unfav = unfav
    self.categorical = categorical


  def detect_missing_values(self):
      """
      Detect rows with missing values and remove them from the DataFrame.
      """
      initial_rows = len(self.df)
      self.df = self.df.dropna()
      removed_rows = initial_rows - len(self.df)

      if removed_rows > 0:
          print(f"Detected {removed_rows} rows with missing values. Removed them.")
      else:
          print("No missing values detected.") #pass

  def binary_label(self):
    """
    Check that the decision label is made of two values, change it as a binary representation
    where favorable label = 1, unfavorable label = 0.
    """
    number_label_values = self.df[self.label].nunique()
    if number_label_values == 2:
      print(f"The '{self.label}' column has only two unique values.")
      self.df[self.label].replace([self.unfav, self.fav], [0, 1], inplace=True)
    else:
      print(f"The '{self.label}' column does not have exactly two unique values, as it should.")

  def find_categorical_attributes(self):
    """
    find the categorical attributes
    """
    attribute_types = {}

    for column in self.df.columns:
        # Skip the 'Group' column
        if column == 'Group':
            continue
        # if the column has already been classified as categorical by the user, leave it cat
        elif column in self.categorical:
            attribute_types[column] = 'Categorical'
            continue
        # if the column has only two distinguished values, consider it categorical
        elif self.df[column].nunique() == 2:
          attribute_types[column] = 'Categorical'
          continue
        else: #if it's not the group column, or if it's not been classified as categorical by the user, check every value of the column
          num_float = 0
          num_text = 0
          thresh = 0.99
          num_att_in_column = len(self.df[column])

          for value in self.df[column]:  # Accessing all values in the column
            # Attempt to convert the value to a float
            try:
              float(value)
              num_float += 1
              continue  # Move to the next value
            except ValueError:
              pass  # If it's not a float, continue to the next check
            # If it's not an integer or a float, consider it as text
            num_text += 1

          # now see if it's categorical or numerical
          if num_float / num_att_in_column > thresh:
            attribute_types[column] = 'Numerical'
            continue
          else:
            attribute_types[column] = 'Categorical'
    return attribute_types


  def create_group_column(self):
    """
    Create a 'Group' column in the DataFrame based on protected attributes, privileged/unprivileged conditions, and label. Then drop
    the protected attributes and label columns because those informations are already present in the Group column
    """
    group_combinations = pd.MultiIndex.from_product([self.df[sensitive].unique() for sensitive in self.sensitive] + [self.df[self.label].unique()], names=self.sensitive + [self.label])
    #print(group_combinations)
    print(list(enumerate(group_combinations)))
    # Create a mapping between group combinations and their corresponding numbers
    group_mapping = {group: idx for idx, group in enumerate(group_combinations)}
    reverse_group_mapping = {idx: group for group, idx in group_mapping.items()}  # Create reverse mapping
    # Apply the mapping to create a new column in the DataFrame
    self.df['Group'] = pd.MultiIndex.from_frame(self.df[self.sensitive + [self.label]]).map(group_mapping)

    return reverse_group_mapping #enumerate(group_combinations)

  def prepare(self):
    """
    Perform all preprocessing steps.
    """
    self.detect_missing_values()
    self.binary_label()
    self.create_group_column()
    self.find_categorical_attributes()
    #self.normalization()
    #self.make_numerical()
    return self

In [4]:
########### CODE TO TRY DATAPREPARATION CLASS WITH A SMALL DATAFAME

# Create a small DataFrame
np.random.seed(42)
n = 200
data = {
    'Gender': np.random.choice(['F', 'M'], size=n),
    'Race': np.random.choice(['B', 'W'], size=n),
    'NumKids': np.random.choice(range(5), size=n),
    'Height': np.random.uniform(150, 190, size=n),
    'ShirtColor': np.random.choice(['Red', 'Blue', 'Green'], size=n),
    'Age': np.random.randint(18, 65, size=n),
    'Accepted': np.random.choice([1, 2], size=n)
}

df = pd.DataFrame(data)

# Introduce missing values randomly
missing_fraction = 0.05  # Adjust this fraction based on the desired percentage of missing values

# Randomly select cells and set their values to NaN
mask = np.random.rand(*df.shape) < missing_fraction
df[mask] = np.nan
print(df)

del mask
del missing_fraction

#count the number of columns in the dataframe
num_of_columns = len(df.columns)
print("Number of columns:", num_of_columns)

#use the DataPreparation class to preprocess the dataframe
data_prep = DataPreparation(df, ['Gender','Race'], 'Accepted', ['M','W'], ['F', 'B'], 2, 1, ['NumKids'])
data_prep.prepare()

# use this reset index to have indices from 0 to len(df)
data_prep.df = data_prep.df.reset_index(drop=True)
print(data_prep.df)



    Gender Race  NumKids      Height ShirtColor   Age  Accepted
0        F    B      3.0  188.447623       Blue  43.0       2.0
1        M    W      2.0         NaN      Green  31.0       1.0
2        F    B      0.0  157.831645      Green   NaN       2.0
3        F    B      3.0  152.774452      Green  54.0       1.0
4        F    W      3.0         NaN      Green  28.0       2.0
..     ...  ...      ...         ...        ...   ...       ...
195      M    W      2.0         NaN        NaN  52.0       NaN
196      M    W      4.0  155.454859        Red  42.0       1.0
197      M    W      3.0  150.581787        Red  43.0       2.0
198      F    W      2.0  164.023502      Green  28.0       1.0
199      F    B      0.0  173.596707        Red  55.0       1.0

[200 rows x 7 columns]
Number of columns: 7
Detected 60 rows with missing values. Removed them.
The 'Accepted' column has only two unique values.
[(0, ('F', 'B', 1.0)), (1, ('F', 'B', 0.0)), (2, ('F', 'W', 1.0)), (3, ('F', 'W', 0.0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[self.label].replace([self.unfav, self.fav], [0, 1], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Group'] = pd.MultiIndex.from_frame(self.df[self.sensitive + [self.label]]).map(group_mapping)


In [5]:
# Call the method to find categorical indices
attribute_types = data_prep.find_categorical_attributes()

# Print the result
print("dictionary 'Attribute' : 'Numerical/Categorical' \n ", attribute_types)

# Initialize cat_features list
cat_features = []

# Iterate over attribute names and determine whether they are categorical or numerical
for attr in attribute_types:
    cat_features.append(attribute_types[attr] == 'Categorical')

del attr

print('boolean categorical features vector:\n', cat_features)


dictionary 'Attribute' : 'Numerical/Categorical' 
  {'Gender': 'Categorical', 'Race': 'Categorical', 'NumKids': 'Categorical', 'Height': 'Numerical', 'ShirtColor': 'Categorical', 'Age': 'Numerical', 'Accepted': 'Categorical'}
boolean categorical features vector:
 [True, True, True, False, True, False, True]


heom di distyton, using Nearest Neighbors to find the kNN


In [6]:
!pip install distython
from distython import HEOM
from distython import HVDM
from sklearn.neighbors import NearestNeighbors

Collecting distython
  Downloading distython-0.0.3-py3-none-any.whl (8.0 kB)
Installing collected packages: distython
Successfully installed distython-0.0.3


In [17]:
def heom_distance1(ndary, cat_features_boolean, n=5):
  """
  later
  """
  X = ndary
  n = n+1

  # array of categorical indices
  cat_attr_ix = [i for i, value in enumerate(cat_features_boolean) if value]

  nan_eqv = 12345

  #compute the matrix of the distances using HEOM
  matrix_dist_HEOM = HEOM(X, cat_attr_ix, nan_equivalents = [nan_eqv])

  # Declare NearestNeighbor and link the metric
  neighbor = NearestNeighbors(metric = matrix_dist_HEOM.heom)

  # Fit the model which uses the custom distance metric
  neighbor.fit(X)

  # Initialize empty matrices to store indices and distances
  nearest_indices_heom = np.zeros((len(X), 6), dtype=int)
  nearest_distances_heom = np.zeros((len(X), 6))

  # Loop through each instance in the data
  for i in range(len(X)):
      # Find the 5 nearest neighbors to the current instance
      result = neighbor.kneighbors(X[i].reshape(1, -1), n_neighbors=6)

      # Store the indices and distances of the nearest neighbors in the matrices
      nearest_indices_heom[i] = result[1][0]
      nearest_distances_heom[i] = result[0][0]

  return nearest_indices_heom, nearest_distances_heom

In [21]:
# FIRST OF ALL, Encode the dataframe so that the categorical attributes are converted to numbers, but still treated like categorical.
X = data_prep.df.drop(columns=['Group'], axis=1).copy()
#print(X)
encoder_dict = dict()
columns_categorical = X.columns[cat_features]

# for each categorical column, encode it
for column in columns_categorical:
  le = LabelEncoder()
  X[column] = le.fit_transform(X[column].values)
  mapping = dict(zip(le.classes_, range(len(le.classes_))))
  encoder_dict[column] = mapping
#print(X)

# convert it to numpy so that distython doesn't cry
X = X.to_numpy()
#print(X)

#call the method defined above
closest_index_heom1, closest_values_heom1 = heom_distance1(X, cat_features,5)

In [22]:
print(closest_index_heom1[0:15,:])
print('...')

[[  0  83 130  78 135  91]
 [  1  27 111  75 114  78]
 [  2  36  29  97 109  12]
 [  3 135 126  26  83  91]
 [  4  38 133  32 105 102]
 [  5  67 106  68  18  75]
 [  6 130  42 103 125 122]
 [  7 117  55  48  27  53]
 [  8 139  34 133 104  38]
 [  9  84  15  32  14 102]
 [ 10 137 115  52  37  77]
 [ 11  34 125  92  29 133]
 [ 12  72  62  43  61  17]
 [ 13  43 113  72  14  24]
 [ 14  13  84   9  24 115]]
...


In [16]:
def heom_distance2(ndary, cat_features_boolean, n=5):
  """
  later
  """
  X = ndary
  n = n+1

  cat_attr_ix = [i for i, value in enumerate(cat_features_boolean) if value]

  nan_eqv = 12345

  #compute the matrix of the distances using HEOM
  matrix_dist_HEOM = HEOM(X, cat_attr_ix, nan_equivalents = [nan_eqv])

  matrix_distances = np.zeros((len(X), len(X)))
  distances = []
  k = 0

  #compute the distances using HEOM metric
  for i in X:
      for j in X:
          i = np.asarray(i)
          i = i.astype(np.float64)
          j = np.asarray(j)
          j = j.astype(np.float64)
          dist = matrix_dist_HEOM.heom(i, j)
          distances.append(dist)

      matrix_distances[k, :] = distances
      k += 1
      distances = []

  #compute the matrices with n-NN indices and values
  matrix_ind_val = []
  for i in range(len(X)):
    dm = matrix_distances[i,:]
    top_n = smallest_indices(np.nan_to_num(dm, nan=1),6)
    matrix_ind_val.append(top_n)

  matrix_neighbors_heom_index = np.array([item['index'] for item in matrix_ind_val])
  matrix_neighbors_heom_values = np.array([item['values'] for item in matrix_ind_val])

  return matrix_neighbors_heom_index, matrix_neighbors_heom_values

def smallest_indices(ary, n):
    """Returns the n largest indices from a numpy array."""
    #n += 1
    flat = np.nan_to_num(ary.flatten(), nan=999)
    indices = np.argpartition(-flat, -n)[-n:]
    indices = indices[np.argsort(flat[indices])]
    #indices = np.delete(indices,0,0)
    values = flat[indices]
    return {'index': indices, 'values': values}

In [23]:
closest_index_heom2, closest_values_heom2 = heom_distance2(X, cat_features,5)

In [24]:
print(closest_index_heom2[0:15,:])
print('...')

[[  0  83 130  78 135  91]
 [  1  27 111  75 114  78]
 [  2  36  29  97 109  12]
 [  3 135 126  26  83  91]
 [  4  38 133  32 105 102]
 [  5  67 106  68  18  75]
 [  6 130  42 103 125 122]
 [  7 117  55  48  27  53]
 [  8 139  34 133 104  38]
 [  9  84  15  32  14 102]
 [ 10 137 115  52  37  77]
 [ 11  34 125  92  29 133]
 [ 12  72  62  43  61  17]
 [ 13  43 113  72  14  24]
 [ 14  13  84   9  24 115]]
...


hvdm di distyton, using Nearest Neighbors to find the kNN


In [26]:
def hvdm_distance1(ndary, y_ix, cat_features_boolean, n=5):
  """
  y_ix = [n] is a 1-element array what represent the index of the target variable/laber
  """
  X = ndary
  n = n+1

  # convert the values of the cat_features from boolean to binary: 1 if categorical, 0 if numerical
  cat_attr_ix = [i for i, value in enumerate(cat_features_boolean) if value]
  nan_eqv = 12345

  #compute the matrix of the distances using HEOM
  matrix_dist_HVDM = HVDM(X,y_ix, cat_attr_ix, nan_equivalents = [nan_eqv], normalised="std")

  # Declare NearestNeighbor and link the metric
  neighbor = NearestNeighbors(metric = matrix_dist_HVDM.hvdm)

  # Fit the model which uses the custom distance metric
  neighbor.fit(X)

  # Initialize empty matrices to store indices and distances
  nearest_indices_hvdm = np.zeros((len(X), 6), dtype=int)
  nearest_distances_hvdm = np.zeros((len(X), 6))

  # Loop through each instance in the data
  for i in range(len(X)):
      # Find the 5 nearest neighbors to the current instance
      result = neighbor.kneighbors(X[i].reshape(1, -1), n_neighbors=6)

      # Store the indices and distances of the nearest neighbors in the matrices
      nearest_indices_hvdm[i] = result[1][0]
      nearest_distances_hvdm[i] = result[0][0]

  return nearest_indices_hvdm, nearest_distances_hvdm

In [27]:
y_ix = [6]
closest_index_hvdm1, closest_values_hvdm1 = hvdm_distance1(X, y_ix, cat_features,5, )

Division by zero is not allowed!


UnboundLocalError: local variable 'temp_result' referenced before assignment

/usr/local/lib/python3.10/dist-packages/distython/VDM.py let's see this error, it doesn't enter in the if in line 96, as if N_ax or N_ay are equal zero, but they're not!

The problem is in the fit() method of Nearest Neighbors, I don't know why. The code is essentially the same as the one for HEOM distance.

In [28]:
def hvdm_distance2(ndary, y_ix, cat_features_boolean, n=5):
  """
  y_ix = [n] is a 1-element array what represent the index of the target variable/laber
  """
  X = ndary
  n = n+1

  # convert the values of the cat_features from boolean to binary: 1 if categorical, 0 if numerical
  cat_attr_ix = [i for i, value in enumerate(cat_features_boolean) if value]
  nan_eqv = 12345

  #compute the matrix of the distances using HEOM
  matrix_dist_HVDM = HVDM(X,y_ix, cat_attr_ix, nan_equivalents = [nan_eqv], normalised="std")

  matrix_distances = np.zeros((len(X), len(X)))
  distances = []
  k = 0

  #compute the distances using HVDM metric
  for i in X:
      for j in X:
          i = np.asarray(i)
          i = i.astype(np.float64)
          j = np.asarray(j)
          j = j.astype(np.float64)
          dist = matrix_dist_HVDM.hvdm(i, j)
          distances.append(dist)

      matrix_distances[k, :] = distances
      k += 1
      distances = []

  #compute the matrices with n-NN indices and values
  matrix_ind_val = []
  for i in range(len(X)):
    dm = matrix_distances[i,:]
    top_n = smallest_indices(np.nan_to_num(dm, nan=1),6)
    matrix_ind_val.append(top_n)

  matrix_neighbors_hvdm_index = np.array([item['index'] for item in matrix_ind_val])
  matrix_neighbors_hvdm_values = np.array([item['values'] for item in matrix_ind_val])

  return matrix_neighbors_hvdm_index, matrix_neighbors_hvdm_values


In [29]:
y_ix = [6]
closest_index_hvdm2, closest_values_hvdm2 = hvdm_distance2(X, y_ix, cat_features,5, )

In [30]:
print(closest_index_hvdm2[0:15,:])
print('...')

[[  0 122 130  90  83 103]
 [  1  27  68  45   5  69]
 [  2  27   1  68 111 121]
 [  3 135  10  50 107  26]
 [  4 133  92 105  32  15]
 [  5  68  75 111 106   1]
 [  6  35 103  96  52 134]
 [  7 133  92   4 105  54]
 [  8 139  34  94  79  43]
 [  9  93 125 113  13  15]
 [ 10 107 126  83 115   3]
 [ 11 125  30  13 113  55]
 [ 12  20  62  95 104  89]
 [ 13 113  93  74  14   9]
 [ 14  13  74  23  30 113]]
...
