<a href="https://colab.research.google.com/github/qahtanaa/OnSubGroupFairness/blob/main/Problems2_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [29]:
!pip install aif360



In [30]:
import numpy as np
import pandas as pd
from aif360.metrics import utils
from scipy.sparse import issparse
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [31]:
class DataPreparation():
  """
  ........
  """
  def __init__(self, df, sensitive, label, priv, unpriv, fav, unfav, categorical):
    """
    Construct all necessary attributes for the data preparation.

    df : (pandas DataFrame) containing the data
    sensitive : (list(str)) specifying the column names of all sensitive features
    label : (str) specifying the label column
    priv : (list(dicts)) representation of the privileged groups
    unpriv : (list(dicts)) representation of the unprivileged groups
    fav : (str/int/..) value representing the favorable label
    unfav : (str/int/..) value representing the unfavorable label
    categorical : (list(str)) (optional) specifying column names of categorical features
    """
    self.df = df
    self.sensitive = sensitive
    self.label = label
    self.priv = priv
    self.unpriv = unpriv
    self.fav = fav
    self.unfav = unfav
    self.categorical = categorical


  def detect_missing_values(self):
      """
      Detect rows with missing values and remove them from the DataFrame.
      """
      initial_rows = len(self.df)
      self.df = self.df.dropna()
      removed_rows = initial_rows - len(self.df)

      if removed_rows > 0:
          print(f"Detected {removed_rows} rows with missing values. Removed them.")
      else:
          print("No missing values detected.") #pass

  def binary_label(self):
    """
    Check that the decision label is made of two values, change it as a binary representation
    where favorable label = 1, unfavorable label = 0.
    """
    number_label_values = self.df[self.label].nunique()
    if number_label_values == 2:
      print(f"The '{self.label}' column has only two unique values.")
      self.df[self.label].replace([self.unfav, self.fav], [0, 1], inplace=True)
    else:
      print(f"The '{self.label}' column does not have exactly two unique values, as it should.")

  def find_categorical_attributes(self):
    """
    find the categorical attributes
    """
    attribute_types = {}

    for column in self.df.columns:
        # Skip the 'Group' column
        if column == 'Group':
            continue
        # if the column has already been classified as categorical by the user, leave it cat
        elif column in self.categorical:
            attribute_types[column] = 'Categorical'
            continue
        # if the column has only two distinguished values, consider it categorical
        elif self.df[column].nunique() == 2:
          attribute_types[column] = 'Categorical'
          continue
        else: #if it's not the group column, or if it's not been classified as categorical by the user, check every value of the column
          num_float = 0
          num_text = 0
          thresh = 0.99
          num_att_in_column = len(self.df[column])

          for value in self.df[column]:  # Accessing all values in the column
            # Attempt to convert the value to a float
            try:
              float(value)
              num_float += 1
              continue  # Move to the next value
            except ValueError:
              pass  # If it's not a float, continue to the next check
            # If it's not an integer or a float, consider it as text
            num_text += 1

          # now see if it's categorical or numerical
          if num_float / num_att_in_column > thresh:
            attribute_types[column] = 'Numerical'
            continue
          else:
            attribute_types[column] = 'Categorical'
    return attribute_types


  def create_group_column(self):
    """
    Create a 'Group' column in the DataFrame based on protected attributes, privileged/unprivileged conditions, and label. Then drop
    the protected attributes and label columns because those informations are already present in the Group column
    """
    group_combinations = pd.MultiIndex.from_product([self.df[sensitive].unique() for sensitive in self.sensitive] + [self.df[self.label].unique()], names=self.sensitive + [self.label])
    #print(group_combinations)
    print(list(enumerate(group_combinations)))
    # Create a mapping between group combinations and their corresponding numbers
    group_mapping = {group: idx for idx, group in enumerate(group_combinations)}
    reverse_group_mapping = {idx: group for group, idx in group_mapping.items()}  # Create reverse mapping
    # Apply the mapping to create a new column in the DataFrame
    self.df['Group'] = pd.MultiIndex.from_frame(self.df[self.sensitive + [self.label]]).map(group_mapping)

    return reverse_group_mapping #enumerate(group_combinations)

  def prepare(self):
    """
    Perform all preprocessing steps.
    """
    self.detect_missing_values()
    self.binary_label()
    self.create_group_column()
    self.find_categorical_attributes()
    #self.normalization()
    #self.make_numerical()
    return self

In [32]:
########### CODE TO TRY DATAPREPARATION CLASS WITH A SMALL DATAFAME

# Create a small DataFrame
np.random.seed(42)
n = 20
data = {
    'Gender': np.random.choice(['F', 'M'], size=n),
    'Race': np.random.choice(['B', 'W'], size=n),
    'NumKids': np.random.choice(range(5), size=n),
    'Height': np.random.uniform(150, 190, size=n),
    'ShirtColor': np.random.choice(['Red', 'Blue', 'Green'], size=n),
    'Age': np.random.randint(18, 65, size=n),
    'Accepted': np.random.choice([1, 2], size=n)
}

df = pd.DataFrame(data)

# Introduce missing values randomly
missing_fraction = 0.05  # Adjust this fraction based on the desired percentage of missing values

# Randomly select cells and set their values to NaN
mask = np.random.rand(*df.shape) < missing_fraction
df[mask] = np.nan
print(df)

del mask
del missing_fraction

#count the number of columns in the dataframe
num_of_columns = len(df.columns)
print("Number of columns:", num_of_columns)

#use the DataPreparation class to preprocess the dataframe
data_prep = DataPreparation(df, ['Gender','Race'], 'Accepted', ['M','W'], ['F', 'B'], 2, 1, ['NumKids'])
data_prep.prepare()

# use this reset index to have indices from 0 to len(df)
data_prep.df = data_prep.df.reset_index(drop=True)
print(data_prep.df)



   Gender Race  NumKids      Height ShirtColor  Age  Accepted
0       F    W      2.0  150.638650       Blue   41       NaN
1       M  NaN      1.0  159.235753      Green   18       1.0
2       F    W      3.0  159.641019        Red   61       2.0
3       F    W      3.0  177.330541       Blue   25       1.0
4       F    W      2.0  174.399866        Red   41       1.0
5       M  NaN      3.0  183.327796        Red   28       2.0
6       F    W      3.0  156.934586       Blue   34       2.0
7       F    W      0.0  165.642424        NaN   25       1.0
8       F    W      2.0  157.289444        Red   52       1.0
9       M    W      4.0  180.214456       Blue   52       2.0
10      F    B      2.0  167.006235        Red   50       NaN
11      F    B      NaN  158.317667        Red   22       2.0
12      F    W      0.0  172.708013        Red   59       1.0
13      F    W      1.0  151.252532        Red   56       1.0
14      M    W      3.0  183.691391      Green   58       1.0
15      

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[self.label].replace([self.unfav, self.fav], [0, 1], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Group'] = pd.MultiIndex.from_frame(self.df[self.sensitive + [self.label]]).map(group_mapping)


In [33]:
# Call the method to find categorical indices
attribute_types = data_prep.find_categorical_attributes()

# Print the result
print("dictionary 'Attribute' : 'Numerical/Categorical' \n ", attribute_types)

# Initialize cat_features list
cat_features = []

# Iterate over attribute names and determine whether they are categorical or numerical
for attr in attribute_types:
    cat_features.append(attribute_types[attr] == 'Categorical')

del attr

print('boolean categorical features vector:\n', cat_features)


dictionary 'Attribute' : 'Numerical/Categorical' 
  {'Gender': 'Categorical', 'Race': 'Categorical', 'NumKids': 'Categorical', 'Height': 'Numerical', 'ShirtColor': 'Categorical', 'Age': 'Numerical', 'Accepted': 'Categorical'}
boolean categorical features vector:
 [True, True, True, False, True, False, True]


heom di distyton, using Nearest Neighbors to find the kNN


In [34]:
!pip install distython
from distython import HEOM
#from distython import HVDM
from sklearn.neighbors import NearestNeighbors



In [35]:
# FIRST OF ALL, Encode the dataframe so that the categorical attributes are converted to numbers, but still treated like categorical.
X = data_prep.df.drop(columns=['Group'], axis=1).copy()
#print(X)
encoder_dict = dict()
columns_categorical = X.columns[cat_features]

# for each categorical column, encode it
for column in columns_categorical:
  le = LabelEncoder()
  X[column] = le.fit_transform(X[column].values)
  mapping = dict(zip(le.classes_, range(len(le.classes_))))
  encoder_dict[column] = mapping
#print(X)

# convert it to numpy so that distython doesn't cry
X = X.to_numpy()


In [39]:
class HVDM():
    def __init__(self, X , label_ix, cat_ix, normalised="std"):
        self.cat_ix = cat_ix
        self.col_ix = [i for i in range(X.shape[1])]
        self.classes = np.unique(X[:, label_ix])
        #print( self.cat_ix, self.col_ix, self.classes, 'cose iniziali di HVDM')
        # Get the normalization scheme for numerical variables
        if normalised == "std":
            self.range = 4* np.nanstd(X, axis = 0)
        else:
            self.range = np.nanmax(X, axis = 0) - np.nanmin(X, axis = 0)

        #print(self.range, 'range')
        #it computes 4*std for every column, even for the categorical

        array_len = 0
        # Get the max no. of unique classes within columns to initialize the array
        for ix in self.cat_ix:
            max_val = len(np.unique(X[:, ix]))
            if max_val > array_len:
                array_len = max_val


        # Store the list of unique classes elements for each categorical column
        # self.col_ix is used here for clearer indices assignment
        self.unique_attributes = np.full((array_len, len(self.col_ix)), fill_value=-1)
        for ix in self.cat_ix:
            unique_vals = np.unique(X[:, ix])
            self.unique_attributes[0:len(unique_vals), ix] = unique_vals
        #print(self.unique_attributes, 'unique attr')

        # Declare the 3D numpy array which holds specifc count for each attribute
        # for each column for each output class
        # +1 in len(self.classes) + 1 is to store the sum (N_a,x) in the last element
        self.final_count = np.zeros((len(self.col_ix), self.unique_attributes.shape[0], len(self.classes) + 1))
        # For each column
        for i, col in enumerate(self.cat_ix):
            # For each attribute value in the column
            for j, attr in enumerate(self.unique_attributes[:, col]):
                # If attribute exists
                if attr != -1:
                    # For each output class value
                    for k, val in enumerate(self.classes):
                        # Get an attribute count for each output class
                        row_ixs = np.argwhere(X[:, col] == attr)
                        cnt = np.sum(X[row_ixs, label_ix] == val)
                        self.final_count[col, j, k] = cnt
                    # Get a sum of all occurences
                    self.final_count[col, j, -1] = np.sum(self.final_count[col, j, :])

    def hvdm(self, x, y):
        # Initialise results array
        results_array = np.zeros(x.shape)
        #print(results_array,'results_array')
        print('x inside hvdm', x)
        cat_ix = self.cat_ix
        print('y inside hvdm', y)

        for i in cat_ix:
            # Get indices to access the final_count array
            x_ix = np.argwhere(self.unique_attributes[:, i] == x[i]).flatten()
            print(x_ix, 'x_ix')
            y_ix = np.argwhere(self.unique_attributes[:, i] == y[i]).flatten()
            print(y_ix, 'y_ix')
            # Get the count to calculate the conditional probability
            N_ax = self.final_count[i, x_ix, -1].flatten()
            N_ay = self.final_count[i, y_ix, -1].flatten()
            N_axc = self.final_count[i, x_ix].flatten()
            N_ayc = self.final_count[i, y_ix].flatten()
            ##################################################
            temp_result = 1 ###################################
            ##################################################
            #the temp_result = 1 was not present in the original code, so when it arrived
            #in the line results_array[i] said that temp_result was not defined, and so it stopped the run
            if N_ax != 0 and N_ay != 0:
                temp_result = abs(N_axc/N_ax - N_ayc/N_ay)
                temp_result = np.sum(temp_result)
            else:
                print("Division by zero is not allowed!")
            results_array[i] = temp_result
            print(results_array[i], 'results array i')

        num_ix = np.setdiff1d(self.col_ix, self.cat_ix)
        # Calculate the distance for numerical elements
        results_array[num_ix] = np.abs(x[num_ix] - y[num_ix]) / self.range[num_ix]
        print(results_array, 'array with the distances between every attribute for instances x and y')

        # Return the final result
        # Square root is not computed in practice
        # As it doesn't change similarity between instances
        return np.sum(np.square(results_array))


In [40]:
categorical_ix = [0,1,2,4,6]
label_ix = [6]

# Instantiate the custom distance metric
metric_hvdm = HVDM(X, label_ix=label_ix, cat_ix=categorical_ix)

# Initialize the Nearest Neighbors model with the custom distance metric
neighbor = NearestNeighbors(metric=metric_hvdm.hvdm)

# Fit the Nearest Neighbors model to your dataset
neighbor.fit(X)
# Return 5-Nearest Neighbors to the 1st instance (row 1)
result = neighbor.kneighbors(X[0].reshape(1, -1), n_neighbors = 5)
print(result)

x inside hvdm [  0.30769231   0.69230769   1.76923077 170.05160601   1.38461538
  43.30769231   0.30769231]
y inside hvdm [  0.           1.           3.         159.64101864   2.
  61.           1.        ]
[] x_ix
[0] y_ix
Division by zero is not allowed!
1.0 results array i
[] x_ix
[1] y_ix
Division by zero is not allowed!
1.0 results array i
[] x_ix
[3] y_ix
Division by zero is not allowed!
1.0 results array i
[] x_ix
[2] y_ix
Division by zero is not allowed!
1.0 results array i
[] x_ix
[1] y_ix
Division by zero is not allowed!
1.0 results array i
[1.         1.         1.         0.23538722 1.         0.32850171
 1.        ] array with the distances between every attribute for instances x and y
x inside hvdm [  0.30769231   0.69230769   1.76923077 170.05160601   1.38461538
  43.30769231   0.30769231]
y inside hvdm [  0.           1.           3.         177.33054075   0.
  25.           0.        ]
[] x_ix
[0] y_ix
Division by zero is not allowed!
1.0 results array i
[] x_ix
[1] y

  if N_ax != 0 and N_ay != 0:
  if N_ax != 0 and N_ay != 0:
