<a href="https://colab.research.google.com/github/qahtanaa/OnSubGroupFairness/blob/main/German_distances_taxonomies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

___________________________________
Gower is faster to compute than HEOM and HVDM.
With HVDM I obtain results closer to those of the FAWOS paper, probably because of how the paper selects numerical/categorical attributes.
___________________________________

In [None]:
%reset -f

In [None]:
!pip install aif360



In [None]:
import numpy as np
import pandas as pd
from aif360.metrics import utils
from scipy.sparse import issparse
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [None]:
# Colab-only: mount Google Drive at /content/gdrive (a later cell appends a Drive folder to sys.path)
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
class DataPreparation():
  """
  Preprocessing helper for a tabular dataset with sensitive attributes and a binary decision label.

  The wrapped DataFrame (self.df) can be cleaned of rows with missing values, have its label
  recoded to 0/1, have its attributes classified as categorical or numerical, and receive a
  'Group' column that encodes every combination of sensitive-attribute values and label as an
  integer id.
  """
  def __init__(self, df, sensitive, label, priv, unpriv, fav, unfav, categorical=None):
    """
    Construct all necessary attributes for the data preparation.

    df : (pandas DataFrame) containing the data
    sensitive : (list(str)) specifying the column names of all sensitive features
    label : (str) specifying the label column
    priv : (list(dicts)) representation of the privileged groups
    unpriv : (list(dicts)) representation of the unprivileged groups
    fav : (str/int/..) value representing the favorable label
    unfav : (str/int/..) value representing the unfavorable label
    categorical : (list(str)) (optional) specifying column names of categorical features;
                  when omitted, no column is forced to be categorical
    """
    self.df = df
    self.sensitive = sensitive
    self.label = label
    self.priv = priv
    self.unpriv = unpriv
    self.fav = fav
    self.unfav = unfav
    # Normalise the optional argument so membership tests never hit None
    self.categorical = [] if categorical is None else categorical
    # Filled in by prepare(); None until then
    self.reverse_group_mapping = None
    self.attribute_types = None


  def detect_missing_values(self):
    """
    Detect rows with missing values and remove them from the DataFrame.

    self.df is rebound to the frame returned by dropna(); the number of removed rows is printed.
    """
    initial_rows = len(self.df)
    self.df = self.df.dropna()
    removed_rows = initial_rows - len(self.df)

    if removed_rows > 0:
      print(f"Detected {removed_rows} rows with missing values. Removed them.")
    else:
      print("No missing values detected.") #pass

  def binary_label(self):
    """
    Check that the decision label is made of two values, change it as a binary representation
    where favorable label = 1, unfavorable label = 0.

    When the label column does not have exactly two unique values only a message is printed
    and the column is left untouched.
    """
    number_label_values = self.df[self.label].nunique()
    if number_label_values == 2:
      print(f"The '{self.label}' column has only two unique values.")
      # Assign the recoded column back to the frame. Calling replace(..., inplace=True) on the
      # selected column is a chained in-place update, which pandas Copy-on-Write does not
      # propagate to self.df.
      self.df[self.label] = self.df[self.label].replace([self.unfav, self.fav], [0, 1])
    else:
      print(f"The '{self.label}' column does not have exactly two unique values, as it should.")

  def find_categorical_attributes(self):
    """
    Classify every column except 'Group' as 'Categorical' or 'Numerical'.

    A column is categorical when the user listed it in self.categorical, when it has exactly
    two unique values, or when at most 99% of its values can be converted with float().
    An empty column falls back to its dtype.

    Returns a dict {column name: 'Categorical' | 'Numerical'} in DataFrame column order.
    """
    attribute_types = {}
    # Minimum share of float-convertible values for a column to count as numerical
    thresh = 0.99

    for column in self.df.columns:
      # Skip the 'Group' column
      if column == 'Group':
        continue
      # User-declared categorical columns and two-valued columns are categorical
      if column in self.categorical or self.df[column].nunique() == 2:
        attribute_types[column] = 'Categorical'
        continue

      values = self.df[column]
      num_att_in_column = len(values)
      if num_att_in_column == 0:
        # No values to inspect (and no ratio to compute): rely on the dtype
        is_numeric = pd.api.types.is_numeric_dtype(values)
        attribute_types[column] = 'Numerical' if is_numeric else 'Categorical'
        continue

      num_float = 0
      for value in values:
        try:
          float(value)
        except (ValueError, TypeError):
          # Text raises ValueError, None and other non-scalars raise TypeError: not a number
          continue
        num_float += 1

      if num_float / num_att_in_column > thresh:
        attribute_types[column] = 'Numerical'
      else:
        attribute_types[column] = 'Categorical'
    return attribute_types


  def create_group_column(self):
    """
    Create a 'Group' column in the DataFrame that encodes, as an integer id, the combination of
    the sensitive-attribute values and the label of every row.

    The ids enumerate the full cartesian product of the unique values of the sensitive columns
    and of the label column, so combinations absent from the data still receive an id. The
    sensitive and label columns are kept in the DataFrame. The enumerated combinations are
    printed.

    Returns a dict {group id: (sensitive values..., label value)}.
    """
    group_combinations = pd.MultiIndex.from_product(
        [self.df[sensitive].unique() for sensitive in self.sensitive] + [self.df[self.label].unique()],
        names=self.sensitive + [self.label],
    )
    print(list(enumerate(group_combinations)))
    # Mapping between group ids and their combinations, in both directions
    reverse_group_mapping = dict(enumerate(group_combinations))
    group_mapping = {group: idx for idx, group in reverse_group_mapping.items()}
    # Apply the mapping to create a new column in the DataFrame
    self.df['Group'] = pd.MultiIndex.from_frame(self.df[self.sensitive + [self.label]]).map(group_mapping)

    return reverse_group_mapping

  def prepare(self):
    """
    Perform all preprocessing steps: drop rows with missing values, binarise the label, add the
    'Group' column and classify the attributes.

    The group mapping and the attribute types are kept in self.reverse_group_mapping and
    self.attribute_types. Returns self.
    """
    self.detect_missing_values()
    self.binary_label()
    self.reverse_group_mapping = self.create_group_column()
    self.attribute_types = self.find_categorical_attributes()
    #self.normalization()
    #self.make_numerical()
    return self

In [None]:
# ########### CODE TO TRY THE DATAPREPARATION CLASS WITH A SMALL DATAFRAME

# # Create a small DataFrame
# np.random.seed(42)
# n = 5000
# data = {
#     'Gender': np.random.choice(['F', 'M'], size=n),
#     'Race': np.random.choice(['B', 'W'], size=n),
#     'NumKids': np.random.choice(range(5), size=n),
#     'Height': np.random.uniform(150, 190, size=n),
#     'ShirtColor': np.random.choice(['Red', 'Blue', 'Green'], size=n),
#     'Age': np.random.randint(18, 65, size=n),
#     'Accepted': np.random.choice([1, 2], size=n)
# }

# df = pd.DataFrame(data)

# # Introduce missing values randomly
# missing_fraction = 0.05  # Adjust this fraction based on the desired percentage of missing values

# # Randomly select cells and set their values to NaN
# mask = np.random.rand(*df.shape) < missing_fraction
# df[mask] = np.nan
# print(df)

# del mask
# del missing_fraction

# #count the number of columns in the dataframe
# num_of_columns = len(df.columns)
# print("Number of columns:", num_of_columns)

# #use the DataPreparation class to preprocess the dataframe
# data_prep = DataPreparation(df, ['Gender','Race'], 'Accepted', ['M','W'], ['F', 'B'], 2, 1, ['NumKids'])
# data_prep.prepare()

# # use this reset index to have indices from 0 to len(df)
# data_prep.df = data_prep.df.reset_index(drop=True)
# print(data_prep.df)



In [None]:
!pip install ucimlrepo



In [None]:
from ucimlrepo import fetch_ucirepo

# fetch dataset (UCI id 144; the metadata printed below names it 'Statlog (German Credit Data)')
statlog_german_credit_data = fetch_ucirepo(id=144)

# data (as pandas dataframes): X holds the feature columns, y the target column(s)
X = statlog_german_credit_data.data.features
y = statlog_german_credit_data.data.targets

# metadata
print(statlog_german_credit_data.metadata)

# variable information
print(statlog_german_credit_data.variables)

{'uci_id': 144, 'name': 'Statlog (German Credit Data)', 'repository_url': 'https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data', 'data_url': 'https://archive.ics.uci.edu/static/public/144/data.csv', 'abstract': 'This dataset classifies people described by a set of attributes as good or bad credit risks. Comes in two formats (one all numeric). Also comes with a cost matrix', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1000, 'num_features': 20, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Other', 'Marital Status', 'Age', 'Occupation'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5NC77', 'creators': ['Hans Hofmann'], 'intro_paper': None, 'additional_info': {'summary': 'Two datasets are provided.  the original dataset, in the form provided by

In [None]:
# Define a function to convert age values to categories
def convert_age_to_category(age):
    """Encode an age as 1 ('Adult', 25 or older) or 2 ('Youth', anything else)."""
    return 1 if age >= 25 else 2

def convert_sex(sex):
  """
  Map a 'personal status and sex' code (Attribute9) to 'female' or 'male'.

  The Statlog German Credit coding uses A92 (female: divorced/separated/married) and
  A95 (female: single) for women; every other code is treated as 'male'.
  """
  # A95 must be listed too, otherwise single women would be labelled 'male'
  if sex in ('A92', 'A95'):
    return 'female'
  return 'male'

# Recode the two sensitive attributes on a copy of the feature frame:
# Attribute9 becomes 'female'/'male' and Attribute13 becomes the age category 1/2
X_new = X.copy()
X_new['Attribute9'] = X['Attribute9'].apply(convert_sex) #Attribute9 == sex
X_new['Attribute13'] = X['Attribute13'].apply(convert_age_to_category) #Attribute13 == age

# Append the target column(s) y to the right of the recoded features
df = pd.concat([X_new, y], axis=1)
#print(df.iloc[0:50, 8])


#count the number of columns in the dataframe
num_of_columns = len(df.columns)
print("Number of columns:", num_of_columns)

Number of columns: 21


In [None]:
#use the DataPreparation class to preprocess the dataframe
# Sensitive attributes: Attribute9 (sex) and Attribute13 (age category: 1 = 25 or older, 2 = younger).
# The unprivileged age value is the integer 2 produced by convert_age_to_category, not the string '2'.
# Label 'class': 1 is the favorable value, 2 the unfavorable one.
data_prep = DataPreparation(df, ['Attribute9','Attribute13'], 'class', ['male', 1], ['female', 2], 1, 2, ['Attribute1'])
data_prep.prepare()

# use this reset index to have indices from 0 to len(df)
data_prep.df = data_prep.df.reset_index(drop=True)
print(data_prep.df)

No missing values detected.
The 'class' column has only two unique values.
[(0, ('male', 1, 1)), (1, ('male', 1, 0)), (2, ('male', 2, 1)), (3, ('male', 2, 0)), (4, ('female', 1, 1)), (5, ('female', 1, 0)), (6, ('female', 2, 1)), (7, ('female', 2, 0))]
    Attribute1  Attribute2 Attribute3 Attribute4  Attribute5 Attribute6  \
0          A11           6        A34        A43        1169        A65   
1          A12          48        A32        A43        5951        A61   
2          A14          12        A34        A46        2096        A61   
3          A11          42        A32        A42        7882        A61   
4          A11          24        A33        A40        4870        A61   
..         ...         ...        ...        ...         ...        ...   
995        A14          12        A32        A42        1736        A61   
996        A11          30        A32        A41        3857        A61   
997        A14          12        A32        A43         804        A61  

In [None]:
# Classify every attribute, then force Attribute18 to be treated as numerical
attribute_types = data_prep.find_categorical_attributes()
attribute_types.update({'Attribute18': 'Numerical'})

print("dictionary 'Attribute' : 'Numerical/Categorical' \n ", attribute_types)

# Boolean vector in dictionary order: True where the attribute is categorical
cat_features = [kind == 'Categorical' for kind in attribute_types.values()]

print('boolean categorical features vector:\n', cat_features)


dictionary 'Attribute' : 'Numerical/Categorical' 
  {'Attribute1': 'Categorical', 'Attribute2': 'Numerical', 'Attribute3': 'Categorical', 'Attribute4': 'Categorical', 'Attribute5': 'Numerical', 'Attribute6': 'Categorical', 'Attribute7': 'Categorical', 'Attribute8': 'Numerical', 'Attribute9': 'Categorical', 'Attribute10': 'Categorical', 'Attribute11': 'Numerical', 'Attribute12': 'Categorical', 'Attribute13': 'Categorical', 'Attribute14': 'Categorical', 'Attribute15': 'Categorical', 'Attribute16': 'Numerical', 'Attribute17': 'Categorical', 'Attribute18': 'Numerical', 'Attribute19': 'Categorical', 'Attribute20': 'Categorical', 'class': 'Categorical'}
boolean categorical features vector:
 [True, False, True, True, False, True, True, False, True, True, False, True, True, True, True, False, True, False, True, True, True]


In [None]:
# Rebuild the 'Group' column and keep the returned {group id: (sensitive values..., label)} mapping
reverse_group_mapping = data_prep.create_group_column()

[(0, ('male', 1, 1)), (1, ('male', 1, 0)), (2, ('male', 2, 1)), (3, ('male', 2, 0)), (4, ('female', 1, 1)), (5, ('female', 1, 0)), (6, ('female', 2, 1)), (7, ('female', 2, 0))]


In [None]:
# Number of possible groups: one per combination of sensitive-attribute values and label
theoretical_num_groups = len(reverse_group_mapping)

In [None]:
!pip install gower
!pip install distython
from distython import HEOM
import gower
from sklearn.neighbors import NearestNeighbors



In [None]:
import sys
# Make modules stored in this Drive folder importable (presumably where the HVDM module imported in the next cell lives).
# NOTE(review): the path is hard-coded to one user's Drive layout - adjust it when running elsewhere.
sys.path.append('/content/gdrive/MyDrive/code from Github - Annalisa/SolutionForTheProblem')

In [None]:
import HVDM

In [None]:
# FIRST OF ALL, Encode the dataframe so that the categorical attributes are converted to numbers, but still treated like categorical.
# NOTE: X is rebound here. From this cell on it holds every column of the prepared frame except 'Group'
# (the label column 'class' is kept), no longer the raw feature frame fetched from UCI.
X = data_prep.df.drop(columns=['Group'], axis=1).copy()
#print(X)
# encoder_dict[column] = {original category value: integer code}, kept so the codes can be decoded later
encoder_dict = dict()
# cat_features is a boolean mask aligned with the columns of X
columns_categorical = X.columns[cat_features]

# for each categorical column, replace its values with integer codes from a fresh LabelEncoder
for column in columns_categorical:
  le = LabelEncoder()
  X[column] = le.fit_transform(X[column].values)
  mapping = dict(zip(le.classes_, range(len(le.classes_))))
  encoder_dict[column] = mapping
#print(X)

# Invert the boolean values in the list using list comprehension
numerical_features = [not feature for feature in cat_features]
# Use the inverted list to select numerical columns
columns_numerical = X.columns[numerical_features]

# int columns to float, otherwise gower has problems
for column in columns_numerical:
  X[column] = X[column].astype(float)

# convert it to numpy so that distython doesn't cry
X = X.to_numpy()
# Sentinel and positional indices of the categorical columns; the distance functions below rebuild both locally
nan_eqv = 12345
cat_attr_ix = [i for i, value in enumerate(cat_features) if value]

GOWER DISTANCE

In [None]:
def gower_distance(ndary, cat_features_boolean, n=5):
  """
  Find the nearest neighbours of every row of ndary under the Gower distance.

  ndary: ndarray, the dataframe X previously converted to numpy; it has all columns besides the
        Group one (yes, it also has the label column)
  cat_features_boolean: array of T/F where True = the attribute is categorical, False = the
        attribute is numerical
  n: number of closest neighbors wanted for each row (default 5)

  Returns (indices, values): two arrays with one row per sample, built from the 'index' and
  'values' entries returned by gower.gower_topn. n+1 neighbours are requested per row because
  the query row itself belongs to the searched data.
  """
  X = ndary
  cat_features = cat_features_boolean
  # n+1 because the queried point is itself part of the data it is compared against
  n_with_self = n + 1

  # One top-n query per row. The requested size follows the n argument instead of a
  # hard-coded 6, and no full pairwise matrix is built since only the top-n results are used.
  matrix_neighbors_gower = [
      gower.gower_topn(X[i:i+1, :], X, cat_features = cat_features, n = n_with_self)
      for i in range(X.shape[0])
  ]

  # Convert the list of dictionaries to a matrix
  matrix_neighbors_gower_index = np.array([item['index'] for item in matrix_neighbors_gower])
  matrix_neighbors_gower_values = np.array([item['values'] for item in matrix_neighbors_gower])

  return matrix_neighbors_gower_index, matrix_neighbors_gower_values

In [None]:
# Positions of the True entries of cat_features, i.e. the indices of the categorical columns
cat_attr_ix = np.flatnonzero(cat_features).tolist()
print(cat_attr_ix)

[0, 2, 3, 5, 6, 8, 9, 11, 12, 13, 14, 16, 18, 19, 20]


HEOM DISTANCE

In [None]:
def heom_distance(ndary, cat_features_boolean, n=5):
  """
  Find the nearest neighbours of every row of ndary under the HEOM distance.

  ndary: ndarray with one row per sample (categorical columns already encoded as numbers)
  cat_features_boolean: array of T/F where True = the attribute is categorical, False = the
        attribute is numerical
  n: number of closest neighbors wanted for each row (default 5)

  Returns (indices, distances): two arrays of shape (len(ndary), n+1) as returned by
  NearestNeighbors.kneighbors. n+1 neighbours are requested because every queried row is
  itself part of the fitted data.
  """
  X = ndary
  # n+1 because the queried point is itself part of the fitted data
  n_with_self = n + 1

  # distython expects the positional indices of the categorical columns
  cat_attr_ix = [i for i, value in enumerate(cat_features_boolean) if value]
  # Value that distython should treat as "missing"
  nan_eqv = 12345

  # Build the HEOM metric object for this data
  matrix_dist_HEOM = HEOM(X, cat_attr_ix, nan_equivalents = [nan_eqv])

  # Declare NearestNeighbor and link the metric
  neighbor = NearestNeighbors(metric = matrix_dist_HEOM.heom)

  # Fit the model which uses the custom distance metric
  neighbor.fit(X)

  # Query all rows in one call; the neighbourhood size follows the n argument
  # instead of a hard-coded 6
  nearest_distances_heom, nearest_indices_heom = neighbor.kneighbors(X, n_neighbors=n_with_self)

  return nearest_indices_heom, nearest_distances_heom

HVDM DISTANCE


In [None]:
def hvdm_distance(ndary, y_ix, cat_features_boolean, n=5):
  """
  Find the nearest neighbours of every row of ndary under the HVDM distance.

  ndary: ndarray with one row per sample (categorical columns already encoded as numbers)
  y_ix: [k], a 1-element list holding the column index of the target variable/label in ndary
  cat_features_boolean: array of T/F where True = the attribute is categorical, False = the
        attribute is numerical
  n: number of closest neighbors wanted for each row (default 5)

  Returns (indices, distances): two arrays of shape (len(ndary), n+1) as returned by
  NearestNeighbors.kneighbors. n+1 neighbours are requested because every queried row is
  itself part of the fitted data.
  """
  X = ndary
  # n+1 because the queried point is itself part of the fitted data
  n_with_self = n + 1

  # Positional indices of the categorical columns, derived from the boolean vector
  cat_attr_ix = [i for i, value in enumerate(cat_features_boolean) if value]
  # Value that the metric should treat as "missing"
  nan_eqv = 12345

  # Build the HVDM metric object for this data
  matrix_dist_HVDM = HVDM(X,y_ix, cat_attr_ix, nan_equivalents = [nan_eqv], normalised="std")

  # Declare NearestNeighbor and link the metric
  neighbor = NearestNeighbors(metric = matrix_dist_HVDM.hvdm)

  # Fit the model which uses the custom distance metric
  neighbor.fit(X)

  # Query all rows in one call; the neighbourhood size follows the n argument
  # instead of a hard-coded 6
  nearest_distances_hvdm, nearest_indices_hvdm = neighbor.kneighbors(X, n_neighbors=n_with_self)

  return nearest_indices_hvdm, nearest_distances_hvdm

In [None]:
from HVDM import HVDM

In [None]:
# to select one of the 3 distances
# y_ix is the column index of the label inside X and is only read by HVDM.
# 'class' is the last of the 21 columns, i.e. index 20 (index 6 would be Attribute7).
y_ix = [20]
distance = 'GOWER' #'HEOM', 'GOWER', 'HVDM'
print(distance)
if distance == 'HVDM':
  closest_index_hvdm, closest_values_hvdm = hvdm_distance(X, y_ix, cat_features,5)
  print(closest_index_hvdm, '\n',closest_values_hvdm)
elif distance == 'HEOM':
  closest_index_heom, closest_values_heom = heom_distance(X, cat_features,5)
  print(closest_index_heom, '\n',closest_values_heom)
elif distance == 'GOWER':
  closest_index_gower, closest_values_gower = gower_distance(X, cat_features,5)
  print(closest_index_gower, '\n',closest_values_gower)
else:
  # Fail loudly on a typo instead of silently computing nothing
  raise ValueError(f"Unknown distance: {distance!r}")

GOWER
[[  0  61 135 807 177 115]
 [  1 521 308 677  39 576]
 [  2 527 422 234 943 204]
 ...
 [997 877  46 786 625  90]
 [998 624 457  68 528 938]
 [999  23 893  97 553 145]] 
 [[0.         0.10250484 0.13137184 0.13181204 0.1450345  0.14683488]
 [0.         0.07586175 0.1356008  0.16059665 0.18456069 0.19185579]
 [0.         0.1023941  0.11409287 0.11815969 0.11839288 0.1761779 ]
 ...
 [0.         0.05586566 0.06834293 0.07539721 0.08454462 0.09572545]
 [0.         0.1319969  0.18587708 0.19684684 0.19797613 0.21262182]
 [0.         0.12561046 0.16823977 0.18275754 0.18860207 0.1952234 ]]


In [None]:
# to select one of the 3 distances
# y_ix is the column index of the label inside X and is only read by HVDM.
# 'class' is the last of the 21 columns, i.e. index 20 (index 6 would be Attribute7).
y_ix = [20]
distance = 'HEOM' #'HEOM', 'GOWER', 'HVDM'
print(distance)
if distance == 'HVDM':
  closest_index_hvdm, closest_values_hvdm = hvdm_distance(X, y_ix, cat_features,5)
  print(closest_index_hvdm, '\n',closest_values_hvdm)
elif distance == 'HEOM':
  closest_index_heom, closest_values_heom = heom_distance(X, cat_features,5)
  print(closest_index_heom, '\n',closest_values_heom)
elif distance == 'GOWER':
  closest_index_gower, closest_values_gower = gower_distance(X, cat_features,5)
  print(closest_index_gower, '\n',closest_values_gower)
else:
  # Fail loudly on a typo instead of silently computing nothing
  raise ValueError(f"Unknown distance: {distance!r}")

HEOM
[[  0  61 135 807 115 214]
 [  1 521 308  39 677 724]
 [  2 527 234 943 422 204]
 ...
 [997 877  46 786 625  52]
 [998 624 457  68 528 938]
 [999  23 893  97 553 525]] 
 [[0.         2.01792731 2.45224517 2.45240285 2.51016903 2.76448836]
 [0.         1.21771646 1.9688551  2.97584356 3.12495118 3.35220321]
 [0.         1.57046104 1.90365224 1.9039737  2.11503199 3.00109357]
 ...
 [0.         0.59259598 1.13133424 1.14333596 1.4505708  1.55619064]
 [0.         2.27049263 3.35381108 4.01751935 4.01814961 4.10842481]
 [0.         2.25877444 3.13316429 3.28032656 3.36679005 3.44619779]]


In [None]:
# Pick one of the 3 distances and compute the neighbours of every row with it
y_ix = [20]  # column index of the label inside X, read by HVDM only
distance = 'HVDM' #'HEOM', 'GOWER, 'HVDM'
print(distance)
if distance == 'GOWER':
  closest_index_gower, closest_values_gower = gower_distance(X, cat_features, 5)
  print(closest_index_gower, '\n', closest_values_gower)
elif distance == 'HEOM':
  closest_index_heom, closest_values_heom = heom_distance(X, cat_features, 5)
  print(closest_index_heom, '\n', closest_values_heom)
elif distance == 'HVDM':
  closest_index_hvdm, closest_values_hvdm = hvdm_distance(X, y_ix, cat_features, 5)
  print(closest_index_hvdm, '\n', closest_values_hvdm)

HVDM
[[  0 177  61 713 250 463]
 [  1 569 706 444 677 295]
 [  2 734 499 160 941 275]
 ...
 [997 453  90 440 533 532]
 [998 528 194 607  11 813]
 [999  98 893 996 669 293]] 
 [[0.         0.04575456 0.12412325 0.15793985 0.18956856 0.23474701]
 [0.         0.20345042 0.24195905 0.30036409 0.31042924 0.36198608]
 [0.         0.33344272 0.36588976 0.3715587  0.39370213 0.41527487]
 ...
 [0.         0.11639186 0.12384068 0.12455554 0.12684267 0.13163233]
 [0.         0.17366521 0.17510369 0.25622068 0.30298542 0.33737299]
 [0.         0.23185263 0.25876459 0.3150544  0.35088757 0.3777941 ]]


In [None]:
# matrices index close to each other
#COMPARISON
#print('HVDM \t\t\t HEOM \t\t\t GOWER')
#for i in range(len(closest_index_hvdm)):
#    print(f"{closest_index_hvdm[i]}\t{closest_index_heom[i]}\t{closest_index_gower[i]}")


FIND THE TAXONOMY - S,B,R,O (Safe, Borderline, Rare, Outlier)

In [None]:
# Alias (not a copy) of the prepared frame, which includes the 'Group' column
complete_df = data_prep.df
print(complete_df)

    Attribute1  Attribute2 Attribute3 Attribute4  Attribute5 Attribute6  \
0          A11           6        A34        A43        1169        A65   
1          A12          48        A32        A43        5951        A61   
2          A14          12        A34        A46        2096        A61   
3          A11          42        A32        A42        7882        A61   
4          A11          24        A33        A40        4870        A61   
..         ...         ...        ...        ...         ...        ...   
995        A14          12        A32        A42        1736        A61   
996        A11          30        A32        A41        3857        A61   
997        A14          12        A32        A43         804        A61   
998        A11          45        A32        A43        1845        A61   
999        A12          45        A34        A41        4576        A62   

    Attribute7  Attribute8 Attribute9 Attribute10  ...  Attribute13  \
0          A75           4  

In [None]:
# matrices index close to each other
# Show the neighbour indices of the first rows side by side for the three distances.
# The preview length is explicit instead of "len - 900", which only gave 100 rows for a
# 1000-row dataset and nothing at all for shorter ones.
N_PREVIEW_ROWS = 100
print('HVDM \t\t\t HEOM \t\t\t GOWER')
for i in range(min(N_PREVIEW_ROWS, len(closest_index_hvdm))):
    print(f"{closest_index_hvdm[i]}\t{closest_index_heom[i]}\t{closest_index_gower[i]}")


HVDM 			 HEOM 			 GOWER
[  0 177  61 713 250 463]	[  0  61 135 807 115 214]	[  0  61 135 807 177 115]
[  1 569 706 444 677 295]	[  1 521 308  39 677 724]	[  1 521 308 677  39 576]
[  2 734 499 160 941 275]	[  2 527 234 943 422 204]	[  2 527 422 234 943 204]
[  3 501 392 239  42 684]	[  3 460 239 316 286 501]	[  3 460 239 316 501 286]
[  4 853 429 649 814 334]	[  4 522 814 545 624 853]	[  4 522 814 545 624 853]
[  5 971 685 829 791 854]	[  5 685 881 681  87 829]	[  5 685 881 681  87 511]
[  6 786 928 742  46 164]	[  6 164 786 283  46 452]	[  6 164 786 283  46 452]
[  7 815 730 142  51 322]	[  7 425 291 937 511 224]	[  7 511 291 425 937 224]
[  8 786 361 816 695   6]	[  8 430  84 629 995 848]	[  8 430  84 995 629 478]
[  9 320  35 578 771 237]	[  9 320 578 128 474  13]	[  9 320 578 128 474  13]
[ 10 747 601 878 172 364]	[ 10 952 500 809 398 831]	[ 10 952 500 809 747 398]
[ 11 813 194 446 998 528]	[ 11 504 922 471 359 926]	[ 11 922 504 471 359 926]
[ 12 480 576 600 176 692]	[ 12 410 931 1

In [None]:
# Discard column 0 of every neighbour matrix: the closest point to i is i itself
closest_index_gower_NN = closest_index_gower[:, 1:]
closest_index_hvdm_NN = closest_index_hvdm[:, 1:]
closest_index_heom_NN = closest_index_heom[:, 1:]

print(closest_index_heom_NN)

[[ 61 135 807 115 214]
 [521 308  39 677 724]
 [527 234 943 422 204]
 ...
 [877  46 786 625  52]
 [624 457  68 528 938]
 [ 23 893  97 553 525]]


In [None]:
# gower
# Count, for every group, how many samples are Safe / Borderline / Rare / Outlier.
# The result is a [#GROUPS, #TAXONOMY] matrix, taxonomy order S,B,R,O.

# Column position of each taxonomy category
SAFE, BORDERLINE, RARE, OUTLIER = range(4)

matrix_taxonomy = np.zeros((theoretical_num_groups, 4), dtype=int)

# Group id of every row, looked up once and indexed by row position
group_of = complete_df['Group'].to_numpy()

for sample, neighbors_indices in enumerate(closest_index_gower_NN):
    sample_group = group_of[sample]

    # Neighbours that belong to the same group as the sample
    same_group_neighbor_indices = [
        neighbor_index for neighbor_index in neighbors_indices
        if group_of[neighbor_index] == sample_group
    ]
    same_group_neighbors = len(same_group_neighbor_indices)

    if same_group_neighbors in (4, 5):
        classification = SAFE
    elif same_group_neighbors in (2, 3):
        classification = BORDERLINE
    elif same_group_neighbors == 1:
        # Look at the neighbourhood of the only same-group neighbour: the sample is
        # Rare when that neighbourhood holds 0 or 1 points of the sample's group
        solo_neighbor = same_group_neighbor_indices[0]
        neighbors_of_solo_neighbor = closest_index_gower_NN[solo_neighbor]
        same_class_neighbors = 0
        for neighbor_index in neighbors_of_solo_neighbor:
            if group_of[neighbor_index] == sample_group:
                same_class_neighbors += 1
        classification = RARE if same_class_neighbors <= 1 else BORDERLINE
    elif same_group_neighbors == 0:
        classification = OUTLIER
    else:
        # Any other count is treated as Borderline
        classification = BORDERLINE

    matrix_taxonomy[sample_group, classification] += 1

# One row per group id, one column per taxonomy label
taxonomy_labels = ['S', 'B', 'R', 'O']
taxonomy_df_gower = (
    pd.DataFrame(matrix_taxonomy, columns=taxonomy_labels)
    .assign(Group=range(theoretical_num_groups))
    .set_index('Group')
)

# Display the DataFrame
taxonomy_df_gower


Unnamed: 0_level_0,S,B,R,O
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,428,31,0,0
1,73,87,4,2
2,5,28,1,6
3,3,16,2,4
4,62,86,1,4
5,23,43,1,6
6,15,28,5,0
7,16,19,0,1


In [None]:
# heom
# Count, for every group, how many samples are Safe / Borderline / Rare / Outlier.
# The result is a [#GROUPS, #TAXONOMY] matrix, taxonomy order S,B,R,O.

# Column position of each taxonomy category
SAFE, BORDERLINE, RARE, OUTLIER = range(4)

matrix_taxonomy = np.zeros((theoretical_num_groups, 4), dtype=int)

# Group id of every row, looked up once and indexed by row position
group_of = complete_df['Group'].to_numpy()

for sample, neighbors_indices in enumerate(closest_index_heom_NN):
    sample_group = group_of[sample]

    # Neighbours that belong to the same group as the sample
    same_group_neighbor_indices = [
        neighbor_index for neighbor_index in neighbors_indices
        if group_of[neighbor_index] == sample_group
    ]
    same_group_neighbors = len(same_group_neighbor_indices)

    if same_group_neighbors in (4, 5):
        classification = SAFE
    elif same_group_neighbors in (2, 3):
        classification = BORDERLINE
    elif same_group_neighbors == 1:
        # Look at the neighbourhood of the only same-group neighbour: the sample is
        # Rare when that neighbourhood holds 0 or 1 points of the sample's group
        solo_neighbor = same_group_neighbor_indices[0]
        neighbors_of_solo_neighbor = closest_index_heom_NN[solo_neighbor]
        same_class_neighbors = 0
        for neighbor_index in neighbors_of_solo_neighbor:
            if group_of[neighbor_index] == sample_group:
                same_class_neighbors += 1
        classification = RARE if same_class_neighbors <= 1 else BORDERLINE
    elif same_group_neighbors == 0:
        classification = OUTLIER
    else:
        # Any other count is treated as Borderline
        classification = BORDERLINE

    matrix_taxonomy[sample_group, classification] += 1

# One row per group id, one column per taxonomy label
taxonomy_labels = ['S', 'B', 'R', 'O']
taxonomy_df_heom = (
    pd.DataFrame(matrix_taxonomy, columns=taxonomy_labels)
    .assign(Group=range(theoretical_num_groups))
    .set_index('Group')
)

# Display the DataFrame
taxonomy_df_heom


Unnamed: 0_level_0,S,B,R,O
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,429,30,0,0
1,79,82,2,3
2,7,26,1,6
3,1,17,2,5
4,61,86,1,5
5,24,42,1,6
6,15,28,5,0
7,16,19,0,1


In [None]:
# hvdm
# Count, for every group, how many samples are Safe / Borderline / Rare / Outlier.
# The result is a [#GROUPS, #TAXONOMY] matrix, taxonomy order S,B,R,O.

# Column position of each taxonomy category
SAFE, BORDERLINE, RARE, OUTLIER = range(4)

matrix_taxonomy = np.zeros((theoretical_num_groups, 4), dtype=int)

# Group id of every row, looked up once and indexed by row position
group_of = complete_df['Group'].to_numpy()

for sample, neighbors_indices in enumerate(closest_index_hvdm_NN):
    sample_group = group_of[sample]

    # Neighbours that belong to the same group as the sample
    same_group_neighbor_indices = [
        neighbor_index for neighbor_index in neighbors_indices
        if group_of[neighbor_index] == sample_group
    ]
    same_group_neighbors = len(same_group_neighbor_indices)

    if same_group_neighbors in (4, 5):
        classification = SAFE
    elif same_group_neighbors in (2, 3):
        classification = BORDERLINE
    elif same_group_neighbors == 1:
        # Look at the neighbourhood of the only same-group neighbour: the sample is
        # Rare when that neighbourhood holds 0 or 1 points of the sample's group
        solo_neighbor = same_group_neighbor_indices[0]
        neighbors_of_solo_neighbor = closest_index_hvdm_NN[solo_neighbor]
        same_class_neighbors = 0
        for neighbor_index in neighbors_of_solo_neighbor:
            if group_of[neighbor_index] == sample_group:
                same_class_neighbors += 1
        classification = RARE if same_class_neighbors <= 1 else BORDERLINE
    elif same_group_neighbors == 0:
        classification = OUTLIER
    else:
        # Any other count is treated as Borderline
        classification = BORDERLINE

    matrix_taxonomy[sample_group, classification] += 1

# One row per group id, one column per taxonomy label
taxonomy_labels = ['S', 'B', 'R', 'O']
taxonomy_df_hvdm = (
    pd.DataFrame(matrix_taxonomy, columns=taxonomy_labels)
    .assign(Group=range(theoretical_num_groups))
    .set_index('Group')
)

# Display the DataFrame
taxonomy_df_hvdm


Unnamed: 0_level_0,S,B,R,O
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,326,130,2,1
1,84,81,1,0
2,0,13,10,17
3,1,2,10,12
4,11,73,37,32
5,2,49,14,8
6,0,27,11,10
7,8,20,3,5


In [None]:
# Put the three taxonomy tables side by side; column blocks are ordered HVDM, HEOM, GOWER
concatenated_df = pd.concat(
    [taxonomy_df_hvdm, taxonomy_df_heom, taxonomy_df_gower],
    axis='columns',
)
print('\t HVDM, \t HEOM, \t GOWER')
# Display the concatenated DataFrame
concatenated_df


	 HVDM, 	 HEOM, 	 GOWER


Unnamed: 0_level_0,S,B,R,O,S,B,R,O,S,B,R,O
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,326,130,2,1,429,30,0,0,428,31,0,0
1,84,81,1,0,79,82,2,3,73,87,4,2
2,0,13,10,17,7,26,1,6,5,28,1,6
3,1,2,10,12,1,17,2,5,3,16,2,4
4,11,73,37,32,61,86,1,5,62,86,1,4
5,2,49,14,8,24,42,1,6,23,43,1,6
6,0,27,11,10,15,28,5,0,15,28,5,0
7,8,20,3,5,16,19,0,1,16,19,0,1
