In [9]:
from FLAI import data
from FLAI import causal_graph
import pandas as pd
import numpy as np
import math

In [2]:
import time



In [40]:
def fairness_eqa_eqi(flai_dataset, features=None, target_column=None, column_filter=None, plot=True):
    """
    Calculate fairness metrics for the data.

    Args:
    target_column (str, optional): The target column. If None, an exception is raised. Default is None.
    features (list, optional): List of column names to be used as features. Default is None.
    column_filter (list, optional): List of column names to be used as sensitive attributes. Default is None.
    plot (bool, optional): Whether to plot the results. Default is True.
    """

    if not target_column:
        raise ValueError("target_column is not provided")
    if not features:
        raise ValueError("features are not provided")
    if not column_filter:
        raise ValueError("column_filter is not provided")

    # Group data and calculate metrics in one step
    df_aux = (
        flai_dataset.data
        .groupby(column_filter + features)
        .agg(count=(target_column, 'count'),
             sum=(target_column, 'sum'),
             px=(target_column, lambda x: x.sum() / x.count()))
        .reset_index()
    )

    df_aux_ideal = (
        flai_dataset.data
        .groupby(features)
        .agg(count=(target_column, 'count'),
             sum=(target_column, 'sum'),
             px=(target_column, lambda x: x.sum() / x.count()))
        .reset_index()
    )
    df_aux_ideal['dx'] = [0] + (df_aux_ideal['count'].cumsum() / df_aux_ideal['count'].sum()).tolist()[:-1]

    # Prepare combinations
    combinations_s = flai_dataset.data[column_filter].drop_duplicates().values.tolist()
    combinations_f = flai_dataset.data[features].drop_duplicates().values.tolist()
    n_group = len(combinations_s)

    # Add px and dx values for each group
    for n, combo_s in enumerate(combinations_s):
        df_aux_ideal[f'px_{n}'] = 0
        df_aux_ideal[f'count_{n}'] = 0
        df_aux_ideal[f'dx_{n}'] = 0

        for _, row in df_aux.iterrows():
            if all(row[col] == val for col, val in zip(column_filter, combo_s)):
                matching_idx = (df_aux_ideal[features] == row[features].tolist()).all(axis=1)
                df_aux_ideal.loc[matching_idx, f'px_{n}'] = row['px']
                df_aux_ideal.loc[matching_idx, f'count_{n}'] = row['count']

        df_aux_ideal[f'dx_{n}'] = [0] + (df_aux_ideal[f'count_{n}'].cumsum() / df_aux_ideal[f'count_{n}'].sum()).tolist()[:-1]

    # Identify the group with the maximum px and minimum dx
    p_max, d_max, n_p = -1, float('inf'), -1
    for n in range(n_group):
        p_aux = df_aux_ideal[f'px_{n}'].max()
        d_aux = df_aux_ideal[f'dx_{n}'].max()
        if p_aux > p_max or (p_aux == p_max and d_aux < d_max):
            p_max, d_max, n_p = p_aux, d_aux, n

    # Compute fairness metrics
    results = []
    for n in range(n_group):
        if n != n_p:
            eqi = (df_aux_ideal[f'dx_{n_p}'] - df_aux_ideal[f'dx_{n}']).mean()
            eqa = (df_aux_ideal[f'px_{n_p}'] - df_aux_ideal[f'px_{n}']).mean()

            EQI = round(eqi, 2)
            EQA = round(eqa, 2)
            F = round(math.sqrt(EQA**2 + EQI**2), 2)

            results.append({
                'group': f'{column_filter}_{combinations_s[n]}',
                'reference': f'{column_filter}_{combinations_s[n_p]}',
                'EQI': EQI,
                'EQA': EQA,
                'F': F
            })

    df_f = pd.DataFrame(results)

    # Plot if required
    if plot:
        flai_dataset.plot_fairness_eqa_eqi(df_aux_ideal, n_group, [f'{column_filter}_{s}' for s in combinations_s])

    return df_f, df_aux_ideal


In [69]:
def fairness_eqa_eqi(flai_dataset, features = None, target_column = None, column_filter = None,plot = True):
        """
        Calculate fairness metrics for the data.

        Args:
        target_column (str, optional): The target column. If None, an exception is raised. Default is None.
        features (dict, optional): Dictionary with keys as column names as feature. Default is None.
        column_filter (dict, optional): Dictionary with keys as column names as sensible. Default is None.
        """

        if target_column is None:
            raise Exception("target_column is not provided")
        if features is None:
            raise Exception("features is not provided")
        if column_filter is None:
            raise Exception("predicted_column is not column_filter")
  
    
        df_aux = flai_dataset.data.groupby(by=column_filter + features).agg({target_column: ['count', 'sum']})
        df_aux_ideal = flai_dataset.data.groupby(by=features).agg({target_column: ['count', 'sum']})
        df_aux.columns = df_aux.columns.droplevel(0)
        df_aux = df_aux.reset_index()
        combinations_s = df_aux[column_filter].value_counts().index.values
        df_aux = df_aux.set_index(column_filter + features)

        df_aux_ideal.columns = df_aux_ideal.columns.droplevel(0)
        df_aux_ideal = df_aux_ideal.reset_index()
        combinations_f = df_aux_ideal[features].value_counts().index.values
        df_aux_ideal['px'] = df_aux_ideal['sum'] / df_aux_ideal['count']
        df_aux_ideal = df_aux_ideal.sort_values(by=['px']+features)
        df_aux_ideal = df_aux_ideal.set_index(features)
        df_aux_ideal['dx'] = [0] + (df_aux_ideal['count'].cumsum() / df_aux_ideal['count'].sum()).tolist()[:-1]
        
        df_aux['px'] = df_aux['sum'] / df_aux['count']

        n_group = combinations_s.shape[0]
        groups = [str(column_filter) + str(s) for s in combinations_s]
        combinations = [[s + f for s in combinations_s] for f in combinations_f]
        start_time = time.time()
        df_aux = df_aux.reset_index().set_index(features)
        
        for cs,n in zip(combinations_s,range(n_group)):
            condition = True
            for feature, value in zip(column_filter, cs):
                condition &= (df_aux[feature] == value)
            filtered_aux = df_aux[condition]
            df_aux_ideal = df_aux_ideal.merge(filtered_aux,how='outer',left_index=True,right_index=True,suffixes=["","_"+str(n)]).fillna(0)
            df_aux_ideal = df_aux_ideal.reset_index().sort_values(by=['px']+features)
            df_aux_ideal = df_aux_ideal.set_index(features)
            df_aux_ideal['dx_'+str(n)] = [0] + (df_aux_ideal['count_'+str(n)].cumsum() / df_aux_ideal['count_'+str(n)].sum()).tolist()[:-1]
                
        stop_time = time.time()
        print(stop_time - start_time)
        if plot:
            flai_dataset.plot_fairness_eqa_eqi(df_aux_ideal,n_group,groups)
        n_p = -1
        p_max = 0
        d_max = 0
        for n in range(n_group):
            p_aux = df_aux_ideal['px_'+str(n)].max()
            d_aux = df_aux_ideal['dx_'+str(n)].max()
            if p_aux > p_max:
                p_max = p_aux
                d_max = d_aux
                n_p = n
            elif p_aux == p_max:
                if d_aux < d_max:
                    p_max = p_aux
                    d_max = d_aux
                    n_p = n

        df_f = pd.DataFrame(columns = ['group','reference','EQI','EQA','F'])
        for n in range(n_group):
            if n != n_p:
                eqi = (df_aux_ideal['dx_'+str(n_p)] - df_aux_ideal['dx_'+str(n)]).values
                eqa = (df_aux_ideal['px_'+str(n_p)] - df_aux_ideal['px_'+str(n)]).values

                EQI = np.round(eqi.mean(),2)
                EQA = np.round(eqa.mean(),2)
                F = np.round(math.sqrt(EQA**2 + EQI**2),2)
                df_f.loc[n] = [groups[n],groups[n_p],EQI,EQA,F]
        return df_f,df_aux_ideal

In [70]:
rows = 1000
for n in [10]:

    df = pd.DataFrame()
    df['sex'] = np.random.randint(0,2,rows)
    features = []
    for n in range(n+1):
        df['feature_'+str(n)] = np.random.randint(0,10,rows)
        features.append('feature_'+str(n))
    df['label'] = np.random.randint(0,2,rows)

    flai_dataset = data.Data(df[features + ['sex', 'label']], transform=True)
    start_time = time.time()
    df_f,df_aux_ideal = fairness_eqa_eqi(flai_dataset,features = features, 
                                target_column = 'label', 
                                column_filter = ['sex'],
                                plot = False)
    stop_time = time.time()
    duration = stop_time - start_time
    print('rows: ',rows,' - features: ',n,' - duration: ',duration)

0.056899070739746094
rows:  1000  - features:  10  - duration:  0.10020780563354492


In [71]:
df_f

Unnamed: 0,group,reference,EQI,EQA,F
0,"['sex'](1.0,)","['sex'](0.0,)",-0.02,0.01,0.02


In [68]:
df_aux_ideal


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,count,sum,px,dx,sex,count_0,sum_0,px_0,dx_0,sex_1,count_1,sum_1,px_1,dx_1
feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
0.0,0.0,1.0,1.0,0.0,3.0,9.0,1.0,9.0,7.0,7.0,1,1.0,1.0,0.499,1.0,1.0,1.0,1.0,0.000000,0.0,0.0,0.0,0.0,0.000000
0.0,0.0,1.0,1.0,6.0,7.0,4.0,1.0,4.0,1.0,6.0,1,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.001992,0.0,1.0,0.0,0.0,0.000000
0.0,0.0,2.0,2.0,8.0,1.0,6.0,8.0,1.0,1.0,6.0,1,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.001992,0.0,1.0,0.0,0.0,0.002008
0.0,0.0,2.0,9.0,8.0,2.0,2.0,2.0,8.0,3.0,5.0,1,0.0,0.0,0.002,1.0,1.0,0.0,0.0,0.001992,0.0,0.0,0.0,0.0,0.004016
0.0,0.0,4.0,2.0,1.0,4.0,4.0,4.0,2.0,3.0,7.0,1,0.0,0.0,0.003,1.0,1.0,0.0,0.0,0.003984,0.0,0.0,0.0,0.0,0.004016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9.0,9.0,5.0,7.0,9.0,0.0,0.0,6.0,4.0,5.0,7.0,1,0.0,0.0,0.497,0.0,0.0,0.0,0.0,0.996016,0.0,1.0,0.0,0.0,0.993976
9.0,9.0,6.0,9.0,8.0,4.0,9.0,5.0,6.0,1.0,1.0,1,0.0,0.0,0.498,1.0,1.0,0.0,0.0,0.996016,0.0,0.0,0.0,0.0,0.995984
9.0,9.0,8.0,7.0,1.0,3.0,6.0,8.0,4.0,5.0,0.0,1,1.0,1.0,0.997,0.0,0.0,0.0,0.0,0.998008,0.0,1.0,1.0,1.0,0.995984
9.0,9.0,8.0,8.0,2.0,9.0,7.0,4.0,3.0,0.0,9.0,1,1.0,1.0,0.998,0.0,0.0,0.0,0.0,0.998008,0.0,1.0,1.0,1.0,0.997992


In [50]:
combinations[0]

[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.0, 0.0, 7.0, 9.0, 3.0, 1.0),
 (1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.0, 0.0, 7.0, 9.0, 3.0, 1.0)]

In [42]:
datos_f

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,count,sum,px,dx,px_0,count_0,dx_0,px_1,count_1,dx_1
0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,7.0,7.0,3.0,...,1,0.0,0.0,0.000,0,0,0.000000,0,1,0.000000
1,0.0,0.0,0.0,2.0,0.0,9.0,5.0,2.0,1.0,1.0,...,1,1.0,1.0,0.001,0,0,0.000000,1,1,0.001938
2,0.0,0.0,2.0,6.0,8.0,9.0,9.0,6.0,3.0,3.0,...,1,0.0,0.0,0.002,0,0,0.000000,0,1,0.003876
3,0.0,0.0,2.0,7.0,1.0,1.0,9.0,5.0,5.0,4.0,...,1,1.0,1.0,0.003,0,0,0.000000,1,1,0.005814
4,0.0,0.0,3.0,5.0,1.0,5.0,3.0,0.0,6.0,5.0,...,1,1.0,1.0,0.004,1,1,0.000000,0,0,0.007752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,9.0,9.0,3.0,5.0,9.0,4.0,8.0,1.0,8.0,6.0,...,1,0.0,0.0,0.995,0,1,0.995868,0,0,0.994186
996,9.0,9.0,4.0,0.0,3.0,9.0,3.0,2.0,4.0,8.0,...,1,0.0,0.0,0.996,0,0,0.997934,0,1,0.994186
997,9.0,9.0,6.0,5.0,6.0,2.0,8.0,5.0,1.0,3.0,...,1,1.0,1.0,0.997,1,1,0.997934,0,0,0.996124
998,9.0,9.0,7.0,2.0,7.0,7.0,7.0,5.0,4.0,6.0,...,1,1.0,1.0,0.998,0,0,1.000000,1,1,0.996124
