This work is licensed under the Creative Commons Attribution 4.0 Unported License. To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/.

#### Importing libraries

In [None]:
import fairsearchcore as fsc
from fairsearchcore.models import FairScoreDoc
import math
import sys
import warnings
warnings.filterwarnings("ignore")

#### Loading variables from another notebook

In [None]:
%store -r df_u1
%store -r queries_fr_l
%store -r queries_fr_v
%store -r queries_es_l
%store -r queries_es_v
%store -r queries_uk_l
%store -r queries_uk_v

%store -r df_u2
%store -r intersection
%store -r f_queries_co
%store -r f_queries_mx
%store -r f_queries_es
%store -r intersection

#### Building model from ranking information

In [None]:
def build_ranking_model(is_protected_list):
    """Turn a list of protected flags into a list of FairScoreDoc items.

    The flag at 0-based position ``i`` becomes
    ``FairScoreDoc(size - i, size - i, flag)`` where ``size`` is the length of
    the input, so earlier positions receive the larger values. The output
    keeps the input order.
    """
    total = len(is_protected_list)
    return [
        FairScoreDoc(total - position, total - position, flag)
        for position, flag in enumerate(is_protected_list)
    ]

#### Determining if a ranking is fair or not given the model

In [None]:
def is_ranking_fair(ranking, p, k, alpha=0.1):
    """Run the fairsearchcore fairness check on ``ranking``.

    Builds ``fsc.Fair(k, p, alpha)`` and returns what its ``is_fair`` method
    reports for ``ranking``.

    Parameter notes carried over from the original cell:
      k     -- number of topK elements returned (value should be between 10 and 400)
      p     -- proportion of protected candidates in the topK elements
               (value should be between 0.02 and 0.98)
      alpha -- significance level (value should be between 0.01 and 0.15)
    """
    checker = fsc.Fair(k, p, alpha)
    verdict = checker.is_fair(ranking)
    return verdict

#### Functions to determine if a person is part of the protected group or not (depending on the discrimination type)

In [None]:
def sex_discrimination(row, protected_sex='female'):
    """Tell whether the 'sex' field of ``row`` equals ``protected_sex``."""
    row_sex = row['sex']
    return row_sex == protected_sex

def calculate_age_median(df):
    """Return the median of the 'age' column of ``df``."""
    ages = df['age']
    return ages.median()

def old_age_discrimination(row, median):
    """Protected-group test for older people: the row's age is at or above ``median``."""
    # Delegates to the generic age comparison with the '>=' comparator.
    return age_discrimination(row, median, '>=')

def young_age_discrimination(row, median):
    """Protected-group test for younger people: the row's age is below ``median``."""
    # Delegates to the generic age comparison with the '<' comparator.
    return age_discrimination(row, median, '<')

def age_discrimination(row, median, comparator):
    """Compare the 'age' field of ``row`` against ``median``.

    Args:
        row: mapping-like object with an 'age' entry.
        median: value the age is compared against.
        comparator: comparison operator given as text; one of
            '<', '<=', '>', '>=', '==', '!='.

    Returns:
        bool: result of ``row['age'] <comparator> median``.

    Raises:
        ValueError: if ``comparator`` is not one of the supported operators.
    """
    import operator

    # The comparison used to be assembled as a string and passed to eval(),
    # which executes arbitrary expressions and fails with NameError on NaN
    # ages (their text form "nan" is not a Python name). A lookup table keeps
    # the textual comparator interface without evaluating any code.
    comparisons = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '==': operator.eq,
        '!=': operator.ne,
    }
    try:
        compare = comparisons[str(comparator).strip()]
    except KeyError:
        raise ValueError("Unsupported comparator: {!r}".format(comparator))
    return bool(compare(row['age'], median))


def translate_if_foreigner(key):
    """Map the label 'foreigner' to True and any other label to False."""
    return key == 'foreigner'

def nationality_discrimination(row, protected_value='foreigner'):
    """Tell whether the row's 'is_foreigner' field matches the protected group.

    ``protected_value`` is a label; it is converted to a boolean by
    ``translate_if_foreigner`` before being compared with the field.
    """
    wanted_flag = translate_if_foreigner(protected_value)
    return row['is_foreigner'] == wanted_flag


def translate_if_has_photo(key):
    """Map the label 'with photo' to True and any other label to False."""
    return key == 'with photo'

def photo_discrimination(row, protected_value='with photo'):
    """Tell whether the row's 'has_photo' field matches the protected group.

    ``protected_value`` is a label; it is converted to a boolean by
    ``translate_if_has_photo`` before being compared with the field.
    """
    wanted_flag = translate_if_has_photo(protected_value)
    return row['has_photo'] == wanted_flag


def translate_if_is_premium(key):
    """Map the label 'premium' to True and any other label to False."""
    return key == 'premium'

def premium_discrimination(row, protected_value='premium'):
    """Tell whether the row's 'is_premium' field matches the protected group.

    ``protected_value`` is a label; it is converted to a boolean by
    ``translate_if_is_premium`` before being compared with the field.
    """
    wanted_flag = translate_if_is_premium(protected_value)
    return row['is_premium'] == wanted_flag

#### Evaluating if the ranking is fair or not for different values of proportion

In [None]:
def get_fairness_metrics(df, queries, source, country, discrimination_function, positional_arguments, n, tag_name='query'):
    """Print, per query, the largest tested proportion p for which the top-n ranking is fair.

    For every query (in sorted order) the rows of ``df`` matching the query,
    ``source`` and ``country`` are sorted by 'ranking' and cut to the first
    ``n``. Queries with fewer than ``n`` rows are skipped. Each remaining row
    is classified with ``discrimination_function``; the flags are turned into
    a ranking with ``build_ranking_model`` and checked with
    ``is_ranking_fair`` for each value in a fixed list of proportions. The
    largest proportion that passes is printed next to the query, or "-" when
    none passes.

    Args:
        df: DataFrame with the ``tag_name``, 'source', 'country' and 'ranking'
            columns plus whatever ``discrimination_function`` reads.
        queries: iterable of query values to evaluate.
        source: value matched against the 'source' column.
        country: value matched against the 'country' column.
        discrimination_function: row classifier. The two age classifiers are
            called with a median age; every other classifier is called with
            ``positional_arguments['protected_value']``.
        positional_arguments: dict of extra arguments (only 'protected_value'
            is read here).
        n: ranking size; also passed as ``k`` to ``is_ranking_fair``.
        tag_name: column holding the query value.

    Returns:
        None. Results are printed.
    """
    p_values = [0.02, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.98]

    # A ranking size of 0 or infinity (no query had enough rows) cannot be
    # used to slice the data frame, so there is nothing to evaluate.
    if not 0 < n < float('inf'):
        print("No usable ranking size (n = {}); skipping.".format(n))
        return

    print("{: >15} {: >10}".format("Query","p = True"))
    print('          '+'-'*20)

    uses_age = discrimination_function in (old_age_discrimination, young_age_discrimination)
    if uses_age:
        # NOTE(review): the median age is a fixed constant here rather than
        # being computed from the data; confirm this is intended.
        median = 35
        print("Median age= ",median)

    def is_protected(row):
        # Age classifiers take the median; all others take the protected label.
        if uses_age:
            return bool(discrimination_function(row, median))
        return bool(discrimination_function(row, positional_arguments['protected_value']))

    for q in sorted(queries):
        matches = df[(df[tag_name] == q) & (df['source'] == source) & (df['country'] == country)]
        top_n = matches.sort_values('ranking')[:n]
        if len(top_n) != n:
            # Not enough rows for a ranking of the requested size.
            continue

        is_protected_list = [is_protected(top_n.iloc[i]) for i in range(len(top_n))]
        ranking = build_ranking_model(is_protected_list)

        is_fair_list = [bool(is_ranking_fair(ranking, p, n)) for p in p_values]
        # Walk the proportions from largest to smallest and keep the first fair one.
        p_max = next(
            (p for p, fair in zip(reversed(p_values), reversed(is_fair_list)) if fair),
            "-",
        )

        # Long query names are shortened for display only.
        label = q[:11] + '.' if len(q) > 15 else q
        print("{: >15} {: >10}".format(label, p_max))

#### Query with the fewest elements

In [None]:
def get_min_size_elements(df, queries, source, country, min_elements=10, tag_name='query'):
    """Return the smallest per-query row count that is at least ``min_elements``.

    For each query the rows of ``df`` whose ``tag_name`` column equals the
    query and whose 'source' and 'country' columns equal the given values are
    counted. Counts below ``min_elements`` are ignored.

    Args:
        df: DataFrame with ``tag_name``, 'source' and 'country' columns.
        queries: iterable of query values.
        source: value matched against the 'source' column.
        country: value matched against the 'country' column.
        min_elements: smallest count that is taken into account.
        tag_name: column holding the query value.

    Returns:
        int: the smallest qualifying count, or 0 when no query qualifies.
        (An infinite sentinel used to be returned in that case, which passed
        callers' ``min_size > 0`` checks and could not be used as a slice
        bound.)
    """
    # The source/country part of the filter does not depend on the query.
    scope_mask = (df['source'] == source) & (df['country'] == country)

    qualifying_sizes = []
    for q in queries:
        size = int((scope_mask & (df[tag_name] == q)).sum())
        if size >= min_elements:
            qualifying_sizes.append(size)

    return min(qualifying_sizes) if qualifying_sizes else 0

#### Data collected from the LinkedIn and Viadeo sites, ignoring unknown values

In [None]:
# One filtered frame per attribute, keeping only rows where that attribute is known.
# Null checks use notnull(): comparing a pandas Series with `!= None` yields True for
# every row (null ones included), so it never removed anything.
df_u1_sex_filtered = df_u1[df_u1['sex'].notnull() & (df_u1['sex'] != 'unknown')]
# notnull() alone selects the rows with an age; AND-ing the mask with the age values
# themselves added nothing except dropping rows whose age is 0.
df_u1_age_filtered = df_u1[df_u1['age'].notnull()]
df_u1_nationality_filtered = df_u1[
    df_u1['is_foreigner'].notnull()
    & (df_u1['is_foreigner'] != 'unknown')
    & (df_u1['is_foreigner'] != 'tooshort')
]
df_u1_photo_filtered = df_u1[df_u1['has_photo'].notnull()]
df_u1_premium_filtered = df_u1[df_u1['is_premium'].notnull()]

#### Getting metrics for the different data sources and countries considered

In [None]:
def print_fairness_statistics_u1(country):
    """Print the fairness tables of one country for every protected group and source.

    ``country`` must be 'Spain', 'United Kingdom' or 'France'; any other value
    prints "Wrong country" and returns. For each protected group (female, old
    people, young people, foreigner people, not foreigner people) and for each
    source (Linkedin, then Viadeo) the function prints a header, computes the
    ranking size with ``get_min_size_elements`` and prints the table produced
    by ``get_fairness_metrics``.

    Returns:
        None. Everything is printed.
    """
    if country == 'Spain':
        queries_l, queries_v = queries_es_l, queries_es_v
    elif country == 'United Kingdom':
        queries_l, queries_v = queries_uk_l, queries_uk_v
    elif country == 'France':
        queries_l, queries_v = queries_fr_l, queries_fr_v
    else:
        print("Wrong country")
        return

    # (label printed in the header, value of the 'source' column, queries of that source)
    sources = (
        ("Linkedin", 'linkedin', queries_l),
        ("Viadeo", 'viadeo', queries_v),
    )
    # (filtered data, protected-group label, row classifier, classifier arguments)
    groups = (
        (df_u1_sex_filtered, "female", sex_discrimination,
         {'df': df_u1, 'protected_value': 'female'}),
        (df_u1_age_filtered, "old people", old_age_discrimination,
         {'df': df_u1}),
        (df_u1_age_filtered, "young people", young_age_discrimination,
         {'df': df_u1}),
        (df_u1_nationality_filtered, "foreigner people", nationality_discrimination,
         {'df': df_u1, 'protected_value': 'foreigner'}),
        (df_u1_nationality_filtered, "not foreigner people", nationality_discrimination,
         {'df': df_u1, 'protected_value': 'not foreigner'}),
    )

    for filtered_df, group_label, discrimination_function, arguments in groups:
        for source_label, source, queries in sources:
            print("Source: {} \nCountry: ".format(source_label), country)
            min_size = get_min_size_elements(filtered_df, queries, source=source, country=country)
            print("Ranking size: ", min_size)
            print("Protected group: {}\n".format(group_label))
            # Each call receives its own copy of the argument dict.
            get_fairness_metrics(filtered_df, queries, source, country,
                                 discrimination_function, dict(arguments), min_size)
    

#### Showing metrics

In [None]:
# SPAIN
sys.stdout = open('fairness_spain_u1.txt', 'w')
print_fairness_statistics_u1(country='Spain')

# FRANCE
sys.stdout = open('fairness_france_u1.txt', 'w')
print_fairness_statistics_u1(country='France')

# UK
sys.stdout = open('fairness_uk_u1.txt', 'w')
print_fairness_statistics_u1(country='United Kingdom')

#### Other general statistics

In [None]:
# Share of profiles with / without a photo among rows where 'has_photo' is known.
with_photo = len(df_u1_photo_filtered[df_u1_photo_filtered['has_photo'] == True]) / len(df_u1_photo_filtered)
without_photo = len(df_u1_photo_filtered[df_u1_photo_filtered['has_photo'] == False]) / len(df_u1_photo_filtered)
# Sanity check that the two shares add up to 1. Rounding the sum to an integer
# accepted anything between 0.5 and 1.5, so compare with a float tolerance instead.
print(math.isclose(with_photo + without_photo, 1.0))
print("Has photo: ", round(with_photo,2))

# Share of premium / non-premium profiles among rows where 'is_premium' is known.
is_premium = len(df_u1_premium_filtered[df_u1_premium_filtered['is_premium'] == True]) / len(df_u1_premium_filtered)
is_not_premium = len(df_u1_premium_filtered[df_u1_premium_filtered['is_premium'] == False]) / len(df_u1_premium_filtered)
print(math.isclose(is_premium + is_not_premium, 1.0))
print("Is premium: ", round(is_premium,2))

#### Data gathered from Top doctors ignoring unknown values 

In [None]:
# only doctors whose speciality is part of the intersection of the 3 countries
df_u2 = df_u2[df_u2['speciality'].isin(intersection)] 

df_u2_sex_filtered = df_u2[(df_u2['sex']!=None) & (df_u2['sex']!='unknown')]
df_u2_age_filtered = df_u2[df_u2.age.notnull() & df_u2.age.dropna()]
df_u2_nationality_filtered = df_u2[df_u2.is_foreigner.notnull() & (df_u2['is_foreigner']!='unknown') & (df_u2['is_foreigner']!='tooshort')]

#### Getting metrics for the different countries

In [None]:
def print_fairness_statistics_u2(country):
    """Print the Top doctors fairness tables of one country for every protected group.

    ``country`` must be 'Spain', 'Colombia' or 'Mexico'; any other value
    prints "Wrong country" and returns. For each protected group (female, old
    people, young people, foreigner people, not foreigner people) the function
    prints a header, computes the ranking size with ``get_min_size_elements``
    (queries are matched on the 'speciality' column) and prints the table
    produced by ``get_fairness_metrics``. A group is skipped after its header
    when no speciality has enough rows to form a ranking.

    Returns:
        None. Everything is printed.
    """
    if country == 'Spain':
        queries = f_queries_es
    elif country == 'Colombia':
        queries = f_queries_co
    elif country == 'Mexico':
        queries = f_queries_mx
    else:
        print("Wrong country")
        return

    # (filtered data, label line, row classifier, classifier arguments)
    # The age groups were labelled "Protected sex"; they are protected groups.
    # The 'df' entry now refers to the data set analysed here (df_u2).
    groups = (
        (df_u2_sex_filtered, "Protected sex: female\n", sex_discrimination,
         {'df': df_u2, 'protected_value': 'female'}),
        (df_u2_age_filtered, "Protected group: old people\n", old_age_discrimination,
         {'df': df_u2}),
        (df_u2_age_filtered, "Protected group: young people\n", young_age_discrimination,
         {'df': df_u2}),
        (df_u2_nationality_filtered, "Protected group: foreigner people\n", nationality_discrimination,
         {'df': df_u2, 'protected_value': 'foreigner'}),
        (df_u2_nationality_filtered, "Protected group: not foreigner people\n", nationality_discrimination,
         {'df': df_u2, 'protected_value': 'not foreigner'}),
    )

    for filtered_df, label_line, discrimination_function, arguments in groups:
        print("Source: Top doctors \nCountry: ",country)
        min_size = get_min_size_elements(filtered_df, queries, source='top doctors', country=country, tag_name='speciality')

        # No speciality has enough rows: the size is reported either as 0 or
        # as an infinite sentinel, and neither can be used as a ranking size.
        if not 0 < min_size < float('inf'):
            continue

        print("Ranking size: ",min_size)
        print(label_line)
        get_fairness_metrics(filtered_df, queries, 'top doctors', country,
                             discrimination_function, dict(arguments), min_size,
                             tag_name='speciality')
    

#### Showing metrics

In [None]:
# SPAIN
sys.stdout = open('fairness_spain_u2.txt', 'w')
print_fairness_statistics_u2(country='Spain')

# COLOMBIA
sys.stdout = open('fairness_colombia_u2.txt', 'w')
print_fairness_statistics_u2(country='Colombia')

# MEXICO
sys.stdout = open('fairness_mexico_u2.txt', 'w')
print_fairness_statistics_u2(country='Mexico')

#### Other general statistics

In [None]:
# Share of doctors that were / were not awarded.
was_awarded = len(df_u2[df_u2['was_awarded'] == True]) / len(df_u2)
was_not_awarded = len(df_u2[df_u2['was_awarded'] == False]) / len(df_u2)
# Sanity check that the two shares add up to 1. Rounding the sum to an integer
# accepted anything between 0.5 and 1.5, so compare with a float tolerance instead.
print(math.isclose(was_awarded + was_not_awarded, 1.0))
print("Was awarded: ", round(was_awarded,2))

# Share of doctors with / without a photo.
with_photo = len(df_u2[df_u2['has_photo'] == True]) / len(df_u2)
without_photo = len(df_u2[df_u2['has_photo'] == False]) / len(df_u2)
print(math.isclose(with_photo + without_photo, 1.0))
print("Has photo: ", round(with_photo,2))