# Libraries

In [1]:
import numpy as np
np.set_printoptions(suppress=True)

import pandas as pd

import scipy
import scipy.stats as stats

import warnings
warnings.filterwarnings('ignore')

from scipy import optimize

# Methods

In [2]:
def auc(y, y_hat):
    """Area under the ROC curve via the rank-sum (Mann-Whitney U) statistic.

    Parameters
    ----------
    y : pandas.Series
        Binary labels; entries equal to 1 are positives, entries equal to 0
        are negatives.
    y_hat : pandas.Series
        Predicted scores, positionally matched with ``y``.

    Returns
    -------
    float
        ``U / (n_pos * n_neg)`` where ``U`` is the rank sum of the positive
        scores minus ``n_pos * (n_pos + 1) / 2``.
    """
    labels = y.values
    n_pos = np.sum(y == 1)
    n_neg = np.sum(y == 0)

    # rankdata assigns average ranks to tied scores (its default method).
    ranks = stats.rankdata(y_hat.values)
    rank_sum_pos = np.sum(ranks[labels == 1])

    u_stat = rank_sum_pos - (n_pos * (n_pos + 1)) / 2

    return u_stat / (n_pos * n_neg)

In [3]:
def disparate_impact(y_hat, s):
    """Symmetric disparate-impact ratio between the two groups in ``s``.

    Parameters
    ----------
    y_hat : pandas.Series
        Predicted scores.
    s : pandas.Series
        Group indicator; rows with ``s == 0`` and ``s == 1`` form the two groups.

    Returns
    -------
    float
        The smaller of the two ratios of group-mean scores, so the value does
        not depend on which group has the higher mean.
    """
    scores = y_hat.values
    groups = s.values

    mean_s0 = np.mean(scores[groups == 0])
    mean_s1 = np.mean(scores[groups == 1])

    return np.min([mean_s0 / mean_s1, mean_s1 / mean_s0])

In [4]:
def equality_of_opportunity(y_hat, y, s, out = 1):
    """Symmetric ratio of group-mean scores among rows whose label equals ``out``.

    Parameters
    ----------
    y_hat : pandas.Series
        Predicted scores.
    y : pandas.Series
        True labels.
    s : pandas.Series
        Group indicator (``1`` and ``0`` mark the two groups).
    out : int, default 1
        Label value the comparison is conditioned on.

    Returns
    -------
    float
        The smaller of the two ratios between the group means of ``y_hat``
        restricted to ``y == out``.
    """
    scores = y_hat.values
    has_outcome = y.values == out

    mean_s1 = np.mean(scores[(s.values == 1) & has_outcome])
    mean_s0 = np.mean(scores[(s.values == 0) & has_outcome])

    return np.min([mean_s1 / mean_s0, mean_s0 / mean_s1])

In [5]:
def sigmoid(w, X):
    """Logistic function applied to the row-wise linear score ``sum(w * X)``.

    Parameters
    ----------
    w : array-like
        Weight vector, broadcast against the columns of ``X``.
    X : pandas.DataFrame or 2-D array
        Feature matrix, one row per sample.

    Returns
    -------
    Same container type that ``np.sum(w * X, axis=1)`` yields (a Series when
    ``X`` is a DataFrame), holding one value in [0, 1] per row.
    """
    linear_score = np.sum(w * X, axis=1)
    return 1 / (1 + np.exp(-linear_score))

In [6]:
def logistic_loss(w, X, y):
    """Mean binary cross-entropy (base-2 logarithm) of the logistic model.

    Parameters
    ----------
    w : array-like
        Weight vector.
    X : pandas.DataFrame
        Feature matrix, one row per sample.
    y : pandas.Series
        Binary labels.

    Returns
    -------
    float
        ``-mean(y * log2(p) + (1 - y) * log2(1 - p))`` with ``p = sigmoid(w, X)``.
    """
    probs = sigmoid(w, X).values

    # NUMERICAL UNDERFLOW AND OVERFLOW
    # Probabilities that saturated to exactly 0 or 1 are nudged inside the
    # open interval so the logarithms stay finite. np.where builds a new
    # array instead of writing into ``.values``, which may be a read-only
    # view when pandas Copy-on-Write is active.
    probs = np.where(probs == 0, 0.00001, probs)
    probs = np.where(probs == 1, 0.99999, probs)

    targets = y.values
    per_sample = targets * np.log2(probs) + (1 - targets) * np.log2(1 - probs)

    return -np.mean(per_sample)

In [7]:
def performance_score(w, X, y, s):
    """Print predictive and fairness metrics for the weight vector ``w``.

    AUC and logistic loss are reported on all rows, on the rows with
    ``s == 1`` (labelled "Disc" in the output) and on the rows with ``s == 0``
    (labelled "Priv"), followed by disparate impact and equality of
    opportunity for both outcomes. Nothing is returned.

    Parameters
    ----------
    w : array-like
        Weight vector of the logistic model.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary labels.
    s : pandas.Series
        Group indicator aligned with ``X`` and ``y``.
    """
    disc_rows = s == 1
    priv_rows = s == 0

    X_disc, y_disc = X.loc[disc_rows, :], y[disc_rows]
    X_priv, y_priv = X.loc[priv_rows, :], y[priv_rows]

    scores_all = sigmoid(w, X)
    scores_disc = sigmoid(w, X_disc)
    scores_priv = sigmoid(w, X_priv)

    separator = '--------------------'

    print(separator)

    auc_all = auc(y, scores_all)
    auc_disc = auc(y_disc, scores_disc)
    auc_priv = auc(y_priv, scores_priv)
    print(f'AUC: {auc_all}, AUC Disc: {auc_disc}, AUC Priv: {auc_priv}')

    ll_all = logistic_loss(w, X, y)
    ll_disc = logistic_loss(w, X_disc, y_disc)
    ll_priv = logistic_loss(w, X_priv, y_priv)
    print(f'LL: {ll_all}, LL Disc: {ll_disc}, LL Priv: {ll_priv}')

    print('-----')

    di = disparate_impact(scores_all, s)
    print(f'Disparate impact {di}')

    eoo_pos = equality_of_opportunity(scores_all, y, s, 1)
    print(f'Equal opportunity for y=1 {eoo_pos}')

    eoo_neg = equality_of_opportunity(scores_all, y, s, 0)
    print(f'Equal opportunity for y=0 {eoo_neg}')

    print(separator)

# Baseline methods

In [8]:
# Zafar, M. B., Valera, I., Gomez-Rodriguez, M., & Gummadi, K. P. (2019). 
# Fairness Constraints: A Flexible Approach for Fair Classification. J. Mach. Learn. Res., 20(75), 1-42.

def avg_disp_imp_lower(w, s, X, t):
    """Constraint value ``t - mean((s - mean(s)) * (X @ w))``.

    Parameters
    ----------
    w : array-like
        Weight vector.
    s : pandas.Series or array
        Group indicator.
    X : pandas.DataFrame or 2-D array
        Feature matrix.
    t : float
        Threshold the mean centred-group/score product is compared against.

    Returns
    -------
    float
        Threshold minus the mean product of the centred group indicator and
        the linear score.
    """
    linear_score = np.sum(w * X, axis=1)
    centred_group = s - np.mean(s)
    mean_product = np.mean(centred_group * linear_score)

    return t - mean_product

def avg_disp_imp_upper(w, s, X, t):
    """Constraint value ``mean((s - mean(s)) * (X @ w)) - t``.

    This is the negation of ``avg_disp_imp_lower`` for the same arguments.

    Parameters
    ----------
    w : array-like
        Weight vector.
    s : pandas.Series or array
        Group indicator.
    X : pandas.DataFrame or 2-D array
        Feature matrix.
    t : float
        Threshold subtracted from the mean centred-group/score product.

    Returns
    -------
    float
        Mean product of the centred group indicator and the linear score,
        minus the threshold.
    """
    linear_score = np.sum(w * X, axis=1)
    centred_group = s - np.mean(s)
    mean_product = np.mean(centred_group * linear_score)

    return mean_product - t

def zafar_ll(X, y, s, t):
    """Fit logistic-regression weights under a single fairness constraint.

    Minimises ``logistic_loss`` with SLSQP, starting from an all-zero weight
    vector, subject to ``avg_disp_imp_lower(w, s, X, t) == 0``.

    NOTE(review): the constraint is registered with type ``'eq'``, so the
    optimiser drives the constraint function to zero rather than bounding it;
    ``avg_disp_imp_upper`` is not used here. Confirm this is intended.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary labels.
    s : pandas.Series
        Group indicator.
    t : float
        Threshold forwarded to the constraint function.

    Returns
    -------
    numpy.ndarray
        The ``x`` attribute of the optimiser result (one weight per column of
        ``X``). The result's success flag is not inspected.
    """
    n_features = X.shape[1]
    initial_w = np.zeros(n_features, dtype=int)

    fairness_constraint = {
        'type': 'eq',
        'fun': avg_disp_imp_lower,
        'args': (s, X, t),
    }

    result = optimize.minimize(
        fun=logistic_loss,
        x0=initial_w,
        args=(X, y),
        method='SLSQP',
        constraints=fairness_constraint,
    )

    return result.x

In [9]:
# Radovanović, S., Petrović, A., Delibašić, B., & Suknović, M. (2020, August). 
# Enforcing fairness in logistic regression algorithm. 
# In 2020 International Conference on INnovations in 
# Intelligent SysTems and Applications (INISTA) (pp. 1-7). IEEE.

# NOTE(review): this re-declares avg_disp_imp_lower, which is already defined
# with the same body in the earlier baseline cell; this later definition
# shadows the earlier one once the cell runs.
def avg_disp_imp_lower(w, s, X, t):
    """Constraint value ``t - mean((s - mean(s)) * (X @ w))``.

    Parameters
    ----------
    w : array-like
        Weight vector.
    s : pandas.Series or array
        Group indicator.
    X : pandas.DataFrame or 2-D array
        Feature matrix.
    t : float
        Threshold the mean centred-group/score product is compared against.

    Returns
    -------
    float
        Threshold minus the mean product of the centred group indicator and
        the linear score.
    """
    linear_score = np.sum(w * X, axis=1)
    centred_group = s - np.mean(s)
    mean_product = np.mean(centred_group * linear_score)

    return t - mean_product

def avg_equal_odds_lower(w, s, y, X, t):
    """Constraint value ``t - mean((s - mean(s)) * (y - mean(y)) * (X @ w))``.

    Parameters
    ----------
    w : array-like
        Weight vector.
    s : pandas.Series or array
        Group indicator.
    y : pandas.Series or array
        Labels.
    X : pandas.DataFrame or 2-D array
        Feature matrix.
    t : float
        Threshold the mean triple product is compared against.

    Returns
    -------
    float
        Threshold minus the mean product of the centred group indicator, the
        centred labels and the linear score.
    """
    linear_score = np.sum(w * X, axis=1)
    centred_group = s - np.mean(s)
    centred_label = y - np.mean(y)
    mean_product = np.mean(centred_group * centred_label * linear_score)

    return t - mean_product

def radovanovic_ll(X, y, s, t):
    """Fit logistic-regression weights under two fairness constraints.

    Minimises ``logistic_loss`` with SLSQP from an all-zero start, subject to
    ``avg_disp_imp_lower(w, s, X, t) == 0`` and
    ``avg_equal_odds_lower(w, s, y, X, t) == 0`` (both registered as ``'eq'``
    constraints with the same threshold ``t``).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary labels.
    s : pandas.Series
        Group indicator.
    t : float
        Threshold forwarded to both constraint functions.

    Returns
    -------
    numpy.ndarray
        The ``x`` attribute of the optimiser result. The result's success
        flag is not inspected.
    """
    n_features = X.shape[1]
    initial_w = np.zeros(n_features, dtype=int)

    disparate_impact_constraint = {
        'type': 'eq',
        'fun': avg_disp_imp_lower,
        'args': (s, X, t),
    }
    equal_odds_constraint = {
        'type': 'eq',
        'fun': avg_equal_odds_lower,
        'args': (s, y, X, t),
    }

    result = optimize.minimize(
        fun=logistic_loss,
        x0=initial_w,
        args=(X, y),
        method='SLSQP',
        constraints=(disparate_impact_constraint, equal_odds_constraint),
    )

    return result.x

In [10]:
# Kamishima, T., Akaho, S., Asoh, H., & Sakuma, J. (2012, September). 
# Fairness-aware classifier with prejudice remover regularizer. 
# In Joint European Conference on Machine Learning and Knowledge Discovery in Databases (pp. 35-50). 
# Springer, Berlin, Heidelberg.

In [11]:
# Radovanović, S., Petrović, A., Delibašić, B., & Suknović, M. (2020, August). 
# Enforcing fairness in logistic regression algorithm. 
# In 2020 International Conference on INnovations in 
# Intelligent SysTems and Applications (INISTA) (pp. 1-7). IEEE.

def logistic_loss_fairness(w, X, y, s, fair_reg = 0):
    """Logistic loss plus a squared group-gap penalty on the linear score.

    Parameters
    ----------
    w : array-like
        Weight vector.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary labels.
    s : pandas.Series
        Group indicator; rows with ``s == 1`` and ``s == 0`` form the groups.
    fair_reg : float, default 0
        Weight of the fairness penalty; ``0`` leaves only the logistic loss.

    Returns
    -------
    float
        Mean base-2 cross-entropy plus ``fair_reg`` times the squared
        difference between the two groups' mean linear scores.
    """
    probs = sigmoid(w, X).values

    # NUMERICAL UNDERFLOW AND OVERFLOW
    # Saturated probabilities (exactly 0 or 1) are moved inside the open
    # interval so the logarithms stay finite. np.where returns a new array,
    # avoiding an in-place write into ``.values``, which may be a read-only
    # view when pandas Copy-on-Write is active.
    probs = np.where(probs == 0, 0.00001, probs)
    probs = np.where(probs == 1, 0.99999, probs)

    targets = y.values
    ll = -np.mean(targets * np.log2(probs) + (1 - targets) * np.log2(1 - probs))

    # The penalty acts on the raw linear score, not on the probability.
    y_intensity = np.sum(w*X, axis=1)
    group_gap = np.mean(y_intensity[s == 1]) - np.mean(y_intensity[s == 0])
    reg = np.power(group_gap, 2)

    return ll + fair_reg * reg

def disparate_impact_ll(X, y, s, fair_reg):
    """Fit logistic-regression weights with the fairness-regularised loss.

    Minimises ``logistic_loss_fairness`` with SLSQP (no explicit constraints)
    starting from an all-zero weight vector.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary labels.
    s : pandas.Series
        Group indicator.
    fair_reg : float
        Weight of the fairness penalty passed to ``logistic_loss_fairness``.

    Returns
    -------
    numpy.ndarray
        The ``x`` attribute of the optimiser result. The result's success
        flag is not inspected.
    """
    n_features = X.shape[1]
    initial_w = np.zeros(n_features, dtype=int)

    result = optimize.minimize(
        fun=logistic_loss_fairness,
        x0=initial_w,
        args=(X, y, s, fair_reg),
        method='SLSQP',
    )

    return result.x

# Experiment

In [12]:
# Load the prepared dataset; the path is relative to the notebook's working
# directory. Later cells read the 'Income' and 'Sex' columns from this frame.
data = pd.read_csv('Data/adult_prepared.csv')

In [13]:
# 'Income' is the prediction target and 'Sex' the group indicator passed as
# ``s`` to the fairness functions; neither is used as a feature.
y = data['Income']
s = data['Sex']

X = data.drop(['Income', 'Sex'], axis=1)
# Standardise each column to zero mean and unit (population, ddof=0) standard
# deviation. The axis is spelled out because np.mean(DataFrame) forwards
# axis=None, which newer pandas versions reduce over the whole frame to a
# single scalar instead of giving per-column statistics.
X = (X - X.mean(axis=0)) / X.std(axis=0, ddof=0)

In [14]:
# Fit the single-constraint baseline (zafar_ll) with threshold t = -0.05;
# `model` holds the fitted weight vector, not an estimator object.
model = zafar_ll(X, y, s, -0.05)

# Print AUC, logistic loss and fairness metrics on the same data used for fitting.
performance_score(model, X, y, s)

--------------------
AUC: 0.8021491669586759, AUC Disc: 0.8448590534520922, AUC Priv: 0.814884237713094
LL: 0.7386303284218942, LL Disc: 0.7826909886818015, LL Priv: 0.7168507335774948
-----
Disparate impact 0.9628725303088113
Equal opportunity for y=1 0.9170574132346296
Equal opportunity for y=0 0.8011741182744901
--------------------


In [15]:
# Fit the two-constraint baseline (radovanovic_ll) with threshold t = -0.05;
# this rebinds `model` to the new weight vector.
model = radovanovic_ll(X, y, s, -0.05)

# Print AUC, logistic loss and fairness metrics on the same data used for fitting.
performance_score(model, X, y, s)

--------------------
AUC: 0.8280853453075672, AUC Disc: 0.8700992875742508, AUC Priv: 0.8313974627559227
LL: 0.7599214228722774, LL Disc: 0.7934963680198919, LL Priv: 0.7433250146949044
-----
Disparate impact 0.9713169432110834
Equal opportunity for y=1 0.942662358303743
Equal opportunity for y=0 0.9078670709897037
--------------------


In [16]:
# Fit the penalty-based model (disparate_impact_ll) with fairness weight
# fair_reg = 3; this rebinds `model` to the new weight vector.
model = disparate_impact_ll(X, y, s, 3)

# Print AUC, logistic loss and fairness metrics on the same data used for fitting.
performance_score(model, X, y, s)

--------------------
AUC: 0.7795108969985584, AUC Disc: 0.81902070993569, AUC Priv: 0.8020007216315302
LL: 0.7753768841471327, LL Disc: 0.864847241265488, LL Priv: 0.7311508530997805
-----
Disparate impact 0.8931455348044774
Equal opportunity for y=1 0.8930415018808973
Equal opportunity for y=0 0.7417189402795096
--------------------
