# Libraries

In [1]:
import numpy as np
np.set_printoptions(suppress=True)

import pandas as pd

import scipy
import scipy.stats as stats

import warnings
warnings.filterwarnings('ignore')

from scipy import optimize

# Methods

In [2]:
def auc(y, y_hat):
    '''
    Area under the ROC curve, computed from the Mann-Whitney U statistic.

    Parameters:
    y : pandas.Series
        Binary outcome; entries equal to 1 are positives, entries equal to 0 are negatives
    y_hat : pandas.Series
        Predicted scores, in the same positional order as y

    Returns:
        U statistic of the positive class divided by the number of
        (positive, negative) pairs
    '''
    labels = y.values
    positive_mask = labels == 1

    n_pos = np.sum(positive_mask)
    n_neg = np.sum(labels == 0)

    # Rank all scores jointly (rankdata averages the ranks of tied scores by default)
    ranks = stats.rankdata(y_hat.values)
    rank_sum_pos = np.sum(ranks[positive_mask])

    # Subtract the smallest rank sum the positives could possibly have
    u_stat = rank_sum_pos - (n_pos * (n_pos + 1)) / 2

    return u_stat / (n_pos * n_neg)

In [3]:
def disparate_impact(y_hat, s):
    '''
    Symmetric disparate impact: the smaller of the two ratios between the
    mean predictions of the groups s == 0 and s == 1.

    Parameters:
    y_hat : pandas.Series
        Predicted scores
    s : pandas.Series
        Sensitive attribute vector (values 0 and 1), positionally aligned with y_hat
    '''
    scores, group = y_hat.values, s.values

    mean_priv = np.mean(scores[group == 0])
    mean_disc = np.mean(scores[group == 1])

    # Take the ratio in both directions so the result does not depend on
    # which group has the higher mean prediction
    return np.minimum(mean_priv / mean_disc, mean_disc / mean_priv)

In [4]:
def equality_of_opportunity(y_hat, y, s, out=1):
    '''
    Ratio of mean predictions between the groups s == 1 and s == 0,
    restricted to the rows whose true outcome equals `out`.

    Parameters:
    y_hat : pandas.Series
        Predicted scores
    y : pandas.Series
        Outcome vector
    s : pandas.Series
        Sensitive attribute vector (values 0 and 1)
    out : scalar
        Outcome value that selects the rows to compare (default 1)
    '''
    scores, labels, group = y_hat.values, y.values, s.values
    has_outcome = labels == out

    mean_disc = np.mean(scores[(group == 1) & has_outcome])
    mean_priv = np.mean(scores[(group == 0) & has_outcome])

    return mean_disc / mean_priv

In [5]:
def sigmoid(w, X):
    '''
    Logistic function of the linear score of every row of X.

    Parameters:
    w : ndarray
        Coefficients followed by one extra trailing entry; the trailing entry is ignored here
    X : pandas.DataFrame
        DataFrame of input attributes, one column per coefficient

    Returns:
        Row-wise values of 1 / (1 + exp(-score))
    '''
    coefficients = w[:-1]

    # Weight every column by its coefficient, then add up across columns
    linear_score = (coefficients * X).sum(axis=1)

    return 1 / (1 + np.exp(-linear_score))

# Kalai Fair Logistic Regression

In [6]:
def kalai_fair_logistic_regression(w, X, y, s):
    '''
    Objective of the Kalai bargaining formulation of Logistic Regression.
    The logistic loss is not minimised directly. The quantity being minimised is
    the auxiliary variable stored in the last position of w; the per-group
    logistic losses enter the problem through separate constraint functions.

    Parameters:
    w : ndarray
        Coefficients of the logistic regression with an additional variable
    X : pandas.DataFrame
        DataFrame of input attributes (not used by the objective)
    y : ndarray
        Outcome vector (not used by the objective)
    s : ndarray
        Sensitive attribute vector (not used by the objective)

    Returns:
        The last element of w
    '''

    auxiliary_variable = w[-1]
    return auxiliary_variable

def constraint_logistic_loss(w, X, y, s):
    '''
    Inequality constraints tying the auxiliary variable to the logistic loss
    of the discriminated (s == 1) and privileged (s == 0) groups.

    Each returned entry is (mean base-2 log-likelihood of the group) + w[-1].
    scipy.optimize.minimize treats an 'ineq' constraint as fun(x) >= 0, so a
    non-negative entry means w[-1] is at least the logistic loss of that group.

    Parameters:
    w : ndarray
        Coefficients of the logistic regression with an additional variable
    X : pandas.DataFrame
        DataFrame of input attributes
    y : pandas.Series
        Outcome vector
    s : pandas.Series
        Sensitive attribute vector

    Returns:
        [discriminated-group constraint, privileged-group constraint]
    '''

    auxiliary_variable = w[-1]
    group_values = s.values

    constraints = []
    # Order matters: discriminated group (1) first, privileged group (0) second
    for group in (1, 0):
        mask = group_values == group
        X_g, y_g = X.loc[mask, :], y[mask]

        y_g_hat = sigmoid(w, X_g).values

        # NUMERICAL UNDERFLOW AND OVERFLOW
        # Probabilities of exactly 0 or 1 would give log2(0) = -inf
        y_g_hat[y_g_hat == 0] = 0.0000001
        y_g_hat[y_g_hat == 1] = 0.9999999

        log_likelihood = np.mean(y_g * np.log2(y_g_hat) + (1 - y_g) * np.log2(1 - y_g_hat))
        constraints.append(log_likelihood + auxiliary_variable)

    return constraints

def statistical_parity(w, X, s, parity = 0.8):
    '''
    Statistical parity constraints for scipy.optimize.minimize.

    scipy treats an 'ineq' constraint as fun(x) >= 0, so both returned values
    are non-negative exactly when

        parity * mean_disc <= mean_priv <= mean_disc / parity

    where mean_disc and mean_priv are the mean predictions of the groups
    s == 1 and s == 0 respectively.

    Parameters:
    w : ndarray
        Coefficients of the logistic regression with an additional variable
    X : pandas.DataFrame
        DataFrame of input attributes
    s : pandas.Series
        Sensitive attribute vector
    parity : float
        Smallest admissible ratio between the group mean predictions (default 0.8);
        must be non-zero

    Returns:
        [lower-bound constraint, upper-bound constraint]
    '''

    y_hat = sigmoid(w, X)

    # NUMERICAL UNDERFLOW AND OVERFLOW
    # Same clamping as in the loss functions, kept so predictions are treated alike
    y_hat[y_hat == 0] = 0.0000001
    y_hat[y_hat == 1] = 0.9999999

    mean_disc = np.mean(y_hat.values[s.values == 1])
    mean_priv = np.mean(y_hat.values[s.values == 0])

    # Written as "slack >= 0": the original signs were the negation of these,
    # which cannot both be non-negative for positive means
    lower_bound = mean_priv - parity * mean_disc
    upper_bound = 1/parity * mean_disc - mean_priv

    return [lower_bound, upper_bound]

In [7]:
def logistic_loss(w, X, y):
    '''
    Mean logistic loss (base-2 logarithm) of the predictions sigmoid(w, X).

    Parameters:
    w : ndarray
        Coefficients of the logistic regression with an additional variable
    X : pandas.DataFrame
        DataFrame of input attributes
    y : pandas.Series
        Outcome vector
    '''
    probs = sigmoid(w, X).values
    targets = y.values

    # NUMERICAL UNDERFLOW AND OVERFLOW
    # Move probabilities of exactly 0 or 1 slightly inside (0, 1) so log2 stays finite
    probs = np.where(probs == 0, 0.0000001, probs)
    probs = np.where(probs == 1, 0.9999999, probs)

    log_likelihood = targets * np.log2(probs) + (1 - targets) * np.log2(1 - probs)

    return -np.mean(log_likelihood)

In [8]:
def performance_score(w, X, y, s):
    '''
    Print AUC and logistic loss (overall, discriminated group, privileged group)
    and the disparate impact of the model with coefficients w.

    Parameters:
    w : ndarray
        Coefficients of the logistic regression with an additional variable
    X : pandas.DataFrame
        DataFrame of input attributes
    y : pandas.Series
        Outcome vector
    s : pandas.Series
        Sensitive attribute vector; 1 marks the discriminated group, 0 the privileged group
    '''
    disc_mask = s == 1
    priv_mask = s == 0

    X_d, y_d = X.loc[disc_mask, :], y[disc_mask]
    X_p, y_p = X.loc[priv_mask, :], y[priv_mask]

    # Predictions for everybody and for each group separately
    y_hat = sigmoid(w, X)
    y_d_hat = sigmoid(w, X_d)
    y_p_hat = sigmoid(w, X_p)

    print(f'AUC: {auc(y, y_hat)}, AUC Disc: {auc(y_d, y_d_hat)}, AUC Priv: {auc(y_p, y_p_hat)}')
    print(f'LL: {logistic_loss(w, X, y)}, LL Disc: {logistic_loss(w, X_d, y_d)}, LL Priv: {logistic_loss(w, X_p, y_p)}')
    print(f'Disparate impact {disparate_impact(y_hat, s)}')

# Experiment

In [9]:
# Load the prepared dataset; the 'Income' and 'Sex' columns are read from it below
data = pd.read_csv(filepath_or_buffer='Data/adult_prepared.csv')

In [10]:
# Outcome and sensitive attribute
y = data['Income']
s = data['Sex']

# Every remaining column is a model input
X = data.drop(['Income', 'Sex'], axis=1)

# Standardise each column separately. The axis is given explicitly because
# np.mean(X) / np.std(X) call the DataFrame reductions with axis=None, which
# newer pandas versions reduce to a single scalar instead of per column.
# ddof=0 matches what np.std forwarded (population standard deviation).
X = (X - X.mean(axis=0)) / X.std(axis=0, ddof=0)

In [11]:
# Starting point: one zero per input column plus one for the auxiliary variable
w = np.zeros(X.shape[1] + 1, dtype=int)

# Optional variant that also imposes statistical parity:
# parity = 0.8
#
# cons = ({'type': 'ineq', 'fun': constraint_logistic_loss, 'args': (X, y, s)},
#         {'type': 'ineq', 'fun': statistical_parity, 'args': (X, s, parity)})

# Single inequality constraint: the per-group logistic loss constraints
cons = {
    'type': 'ineq',
    'fun': constraint_logistic_loss,
    'args': (X, y, s),
}

adjustment = optimize.minimize(
    kalai_fair_logistic_regression,
    w,
    args=(X, y, s),
    method='SLSQP',
    constraints=cons,
)

In [12]:
# Report the metrics of the fitted solution on the data it was fitted on
performance_score(w=adjustment.x, X=X, y=y, s=s)

AUC: 0.8585715865158207, AUC Disc: 0.8602453380361497, AUC Priv: 0.8559451342936354
LL: 0.6815791431331266, LL Disc: 0.6815790469226015, LL Priv: 0.6815791906908854
Disparate impact 0.796601043746812
