# Libraries

In [1]:
import numpy as np
np.set_printoptions(suppress=True)

import pandas as pd

import scipy
import scipy.stats as stats

import warnings
warnings.filterwarnings('ignore')

from scipy import optimize

# Methods

In [2]:
def auc(y, y_hat):
    """Area under the ROC curve, obtained from the Mann-Whitney U statistic.

    Parameters
    ----------
    y : pandas.Series
        Binary labels; entries equal to 1 count as positives and entries
        equal to 0 as negatives.
    y_hat : pandas.Series
        Scores, positionally aligned with ``y``.

    Returns
    -------
    float
        ``U / (n_pos * n_neg)`` where ``U`` is the rank sum of the positive
        scores minus ``n_pos * (n_pos + 1) / 2``.
    """
    is_positive = y.values == 1
    n_pos = np.sum(y == 1)
    n_neg = np.sum(y == 0)

    # Rank every score once, then keep only the ranks of the positives.
    score_ranks = stats.rankdata(y_hat.values)
    pos_rank_sum = np.sum(score_ranks[is_positive])

    u_statistic = pos_rank_sum - (n_pos * (n_pos + 1)) / 2
    return u_statistic / (n_pos * n_neg)

In [3]:
def disparate_impact(y_hat, s):
    """Symmetric ratio of mean predictions between the two groups in ``s``.

    Parameters
    ----------
    y_hat : pandas.Series
        Predictions (scores), positionally aligned with ``s``.
    s : pandas.Series
        Group indicator; rows with value 0 form one group and rows with
        value 1 the other.

    Returns
    -------
    float
        The smaller of ``mean(s == 0) / mean(s == 1)`` and its reciprocal
        ratio ``mean(s == 1) / mean(s == 0)``.
    """
    scores = y_hat.values
    group = s.values

    mean_s0 = np.mean(scores[group == 0])
    mean_s1 = np.mean(scores[group == 1])

    # Take both directions of the ratio and report whichever is smaller.
    return np.min([mean_s0 / mean_s1, mean_s1 / mean_s0])

In [4]:
def equality_of_opportunity(y_hat, y, s, out = 1):
    """Ratio of mean predictions between groups, restricted to one true outcome.

    Parameters
    ----------
    y_hat : pandas.Series
        Predictions (scores).
    y : pandas.Series
        True labels, positionally aligned with ``y_hat``.
    s : pandas.Series
        Group indicator with values 0 and 1, positionally aligned with ``y_hat``.
    out : scalar, default 1
        Only rows whose true label equals ``out`` are considered.

    Returns
    -------
    float
        Mean prediction over rows with ``s == 1`` divided by the mean
        prediction over rows with ``s == 0``, both limited to ``y == out``.
    """
    scores = y_hat.values
    has_outcome = y.values == out

    mean_s1 = np.mean(scores[(s.values == 1) & has_outcome])
    mean_s0 = np.mean(scores[(s.values == 0) & has_outcome])

    return mean_s1 / mean_s0

In [5]:
def sigmoid(w, X):
    """Logistic function of the row-wise linear score of ``X`` under ``w``.

    Parameters
    ----------
    w : array-like
        One weight per column of ``X``; broadcast against the rows of ``X``.
    X : pandas.DataFrame or 2-D array
        One observation per row.

    Returns
    -------
    One value in [0, 1] per row of ``X``. Callers in this file pass a
    DataFrame and use the result as a pandas Series (``.values``).
    """
    # Weighted sum across columns gives one linear score per row.
    linear_score = np.sum(w * X, axis=1)
    return 1 / (1 + np.exp(-linear_score))

# Multi-task Fair Logistic Regression

In [6]:
def multi_task_loss(w, X, y, s, reg_lambda = 0):
    """Joint logistic loss of two group-specific models plus a coupling penalty.

    Parameters
    ----------
    w : numpy.ndarray
        Concatenated weights of length ``2 * X.shape[1]``. The first
        ``X.shape[1]`` entries score the ``s == 1`` rows (the "disc" group in
        this file); the remaining entries score the ``s == 0`` rows ("priv").
    X : pandas.DataFrame
        Feature matrix, one row per observation.
    y : pandas.Series
        Binary labels, index-aligned with ``X``.
    s : pandas.Series
        Group indicator with values 0 and 1, index-aligned with ``X``.
    reg_lambda : float, default 0
        Weight of the penalty on the mean squared difference between the two
        weight vectors.

    Returns
    -------
    float
        ``disc_loss + priv_loss + reg_lambda * mean((w_d - w_p) ** 2)``, where
        each loss is the mean binary cross-entropy in base 2 of one group.
    """
    n_features = X.shape[1]

    # Keep probabilities strictly inside (0, 1). A saturated sigmoid returns
    # exactly 0 or 1, log2 then yields -inf, and 0 * -inf turns the whole
    # objective into NaN. Clipping (rather than substituting a value) keeps
    # the loss monotone in the prediction, which the optimizer relies on.
    eps = 1e-15

    in_disc = s == 1
    in_priv = s == 0

    w_d = w[:n_features]
    w_p = w[n_features:]

    y_d = y.loc[in_disc].values
    y_p = y.loc[in_priv].values

    y_d_hat = np.clip(sigmoid(w_d, X.loc[in_disc, :]).values, eps, 1 - eps)
    y_p_hat = np.clip(sigmoid(w_p, X.loc[in_priv, :]).values, eps, 1 - eps)

    disc_loss = -np.mean(y_d * np.log2(y_d_hat) + (1 - y_d) * np.log2(1 - y_d_hat))
    priv_loss = -np.mean(y_p * np.log2(y_p_hat) + (1 - y_p) * np.log2(1 - y_p_hat))

    # Pulls the two weight vectors towards each other.
    reg_term = np.mean(np.power(w_d - w_p, 2))

    return disc_loss + priv_loss + reg_lambda * reg_term

In [7]:
def predict(w, X, s):
    """Score every row of ``X`` with the weight vector of its own group.

    Parameters
    ----------
    w : numpy.ndarray
        Concatenated weights of length ``2 * X.shape[1]``: the first half is
        applied to rows with ``s == 1``, the second half to rows with
        ``s == 0``.
    X : pandas.DataFrame
        Feature matrix.
    s : pandas.Series
        Group indicator with values 0 and 1, index-aligned with ``X``.

    Returns
    -------
    pandas.Series
        Predictions for the rows of both groups, sorted by index.
    """
    n_features = X.shape[1]
    w_d = w[:n_features]
    w_p = w[n_features:]

    y_d_hat = sigmoid(w_d, X.loc[s == 1, :])
    y_p_hat = sigmoid(w_p, X.loc[s == 0, :])

    # Series.append no longer exists in pandas >= 2.0; pd.concat stitches the
    # two groups together on every pandas version.
    return pd.concat([y_d_hat, y_p_hat]).sort_index()

In [12]:
def logistic_loss(w, X, y):
    """Mean binary cross-entropy (base 2) of the logistic model ``w`` on ``(X, y)``.

    Parameters
    ----------
    w : array-like
        One weight per column of ``X``.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary labels, positionally aligned with ``X``.

    Returns
    -------
    float
        ``-mean(y * log2(p) + (1 - y) * log2(1 - p))`` where ``p`` is the
        sigmoid prediction.
    """
    probs = sigmoid(w, X).values
    labels = y.values

    # A prediction of exactly 0 or 1 would send log2 to -inf, so those exact
    # values are swapped for 0.00001 and 0.99999 respectively.
    probs = np.where(probs == 0, 0.00001, probs)
    probs = np.where(probs == 1, 0.99999, probs)

    return -np.mean(labels * np.log2(probs) + (1 - labels) * np.log2(1 - probs))

In [8]:
def performance_score(w, X, y, s):
    """Print AUC, logistic loss and disparate impact for one weight vector.

    The same ``w`` is evaluated on all rows, on the ``s == 1`` rows ("Disc")
    and on the ``s == 0`` rows ("Priv").

    Parameters
    ----------
    w : array-like
        One weight per column of ``X``.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary labels, index-aligned with ``X``.
    s : pandas.Series
        Group indicator with values 0 and 1, index-aligned with ``X``.

    Returns
    -------
    None
        The three report lines are written to stdout.
    """
    in_disc = s == 1
    in_priv = s == 0

    X_disc, y_disc = X.loc[in_disc, :], y[in_disc]
    X_priv, y_priv = X.loc[in_priv, :], y[in_priv]

    scores_all = sigmoid(w, X)
    scores_disc = sigmoid(w, X_disc)
    scores_priv = sigmoid(w, X_priv)

    # Ranking quality: overall and per group.
    auc_all = auc(y, scores_all)
    auc_disc = auc(y_disc, scores_disc)
    auc_priv = auc(y_priv, scores_priv)
    print(f'AUC: {auc_all}, AUC Disc: {auc_disc}, AUC Priv: {auc_priv}')

    # Logistic loss: overall and per group.
    ll_all = logistic_loss(w, X, y)
    ll_disc = logistic_loss(w, X_disc, y_disc)
    ll_priv = logistic_loss(w, X_priv, y_priv)
    print(f'LL: {ll_all}, LL Disc: {ll_disc}, LL Priv: {ll_priv}')

    # Fairness is measured on the predictions for the full data set.
    print(f'Disparate impact {disparate_impact(scores_all, s)}')

# Experiment

In [9]:
# Load the prepared data set from a path relative to the working directory.
# NOTE(review): presumably the preprocessed UCI Adult data, judging by the
# file name - confirm the provenance and document it here.
data = pd.read_csv('Data/adult_prepared.csv')

In [10]:
# Target and group-indicator columns.
y = data['Income']
s = data['Sex']

# Every remaining column is used as a feature.
X = data.drop(['Income', 'Sex'], axis=1)

# Column-wise z-score with the population standard deviation (ddof=0).
# The axis is spelled out because np.mean(DataFrame) forwards axis=None,
# which pandas >= 2.0 interprets as "reduce over both axes" and would
# collapse the per-column means into a single scalar.
# NOTE(review): a constant column has zero std and would become NaN here.
X = (X - X.mean(axis=0)) / X.std(axis=0, ddof=0)

In [13]:
# Coupling strengths to try; 0 trains the two group models independently.
regs = [0.01, 0.1, 1, 2, 4, 0]
n_features = X.shape[1]

for reg in regs:
    # Start from all-zero weights for both groups, as an explicit float vector.
    w = np.zeros(n_features * 2)

    adjustment = optimize.minimize(fun=multi_task_loss, x0=w, args=(X, y, s, reg), method='SLSQP')

    print('-----' + str(reg) + '-----')
    # performance_score prints its own report and returns None, so it is
    # called directly; wrapping it in print() only added a stray "None" line.
    # First half of the solution: weights fitted on the s == 1 rows.
    performance_score(adjustment.x[:n_features], X, y, s)
    # Second half of the solution: weights fitted on the s == 0 rows.
    performance_score(adjustment.x[n_features:], X, y, s)
    print('-----')

-----0.01-----
AUC: 0.8094176366943487, AUC Disc: 0.9019983963169761, AUC Priv: 0.7426066404865215
LL: 1.354126615665054, LL Disc: 0.4032057905900064, LL Priv: 1.824175638468328
Disparate impact 0.28835851649950617
None
AUC: 0.8373595054045432, AUC Disc: 0.8220250512690459, AUC Priv: 0.8529183510159914
LL: 0.8500892663204234, LL Disc: 1.1145893260479627, LL Priv: 0.7193444226616199
Disparate impact 0.9475348241331227
None
-----
-----0.1-----
AUC: 0.8151017966716319, AUC Disc: 0.9046668095621103, AUC Priv: 0.7505270148649269
LL: 1.2573158868092407, LL Disc: 0.3936154236742918, LL Priv: 1.6842510721431798
Disparate impact 0.29834979476879475
None
AUC: 0.8391150068369359, AUC Disc: 0.82349583091932, AUC Priv: 0.8544904853545552
LL: 0.8294952750615985, LL Disc: 1.057555775162022, LL Priv: 0.7167628452047073
Disparate impact 0.942319224114959
None
-----
-----1-----
AUC: 0.8430826532511663, AUC Disc: 0.91210762998003, AUC Priv: 0.7915709194479499
LL: 1.0013541799886754, LL Disc: 0.3951277773