# Libraries

In [1]:
import numpy as np
np.set_printoptions(suppress=True)

import pandas as pd

import scipy
import scipy.stats as stats

import warnings
warnings.filterwarnings('ignore')

from scipy import optimize

# Methods

In [2]:
def auc(y, y_hat):
    '''
    Area under the ROC curve, computed from the Mann-Whitney U statistic.

    Parameters:
    y : pandas.Series
        True binary labels (1 = positive, 0 = negative)
    y_hat : pandas.Series
        Predicted scores, positionally aligned with y

    Returns:
    score : float
        U / (n_pos * n_neg), where U is the rank-sum statistic of the positives
    '''
    labels = y.values
    n_pos = np.sum(labels == 1)
    n_neg = np.sum(labels == 0)

    # Ranks over all scores (ties receive the average rank), then keep the
    # ranks that belong to the positive examples.
    ranks = stats.rankdata(y_hat.values)
    rank_sum_pos = ranks[labels == 1].sum()

    # Subtract the smallest rank sum that n_pos items could possibly have.
    u_stat = rank_sum_pos - (n_pos * (n_pos + 1)) / 2

    return u_stat / (n_pos * n_neg)

In [3]:
def disparate_impact(y_hat, s):
    '''
    Symmetric disparate impact of the predictions.

    Parameters:
    y_hat : pandas.Series
        Predicted values of the outcome
    s : pandas.Series
        Sensitive attribute (1 and 0 mark the two groups compared)

    Returns:
    score : float
        The smaller of the two ratios between the groups' mean predictions
    '''
    scores = y_hat.values
    group = s.values

    mean_s0 = np.mean(scores[group == 0])
    mean_s1 = np.mean(scores[group == 1])

    # Take both directions of the ratio and keep the smaller one.
    return np.min([mean_s0 / mean_s1, mean_s1 / mean_s0])

In [4]:
def equality_of_opportunity(y_hat, y, s, out = 1):
    '''
    Ratio of mean predictions between the two groups, restricted to one
    value of the true outcome.

    Parameters:
    y_hat : pandas.Series
        Predicted values of the outcome
    y : pandas.Series
        True values of the outcome
    s : pandas.Series
        Sensitive attribute (the s == 1 group is the numerator)
    out : numeric
        Outcome value on which both groups are conditioned

    Returns:
    score : float
        mean(y_hat | s == 1, y == out) / mean(y_hat | s == 0, y == out)
    '''
    scores = y_hat.values
    has_outcome = y.values == out

    numerator = np.mean(scores[(s.values == 1) & has_outcome])
    denominator = np.mean(scores[(s.values == 0) & has_outcome])

    return numerator / denominator

In [5]:
def sigmoid(w, X):
    '''
    Logistic function applied to the row-wise linear score of X.

    Parameters:
    w : numpy.array
        Coefficient vector, one entry per column of X
    X : pandas.DataFrame
        Matrix of input attributes

    Returns:
    score : one value per row of X
        1 / (1 + exp(-sum_j w[j] * X[:, j]))
    '''
    linear_score = np.sum(w * X, axis=1)

    return 1 / (1 + np.exp(-linear_score))

In [6]:
def logistic_loss(w, X, y):
    '''
    Mean logistic loss (base-2 logarithm) of the model with coefficients w.

    Parameters:
    w : numpy.array
        Coefficient vector, one entry per column of X
    X : pandas.DataFrame
        Matrix of input attributes
    y : pandas.Series
        True binary labels

    Returns:
    score : float
        Negative mean of y * log2(p) + (1 - y) * log2(1 - p)
    '''
    probs = sigmoid(w, X).values

    # Probabilities that saturated to exactly 0 or 1 would give log2(0);
    # move them slightly inside the open interval first.
    probs = np.where(probs == 0, 0.0000001, probs)
    probs = np.where(probs == 1, 0.9999999, probs)

    targets = y.values
    positive_part = targets * np.log2(probs)
    negative_part = (1 - targets) * np.log2(1 - probs)

    return -np.mean(positive_part + negative_part)

In [7]:
def performance_score(w, X, y, s):
    '''
    Print AUC, logistic loss and disparate impact for coefficients w.

    AUC and logistic loss are reported for the full data, for the rows with
    s == 1 ("Disc") and for the rows with s == 0 ("Priv"). Disparate impact
    is reported for the full data. Nothing is returned.

    Parameters:
    w : numpy.array
        Coefficient vector, one entry per column of X
    X : pandas.DataFrame
        Matrix of input attributes
    y : pandas.Series
        True binary labels
    s : pandas.Series
        Sensitive attribute
    '''
    in_disc = s.values == 1
    in_priv = s.values == 0

    X_d, X_p = X.loc[in_disc, :], X.loc[in_priv, :]
    y_d, y_p = y[in_disc], y[in_priv]

    # Predictions on the full data and on each group's rows.
    y_hat, y_d_hat, y_p_hat = (sigmoid(w, frame) for frame in (X, X_d, X_p))

    print(f'AUC: {auc(y, y_hat)}, AUC Disc: {auc(y_d, y_d_hat)}, AUC Priv: {auc(y_p, y_p_hat)}')
    print(f'LL: {logistic_loss(w, X, y)}, LL Disc: {logistic_loss(w, X_d, y_d)}, LL Priv: {logistic_loss(w, X_p, y_p)}')
    print(f'Disparate impact {disparate_impact(y_hat, s)}')

# Kalai Smorodinsky Solution

In [8]:
class KS_fair_solution:
    '''
    Attribute-wise blend of two logistic regression coefficient vectors.

    One vector (w_d) belongs to the discriminated group and one (w_p) to the
    privileged group. For every attribute, each vector is scored by the ratio
    of Disparate Impact to Logistic Loss obtained when only that attribute's
    coefficient is kept. A per-attribute mixing weight in [0, 1] is then
    optimized, and the final coefficients are
    w = w_d * mix + w_p * (1 - mix).

    Attributes set by fit:
    ks_dataset : pandas.DataFrame
        Per-attribute coefficients, scaled metrics and DI/LL ratios
    optimization : scipy.optimize.OptimizeResult
        Result of the mixing-weight optimization
    w : numpy.array
        Final blended coefficient vector
    '''

    # Replacement values for probabilities that are exactly 0 or 1, so that
    # the logarithms in the loss stay finite (same values as the module-level
    # logistic_loss uses).
    _P_FLOOR = 0.0000001
    _P_CEIL = 0.9999999

    def __init__(self, fair=1, normalize=True):
        '''
        Construction of KS_fair_solution

        Parameters:
        fair : numeric
            Importance of the fairness in the final solution (multiplier
            applied to the scaled Disparate Impact columns)
        normalize : bool
            Normalization of the Disparate Impact Logistic Loss ratio
            (subtract each group's minimum ratio before optimizing)
        '''

        self.fair = fair
        self.normalize = normalize

    # ----- HELP FUNCTIONS -----
    def _logistic_loss(self, y, y_hat):
        '''
        Calculate Logistic Loss (base-2 logarithm)

        Parameters:
        y : pandas.Series or numpy.array
            Vector of true values of the outcome
        y_hat : pandas.Series or numpy.array
            Vector of predicted values of the outcome

        Returns:
        score : float
            Logistic Loss
        '''

        y_true = np.asarray(y, dtype=float)

        # np.array copies, so the caller's predictions are never modified.
        probs = np.array(y_hat, dtype=float)
        probs[probs == 0] = self._P_FLOOR
        probs[probs == 1] = self._P_CEIL

        return -np.mean(y_true * np.log2(probs) + (1 - y_true) * np.log2(1 - probs))

    def _disparate_impact(self, y_hat, s):
        '''
        Calculate Disparate Impact

        Parameters:
        y_hat : pandas.Series or numpy.array
            Vector of predicted values of the outcome
        s : pandas.Series or numpy.array
            Sensitive attribute vector (1 and 0 mark the two groups)

        Returns:
        score : float
            The smaller of the two ratios between the groups' mean predictions
        '''

        scores = np.asarray(y_hat)
        group = np.asarray(s)

        mean_s0 = np.mean(scores[group == 0])
        mean_s1 = np.mean(scores[group == 1])

        return np.min([mean_s0 / mean_s1, mean_s1 / mean_s0])

    def _sigmoid(self, X, w=None):
        '''
        Logistic function of the row-wise linear score of X

        Parameters:
        X : pandas.DataFrame
            Matrix of input attributes
        w : numpy.array, optional
            Coefficients to use; the fitted self.w when omitted

        Returns:
        score : one value per row of X
            Probabilities of the outcome
        '''

        weights = self.w if w is None else w

        return 1 / (1 + np.exp(-np.sum(weights * X, axis=1)))

    # ----- OPERATIONS -----
    def _prepare_ks(self):
        '''
        Calculate DataFrame where each attribute is represented as a row,
        with columns representing Disparate Impact and Logistic Loss.
        As a result this method will create a new DataFrame in an instance
        of this class (self.ks_dataset) with rows representing attributes and
        columns representing Disparate Impact, Logistic Loss and their ratio
        for Discriminated and Privileged groups
        '''

        # Work only on the data stored by fit, never on module-level names.
        X, y, s = self.X, self.y, self.s
        n_attributes = X.shape[1]

        values = []

        for el in range(n_attributes):
            # Keep a single coefficient and zero all others, so the metrics
            # below describe this attribute alone.
            w_d_single = np.zeros(n_attributes)
            w_p_single = np.zeros(n_attributes)

            w_d_single[el] = self.w_d[el]
            w_p_single[el] = self.w_p[el]

            y_d_hat = self._sigmoid(X, w_d_single)
            y_p_hat = self._sigmoid(X, w_p_single)

            di_d = self._disparate_impact(y_d_hat, s)
            di_p = self._disparate_impact(y_p_hat, s)

            ll_d = self._logistic_loss(y, y_d_hat)
            ll_p = self._logistic_loss(y, y_p_hat)

            values.append([X.columns[el], self.w_d[el], di_d, ll_d, self.w_p[el], di_p, ll_p])

        ks_bargaining = pd.DataFrame(values, columns=['Attribute', 'Coef_D', 'DI_D', 'LL_D', 'Coef_P', 'DI_P', 'LL_P'])

        # Scale every metric column by its own maximum over the attributes.
        metric_cols = ['DI_D', 'LL_D', 'DI_P', 'LL_P']
        ks_bargaining[metric_cols] = ks_bargaining[metric_cols] / ks_bargaining[metric_cols].max(axis=0)

        # Weight the fairness metric by the user-supplied importance.
        ks_bargaining[['DI_D', 'DI_P']] = self.fair * ks_bargaining[['DI_D', 'DI_P']]

        ks_bargaining['DI_LL_D'] = ks_bargaining['DI_D'] / ks_bargaining['LL_D']
        ks_bargaining['DI_LL_P'] = ks_bargaining['DI_P'] / ks_bargaining['LL_P']

        self.ks_dataset = ks_bargaining.set_index('Attribute')

    def _calculate(self):
        '''
        Optimize the per-attribute mixing weights and store the optimizer
        result in self.optimization.
        '''

        # Two rows (DI_LL_D, DI_LL_P), one column per attribute.
        ratios = self.ks_dataset[['DI_LL_D', 'DI_LL_P']].T

        # IF NORMALIZATION IS NEEDED: shift each row so its minimum is zero
        if self.normalize:
            ratios = ratios.sub(ratios.min(axis=1), axis=0)

        # MULTIPLY VALUE WITH MAX FROM OTHER ROW
        other_row_max = ratios.max(axis=1).values[::-1]
        scaled = ratios.values * other_row_max[:, np.newaxis]

        # OPTIMIZATION VECTOR PREPARATION (MAX_2 * ROW_1 - MAX_1 * ROW_2)
        objective_coef = scaled[0] - scaled[1]

        # Maximize sum(objective_coef * w_ks) by minimizing its negative.
        def goal_fun_ks(w_ks, coef):
            return -np.sum(coef * w_ks)

        x0 = np.zeros(objective_coef.shape[0])
        bounds = [(0, 1)] * x0.shape[0]

        # Kept from the original implementation; note that this reseeds
        # NumPy's global random state as a side effect of fitting.
        np.random.seed(seed=2021)

        # Only the box bounds are enforced. The original code also defined a
        # sum(w_ks) == 1 equality constraint, but the call that used it was
        # commented out, so it is not applied here either.
        self.optimization = optimize.minimize(fun=goal_fun_ks, x0=x0, args=(objective_coef,),
                                              method='SLSQP', bounds=bounds)

    def _final_weights(self):
        '''
        Blend w_d and w_p with the optimized mixing weights into self.w.
        '''

        mix = self.optimization.x
        self.w = self.w_d * mix + self.w_p * (1 - mix)

    # ----- FIT AND PREDICT -----
    def fit(self, w_d, w_p, X, y, s):
        '''
        Calculate Kalai-Smorodinsky Fair Logistic Regression solution

        Parameters:
        w_d : numpy.array
            Vector of logistic regression coefficients for discriminated group
        w_p : numpy.array
            Vector of logistic regression coefficients for privileged group
        X : pandas.DataFrame
            Matrix of input attributes
        y : pandas.Series
            Output attribute vector (label)
        s : pandas.Series
            Sensitive attribute vector

        Returns:
        self : KS_fair_solution
            The fitted object

        Raises:
        ValueError
            If w_d or w_p does not have exactly one entry per column of X
        '''

        w_d = np.asarray(w_d, dtype=float)
        w_p = np.asarray(w_p, dtype=float)

        expected_shape = (X.shape[1],)
        if w_d.shape != expected_shape or w_p.shape != expected_shape:
            raise ValueError('w_d and w_p must each have one coefficient per column of X')

        # SAVE TO CLASS
        self.w_d = w_d
        self.w_p = w_p
        self.X = X
        self.y = y
        self.s = s

        # PERFORM IN ORDER
        self._prepare_ks()
        self._calculate()
        self._final_weights()

        return self

    def predict(self, X):
        '''
        Perform predictions

        Parameters:
        X : pandas.DataFrame
            DataFrame for which predictions should be created

        Returns:
        score : one value per row of X
            Probabilities of the outcome

        Raises:
        RuntimeError
            If fit has not been called yet
        '''

        if not hasattr(self, 'w'):
            raise RuntimeError('fit must be called before predict')

        return self._sigmoid(X)

# Experiment

In [None]:
# Load the prepared dataset from a path relative to the working directory.
# NOTE(review): presumably the pre-processed Adult census data, judging by the
# file name - confirm against the preparation step.
data = pd.read_csv('Data/adult_prepared.csv')

In [None]:
# Outcome label and sensitive attribute.
y = data['Income']
s = data['Sex']

# Features: everything except the unnamed CSV index column, the label and the
# sensitive attribute.
X = data.drop(['Unnamed: 0', 'Income', 'Sex'], axis=1)

# Standardise each column to zero mean and unit variance. The axis is given
# explicitly because np.mean / np.std on a DataFrame forward axis=None, which
# newer pandas versions reduce over BOTH axes (a single scalar) instead of
# per column. ddof=0 keeps the population standard deviation np.std used.
X = (X - X.mean(axis=0)) / X.std(axis=0, ddof=0)

In [None]:
# Split features and labels by the sensitive attribute:
# "_d" holds the rows with s == 1, "_p" the rows with s == 0.
X_d, X_p = X[s == 1], X[s == 0]
y_d, y_p = y[s == 1], y[s == 0]

# Common starting point for both fits: all coefficients at zero.
w = np.zeros(X.shape[1])

# Fit one coefficient vector per group by minimizing the logistic loss.
# The global NumPy seed is reset before each fit.
np.random.seed(seed=2021)
w_d = optimize.minimize(logistic_loss, w, args=(X_d, y_d), method='SLSQP').x

np.random.seed(seed=2021)
w_p = optimize.minimize(logistic_loss, w, args=(X_p, y_p), method='SLSQP').x

In [None]:
# Show the coefficients fitted on the s == 1 rows.
w_d

In [None]:
# Show the coefficients fitted on the s == 0 rows.
w_p

In [None]:
# Print AUC, logistic loss and disparate impact obtained on the full data
# with the coefficients fitted on the s == 1 rows.
performance_score(w_d, X, y, s)

In [None]:
# Print AUC, logistic loss and disparate impact obtained on the full data
# with the coefficients fitted on the s == 0 rows.
performance_score(w_p, X, y, s)

In [None]:
# Build the blending model; fair=0.15 multiplies the scaled Disparate Impact
# columns by 0.15, and normalize keeps its default of True.
new_model = KS_fair_solution(fair=0.15)

# Blend the two group-specific coefficient vectors on the full data.
# fit returns the model itself, which is the last expression of this cell.
new_model.fit(w_d, w_p, X, y, s)

In [None]:
# Show the scipy optimizer result; its .x holds the per-attribute mixing weights.
new_model.optimization

In [None]:
# Show the final blended coefficients: w_d * mix + w_p * (1 - mix).
new_model.w

In [None]:
# Print the same metrics for the blended coefficients, for comparison with
# the two group-specific fits.
performance_score(new_model.w, X, y, s)