In [37]:
import sys
import numpy as np
import pandas as pd
import scipy
import copy
import random
import math
from scipy import stats
from scipy.stats import rankdata
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics, preprocessing
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display
np.random.seed(1)

In [38]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital', 'occupation', 'relationship', 'race', 'gender', 'capgain', 'caploss', 'hours', 'country', 'income']
df_train = pd.read_csv('adult.data', names=cols, sep=",")
df_test = pd.read_csv('adult.test', names=cols, sep=",")

**One-hot encoding**

In [39]:
 def one_hot_encode(df):
    df.isin(['?']).sum(axis=0)

    # replace missing values (?) to nan and then drop the columns
    df['country'] = df['country'].replace('?',np.nan)
    df['workclass'] = df['workclass'].replace('?',np.nan)
    df['occupation'] = df['occupation'].replace('?',np.nan)

    # dropping the NaN rows now
    df.dropna(how='any',inplace=True)
    df['income'] = df['income'].map({'<=50K': 0, '>50K': 1}).astype(int)
    df = pd.concat([df, pd.get_dummies(df['gender'], prefix='gender')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['race'], prefix='race')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['marital'], prefix='marital')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['workclass'], prefix='workclass')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['relationship'], prefix='relationship')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['occupation'], prefix='occupation')],axis=1)

    df = df.drop(columns=['workclass', 'gender', 'fnlwgt', 'education', 'occupation', \
                      'relationship', 'marital', 'race', 'country', 'capgain', \
                      'caploss'])
    return df

# one-hot encoding (for regression mdoels)
df_train = one_hot_encode(df_train)
df_test = one_hot_encode(df_test)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

**Protected, privileged**

In [40]:
# protected: 'gender_Female'=1
# privileged: 'gender_Male'=1

**Parametric Model**

In [41]:
X_train = df_train.drop(columns='income')
y_train = df_train['income']

X_test = df_test.drop(columns='income')
y_test = df_test['income']

X_train_orig = copy.deepcopy(X_train)
X_test_orig = copy.deepcopy(X_test)

# Scale data: regularization penalty default: ‘l2’, ‘lbfgs’ solvers support only l2 penalties. 
# Regularization makes the predictor dependent on the scale of the features.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

clf = LogisticRegression(random_state=0, max_iter=300)

**Compute statistical parity difference**

In [42]:
def computeFairness(y_pred, X_test): 
    protected_idx = X_test[X_test['gender_Female']==1].index
    numProtected = len(protected_idx)
    privileged_idx = X_test[X_test['gender_Male']==1].index
    numPrivileged = len(privileged_idx)
    
    p_protected = 0
    for i in range(len(protected_idx)):
        p_protected += y_pred[protected_idx[i]][1]
    p_protected /= len(protected_idx)
    
    p_privileged = 0
    for i in range(len(privileged_idx)):
        p_privileged += y_pred[privileged_idx[i]][1]
    p_privileged /= len(privileged_idx)
    
    spd = p_protected - p_privileged
    return spd

**Influence of points computed using ground truth**

In [43]:
def ground_truth_influence(X_train, y_train, X_test, X_test_orig):
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)
    spd_0 = computeFairness(y_pred, X_test_orig)

    delta_spd = []
    for i in range(len(X_train)):
        X_removed = np.delete(X_train, i, 0)
        y_removed = y_train.drop(index=i, inplace=False)
        clf.fit(X_removed, y_removed)
        y_pred = clf.predict_proba(X_test)
        delta_spd_i = computeFairness(y_pred, X_test_orig) - spd_0
        delta_spd.append(delta_spd_i)
    
    return delta_spd

**Loss function** (Log loss for logistic regression)

In [44]:
def logistic_loss(y_true, y_pred):
    loss = 0
    for i in range(len(y_true)):
        if (y_pred[i][1] != 0 and y_pred[i][0] != 0):
            loss += - y_true[i] * math.log(y_pred[i][1]) - (1 - y_true[i]) * math.log(y_pred[i][0])
    loss /= len(y_true)
    return loss

**Compute Accuracy** 

In [45]:
def computeAccuracy(y_true, y_pred):
    accuracy = 0
    for i in range(len(y_true)):
        idx = y_true[i]
        accuracy += y_pred[i][idx]
    accuracy /= len(y_true)
    return accuracy

**First-order derivative of loss function at z with respect to model parameters**

(Pre-computed for all training points)

In [46]:
def del_L_del_theta_i(num_params, y_true, x, y_pred):
#     del_L_del_theta = np.ones((num_params, 1)) * ((1 - y_true) * y_pred[1] - y_true * y_pred[0])
    del_L_del_theta = np.ones((num_params, 1)) * (- y_true + y_pred[1])
    for j in range(1, num_params):
            del_L_del_theta[j] *=  x[j-1]
    return del_L_del_theta

**Hessian: Second-order partial derivative of loss function with respect to model parameters**

(Pre-computed for all training points)

In [47]:
def hessian_one_point(num_params, x, y_pred):
    H = np.ones((num_params, num_params)) * (y_pred[0] * y_pred[1])
    for i in range(1, num_params):
        for j in range(i+1):
            if j == 0:
                H[i][j] *= x[i-1]
            else:
                H[i][j] *= x[i-1] * x[j-1] 
    i_lower = np.tril_indices(num_params, -1)
    H.T[i_lower] = H[i_lower]     
    return H

**First-order derivative of $P(y \mid \textbf{x})$ with respect to model parameters**

In [48]:
def del_f_del_theta_i(num_params, x, y_pred):
    del_f_del_theta = np.ones((num_params, 1)) * (y_pred[0] * y_pred[1])
    for j in range(1, num_params):
            del_f_del_theta[j] *=  x[j-1]
    return del_f_del_theta

**Computing $v=\nabla($Statistical parity difference$)$**

In [49]:
# Return v = del(SPD)/del(theta)
def del_spd_del_theta(num_params, X_test_orig, X_test, y_pred):
    del_f_protected = np.zeros((num_params, 1))
    del_f_privileged = np.zeros((num_params, 1))
    numProtected = X_test_orig['gender_Female'].sum()
    numPrivileged = X_test_orig['gender_Male'].sum()
    for i in range(len(X_test)):
        del_f_i = del_f_del_theta_i(num_params, X_test[i], y_pred[i])
        if X_test_orig.iloc[i]['gender_Male'] == 1: #privileged
            del_f_privileged = np.add(del_f_privileged, del_f_i)
        elif X_test_orig.iloc[i]['gender_Female'] == 1:
            del_f_protected = np.add(del_f_protected, del_f_i)
    del_f_privileged /= numPrivileged
    del_f_protected /= numProtected
    v = np.subtract(del_f_protected, del_f_privileged)
    return v

**Stochastic estimation of Hessian vector product (involving del fairness): $H_{\theta}^{-1}v = H_{\theta}^{-1}\nabla_{\theta}f(z, \theta) = v + [I - \nabla_{\theta}^2L(z_{s_j}, \theta^*)]H_{\theta}^{-1}v$**

In [50]:
# Uniformly sample t points from training data 
def hessian_vector_product(num_params, n, size, v, hessian_all_points):
    sample = random.sample(range(n), size)
    hinv_v = copy.deepcopy(v)
    for idx in range(size):
        i = sample[idx]
        hessian_i = hessian_all_points[i]
        hinv_v = np.matmul(np.subtract(np.identity(num_params), hessian_i), hinv_v)
        hinv_v = np.add(hinv_v, v)
    return hinv_v

**First-order influence computation**

In [51]:
def first_order_influence(del_L_del_theta, hinv_v, n):
    infs = []
    for i in range(n):
        inf = -np.dot(del_L_del_theta[i].transpose(), hinv_v)
        inf *= -1/n
        infs.append(inf[0][0].tolist())
    return infs

**Second-order Influence function computation**

(For any group of points U)

In [52]:
def second_order_influence(X_train, v1, U, size, del_L_del_theta, hessian_all_points):
    u = len(U) 
    s = len(X_train)
    p = u/s
    c1 = (1 - 2*p)/(s * (1-p)**2)
    c2 = 1/((s * (1-p))**2)
    num_params = len(v1)
    
    del_L_del_theta_hinv = np.zeros((num_params, 1))
    del_L_del_theta_sum = np.zeros((num_params, 1))
    hessian_all = np.zeros((num_params, num_params))
    for i in range(u):
        idx = U[i]
#         if (i%1000==0):
#             print(i)
        del_L_del_theta_hinv = np.add(del_L_del_theta_hinv, hessian_vector_product(num_params, s, size, del_L_del_theta[idx], hessian_all_points))
        hessian_all = np.add(hessian_all, hessian_all_points[idx])
        del_L_del_theta_sum = np.add(del_L_del_theta_sum, del_L_del_theta[idx])
    
    term1 = c1 * del_L_del_theta_sum
    term2 = c2 * np.dot(hessian_all, del_L_del_theta_hinv)

    I = np.dot(v1.transpose(), (term1 + term2))
    return I

**Metrics: Initial state**

In [53]:
threshold = 0.0001
clf.fit(X_train, y_train)
num_params = len(clf.coef_.transpose()) + 1 #weights and intercept; params: clf.coef_, clf.intercept_
y_pred_test = clf.predict_proba(X_test)
y_pred_train = clf.predict_proba(X_train)
    
spd_0 = computeFairness(y_pred_test, X_test_orig)
print("Initial fairness: ", spd_0)

loss_0 = logistic_loss(y_test, y_pred_test)
print("Initial loss: ", loss_0)

accuracy_0 = computeAccuracy(y_test, y_pred_test)
print("Initial accuracy: ", accuracy_0)

Initial fairness:  -0.20059371090978573
Initial loss:  0.360972684923813
Initial accuracy:  0.7683939044369612


**Pre-compute: (1) Hessian (2) del_L_del_theta for each training data point**

In [24]:
del_L_del_theta = []
for i in range(int(len(X_train))):
    del_L_del_theta.insert(i, del_L_del_theta_i(num_params, y_train[i], X_train[i], y_pred_train[i]))

hessian_all_points = []
for i in range(len(X_train)):
    hessian_all_points.insert(i, hessian_one_point(num_params, X_train[i], y_pred_train[i])
                              /len(X_train))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Initial fairness:  -0.202729849784728
Initial loss:  0.3601514459724083
Initial accuracy:  0.7689688172487666


KeyError: 0

**Fairness/Accuracy change removing predicates according to the regression tree**

In [21]:
df = pd.DataFrame(data=X_train, columns=X_train_orig.columns)
# idx = df[(df['hours']>-.453) & (df['age']>-.224) &(df['gender_Male']>-.375) &(df['age']<=1.299) &(df['hours']<=1.3)].index.tolist() 
# idx = df[(df['hours']<=35.5) & (df['hours']<=3.72) & (df['age']>0.224) & (df['relationship_Husband']>0.176) & (df['hours']<=1.3)].index.tolist() 
# idx = df_train[(df_train['hours']>35.5) 
#                & (df_train['age']>35.5) 
#                & (df_train['gender_Male']>0.5) 
#                & (df_train['hours']>72.5) 
#                & (df_train['age']>24.5) 
#                & (df_train['hours']>62.5)
#               & (df_train['marital_Never-married']<=.5)
#               ].index.tolist() 
idx = df_train[(df_train['age']<=59.5) 
#                & (df_train['hours']>85.5) 
#                & (df_train['relationship_Wife']>0.5) 
#                & (df_train['relationship_Husband']<=0.5) 
#                & (df_train['hours']<=56.5) 
#                & (df_train['hours']>34.5)
#                & (df_train['age']<=55.5)
              ].index.tolist() 
X_removed = np.delete(X_train, idx, 0)
y_removed = y_train.drop(index=idx, inplace=False)
clf.fit(X_removed, y_removed)
y_pred = clf.predict_proba(X_test)
spd = computeFairness(y_pred, X_test_orig)
loss = logistic_loss(y_test, y_pred)
accuracy = computeAccuracy(y_test, y_pred)
print("Fraction of records: ", 100*len(idx)/len(X_train))
print("% Change in fairness: ", (spd/spd_0 - 1)*100)
print("% Change in loss: ", (loss/loss_0 - 1)*100)
print("% Change in accuracy: ", (accuracy/accuracy_0 - 1)*100)

Fraction of records:  93.09727471653073
% Change in fairness:  41.30145207075857
% Change in loss:  32.13272299623042
% Change in accuracy:  -10.382397688419731


**Compute: (1) First-order influence, (2) Ground truth influence of each training data point**

In [18]:
# Ground truth influence
# spdgt = ground_truth_influence(X_train, y_train, X_test, X_test_orig)
# with open('delta_spd_ground_truth_v0.txt', 'w') as filehandle:
#     for listitem in delta_spd:
#         filehandle.write('%s\n' % listitem)
gt_spd = pd.read_csv('delta_spd_ground_truth_v0.txt', names=["Values"], sep=",")
gt_spd = gt_spd.values.tolist()
spdgt=[]
for i in range(len(gt_spd)):
    spdgt.append(gt_spd[i][0])
sort_index = np.argsort(spdgt)[::-1][:len(spdgt)]

In [19]:
size_hvp = int(len(X_train) * .12)
# Hessian vector product H^{-1}v, v = del_fairness
v = del_spd_del_theta(num_params, X_test_orig, X_test, y_pred_test)
# v = del_L_del_theta[3]
hinv_v = hessian_vector_product(num_params, len(X_train), size_hvp, v, hessian_all_points)

infs_1 = first_order_influence(del_L_del_theta, hinv_v, len(X_train))
print("Spearman rank correlation between 1st order inf and ground truth inf: ", 
      stats.spearmanr(spdgt, infs_1)[0])
print("Pearson correlation coefficient between 1st order inf and ground truth inf: ", 
      stats.pearsonr(spdgt, infs_1)[0])

Spearman rank correlation between 1st order inf and ground truth inf:  0.9244866003665443
Pearson correlation coefficient between 1st order inf and ground truth inf:  0.8918920705281899


**Checking ground truth, first-order and second-order influences for a set**

In [56]:
# ground truth predicates
predicates = ['marital_Married-civ-spouse']

# 1st-order influence predicates
# predicates = ['occupation_Armed-Forces', 'relationship_Not-in-family', 'education.num', 'marital_Married-spouse-absent']
# predicates = ['relationship_Not-in-family', 'occupation_Tech-support']
# predicates = ['workclass_Private', 'occupation_Handlers-cleaners', 'race_Black', 'occupation_Other-service', 'marital_Divorced', 'race_Other', 'occupation_Protective-serv', 'relationship_Wife', 'occupation_Sales', 'relationship_Husband', 'hours', 'marital_Never-married', 'relationship_Own-child', 'marital_Married-spouse-absent', 'occupation_Exec-managerial', 'marital_Married-civ-spouse', 'occupation_Tech-support', 'race_Asian-Pac-Islander', 'occupation_Farming-fishing', 'marital_Married-AF-spouse', 'relationship_Other-relative', 'workclass_Self-emp-not-inc', 'marital_Separated', 'race_White', 'workclass_Federal-gov', 'marital_Widowed', 'occupation_Prof-specialty', 'relationship_Unmarried', 'education.num', 'workclass_State-gov', 'occupation_Transport-moving', 'gender_Male', 'workclass_Local-gov', 'occupation_Craft-repair', 'relationship_Not-in-family', 'occupation_Priv-house-serv']

idx = X_train_orig[(X_train_orig[predicates[0]] == 1)
#                    & (X_train_orig[predicates[1]] == 1) 
#                    & (X_train_orig[predicates[2]] == 1)
#                    & (X_train_orig[predicates[3]] == 1)
                  ].index 

print(len(idx))

X = np.delete(X_train, idx, 0)
# X = X_train.drop(index=idx, inplace=False)
y = y_train.drop(index=idx, inplace=False)
clf.fit(X, y)
y_pred_test = clf.predict_proba(X_test)
print("Ground truth influence: ", computeFairness(y_pred_test, X_test_orig) - spd_0)

# del_f = 0
# for i in range(len(idx)):
#     del_f += infs_1[idx[i]]
# print("First-order influence: ", del_f)

# print("Second-order influence: ", second_order_influence(X_train, hinv_v, idx, size_hvp, del_L_del_theta, hessian_all_points))

14065
Ground truth influence:  0.17445129592915587


**Space Partitioner for reducing bias**

In [36]:
def getInfluenceOfSet(indices, spd_0, X_df, y_df, X_test_df, X_test_orig_df):
    X = X_df.drop(index=indices, inplace=False)
    y = y_df.drop(index=indices, inplace=False)
    if len(y.unique()) < 2:
        return 0
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test_df)
    inf = computeFairness(y_pred, X_test_orig_df) - spd_0
    return inf

def getSplitGain(numLeft, numRight, infLeft, infRight):
    gain = 0
    gain = infLeft + infRight
    return gain

def getAttribute(cols, X_df, y_df, X_test_df, X_train_orig_df, X_test_orig_df):
    splitCol, splitVal, score, left, right, leftInf, rightInf = None, np.Inf, 0, None, None, 0, 0
    for col in cols:
#         print("Column: ", col)
        vals = X_df[col].unique()
        vals.sort()
        mid = []
        for i in range(len(vals) - 1):
            mid.append(np.mean(vals[i:i+2]))
        for val in mid:
#             print(val)
            idxLeft = X_df[X_df[col] <= val].index
            idxRight = X_df[X_df[col] > val].index
            infLeft = getInfluenceOfSet(idxLeft, spd_0, X_df, y_df, X_test_df, X_test_orig_df)
            infRight = getInfluenceOfSet(idxRight, spd_0, X_df, y_df, X_test_df, X_test_orig_df)
            gain = getSplitGain(len(idxLeft), len(idxRight), infLeft, infRight)
            if gain > score:
#                 sc.inverse_transform()
#                 print("Column: ", col)
#                 print("Value: ", val)
                splitCol, splitVal, score = col, val, gain 
                left, right = idxLeft, idxRight
                leftInf, rightInf = infLeft, infRight
    return {'splitCol':splitCol, 'val':splitVal, 
            'idxLeft':left, 'idxRight':right, 
            'leftInf':leftInf, 'rightInf':rightInf
            }

def partition(node, maxDepth, minSize, depth, cols, X_train, y_train, X_test, X_test_orig):
    print("Depth: ", depth)
    X_left = X_train.drop(index=node['idxLeft'], inplace=False)
    y_left = y_train.drop(index=node['idxLeft'], inplace=False)
    X_right = X_train.drop(index=node['idxRight'], inplace=False)
    y_right = y_train.drop(index=node['idxRight'], inplace=False)
    del(node['idxLeft'])
    del(node['idxRight'])
    if len(X_left)==0 or len(X_right)==0:
        node['left'] = node['right'] = node['leftInf'] + node['rightInf']
        return
    if depth >= maxDepth:
        node['left'], node['right'] = node['leftInf'], node['rightInf']
        return
    if len(X_left) <= minSize:
        node['left'] =
        node['leftInf']
    else:
        node['left'] = getAttribute(cols, X_left, y_left, X_test, X_test_orig)
#         X_train = X_train.drop(index=node['idxLeft'], inplace=False)
#         print("Left: ", node['left'])
        partition(node['left'], maxDepth, minSize, depth + 1, cols, X_train, y_train, X_test, X_test_orig)
    # process right child
    if len(X_right) <= minSize:
        node['right'] = node['rightInf']
    else:
        node['right'] = getAttribute(cols, X_right, y_right, X_test, X_test_orig)
#         X_train = X_train.drop(index=node['idxRight'], inplace=False)
#         print("Right: ", node['right'])
        partition(node['right'], maxDepth, minSize, depth + 1, cols, X_train, y_train, X_test, X_test_orig)

def build_tree(X_train, maxDepth, minSize):
    cols = copy.deepcopy(X_train_orig.columns).tolist()
    cols_continuous = ['age', 'hours', 'education.num']
    X_train = pd.DataFrame(data=X_train, columns=cols)
    cols = list(set(cols)-set(cols_continuous))
    root = getAttribute(cols, X_train, y_train, X_test, X_test_orig)
    partition(root, maxDepth, minSize, 1, cols, X_train, y_train, X_test, X_test_orig)
    return root

build_tree(X_train, 2, 20)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Depth:  1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Depth:  2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

In [64]:
def getSplit(indices, method, f_curr, size, infs, v1, del_L_del_theta,
                      X_train, y_train, X_test, X_test_orig):
    X = np.delete(X_train, indices, 0)
    y = y_train.drop(index=indices, inplace=False)
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    del_f = computeFairness(y_pred, X_test_orig) - f_curr
    print("Influence of set: ", del_f)

    f_col = f_curr + del_f
    print("Del_f: ", del_f)
    print("f_col: ", f_col)

#         if ((del_f > 0) and (abs(f_col) > threshold) and (abs(f_col) < abs(f_curr))):
    if ((abs(f_col) > threshold) and (abs(f_col) < abs(f_curr))):
        attr = col
        f_curr = f_col
        print("Attribute passed: ", attr)
        print("f: ", f_curr)            
        return [attr, f_curr]
    return [None, f_curr]

f_curr = spd_0
cols = copy.deepcopy(X_train_orig.columns)
continuous_cols = ["age", "hours", "education.num"]
for col in cols:
    print("Column: ", col)
    idx = cols.get_loc(col)
    if col not in continuous_cols: #binary
        indices = np.where(X_train[:, idx] == 1)
        print(indices)
        [attr, f_curr] = getSplit(indices, 1, spd_0, int(len(X_train)/1000), None, None, None,
                      X_train, y_train, X_test, X_test_orig)
#     else:
#         vals = X_train[col].unique()
#         for val in vals:
            
#         indices = X_train_pred[X_train_pred[col] == 1].index      
        
        
        

Column:  age
Column:  education.num
Column:  hours
Column:  gender_Female
(array([], dtype=int64),)


ValueError: Index data must be 1-dimensional