In [56]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import mean_squared_error

In [6]:
import xgboost as xgb

In [2]:
import adiscriminator as ad

# Get data

In [9]:
adult = ad.data.get_data()
X, y = ad.data.data_to_np(adult)

In [10]:
X.shape, y.shape, y.mean()

((32560, 6), (32560,), 0.7591830466830467)

In [11]:
protected_column = np.array((adult.sex == ' Female').astype(int))

# Set up xgb DMatrix

In [13]:
X_xgb = xgb.DMatrix(X, label = y)

In [14]:
setattr(X_xgb, 'group', protected_column)

In [18]:
setattr(X_xgb, 'N1', protected_column.sum())
setattr(X_xgb, 'N2', protected_column.shape[0] - protected_column.sum())

In [None]:
setattr(X_xgb, 'group', protected_column)

In [199]:
setattr(X_xgb, 'lambda_', 0.001)

# Define loss function gradient and hessian

In [207]:
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    p_one_minus_p = preds * (1.0 - preds)
    d = (np.sum(preds[dtrain.group]) / X_xgb.N1) - (np.sum(preds[np.invert(dtrain.group)]) / X_xgb.N2)
    print((- labels * np.log(preds) - (1 - labels) * np.log(1 - preds) + dtrain.lambda_ * np.log(1 - d**2)).sum())
    grad = preds - labels
    p_one_minus_p_d = (np.sum(p_one_minus_p[dtrain.group]) / X_xgb.N1) - (np.sum(p_one_minus_p[np.invert(dtrain.group)]) / X_xgb.N2)
    d2pdx2 = p_one_minus_p * (1 - 2 * preds)
    d2pdx2_d = (np.sum(d2pdx2[dtrain.group]) / X_xgb.N1) - (np.sum(d2pdx2[np.invert(dtrain.group)]) / X_xgb.N2)
    grad = grad - (2 * d * dtrain.lambda_) * p_one_minus_p_d / (1 - d ** 2)
    hess = p_one_minus_p 
    hess_extra_1 = -2 * dtrain.lambda_ * (1 + d ** 2) * p_one_minus_p_d  / (1 - d ** 2) ** 2 
    hess_extra_2 = -2 * dtrain.lambda_ * d2pdx2_d / (1 - d ** 2) 
    hess = hess + hess_extra_1 + hess_extra_2
    return grad, hess

# Build model

In [200]:
param = {'max_depth': 2, 'eta': 1, 'eval_metric': 'logloss'}
watchlist = [(X_xgb, 'train')]
num_round = 100

In [208]:
bst = xgb.train(param, X_xgb, num_round, watchlist, obj=logregobj)

19279.691
[0]	train-logloss:4.26792
nan
[1]	train-logloss:15.88878
36050.24
[2]	train-logloss:4.26291
nan
[3]	train-logloss:3.85816
nan
[4]	train-logloss:5.03606
nan
[5]	train-logloss:5.27794
nan
[6]	train-logloss:5.08399
nan
[7]	train-logloss:5.17690
nan
[8]	train-logloss:5.14558
nan
[9]	train-logloss:5.05676
nan
[10]	train-logloss:13.85444
nan
[11]	train-logloss:4.74888
nan
[12]	train-logloss:4.63285
nan
[13]	train-logloss:4.63285
nan
[14]	train-logloss:4.63285
nan
[15]	train-logloss:4.63285
nan
[16]	train-logloss:4.63285
nan
[17]	train-logloss:4.63285
nan
[18]	train-logloss:4.63285
nan
[19]	train-logloss:4.63285
nan
[20]	train-logloss:4.63285
nan
[21]	train-logloss:4.63285
nan
[22]	train-logloss:4.63285
nan
[23]	train-logloss:4.63285
nan
[24]	train-logloss:4.63285
nan
[25]	train-logloss:4.63285
nan
[26]	train-logloss:4.63285
nan
[27]	train-logloss:4.63285
nan
[28]	train-logloss:4.63285
nan
[29]	train-logloss:4.63285
nan
[30]	train-logloss:4.63285


  
  
  


nan
[31]	train-logloss:4.63285
nan
[32]	train-logloss:4.63285
nan
[33]	train-logloss:4.63285
nan
[34]	train-logloss:4.63285
nan
[35]	train-logloss:4.63285
nan
[36]	train-logloss:4.63285
nan
[37]	train-logloss:4.63285
nan
[38]	train-logloss:4.63285
nan
[39]	train-logloss:4.63285
nan
[40]	train-logloss:4.63285
nan
[41]	train-logloss:4.63285
nan
[42]	train-logloss:4.63285
nan
[43]	train-logloss:4.63285
nan
[44]	train-logloss:4.63285
nan
[45]	train-logloss:4.63285
nan
[46]	train-logloss:4.63285
nan
[47]	train-logloss:4.63285
nan
[48]	train-logloss:4.63285
nan
[49]	train-logloss:4.63285
nan
[50]	train-logloss:4.63285
nan
[51]	train-logloss:4.63285
nan
[52]	train-logloss:4.63285
nan
[53]	train-logloss:4.63285
nan
[54]	train-logloss:4.63285
nan
[55]	train-logloss:4.63285
nan
[56]	train-logloss:4.63285
nan
[57]	train-logloss:4.63285
nan
[58]	train-logloss:4.63285
nan
[59]	train-logloss:4.63285
nan
[60]	train-logloss:4.63285
nan
[61]	train-logloss:4.63285
nan
[62]	train-logloss:4.63285
nan
[63]

In [202]:
adult['predictions'] = pd.Series(1.0 / (1.0 + np.exp(-bst.predict(X_xgb, ntree_limit = 4))))

In [203]:
adult.groupby('sex')[['predictions', 'income']].mean()

Unnamed: 0_level_0,predictions,income
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.843208,0.890539
Male,0.785265,0.694249


In [204]:
print(accuracy_score(y, adult['predictions'] > 0.5), f1_score(y, adult['predictions'] > 0.5))

0.8126535626535627 0.8898200997037787


In [96]:
adult.groupby('sex')[['predictions', 'income']].mean()

Unnamed: 0_level_0,predictions,income
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.828104,0.890539
Male,0.725014,0.694249


In [97]:
print(accuracy_score(y, adult['predictions'] > 0.5), f1_score(y, adult['predictions'] > 0.5))

0.8505528255528255 0.907119679328116
