In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [2]:
import adiscriminator as ad

# Get data

In [3]:
# Load the dataset through the adiscriminator helper, then convert it into the
# feature matrix X and target vector y that every model below is fitted on.
adult = ad.data.get_data()
X, y = ad.data.data_to_np(adult)

In [4]:
# Sanity check: array dimensions and the mean of the target
# (the base rate, assuming y is coded 0/1 — TODO confirm).
X.shape, y.shape, y.mean()

((32560, 6), (32560,), 0.7591830466830467)

In [5]:
# Group indicator as a NumPy integer array: 1 where `sex` equals ' Female'
# (the label carries a leading space in this data), 0 otherwise.
protected_column = adult['sex'].eq(' Female').astype(int).to_numpy()

# Build model without regularisation

In [6]:
# Baseline: adiscriminator's logistic regression constructed with its defaults
# (per the section heading, this is the unregularised reference model).
model = ad.logistic_regression.base.LogisticRegression()

In [7]:
# Fit on the full X, y; the metrics computed later in the notebook reuse the
# same data, so they are in-sample.
model.fit(X, y)

<adiscriminator.logistic_regression.base.LogisticRegression at 0x7fc7bc1390d0>

In [8]:
# Fitted coefficient table with `std_coef` and `coef` columns
# (presumably standardised-scale vs original-scale values — confirm in adiscriminator).
model.coefficients

Unnamed: 0,name,std_coef,coef
0,intercept,1.351561,8.450479
1,x1,-0.591669,-0.04337612
2,x2,-0.060192,-5.702823e-07
3,x3,-0.83161,-0.323248
4,x4,-2.353576,-0.0003186843
5,x5,-0.282279,-0.0007005138
6,x6,-0.50506,-0.04090409


In [9]:
# Store the baseline model's predictions as a new column on `adult`
# (mutates the frame in place; later cells average these per `sex` group).
adult['model_preds'] = model.predict_proba(X)

In [10]:
# Mean baseline prediction; compare against y.mean() from the sanity check.
adult['model_preds'].mean()

0.7591828755054341

# Build model with L2 regularisation

## Using adiscriminator

In [11]:
# L2-penalised model from adiscriminator: penalty strength 5, with the
# intercept included in the penalty.
model2 = ad.logistic_regression.ridge.RidgeRegression(lambda_=5, penalise_intercept=True)

In [12]:
# Fit the L2-penalised model on the same X, y as the baseline.
model2.fit(X, y)

<adiscriminator.logistic_regression.ridge.RidgeRegression at 0x7fc7bc88dcd0>

In [13]:
# Coefficient table for the penalised fit. The `std_coef` column is presumably
# the one to read against the scikit-learn model below, which is fitted on
# scaled X — confirm.
model2.coefficients

Unnamed: 0,name,std_coef,coef
0,intercept,1.353157,8.42353
1,x1,-0.590112,-0.043262
2,x2,-0.059913,-5.67638e-07
3,x3,-0.828911,-0.3221988
4,x4,-2.298129,-0.0003111766
5,x5,-0.281212,-0.0006978648
6,x6,-0.503613,-0.04078684


In [14]:
# Store the penalised model's predictions as another column on `adult`
# (in-place mutation, same pattern as `model_preds`).
adult['model2_preds'] = model2.predict_proba(X)

In [15]:
# Mean prediction of the penalised model, for comparison with the baseline mean.
adult['model2_preds'].mean()

0.7589751075704007

## Compare to scikit-learn L2 model
Note: X must be standardised first, and scikit-learn's regularisation parameter `C` is the reciprocal of `lambda_` (here `C = 1/5` for `lambda_ = 5`).

In [16]:
# Standardise the features with scikit-learn's `preprocessing.scale` defaults
# before fitting the scikit-learn model; X itself is left untouched.
X_scaled = preprocessing.scale(X)

In [17]:
# scikit-learn reference model: L2 penalty with C = 1/5, the reciprocal of the
# lambda_ = 5 used for the adiscriminator ridge model.
# NOTE(review): that ridge model was built with penalise_intercept=True; check
# whether scikit-learn's default solver penalises the intercept before treating
# the two fits as exactly like-for-like.
sklearn_model = LogisticRegression(penalty='l2', C=1 / 5, fit_intercept=True)

In [18]:
# Fit the scikit-learn model on the standardised features.
sklearn_model.fit(X_scaled, y)

LogisticRegression(C=0.2)

In [19]:
# Print the fitted intercept and the first (only, for a binary target) row of
# coefficients for a side-by-side check against the adiscriminator ridge fit.
print(sklearn_model.intercept_, sklearn_model.coef_[0])

[1.35547861] [-0.59069404 -0.05995663 -0.8295936  -2.29543759 -0.28128687 -0.50406425]


# Build models penalising differences in average prediction

## Try different penalty strengths

In [20]:
# Sweep the group-mean penalty over 21 evenly spaced strengths (0, 10, ..., 200).
# Each fit writes its predictions to `adult` under a column named after the
# strength, e.g. 'model_p10.0_preds' (the strength is formatted as a float).
for strength in np.linspace(0, 200, 21):
    model_p = ad.logistic_regression.fair.GroupMeanEqualisingRegression(
        lambda_=strength,
        group=protected_column,
    ).fit(X, y)
    adult[f'model_p{strength}_preds'] = model_p.predict_proba(X)

## Gather results

In [21]:
# Column names to summarise: the baseline first, then one column per penalty
# strength, generated with the same linspace grid and f-string as the sweep.
prediction_columns = ['model_preds']
prediction_columns.extend(
    f'model_p{strength}_preds' for strength in np.linspace(0, 200, 21)
)

In [22]:
# Mean prediction per `sex` group for every model, transposed so each row is a
# model and each column a group; `diff` is the Female-minus-Male gap.
results = (
    adult.groupby('sex')[prediction_columns]
    .mean()
    .transpose()
)
results['diff'] = results[' Female'].sub(results[' Male'])

In [23]:
# Add one column per metric, scoring each model's predictions thresholded at
# 0.5 against y. Values are assigned positionally, in `prediction_columns`
# order, which is also the row order of `results`.
for metric_name, metric_fn in (('accuracy', accuracy_score), ('f1', f1_score)):
    results[metric_name] = [
        metric_fn(y, adult[column] > 0.5) for column in prediction_columns
    ]

In [24]:
# Summary table: per-group mean prediction, the Female-minus-Male gap, and
# accuracy / F1 at a 0.5 threshold for each model.
# NOTE(review): in the stored output, accuracy and F1 are identical for
# strengths 10-60 while the group means move non-monotonically — worth checking
# that the penalised fits are converging.
results

sex,Female,Male,diff,accuracy,f1
model_preds,0.809917,0.734103,0.075814,0.815387,0.886398
model_p0.0_preds,0.809917,0.734103,0.075814,0.815387,0.886398
model_p10.0_preds,0.782867,0.715137,0.06773,0.801843,0.878397
model_p20.0_preds,0.848566,0.770168,0.078399,0.801843,0.878397
model_p30.0_preds,0.80762,0.735491,0.072129,0.801843,0.878397
model_p40.0_preds,0.729788,0.67264,0.057149,0.801843,0.878397
model_p50.0_preds,0.663241,0.621049,0.042191,0.801843,0.878397
model_p60.0_preds,0.663241,0.621049,0.042191,0.801843,0.878397
model_p70.0_preds,0.744288,0.697045,0.047243,0.802058,0.880265
model_p80.0_preds,0.799119,0.751609,0.047509,0.799939,0.879811
