# Fairness Checking: Linear Program (Statistical Parity)
This notebook uses PuLP to solve the linear program outlined in the "Fairness Checking" document. 

In [0]:
!pip install pulp



In [0]:
import pandas as pd
import numpy as np
import pulp

In [0]:
# Run this cell to mount your Google Drive.
# Colab-only step: exposes the Drive contents under /content/drive, which is
# where the dataset loaded later in this notebook is read from.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import Predicted Data
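We take A (the protected attribute) to be the 'race' variable, with a = 0 and a' = 1. The last column, 'prediction', is the binary predicted label (0 or 1); the linear program below uses the 'prediction_probs' column (the predicted probability of the label) as our f(X) variable.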

In [0]:
# Location of the scored dataset on the mounted Google Drive.
PREDICTIONS_CSV = '/content/drive/My Drive/colab/fairness_checking/predicted_dataset_scores.csv'

# Load the predictions into a DataFrame with pandas' default parsing options.
df = pd.read_csv(PREDICTIONS_CSV)

In [0]:
# Preview the first five rows (five is the default for DataFrame.head).
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,two_year_recid,c_charge_degree_F,c_charge_degree_M,risk_recid,prediction_probs,prediction
0,3,0.000693,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.029285,0
1,4,0.000832,1.0,0.294872,0.0,0.0,0.0,0.0,0.368421,1.0,1.0,0.0,1.0,0.73793,1
2,5,0.00097,1.0,0.320513,0.0,0.0,0.0,0.0,0.078947,0.0,1.0,0.0,0.0,0.16901,0
3,9,0.001664,0.0,0.24359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.057516,0
4,16,0.002634,1.0,0.166667,0.0,0.0,0.0,0.0,0.131579,1.0,1.0,0.0,0.0,0.623609,1


In [0]:
# Our protected variable A is race, and a = 0 or a = 1.
a = df['race']

# np.asarray returns a new array rather than converting `a` in place, so the
# result has to be kept. Working on the plain array also means the indices
# found below are row positions (0..n-1), independent of the DataFrame's
# index labels.
a_values = np.asarray(a)

# Row positions of each group, as plain Python ints. Rows whose race value is
# neither 0 nor 1 (including missing values) end up in neither list.
a_0_indices = np.flatnonzero(a_values == 0).tolist()
a_1_indices = np.flatnonzero(a_values == 1).tolist()

In [0]:
# The pi values are the empirical group proportions: each group's share of the
# rows that were assigned to either a = 0 or a = 1.
n_group_0 = len(a_0_indices)
n_group_1 = len(a_1_indices)
n_grouped_total = n_group_0 + n_group_1

pi_0 = float(n_group_0) / n_grouped_total
pi_1 = float(n_group_1) / n_grouped_total

In [0]:
# Show both group proportions, one per line.
print(pi_0, pi_1, sep='\n')

0.4943181818181818
0.5056818181818182


In [0]:
# f(X) is taken from the 'prediction_probs' column, i.e. the predicted
# probability of the label rather than the thresholded 0/1 label itself.
f_X = df['prediction_probs']

# Display the scores as a plain NumPy array.
f_X.values

array([0.02928513, 0.7379298 , 0.1690098 , ..., 0.99888   , 0.47041905,
       0.53738034])

In [0]:
# Sanity check: the number of scores should equal the number of rows that
# were assigned to one of the two groups (the two printed values should match).
n_scores = len(f_X)
n_rows_in_groups = len(a_0_indices) + len(a_1_indices)
print(n_scores)
print(n_rows_in_groups)

1232
1232


## Create Linear Program
We use the PuLP package to create our linear program.

In [0]:
# Define the linear program as a maximization problem.
# PuLP does not permit spaces in problem names (it emits a warning and
# replaces them with underscores), so the name is written with underscores
# directly; the resulting problem name is the same, without the warning.
model = pulp.LpProblem("Statistical_Parity_Fairness_Checking", pulp.LpMaximize)

In [0]:
# Decision variables: one continuous weight w_i per row of f_X, keyed by the
# row position 0..len(f_X)-1. The non-negativity constraint w_i >= 0 is
# imposed here through lowBound=0 rather than as a separate model constraint.
n_weights = len(f_X)
w = pulp.LpVariable.dicts("w", range(n_weights), lowBound=0, cat='Continuous')

In [0]:
# Objective function:
#   sum_{i in group 0} (1/pi_0) * w_i * f(X_i)  -  sum_{i in group 1} (1/pi_1) * w_i * f(X_i)
# The per-group scale factors are computed once; group 1 carries the minus sign.
group_0_scale = 1. / pi_0
group_1_scale = -(1. / pi_1)

# Keep the multiplication order scale * variable * score so that PuLP builds
# each term from its own variable object.
objective_terms = [group_0_scale * w[i] * f_X[i] for i in a_0_indices]
objective_terms += [group_1_scale * w[i] * f_X[i] for i in a_1_indices]

model += pulp.lpSum(objective_terms)

In [0]:
# Constraint: the weights placed on group a = 0 rows must total pi_0,
# i.e. \sum(w_i * 1{a = 0}) = pi_0
group_0_weights = [w[i] for i in a_0_indices]
model += pulp.lpSum(group_0_weights) == pi_0

In [0]:
# Constraint: the weights placed on group a = 1 rows must total pi_1,
# i.e. \sum(w_i * 1{a = 1}) = pi_1
group_1_weights = [w[i] for i in a_1_indices]
model += pulp.lpSum(group_1_weights) == pi_1

In [0]:
# Constraint: all weights together must sum to 1, so that w is a probability
# distribution over the rows.
all_weights = [w[i] for i in range(len(f_X))]
model += pulp.lpSum(all_weights) == 1

In [0]:
# Solve the linear program
# No solver is passed, so PuLP picks its default one. The value displayed by
# this cell is the integer status code returned by solve(); it is translated
# to a readable status string in the next cell.
model.solve()

1

In [0]:
# Translate the solver's integer status code into its human-readable name.
pulp.LpStatus[model.status]

'Optimal'

In [0]:
# Gather the solved value of every weight, keyed by its PuLP variable object
# and listed in row order.
results_dict = {w[i]: w[i].varValue for i in range(len(f_X))}

In [0]:
# Print the results
# results_dict

In [0]:
# The final value of the objective, optimized
# pulp.value evaluates the objective expression at the variable values found
# by the solver.
pulp.value(model.objective)

0.9842211002659029