In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os

# Run configuration is read from the environment. Indexing os.environ (rather than
# calling .get) makes a missing setting fail with a KeyError that names the variable,
# instead of the opaque TypeError that float(None) would raise.
DIVERSITY_UTILITY = float(os.environ['DIVERSITY_UTILITY'])  # weight on R in each stratum's utility
FRAC_ADMIT = float(os.environ['FRAC_ADMIT'])  # fraction of applicants that may be admitted

In [2]:
###
## Load the test applicants (one row per applicant) and tag each row with
## its "<R>_<T>" stratum key
###

df = pd.read_csv('./df_test.csv').assign(
    key=lambda d: d['R'].astype(str) + "_" + d['T'].astype(str)
)

###
## Estimate the mean outcome Y within each (R, T) stratum of the training
## data, then add the diversity bonus DIVERSITY_UTILITY * R to obtain the
## stratum utility (rounded to 2 decimals)
###

df_train = pd.read_csv('./df_train.csv')
df_stratum_utility = (
    df_train.groupby(['R','T'])['Y']
    .mean()
    .reset_index()
    .assign(
        stratum_utility=lambda s: (s['Y'] + DIVERSITY_UTILITY * s['R']).round(2),
        key=lambda s: s['R'].astype(str) + "_" + s['T'].astype(str),
    )
)

# Inner join on the stratum key: test applicants whose (R, T) stratum never
# occurs in the training data have no utility estimate and drop out here.
df = pd.merge(df, df_stratum_utility[['stratum_utility','key']], on='key')
df['ml_outcomes'] = df['stratum_utility']

In [3]:
##
# Preview the applicants ordered by expected utility (lowest first).
# R = race indicator, T = test score, Y = observed outcome,
# ml_outcomes = the applicant's stratum utility: mean training Y for their (R, T)
#               stratum plus DIVERSITY_UTILITY * R, rounded to 2 decimals.
# T_minority / T_majority: presumably the applicant's counterfactual test score under
# each race value -- TODO confirm against the code that generates df_test.csv.
##

df[['R','T','ml_outcomes','T_minority','T_majority','Y']].sort_values(by='ml_outcomes')



Unnamed: 0,R,T,ml_outcomes,T_minority,T_majority,Y
99974,0,12,0.02,10,12,0
99914,0,16,0.03,10,16,0
99913,0,16,0.03,12,16,0
99650,0,17,0.06,14,17,0
99653,0,17,0.06,13,17,0
...,...,...,...,...,...,...
99940,1,94,1.21,94,107,0
99949,1,101,1.23,101,118,1
98552,1,98,1.25,98,114,1
99977,1,112,1.25,112,129,1


In [4]:
#df['ml_outcomes'] = df['ml_outcomes'] + 1*df['R']

In [5]:
###
## Total utility per (R, T) stratum: the sum of ml_outcomes over the
## applicants that fall in the stratum
###

df_ = df.groupby(['R','T'])['ml_outcomes'].sum().reset_index()

In [6]:
###
## Number of applicants in each (R, T) stratum (non-null ml_outcomes rows),
## stored both as 'Count' and as 'N'
###

df_count = df.groupby(['R','T'])['ml_outcomes'].count().reset_index(name='Count')
df_count['N'] = df_count['Count']

In [7]:
###
## Merge summary tables to get one table with Race, Test Score, SUM(Utility), COUNT(applicants) per stratum
##
## The join is positional (index to index), not on (R, T).
## NOTE(review): this assumes df_ and df_count list the strata in the same order -- both come
## from a groupby on ['R','T'] over df; confirm if either table is ever built differently.
##
## The sort / reset_index / sort / reset_index chain leaves the rows in their original
## stratum order with a fresh 0..n-1 index, and adds two bookkeeping columns:
##   'index'   - the stratum's row position in df_
##   'level_0' - the stratum's rank by descending total utility (0 = highest)
###

dff = df_.merge(df_count[['N']],left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False).reset_index().sort_values(by='index').reset_index()

In [8]:
# Final per-stratum table, shown in ascending order of total utility.
# Display only: the sorted copy is not assigned, so dff keeps its original row order.
dff.sort_values(by='ml_outcomes')

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
0,191,0,0,12,0.02,1
3,190,3,0,16,0.06,2
2,189,2,0,15,0.07,1
1,188,1,0,13,0.10,1
104,187,104,1,7,0.25,1
...,...,...,...,...,...,...
49,4,49,0,62,1074.56,1679
42,3,42,0,55,1091.00,2182
44,2,44,0,57,1107.54,2051
46,1,46,0,59,1115.34,1923


### Set up the optimization problem

In [9]:
from ortools.linear_solver import pywraplp


In [10]:
# GLOP is OR-Tools' linear-programming backend.
solver = pywraplp.Solver.CreateSolver('GLOP')
# CreateSolver returns None when the requested backend is unavailable; fail here with a
# clear message rather than with an AttributeError in a later cell.
if solver is None:
    raise RuntimeError("OR-Tools could not create the 'GLOP' linear solver")


In [11]:
# Show the per-stratum table (one row per (R, T)) that the LP variables are built from.
dff

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
0,191,0,0,12,0.02,1
1,188,1,0,13,0.10,1
2,189,2,0,15,0.07,1
3,190,3,0,16,0.06,2
4,183,4,0,17,0.48,8
...,...,...,...,...,...,...
187,172,187,1,95,1.19,1
188,166,188,1,97,1.25,1
189,167,189,1,98,1.25,1
190,169,190,1,101,1.23,1


In [12]:
# One LP variable per (R, T) stratum: the probability of admitting an
# applicant who belongs to that stratum.
applicant_stratum = []  # variables in dff row order
vars_cache = {}         # the same variables, keyed by (R, T)

objective = solver.Objective()

for ix, row in dff.iterrows():
    # admission probability for this stratum, named after its dff row index
    numvar = solver.NumVar(0.0, 1.0, str(ix))
    applicant_stratum.append(numvar)
    vars_cache[(row['R'],row['T'])] = numvar

    # dff has a 0..n-1 index, so the variable just appended is the one at
    # position ix; its objective weight is the stratum's total utility
    objective.SetCoefficient(numvar, float(row['ml_outcomes']))

# Objective: maximize the expected utility of the admitted students
objective.SetMaximization()


In [13]:
# Sanity check: no constraints have been added to the solver yet, so this should be 0.
solver.NumConstraints()

0

In [14]:
# Constraint: At most K applicants
# K is the floor of len(df) * FRAC_ADMIT. The product is rounded to 6 decimals first so
# that floating-point noise (e.g. 100 * 0.29 == 28.999999999999996) cannot knock K down
# by one when it is truncated to an integer.
K = int(round(len(df) * FRAC_ADMIT, 6))
print(K)
admit_quota = solver.Constraint(0, K)

# Total admits cannot exceed K: sum over strata of N * P(admit) must lie in [0, K]
for ix, row in dff.iterrows():
    admit_quota.SetCoefficient(applicant_stratum[ix], float(row['N']))

50000


In [15]:
# Sanity check: only the admission-quota constraint exists so far, so this should be 1.
solver.NumConstraints()

1

## Add Counterfactual Fairness Constraints

In [16]:
from collections import Counter

def convertListToProb(raw_list):
    """Return the empirical distribution of the values in ``raw_list``.

    Args:
        raw_list: list of hashable values (test scores in this notebook).

    Returns:
        A list of ``(probability, value)`` tuples, one per distinct value in
        order of first appearance, where ``probability`` is that value's
        share of ``raw_list``. An empty input yields an empty list.
    """
    total = float(len(raw_list))
    return [(occurrences / total, value) for value, occurrences in Counter(raw_list).items()]

In [17]:
# For every test score T seen among R == 0 applicants, collect their T_minority
# values and turn them into an empirical distribution [(probability, T_minority), ...].
T_minoritys_list = (
    df.loc[df['R']==0, ['T','T_minority']]
    .groupby('T')['T_minority']
    .apply(list)
    .reset_index(name='T_minoritys')
)
T_minoritys_list['probs'] = T_minoritys_list['T_minoritys'].apply(convertListToProb)

didntexist = 0  # (1, T_minority) strata that had no LP variable yet
exists = 0      # (1, T_minority) strata that already had one

for ix, row in T_minoritys_list.iterrows():
    # Equality constraint:
    #   sum_t P(T_minority = t | T) * x[(1, t)]  -  x[(0, T)]  ==  0
    cf_fair_stratum = solver.Constraint(0.0, 0.0)

    majority_T = row['T']
    minoritys_Ts = row['probs']
    cf_fair_stratum.SetCoefficient(vars_cache[(0.0, majority_T)], -1.0)

    for weight, cf_T in minoritys_Ts:
        cf_key = (1.0, cf_T)
        if cf_key in vars_cache:
            exists+=1
        else:
            # No applicant stratum produced this variable, so create it on demand;
            # this cell gives it only its [0, 1] bounds and the fairness coefficient.
            vars_cache[cf_key] = solver.NumVar(0.0, 1.0, str(cf_key))
            didntexist+=1
        cf_fair_stratum.SetCoefficient(vars_cache[cf_key], weight)
didntexist,exists

(6, 711)

In [18]:
# For every test score T seen among R == 1 applicants, collect their T_majority
# values and turn them into an empirical distribution [(probability, T_majority), ...].
T_majoritys_list = (
    df.loc[df['R']==1, ['T','T_majority']]
    .groupby('T')['T_majority']
    .apply(list)
    .reset_index(name='T_majoritys')
)
T_majoritys_list['probs'] = T_majoritys_list['T_majoritys'].apply(convertListToProb)

didntexist = 0  # (0, T_majority) strata that had no LP variable yet
exists = 0      # (0, T_majority) strata that already had one

for ix, row in T_majoritys_list.iterrows():
    # Equality constraint:
    #   sum_t P(T_majority = t | T) * x[(0, t)]  -  x[(1, T)]  ==  0
    cf_fair_stratum = solver.Constraint(0.0, 0.0)

    minority_T = row['T']
    majority_Ts = row['probs']
    cf_fair_stratum.SetCoefficient(vars_cache[(1.0, minority_T)], -1.0)

    for weight, cf_T in majority_Ts:
        cf_key = (0.0, cf_T)
        if cf_key in vars_cache:
            exists+=1
        else:
            # No applicant stratum produced this variable, so create it on demand;
            # this cell gives it only its [0, 1] bounds and the fairness coefficient.
            vars_cache[cf_key] = solver.NumVar(0.0, 1.0, str(cf_key))
            didntexist+=1
        cf_fair_stratum.SetCoefficient(vars_cache[cf_key], weight)
didntexist,exists

(2, 651)

In [19]:
# Total constraint count: the admission quota plus one fairness constraint for each
# T value in T_minoritys_list and in T_majoritys_list.
solver.NumConstraints()

193

## Solve linear program

In [20]:
# Show the integer code of the solver's ABNORMAL status constant, for reference when
# reading the status returned by Solve().
solver.ABNORMAL

4

In [21]:
# Solve the LP and keep the returned status code so it can be compared with solver.OPTIMAL.
status = solver.Solve()


In [22]:
# Display the status code returned by Solve().
status

0

In [23]:
# Display the code of the OPTIMAL status constant; the solve succeeded if `status` equals it.
solver.OPTIMAL

0

In [24]:
# Read the solution back for the per-stratum variables, in dff row order.
# Each variable was named after its dff row index, so its string form parses to that id.
row = [int(str(stratum_var)) for stratum_var in applicant_stratum]
# Optimal admission probability of each stratum
admit = [stratum_var.solution_value() for stratum_var in applicant_stratum]

df_decisions = pd.DataFrame({'row_id':row,'decision':admit})

In [25]:
# Display the admission probability ('decision') the LP assigned to each stratum row.
df_decisions

Unnamed: 0,row_id,decision
0,0,1.059064e-11
1,1,4.681036e-01
2,2,0.000000e+00
3,3,2.586908e-11
4,4,4.618012e-01
...,...,...
187,187,1.000000e+00
188,188,1.000000e+00
189,189,1.000000e+00
190,190,1.000000e+00


In [26]:
# Attach each stratum's admission probability to the stratum table (index-to-index join;
# both frames use the same 0..n-1 row index) and show strata by descending total utility.
dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
45,0,45,0,58,1123.36,2006,45,4.999609e-01
46,1,46,0,59,1115.34,1923,46,4.999609e-01
44,2,44,0,57,1107.54,2051,44,4.999609e-01
42,3,42,0,55,1091.00,2182,42,4.999609e-01
49,4,49,0,62,1074.56,1679,49,4.999609e-01
...,...,...,...,...,...,...,...,...
104,187,104,1,7,0.25,1,104,4.681036e-01
1,188,1,0,13,0.10,1,1,4.681036e-01
2,189,2,0,15,0.07,1,2,0.000000e+00
3,190,3,0,16,0.06,2,3,2.586908e-11


In [27]:
# Same stratum/decision join as the previous cell, kept in xxx_ so it can be inspected
# ordered by test score (the sorted view is displayed only, not stored).
xxx_ = dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)
xxx_.sort_values(by='T')

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
104,187,104,1,7,0.25,1,104,4.681036e-01
0,191,0,0,12,0.02,1,0,1.059064e-11
105,186,105,1,12,0.31,1,105,0.000000e+00
106,182,106,1,13,0.60,2,106,4.681036e-01
1,188,1,0,13,0.10,1,1,4.681036e-01
...,...,...,...,...,...,...,...,...
99,161,99,0,113,1.98,2,99,1.000000e+00
100,177,100,0,115,1.00,1,100,1.000000e+00
101,176,101,0,116,1.00,1,101,1.000000e+00
102,179,102,0,118,0.98,1,102,1.000000e+00


In [28]:
# Per-stratum table with the LP admission probability attached (index-to-index join),
# ordered by descending total utility. Later cells use xxx to score the policy.
xxx = dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)
xxx

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
45,0,45,0,58,1123.36,2006,45,4.999609e-01
46,1,46,0,59,1115.34,1923,46,4.999609e-01
44,2,44,0,57,1107.54,2051,44,4.999609e-01
42,3,42,0,55,1091.00,2182,42,4.999609e-01
49,4,49,0,62,1074.56,1679,49,4.999609e-01
...,...,...,...,...,...,...,...,...
104,187,104,1,7,0.25,1,104,4.681036e-01
1,188,1,0,13,0.10,1,1,4.681036e-01
2,189,2,0,15,0.07,1,2,0.000000e+00
3,190,3,0,16,0.06,2,3,2.586908e-11


In [29]:
# Build the "<R>_<T>" stratum key on both the applicant table and the per-stratum
# decision table so that they can be joined on it.
for frame in (df, xxx):
    frame['key'] = frame['R'].astype(str)+'_'+frame['T'].astype(str)

In [30]:
# Number of (R, T) strata that received a decision.
len(xxx)

192

In [31]:
# Attach every applicant's stratum-level admission probability ('decision') via the stratum key.
# R, T and ml_outcomes exist in both frames, so the merge suffixes them _x (from df) / _y (from xxx).
admit_decisions = df.merge(xxx,how='left',on='key')

# Random baseline: admit each applicant independently with probability FRAC_ADMIT.
# A dedicated seeded generator makes the baseline reproducible from run to run, and the
# explicit index keeps the draws aligned with admit_decisions' rows.
baseline_rng = random.Random(0)
admit_decisions['decision_random'] = pd.Series(
    [baseline_rng.random() for _ in range(len(admit_decisions))],
    index=admit_decisions.index,
) < FRAC_ADMIT

# LP policy: decision-weighted mean of R (the expected share of admits with R == 1 when R
# is a 0/1 indicator) and the decision-weighted sum of the observed outcome Y.
FRAC_minority_POLICY = (admit_decisions['R_y'] * admit_decisions['decision']).sum()/admit_decisions['decision'].sum()
SUM_BP_POLICY = ( admit_decisions['Y'] * admit_decisions['decision']).sum()

# The same two metrics for the random baseline (computed for comparison; not written to disk).
FRAC_RANDOM_POLICY = (admit_decisions['R_y'] * admit_decisions['decision_random']).sum()/admit_decisions['decision_random'].sum()
SUM_BP_RAND_POLICY = ( admit_decisions['Y'] * admit_decisions['decision_random']).sum()


In [32]:
# Append one tab-separated result row: policy label, minority fraction, expected sum of Y.
# The context manager guarantees the file is closed even if the write raises.
with open('./lp_results.csv','a') as file:
    file.write('{}\t{}\t{}\n'.format('Counterfactual Fairness',str(FRAC_minority_POLICY),str(SUM_BP_POLICY)))


In [33]:
# Expected fraction of applicants admitted: sum over strata of N * admission probability,
# divided by the number of applicants. The quota constraint caps the numerator at K.
(xxx['N']*xxx['decision']).sum()/len(df)

0.5000000000000003