In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
pd.options.display.max_rows = 10
pd.options.display.max_columns = 500

In [2]:
import os
DIVERSITY_UTILITY = float(os.environ.get('DIVERSITY_UTILITY'))
FRAC_ADMIT = float(os.environ.get('FRAC_ADMIT'))
###
## Load dataframe where each row is a single applicant
###

df = pd.read_csv('./df_test.csv')
df['key'] = df['R'].astype(str) + "_"  + df['T'].astype(str)

###
## Estimate E[Y(1)|T, A]
###

df_train = pd.read_csv('./df_train.csv')
df_stratum_utility = df_train[['R','T','Y']].groupby(['R','T']).mean().reset_index()
df_stratum_utility['stratum_utility'] = (df_stratum_utility['Y'] + DIVERSITY_UTILITY * df_stratum_utility['R']).round(2)
df_stratum_utility['key'] = df_stratum_utility['R'].astype(str) + "_" + df_stratum_utility['T'].astype(str)

df = df.merge(df_stratum_utility[['stratum_utility','key']], on='key')
df['ml_outcomes'] = df['stratum_utility']

In [5]:
len(df)

999999

In [6]:
#df['ml_outcomes'] = df['R']

In [7]:
##
# R = race, T = test score, ml_outcomes = expected utility from admitting, 
# ml_outcomes_{} counterfactual utility given race 
##

df[['R','T','ml_outcomes','T_minority','T_majority','Y']].sort_values(by='ml_outcomes')



Unnamed: 0,R,T,ml_outcomes,T_minority,T_majority,Y
999835,0,11,0.00,11,11,0
999838,0,11,0.00,7,11,0
999837,0,11,0.00,8,11,0
999836,0,11,0.00,11,11,0
999886,0,12,0.02,11,12,0
...,...,...,...,...,...,...
999913,1,98,1.25,98,115,1
999914,1,98,1.25,98,115,1
999925,1,109,1.25,109,126,1
998845,1,97,1.25,97,112,1


In [8]:
#df['ml_outcomes'] = df['ml_outcomes'] + 1*df['R']

In [9]:
###
## Get total utility from admitting people in a stratum 
###

df_ = df[['R','T','Y','ml_outcomes']].groupby(['R','T','Y']).sum().reset_index()

In [10]:
###
## Get number of people in each stratum
###

df_count = df[['R','T','ml_outcomes','Y']].groupby(['R','T','Y']).count().reset_index()
df_count.columns = ['R','T','Y','Count']
df_count['N'] = df_count['Count']

In [11]:
###
## Merge summary tables to get one table with Race, Test Score, SUM(Utility), COUNT(applicants) per stratum
###

dff = df_.merge(df_count[['N']],left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False).reset_index().sort_values(by='index').reset_index()

In [12]:
# final info table
dff.sort_values(by='ml_outcomes')

Unnamed: 0,level_0,index,R,T,Y,ml_outcomes,N
2,395,2,0,11,0,0.00,4
1,394,1,0,10,0,0.08,1
3,393,3,0,12,0,0.08,4
8,392,8,0,16,1,0.09,3
0,391,0,0,9,0,0.10,1
...,...,...,...,...,...,...,...
96,4,96,0,60,1,6601.80,11003
104,3,104,0,64,1,6646.40,9920
102,2,102,0,63,1,6676.80,10272
98,1,98,0,61,1,6797.06,10963


### Setup optimization problem 

In [13]:
from ortools.linear_solver import pywraplp


In [14]:
solver = pywraplp.Solver.CreateSolver('GLOP')


In [15]:
dff

Unnamed: 0,level_0,index,R,T,Y,ml_outcomes,N
0,391,0,0,9,0,0.10,1
1,394,1,0,10,0,0.08,1
2,395,2,0,11,0,0.00,4
3,393,3,0,12,0,0.08,4
4,374,4,0,13,0,1.00,10
...,...,...,...,...,...,...,...
391,363,391,1,107,1,1.25,1
392,364,392,1,109,1,1.25,1
393,365,393,1,111,1,1.25,1
394,379,394,1,113,1,0.92,1


In [16]:
applicant_stratum = []
vars_cache = {}

# Objective: Maximize the expected utility of the admitted students
objective = solver.Objective()

# For each stratum
for ix, row in dff.iterrows():
    # probability of admission
    numvar = solver.NumVar(0.0, 1.0, str(ix))
    
    # store variable by index, and also by stratum R, T
    applicant_stratum.append(numvar)
    vars_cache[(row['R'],row['T'],row['Y'])] = numvar
    
    # Benefit of admitting people is total utility in that stratum
    objective.SetCoefficient(applicant_stratum[ix], float(row['ml_outcomes']))
objective.SetMaximization()


In [17]:
# Currently we have no constraints 
solver.NumConstraints()

0

In [18]:
# Constraint: At most K applicants
K = int(len(df)*FRAC_ADMIT)
print(K)
admit_quota = solver.Constraint(0, K)

# Total applicants cannot exceed K 
for ix, row in dff.iterrows():
    admit_quota.SetCoefficient(applicant_stratum[ix], float(row['N']))

499999


In [19]:
# Now we have one constraint
solver.NumConstraints()

1

## Add Equalized Odds Constraints

In [20]:
## Make sure that you have to add all people in Y stratum or none
## i.e. you can't add only people who pass boards and reject those who fail boards from same T, R stratum
didntexist, exists = 0, 0 

for ix, row in dff.iterrows():
    constrain_bp = solver.Constraint(0.0, 0.0)
    
    var1 = vars_cache[(row['R'],row['T'],row['Y'])]
    key2 = (row['R'],row['T'], 1-row['Y'])
    
    if key2 not in vars_cache:
        didntexist+=1
        continue
        
    var2 = vars_cache[key2]
    
    constrain_bp.SetCoefficient(var1, -1.0)
    constrain_bp.SetCoefficient(var2, 1.0)
    exists+=1

didntexist, exists

(48, 348)

In [21]:
majority_pass_boards = []
majority_fail_boards = []
minority_pass_boards = []
minority_fail_boards = []

for key in vars_cache:
    r, t, Y = key
    if Y == 1 and r==0:
        majority_pass_boards.append(key)
    elif Y == 0 and r==0:
        majority_fail_boards.append(key)
    elif Y == 1 and r==1:
        minority_pass_boards.append(key)
    elif Y == 0 and r==1:
        minority_fail_boards.append(key)

len(majority_pass_boards),len(majority_fail_boards),len(minority_pass_boards),len(minority_fail_boards)

(111, 99, 96, 90)

In [22]:
NUM_TOTALS = {}
df_totals = dff[['N','R','Y']].groupby(['R','Y']).sum().reset_index()
for ix, row in df_totals.iterrows():
    NUM_TOTALS[(row['R'],row['Y'])] = row['N']
    
N_IN_STRATAS = {}
for ix, row in dff.iterrows():
    N_IN_STRATAS[(row['R'],row['T'],row['Y'])] = row['N']

In [23]:
# Now we have one constraint
solver.NumConstraints()

397

In [24]:
#Of those who pass the boards exams
#Frac majority admitted and frac minority admitted should be the same

constrain_pass_boards = solver.Constraint(0.0, 0.0)

for key in majority_pass_boards:
    r, t, Y = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,Y)]
    N_TOTAL = NUM_TOTALS[(r,Y)]
    
    constrain_pass_boards.SetCoefficient(vars_cache[key], float(N_IN_STRATUM) / float(N_TOTAL))

for key in minority_pass_boards:
    r, t, Y = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,Y)]
    N_TOTAL = NUM_TOTALS[(r,Y)]
    
    constrain_pass_boards.SetCoefficient(vars_cache[key], -1.0 * (float(N_IN_STRATUM) / float(N_TOTAL)))


In [25]:
#Of those who fail the boards exams
#Frac majority admitted and frac minority admitted should be the same

constrain_fail_boards = solver.Constraint(0.0, 0.0)

for key in majority_fail_boards:
    r, t, Y = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,Y)]
    N_TOTAL = NUM_TOTALS[(r,Y)]
    
    constrain_fail_boards.SetCoefficient(vars_cache[key], float(N_IN_STRATUM) / float(N_TOTAL))

for key in minority_fail_boards:
    r, t, Y = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,Y)]
    N_TOTAL = NUM_TOTALS[(r,Y)]
    
    constrain_fail_boards.SetCoefficient(vars_cache[key], -1.0 * (float(N_IN_STRATUM) / float(N_TOTAL)))


## Solve linear program

In [26]:
solver.ABNORMAL

4

In [27]:
status = solver.Solve()


In [28]:
status

0

In [29]:
solver.OPTIMAL

0

In [30]:
row = []
admit = []

for i in applicant_stratum:
    row.append(int(str(i)))
    admit.append(i.solution_value())

df_decisions = pd.DataFrame({'row_id':row,'decision':admit})

In [31]:
df_decisions

Unnamed: 0,row_id,decision
0,0,1.0
1,1,1.0
2,2,0.0
3,3,0.0
4,4,1.0
...,...,...
391,391,1.0
392,392,1.0
393,393,1.0
394,394,1.0


In [32]:
dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)

Unnamed: 0,level_0,index,R,T,Y,ml_outcomes,N,row_id,decision
100,0,100,0,62,1,6819.84,10656,100,1.0
98,1,98,0,61,1,6797.06,10963,98,1.0
102,2,102,0,63,1,6676.80,10272,102,1.0
104,3,104,0,64,1,6646.40,9920,104,1.0
96,4,96,0,60,1,6601.80,11003,96,1.0
...,...,...,...,...,...,...,...,...,...
0,391,0,0,9,0,0.10,1,0,1.0
8,392,8,0,16,1,0.09,3,8,0.0
3,393,3,0,12,0,0.08,4,3,0.0
1,394,1,0,10,0,0.08,1,1,1.0


In [33]:
xxx_ = dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)
xxx_.sort_values(by='T')

Unnamed: 0,level_0,index,R,T,Y,ml_outcomes,N,row_id,decision
210,382,210,1,7,0,0.50,2,210,0.0
211,385,211,1,8,0,0.32,1,211,0.0
212,376,212,1,9,0,1.00,4,212,0.0
0,391,0,0,9,0,0.10,1,0,1.0
1,394,1,0,10,0,0.08,1,1,1.0
...,...,...,...,...,...,...,...,...,...
205,354,205,0,123,1,2.00,2,205,1.0
206,373,206,0,124,0,1.00,1,206,1.0
207,353,207,0,125,1,2.00,2,207,1.0
208,372,208,0,127,1,1.00,1,208,1.0


In [34]:
xxx = dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)
xxx

Unnamed: 0,level_0,index,R,T,Y,ml_outcomes,N,row_id,decision
100,0,100,0,62,1,6819.84,10656,100,1.0
98,1,98,0,61,1,6797.06,10963,98,1.0
102,2,102,0,63,1,6676.80,10272,102,1.0
104,3,104,0,64,1,6646.40,9920,104,1.0
96,4,96,0,60,1,6601.80,11003,96,1.0
...,...,...,...,...,...,...,...,...,...
0,391,0,0,9,0,0.10,1,0,1.0
8,392,8,0,16,1,0.09,3,8,0.0
3,393,3,0,12,0,0.08,4,3,0.0
1,394,1,0,10,0,0.08,1,1,1.0


In [35]:
df['key'] = df['R'].astype(str)+'_'+df['T'].astype(str)+'_'+df['Y'].astype(str)
xxx['key'] = xxx['R'].astype(str)+'_'+xxx['T'].astype(str)+'_'+xxx['Y'].astype(str)

In [36]:
xxx

Unnamed: 0,level_0,index,R,T,Y,ml_outcomes,N,row_id,decision,key
100,0,100,0,62,1,6819.84,10656,100,1.0,0_62_1
98,1,98,0,61,1,6797.06,10963,98,1.0,0_61_1
102,2,102,0,63,1,6676.80,10272,102,1.0,0_63_1
104,3,104,0,64,1,6646.40,9920,104,1.0,0_64_1
96,4,96,0,60,1,6601.80,11003,96,1.0,0_60_1
...,...,...,...,...,...,...,...,...,...,...
0,391,0,0,9,0,0.10,1,0,1.0,0_9_0
8,392,8,0,16,1,0.09,3,8,0.0,0_16_1
3,393,3,0,12,0,0.08,4,3,0.0,0_12_0
1,394,1,0,10,0,0.08,1,1,1.0,0_10_0


In [37]:
admit_decisions = df.merge(xxx[['key','decision']],how='left',on='key')
admit_decisions['decision_random'] = pd.Series([random.random() for x in range(0,len(admit_decisions))]) < FRAC_ADMIT

FRAC_minority_POLICY = (admit_decisions['R'] * admit_decisions['decision']).sum()/admit_decisions['decision'].sum()
SUM_BP_POLICY = ( admit_decisions['Y'] * admit_decisions['decision']).sum()

FRAC_RANDOM_POLICY = (admit_decisions['R'] * admit_decisions['decision_random']).sum()/admit_decisions['decision_random'].sum()
SUM_BP_RAND_POLICY = ( admit_decisions['Y'] * admit_decisions['decision_random']).sum()

In [38]:
file = open('./lp_results.csv','a')
file.write('{}\t{}\t{}\n'.format('Counterfactual Equalized Odds',str(FRAC_minority_POLICY),str(SUM_BP_POLICY)))
file.close()

In [39]:
xxx[xxx['R']==0].sort_values(by='decision')

Unnamed: 0,level_0,index,R,T,Y,ml_outcomes,N,row_id,decision,key
2,395,2,0,11,0,0.00,4,2,0.0,0_11_0
19,294,19,0,22,0,18.08,226,19,0.0,0_22_0
60,147,60,0,42,1,1037.07,3841,60,0.0,0_42_1
47,145,47,0,36,0,1045.76,5504,47,0.0,0_36_0
30,300,30,0,27,1,13.86,126,30,0.0,0_27_1
...,...,...,...,...,...,...,...,...,...,...
154,159,154,0,89,1,800.73,861,154,1.0,0_89_1
156,166,156,0,90,1,682.44,726,156,1.0,0_90_1
158,169,158,0,91,1,642.02,683,158,1.0,0_91_1
147,241,147,0,86,0,96.60,105,147,1.0,0_86_0


In [40]:
xxx[xxx['R']==1].sort_values(by='decision')

Unnamed: 0,level_0,index,R,T,Y,ml_outcomes,N,row_id,decision,key
281,28,281,1,46,0,4816.64,9088,281,0.0,1_46_0
264,172,264,1,37,1,607.21,1481,264,0.0,1_37_1
266,165,266,1,38,1,694.68,1654,266,0.0,1_38_1
245,162,245,1,28,0,741.54,2181,245,0.0,1_28_0
236,301,236,1,23,1,13.64,44,236,0.0,1_23_1
...,...,...,...,...,...,...,...,...,...,...
338,170,338,1,74,1,638.40,608,338,1.0,1_74_1
340,178,340,1,75,1,524.70,495,340,1.0,1_75_1
325,179,325,1,68,0,500.16,521,325,1.0,1_68_0
334,160,334,1,72,1,800.70,785,334,1.0,1_72_1


In [41]:
(xxx['N']*xxx['decision']).sum()/len(df)

0.4999994999995001

In [42]:
admit_decisions[admit_decisions['Y']==1][['decision','R']].groupby(['R']).mean()

Unnamed: 0_level_0,decision
R,Unnamed: 1_level_1
0,0.678964
1,0.678964
