In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
pd.options.display.max_rows = 10
pd.options.display.max_columns = 500

In [2]:
DIVERSITY_UTILITY = 0.25
FRAC_ADMIT = 0.5


###
## Load dataframe where each row is a single applicant
###

df = pd.read_csv('./df_test.csv')
df['key'] = df['R'].astype(str) + "_"  + df['T'].astype(str)

###
## Estimate E[Y(1)|T, A]
###

df_train = pd.read_csv('./df_train.csv')
df_stratum_utility = df_train[['R','T','B_p']].groupby(['R','T']).mean().reset_index()
df_stratum_utility['stratum_utility'] = (df_stratum_utility['B_p'] + DIVERSITY_UTILITY * df_stratum_utility['R']).round(2)
df_stratum_utility['key'] = df_stratum_utility['R'].astype(str) + "_" + df_stratum_utility['T'].astype(str)

df = df.merge(df_stratum_utility[['stratum_utility','key']], on='key')
df['ml_outcomes'] = df['stratum_utility']

In [3]:
df['ml_outcomes'] = df['ml_outcomes_smooth'].round(2)

In [4]:
FRAC_ADMIT = 0.5#df[['A']].sum()/len(df)


In [5]:
FRAC_ADMIT

0.5

In [6]:
len(df)

100000

In [7]:
#df['ml_outcomes'] = df['R']

In [8]:
##
# R = race, T = test score, ml_outcomes = expected utility from admitting, 
# ml_outcomes_{} counterfactual utility given race 
##

df[['R','T','ml_outcomes','T_black','T_white','B_p']].sort_values(by='ml_outcomes')



Unnamed: 0,R,T,ml_outcomes,T_black,T_white,B_p
99999,0,11,0.03,8,11,0
99728,0,15,0.04,11,15,0
99729,0,15,0.04,12,15,0
99805,0,13,0.04,11,13,0
99948,0,14,0.04,10,14,0
...,...,...,...,...,...,...
99984,1,98,1.21,98,114,1
99945,1,99,1.22,99,114,1
99996,1,101,1.22,101,116,1
99998,1,100,1.22,100,117,1


In [9]:
#df['ml_outcomes'] = df['ml_outcomes'] + 1*df['R']

In [10]:
###
## Get total utility from admitting people in a stratum 
###

df_ = df[['R','T','B_p','ml_outcomes']].groupby(['R','T','B_p']).sum().reset_index()

In [11]:
###
## Get number of people in each stratum
###

df_count = df[['R','T','ml_outcomes','B_p']].groupby(['R','T','B_p']).count().reset_index()
df_count.columns = ['R','T','B_p','Count']
df_count['N'] = df_count['Count']

In [12]:
###
## Merge summary tables to get one table with Race, Test Score, SUM(Utility), COUNT(applicants) per stratum
###

dff = df_.merge(df_count[['N']],left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False).reset_index().sort_values(by='index').reset_index()

In [13]:
# final info table
dff.sort_values(by='ml_outcomes')

Unnamed: 0,level_0,index,R,T,B_p,ml_outcomes,N
0,339,0,0,11,0,0.03,1
1,337,1,0,13,0,0.04,1
2,338,2,0,14,0,0.04,1
4,336,4,0,15,1,0.04,1
10,335,10,0,20,1,0.06,1
...,...,...,...,...,...,...,...
103,4,103,0,67,1,648.72,901
91,3,91,0,61,1,650.38,1049
89,2,89,0,60,1,652.20,1087
93,1,93,0,62,1,654.08,1022


### Setup optimization problem 

In [14]:
from ortools.linear_solver import pywraplp


In [15]:
solver = pywraplp.Solver.CreateSolver('GLOP')


In [16]:
dff

Unnamed: 0,level_0,index,R,T,B_p,ml_outcomes,N
0,339,0,0,11,0,0.03,1
1,337,1,0,13,0,0.04,1
2,338,2,0,14,0,0.04,1
3,334,3,0,15,0,0.08,2
4,336,4,0,15,1,0.04,1
...,...,...,...,...,...,...,...
335,283,335,1,98,1,2.42,2
336,298,336,1,99,1,1.22,1
337,297,337,1,100,1,1.22,1
338,296,338,1,101,1,1.22,1


In [17]:
applicant_stratum = []
vars_cache = {}

# Objective: Maximize the expected utility of the admitted students
objective = solver.Objective()

# For each stratum
for ix, row in dff.iterrows():
    # probability of admission
    numvar = solver.NumVar(0.0, 1.0, str(ix))
    
    # store variable by index, and also by stratum R, T
    applicant_stratum.append(numvar)
    vars_cache[(row['R'],row['T'],row['B_p'])] = numvar
    
    # Benefit of admitting people is total utility in that stratum
    objective.SetCoefficient(applicant_stratum[ix], float(row['ml_outcomes']))
objective.SetMaximization()


In [18]:
# Currently we have no constraints 
solver.NumConstraints()

0

In [19]:
# Constraint: At most K applicants
K = int(len(df)*FRAC_ADMIT)
print(K)
admit_quota = solver.Constraint(0, K)

# Total applicants cannot exceed K 
for ix, row in dff.iterrows():
    admit_quota.SetCoefficient(applicant_stratum[ix], float(row['N']))

50000


In [20]:
# Now we have one constraint
solver.NumConstraints()

1

## Add Equalized Odds Constraints

In [21]:
## Make sure that you have to add all people in B_p stratum or none
## i.e. you can't add only people who pass boards and reject those who fail boards from same T, R stratum
didntexist, exists = 0, 0 

for ix, row in dff.iterrows():
    constrain_bp = solver.Constraint(0.0, 0.0)
    
    var1 = vars_cache[(row['R'],row['T'],row['B_p'])]
    key2 = (row['R'],row['T'], 1-row['B_p'])
    
    if key2 not in vars_cache:
        didntexist+=1
        continue
        
    var2 = vars_cache[key2]
    
    constrain_bp.SetCoefficient(var1, -1.0)
    constrain_bp.SetCoefficient(var2, 1.0)
    exists+=1

didntexist, exists

(46, 294)

In [22]:
white_pass_boards = []
white_fail_boards = []
black_pass_boards = []
black_fail_boards = []

for key in vars_cache:
    r, t, b_p = key
    if b_p == 1 and r==0:
        white_pass_boards.append(key)
    elif b_p == 0 and r==0:
        white_fail_boards.append(key)
    elif b_p == 1 and r==1:
        black_pass_boards.append(key)
    elif b_p == 0 and r==1:
        black_fail_boards.append(key)

len(white_pass_boards),len(white_fail_boards),len(black_pass_boards),len(black_fail_boards)

(96, 87, 83, 74)

In [23]:
NUM_TOTALS = {}
df_totals = dff[['N','R','B_p']].groupby(['R','B_p']).sum().reset_index()
for ix, row in df_totals.iterrows():
    NUM_TOTALS[(row['R'],row['B_p'])] = row['N']
    
N_IN_STRATAS = {}
for ix, row in dff.iterrows():
    N_IN_STRATAS[(row['R'],row['T'],row['B_p'])] = row['N']

In [24]:
# Now we have one constraint
solver.NumConstraints()

341

In [25]:
#Of those who pass the boards exams
#Frac white admitted and frac black admitted should be the same

constrain_pass_boards = solver.Constraint(0.0, 0.0)

for key in white_pass_boards:
    r, t, b_p = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,b_p)]
    N_TOTAL = NUM_TOTALS[(r,b_p)]
    
    constrain_pass_boards.SetCoefficient(vars_cache[key], float(N_IN_STRATUM) / float(N_TOTAL))

for key in black_pass_boards:
    r, t, b_p = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,b_p)]
    N_TOTAL = NUM_TOTALS[(r,b_p)]
    
    constrain_pass_boards.SetCoefficient(vars_cache[key], -1.0 * (float(N_IN_STRATUM) / float(N_TOTAL)))


In [26]:
#Of those who fail the boards exams
#Frac white admitted and frac black admitted should be the same

constrain_fail_boards = solver.Constraint(0.0, 0.0)

for key in white_fail_boards:
    r, t, b_p = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,b_p)]
    N_TOTAL = NUM_TOTALS[(r,b_p)]
    
    constrain_fail_boards.SetCoefficient(vars_cache[key], float(N_IN_STRATUM) / float(N_TOTAL))

for key in black_fail_boards:
    r, t, b_p = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,b_p)]
    N_TOTAL = NUM_TOTALS[(r,b_p)]
    
    constrain_fail_boards.SetCoefficient(vars_cache[key], -1.0 * (float(N_IN_STRATUM) / float(N_TOTAL)))


## Solve linear program

In [27]:
solver.ABNORMAL

4

In [28]:
status = solver.Solve()


In [29]:
status

0

In [30]:
solver.OPTIMAL

0

In [31]:
row = []
admit = []

for i in applicant_stratum:
    row.append(int(str(i)))
    admit.append(i.solution_value())

df_decisions = pd.DataFrame({'row_id':row,'decision':admit})

In [32]:
df_decisions

Unnamed: 0,row_id,decision
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0
...,...,...
335,335,1.0
336,336,1.0
337,337,1.0
338,338,1.0


In [33]:
dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)

Unnamed: 0,level_0,index,R,T,B_p,ml_outcomes,N,row_id,decision
95,0,95,0,63,1,671.45,1033,95,1.0
93,1,93,0,62,1,654.08,1022,93,1.0
89,2,89,0,60,1,652.20,1087,89,1.0
91,3,91,0,61,1,650.38,1049,91,1.0
103,4,103,0,67,1,648.72,901,103,1.0
...,...,...,...,...,...,...,...,...,...
10,335,10,0,20,1,0.06,1,10,0.0
4,336,4,0,15,1,0.04,1,4,0.0
1,337,1,0,13,0,0.04,1,1,0.0
2,338,2,0,14,0,0.04,1,2,0.0


In [34]:
xxx_ = dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)
xxx_.sort_values(by='T')

Unnamed: 0,level_0,index,R,T,B_p,ml_outcomes,N,row_id,decision
183,326,183,1,9,0,0.27,1,183,0.0
0,339,0,0,11,0,0.03,1,0,0.0
184,327,184,1,11,0,0.27,1,184,0.0
185,328,185,1,12,0,0.27,1,185,0.0
1,337,1,0,13,0,0.04,1,1,0.0
...,...,...,...,...,...,...,...,...,...
178,311,178,0,112,1,0.99,1,178,1.0
179,308,179,0,113,1,0.99,1,179,1.0
180,307,180,0,114,1,0.99,1,180,1.0
181,286,181,0,118,1,1.98,2,181,1.0


In [35]:
xxx = dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)
xxx

Unnamed: 0,level_0,index,R,T,B_p,ml_outcomes,N,row_id,decision
95,0,95,0,63,1,671.45,1033,95,1.0
93,1,93,0,62,1,654.08,1022,93,1.0
89,2,89,0,60,1,652.20,1087,89,1.0
91,3,91,0,61,1,650.38,1049,91,1.0
103,4,103,0,67,1,648.72,901,103,1.0
...,...,...,...,...,...,...,...,...,...
10,335,10,0,20,1,0.06,1,10,0.0
4,336,4,0,15,1,0.04,1,4,0.0
1,337,1,0,13,0,0.04,1,1,0.0
2,338,2,0,14,0,0.04,1,2,0.0


In [36]:
xxx.to_csv('./decision.csv')

In [37]:
solution = pd.read_csv('./decision.csv')

In [38]:
xxx

Unnamed: 0,level_0,index,R,T,B_p,ml_outcomes,N,row_id,decision
95,0,95,0,63,1,671.45,1033,95,1.0
93,1,93,0,62,1,654.08,1022,93,1.0
89,2,89,0,60,1,652.20,1087,89,1.0
91,3,91,0,61,1,650.38,1049,91,1.0
103,4,103,0,67,1,648.72,901,103,1.0
...,...,...,...,...,...,...,...,...,...
10,335,10,0,20,1,0.06,1,10,0.0
4,336,4,0,15,1,0.04,1,4,0.0
1,337,1,0,13,0,0.04,1,1,0.0
2,338,2,0,14,0,0.04,1,2,0.0


In [39]:
df['key'] = df['R'].astype(str)+'_'+df['T'].astype(str)+'_'+df['B_p'].astype(str)
xxx['key'] = xxx['R'].astype(str)+'_'+xxx['T'].astype(str)+'_'+xxx['B_p'].astype(str)

In [40]:
xxx

Unnamed: 0,level_0,index,R,T,B_p,ml_outcomes,N,row_id,decision,key
95,0,95,0,63,1,671.45,1033,95,1.0,0_63_1
93,1,93,0,62,1,654.08,1022,93,1.0,0_62_1
89,2,89,0,60,1,652.20,1087,89,1.0,0_60_1
91,3,91,0,61,1,650.38,1049,91,1.0,0_61_1
103,4,103,0,67,1,648.72,901,103,1.0,0_67_1
...,...,...,...,...,...,...,...,...,...,...
10,335,10,0,20,1,0.06,1,10,0.0,0_20_1
4,336,4,0,15,1,0.04,1,4,0.0,0_15_1
1,337,1,0,13,0,0.04,1,1,0.0,0_13_0
2,338,2,0,14,0,0.04,1,2,0.0,0_14_0


In [41]:
admit_decisions = df.merge(xxx[['key','decision']],how='left',on='key')
admit_decisions['decision_random'] = pd.Series([random.random() for x in range(0,len(admit_decisions))]) < FRAC_ADMIT

FRAC_BLACK_POLICY = (admit_decisions['R'] * admit_decisions['decision']).sum()/admit_decisions['decision'].sum()
SUM_BP_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision']).sum()

FRAC_RANDOM_POLICY = (admit_decisions['R'] * admit_decisions['decision_random']).sum()/admit_decisions['decision_random'].sum()
SUM_BP_RAND_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision_random']).sum()

In [42]:
file = open('./lp_results.csv','a')
file.write('{}\t{}\t{}\n'.format('Counterfactual Equalized Odds',str(FRAC_BLACK_POLICY),str(SUM_BP_POLICY)))
file.close()

In [43]:
xxx[xxx['R']==0].sort_values(by='decision')

Unnamed: 0,level_0,index,R,T,B_p,ml_outcomes,N,row_id,decision,key
0,339,0,0,11,0,0.03,1,0,0.0,0_11_0
31,258,31,0,31,1,5.32,38,31,0.0,0_31_1
44,125,44,0,38,0,148.05,705,44,0.0,0_38_0
33,260,33,0,32,1,5.04,36,33,0.0,0_32_1
57,121,57,0,44,1,155.10,517,57,0.0,0_44_1
...,...,...,...,...,...,...,...,...,...,...
112,135,112,0,72,0,127.98,162,112,1.0,0_72_0
114,140,114,0,73,0,116.00,145,114,1.0,0_73_0
143,144,143,0,87,1,103.96,113,143,1.0,0_87_1
108,120,108,0,70,0,159.39,207,108,1.0,0_70_0


In [44]:
xxx[xxx['R']==1].sort_values(by='decision')

Unnamed: 0,level_0,index,R,T,B_p,ml_outcomes,N,row_id,decision,key
226,172,226,1,36,1,54.80,137,226,0.0,1_36_1
199,214,199,1,23,0,21.08,68,199,0.0,1_23_0
201,209,201,1,24,0,26.04,84,201,0.0,1_24_0
220,206,220,1,33,1,27.75,75,220,0.0,1_33_1
222,198,222,1,34,1,31.92,84,222,0.0,1_34_1
...,...,...,...,...,...,...,...,...,...,...
254,89,254,1,50,1,244.80,408,254,1.0,1_50_1
298,176,298,1,72,1,51.51,51,298,1.0,1_72_1
304,175,304,1,75,1,53.55,51,304,1.0,1_75_1
266,85,266,1,56,1,254.88,354,266,1.0,1_56_1


In [45]:
(xxx['N']*xxx['decision']).sum()/len(df)

0.5000000000000004

In [46]:
admit_decisions[admit_decisions['B_p']==1][['decision','R']].groupby(['R']).mean()

Unnamed: 0_level_0,decision
R,Unnamed: 1_level_1
0,0.676788
1,0.676788
