In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os

DIVERSITY_UTILITY = float(os.environ.get('DIVERSITY_UTILITY'))
FRAC_ADMIT = float(os.environ.get('FRAC_ADMIT'))

In [2]:


###
## Load dataframe where each row is a single applicant
###

df = pd.read_csv('./df_test.csv')
df['key'] = df['R'].astype(str) + "_"  + df['T'].astype(str)

###
## Estimate E[Y(1)|T, A]
###

df_train = pd.read_csv('./df_train.csv')
df_stratum_utility = df_train[['R','T','Y']].groupby(['R','T']).mean().reset_index()
df_stratum_utility['stratum_utility'] = (df_stratum_utility['Y'] + DIVERSITY_UTILITY * df_stratum_utility['R']).round(2)
df_stratum_utility['key'] = df_stratum_utility['R'].astype(str) + "_" + df_stratum_utility['T'].astype(str)

df = df.merge(df_stratum_utility[['stratum_utility','key']], on='key')
df['ml_outcomes'] = df['stratum_utility']

In [3]:
df['Y_reject']

0         0
1         1
2         0
3         0
4         1
         ..
999994    1
999995    1
999996    1
999997    1
999998    1
Name: Y_reject, Length: 999999, dtype: int64

In [4]:
df['Y_stratum'] = df['Y'].astype(str) + df['Y_reject'].astype(str)



In [5]:
df['Y_stratum'].value_counts()

00    557322
11    357374
10     85303
Name: Y_stratum, dtype: int64

In [6]:
# = 
#df['Y_reject'] = df['Y']

In [7]:
len(df)

999999

In [8]:
#df['ml_outcomes'] = df['R']

In [9]:
##
# R = race, T = test score, ml_outcomes = expected utility from admitting, 
# ml_outcomes_{} counterfactual utility given race 
##

df[['R','T','ml_outcomes','T_minority','T_majority','Y']].sort_values(by='ml_outcomes')



Unnamed: 0,R,T,ml_outcomes,T_minority,T_majority,Y
999835,0,11,0.00,11,11,0
999838,0,11,0.00,7,11,0
999837,0,11,0.00,8,11,0
999836,0,11,0.00,11,11,0
999886,0,12,0.02,11,12,0
...,...,...,...,...,...,...
999913,1,98,1.25,98,115,1
999914,1,98,1.25,98,115,1
999925,1,109,1.25,109,126,1
998845,1,97,1.25,97,112,1


In [10]:
#df['ml_outcomes'] = df['ml_outcomes'] + 1*df['R']

In [11]:
###
## Get total utility from admitting people in a stratum 
###

df_ = df[['R','T','Y','Y_reject','ml_outcomes']].groupby(['R','T','Y','Y_reject']).sum().reset_index()

In [12]:
###
## Get number of people in each stratum
###

df_count = df[['R','T','ml_outcomes','Y','Y_reject']].groupby(['R','T','Y','Y_reject']).count().reset_index()
df_count.columns = ['R','T','Y','Y_reject','Count']
df_count['N'] = df_count['Count']

In [13]:
###
## Merge summary tables to get one table with Race, Test Score, SUM(Utility), COUNT(applicants) per stratum
###

dff = df_.merge(df_count[['N']],left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False).reset_index().sort_values(by='index').reset_index()

In [14]:
# final info table
dff.sort_values(by='ml_outcomes')

Unnamed: 0,level_0,index,R,T,Y,Y_reject,ml_outcomes,N
2,561,2,0,11,0,0,0.00,4
8,560,8,0,16,1,0,0.03,1
9,558,9,0,16,1,1,0.06,2
12,559,12,0,17,1,1,0.06,1
1,557,1,0,10,0,0,0.08,1
...,...,...,...,...,...,...,...,...
115,4,115,0,52,0,0,5544.00,12320
150,3,150,0,63,1,1,5595.85,8609
144,2,144,0,61,1,1,5654.40,9120
153,1,153,0,64,1,1,5675.57,8471


### Setup optimization problem 

In [15]:
from ortools.linear_solver import pywraplp


In [16]:
solver = pywraplp.Solver.CreateSolver('GLOP')


In [17]:
len(dff[['R','T','Y','Y_reject']])

562

In [18]:
dff

Unnamed: 0,level_0,index,R,T,Y,Y_reject,ml_outcomes,N
0,554,0,0,9,0,0,0.10,1
1,557,1,0,10,0,0,0.08,1
2,561,2,0,11,0,0,0.00,4
3,555,3,0,12,0,0,0.08,4
4,523,4,0,13,0,0,1.00,10
...,...,...,...,...,...,...,...,...
557,507,557,1,107,1,1,1.25,1
558,508,558,1,109,1,1,1.25,1
559,509,559,1,111,1,1,1.25,1
560,531,560,1,113,1,1,0.92,1


In [19]:
applicant_stratum = []
vars_cache = {}

# Objective: Maximize the expected utility of the admitted students
objective = solver.Objective()

# For each stratum
for ix, row in dff.iterrows():
    # probability of admission
    numvar = solver.NumVar(0.0, 1.0, str(ix))
    
    # store variable by index, and also by stratum R, T
    applicant_stratum.append(numvar)
    vars_cache[(row['R'],row['T'],row['Y'], row['Y_reject'])] = numvar
    
    # Benefit of admitting people is total utility in that stratum
    objective.SetCoefficient(applicant_stratum[ix], float(row['ml_outcomes']))
objective.SetMaximization()


In [20]:
# Currently we have no constraints 
solver.NumConstraints()

0

In [21]:
# Constraint: At most K applicants
K = int(len(df)*FRAC_ADMIT)
print(K)
admit_quota = solver.Constraint(0, K)

# Total applicants cannot exceed K 
for ix, row in dff.iterrows():
    admit_quota.SetCoefficient(applicant_stratum[ix], float(row['N']))

249999


In [22]:
# Now we have one constraint
solver.NumConstraints()

1

## Add Equalized Odds Constraints

In [23]:
## Make sure that you have to add all people in Y stratum or none
## i.e. you can't add only people who pass boards and reject those who fail boards from same T, R stratum
didntexist, exists = 0, 0 

for ix, row in dff.iterrows():
    
    
    var1 = vars_cache[(row['R'],row['T'],row['Y'], row['Y_reject'])]
    key2s = [(row['R'],row['T'], 1-row['Y'], 1-row['Y_reject']), (row['R'],row['T'], row['Y'], 1-row['Y_reject']), (row['R'],row['T'], 1-row['Y'], row['Y_reject'])]
    
    for key2 in key2s:
        constrain_bp = solver.Constraint(0.0, 0.0)
        
        if key2 not in vars_cache:
            didntexist+=1
            continue
            
        var2 = vars_cache[key2]
        
        constrain_bp.SetCoefficient(var1, -1.0)
        constrain_bp.SetCoefficient(var2, 1.0)
        exists+=1

didntexist, exists

(680, 1006)

In [24]:
majority_pass_boards_pass_boards_reject = []
majority_fail_boards_pass_boards_reject = []
minority_pass_boards_pass_boards_reject = []
minority_fail_boards_pass_boards_reject = []
majority_pass_boards_fail_boards_reject = []
majority_fail_boards_fail_boards_reject = []
minority_pass_boards_fail_boards_reject = []
minority_fail_boards_fail_boards_reject = []


for key in vars_cache:
    r, t, Y, Y_reject = key
    if Y == 1 and r==0 and Y_reject==1:
        majority_pass_boards_pass_boards_reject.append(key)
    elif Y == 0 and r==0 and Y_reject==1:
        majority_fail_boards_pass_boards_reject.append(key)
    elif Y == 1 and r==1 and Y_reject==1:
        minority_pass_boards_pass_boards_reject.append(key)
    elif Y == 0 and r==1 and Y_reject==1:
        minority_fail_boards_pass_boards_reject.append(key)
        
    if Y == 1 and r==0 and Y_reject==0:
        majority_pass_boards_fail_boards_reject.append(key)
    elif Y == 0 and r==0 and Y_reject==0:
        majority_fail_boards_fail_boards_reject.append(key)
    elif Y == 1 and r==1 and Y_reject==0:
        minority_pass_boards_fail_boards_reject.append(key)
    elif Y == 0 and r==1 and Y_reject==0:
        minority_fail_boards_fail_boards_reject.append(key)

len(majority_pass_boards_pass_boards_reject),len(majority_fail_boards_pass_boards_reject),len(minority_pass_boards_pass_boards_reject),len(minority_fail_boards_pass_boards_reject)




(111, 0, 96, 0)

In [25]:
NUM_TOTALS = {}
df_totals = dff[['N','R','Y','Y_reject']].groupby(['R','Y','Y_reject']).sum().reset_index()
for ix, row in df_totals.iterrows():
    NUM_TOTALS[(row['R'],row['Y'],row['Y_reject'])] = row['N']
    
N_IN_STRATAS = {}
for ix, row in dff.iterrows():
    N_IN_STRATAS[(row['R'],row['T'],row['Y'],row['Y_reject'])] = row['N']

In [26]:
# Now we have one constraint
solver.NumConstraints()

1687

In [27]:
#Of those who pass the boards exams, pass boards exams if rejected
#Frac majority admitted and frac minority admitted should be the same

constrain_pass_boards_pass_boards_reject = solver.Constraint(0.0, 0.0)

for key in majority_pass_boards_pass_boards_reject:
    r, t, Y, Y_reject = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,Y, Y_reject)]
    N_TOTAL = NUM_TOTALS[(r,Y, Y_reject)]
    
    constrain_pass_boards_pass_boards_reject.SetCoefficient(vars_cache[key], float(N_IN_STRATUM) / float(N_TOTAL))

for key in minority_pass_boards_pass_boards_reject:
    r, t, Y, Y_reject = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,Y, Y_reject)]
    N_TOTAL = NUM_TOTALS[(r,Y, Y_reject)]
    
    constrain_pass_boards_pass_boards_reject.SetCoefficient(vars_cache[key], -1.0 * (float(N_IN_STRATUM) / float(N_TOTAL)))


In [28]:
#Of those who fail the boards exams, pass boards exams if rejected
#Frac majority admitted and frac minority admitted should be the same

constrain_fail_boards_pass_boards_reject = solver.Constraint(0.0, 0.0)

for key in majority_fail_boards_pass_boards_reject:
    r, t, Y, Y_reject = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,Y, Y_reject)]
    N_TOTAL = NUM_TOTALS[(r,Y, Y_reject)]
    
    constrain_fail_boards_pass_boards_reject.SetCoefficient(vars_cache[key], float(N_IN_STRATUM) / float(N_TOTAL))

for key in minority_fail_boards_pass_boards_reject:
    r, t, Y, Y_reject = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,Y, Y_reject)]
    N_TOTAL = NUM_TOTALS[(r,Y, Y_reject)]
    
    constrain_fail_boards_pass_boards_reject.SetCoefficient(vars_cache[key], -1.0 * (float(N_IN_STRATUM) / float(N_TOTAL)))


In [29]:
constrain_pass_boards_fail_boards_reject = solver.Constraint(0.0, 0.0)

for key in majority_pass_boards_fail_boards_reject:
    r, t, Y, Y_reject = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,Y, Y_reject)]
    N_TOTAL = NUM_TOTALS[(r,Y, Y_reject)]
    
    constrain_pass_boards_fail_boards_reject.SetCoefficient(vars_cache[key], float(N_IN_STRATUM) / float(N_TOTAL))

for key in minority_pass_boards_fail_boards_reject:
    r, t, Y, Y_reject = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,Y, Y_reject)]
    N_TOTAL = NUM_TOTALS[(r,Y, Y_reject)]
    
    constrain_pass_boards_fail_boards_reject.SetCoefficient(vars_cache[key], -1.0 * (float(N_IN_STRATUM) / float(N_TOTAL)))

    
#Of those who fail the boards exams, fail boards exams if rejected
#Frac majority admitted and frac minority admitted should be the same

constrain_fail_boards_fail_boards_reject = solver.Constraint(0.0, 0.0)

for key in majority_fail_boards_fail_boards_reject:
    r, t, Y, Y_reject = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,Y, Y_reject)]
    N_TOTAL = NUM_TOTALS[(r,Y, Y_reject)]
    
    constrain_fail_boards_fail_boards_reject.SetCoefficient(vars_cache[key], float(N_IN_STRATUM) / float(N_TOTAL))

for key in minority_fail_boards_fail_boards_reject:
    r, t, Y, Y_reject = key
    N_IN_STRATUM = N_IN_STRATAS[(r,t,Y, Y_reject)]
    N_TOTAL = NUM_TOTALS[(r,Y, Y_reject)]
    
    constrain_fail_boards_fail_boards_reject.SetCoefficient(vars_cache[key], -1.0 * (float(N_IN_STRATUM) / float(N_TOTAL)))


## Add constraints on people who fail boards exams

## Solve linear program

In [30]:
solver.ABNORMAL

4

In [31]:
status = solver.Solve()


In [32]:
status

0

In [33]:
solver.OPTIMAL

0

In [34]:
applicant_stratum

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [35]:
row = []
admit = []

for i in applicant_stratum:
    row.append(int(str(i)))
    admit.append(i.solution_value())

df_decisions = pd.DataFrame({'row_id':row,'decision':admit})

In [36]:
df_decisions

Unnamed: 0,row_id,decision
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0
...,...,...
557,557,1.0
558,558,1.0
559,559,1.0
560,560,1.0


In [37]:
dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)

Unnamed: 0,level_0,index,R,T,Y,Y_reject,ml_outcomes,N,row_id,decision
147,0,147,0,62,1,1,5688.32,8888,147,1.0
153,1,153,0,64,1,1,5675.57,8471,153,1.0
144,2,144,0,61,1,1,5654.40,9120,144,0.0
150,3,150,0,63,1,1,5595.85,8609,150,1.0
115,4,115,0,52,0,0,5544.00,12320,115,0.0
...,...,...,...,...,...,...,...,...,...,...
1,557,1,0,10,0,0,0.08,1,1,0.0
9,558,9,0,16,1,1,0.06,2,9,0.0
12,559,12,0,17,1,1,0.06,1,12,0.0
8,560,8,0,16,1,0,0.03,1,8,0.0


In [38]:
xxx_ = dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)
xxx_.sort_values(by='T')

Unnamed: 0,level_0,index,R,T,Y,Y_reject,ml_outcomes,N,row_id,decision
300,541,300,1,7,0,0,0.50,2,300,1.0
301,544,301,1,8,0,0,0.32,1,301,1.0
0,554,0,0,9,0,0,0.10,1,0,0.0
302,524,302,1,9,0,0,1.00,4,302,1.0
1,557,1,0,10,0,0,0.08,1,1,0.0
...,...,...,...,...,...,...,...,...,...,...
295,494,295,0,123,1,1,2.00,2,295,0.0
296,521,296,0,124,0,0,1.00,1,296,0.0
297,496,297,0,125,1,1,2.00,2,297,0.0
298,522,298,0,127,1,1,1.00,1,298,0.0


In [39]:
xxx = dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)
xxx

Unnamed: 0,level_0,index,R,T,Y,Y_reject,ml_outcomes,N,row_id,decision
147,0,147,0,62,1,1,5688.32,8888,147,1.0
153,1,153,0,64,1,1,5675.57,8471,153,1.0
144,2,144,0,61,1,1,5654.40,9120,144,0.0
150,3,150,0,63,1,1,5595.85,8609,150,1.0
115,4,115,0,52,0,0,5544.00,12320,115,0.0
...,...,...,...,...,...,...,...,...,...,...
1,557,1,0,10,0,0,0.08,1,1,0.0
9,558,9,0,16,1,1,0.06,2,9,0.0
12,559,12,0,17,1,1,0.06,1,12,0.0
8,560,8,0,16,1,0,0.03,1,8,0.0


In [40]:
len(xxx)

562

In [41]:
df['key'] = df['R'].astype(str)+'_'+df['T'].astype(str)+'_'+df['Y'].astype(str)+'_'+df['Y_reject'].astype(str)
xxx['key'] = xxx['R'].astype(str)+'_'+xxx['T'].astype(str)+'_'+xxx['Y'].astype(str)+'_'+xxx['Y_reject'].astype(str)



In [42]:
admit_decisions = df.merge(xxx[['key','decision']],how='left',on='key')
admit_decisions['decision_random'] = pd.Series([random.random() for x in range(0,len(admit_decisions))]) < FRAC_ADMIT

FRAC_minority_POLICY = (admit_decisions['R'] * admit_decisions['decision']).sum()/admit_decisions['decision'].sum()
SUM_BP_POLICY = ( admit_decisions['Y'] * admit_decisions['decision']).sum()

FRAC_RANDOM_POLICY = (admit_decisions['R'] * admit_decisions['decision_random']).sum()/admit_decisions['decision_random'].sum()
SUM_BP_RAND_POLICY = ( admit_decisions['Y'] * admit_decisions['decision_random']).sum()

In [43]:
file = open('./lp_results.csv','a')
file.write('{}\t{}\t{}\n'.format('Principal Fairness',str(FRAC_minority_POLICY),str(SUM_BP_POLICY)))
file.close()

In [44]:
(xxx['N']*xxx['decision']).sum()/len(df)

0.24999924999924988