In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
# Notebook display limits: show at most 10 rows and up to 500 columns per frame.
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 500)

In [2]:
# Weight multiplied by R and added to each stratum's mean B_p.
DIVERSITY_UTILITY = 0.0
# Share of all applicants that may be admitted (used to size the quota K).
FRAC_ADMIT = 0.5


###
## Load dataframe where each row is a single applicant
###

df = pd.read_csv('./df_test.csv')
# String identifier "<R>_<T>" of the stratum each applicant belongs to.
df['key'] = df['R'].astype(str) + "_" + df['T'].astype(str)

# One row per (R, T) stratum holding the mean B_p of its applicants, then the
# rounded utility (mean B_p + DIVERSITY_UTILITY * R) and the matching key.
df_stratum_utility = df.groupby(['R', 'T'], as_index=False)['B_p'].mean()
df_stratum_utility = df_stratum_utility.assign(
    stratum_utility=lambda s: (s['B_p'] + DIVERSITY_UTILITY * s['R']).round(2),
    key=lambda s: s['R'].astype(str) + "_" + s['T'].astype(str),
)

# Copy the stratum utility onto every applicant row; `ml_outcomes` is the
# column the rest of the notebook optimizes over.
df = pd.merge(df, df_stratum_utility[['stratum_utility', 'key']], on='key')
df['ml_outcomes'] = df['stratum_utility']

In [3]:
##
# R = race, T = test score, ml_outcomes = expected utility from admitting,
# ml_outcomes_black / ml_outcomes_white = counterfactual utility given race
# (T_black / T_white are presumably the matching counterfactual test scores -- TODO confirm)
##

preview_columns = [
    'R', 'T', 'ml_outcomes',
    'T_black', 'T_white',
    'ml_outcomes_black', 'ml_outcomes_white',
    'ml_outcomes_decision', 'B_p',
]
df.loc[:, preview_columns].sort_values(by='ml_outcomes')



Unnamed: 0,R,T,ml_outcomes,T_black,T_white,ml_outcomes_black,ml_outcomes_white,ml_outcomes_decision,B_p
99928,0,19,0.00,18,19,-3.123641,-2.830157,0,0
99929,0,19,0.00,14,19,-3.440023,-2.830157,0,0
99930,0,19,0.00,16,19,-3.281832,-2.830157,0,0
99931,0,19,0.00,14,19,-3.440023,-2.830157,0,0
99932,0,19,0.00,17,19,-3.202737,-2.830157,0,0
...,...,...,...,...,...,...,...,...,...
99936,1,90,1.25,90,101,2.571222,3.655659,1,1
99937,1,90,1.25,90,104,2.571222,3.892945,1,1
99966,1,86,1.25,86,100,2.254840,3.576564,1,1
99850,1,82,1.25,82,95,1.938459,3.181087,1,1


In [4]:
# Disabled experiment (kept for reference only): would add 1 * R to every
# applicant's ml_outcomes. This cell currently executes nothing.
#df['ml_outcomes'] = df['ml_outcomes'] + 1*df['R']

In [5]:
###
## Total utility per stratum: sum of ml_outcomes over all applicants
## sharing the same (R, T)
###

df_ = df.groupby(['R', 'T'], as_index=False)['ml_outcomes'].sum()

In [6]:
###
## Number of applicants in each (R, T) stratum, exposed both as
## `Count` and as `N`
###

df_count = (
    df.groupby(['R', 'T'], as_index=False)['ml_outcomes']
      .count()
      .rename(columns={'ml_outcomes': 'Count'})
)
df_count['N'] = df_count['Count']

In [7]:
###
## Merge summary tables to get one table with Race, Test Score, SUM(Utility), COUNT(applicants) per stratum
###

dff = (
    # Both summaries come from the same groupby, so their row indices line up.
    pd.merge(df_, df_count[['N']], left_index=True, right_index=True)
    # Rank strata by total utility; reset_index() keeps the original row
    # position in a column named `index`.
    .sort_values(by='ml_outcomes', ascending=False)
    .reset_index()
    # Restore the original row order; the second reset_index() stores the
    # utility rank in `level_0` and leaves a fresh 0..n-1 index.
    .sort_values(by='index')
    .reset_index()
)

In [8]:
# Final per-stratum info table, displayed in ascending order of total
# utility (display only; `dff` itself is not reordered).
dff.sort_values(by='ml_outcomes')

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
0,192,0,0,12,0.00,1
1,188,1,0,14,0.00,1
4,191,4,0,17,0.00,4
5,190,5,0,18,0.00,4
6,189,6,0,19,0.00,8
...,...,...,...,...,...,...
50,4,50,0,63,1065.90,1615
45,3,45,0,58,1102.38,1934
43,2,43,0,56,1126.25,2125
46,1,46,0,59,1144.60,1940


### Set up the optimization problem

In [9]:
from ortools.linear_solver import pywraplp


In [10]:
# Create the GLOP linear-programming backend. CreateSolver returns None
# when the requested backend is not available, so fail here with a clear
# message rather than with an AttributeError in a later cell.
solver = pywraplp.Solver.CreateSolver('GLOP')
if solver is None:
    raise RuntimeError('OR-Tools could not create the GLOP solver backend')


In [11]:
# Show the stratum table in stored row order; the LP below creates one
# variable per row of this frame.
dff

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
0,192,0,0,12,0.00,1
1,188,1,0,14,0.00,1
2,176,2,0,15,1.00,2
3,177,3,0,16,1.00,2
4,191,4,0,17,0.00,4
...,...,...,...,...,...,...
188,171,188,1,94,1.25,1
189,172,189,1,95,1.25,1
190,173,190,1,97,1.25,1
191,183,191,1,101,0.25,1


In [12]:
# LP decision variables: one admission probability in [0, 1] per stratum.
# `applicant_stratum` holds them in dff row order; `vars_cache` looks them
# up by (R, T).
applicant_stratum = []
vars_cache = {}

# Objective: Maximize the expected utility of the admitted students
objective = solver.Objective()

for ix, row in dff.iterrows():
    # The variable is named after the dff row label so the solution can be
    # mapped back to its stratum afterwards.
    admit_prob = solver.NumVar(0.0, 1.0, str(ix))

    applicant_stratum.append(admit_prob)
    stratum_id = (row['R'], row['T'])
    vars_cache[stratum_id] = admit_prob

    # Admitting the whole stratum contributes its total utility.
    stratum_total_utility = float(row['ml_outcomes'])
    objective.SetCoefficient(admit_prob, stratum_total_utility)

objective.SetMaximization()


In [13]:
# Sanity check: no constraints have been added to the solver yet
# (the recorded output is 0).
solver.NumConstraints()

0

In [14]:
# Constraint: at most K applicants admitted, where K is the FRAC_ADMIT
# share of all applicants (truncated to an integer).
K = int(len(df) * FRAC_ADMIT)
print(K)
admit_quota = solver.Constraint(0, K)

# Expected admits = sum over strata of (admission probability * stratum size).
# `applicant_stratum` was filled in dff row order, so the two sequences pair up.
for stratum_var, stratum_size in zip(applicant_stratum, dff['N']):
    admit_quota.SetCoefficient(stratum_var, float(stratum_size))

50000


In [15]:
# Sanity check: the admission quota is now the solver's only constraint
# (the recorded output is 1).
solver.NumConstraints()

1

## Solve linear program

In [16]:
# Reference: display the integer status code that denotes an ABNORMAL solve
# (4 in the recorded output), for comparison with the status returned below.
solver.ABNORMAL

4

In [17]:
# Solve the LP. Later cells read solution values unconditionally, so stop
# here unless the solver reports an optimal solution.
status = solver.Solve()
if status != solver.OPTIMAL:
    raise RuntimeError('LP solve did not reach OPTIMAL; status code = {}'.format(status))


In [18]:
# Display the status code returned by solver.Solve().
status

0

In [19]:
# Display the status code that denotes OPTIMAL; it should equal `status` above.
solver.OPTIMAL

0

In [20]:
# Collect the solved admission probability of every stratum variable.
# Each variable was named after its dff row label, so name() recovers the
# row id directly instead of relying on the variable's string representation.
row = [int(stratum_var.name()) for stratum_var in applicant_stratum]
admit = [stratum_var.solution_value() for stratum_var in applicant_stratum]

# One row per stratum: dff row id and the LP's admission probability.
df_decisions = pd.DataFrame({'row_id': row, 'decision': admit})

In [21]:
# Inspect the per-stratum decisions (row_id = dff row label, decision = LP value).
df_decisions

Unnamed: 0,row_id,decision
0,0,0.0
1,1,0.0
2,2,1.0
3,3,1.0
4,4,0.0
...,...,...
188,188,1.0
189,189,1.0
190,190,1.0
191,191,0.0


In [22]:
# Stratum table joined with the LP decisions (row index to row index),
# displayed from highest to lowest total utility.
(
    pd.merge(dff, df_decisions, left_index=True, right_index=True)
    .sort_values(by='ml_outcomes', ascending=False)
)

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
47,0,47,0,60,1146.60,1820,47,1.0
46,1,46,0,59,1144.60,1940,46,1.0
43,2,43,0,56,1126.25,2125,43,1.0
45,3,45,0,58,1102.38,1934,45,1.0
50,4,50,0,63,1065.90,1615,50,1.0
...,...,...,...,...,...,...,...,...
1,188,1,0,14,0.00,1,1,0.0
6,189,6,0,19,0.00,8,6,0.0
5,190,5,0,18,0.00,4,5,0.0
4,191,4,0,17,0.00,4,4,0.0


In [23]:
# Same stratum/decision join, kept as `xxx_` and displayed ordered by test score T.
xxx_ = (
    pd.merge(dff, df_decisions, left_index=True, right_index=True)
    .sort_values(by='ml_outcomes', ascending=False)
)
xxx_.sort_values(by='T')

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
104,185,104,1,5,0.25,1,104,0.0
105,186,105,1,8,0.25,1,105,0.0
106,182,106,1,10,0.50,2,106,0.0
0,192,0,0,12,0.00,1,0,0.0
107,187,107,1,13,0.25,1,107,0.0
...,...,...,...,...,...,...,...,...
99,161,99,0,112,3.00,3,99,1.0
100,164,100,0,113,2.00,2,100,1.0
101,180,101,0,114,1.00,1,101,1.0
102,179,102,0,116,1.00,1,102,1.0


In [24]:
# Stratum table plus LP decision, ordered by total utility (highest first).
# `xxx` is the frame that is exported and used for the policy metrics below.
xxx = (
    pd.merge(dff, df_decisions, left_index=True, right_index=True)
    .sort_values(by='ml_outcomes', ascending=False)
)
xxx

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
47,0,47,0,60,1146.60,1820,47,1.0
46,1,46,0,59,1144.60,1940,46,1.0
43,2,43,0,56,1126.25,2125,43,1.0
45,3,45,0,58,1102.38,1934,45,1.0
50,4,50,0,63,1065.90,1615,50,1.0
...,...,...,...,...,...,...,...,...
1,188,1,0,14,0.00,1,1,0.0
6,189,6,0,19,0.00,8,6,0.0
5,190,5,0,18,0.00,4,5,0.0
4,191,4,0,17,0.00,4,4,0.0


In [25]:
# Number of strata (rows) in the joined stratum/decision table.
len(xxx_)

193

In [26]:
# Persist the per-stratum decisions (the frame's index is written as the
# first, unnamed column).
xxx.to_csv('./decision.csv')

In [27]:
# Read the exported decisions back from disk.
# NOTE(review): `solution` is not referenced again in the visible cells -- confirm it is still needed.
solution = pd.read_csv('./decision.csv')

In [28]:
# Re-display the in-memory stratum/decision table.
xxx

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
47,0,47,0,60,1146.60,1820,47,1.0
46,1,46,0,59,1144.60,1940,46,1.0
43,2,43,0,56,1126.25,2125,43,1.0
45,3,45,0,58,1102.38,1934,45,1.0
50,4,50,0,63,1065.90,1615,50,1.0
...,...,...,...,...,...,...,...,...
1,188,1,0,14,0.00,1,1,0.0
6,189,6,0,19,0.00,8,6,0.0
5,190,5,0,18,0.00,4,5,0.0
4,191,4,0,17,0.00,4,4,0.0


In [29]:
# Total utility of the LP policy: each stratum's summed utility weighted by
# its admission probability.
xxx['ml_outcomes'].mul(xxx['decision']).sum()

34247.06999999999

In [30]:
def maxMLOutcomes(frac_black=0.01):
    """Return the total B_p of a K-seat admit set with a fixed racial split.

    Reads the module-level quota ``K`` and applicant frame ``df``. Within
    each group (R == 1 and R == 0) applicants are ranked by ``ml_outcomes``
    in descending order and the top ones are admitted.

    Parameters
    ----------
    frac_black : float, default 0.01
        Share of the K seats given to applicants with R == 1. Expected to
        lie in [0, 1]; values outside that range are not validated.

    Returns
    -------
    Sum of ``B_p`` over all admitted applicants from both groups.
    """
    black_admits = int(K * frac_black)
    # Give every remaining seat to the R == 0 group. Truncating both shares
    # independently (int(K*f) and int(K*(1-f))) can leave a seat unfilled,
    # e.g. K=5, f=0.5 yields 2 + 2 = 4 admits instead of 5.
    white_admits = K - black_admits

    b = df[df['R'] == 1].sort_values(by='ml_outcomes', ascending=False).head(n=black_admits)
    w = df[df['R'] == 0].sort_values(by='ml_outcomes', ascending=False).head(n=white_admits)

    return b['B_p'].sum() + w['B_p'].sum()

In [31]:
def maxMLOutcomesBlack(frac_black=0.01):
    """Return the total B_p of the admitted R == 1 applicants only.

    Reads the module-level quota ``K`` and applicant frame ``df``. The
    ``int(K * frac_black)`` applicants with R == 1 that have the highest
    ``ml_outcomes`` are admitted.

    Parameters
    ----------
    frac_black : float, default 0.01
        Share of the K seats given to applicants with R == 1.

    Returns
    -------
    Sum of ``B_p`` over the admitted R == 1 applicants.
    """
    black_admits = int(K * frac_black)

    # Only the R == 1 group contributes to the result, so the R == 0
    # selection is not computed here.
    b = df[df['R'] == 1].sort_values(by='ml_outcomes', ascending=False).head(n=black_admits)

    return b['B_p'].sum()

In [32]:
# Build the shared "<R>_<T>" join key on both the applicant table and the
# stratum decision table.
df['key'] = df['R'].astype(str).str.cat(df['T'].astype(str), sep='_')
xxx['key'] = xxx['R'].astype(str).str.cat(xxx['T'].astype(str), sep='_')

In [33]:
# Number of strata (rows) in the decision table about to be joined onto applicants.
len(xxx)

193

In [34]:
# Seed for the random-admission baseline so its metrics are reproducible
# across kernel restarts.
RANDOM_SEED = 0

# Attach each stratum's LP decision to every applicant. Columns present in
# both frames (R, T, ml_outcomes) get pandas' default _x (applicant) and
# _y (stratum) suffixes.
admit_decisions = df.merge(xxx, how='left', on='key')

# Baseline policy: admit each applicant independently with probability FRAC_ADMIT.
random.seed(RANDOM_SEED)
admit_decisions['decision_random'] = pd.Series(
    [random.random() for _ in range(len(admit_decisions))],
    index=admit_decisions.index,
) < FRAC_ADMIT


# LP policy: decision-weighted share of admits with R == 1, and
# decision-weighted total B_p.
FRAC_BLACK_POLICY = (admit_decisions['R_y'] * admit_decisions['decision']).sum()/admit_decisions['decision'].sum()
SUM_BP_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision']).sum()

# Same two metrics for the random baseline.
FRAC_RANDOM_POLICY = (admit_decisions['R_y'] * admit_decisions['decision_random']).sum()/admit_decisions['decision_random'].sum()
SUM_BP_RAND_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision_random']).sum()


In [35]:
# Append one tab-separated line (label, share of admits with R == 1, total
# B_p) for this run. The file is opened in append mode, so re-running the
# cell adds another line. The context manager closes the handle even if
# the write raises.
with open('./lp_results.csv', 'a') as file:
    file.write('{}\t{}\t{}\n'.format('Maximize Boards', str(FRAC_BLACK_POLICY), str(SUM_BP_POLICY)))


In [36]:
# Share of all applicants admitted under the LP policy: stratum size times
# admission probability, summed and divided by the number of applicants.
xxx['N'].mul(xxx['decision']).sum() / len(df)

0.5