In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
# Raise pandas' display limits: up to 5000 rows and 500 columns before truncating.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)

In [2]:
###
## Load the applicant-level table (one row per applicant)
###

applicants_csv = './df_test.csv'
df = pd.read_csv(applicants_csv)

# Working utility column: the 'ml_outcomes_div' values rounded to 4 decimals.
rounded_utility = df['ml_outcomes_div'].round(4)
df['ml_outcomes'] = rounded_utility


In [3]:
#df['B_p_reject'] = np.random.randint(0,2,size=(len(df)))

In [4]:
# Inspect the B_p_reject column as it currently stands in df
# (the random re-draw in the previous cell is commented out).
df['B_p_reject']

0          1
1          0
2          1
3          0
4          1
          ..
9999995    1
9999996    1
9999997    1
9999998    0
9999999    1
Name: B_p_reject, Length: 10000000, dtype: int64

In [None]:
# Stratum label: the B_p value followed directly by the B_p_reject value, as text.
bp_text = df['B_p'].astype(str)
bp_reject_text = df['B_p_reject'].astype(str)
df['B_p_stratum'] = bp_text + bp_reject_text



In [None]:
# Number of applicants carrying each B_p / B_p_reject label combination.
df['B_p_stratum'].value_counts()

In [None]:
# = 
#df['B_p_reject'] = df['B_p']

In [None]:
# Column sum of 'A' divided by the number of applicants. The double-bracket
# selection keeps this a one-element Series indexed by 'A'; a later cell reads
# it as FRAC_ADMIT['A'].
admitted_total = df[['A']].sum()
FRAC_ADMIT = admitted_total / len(df)


In [None]:
# Display the admit fraction (a one-element Series keyed by 'A').
FRAC_ADMIT

In [None]:
# Total number of applicant rows.
len(df)

In [None]:
#df['ml_outcomes'] = df['R']

In [None]:
##
# Column guide (from the author's notes):
#   R = race, T = test score, ml_outcomes = expected utility from admitting,
#   ml_outcomes_black / ml_outcomes_white = counterfactual utility given race.
# The cell displays these per-applicant columns ordered by ascending ml_outcomes.
##

df[['R','T','ml_outcomes','T_black','T_white','ml_outcomes_black','ml_outcomes_white','ml_outcomes_decision','B_p']].sort_values(by='ml_outcomes')



In [None]:
#df['ml_outcomes'] = df['ml_outcomes'] + 1*df['R']

In [None]:
###
## Total utility from admitting everyone in a stratum
## (a stratum is one combination of R, T, B_p and B_p_reject)
###

stratum_cols = ['R','T','B_p','B_p_reject']
utility_by_stratum = df[stratum_cols + ['ml_outcomes']].groupby(stratum_cols).sum()
df_ = utility_by_stratum.reset_index()

In [None]:
###
## Number of people in each stratum
## (counted as the non-null ml_outcomes entries per R, T, B_p, B_p_reject group)
###

count_keys = ['R','T','B_p','B_p_reject']
stratum_sizes = df[['R','T','ml_outcomes','B_p','B_p_reject']].groupby(count_keys).count()
df_count = stratum_sizes.reset_index().rename(columns={'ml_outcomes': 'Count'})
# 'N' is a second copy of the same count; downstream cells read 'N'.
df_count = df_count.assign(N=df_count['Count'])

In [None]:
###
## Merge the two summary tables into one row per stratum:
## Race, Test Score, B_p, B_p_reject, SUM(Utility), COUNT(applicants)
###

# Both summaries come from the same group keys, so they are joined row-for-row on the index.
with_sizes = df_.merge(df_count[['N']], left_index=True, right_index=True)

# Rank by utility (descending); reset_index() stores the original position in 'index'.
ranked = with_sizes.sort_values(by='ml_outcomes', ascending=False).reset_index()

# Restore the original order; the second reset_index() stores the utility rank in 'level_0'
# and leaves a plain 0..n-1 index.
dff = ranked.sort_values(by='index').reset_index()

In [None]:
# Final per-stratum info table, displayed in ascending order of total utility.
dff.sort_values(by='ml_outcomes')

### Setup optimization problem 

In [None]:
from ortools.linear_solver import pywraplp


In [None]:
# Create the linear-programming solver using the GLOP backend.
# CreateSolver returns None when the requested backend is not available, which
# would otherwise surface later as an opaque AttributeError on the first use.
solver = pywraplp.Solver.CreateSolver('GLOP')
if solver is None:
    raise RuntimeError("OR-Tools could not create the 'GLOP' solver backend")


In [None]:
# Number of strata, i.e. the number of decision variables the LP will get.
len(dff[['R','T','B_p','B_p_reject']])

In [None]:
# Display the per-stratum table that drives the LP.
dff

In [None]:
# One decision variable per stratum, kept both in row order (applicant_stratum)
# and keyed by the stratum tuple (R, T, B_p, B_p_reject) in vars_cache.
applicant_stratum = []
vars_cache = {}

# Objective: maximize the expected utility of the admitted students
objective = solver.Objective()

for ix, row in dff.iterrows():
    # Admission probability for this stratum, bounded to [0, 1]. The variable is
    # named after the row label; a later cell parses that name back into row_id.
    admit_prob = solver.NumVar(0.0, 1.0, str(ix))

    applicant_stratum.append(admit_prob)
    vars_cache[(row['R'],row['T'],row['B_p'], row['B_p_reject'])] = admit_prob

    # Benefit of admitting the stratum is its total utility. Use the variable just
    # created rather than applicant_stratum[ix], which is only correct while the
    # frame's index labels happen to equal list positions.
    objective.SetCoefficient(admit_prob, float(row['ml_outcomes']))
objective.SetMaximization()


In [None]:
# No constraint has been added in the cells above, so on a freshly created
# solver this is expected to report 0.
solver.NumConstraints()

In [None]:
# Constraint: at most K applicants are admitted (in expectation).
# FRAC_ADMIT is a one-element Series, so take the scalar explicitly: calling
# int() on a Series is deprecated in pandas. Round instead of truncating,
# because len(df) * (sum / len(df)) can land just below the true count in
# floating point (e.g. 100 * (57 / 100) == 56.99999999999999) and lose a seat.
K = int(round(len(df) * float(FRAC_ADMIT['A'])))
print(K)
admit_quota = solver.Constraint(0, K)

# Expected number admitted = sum over strata of (admit probability * stratum size).
# applicant_stratum was filled in dff row order, so the two sequences line up.
for admit_prob, n_in_stratum in zip(applicant_stratum, dff['N']):
    admit_quota.SetCoefficient(admit_prob, float(n_in_stratum))

In [None]:
# The admission quota is the only constraint added so far.
solver.NumConstraints()

## Add Equalized Odds Constraints

In [None]:
## Within one (R, T) cell, every (B_p, B_p_reject) combination must receive the
## same admission probability: you either admit everyone in the cell at that rate
## or no one. I.e. you can't admit only the people who pass boards and reject
## those who fail boards from the same T, R stratum.
didntexist, exists = 0, 0

for ix, row in dff.iterrows():
    r, t, b_p, b_p_reject = row['R'], row['T'], row['B_p'], row['B_p_reject']
    var1 = vars_cache[(r, t, b_p, b_p_reject)]

    # The three other (B_p, B_p_reject) combinations sharing this row's (R, T).
    key2s = [(r, t, 1-b_p, 1-b_p_reject), (r, t, b_p, 1-b_p_reject), (r, t, 1-b_p, b_p_reject)]

    for key2 in key2s:
        if key2 not in vars_cache:
            didntexist += 1
            continue

        # Create the constraint only once the sibling is known to exist, so that a
        # missing sibling no longer leaves an empty 0 == 0 row in the model.
        var2 = vars_cache[key2]
        constrain_bp = solver.Constraint(0.0, 0.0)
        constrain_bp.SetCoefficient(var1, -1.0)   # var2 - var1 == 0
        constrain_bp.SetCoefficient(var2, 1.0)
        exists += 1

didntexist, exists

In [None]:
# Stratum keys grouped by (race, B_p, B_p_reject). In the list names, r == 0 is
# "white" and r == 1 is "black"; "pass"/"fail" is value 1/0; the first pass/fail
# refers to B_p and the second (the "_reject" suffix) to B_p_reject.
white_pass_boards_pass_boards_reject = []
white_fail_boards_pass_boards_reject = []
black_pass_boards_pass_boards_reject = []
black_fail_boards_pass_boards_reject = []
white_pass_boards_fail_boards_reject = []
white_fail_boards_fail_boards_reject = []
black_pass_boards_fail_boards_reject = []
black_fail_boards_fail_boards_reject = []

# Dispatch table: (r, b_p, b_p_reject) -> the list that collects matching keys.
buckets_by_group = {
    (0, 1, 1): white_pass_boards_pass_boards_reject,
    (0, 0, 1): white_fail_boards_pass_boards_reject,
    (1, 1, 1): black_pass_boards_pass_boards_reject,
    (1, 0, 1): black_fail_boards_pass_boards_reject,
    (0, 1, 0): white_pass_boards_fail_boards_reject,
    (0, 0, 0): white_fail_boards_fail_boards_reject,
    (1, 1, 0): black_pass_boards_fail_boards_reject,
    (1, 0, 0): black_fail_boards_fail_boards_reject,
}

for key in vars_cache:
    r, t, b_p, b_p_reject = key
    bucket = buckets_by_group.get((r, b_p, b_p_reject))
    # Keys whose values fall outside the eight combinations are left unassigned.
    if bucket is not None:
        bucket.append(key)

len(white_pass_boards_pass_boards_reject),len(white_fail_boards_pass_boards_reject),len(black_pass_boards_pass_boards_reject),len(black_fail_boards_pass_boards_reject)




In [None]:
# Head-count per (R, B_p, B_p_reject) group, summed over every test score T.
df_totals = dff[['N','R','B_p','B_p_reject']].groupby(['R','B_p','B_p_reject']).sum().reset_index()
NUM_TOTALS = {
    (rec['R'], rec['B_p'], rec['B_p_reject']): rec['N']
    for _, rec in df_totals.iterrows()
}

# Head-count of each individual (R, T, B_p, B_p_reject) stratum.
N_IN_STRATAS = {
    (rec['R'], rec['T'], rec['B_p'], rec['B_p_reject']): rec['N']
    for _, rec in dff.iterrows()
}

In [None]:
# Constraint count so far: the admission quota plus the within-(R, T)
# equality constraints created in the cells above.
solver.NumConstraints()

In [None]:
def _add_admit_rate_parity(white_keys, black_keys):
    """Add one equality constraint forcing equal admitted fractions in two groups.

    Each stratum's admission variable is weighted by the stratum's share of its
    (R, B_p, B_p_reject) group, so the weighted sum is the fraction of that group
    admitted. White strata enter with a positive sign and black strata with a
    negative sign, and the constraint pins the difference to exactly zero.

    Reads the notebook-level `solver`, `vars_cache`, `N_IN_STRATAS` and
    `NUM_TOTALS`. Returns the created constraint.
    """
    parity = solver.Constraint(0.0, 0.0)
    for sign, keys in ((1.0, white_keys), (-1.0, black_keys)):
        for key in keys:
            r, t, b_p, b_p_reject = key
            share = float(N_IN_STRATAS[key]) / float(NUM_TOTALS[(r, b_p, b_p_reject)])
            parity.SetCoefficient(vars_cache[key], sign * share)
    return parity


#Of those who pass the boards exams, pass boards exams if rejected
#Frac white admitted and frac black admitted should be the same
constrain_pass_boards_pass_boards_reject = _add_admit_rate_parity(
    white_pass_boards_pass_boards_reject, black_pass_boards_pass_boards_reject)

#Of those who fail the boards exams, pass boards exams if rejected
#Frac white admitted and frac black admitted should be the same
constrain_fail_boards_pass_boards_reject = _add_admit_rate_parity(
    white_fail_boards_pass_boards_reject, black_fail_boards_pass_boards_reject)

#Of those who pass the boards exams, fail boards exams if rejected
#Frac white admitted and frac black admitted should be the same
constrain_pass_boards_fail_boards_reject = _add_admit_rate_parity(
    white_pass_boards_fail_boards_reject, black_pass_boards_fail_boards_reject)

#Of those who fail the boards exams, fail boards exams if rejected
#Frac white admitted and frac black admitted should be the same
constrain_fail_boards_fail_boards_reject = _add_admit_rate_parity(
    white_fail_boards_fail_boards_reject, black_fail_boards_fail_boards_reject)


## Add constraints on people who fail boards exams

## Solve linear program

In [None]:
# Display the numeric code of the ABNORMAL result status, for reference.
solver.ABNORMAL

In [None]:
status = solver.Solve()

# Stop here unless the LP was solved to optimality: the cells below read
# solution_value() from every variable and append metrics to a results file,
# which would be meaningless for an infeasible or otherwise failed solve.
if status != solver.OPTIMAL:
    raise RuntimeError('LP solve did not reach an optimal solution (status={})'.format(status))


In [None]:
# Result status returned by Solve(); compare with solver.OPTIMAL shown in the next cell.
status

In [None]:
# Numeric code of the OPTIMAL result status, for comparison with `status`.
solver.OPTIMAL

In [None]:
# Display the list of decision variables, one per stratum, in dff row order.
applicant_stratum

In [None]:
# Each variable was named after its dff row label when it was created, so the
# text form of the variable is parsed back into that integer row id.
row = [int(str(var)) for var in applicant_stratum]

# Admission probability chosen by the solver for each stratum.
admit = [var.solution_value() for var in applicant_stratum]

df_decisions = pd.DataFrame({'row_id': row, 'decision': admit})

In [None]:
# Display the per-stratum decisions (row_id, decision).
df_decisions

In [None]:
# Stratum table joined to its decisions on the index, shown from highest to lowest total utility.
dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)

In [None]:
# Same index-aligned join as above, kept in xxx_ and then viewed by test score.
strata_with_decisions = dff.merge(df_decisions, left_index=True, right_index=True)
xxx_ = strata_with_decisions.sort_values(by='ml_outcomes', ascending=False)
xxx_.sort_values(by='T')

In [None]:
# Final per-stratum decision table (strata joined to decisions on the index,
# ordered by descending total utility); the cells below save and reuse it.
joined_decisions = dff.merge(df_decisions, left_index=True, right_index=True)
xxx = joined_decisions.sort_values(by='ml_outcomes', ascending=False)
xxx

In [None]:
# Persist the per-stratum decision table next to the notebook.
xxx.to_csv('./decision.csv')

In [None]:
# Read the saved decision table back; `solution` is not referenced again in the cells visible here.
solution = pd.read_csv('./decision.csv')

In [None]:
# Number of strata in the decision table.
len(xxx)

In [None]:
def _stratum_key(frame):
    """Return 'R_T_B_p_B_p_reject' text keys for every row of `frame`."""
    parts = [frame[col].astype(str) for col in ('R', 'T', 'B_p', 'B_p_reject')]
    key = parts[0]
    for part in parts[1:]:
        key = key + '_' + part
    return key


# The same key is built on the applicant table and on the per-stratum decision
# table so that decisions can be joined back onto individual applicants.
df['key'] = _stratum_key(df)
xxx['key'] = _stratum_key(xxx)



In [None]:
# Attach each applicant's stratum-level admission probability ('decision').
admit_decisions = df.merge(xxx[['key','decision']],how='left',on='key')

# Baseline: admit each applicant independently with probability FRAC_ADMIT['A'].
# The draw is seeded so the baseline is reproducible across runs, and vectorized
# instead of calling random.random() once per applicant in a Python loop.
RANDOM_SEED = 0
rng = np.random.default_rng(RANDOM_SEED)
admit_decisions['decision_random'] = rng.random(len(admit_decisions)) < FRAC_ADMIT['A']

# LP policy: decision-weighted share of admits with R == 1, and decision-weighted sum of B_p.
FRAC_BLACK_POLICY = (admit_decisions['R'] * admit_decisions['decision']).sum()/admit_decisions['decision'].sum()
SUM_BP_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision']).sum()

# The same two quantities under the random baseline.
FRAC_RANDOM_POLICY = (admit_decisions['R'] * admit_decisions['decision_random']).sum()/admit_decisions['decision_random'].sum()
SUM_BP_RAND_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision_random']).sum()

In [None]:
# Append one tab-separated line (policy name, FRAC_BLACK_POLICY, SUM_BP_POLICY)
# to the shared results file. The context manager closes the handle even if
# the write raises.
with open('./lp_results.csv','a') as file:
    file.write('{}\t{}\t{}\n'.format('Principal Fairness',str(FRAC_BLACK_POLICY),str(SUM_BP_POLICY)))