In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
# Notebook display settings: show at most 10 rows and up to 500 columns
# whenever a DataFrame is rendered as a cell output.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 500

In [2]:
# Policy parameters.
DIVERSITY_UTILITY = 0.25  # multiplied by R and added to a stratum's mean B_p
FRAC_ADMIT = 0.5          # share of applicants that may be admitted

###
## Applicants to decide on: one row per applicant, keyed by "<R>_<T>"
###
df = pd.read_csv('./df_test.csv')
df = df.assign(key=df['R'].astype(str) + "_" + df['T'].astype(str))

###
## Stratum utility estimated on the training file:
## mean of B_p within each (R, T) group plus DIVERSITY_UTILITY * R, rounded to 2 decimals
###
df_train = pd.read_csv('./df_train.csv')
df_stratum_utility = df_train.groupby(['R', 'T'])[['B_p']].mean().reset_index()
diversity_bonus = DIVERSITY_UTILITY * df_stratum_utility['R']
df_stratum_utility['stratum_utility'] = (df_stratum_utility['B_p'] + diversity_bonus).round(2)
df_stratum_utility['key'] = df_stratum_utility['R'].astype(str) + "_" + df_stratum_utility['T'].astype(str)

# Attach the stratum utility to every applicant. merge() defaults to an inner
# join, so applicants whose key has no match in the training strata are dropped.
stratum_lookup = df_stratum_utility[['stratum_utility', 'key']]
df = df.merge(stratum_lookup, on='key')
df['ml_outcomes'] = df['stratum_utility']

In [3]:
df

Unnamed: 0.1,Unnamed: 0,R,I_noise,E_noise,E_black,E_white,M_noise,M_black,M_white,T_noise,T_black,T_white,T_white_star,T_black_star,E,M,T,A_raw,A_raw_black,A_raw_white,A_prob,A,B_p_unif,B_p_reject_noise,B_p,B_p_raw,B_p_reject,B_p_reject_raw,D_p,key,stratum_utility,ml_outcomes
0,1,1,0,-0.938105,-0.938105,0.061895,0.215945,-1.722160,-0.722160,13.309506,54,61,57,57,-0.938105,-1.722160,54,-0.25,-0.25,0.22,0.437823,1,0.878470,-0.825473,0,0.151593,0,0.097778,1,1_54,0.69,0.69
1,164,1,0,-1.017543,-1.017543,-0.017543,1.596718,-0.420824,0.579176,9.416984,54,62,58,57,-1.017543,-0.420824,54,-0.25,-0.25,0.24,0.437823,1,0.908374,-0.478208,0,0.396320,0,0.284790,1,1_54,0.69,0.69
2,296,1,0,-1.342337,-1.342337,-0.342337,0.225058,-2.117280,-1.117280,15.124537,54,60,56,57,-1.342337,-2.117280,54,-0.25,-0.25,0.20,0.437823,1,0.120712,-0.402754,0,0.107429,0,0.068035,1,1_54,0.69,0.69
3,387,1,0,0.896351,0.896351,1.896351,-0.346163,-0.449812,0.550188,2.296308,54,63,57,59,0.896351,-0.449812,54,-0.25,-0.25,0.26,0.437823,1,0.648928,-0.672201,0,0.389405,0,0.278923,1,1_54,0.69,0.69
4,415,1,0,-0.064503,-0.064503,0.935497,0.545602,-0.518901,0.481099,6.499207,54,63,58,58,-0.064503,-0.518901,54,-0.25,-0.25,0.26,0.437823,1,0.039539,-0.932143,1,0.373109,1,0.265242,1,1_54,0.69,0.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,91106,0,0,4.545513,4.545513,5.545513,0.603047,4.148560,5.148560,-7.272068,96,114,105,105,5.545513,5.148560,114,1.28,0.59,1.28,0.782450,1,0.422052,0.034169,1,0.994226,1,0.990515,0,0_114,1.00,1.00
99996,93421,1,0,3.547363,3.547363,4.547363,1.320808,3.868171,4.868171,10.821452,104,121,112,112,3.547363,3.868171,104,0.75,0.75,1.42,0.679179,1,0.238850,-0.672673,1,0.979531,1,0.966695,1,1_104,1.25,1.25
99997,93449,0,0,3.327352,3.327352,4.327352,2.147981,4.475333,5.475333,0.182072,96,113,105,104,4.327352,5.475333,113,1.26,0.59,1.26,0.779026,1,0.657541,-0.566227,1,0.995829,1,0.993141,0,0_113,1.00,1.00
99998,98989,0,0,2.589651,2.589651,3.589651,2.835858,4.425509,5.425509,7.820073,97,113,106,104,3.589651,5.425509,113,1.26,0.61,1.26,0.779026,1,0.436918,-0.568169,1,0.995616,1,0.992793,0,0_113,1.00,1.00


In [4]:
# Fraction of applicants to admit, fixed at 0.5. The previously commented-out
# alternative derived it from the data as df[['A']].sum()/len(df).
FRAC_ADMIT = 0.5


In [5]:
###
## Per-stratum total utility: sum of ml_outcomes over the applicants in each (R, T) group
###

df_ = df.groupby(['R', 'T'])[['ml_outcomes']].sum().reset_index()

In [6]:
###
## Per-stratum head count: number of non-null ml_outcomes entries in each (R, T) group,
## exposed under both the 'Count' and 'N' column names
###

df_count = (
    df.groupby(['R', 'T'])[['ml_outcomes']]
    .count()
    .reset_index()
    .rename(columns={'ml_outcomes': 'Count'})
)
df_count['N'] = df_count['Count']

In [7]:
###
## One summary row per stratum: R, T, SUM(utility) as ml_outcomes, COUNT(applicants) as N.
## The two summary tables are joined on their row index.
###

dff = (
    pd.merge(df_, df_count[['N']], left_index=True, right_index=True)
    .sort_values(by='ml_outcomes', ascending=False)
    .reset_index()            # original row label is kept as column 'index'
    .sort_values(by='index')  # back to the original stratum order
    .reset_index()            # rank in the utility-descending order becomes 'level_0'
)

In [8]:
# final info table
dff.sort_values(by='ml_outcomes')

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
1,192,1,0,14,0.03,1
0,191,0,0,12,0.05,1
3,190,3,0,16,0.14,2
2,189,2,0,15,0.18,2
105,188,105,1,8,0.25,1
...,...,...,...,...,...,...
44,4,44,0,57,1071.36,1984
45,3,45,0,58,1083.04,1934
47,2,47,0,60,1092.00,1820
43,1,43,0,56,1105.00,2125


### Setup optimization problem 

In [9]:
from ortools.linear_solver import pywraplp


In [10]:
# Create the linear-programming solver using the GLOP backend.
solver = pywraplp.Solver.CreateSolver('GLOP')

# CreateSolver returns None when the requested backend is not available;
# fail here with a clear message instead of an AttributeError in a later cell.
if solver is None:
    raise RuntimeError("OR-tools could not create a 'GLOP' solver")


In [11]:
dff

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
0,191,0,0,12,0.05,1
1,192,1,0,14,0.03,1
2,189,2,0,15,0.18,2
3,190,3,0,16,0.14,2
4,185,4,0,17,0.28,4
...,...,...,...,...,...,...
188,170,188,1,94,1.23,1
189,172,189,1,95,1.20,1
190,167,190,1,97,1.25,1
191,168,191,1,101,1.25,1


In [12]:
# Objective: maximize the expected utility of the admitted students.
objective = solver.Objective()

applicant_stratum = []  # decision variables in dff row order
vars_cache = {}         # the same variables, addressable by (R, T)

for ix, row in dff.iterrows():
    # One variable per stratum: admission probability in [0, 1], named after the row label.
    admit_prob = solver.NumVar(0.0, 1.0, str(ix))
    applicant_stratum.append(admit_prob)
    vars_cache[(row['R'], row['T'])] = admit_prob

    # Admitting the whole stratum yields its summed utility.
    objective.SetCoefficient(admit_prob, float(row['ml_outcomes']))

objective.SetMaximization()


In [13]:
# Currently we have no constraints 
solver.NumConstraints()

0

In [14]:
# Admission quota: sum over strata of N * (admission probability) must lie in [0, K],
# where K is FRAC_ADMIT of all applicants, truncated to an integer.
K = int(len(df)*FRAC_ADMIT)
print(K)
admit_quota = solver.Constraint(0, K)

# applicant_stratum was filled in dff row order, so variables and sizes pair up positionally.
for stratum_var, stratum_size in zip(applicant_stratum, dff['N']):
    admit_quota.SetCoefficient(stratum_var, float(stratum_size))

50000


In [15]:
# Now we have one constraint
solver.NumConstraints()

1

## Add CF Fair constraints

In [16]:
from collections import Counter

def convertListToProb(raw_list):
    """Turn a list of values into its empirical distribution.

    Args:
        raw_list: list of hashable values (test scores in this notebook).

    Returns:
        A list of ``(probability, value)`` tuples, one per distinct value, in
        order of first appearance; ``probability`` is the value's share of
        ``raw_list``. An empty input yields an empty list.
    """
    total = float(len(raw_list))
    return [(n_seen / total, value) for value, n_seen in Counter(raw_list).items()]

In [17]:
# For applicants with R == 0: for every observed score T, gather the list of
# their T_black_star values (one entry per applicant).
T_blacks_list = (
    df.loc[df['R'] == 0]
    .groupby('T')['T_black_star']
    .apply(list)
    .reset_index(name='T_blacks')
)


In [18]:
# Convert each row's list of T_black_star values into (probability, score) pairs.
T_blacks_list['probs'] = T_blacks_list['T_blacks'].apply(convertListToProb)


In [19]:
T_blacks_list

Unnamed: 0,T,T_blacks,probs
0,12,[12],"[(1.0, 12)]"
1,14,[15],"[(1.0, 15)]"
2,15,"[14, 14]","[(1.0, 14)]"
3,16,"[13, 13]","[(1.0, 13)]"
4,17,"[15, 18, 15, 17]","[(0.5, 15), (0.25, 18), (0.25, 17)]"
...,...,...,...
99,112,"[103, 103, 104]","[(0.6666666666666666, 103), (0.333333333333333..."
100,113,"[104, 104]","[(1.0, 104)]"
101,114,[105],"[(1.0, 105)]"
102,116,[108],"[(1.0, 108)]"


In [20]:
# One equality constraint per observed score T among R == 0 applicants:
#   sum_k P(T_black_star = k | T) * p[(1, k)]  -  p[(0, T)]  ==  0
# Counters track how many referenced (1, k) variables already existed vs. had to be created.
didntexist = 0
exists = 0

for _, stratum in T_blacks_list.iterrows():
    cf_fair_stratum = solver.Constraint(0.0, 0.0)
    cf_fair_stratum.SetCoefficient(vars_cache[(0.0, stratum['T'])], -1.0)

    for weight, cf_score in stratum['probs']:
        cf_key = (1.0, cf_score)
        if cf_key in vars_cache:
            exists += 1
        else:
            # No R == 1 stratum with this score yet: introduce a fresh [0, 1] variable for it.
            vars_cache[cf_key] = solver.NumVar(0.0, 1.0, str(cf_key))
            didntexist += 1
        cf_fair_stratum.SetCoefficient(vars_cache[cf_key], weight)
    

In [21]:
didntexist,exists

(9, 542)

In [22]:
# For applicants with R == 1: for every observed score T, gather the list of
# their T_white_star values (one entry per applicant).
T_whites_list = (
    df.loc[df['R'] == 1]
    .groupby('T')['T_white_star']
    .apply(list)
    .reset_index(name='T_whites')
)

In [23]:
# Convert each row's list of T_white_star values into (probability, score) pairs.
T_whites_list['probs'] = T_whites_list['T_whites'].apply(convertListToProb)


In [24]:
# One equality constraint per observed score T among R == 1 applicants:
#   sum_k P(T_white_star = k | T) * p[(0, k)]  -  p[(1, T)]  ==  0
# Counters track how many referenced (0, k) variables already existed vs. had to be created.
didntexist = 0
exists = 0

for _, stratum in T_whites_list.iterrows():
    cf_fair_stratum = solver.Constraint(0.0, 0.0)
    cf_fair_stratum.SetCoefficient(vars_cache[(1.0, stratum['T'])], -1.0)

    for weight, cf_score in stratum['probs']:
        cf_key = (0.0, cf_score)
        if cf_key in vars_cache:
            exists += 1
        else:
            # No R == 0 stratum with this score yet: introduce a fresh [0, 1] variable for it.
            vars_cache[cf_key] = solver.NumVar(0.0, 1.0, str(cf_key))
            didntexist += 1
        cf_fair_stratum.SetCoefficient(vars_cache[cf_key], weight)
    

In [25]:
# NOTE(review): `basis_status` is referenced without call parentheses, so this
# cell only displays the bound-method object of the last constraint created
# above, not a status value. TODO confirm whether a call was intended.
cf_fair_stratum.basis_status

<bound method Constraint.basis_status of <ortools.linear_solver.pywraplp.Constraint; proxy of <Swig Object of type 'operations_research::MPConstraint *' at 0x7fc728e300c0> >>

In [26]:
solver.NumConstraints()

194

## Solve linear program

In [27]:
# Show the integer code of the solver's ABNORMAL result status
# (presumably for comparison with the value returned by Solve() — TODO confirm).
solver.ABNORMAL

4

In [28]:
# Solve the LP and keep the result status for inspection.
status = solver.Solve()

# The cells below read solution_value() from every variable; those numbers are
# only meaningful for an optimal solve, so stop here on any other status.
if status != solver.OPTIMAL:
    raise RuntimeError(
        "LP solve did not reach an optimal solution (status code {})".format(status)
    )


In [29]:
status

0

In [30]:
solver.OPTIMAL

0

In [31]:
# Read the solved admission probability of every stratum variable.
# Each variable was named after its dff row label, so str(var) parses back to that integer.
row = [int(str(stratum_var)) for stratum_var in applicant_stratum]
admit = [stratum_var.solution_value() for stratum_var in applicant_stratum]

df_decisions = pd.DataFrame({'row_id': row, 'decision': admit})

In [32]:
df_decisions

Unnamed: 0,row_id,decision
0,0,0.000000
1,1,0.523221
2,2,0.348814
3,3,0.523221
4,4,0.512397
...,...,...
188,188,0.499796
189,189,0.499796
190,190,1.000000
191,191,1.000000


In [33]:
# Preview: stratum table joined (on row index) with the LP decisions, highest
# total utility first. The following cell evaluates the same expression and stores it as `xxx`.
dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
46,0,46,0,59,1125.20,1940,46,0.499796
43,1,43,0,56,1105.00,2125,43,0.499796
47,2,47,0,60,1092.00,1820,47,0.499796
45,3,45,0,58,1083.04,1934,45,0.499796
44,4,44,0,57,1071.36,1984,44,0.499796
...,...,...,...,...,...,...,...,...
105,188,105,1,8,0.25,1,105,0.000000
2,189,2,0,15,0.18,2,2,0.348814
3,190,3,0,16,0.14,2,3,0.523221
0,191,0,0,12,0.05,1,0,0.000000


In [34]:
# Stratum table joined (on the row index) with the LP admission probabilities,
# ordered from highest to lowest total stratum utility.
xxx = pd.merge(dff, df_decisions, left_index=True, right_index=True)
xxx = xxx.sort_values(by='ml_outcomes', ascending=False)
xxx

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
46,0,46,0,59,1125.20,1940,46,0.499796
43,1,43,0,56,1105.00,2125,43,0.499796
47,2,47,0,60,1092.00,1820,47,0.499796
45,3,45,0,58,1083.04,1934,45,0.499796
44,4,44,0,57,1071.36,1984,44,0.499796
...,...,...,...,...,...,...,...,...
105,188,105,1,8,0.25,1,105,0.000000
2,189,2,0,15,0.18,2,2,0.348814
3,190,3,0,16,0.14,2,3,0.523221
0,191,0,0,12,0.05,1,0,0.000000


In [35]:
xxx.sort_values(by='decision',ascending=False)

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
98,166,98,0,111,1.98,2,98,1.000000
100,165,100,0,113,2.00,2,100,1.000000
96,150,96,0,109,4.95,5,96,1.000000
94,151,94,0,107,4.95,5,94,1.000000
97,178,97,0,110,0.99,1,97,1.000000
...,...,...,...,...,...,...,...,...
2,189,2,0,15,0.18,2,2,0.348814
104,187,104,1,5,0.25,1,104,0.000000
105,188,105,1,8,0.25,1,105,0.000000
0,191,0,0,12,0.05,1,0,0.000000


In [36]:
# Give both the applicant table and the per-stratum decision table the same
# "<R>_<T>" join key.
for frame in (df, xxx):
    frame['key'] = frame['R'].astype(str) + '_' + frame['T'].astype(str)

In [37]:
# Attach each applicant's stratum-level LP admission probability ('decision').
# Both frames carry R, T and ml_outcomes, so pandas suffixes them: *_x comes
# from df (the applicant), *_y from xxx (the stratum table).
admit_decisions = df.merge(xxx,how='left',on='key')

# Baseline policy: admit each applicant independently with probability FRAC_ADMIT.
# A dedicated, seeded generator makes the baseline reproducible across
# Restart & Run All (the original drew from the unseeded global generator).
RANDOM_POLICY_SEED = 0
random_policy_rng = random.Random(RANDOM_POLICY_SEED)
random_draws = pd.Series(
    [random_policy_rng.random() for _ in range(len(admit_decisions))],
    index=admit_decisions.index,
)
admit_decisions['decision_random'] = random_draws < FRAC_ADMIT

# LP policy: decision-weighted mean of R among admits, and expected total B_p of admits.
FRAC_BLACK_POLICY = (admit_decisions['R_y'] * admit_decisions['decision']).sum()/admit_decisions['decision'].sum()
SUM_BP_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision']).sum()

# Random baseline: the same two quantities under the random admit indicator.
FRAC_RANDOM_POLICY = (admit_decisions['R_y'] * admit_decisions['decision_random']).sum()/admit_decisions['decision_random'].sum()
SUM_BP_RAND_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision_random']).sum()


In [38]:
# Append one tab-separated result line (policy label, FRAC_BLACK_POLICY,
# SUM_BP_POLICY) to the results file. The context manager guarantees the
# handle is closed even if the write raises.
with open('./lp_results.csv','a') as results_file:
    results_file.write('{}\t{}\t{}\n'.format('Path-Specific Fairness',str(FRAC_BLACK_POLICY),str(SUM_BP_POLICY)))


In [39]:
# Expected share of applicants admitted by the LP policy: sum over strata of
# N * decision, divided by the number of applicants. Compare with FRAC_ADMIT.
(xxx['N']*xxx['decision']).sum()/len(df)

0.5000000000000001