In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
pd.options.display.max_rows = 10
pd.options.display.max_columns = 500

In [2]:
# Policy parameters: utility bonus multiplied by R, and the share of applicants to admit.
DIVERSITY_UTILITY = 0.25
FRAC_ADMIT = 0.5


###
## Load dataframe where each row is a single applicant
###

df = pd.read_csv('./df_test.csv')
# String stratum id "<R>_<T>", used as the join key further down.
df['key'] = df['R'].astype(str) + "_" + df['T'].astype(str)

###
## Estimate E[Y(1)|T, A]
###

# Per (R, T) cell of the training file: mean of B_p plus DIVERSITY_UTILITY * R,
# rounded to two decimals, together with the same "<R>_<T>" key as above.
df_train = pd.read_csv('./df_train.csv')
df_stratum_utility = (
    df_train
    .groupby(['R', 'T'])[['B_p']]
    .mean()
    .reset_index()
    .assign(
        stratum_utility=lambda s: (s['B_p'] + DIVERSITY_UTILITY * s['R']).round(2),
        key=lambda s: s['R'].astype(str) + "_" + s['T'].astype(str),
    )
)

# Default (inner) merge: applicants whose key has no match in the training strata are dropped.
df = pd.merge(df, df_stratum_utility[['stratum_utility', 'key']], on='key')
# NOTE(review): the dataframe displayed below shows ml_outcomes values that differ from
# stratum_utility (e.g. 0.42 vs 0.43) and the execution counter skips In [3]; a cell that is
# no longer in the notebook may have overwritten ml_outcomes -- confirm before re-running.
df['ml_outcomes'] = df['stratum_utility']

In [4]:
df

Unnamed: 0.1,Unnamed: 0,R,I_noise,E_noise,E_black,E_white,M_noise,M_black,M_white,T_noise,T_black,T_white,T_white_star,T_black_star,E,M,T,A_raw,A_raw_black,A_raw_white,A_prob,A,B_p_unif,B_p_reject_noise,B_p,B_p_raw,B_p_reject,B_p_reject_raw,D_p,ml_outcomes_smooth,key,stratum_utility,ml_outcomes
0,1,1,0,0.525994,0.525994,1.525994,0.207809,-0.266197,0.733803,-12.671517,38,47,42,43,0.525994,-0.266197,38,-0.57,-0.57,-0.06,0.361237,0,0.530407,-0.142676,0,0.433841,0,0.317302,1,0.421641,1_38,0.43,0.42
1,395,1,0,0.969962,0.969962,1.969962,-1.526573,-1.556611,-0.556611,-7.842947,38,47,41,43,0.969962,-1.556611,38,-0.57,-0.57,-0.06,0.361237,0,0.277582,-0.276530,0,0.174134,0,0.113386,1,0.421641,1_38,0.43,0.42
2,613,1,0,0.134061,0.134061,1.134061,0.424828,-0.441112,0.558888,-10.986879,38,46,41,42,0.134061,-0.441112,38,-0.57,-0.57,-0.08,0.361237,1,0.373307,-0.395197,1,0.391476,0,0.280676,1,0.421641,1_38,0.43,0.42
3,714,1,0,-0.274987,-0.274987,0.725013,0.034153,-1.240834,-0.240834,-6.721347,38,45,40,41,-0.274987,-1.240834,38,-0.57,-0.57,-0.10,0.361237,1,0.737121,-0.401191,0,0.224291,0,0.149207,1,0.421641,1_38,0.43,0.42
4,760,1,0,-0.487114,-0.487114,0.512886,0.196148,-1.290966,-0.290966,-5.520145,38,45,41,42,-0.487114,-1.290966,38,-0.57,-0.57,-0.10,0.361237,0,0.257239,-0.616042,0,0.215689,0,0.142954,1,0.421641,1_38,0.43,0.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,73371,0,0,2.450656,2.450656,3.450656,1.792608,3.243264,4.243264,14.772250,95,110,103,102,3.450656,4.243264,110,1.20,0.57,1.20,0.768525,1,0.436682,-0.382502,1,0.985843,1,0.976871,0,0.986571,0_110,1.00,0.99
99996,78609,1,0,2.636615,2.636615,3.636615,1.699397,3.336013,4.336013,17.947400,101,116,108,107,2.636615,3.336013,101,0.69,0.69,1.32,0.665967,0,0.863950,-1.026399,1,0.965644,1,0.944591,1,1.220684,1_101,1.25,1.22
99997,83133,0,0,2.775036,2.775036,3.775036,1.996713,3.771749,4.771749,11.088490,98,113,106,105,3.775036,4.771749,113,1.26,0.63,1.26,0.779026,1,0.809441,-0.481194,1,0.991606,1,0.986235,0,0.989339,0_113,1.00,0.99
99998,85089,1,0,4.022088,4.022088,5.022088,0.440460,3.462548,4.462548,6.555284,100,117,108,108,4.022088,3.462548,100,0.67,0.67,1.34,0.661503,0,0.496923,-0.677384,1,0.969603,1,0.950853,1,1.218303,1_100,1.25,1.22


In [5]:
# Re-declares FRAC_ADMIT with the same value assigned in the first code cell.
# The trailing comment keeps an alternative that is not executed: the sum of column A
# divided by the number of rows in df.
FRAC_ADMIT = 0.5#df[['A']].sum()/len(df)


In [6]:
###
## Total utility per stratum: ml_outcomes summed over the applicants in each (R, T) cell
###

df_ = (
    df.groupby(['R', 'T'])[['ml_outcomes']]
    .sum()
    .reset_index()
)

In [7]:
###
## Stratum sizes: number of applicants (non-null ml_outcomes) in each (R, T) cell
###

df_count = (
    df.groupby(['R', 'T'])[['ml_outcomes']]
    .count()
    .reset_index()
    .rename(columns={'ml_outcomes': 'Count'})
)
# N duplicates Count; N is the column the later cells read.
df_count['N'] = df_count['Count']

In [8]:
###
## One table per stratum: Race, Test Score, SUM(Utility), COUNT(applicants)
###

dff = (
    df_
    # attach N by row label; both summaries come from the same groupby keys
    .merge(df_count[['N']], left_index=True, right_index=True)
    # rank strata by total utility, highest first ...
    .sort_values(by='ml_outcomes', ascending=False)
    # ... moving the original row label into an 'index' column
    .reset_index()
    # restore the original stratum order
    .sort_values(by='index')
    # the rank position becomes 'level_0'; the new row labels run 0..n-1
    .reset_index()
)

In [9]:
# Final per-stratum table, shown in ascending order of summed ml_outcomes
# (display only; dff itself is not reassigned).
dff.sort_values(by='ml_outcomes')

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
0,192,0,0,11,0.03,1
1,191,1,0,13,0.04,1
2,190,2,0,14,0.04,1
3,189,3,0,15,0.12,3
4,188,4,0,16,0.15,3
...,...,...,...,...,...,...
45,4,45,0,57,1084.32,2008
48,3,48,0,60,1093.20,1822
43,2,43,0,55,1110.00,2220
44,1,44,0,56,1115.40,2145


### Set up the optimization problem

In [10]:
from ortools.linear_solver import pywraplp


In [11]:
# Create the OR-tools linear solver using the 'GLOP' backend; every variable, constraint
# and the objective in the cells below are registered on this one object.
# NOTE(review): CreateSolver may return None if the backend is unavailable -- confirm.
solver = pywraplp.Solver.CreateSolver('GLOP')


In [12]:
dff

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
0,192,0,0,11,0.03,1
1,191,1,0,13,0.04,1
2,190,2,0,14,0.04,1
3,189,3,0,15,0.12,3
4,188,4,0,16,0.15,3
...,...,...,...,...,...,...
188,161,188,1,98,2.42,2
189,170,189,1,99,1.22,1
190,169,190,1,100,1.22,1
191,168,191,1,101,1.22,1


In [13]:
# Decision variables, kept both positionally and by (R, T) stratum
applicant_stratum = []
vars_cache = {}

# Objective: maximize the expected utility of the admitted students
objective = solver.Objective()

for ix, row in dff.iterrows():
    # Admission probability for this stratum, named after its dff row label
    stratum_var = solver.NumVar(0.0, 1.0, str(ix))
    applicant_stratum.append(stratum_var)
    vars_cache[(row['R'], row['T'])] = stratum_var

    # Admitting the whole stratum yields the stratum's summed utility
    objective.SetCoefficient(stratum_var, float(row['ml_outcomes']))

objective.SetMaximization()


In [14]:
# Currently we have no constraints 
solver.NumConstraints()

0

In [15]:
# Admission budget: at most K admits, with K the FRAC_ADMIT share of applicants (truncated)
K = int(len(df)*FRAC_ADMIT)
print(K)
admit_quota = solver.Constraint(0, K)

# A stratum admitted with probability p uses p * N of the budget
for stratum_var, stratum_size in zip(applicant_stratum, dff['N']):
    admit_quota.SetCoefficient(stratum_var, float(stratum_size))

50000


In [16]:
# Now we have one constraint
solver.NumConstraints()

1

## Add CF Fair constraints

In [17]:
from collections import Counter

def convertListToProb(raw_list):
    """Turn a list of values into its empirical distribution.

    Returns a list of (probability, value) pairs, one per distinct value in
    raw_list, ordered by first appearance. Each probability is the value's
    share of the list. An empty list yields an empty result.
    """
    total = float(len(raw_list))
    return [(count / total, value) for value, count in Counter(raw_list).items()]

In [18]:
# For applicants with R == 0: per observed T, the list of their T_black_star values
T_blacks_list = (
    df.loc[df['R'] == 0]
    .groupby('T')['T_black_star']
    .apply(list)
    .reset_index(name='T_blacks')
)


In [19]:
# Replace each list of T_black_star values by its (probability, value) pairs
T_blacks_list['probs'] = T_blacks_list['T_blacks'].apply(convertListToProb)


In [20]:
T_blacks_list

Unnamed: 0,T,T_blacks,probs
0,11,[10],"[(1.0, 10)]"
1,13,[12],"[(1.0, 12)]"
2,14,[14],"[(1.0, 14)]"
3,15,"[13, 13, 15]","[(0.6666666666666666, 13), (0.3333333333333333..."
4,16,"[14, 16, 16]","[(0.3333333333333333, 14), (0.6666666666666666..."
...,...,...,...
99,112,[103],"[(1.0, 103)]"
100,113,[105],"[(1.0, 105)]"
101,114,[105],"[(1.0, 105)]"
102,118,"[110, 109]","[(0.5, 110), (0.5, 109)]"


In [21]:
# One equality constraint per R == 0 stratum:
#   sum_t P(T_black_star = t | T) * x[(1, t)]  -  x[(0, T)]  ==  0
# didntexist / exists count how many referenced (1, t) strata had to be created vs. were found.
didntexist = 0
exists = 0
for _, row in T_blacks_list.iterrows():
    cf_fair_stratum = solver.Constraint(0.0, 0.0)
    cf_fair_stratum.SetCoefficient(vars_cache[(0.0, row['T'])], -1.0)

    for weight, cf_score in row['probs']:
        cf_key = (1.0, cf_score)
        if cf_key in vars_cache:
            exists += 1
        else:
            # No applicants in this stratum: add a variable that appears only in
            # fairness constraints (it is not given objective or quota coefficients here).
            vars_cache[cf_key] = solver.NumVar(0.0, 1.0, str(cf_key))
            didntexist += 1
        cf_fair_stratum.SetCoefficient(vars_cache[cf_key], weight)
    

In [22]:
didntexist,exists

(9, 538)

In [23]:
# For applicants with R == 1: per observed T, the list of their T_white_star values
T_whites_list = (
    df.loc[df['R'] == 1]
    .groupby('T')['T_white_star']
    .apply(list)
    .reset_index(name='T_whites')
)

In [24]:
# Replace each list of T_white_star values by its (probability, value) pairs
T_whites_list['probs'] = T_whites_list['T_whites'].apply(convertListToProb)


In [25]:
# One equality constraint per R == 1 stratum:
#   sum_t P(T_white_star = t | T) * x[(0, t)]  -  x[(1, T)]  ==  0
# The counters are reset, so afterwards they describe this loop only.
didntexist = 0
exists = 0

for _, row in T_whites_list.iterrows():
    cf_fair_stratum = solver.Constraint(0.0, 0.0)
    cf_fair_stratum.SetCoefficient(vars_cache[(1.0, row['T'])], -1.0)

    for weight, cf_score in row['probs']:
        cf_key = (0.0, cf_score)
        if cf_key in vars_cache:
            exists += 1
        else:
            # No applicants in this stratum: add a variable that appears only in
            # fairness constraints (it is not given objective or quota coefficients here).
            vars_cache[cf_key] = solver.NumVar(0.0, 1.0, str(cf_key))
            didntexist += 1
        cf_fair_stratum.SetCoefficient(vars_cache[cf_key], weight)
    

In [26]:
# NOTE(review): basis_status is referenced without parentheses, so this cell displays the
# bound method object (see the output below) rather than a status value. It also only
# concerns the last constraint created by the preceding loop. Likely leftover debugging.
cf_fair_stratum.basis_status

<bound method Constraint.basis_status of <ortools.linear_solver.pywraplp.Constraint; proxy of <Swig Object of type 'operations_research::MPConstraint *' at 0x7fb3838e9f00> >>

In [27]:
solver.NumConstraints()

194

## Solve linear program

In [28]:
solver.ABNORMAL

4

In [29]:
# Solve the LP. The returned status code is only displayed next to solver.OPTIMAL in the
# cells below; nothing in the code stops the notebook if the solve is not optimal.
status = solver.Solve()


In [30]:
status

0

In [31]:
solver.OPTIMAL

0

In [32]:
# For every stratum variable: its id (the variable was named after the dff row label)
# and the admission probability chosen by the LP.
row = [int(str(stratum_var)) for stratum_var in applicant_stratum]
admit = [stratum_var.solution_value() for stratum_var in applicant_stratum]

df_decisions = pd.DataFrame({'row_id': row, 'decision': admit})

In [33]:
df_decisions

Unnamed: 0,row_id,decision
0,0,5.490719e-12
1,1,0.000000e+00
2,2,8.839817e-01
3,3,1.362190e-01
4,4,6.015690e-01
...,...,...
188,188,9.734256e-01
189,189,9.867128e-01
190,190,9.867128e-01
191,191,9.867128e-01


In [34]:
dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
46,0,46,0,58,1153.04,2059,46,4.990802e-01
44,1,44,0,56,1115.40,2145,44,4.990802e-01
43,2,43,0,55,1110.00,2220,43,4.990802e-01
48,3,48,0,60,1093.20,1822,48,4.990802e-01
45,4,45,0,57,1084.32,2008,45,4.990802e-01
...,...,...,...,...,...,...,...,...
4,188,4,0,16,0.15,3,4,6.015690e-01
3,189,3,0,15,0.12,3,3,1.362190e-01
2,190,2,0,14,0.04,1,2,8.839817e-01
1,191,1,0,13,0.04,1,1,0.000000e+00


In [35]:
# Stratum table with the LP decision attached by row label, highest total utility first
xxx = pd.merge(dff, df_decisions, left_index=True, right_index=True)
xxx = xxx.sort_values(by='ml_outcomes', ascending=False)
xxx

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
46,0,46,0,58,1153.04,2059,46,4.990802e-01
44,1,44,0,56,1115.40,2145,44,4.990802e-01
43,2,43,0,55,1110.00,2220,43,4.990802e-01
48,3,48,0,60,1093.20,1822,48,4.990802e-01
45,4,45,0,57,1084.32,2008,45,4.990802e-01
...,...,...,...,...,...,...,...,...
4,188,4,0,16,0.15,3,4,6.015690e-01
3,189,3,0,15,0.12,3,3,1.362190e-01
2,190,2,0,14,0.04,1,2,8.839817e-01
1,191,1,0,13,0.04,1,1,0.000000e+00


In [36]:
xxx.sort_values(by='decision',ascending=False)

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
101,176,101,0,114,0.99,1,101,1.000000e+00
102,163,102,0,118,1.98,2,102,1.000000e+00
192,167,192,1,109,1.23,1,192,1.000000e+00
98,179,98,0,110,0.99,1,98,1.000000e+00
99,178,99,0,112,0.99,1,99,1.000000e+00
...,...,...,...,...,...,...,...,...
0,192,0,0,11,0.03,1,0,5.490719e-12
106,183,106,1,12,0.27,1,106,0.000000e+00
104,184,104,1,9,0.27,1,104,0.000000e+00
105,185,105,1,11,0.27,1,105,0.000000e+00


In [37]:
# (Re)build the "<R>_<T>" stratum key on both frames so they can be joined on it
for frame in (df, xxx):
    frame['key'] = frame['R'].astype(str)+'_'+frame['T'].astype(str)

In [38]:
# Attach each applicant's stratum-level LP admission probability. df and xxx share column
# names, so pandas suffixes the overlaps: *_x comes from df, *_y from xxx.
admit_decisions = df.merge(xxx,how='left',on='key')

# Random baseline: admit each applicant independently with probability FRAC_ADMIT.
# A dedicated, seeded generator keeps the baseline reproducible across kernel restarts
# and leaves the global `random` state untouched.
RANDOM_POLICY_SEED = 0
_baseline_rng = random.Random(RANDOM_POLICY_SEED)
admit_decisions['decision_random'] = pd.Series(
    [_baseline_rng.random() for _ in range(len(admit_decisions))],
    index=admit_decisions.index,
) < FRAC_ADMIT

# LP policy: decision-weighted share of R among admits, and decision-weighted sum of B_p
FRAC_BLACK_POLICY = (admit_decisions['R_y'] * admit_decisions['decision']).sum()/admit_decisions['decision'].sum()
SUM_BP_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision']).sum()

# Same two metrics for the random baseline
FRAC_RANDOM_POLICY = (admit_decisions['R_y'] * admit_decisions['decision_random']).sum()/admit_decisions['decision_random'].sum()
SUM_BP_RAND_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision_random']).sum()


In [39]:
# Append one tab-separated result row: policy label, FRAC_BLACK_POLICY, SUM_BP_POLICY.
# The file is opened in append mode, so re-running this cell adds another row.
# The context manager closes the handle even if the write raises.
with open('./lp_results.csv','a') as file:
    file.write('{}\t{}\t{}\n'.format('Path-Specific Fairness',str(FRAC_BLACK_POLICY),str(SUM_BP_POLICY)))


In [40]:
# Sanity check: expected share of applicants admitted by the LP (stratum size times
# admission probability, summed over strata, divided by the number of applicants).
(xxx['N']*xxx['decision']).sum()/len(df)

0.49999999999999994