In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
# Raise pandas' display limits so wide/long summary tables are shown in full.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)

In [2]:
# Load the applicant table (one row per applicant) and derive the working
# utility column `ml_outcomes` by rounding `ml_outcomes_div` to 4 decimals.
df = pd.read_csv('./df_test.csv').assign(
    ml_outcomes=lambda applicants: applicants['ml_outcomes_div'].round(4)
)



In [3]:
# Number of applicants (the table holds one row per applicant).
len(df)

100000

In [4]:
# Share of applicants with A summed over all rows, relative to the table size.
# Selecting with [['A']] keeps the result a one-element Series keyed by 'A',
# which is why the scalar is read elsewhere as FRAC_ADMIT['A'].
FRAC_ADMIT = df[['A']].sum().div(len(df))


In [5]:
# Column legend:
#   R                = race
#   T                = test score
#   ml_outcomes      = expected utility from admitting the applicant
#   ml_outcomes_{..} = counterfactual utility given race
preview_columns = [
    'R', 'T', 'ml_outcomes',
    'T_black', 'T_white',
    'ml_outcomes_black', 'ml_outcomes_white',
    'ml_outcomes_decision',
]

# Show the applicants ordered from lowest to highest expected utility.
df[preview_columns].sort_values(by='ml_outcomes')



Unnamed: 0,R,T,ml_outcomes,T_black,T_white,ml_outcomes_black,ml_outcomes_white,ml_outcomes_decision
54672,0,23,0.0690,21,23,-2.852601,-2.601610,0
30167,0,24,0.0755,21,24,-2.852601,-2.505586,0
5963,0,24,0.0755,21,24,-2.852601,-2.505586,0
88102,0,24,0.0755,19,24,-3.044648,-2.505586,0
38263,0,24,0.0755,22,24,-2.756577,-2.505586,0
...,...,...,...,...,...,...,...,...
1338,1,101,1.2271,101,117,4.829291,6.424613,1
54317,1,103,1.2284,103,120,5.021339,6.712684,1
39894,1,104,1.2290,104,121,5.117362,6.808708,1
67613,1,104,1.2290,104,122,5.117362,6.904732,1


In [6]:
# Total utility obtained by admitting everyone in a stratum,
# where a stratum is one (R, T) combination.
df_ = (
    df[['R', 'T', 'ml_outcomes']]
    .groupby(['R', 'T'])
    .sum()
    .reset_index()
)

In [7]:
# Number of applicants in each (R, T) stratum, counted via the non-null
# `ml_outcomes` entries. The count is exposed under both `Count` and `N`.
df_count = (
    df[['R', 'T', 'ml_outcomes']]
    .groupby(['R', 'T'])
    .count()
    .reset_index()
    .rename(columns={'ml_outcomes': 'Count'})
    .assign(N=lambda strata: strata['Count'])
)

In [8]:
# Combine the two per-stratum summaries into one table with
# R, T, SUM(utility) as `ml_outcomes`, and COUNT(applicants) as `N`.
# Both summaries come from the same groupby keys, so they are joined on row position.
dff = (
    df_
    .merge(df_count[['N']], left_index=True, right_index=True)
    # Rank strata by total utility; after reset_index() the new index is the
    # rank (0 = highest utility) and `index` keeps the original row position.
    .sort_values(by='ml_outcomes', ascending=False)
    .reset_index()
    # Restore the original stratum order; the rank is kept as column `level_0`.
    .sort_values(by='index')
    .reset_index()
)

In [9]:
# Final per-stratum info table, displayed from lowest to highest total utility.
dff.sort_values(by='ml_outcomes')

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
0,182,0,0,23,0.069,1
2,181,2,0,25,0.2472,3
99,180,99,1,17,0.2728,1
1,179,1,0,24,0.302,4
100,178,100,1,20,0.5696,2
4,177,4,0,27,0.5892,6
3,176,3,0,26,0.72,8
90,175,90,0,113,0.9976,1
94,174,94,0,117,0.9984,1
97,173,97,0,123,0.9991,1


### Set up the optimization problem

In [10]:
from ortools.linear_solver import pywraplp


In [11]:
# Create the GLOP linear-programming solver. CreateSolver returns None when the
# requested backend is unavailable, so fail here with a clear message instead of
# hitting an AttributeError on the first use of `solver` further down.
solver = pywraplp.Solver.CreateSolver('GLOP')
if solver is None:
    raise RuntimeError("OR-Tools could not create the 'GLOP' solver")


In [12]:
# Per-stratum table in its original (R, T) order; each row gets one LP variable below.
dff

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
0,182,0,0,23,0.069,1
1,179,1,0,24,0.302,4
2,181,2,0,25,0.2472,3
3,176,3,0,26,0.72,8
4,177,4,0,27,0.5892,6
5,167,5,0,28,1.605,15
6,156,6,0,29,3.2648,28
7,155,7,0,30,3.4236,27
8,151,8,0,31,4.9608,36
9,144,9,0,32,7.48,50


In [13]:
# One LP variable per stratum, bounded to [0, 1]: the probability of admitting
# an applicant from that stratum. Variables are kept both in dff row order
# (applicant_stratum) and keyed by the stratum's (R, T) pair (vars_cache).
applicant_stratum = []
vars_cache = {}

# Objective: maximize the expected utility of the admitted students.
objective = solver.Objective()
objective.SetMaximization()

for position, stratum in dff.iterrows():
    # The variable is named after the stratum's row index in dff.
    admit_prob = solver.NumVar(0.0, 1.0, str(position))
    applicant_stratum.append(admit_prob)
    vars_cache[(stratum['R'], stratum['T'])] = admit_prob

    # The payoff of admitting a stratum is its total utility.
    objective.SetCoefficient(admit_prob, float(stratum['ml_outcomes']))


In [14]:
# Sanity check: no constraints have been added to the solver yet.
solver.NumConstraints()

0

In [15]:
# Constraint: admit at most K applicants in expectation.
# FRAC_ADMIT is a one-element Series keyed by 'A', so the scalar is extracted
# explicitly (calling int() directly on a Series is deprecated in pandas).
# Rounding rather than truncating prevents floating-point error in
# len(df) * (sum / len(df)) from silently dropping one seat.
K = int(round(len(df) * float(FRAC_ADMIT['A'])))
print(K)
admit_quota = solver.Constraint(0, K)

# Each stratum contributes N * (admission probability) expected admits,
# and the total over all strata may not exceed K.
for ix, row in dff.iterrows():
    admit_quota.SetCoefficient(applicant_stratum[ix], float(row['N']))

27139


In [16]:
# Sanity check: the admission quota is the only constraint so far.
solver.NumConstraints()

1

## Add CF Fair constraints

In [17]:
from collections import Counter

def convertListToProb(raw_list):
    """Turn a list of values into its empirical distribution.

    Returns a list of ``(probability, value)`` tuples, one per distinct value,
    in order of first appearance. Each probability is the value's share of
    ``raw_list``. An empty input gives an empty list.
    """
    total = float(len(raw_list))
    return [(occurrences / total, value)
            for value, occurrences in Counter(raw_list).items()]

In [18]:
# For every observed test score T among applicants with R == 0, gather the
# list of their `T_black_star` values (presumably the counterfactual score
# under the other race -- confirm against the data dictionary).
T_blacks_list = (
    df[df['R'] == 0][['T', 'T_black_star']]
    .groupby('T')['T_black_star']
    .apply(list)
    .reset_index(name='T_blacks')
)


In [19]:
# Convert each list of T_black_star values into (probability, score) pairs.
T_blacks_list['probs'] = T_blacks_list['T_blacks'].apply(convertListToProb)


In [20]:
# Inspect the per-score lists and their empirical distributions.
T_blacks_list

Unnamed: 0,T,T_blacks,probs
0,23,[22],"[(1.0, 22)]"
1,24,"[23, 22, 22, 22]","[(0.25, 23), (0.75, 22)]"
2,25,"[22, 21, 24]","[(0.3333333333333333, 22), (0.3333333333333333..."
3,26,"[27, 24, 25, 22, 24, 24, 24, 26]","[(0.125, 27), (0.5, 24), (0.125, 25), (0.125, ..."
4,27,"[24, 26, 25, 25, 26, 25]","[(0.16666666666666666, 24), (0.333333333333333..."
5,28,"[26, 27, 27, 27, 26, 28, 25, 24, 26, 27, 27, 2...","[(0.26666666666666666, 26), (0.4, 27), (0.2, 2..."
6,29,"[27, 27, 29, 28, 28, 26, 27, 28, 29, 27, 27, 2...","[(0.35714285714285715, 27), (0.107142857142857..."
7,30,"[27, 31, 28, 28, 29, 27, 31, 27, 28, 29, 30, 2...","[(0.18518518518518517, 27), (0.148148148148148..."
8,31,"[28, 29, 29, 28, 29, 30, 29, 30, 32, 30, 30, 2...","[(0.2222222222222222, 28), (0.3611111111111111..."
9,32,"[30, 30, 30, 32, 29, 30, 30, 31, 30, 30, 29, 3...","[(0.38, 30), (0.1, 32), (0.18, 29), (0.34, 31)]"


In [21]:
# One equality constraint per observed score T among R == 0 applicants:
#   sum_t' P(T_black_star = t' | T) * x[(1, t')] - x[(0, T)] == 0
# i.e. the admission probability of stratum (0, T) must equal the
# probability-weighted admission probability of the matching R == 1 strata.
# `didntexist` / `exists` count how often a referenced (1, t') variable had to
# be created here versus already being present in vars_cache.
didntexist = 0
exists = 0
for _, score_row in T_blacks_list.iterrows():
    cf_fair_stratum = solver.Constraint(0.0, 0.0)

    observed_T = score_row['T']
    cf_fair_stratum.SetCoefficient(vars_cache[(0.0, observed_T)], -1.0)

    for weight, other_T in score_row['probs']:
        other_key = (1.0, other_T)
        if other_key in vars_cache:
            exists += 1
        else:
            # No applicant stratum has this (R, T); add a fresh [0, 1] variable for it.
            vars_cache[other_key] = solver.NumVar(0.0, 1.0, str(other_key))
            didntexist += 1
        cf_fair_stratum.SetCoefficient(vars_cache[other_key], weight)
    

In [22]:
# (variables created on the fly, variables that already existed)
didntexist,exists

(11, 449)

In [23]:
# For every observed test score T among applicants with R == 1, gather the
# list of their `T_white_star` values (presumably the counterfactual score
# under the other race -- confirm against the data dictionary).
T_whites_list = (
    df[df['R'] == 1][['T', 'T_white_star']]
    .groupby('T')['T_white_star']
    .apply(list)
    .reset_index(name='T_whites')
)

In [24]:
# Convert each list of T_white_star values into (probability, score) pairs.
T_whites_list['probs'] = T_whites_list['T_whites'].apply(convertListToProb)


In [25]:
# Mirror image of the R == 0 constraints: one equality per observed score T
# among R == 1 applicants,
#   sum_t' P(T_white_star = t' | T) * x[(0, t')] - x[(1, T)] == 0
# `didntexist` / `exists` again count created versus already-present variables.
didntexist = 0
exists = 0

for _, score_row in T_whites_list.iterrows():
    cf_fair_stratum = solver.Constraint(0.0, 0.0)

    observed_T = score_row['T']
    cf_fair_stratum.SetCoefficient(vars_cache[(1.0, observed_T)], -1.0)

    for weight, other_T in score_row['probs']:
        other_key = (0.0, other_T)
        if other_key in vars_cache:
            exists += 1
        else:
            # No applicant stratum has this (R, T); add a fresh [0, 1] variable for it.
            vars_cache[other_key] = solver.NumVar(0.0, 1.0, str(other_key))
            didntexist += 1
        cf_fair_stratum.SetCoefficient(vars_cache[other_key], weight)
    

In [26]:
# NOTE(review): `basis_status` is referenced without parentheses, so this cell
# only displays the bound method object, not a basis status value.
cf_fair_stratum.basis_status

<bound method Constraint.basis_status of <ortools.linear_solver.pywraplp.Constraint; proxy of <Swig Object of type 'operations_research::MPConstraint *' at 0x7fdca8cf8ba0> >>

In [27]:
# Total constraint count: the admission quota plus one fairness constraint per score row.
solver.NumConstraints()

184

## Solve linear program

In [28]:
# Numeric code of the solver's ABNORMAL result status, shown for reference.
solver.ABNORMAL

4

In [29]:
# Solve the LP. The solution values read afterwards are only meaningful for an
# optimal solve, so stop here on any other result status instead of relying on
# a manual comparison of the displayed status codes.
status = solver.Solve()
if status != solver.OPTIMAL:
    raise RuntimeError(
        'LP solve did not reach OPTIMAL (result status code {})'.format(status)
    )


In [30]:
# Result status code returned by solver.Solve().
status

0

In [31]:
# Numeric code of the OPTIMAL result status, for comparison with `status`.
solver.OPTIMAL

0

In [32]:
# Read the solved admission probability of every stratum variable.
# Each variable's string form is parsed back into an int to recover the dff
# row index it was named after.
row = [int(str(stratum_var)) for stratum_var in applicant_stratum]
admit = [stratum_var.solution_value() for stratum_var in applicant_stratum]

df_decisions = pd.DataFrame({'row_id': row, 'decision': admit})

In [33]:
# Solved admission probability (`decision`) per stratum row.
df_decisions

Unnamed: 0,row_id,decision
0,0,0.26298
1,1,0.255011
2,2,0.278919
3,3,0.269575
4,4,0.270777
5,5,0.270798
6,6,0.270816
7,7,0.270807
8,8,0.270808
9,9,0.270808


In [34]:
# Join each stratum with its solved admission probability (matched on row
# position) and list strata from highest to lowest total utility.
dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
132,0,132,1,52,1443.91,1885,132,0.270808
133,1,133,1,53,1435.8564,1818,133,0.270808
131,2,131,1,51,1423.898,1919,131,0.270808
134,3,134,1,54,1392.5408,1712,134,0.270808
130,4,130,1,50,1386.458,1931,130,0.270808
135,5,135,1,55,1367.0044,1634,135,0.270808
128,6,128,1,48,1350.1856,2014,128,0.270808
129,7,129,1,49,1327.1192,1912,129,0.270808
136,8,136,1,56,1295.9752,1508,136,0.270808
137,9,137,1,57,1293.3072,1467,137,0.270808


In [35]:
# Per-stratum table with the solved admission probability attached (joined on
# row position), ordered from highest to lowest total utility.
xxx = (
    dff
    .merge(df_decisions, left_index=True, right_index=True)
    .sort_values(by='ml_outcomes', ascending=False)
)
xxx

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
132,0,132,1,52,1443.91,1885,132,0.270808
133,1,133,1,53,1435.8564,1818,133,0.270808
131,2,131,1,51,1423.898,1919,131,0.270808
134,3,134,1,54,1392.5408,1712,134,0.270808
130,4,130,1,50,1386.458,1931,130,0.270808
135,5,135,1,55,1367.0044,1634,135,0.270808
128,6,128,1,48,1350.1856,2014,128,0.270808
129,7,129,1,49,1327.1192,1912,129,0.270808
136,8,136,1,56,1295.9752,1508,136,0.270808
137,9,137,1,57,1293.3072,1467,137,0.270808


In [36]:
# Strata ordered by admission probability, most likely admitted first.
xxx.sort_values(by='decision',ascending=False)

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
90,175,90,0,113,0.9976,1,90,1.0
94,174,94,0,117,0.9984,1,94,1.0
97,173,97,0,123,0.9991,1,97,1.0
87,165,87,0,110,1.9936,2,87,1.0
95,162,95,0,118,1.997,2,95,1.0
178,160,178,1,99,2.4508,2,178,1.0
85,143,85,0,108,7.9696,8,85,1.0
181,159,181,1,104,2.458,2,181,1.0
98,157,98,0,124,2.9976,3,98,1.0
89,147,89,0,112,5.9844,6,89,1.0


In [37]:
# Give the applicant table and the stratum table the same "<R>_<T>" string key
# so that each applicant can be matched to the decision for their stratum.
for frame in (df, xxx):
    frame['key'] = frame['R'].astype(str) + '_' + frame['T'].astype(str)

In [38]:
# Attach to every applicant the admission probability of their stratum.
# Both inputs carry R / T / ml_outcomes columns, so pandas suffixes them
# _x (applicant table) and _y (stratum table).
admit_decisions = df.merge(xxx,how='left',on='key')

# Random baseline: admit each applicant independently with probability
# FRAC_ADMIT['A']. A dedicated seeded generator keeps the baseline
# reproducible across kernel restarts and leaves the global `random` state alone.
RANDOM_POLICY_SEED = 0
baseline_rng = random.Random(RANDOM_POLICY_SEED)
admit_decisions['decision_random'] = pd.Series(
    [baseline_rng.random() for _ in range(len(admit_decisions))],
    index=admit_decisions.index,
) < FRAC_ADMIT['A']

# LP policy: decision-weighted share of R == 1 applicants among admits, and
# the decision-weighted sum of the applicants' `B_p` column.
# NOTE(review): the meaning of `B_p` is not visible in this notebook -- confirm.
FRAC_BLACK_POLICY = (admit_decisions['R_y'] * admit_decisions['decision']).sum()/admit_decisions['decision'].sum()
SUM_BP_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision']).sum()

# The same two quantities under the random baseline.
FRAC_RANDOM_POLICY = (admit_decisions['R_y'] * admit_decisions['decision_random']).sum()/admit_decisions['decision_random'].sum()
SUM_BP_RAND_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision_random']).sum()


In [39]:
# Append this policy's results as one tab-separated line:
# policy name, fraction from FRAC_BLACK_POLICY, total from SUM_BP_POLICY.
# The `with` block closes the file even if the write raises.
with open('./lp_results.csv','a') as file:
    file.write('{}\t{}\t{}\n'.format('Path-Specific Fairness',str(FRAC_BLACK_POLICY),str(SUM_BP_POLICY)))
