In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
# Raise pandas' display limits so that long/wide frames shown as cell output
# are rendered in full instead of being elided.
pd.options.display.max_rows = 5000
pd.options.display.max_columns = 500

In [81]:
###
## Load the applicant-level dataframe: each row is a single applicant.
###

# The path is relative to the notebook's working directory.
df = pd.read_csv('./df_test.csv')
# Utility used throughout the notebook: 'ml_outcomes_div' rounded to one decimal place.
# The trailing commented-out term (an offset of 0.13 * R) is currently disabled.
# NOTE(review): the saved table output below shows two-decimal values (e.g. 0.01, 1.33),
# which round(1) cannot produce -- confirm the saved outputs were generated by this code.
df['ml_outcomes'] = df['ml_outcomes_div'].round(1) #+ df['R']*0.13



In [82]:
# Number of applicant rows loaded.
len(df)

1000000

In [83]:
# Sum of column 'A' divided by the number of rows.
# Because the column is selected with df[['A']], the result is a one-element
# Series indexed by 'A' (read later as FRAC_ADMIT['A']), not a plain float.
# NOTE(review): presumably 'A' is a 0/1 "admitted" flag, making this the admit rate -- confirm.
FRAC_ADMIT = df[['A']].sum()/len(df)


In [84]:
##
# Column legend:
#   R                          = race
#   T                          = test score
#   ml_outcomes                = expected utility from admitting the applicant
#   ml_outcomes_{black,white}  = counterfactual utility given race
# NOTE(review): T_black / T_white look like the matching counterfactual test scores,
# and ml_outcomes_decision like a 0/1 decision flag -- confirm against the data dictionary.
# The frame is shown sorted by ascending ml_outcomes; df itself is not modified.
##

df[['R','T','ml_outcomes','T_black','T_white','ml_outcomes_black','ml_outcomes_white','ml_outcomes_decision']].sort_values(by='ml_outcomes')



Unnamed: 0,R,T,ml_outcomes,T_black,T_white,ml_outcomes_black,ml_outcomes_white,ml_outcomes_decision
612172,0,7,0.01,6,7,-4.540866,-4.259679,0
334045,0,7,0.01,2,7,-4.905552,-4.259679,0
284712,0,7,0.01,3,7,-4.814380,-4.259679,0
898343,0,12,0.02,7,12,-4.449695,-3.803823,0
911496,0,12,0.02,7,12,-4.449695,-3.803823,0
...,...,...,...,...,...,...,...,...
943810,1,112,1.33,112,130,5.123298,6.954398,1
966919,1,117,1.33,117,137,5.579155,7.592598,1
747470,1,111,1.33,111,132,5.032127,7.136741,1
254086,1,111,1.33,111,130,5.032127,6.954398,1


In [85]:
###
## Total utility obtained by admitting everyone in a stratum.
## A stratum is one (R, T) combination; the result has one row per stratum
## with columns R, T and the summed ml_outcomes.
###

df_ = df.groupby(['R','T'], as_index=False)['ml_outcomes'].sum()

In [86]:
###
## Number of applicants in each (R, T) stratum.
## The count is taken over non-null ml_outcomes values and exposed twice,
## as 'Count' and as 'N'.
###

df_count = (
    df.groupby(['R','T'], as_index=False)['ml_outcomes']
      .count()
      .rename(columns={'ml_outcomes': 'Count'})
      .assign(N=lambda counts: counts['Count'])
)

In [87]:
###
## Combine the summary tables into one table per stratum:
## Race, Test Score, SUM(Utility), COUNT(applicants).
## The two tables are joined on their row index (both have one row per
## (R, T) stratum in the same group order), not on the R/T columns.
###

dff = (
    df_
    .merge(df_count[['N']], left_index=True, right_index=True)
    .sort_values(by='ml_outcomes', ascending=False)
    .reset_index()               # adds 'index': the stratum's original row position
    .sort_values(by='index')     # back to the original stratum order
    .reset_index()               # adds 'level_0': rank by total utility (0 = highest)
)

In [88]:
# Final per-stratum info table, displayed in ascending order of total utility.
# This is a view for inspection only; dff keeps its original row order.
dff.sort_values(by='ml_outcomes')

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
1,250,1,0,8,0.02,1
2,251,2,0,9,0.02,1
0,249,0,0,7,0.03,3
3,248,3,0,10,0.06,3
4,247,4,0,11,0.1,5
5,246,5,0,12,0.28,14
140,244,140,1,7,0.34,1
138,245,138,1,5,0.34,1
6,243,6,0,13,0.36,18
7,242,7,0,14,0.6,20


### Set up the optimization problem

In [89]:
from ortools.linear_solver import pywraplp


In [90]:
# Create the linear-programming solver using the 'GLOP' backend.
# CreateSolver returns None (instead of raising) when the requested backend is
# not available, so fail here rather than with an AttributeError in a later cell.
solver = pywraplp.Solver.CreateSolver('GLOP')
if solver is None:
    raise RuntimeError("OR-tools could not create a 'GLOP' solver")


In [91]:
# Per-stratum table in its stored row order (the order the LP variables are created in).
dff

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
0,249,0,0,7,0.03,3
1,250,1,0,8,0.02,1
2,251,2,0,9,0.02,1
3,248,3,0,10,0.06,3
4,247,4,0,11,0.1,5
5,246,5,0,12,0.28,14
6,243,6,0,13,0.36,18
7,242,7,0,14,0.6,20
8,230,8,0,15,1.11,37
9,223,9,0,16,1.71,57


In [92]:
# Decision variables, one per row of dff (i.e. per (R, T) stratum):
#   applicant_stratum -- variables in dff row order
#   vars_cache        -- the same variables keyed by (R, T)
applicant_stratum = []
vars_cache = {}

# Objective: Maximize the expected utility of the admitted students
objective = solver.Objective()

# For each stratum
for ix, row in dff.iterrows():
    # Probability of admission for this stratum, in [0, 1].
    # The variable is named after the dff row label; a later cell parses the
    # name back into an integer row id.
    numvar = solver.NumVar(0.0, 1.0, str(ix))

    # store variable by position, and also by stratum (R, T)
    applicant_stratum.append(numvar)
    vars_cache[(row['R'],row['T'])] = numvar

    # Benefit of admitting people is total utility in that stratum.
    # Use the variable just created instead of applicant_stratum[ix]: indexing
    # the list with a DataFrame label is only right while dff's index is 0..n-1.
    objective.SetCoefficient(numvar, float(row['ml_outcomes']))
objective.SetMaximization()


In [93]:
# Sanity check: no constraints have been added to the solver yet (expect 0).
solver.NumConstraints()

0

In [94]:
# Constraint: at most K applicants are admitted in expectation, where K is the
# overall admit fraction times the number of applicants.
# FRAC_ADMIT is a one-element Series, so take the scalar explicitly (int() on a
# Series is deprecated in recent pandas), and round before converting so that
# floating-point error in the product cannot truncate K down by one.
K = int(round(len(df) * FRAC_ADMIT['A']))
print(K)
admit_quota = solver.Constraint(0, K)

# Total applicants cannot exceed K: sum over strata of N * P(admit) <= K.
# applicant_stratum was filled in dff row order, so pair the two positionally
# rather than indexing the list with DataFrame labels.
for numvar, stratum_size in zip(applicant_stratum, dff['N']):
    admit_quota.SetCoefficient(numvar, float(stratum_size))

375084


In [95]:
# Sanity check: the admission quota is the only constraint so far (expect 1).
solver.NumConstraints()

1

## Add CF Fair constraints

In [96]:
from collections import Counter

def convertListToProb(raw_list):
    """Turn a list of values into its empirical distribution.

    Args:
        raw_list: list of hashable values (test scores in this notebook).

    Returns:
        A list of ``(probability, value)`` tuples, one per distinct value, in
        order of first appearance. ``probability`` is the value's relative
        frequency in ``raw_list``. An empty input yields an empty list.
    """
    total = float(len(raw_list))
    return [(count / total, value) for value, count in Counter(raw_list).items()]

In [97]:
# For applicants with R == 0, gather per observed test score T the list of their
# 'T_black_star' values: one row per T, with the list in column 'T_blacks'.
# NOTE(review): 'T_black_star' is presumably the counterfactual score under the
# other race -- confirm against the data-generation code.
T_blacks_list = (
    df.loc[df['R']==0, ['T','T_black_star']]
      .groupby('T')['T_black_star']
      .apply(list)
      .reset_index(name='T_blacks')
)


In [98]:
# Convert each list of counterfactual scores into (probability, score) pairs,
# i.e. the empirical distribution of counterfactual scores for that T.
T_blacks_list['probs'] = T_blacks_list['T_blacks'].apply(convertListToProb)


In [99]:
# Inspect the per-T counterfactual score lists and their empirical distributions.
T_blacks_list

Unnamed: 0,T,T_blacks,probs
0,7,"[6, 5, 8]","[(0.3333333333333333, 6), (0.3333333333333333,..."
1,8,[9],"[(1.0, 9)]"
2,9,[10],"[(1.0, 10)]"
3,10,"[12, 11, 12]","[(0.6666666666666666, 12), (0.3333333333333333..."
4,11,"[10, 10, 11, 10, 12]","[(0.6, 10), (0.2, 11), (0.2, 12)]"
5,12,"[13, 13, 11, 12, 14, 12, 11, 15, 14, 13, 16, 1...","[(0.21428571428571427, 13), (0.214285714285714..."
6,13,"[12, 12, 12, 13, 14, 11, 15, 13, 12, 12, 12, 1...","[(0.3888888888888889, 12), (0.2777777777777778..."
7,14,"[14, 13, 13, 13, 15, 15, 13, 15, 13, 16, 13, 1...","[(0.25, 14), (0.35, 13), (0.25, 15), (0.1, 16)..."
8,15,"[17, 14, 15, 14, 15, 14, 13, 14, 15, 16, 15, 1...","[(0.02702702702702703, 17), (0.216216216216216..."
9,16,"[18, 16, 16, 16, 18, 17, 16, 15, 15, 15, 16, 1...","[(0.12280701754385964, 18), (0.421052631578947..."


In [100]:
# One equality constraint per observed score T among R == 0 applicants:
#     P(admit | R=0, T)  ==  sum_k  p_k * P(admit | R=1, T_k)
# where (p_k, T_k) is the empirical distribution of counterfactual scores for
# that T. Counterfactual strata with no LP variable yet get a fresh [0, 1]
# variable; it has no objective or quota coefficient.
# didntexist / exists count how many counterfactual strata were new vs. known.
didntexist = 0
exists = 0
for ix, row in T_blacks_list.iterrows():
    White_T, Blacks_Ts = row['T'], row['probs']

    # lower bound == upper bound == 0 makes this an equality constraint
    cf_fair_stratum = solver.Constraint(0.0, 0.0)
    cf_fair_stratum.SetCoefficient(vars_cache[(0.0, White_T)], -1.0)

    for weight, black_T in Blacks_Ts:
        cf_key = (1.0, black_T)
        if cf_key in vars_cache:
            exists += 1
        else:
            vars_cache[cf_key] = solver.NumVar(0.0, 1.0, str(cf_key))
            didntexist += 1
        cf_fair_stratum.SetCoefficient(vars_cache[cf_key], weight)
    

In [101]:
# (counterfactual strata that needed a new variable, counterfactual strata already present)
didntexist,exists

(13, 888)

In [102]:
# For applicants with R == 1, gather per observed test score T the list of their
# 'T_white_star' values: one row per T, with the list in column 'T_whites'.
# NOTE(review): 'T_white_star' is presumably the counterfactual score under the
# other race -- confirm against the data-generation code.
T_whites_list = (
    df.loc[df['R']==1, ['T','T_white_star']]
      .groupby('T')['T_white_star']
      .apply(list)
      .reset_index(name='T_whites')
)

In [103]:
# Convert each list of counterfactual scores into (probability, score) pairs,
# i.e. the empirical distribution of counterfactual scores for that T.
T_whites_list['probs'] = T_whites_list['T_whites'].apply(convertListToProb)


In [104]:
# Mirror image of the R == 0 constraints. One equality per observed score T
# among R == 1 applicants:
#     P(admit | R=1, T)  ==  sum_k  p_k * P(admit | R=0, T_k)
# where (p_k, T_k) is the empirical distribution of counterfactual scores for
# that T. Missing counterfactual strata get a fresh [0, 1] variable.
# The counters are reset, so they describe this loop only.
didntexist = 0
exists = 0

for ix, row in T_whites_list.iterrows():
    Black_T, White_Ts = row['T'], row['probs']

    # lower bound == upper bound == 0 makes this an equality constraint
    cf_fair_stratum = solver.Constraint(0.0, 0.0)
    cf_fair_stratum.SetCoefficient(vars_cache[(1.0, Black_T)], -1.0)

    for weight, white_T in White_Ts:
        cf_key = (0.0, white_T)
        if cf_key in vars_cache:
            exists += 1
        else:
            vars_cache[cf_key] = solver.NumVar(0.0, 1.0, str(cf_key))
            didntexist += 1

        cf_fair_stratum.SetCoefficient(vars_cache[cf_key], weight)
    

In [105]:
# Debug probe on the last constraint created by the loop above.
# NOTE(review): the method is referenced but not called (no parentheses), so the
# cell only shows the bound-method repr; it also runs before solver.Solve(), so
# no basis status is available yet. Consider removing this cell.
cf_fair_stratum.basis_status

<bound method Constraint.basis_status of <ortools.linear_solver.pywraplp.Constraint; proxy of <Swig Object of type 'operations_research::MPConstraint *' at 0x7f7d303f9330> >>

In [106]:
# Total constraints: the admit quota plus one equality per row of
# T_blacks_list and of T_whites_list.
solver.NumConstraints()

253

## Solve linear program

In [107]:
# Integer code of the ABNORMAL result status, for reference when reading the
# value returned by Solve().
solver.ABNORMAL

4

In [108]:
# Solve the LP. Later cells read solution values unconditionally, so stop here
# if the solver did not report an optimal solution.
status = solver.Solve()
if status != solver.OPTIMAL:
    raise RuntimeError('LP was not solved to optimality (result status {})'.format(status))


In [109]:
# Result status code returned by Solve(); compare with solver.OPTIMAL.
status

0

In [110]:
# Integer code of the OPTIMAL result status; `status` should equal this value.
solver.OPTIMAL

0

In [111]:
# Read the solved admission probability of every stratum variable.
# Each variable's string form is the name it was created with (the dff row
# label), so parsing it gives back the integer row id.
row = [int(str(stratum_var)) for stratum_var in applicant_stratum]
admit = [stratum_var.solution_value() for stratum_var in applicant_stratum]

# One row per stratum: its dff row id and its admission probability.
df_decisions = pd.DataFrame({'row_id':row,'decision':admit})

In [112]:
# Per-stratum admission probabilities produced by the LP.
df_decisions

Unnamed: 0,row_id,decision
0,0,0.122116
1,1,0.315265
2,2,0.382795
3,3,0.373355
4,4,0.378732
5,5,0.374671
6,6,0.374574
7,7,0.375199
8,8,0.375072
9,9,0.37506


In [113]:
# Attach each stratum's admission probability to the summary table (joined on
# row index) and show strata with the highest total utility first.
# NOTE(review): the next cell evaluates the same expression and stores it in `xxx`;
# this display-only cell is redundant.
dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
53,0,53,0,60,11091.2,17330,53,0.375056
54,1,54,0,61,11012.1,16685,54,0.375056
51,2,51,0,58,10999.8,18333,51,0.375056
52,3,52,0,59,10959.12,17676,52,0.375056
56,4,56,0,63,10742.9,15347,56,0.375056
55,5,55,0,62,10718.16,15762,55,0.375056
50,6,50,0,57,10692.63,18759,50,0.375056
57,7,57,0,64,10488.96,14568,57,0.375056
49,8,49,0,56,10398.3,18906,49,0.375056
58,9,58,0,65,10298.58,13917,58,0.375056


In [114]:
# Per-stratum summary joined (on row index) with the LP admission probability,
# ordered by descending total utility. Later cells merge this back onto df.
xxx = (
    dff.merge(df_decisions, left_index=True, right_index=True)
       .sort_values(by='ml_outcomes', ascending=False)
)
xxx

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
53,0,53,0,60,11091.2,17330,53,0.375056
54,1,54,0,61,11012.1,16685,54,0.375056
51,2,51,0,58,10999.8,18333,51,0.375056
52,3,52,0,59,10959.12,17676,52,0.375056
56,4,56,0,63,10742.9,15347,56,0.375056
55,5,55,0,62,10718.16,15762,55,0.375056
50,6,50,0,57,10692.63,18759,50,0.375056
57,7,57,0,64,10488.96,14568,57,0.375056
49,8,49,0,56,10398.3,18906,49,0.375056
58,9,58,0,65,10298.58,13917,58,0.375056


In [115]:
# Strata with the highest admission probability first (display only).
xxx.sort_values(by='decision',ascending=False)

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
129,207,129,0,136,5.0,5,129,1.0
135,236,135,0,145,1.0,1,135,1.0
127,212,127,0,134,4.0,4,127,1.0
126,231,126,0,133,1.0,1,126,1.0
128,232,128,0,135,1.0,1,128,1.0
130,222,130,0,138,2.0,2,130,1.0
137,234,137,0,150,1.0,1,137,1.0
136,235,136,0,147,1.0,1,136,1.0
125,233,125,0,132,1.0,1,125,1.0
134,237,134,0,143,1.0,1,134,1.0


In [116]:
# Build a string stratum key "<R>_<T>" on both frames so that per-stratum
# decisions can be merged back onto individual applicants.
# Note: this adds a 'key' column to df and xxx in place.
df['key'] = df['R'].astype(str).str.cat(df['T'].astype(str), sep='_')
xxx['key'] = xxx['R'].astype(str).str.cat(xxx['T'].astype(str), sep='_')

In [117]:
# Broadcast each stratum's LP admission probability back onto individual
# applicants. Columns present in both frames get _x (from df) / _y (from xxx)
# suffixes. validate='many_to_one' makes pandas raise if a stratum key is
# duplicated in xxx, which would otherwise silently duplicate applicant rows.
admit_decisions = df.merge(xxx,how='left',on='key',validate='many_to_one')

# Baseline policy: admit each applicant independently with probability
# FRAC_ADMIT['A']. The generator is seeded so the baseline is reproducible on
# Restart & Run All, and the draw is vectorised instead of a Python-level loop.
rng = np.random.default_rng(0)
admit_decisions['decision_random'] = rng.random(len(admit_decisions)) < FRAC_ADMIT['A']

# LP policy: decision-weighted mean of R among admitted applicants, and
# decision-weighted sum of column B_p.
# NOTE(review): the names suggest R == 1 marks black applicants and B_p is a
# per-applicant outcome -- confirm.
FRAC_BLACK_POLICY = (admit_decisions['R_y'] * admit_decisions['decision']).sum()/admit_decisions['decision'].sum()
SUM_BP_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision']).sum()

# Same two quantities under the random baseline.
FRAC_RANDOM_POLICY = (admit_decisions['R_y'] * admit_decisions['decision_random']).sum()/admit_decisions['decision_random'].sum()
SUM_BP_RAND_POLICY = ( admit_decisions['B_p'] * admit_decisions['decision_random']).sum()


In [118]:
# Append one result row (policy name, FRAC_BLACK_POLICY, SUM_BP_POLICY) to the
# shared results file. Fields are tab-separated even though the file is named
# .csv. The context manager closes the handle even if write() raises.
with open('./lp_results.csv','a') as file:
    file.write('{}\t{}\t{}\n'.format('Path-Specific Fairness',str(FRAC_BLACK_POLICY),str(SUM_BP_POLICY)))
