In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os

# Run parameters are injected through environment variables. When a variable is
# unset, os.environ.get returns None and float(None) raises TypeError.
# DIVERSITY_UTILITY is the weight multiplied by R when stratum_utility is computed.
DIVERSITY_UTILITY = float(os.environ.get('DIVERSITY_UTILITY'))
# FRAC_ADMIT is the fraction of applicants used to size the admission quota.
# NOTE(review): a later cell reassigns FRAC_ADMIT to a hard-coded 0.5, so the
# value read here is not the one the optimization uses.
FRAC_ADMIT = float(os.environ.get('FRAC_ADMIT'))

In [2]:
###
## Load the evaluation applicants (one row per applicant) and tag each row
## with its stratum key "<R>_<T>"
###

df = pd.read_csv('./df_test.csv')
df['key'] = df['R'].astype(str).str.cat(df['T'].astype(str), sep="_")

###
## Estimate the mean outcome Y within each (R, T) stratum from the training
## data, then add DIVERSITY_UTILITY * R to obtain the per-applicant utility
###

df_train = pd.read_csv('./df_train.csv')
df_stratum_utility = df_train.groupby(['R', 'T'], as_index=False)['Y'].mean()

mean_outcome = df_stratum_utility['Y']
diversity_bonus = DIVERSITY_UTILITY * df_stratum_utility['R']
df_stratum_utility['stratum_utility'] = (mean_outcome + diversity_bonus).round(2)
df_stratum_utility['key'] = (
    df_stratum_utility['R'].astype(str)
    .str.cat(df_stratum_utility['T'].astype(str), sep="_")
)

# Inner join on the stratum key: applicants whose stratum does not occur in
# the training data are dropped from df.
utility_lookup = df_stratum_utility[['stratum_utility', 'key']]
df = df.merge(utility_lookup, on='key')
df['ml_outcomes'] = df['stratum_utility']

In [3]:
df

Unnamed: 0.1,Unnamed: 0,R,E_noise,E_minority,E_majority,M_noise,M_minority,M_majority,T_noise,T_minority,...,T,Y_unif,Y_reject_noise,Y,Y_raw,Y_reject,Y_reject_raw,key,stratum_utility,ml_outcomes
0,1,0,1.389398,1.389398,2.389398,-1.253731,-0.864334,0.135666,3.772786,55,...,64,0.555882,0.070407,0,0.533865,0,0.409911,0_64,0.67,0.67
1,112,0,0.829848,0.829848,1.829848,0.458639,0.288488,1.288488,-0.897407,54,...,64,0.142150,-0.301607,1,0.783891,1,0.687506,0_64,0.67,0.67
2,266,0,-0.343388,-0.343388,0.656612,0.427825,-0.915563,0.084437,10.606118,56,...,64,0.826323,-0.054387,0,0.521097,0,0.397579,0_64,0.67,0.67
3,564,0,-0.083986,-0.083986,0.916014,0.888432,-0.195554,0.804446,5.890213,55,...,64,0.650710,-0.290176,1,0.690925,0,0.575529,0_64,0.67,0.67
4,592,0,1.340353,1.340353,2.340353,1.676530,2.016883,3.016883,-14.122165,52,...,64,0.617825,-0.602749,1,0.953331,1,0.925317,0_64,0.67,0.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999994,881557,1,3.124335,3.124335,4.124335,1.126062,3.250397,4.250397,24.908820,111,...,111,0.060380,-0.397774,1,0.962687,1,0.939936,1_111,1.25,1.25
999995,896888,1,3.497749,3.497749,4.497749,1.051885,3.549634,4.549634,15.645740,106,...,106,0.048438,-0.079427,1,0.972067,1,0.954767,1_106,1.17,1.17
999996,957949,1,3.727728,3.727728,4.727728,1.444929,4.172658,5.172658,8.472932,106,...,106,0.471938,-0.536495,1,0.984823,1,0.975221,1_106,1.17,1.17
999997,949400,0,3.357725,3.357725,4.357725,1.626278,3.984003,4.984003,11.747240,104,...,121,0.206499,-0.427715,1,0.993200,1,0.988838,0_121,1.00,1.00


In [4]:
# NOTE(review): hard-coded admit fraction. This replaces the FRAC_ADMIT value read
# from the environment in the first cell. The trailing comment is a disabled
# alternative that would derive the fraction from column 'A'.
FRAC_ADMIT = 0.5#df[['A']].sum()/len(df)


In [5]:
###
## Total utility per (R, T) stratum: the sum of ml_outcomes over its applicants
###

df_ = df.groupby(['R', 'T'], as_index=False)['ml_outcomes'].sum()

In [6]:
###
## Number of applicants in each (R, T) stratum
###

df_count = (
    df.groupby(['R', 'T'], as_index=False)['ml_outcomes']
    .count()
    .rename(columns={'ml_outcomes': 'Count'})
)
# 'N' duplicates 'Count'; downstream cells read the stratum size from 'N'.
df_count['N'] = df_count['Count']

In [7]:
###
## Build the per-stratum table: R, T, summed utility (ml_outcomes) and size (N)
###

# df_ and df_count come from the same groupby on (R, T), so their rows line up
# and the stratum size can be attached by position.
stratum_table = df_.merge(df_count[['N']], left_index=True, right_index=True)

# Rank strata by total utility; reset_index keeps the original row id as 'index'.
ranked_by_utility = stratum_table.sort_values(by='ml_outcomes', ascending=False).reset_index()

# Restore the original (R, T) order; this reset_index stores the utility rank as
# 'level_0' and leaves a 0..n-1 index that later cells use as the variable position.
dff = ranked_by_utility.sort_values(by='index').reset_index()

In [8]:
# final info table
dff.sort_values(by='ml_outcomes')

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
2,221,2,0,11,0.00,4
1,219,1,0,10,0.08,1
3,220,3,0,12,0.08,4
0,218,0,0,9,0.10,1
120,217,120,1,8,0.32,1
...,...,...,...,...,...,...
51,4,51,0,60,10970.40,18284
47,3,47,0,56,10985.52,21126
48,2,48,0,57,11030.04,20426
50,1,50,0,59,11169.64,19258


### Set up the optimization problem

In [9]:
from ortools.linear_solver import pywraplp


In [10]:
# Create the linear-programming solver using OR-Tools' GLOP backend.
# CreateSolver returns None when the requested backend cannot be created, so fail
# here with a clear message instead of hitting an AttributeError in a later cell.
solver = pywraplp.Solver.CreateSolver('GLOP')
if not solver:
    raise RuntimeError('Could not create the GLOP linear solver')


In [11]:
dff

Unnamed: 0,level_0,index,R,T,ml_outcomes,N
0,218,0,0,9,0.10,1
1,219,1,0,10,0.08,1
2,221,2,0,11,0.00,4
3,220,3,0,12,0.08,4
4,211,4,0,13,1.00,10
...,...,...,...,...,...,...
217,203,217,1,107,1.25,1
218,204,218,1,109,1.25,1
219,205,219,1,111,1.25,1
220,212,220,1,113,0.92,1


In [12]:
# One decision variable per stratum, kept both positionally (applicant_stratum)
# and keyed by (R, T) (vars_cache) so the fairness constraints can look it up.
applicant_stratum = []
vars_cache = {}

# Objective: maximize the expected total utility of the admitted applicants
objective = solver.Objective()

for ix, row in dff.iterrows():
    # Admission probability for this stratum, bounded to [0, 1]; the variable is
    # named after the dff row index so it can be recovered from the name later.
    admit_prob = solver.NumVar(0.0, 1.0, str(ix))

    applicant_stratum.append(admit_prob)
    stratum_key = (row['R'], row['T'])
    vars_cache[stratum_key] = admit_prob

    # Admitting the whole stratum contributes its summed utility. dff has a
    # 0..n-1 index, so the variable just appended is the one at position ix.
    stratum_total_utility = float(row['ml_outcomes'])
    objective.SetCoefficient(admit_prob, stratum_total_utility)

objective.SetMaximization()


In [13]:
# Currently we have no constraints 
solver.NumConstraints()

0

In [14]:
# Capacity constraint: the expected number of admits must lie between 0 and K
K = int(len(df)*FRAC_ADMIT)
print(K)
admit_quota = solver.Constraint(0, K)

# Each stratum contributes N * (admission probability) expected admits;
# applicant_stratum is ordered like the rows of dff.
for stratum_var, stratum_size in zip(applicant_stratum, dff['N']):
    admit_quota.SetCoefficient(stratum_var, float(stratum_size))

499999


In [15]:
# Now we have one constraint
solver.NumConstraints()

1

## Add CF Fair constraints

In [16]:
from collections import Counter

def convertListToProb(raw_list):
    """Return the empirical distribution of the values in ``raw_list``.

    The result is a list of ``(probability, value)`` tuples, one per distinct
    value, in order of first appearance. Each probability is the value's
    relative frequency in ``raw_list``. An empty list yields an empty list.
    """
    total = float(len(raw_list))
    return [(n_seen / total, value) for value, n_seen in Counter(raw_list).items()]

In [17]:
# For applicants with R == 0, collect the T_minority_star values observed at each
# test score T (one list per T).
r0_rows = df.loc[df['R'] == 0, ['T', 'T_minority_star']]
T_minoritys_list = (
    r0_rows.groupby('T')['T_minority_star']
    .apply(list)
    .reset_index(name='T_minoritys')
)


In [18]:
# Turn each list of T_minority_star values into (probability, value) pairs,
# i.e. the empirical distribution of that score within the T group.
T_minoritys_list['probs'] = T_minoritys_list['T_minoritys'].apply(convertListToProb)


In [19]:
T_minoritys_list

Unnamed: 0,T,T_minoritys,probs
0,9,[10],"[(1.0, 10)]"
1,10,[10],"[(1.0, 10)]"
2,11,"[12, 12, 9, 9]","[(0.5, 12), (0.5, 9)]"
3,12,"[13, 13, 12, 13]","[(0.75, 13), (0.25, 12)]"
4,13,"[11, 13, 11, 13, 15, 13, 13, 13, 12, 11]","[(0.3, 11), (0.5, 13), (0.1, 15), (0.1, 12)]"
...,...,...,...
114,123,"[113, 114]","[(0.5, 113), (0.5, 114)]"
115,124,[115],"[(1.0, 115)]"
116,125,"[116, 115]","[(0.5, 116), (0.5, 115)]"
117,127,[117],"[(1.0, 117)]"


In [20]:
# Fairness constraints for the R == 0 strata. For each test score T:
#     sum_t' P(t' | T) * x[(1, t')]  -  x[(0, T)]  ==  0
# where P is the empirical distribution of T_minority_star within that T group.
didntexist = 0
exists = 0
for _, stratum in T_minoritys_list.iterrows():
    cf_fair_stratum = solver.Constraint(0.0, 0.0)

    cf_fair_stratum.SetCoefficient(vars_cache[(0.0, stratum['T'])], -1.0)
    for weight, cf_score in stratum['probs']:
        cf_key = (1.0, cf_score)
        if cf_key in vars_cache:
            exists += 1
        else:
            # No observed applicants in this stratum: create a fresh [0, 1]
            # variable. It has no objective or quota coefficient.
            vars_cache[cf_key] = solver.NumVar(0.0, 1.0, str(cf_key))
            didntexist += 1
        cf_fair_stratum.SetCoefficient(vars_cache[cf_key], weight)
    

In [21]:
didntexist,exists

(10, 731)

In [22]:
# For applicants with R == 1, collect the T_majority_star values observed at each
# test score T (one list per T).
r1_rows = df.loc[df['R'] == 1, ['T', 'T_majority_star']]
T_majoritys_list = (
    r1_rows.groupby('T')['T_majority_star']
    .apply(list)
    .reset_index(name='T_majoritys')
)

In [23]:
# Turn each list of T_majority_star values into (probability, value) pairs,
# i.e. the empirical distribution of that score within the T group.
T_majoritys_list['probs'] = T_majoritys_list['T_majoritys'].apply(convertListToProb)


In [24]:
# Fairness constraints for the R == 1 strata. For each test score T:
#     sum_t' P(t' | T) * x[(0, t')]  -  x[(1, T)]  ==  0
# where P is the empirical distribution of T_majority_star within that T group.
didntexist = 0
exists = 0

for _, stratum in T_majoritys_list.iterrows():
    cf_fair_stratum = solver.Constraint(0.0, 0.0)

    cf_fair_stratum.SetCoefficient(vars_cache[(1.0, stratum['T'])], -1.0)
    for weight, cf_score in stratum['probs']:
        cf_key = (0.0, cf_score)
        if cf_key in vars_cache:
            exists += 1
        else:
            # No observed applicants in this stratum: create a fresh [0, 1]
            # variable. It has no objective or quota coefficient.
            vars_cache[cf_key] = solver.NumVar(0.0, 1.0, str(cf_key))
            didntexist += 1

        cf_fair_stratum.SetCoefficient(vars_cache[cf_key], weight)
    

In [25]:
# NOTE(review): basis_status is referenced without being called, so this cell only
# displays the bound method of the last constraint created in the loop above.
# It looks like leftover debugging.
cf_fair_stratum.basis_status

<bound method Constraint.basis_status of <ortools.linear_solver.pywraplp.Constraint; proxy of <Swig Object of type 'operations_research::MPConstraint *' at 0x7f88f1799ea0> >>

In [26]:
solver.NumConstraints()

223

## Solve linear program

In [27]:
solver.ABNORMAL

4

In [28]:
# Solve the LP. Solve() returns a result-status code, and solution values are only
# meaningful when the solver reports an optimal solution, so stop here otherwise
# instead of silently reading values from a failed solve in the following cells.
status = solver.Solve()
if status != solver.OPTIMAL:
    raise RuntimeError('LP solve did not reach an optimal solution (status={})'.format(status))


In [29]:
status

0

In [30]:
solver.OPTIMAL

0

In [31]:
# Collect the solved admission probability of every stratum variable.
# Each variable was named str(ix) when it was created, so parsing the name
# recovers the dff row index it belongs to.
row = [int(str(stratum_var)) for stratum_var in applicant_stratum]
admit = [stratum_var.solution_value() for stratum_var in applicant_stratum]

df_decisions = pd.DataFrame({'row_id': row, 'decision': admit})

In [32]:
df_decisions

Unnamed: 0,row_id,decision
0,0,0.499933
1,1,0.499933
2,2,0.499933
3,3,0.499933
4,4,0.499933
...,...,...
217,217,0.965715
218,218,1.000000
219,219,1.000000
220,220,1.000000


In [33]:
dff.merge(df_decisions,left_index=True,right_index=True).sort_values(by='ml_outcomes',ascending=False)

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
49,0,49,0,58,11233.60,20060,49,0.499933
50,1,50,0,59,11169.64,19258,50,0.499933
48,2,48,0,57,11030.04,20426,48,0.499933
47,3,47,0,56,10985.52,21126,47,0.499933
51,4,51,0,60,10970.40,18284,51,0.499933
...,...,...,...,...,...,...,...,...
120,217,120,1,8,0.32,1,120,0.499933
0,218,0,0,9,0.10,1,0,0.499933
1,219,1,0,10,0.08,1,1,0.499933
3,220,3,0,12,0.08,4,3,0.499933


In [34]:
# Per-stratum table with the solved admission probability attached by row
# position, ordered from highest to lowest total utility.
strata_with_decisions = dff.merge(df_decisions, left_index=True, right_index=True)
xxx = strata_with_decisions.sort_values(by='ml_outcomes', ascending=False)
xxx

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
49,0,49,0,58,11233.60,20060,49,0.499933
50,1,50,0,59,11169.64,19258,50,0.499933
48,2,48,0,57,11030.04,20426,48,0.499933
47,3,47,0,56,10985.52,21126,47,0.499933
51,4,51,0,60,10970.40,18284,51,0.499933
...,...,...,...,...,...,...,...,...
120,217,120,1,8,0.32,1,120,0.499933
0,218,0,0,9,0.10,1,0,0.499933
1,219,1,0,10,0.08,1,1,0.499933
3,220,3,0,12,0.08,4,3,0.499933


In [35]:
xxx.sort_values(by='decision',ascending=False)

Unnamed: 0,level_0,index,R,T,ml_outcomes,N,row_id,decision
221,206,221,1,116,1.25,1,221,1.000000
118,198,118,0,128,2.00,2,118,1.000000
116,199,116,0,125,2.00,2,116,1.000000
114,200,114,0,123,2.00,2,114,1.000000
117,208,117,0,127,1.00,1,117,1.000000
...,...,...,...,...,...,...,...,...
137,115,137,1,25,407.22,1234,137,0.499933
136,123,136,1,24,297.60,930,136,0.499933
149,60,149,1,37,3743.71,9131,149,0.499933
16,150,16,0,25,58.41,649,16,0.499933


In [36]:
# Stratum key "<R>_<T>" on both the applicant table and the per-stratum decision
# table, so decisions can be joined back onto individual applicants.
df['key'] = df['R'].astype(str).str.cat(df['T'].astype(str), sep='_')
xxx['key'] = xxx['R'].astype(str).str.cat(xxx['T'].astype(str), sep='_')

In [37]:
# Attach each applicant's stratum-level admission probability. Both frames carry
# R, T and ml_outcomes, so pandas suffixes them: *_x comes from df, *_y from xxx.
admit_decisions = df.merge(xxx,how='left',on='key')

# Random baseline: admit each applicant independently with probability FRAC_ADMIT.
# A dedicated seeded generator keeps the baseline reproducible across
# Restart & Run All without touching the global `random` state.
RANDOM_BASELINE_SEED = 0
baseline_rng = random.Random(RANDOM_BASELINE_SEED)
random_draws = pd.Series([baseline_rng.random() for _ in range(len(admit_decisions))])
admit_decisions['decision_random'] = random_draws < FRAC_ADMIT

# LP policy: decision-weighted share of R == 1 among expected admits, and the
# expected sum of outcome Y over admits.
FRAC_minority_POLICY = (admit_decisions['R_y'] * admit_decisions['decision']).sum()/admit_decisions['decision'].sum()
SUM_BP_POLICY = ( admit_decisions['Y'] * admit_decisions['decision']).sum()

# The same two metrics under the random baseline.
FRAC_RANDOM_POLICY = (admit_decisions['R_y'] * admit_decisions['decision_random']).sum()/admit_decisions['decision_random'].sum()
SUM_BP_RAND_POLICY = ( admit_decisions['Y'] * admit_decisions['decision_random']).sum()


In [38]:
# Append one tab-separated result line (policy label, R == 1 share among admits,
# summed outcome) to the shared results file. The context manager closes the
# handle even if the write raises.
with open('./lp_results.csv','a') as file:
    file.write('{}\t{}\t{}\n'.format('Path-Specific Fairness',str(FRAC_minority_POLICY),str(SUM_BP_POLICY)))


In [39]:
(xxx['N']*xxx['decision']).sum()/len(df)

0.49999949999949994