In [14]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
sns.set(style="darkgrid")
from time import time

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score
# Import the three supervised learning models from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA    

# Pretty display for notebooks
%matplotlib inline

In [15]:
# Add column names to data set
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 
           'relationship', 'race','sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

# Read in train data
adult_train = pd.read_csv('../input/adult/adult_train_data.csv', header=None, names=columns, skipinitialspace=True)

# Drop the fnlwgt column which is useless for later analysis
adult_train = adult_train.drop('fnlwgt', axis=1)

# Read in test data
adult_test = pd.read_csv('../input/adult/adult_test_data.csv', header=None, skiprows=1, names=columns, skipinitialspace=True)

# Drop the fnlwgt column which is useless for later analysis
adult_test = adult_test.drop('fnlwgt', axis=1)

# Remove '.' in income column
adult_test['income'] = adult_test['income'].apply(lambda x: '>50k' if x=='>50k.'  else '<=50k')




In [16]:
print(adult_train.shape[0],adult_test.shape[0])

32561 16280


In [17]:
# Convert '?' to NaNs and remove the entries with NaN value
# Check missing value code and convert to NaNs
object_col = adult_train.select_dtypes(include=object).columns.tolist()
for col in object_col:
    adult_train.loc[adult_train[col]=='?', col] = np.nan
    adult_test.loc[adult_test[col]=='?', col] = np.nan

# Perform an mssing assessment in each column of the dataset.
col_missing_pct = adult_train.isna().sum()/adult_train.shape[0]
col_missing_pct.sort_values(ascending=False)

# Remove data entries with missing value
adult_train = adult_train.dropna(axis=0, how='any')
adult_test = adult_test.dropna(axis=0, how='any')

# Show the results of the split
print("After removing the missing value:")
print("Training set has {} samples.".format(adult_train.shape[0]))
print("Testing set has {} samples.".format(adult_test.shape[0]))

After removing the missing value:
Training set has 30162 samples.
Testing set has 15059 samples.


In [18]:
 for col in object_col:
    print(adult_train[col].value_counts(dropna=False)/adult_train.shape[0],'\n')
print(adult_train.head())
print(adult_test.head())    

private             0.738877
self-emp-not-inc    0.082853
local-gov           0.068530
state-gov           0.042404
self-emp-inc        0.035608
federal-gov         0.031265
without-pay         0.000464
Name: workclass, dtype: float64 

hs-grad         0.326238
some-college    0.221404
bachelors       0.167230
masters         0.053942
assoc-voc       0.043333
11th            0.034746
assoc-acdm      0.033420
10th            0.027187
7th-8th         0.018467
prof-school     0.017970
9th             0.015085
12th            0.012499
doctorate       0.012433
5th-6th         0.009548
1st-4th         0.005006
preschool       0.001492
Name: education, dtype: float64 

married-civ-spouse       0.466315
never-married            0.322459
divorced                 0.139712
separated                0.031132
widowed                  0.027419
married-spouse-absent    0.012267
married-af-spouse        0.000696
Name: marital-status, dtype: float64 

prof-specialty       0.133877
craft-repair         0

In [19]:
adult_train.reset_index(drop=True, inplace=True)
adult_test.reset_index(drop=True, inplace=True)
p=adult_train.shape[0]
q =adult_test.shape[0]
# reducing dimensionality of some very sparse features
for i in range(0,p):
    if adult_train.loc[i,'native-country'] not in ["united-states"] :
               adult_train.loc[i,"native-country"] = "non-united-stated"        
    if adult_train.loc[i,"education"] in ["Preschool", "1st-4th", "5th-6th", "7th-8th"]:
               adult_train.loc[i,"education"] = "prim-middle-school"
    elif adult_train.loc[i,"education"] in ["9th", "10th", "11th", "12th"]:
               adult_train.loc[i,"education"] = "high-school"   
    if adult_train.loc[i,'income'] in [">50k"] :
               adult_train.loc[i,"income"] = 1 
    else: 
               adult_train.loc[i,"income"] = 0         
#reducing dimensionality of some very sparse features
for i in range(0,q):                
    if adult_test.loc[i,'native-country'] not in ["united-states"]:
               adult_test.loc[i,'native-country'] = "Non-United-Stated"
    if adult_test.loc[i,'education'] in ["Preschool", "1st-4th", "5th-6th", "7th-8th"]:
               adult_test.loc[i,'education'] = "prim-middle-school"
    elif adult_test.loc[i,'education'] in ["9th", "10th", "11th", "12th"]:
               adult_test.loc[i,'education'] = "high-school"   
    if adult_test.loc[i,'native-country'] not in ["united-states"] :
               adult_train.loc[i,"native-country"] = "non-united-stated"
    if adult_test.loc[i,'income'] in [">50k"] :
               adult_test.loc[i,"income"] = 1 
    else: 
               adult_test.loc[i,"income"] = 0            
print(adult_train.head())
print(adult_test.head())

   age         workclass    education  education-num      marital-status  \
0   39         state-gov    bachelors             13       never-married   
1   50  self-emp-not-inc    bachelors             13  married-civ-spouse   
2   38           private      hs-grad              9            divorced   
3   53           private  high-school              7  married-civ-spouse   
4   28           private    bachelors             13  married-civ-spouse   

          occupation   relationship   race     sex  capital-gain  \
0       adm-clerical  not-in-family  white    male          2174   
1    exec-managerial        husband  white    male             0   
2  handlers-cleaners  not-in-family  white    male             0   
3  handlers-cleaners        husband  black    male             0   
4     prof-specialty           wife  black  female             0   

   capital-loss  hours-per-week     native-country income  
0             0              40      united-states      0  
1             

In [20]:
# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler() # default=(0, 1)
num_col = adult_train.dtypes[adult_train.dtypes != 'object'].index
features_log_minmax_transform = pd.DataFrame(data = adult_train)
features_log_minmax_transform[num_col] = scaler.fit_transform(features_log_minmax_transform[num_col])

# Transform the test data set
features_log_minmax_transform_test = pd.DataFrame(data = adult_test)
features_log_minmax_transform_test[num_col] = scaler.transform(features_log_minmax_transform_test[num_col])

# Show an example of a record with scaling applied
display(features_log_minmax_transform.head())
display(features_log_minmax_transform_test.head())

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.30137,state-gov,bachelors,0.8,never-married,adm-clerical,not-in-family,white,male,0.02174,0.0,0.397959,united-states,0
1,0.452055,self-emp-not-inc,bachelors,0.8,married-civ-spouse,exec-managerial,husband,white,male,0.0,0.0,0.122449,united-states,0
2,0.287671,private,hs-grad,0.533333,divorced,handlers-cleaners,not-in-family,white,male,0.0,0.0,0.397959,united-states,0
3,0.493151,private,high-school,0.4,married-civ-spouse,handlers-cleaners,husband,black,male,0.0,0.0,0.397959,united-states,0
4,0.150685,private,bachelors,0.8,married-civ-spouse,prof-specialty,wife,black,female,0.0,0.0,0.397959,non-united-stated,0


Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.287671,private,hs-grad,0.533333,married-civ-spouse,farming-fishing,husband,white,male,0.0,0.0,0.5,united-states,0
1,0.150685,local-gov,assoc-acdm,0.733333,married-civ-spouse,protective-serv,husband,white,male,0.0,0.0,0.397959,united-states,1
2,0.369863,private,some-college,0.6,married-civ-spouse,machine-op-inspct,husband,black,male,0.076881,0.0,0.397959,united-states,1
3,0.232877,private,high-school,0.333333,never-married,other-service,not-in-family,white,male,0.0,0.0,0.295918,united-states,0
4,0.630137,self-emp-not-inc,prof-school,0.933333,married-civ-spouse,prof-specialty,husband,white,male,0.03103,0.0,0.316327,united-states,1


In [21]:

Data_train = pd.get_dummies(features_log_minmax_transform, columns=['workclass','education','marital-status','occupation','relationship','native-country','race'], prefix =['work','edu','ms','occ','rls','nc','r'])
Data_train['INC'] = Data_train.loc[:,'income']
Data_train=Data_train.drop(columns=['income','sex'])

Data_test = pd.get_dummies(features_log_minmax_transform_test, columns=['workclass','education','marital-status','occupation','relationship','native-country','race'], prefix =['work','edu','ms','occ','rls','nc','r'])
Data_test['INC'] = Data_test.loc[:,'income']
Data_test=Data_test.drop(columns=['income','sex'])

m=Data_train.shape[1]

X_train=Data_train.iloc[:,0:m-1]
Y_train=Data_train.iloc[:,m-1]
print(m)

m=Data_test.shape[1]
X_test=Data_test.iloc[:,0:m-1]
Y_test=Data_test.iloc[:,m-1]


print(m)
sensitive_attr_train=adult_train.drop(columns=['age','workclass','education','education-num','marital-status','occupation','relationship','capital-gain','capital-loss','hours-per-week','native-country','income','race'])
sensitive_attr_train = pd.get_dummies(sensitive_attr_train, columns=['sex'], prefix =['s'])
#print(sensitive_attr_train.head())
sensitive_attr_test=adult_test.drop(columns=['age','workclass','education','education-num','marital-status','occupation','relationship','capital-gain','capital-loss','hours-per-week','native-country','income','race'])
sensitive_attr_test = pd.get_dummies(sensitive_attr_test, columns=['sex'], prefix =['s'])
#print(sensitive_attr_train.head())


# display(X_train.head())
# display(X_test.head())
# display(Y_train.head())
# display(Y_test.head())


print(Y_test.isin(['>50k'])) 



print(Y_test.dtype)
Y_test=Y_test.astype('int')
Y_train=Y_train.astype('int')

print(Y_test.dtype)
# print(X_train.shape[0])
# print(X_test.shape[0])
# print(Y_train.shape[0])
# print(Y_test.shape[0])



58
58
0        False
1        False
2        False
3        False
4        False
         ...  
15054    False
15055    False
15056    False
15057    False
15058    False
Name: INC, Length: 15059, dtype: bool
object
int64


In [22]:
#SVM 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%pylab inline
from random import *
from subprocess import check_output
def Adult_svm(X_train,X_test,Y_train,Y_test):
    #Split data into training and test datasets (training will be based on 70% of data)
    from sklearn.model_selection import train_test_split
    
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0,shuffle=False) 
    #test_size: if integer, number of examples into test dataset; if between 0.0 and 1.0, means proportion
    print('There are {} samples in the training set and {} samples in the test set'.format(X_train.shape[0], X_test.shape[0]))


    #Scaling data
    #from sklearn.preprocessing import StandardScaler
    
    from sklearn.model_selection import cross_val_score

    #sc = StandardScaler(with_mean=False)
    
    
    #sc.fit(X_train)
    #X_train_std = sc.transform(X_train)
    #X_test_std = sc.transform(X_test)

    #X_train_std and X_test_std are the scaled datasets to be used in algorithms

    #Applying SVC (Support Vector Classification)
    from sklearn.svm import SVC
    svm = SVC(kernel='rbf', random_state=0, gamma=.1, C=10.0)
    svm.fit(X_train, Y_train)
    print('The accuracy of the SVM classifier on training data is {:.2f}'.format(svm.score(X_train, Y_train)))
    print('The accuracy of the SVM classifier on test data is {:.2f}'.format(svm.score(X_test, Y_test)))
    print('####Train prediction Label###############################################')
    Y_train_pred=svm.predict(X_train)
    #print(y_1)
    Y_test_pred=svm.predict(X_test)

    print('####Actual Train Label###############################################')


    print('####Change to colors###############################################')
        
    
    return Y_train_pred,Y_train,Y_test_pred,Y_test
     



Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [None]:
def Adult_rf(X_train,X_test,Y_train,Y_test):   
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(n_estimators= 100, random_state=42)
    # Extract the two most important features
    
    
    # Train the random forest
    rf.fit(X_train, Y_train)
    # Make predictions and determine the error
    Y_train_pred = rf.predict(X_train)
    Y_test_pred = rf.predict(X_test)
    print('The accuracy of the SVM classifier on training data is {:.2f}'.format(rf.score(X_train, Y_train)))
    print('The accuracy of the SVM classifier on test data is {:.2f}'.format(rf.score(X_test, Y_test)))
    
    return Y_train_pred,Y_train,Y_test_pred,Y_test

In [None]:
def Adult_mlp(X_train,X_test,Y_train,Y_test):   
    from sklearn.neural_network import MLPClassifier
    mlp = MLPClassifier(solver='adam', hidden_layer_sizes=(53, 2),alpha=1e-5, random_state=42)
    # Extract the two most important features
    
    
    # Train the random forest
    mlp.fit(X_train, Y_train)
    # Make predictions and determine the error
    Y_train_pred = mlp.predict(X_train)
    Y_test_pred = mlp.predict(X_test)
    print('The accuracy of the SVM classifier on training data is {:.2f}'.format(mlp.score(X_train, Y_train)))
    print('The accuracy of the SVM classifier on test data is {:.2f}'.format(mlp.score(X_test, Y_test)))
    




    return Y_train_pred,Y_train,Y_test_pred,Y_test

In [None]:
#table1 for Adult

import time
import pulp as p 
def min_max_lp_all(data1,gamma,eps,r):
    
    m=data1.shape[0]
    n=data1.shape[1]
    print('dimension of data')
    print(m,n)
    Lp_prob = p.LpProblem('Problem', p.LpMinimize)  
   
    
    X=np.zeros(n+1,dtype=p.LpVariable)
    sizes=np.zeros(m,dtype=int)
    for i in range(m):
        count=0
        for j in range(n):
            if data1[i][j]==1:
                count=count+1
                
        sizes[i]=count
  

    for i in range(n):
        var1=str(i)
        
        X[i]=p.LpVariable(var1,lowBound=0,upBound=1,cat='Integer')
       
        
    X[n] =  p.LpVariable("z1",lowBound=0)


    #########objective function#####################
    Lp_prob += X[n]


    ##############constraint#################
    for i in range(2*m):
        if i<m:
            Lp_prob += X[n] >= p.lpSum([2*(X[j]-0.5)*data1[i][j] for j in range(n)])
            Lp_prob += p.lpSum([2*(X[j]-0.5)*data1[i][j] for j in range(n)]) >= (2*gamma-1)*sizes[i]
            Lp_prob += p.lpSum([2*(X[j]-0.5)*data1[i][j] for j in range(n)]) <= ((2*gamma-1)+eps)*sizes[i]
            
        else:        
            Lp_prob += X[n] >= p.lpSum([-1*2*(X[j]-0.5)*data1[i-m][j] for j in range(n)])
            
         
 
    #n is the number of elements in sensitive attribute 
           
       
    Lp_prob += X[n] <= 42000
    
    #####################################
    status = Lp_prob.solve()   # Solver 
    print(p.LpStatus[status]) 
    print("discripency is:")        
    print(p.value(Lp_prob.objective))
    x=np.zeros(n,dtype=float)

   # The solution status 
    Synth1={}
    Synth2={}
    # # Printing the final solution 
    for i in range(n):
        if(p.value(X[i])==1):
            Synth1[i]=1 
            Synth2[i]=-1
        else:
            Synth1[i]=-1
            Synth2[i]=1
    Synthu1=Synth1  
    Synthu2=Synth2  
    
              
    return Synthu1,Synthu2   


In [31]:
#table1 for Adult

import time
import pulp as p 
def min_max_lp_all2(data1,gam,eps,r,delta):
    
    m=data1.shape[0]
    n=data1.shape[1]
    print('dimension of data')
    print(m,n)
    Lp_prob = p.LpProblem('Problem', p.LpMinimize)  
   
    
    X=np.zeros(n+1,dtype=p.LpVariable)
    sizes=np.zeros(m,dtype=int)
    for i in range(m):
        count=0
        for j in range(n):
            if data1[i][j]==1:
                count=count+1
                
        sizes[i]=count
  

    for i in range(n):
        var1=str(i)
        
        X[i]=p.LpVariable(var1,lowBound=0,upBound=1,cat='Integer')
    
    #########################
    gamma=np.zeros(2,dtype=int)
    gamma[0]=0.5
    gamma[1]=0.05
        
    X[n]=  p.LpVariable("z1",lowBound=0)
    #X[n+1]=  p.LpVariable("z2",lowBound=0)

     
    #########objective function#####################
    Lp_prob += X[n] 
    #Lp_prob += 1 

    
    ##############constraint#################
    for i in range(2*m):
        if i<m:
            Lp_prob += X[n] >= p.lpSum([2*(X[j]-0.5)*data1[i][j] for j in range(n)])
            Lp_prob += p.lpSum([2*(X[j]-0.5)*data1[i][j] for j in range(n)]) >= (2*gamma[i]-1)*sizes[i]
            Lp_prob += p.lpSum([2*(X[j]-0.5)*data1[i][j] for j in range(n)]) <= ((2*gamma[i]-1)+eps)*sizes[i]
            
        else:        
            Lp_prob += X[n] >= p.lpSum([-1*2*(X[j]-0.5)*data1[i-m][j] for j in range(n)])
            #Lp_prob += X[n+1] >= p.lpSum([-1*2*(X[j]-0.5)+r[j] for j in range(n)]) 
#     Lp_prob += X[n+1] >= p.lpSum([2*(X[j]-0.5)-r[j] for j in range(n)])
#     Lp_prob += X[n+1] >= p.lpSum([-1*2*(X[j]-0.5)+r[j] for j in range(n)])       
         
    Lp_prob += p.lpSum([2*(X[i]-0.5)*r[i] for i in range(n)])>=delta*n
    #n is the number of elements in sensitive attribute 
           
       
    Lp_prob += X[n] <= 42000
    
    #####################################
    status = Lp_prob.solve()   # Solver 
    print(p.LpStatus[status]) 
    print("discripency is:")        
    print(p.value(Lp_prob.objective))
    x=np.zeros(n,dtype=float)

   # The solution status 
    Synth1={}
    Synth2={}
    # # Printing the final solution 
    for i in range(n):
        if(p.value(X[i])==1):
            Synth1[i]=1 
            Synth2[i]=-1
        else:
            Synth1[i]=-1
            Synth2[i]=1
    Synthu1=Synth1  
    Synthu2=Synth2  
    
              
    return Synthu1,Synthu2   


In [32]:
display(sensitive_attr_test)
sens_train=sensitive_attr_train.transpose()
sens_test=sensitive_attr_test.transpose()
#gamma=0.1
#eps=0.1
#min_max_lp_all(sens_test,gamma,eps)


Unnamed: 0,s_female,s_male
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
15054,0,1
15055,1,0
15056,0,1
15057,0,1


In [25]:
Y_train_pred,Y_train,Y_test_pred,Y_test=Adult_svm(X_train,X_test,Y_train,Y_test)



There are 30162 samples in the training set and 15059 samples in the test set
The accuracy of the SVM classifier on training data is 0.86
The accuracy of the SVM classifier on test data is 0.85
####Train prediction Label###############################################
####Actual Train Label###############################################
####Change to colors###############################################


In [None]:
Y_train_pred,Y_train,Y_test_pred,Y_test=Adult_rf(X_train,X_test,Y_train,Y_test)

In [None]:
Y_train_pred,Y_train,Y_test_pred,Y_test=Adult_mlp(X_train,X_test,Y_train,Y_test)

In [None]:
########Feldman----nonwhite white female male 

#data1=sens_test

# min_max_lp_all(data1,gamma,eps,r):

rows=Y_test.shape[0]   

###############################################33       
r = np.zeros(rows, dtype = int)
for i in range(rows):
    if Y_test.iloc[i]==0 :
        r[i]=-1
    else :
        r[i]= 1  
r2 = np.zeros(rows, dtype = int)        
        
for i in range(rows):
    if Y_test_pred[i]==0 :
        r2[i]=-1
    else :
        r2[i]= 1        
        
data = np.zeros((4, rows), dtype = float)

a=0
b=0
c=0
d=0

acc1=0
acc2=0
acc3=0
acc4=0


for i in range(rows):
    if (sensitive_attr_test.iloc[i,0]==1 or sensitive_attr_test.iloc[i,1]==1 or sensitive_attr_test.iloc[i,2]==1 or sensitive_attr_test.iloc[i,3]==1):
            if(sensitive_attr_test.iloc[i,5]==1):
                data[0][i]= 1
                a=a+1
                if r[i]==1:
                    acc1=acc1+1
    elif sensitive_attr_test.iloc[i,4]==1:
            if(sensitive_attr_test.iloc[i,5]==1):
                data[1][i]= 1
                b=b+1
                if r[i]==1:
                    acc2=acc2+1
    if (sensitive_attr_test.iloc[i,0]==1 or sensitive_attr_test.iloc[i,1]==1 or sensitive_attr_test.iloc[i,2]==1 or sensitive_attr_test.iloc[i,3]==1):
           if(sensitive_attr_test.iloc[i,6]==1):
                data[2][i]= 1
                c=c+1
                if r[i]==1:
                    acc3=acc3+1
    elif sensitive_attr_test.iloc[i,4]==1:
           if(sensitive_attr_test.iloc[i,6]==1):
                data[3][i]= 1
                d=d+1
                if r[i]==1:
                    acc4=acc4+1   
print(a,b,c,d)              
print(acc1,acc2,acc3,acc4)
a1=float(acc1/a)
b1=float(acc2/b)
c1=float(acc3/c)
d1=float(acc4/d)


print(a1,b1,c1,d1)





###########################################



# for gamma in np.arange(0.15,0.5,.05):
#     for eps in np.arange(0.05,0.02,-.01): 
gamma=0.16
eps=0.06
for gamma in np.arange(.05,.28,0.05):
    acc1=0
    acc2=0
    acc3=0
    acc4=0
   


    u1,u2=min_max_lp_all(data,gamma,eps,r)

    #######################Disp_impact#######################  

    for i in range(rows):
            if data[0][i]== 1 and u1[i]==1:
                    acc1=acc1+1
            elif data[1][i]== 1 and u1[i]==1:
                    acc2=acc2+1  
            elif data[2][i]== 1 and u1[i]==1:
                    acc3=acc3+1 
            elif data[3][i]== 1 and u1[i]==1:
                    acc4=acc4+1                

    a1=float(acc1/a)
    b1=float(acc2/b)
    c1=float(acc3/c)
    d1=float(acc4/d)
#    print(acc1,acc2,acc3,acc4)
#    print(a1,b1,c1,d1)

#     count1=0
#     count2=0
#     for j in range(r.shape[0]):
#             if(r[j]==u1[j]):
#                 count1+=1
#     acc1=float(count1/r.shape[0])        

#     for j in range(r.shape[0]):
#             if(r[j]==u2[j]):
#                 count2+=1
#     acc2=float(count2/r.shape[0]) 
#     print(acc1)
#     print(acc2)
#     print(gamma)
#     print(eps)
    
#     acc2_l.append(acc2)
#     acc1_l.append(acc1)


#     print("Accuracy::")    
#     print(acc1_l)     
#     print(acc2_l)

#     ###########################
#     a_acc=0
#     b_acc=0

    fi= np.zeros(rows,dtype=int) 
    count=0
    acc=0
    ci=[]
    for alpha in np.arange(0,1.05,0.05):
        f_acc=0
        acc1=0
        acc2=0
        acc3=0
        acc4=0
       
        for i in range(rows):

            z=random()
            if z < alpha:
                    fi[i]= u1[i] 
                    count=count+1
            else:
                   fi[i]= r2[i]

        for i in range(rows):
             if fi[i] == r[i]:
                    f_acc=f_acc+1


        f_acc_percent=f_acc/rows
        ci.append(f_acc_percent)

        for i in range(rows):
            if data[0][i]== 1 and fi[i]==1:
                    acc1=acc1+1
            elif data[1][i]== 1 and fi[i]==1:
                    acc2=acc2+1  
            elif data[2][i]== 1 and fi[i]==1:
                    acc3=acc3+1 
            elif data[3][i]== 1 and fi[i]==1:
                    acc4=acc4+1         
            


        a1=float(acc1/a)
        b1=float(acc2/b)
        c1=float(acc3/c)
        d1=float(acc4/d)
       
        print(acc1,acc2,acc3,acc4)
        print(a1,b1,c1,d1)
    print(ci)
     

In [41]:
########Bilalzafar  (race-5) + (gender-2)

#data1=sens_test

# min_max_lp_all(data1,gamma,eps,r):

rows=Y_test.shape[0]   

###############################################33       
r = np.zeros(rows, dtype = int)
for i in range(rows):
    if Y_test.iloc[i]==0 :
        r[i]=-1
    else :
        r[i]= 1  
r2 = np.zeros(rows, dtype = int)        
        
for i in range(rows):
    if Y_test_pred[i]==0 :
        r2[i]=-1
    else :
        r2[i]= 1        
        
data = np.zeros((2, rows), dtype = float)

a=0
b=0


acc1=0
acc2=0



for i in range(rows):
    if (sensitive_attr_test.iloc[i,0]==1):
                data[0][i]= 1
                a=a+1
                if r[i]==1:
                    acc1=acc1+1
    elif (sensitive_attr_test.iloc[i,1]==1):
                data[1][i]= 1
                b=b+1
                if r[i]==1:
                    acc2=acc2+1              
print(a,b)              
print(acc1,acc2)
a1=float(acc1/a)
b1=float(acc2/b)




print(a1,b1)





###########################################
#madaras=


# for gamma in np.arange(0.15,0.5,.05):
# for eps in np.arange(0.05,0.02,-.01): 
#gamma=0.16
eps=.20
delta=.500
for gamma in np.arange(.05,.06,0.05):
    a1=0
    b1=0
  
   


    u1,u2=min_max_lp_all2(data,gamma,eps,r2,delta)

    #######################Disp_impact#######################  

    for i in range(rows):
            if data[0][i]== 1 and u1[i]==1:
                    acc1=acc1+1
            elif data[1][i]== 1 and u1[i]==1:
                    acc2=acc2+1  
                 

    a1=float(acc1/a)
    b1=float(acc2/b)
    
#    print(acc1,acc2,acc3,acc4)
#    print(a1,b1,c1,d1)

#     count1=0
#     count2=0
#     for j in range(r.shape[0]):
#             if(r[j]==u1[j]):
#                 count1+=1
#     acc1=float(count1/r.shape[0])        

#     for j in range(r.shape[0]):
#             if(r[j]==u2[j]):
#                 count2+=1
#     acc2=float(count2/r.shape[0]) 
#     print(acc1)
#     print(acc2)
#     print(gamma)
#     print(eps)
    
#     acc2_l.append(acc2)
#     acc1_l.append(acc1)


#     print("Accuracy::")    
#     print(acc1_l)     
#     print(acc2_l)

#     ###########################
#     a_acc=0
#     b_acc=0

    fi= np.zeros(rows,dtype=int) 
    count=0
    acc=0
    ci=[]
    k=0
    di= [] 
    di2=[]
    for alpha in np.arange(0,1.05,0.05):
        f_acc=0
        acc1=0
        acc2=0
       
        for i in range(rows):

            z=random()
            if z < alpha:
                    fi[i]= u1[i] 
                    count=count+1
            else:
                   fi[i]= r2[i]

        for i in range(rows):
             if fi[i] == r[i]:
                    f_acc=f_acc+1


        f_acc_percent=f_acc/rows
        ci.append(f_acc_percent)

        for i in range(rows):
            if data[0][i]== 1 and fi[i]==1:
                    acc1=acc1+1
            elif data[1][i]== 1 and fi[i]==1:
                    acc2=acc2+1  
              


        a1=float(acc1/a)
        b1=float(acc2/b)
        print(a1,b1)
        #print(acc1,acc2,acc3,acc4,acc4,acc6,acc7)
        #print(a1,b1,c1,d1,e1,f1,g1)
        k2=min(a1,b1)/max(a1,b1)
        k=max(a1,b1)-min(a1,b1)
        di.append(k)
        di2.append(k2)
    print(ci)
    print(di)
    print(di2)
      
        

4913 10146
557 3143
0.11337268471402402 0.3097772521190617
dimension of data
2 15059
Optimal
discripency is:
8118.0
0.08406269082027275 0.25231618371772124
0.08406269082027275 0.2431500098561009
0.08406269082027275 0.23447664104080426
0.08406269082027275 0.22807017543859648
0.08406269082027275 0.22343780800315396
0.08406269082027275 0.21584861028976937
0.08406269082027275 0.20451409422432484
0.08406269082027275 0.1995860437610881
0.08406269082027275 0.19317957815888034
0.08406269082027275 0.1803666469544648
0.08406269082027275 0.17819830475064063
0.08406269082027275 0.16794795978710822
0.08406269082027275 0.1608515671200473
0.08406269082027275 0.15799329785137
0.08406269082027275 0.1407451212300414
0.08406269082027275 0.14094224324857085
0.08406269082027275 0.12842499507194954
0.08406269082027275 0.1263552138773901
0.08406269082027275 0.11738616203429923
0.08406269082027275 0.10546027991326631
0.08406269082027275 0.09994086339444117
[0.8462713327578193, 0.8387675144431901, 0.8337206985