In [None]:
############## Synthetic data: disparate mistreatment ##############################

from __future__ import division
import os,sys
import math
import numpy as np
import matplotlib.pyplot as plt # for plotting stuff
from random import seed, shuffle
from scipy.stats import multivariate_normal # generating synthetic data
from sklearn.linear_model import LogisticRegression
SEED = 1122334455
seed(SEED) # set the random seed so that the random permutations can be reproduced again
np.random.seed(SEED)
sys.path.insert(0, '../../fair_classification/') 
# import utils as ut


def generate_synthetic_data(data_type, plot_data=False):

    """
        Code for generating the synthetic data.
        We will have two non-sensitive features and one sensitive feature.
        Non sensitive features will be drawn from a 2D gaussian distribution.
        Sensitive feature specifies the demographic group of the data point and can take values 0 and 1.

        Four gaussian clusters of 1000 points each are drawn, one per
        (sensitive value, class label) pair, then merged and shuffled.
        The cluster parameters are chosen with the intent that a classifier
        optimizing for accuracy ends up with disparate misclassification rates
        for the two demographic groups.

        Parameters
        ----------
        data_type : int
            1 -> configuration intended to give disparate false positive rates
                 as well as disparate false negative rates for the two groups.
            2 -> configuration intended to give disparate false positive rates
                 but equal false negative rates.
        plot_data : bool
            When True, scatter-plot the first 200 shuffled points with matplotlib.

        Returns
        -------
        X : ndarray of shape (4000, 2), the non-sensitive features
        y : ndarray of shape (4000,), class labels (+1.0 / -1.0)
        x_control : ndarray of shape (4000,), sensitive feature (0.0 / 1.0)

        Raises
        ------
        ValueError
            If data_type is neither 1 nor 2.
    """

    n_samples = 1000 # generate these many data points per cluster

    def gen_gaussian_diff_size(mean_in, cov_in, z_val, class_label, n):
        """
        mean_in: mean of the gaussian cluster
        cov_in: covariance matrix
        z_val: sensitive feature value
        class_label: +1 or -1
        n: number of points
        """

        nv = multivariate_normal(mean = mean_in, cov = cov_in)
        X = nv.rvs(n)
        y = np.ones(n, dtype=float) * class_label
        z = np.ones(n, dtype=float) * z_val # all the points in this cluster get this value of the sensitive attribute

        return nv, X, y, z

    # Each entry is (mean, covariance, sensitive value, class label).
    # The order matters: clusters are sampled in this order, so changing it
    # would change the random stream and therefore the generated data.
    if data_type == 1:
        clusters = [
            ([2, 3],  [[10, 1], [1, 4]], 1, +1), # z=1, +
            ([1, 2],  [[5, 2], [2, 5]],  0, +1), # z=0, +
            ([-5, 0], [[5, 1], [1, 5]],  1, -1), # z=1, -
            ([0, -1], [[7, 1], [1, 7]],  0, -1), # z=0, -
        ]
    elif data_type == 2:
        clusters = [
            ([2, 2],   [[3, 1], [1, 3]], 1, +1), # z=1, +
            ([2, 2],   [[3, 1], [1, 3]], 0, +1), # z=0, +
            ([-2, -2], [[3, 1], [1, 3]], 1, -1), # z=1, -
            # NOTE(review): this covariance is not symmetric ([[3,3],[1,3]]).
            # It is kept as-is so the generated data does not change -- confirm
            # whether [[3,1],[1,3]] was intended.
            ([-1, 0],  [[3, 3], [1, 3]], 0, -1), # z=0, -
        ]
    else:
        # Previously an unsupported value surfaced later as an unbound-variable error.
        raise ValueError("data_type must be 1 or 2, got %r" % (data_type,))

    samples = [gen_gaussian_diff_size(mu, sigma, z_val, label, int(n_samples * 1))
               for (mu, sigma, z_val, label) in clusters]

    # merge the clusters
    X = np.vstack([s[1] for s in samples])
    y = np.hstack([s[2] for s in samples])
    x_control = np.hstack([s[3] for s in samples])

    # shuffle the data
    perm = list(range(len(X)))
    shuffle(perm)
    X = X[perm]
    y = y[perm]
    x_control = x_control[perm]

    # Plot the data
    if plot_data:
        plt.figure()
        num_to_draw = 200 # we will only draw a small number of points to avoid clutter
        x_draw = X[:num_to_draw]
        y_draw = y[:num_to_draw]
        x_control_draw = x_control[:num_to_draw]

        X_s_0 = x_draw[x_control_draw == 0.0]
        X_s_1 = x_draw[x_control_draw == 1.0]
        y_s_0 = y_draw[x_control_draw == 0.0]
        y_s_1 = y_draw[x_control_draw == 1.0]

        plt.scatter(X_s_0[y_s_0==1.0][:, 0], X_s_0[y_s_0==1.0][:, 1], color='green', marker='x', s=60, linewidth=2, label= "group-0 +ve")
        plt.scatter(X_s_0[y_s_0==-1.0][:, 0], X_s_0[y_s_0==-1.0][:, 1], color='red', marker='x', s=60, linewidth=2, label = "group-0 -ve")
        plt.scatter(X_s_1[y_s_1==1.0][:, 0], X_s_1[y_s_1==1.0][:, 1], color='green', marker='o', facecolors='none', s=60, linewidth=2, label = "group-1 +ve")
        plt.scatter(X_s_1[y_s_1==-1.0][:, 0], X_s_1[y_s_1==-1.0][:, 1], color='red', marker='o', facecolors='none', s=60, linewidth=2, label = "group-1 -ve")

        # dont need the ticks to see the data distribution.
        # Booleans are required here: the string 'off' is truthy and leaves the ticks visible.
        plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
        plt.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)
        plt.legend(loc=2, fontsize=21)
        plt.ylim((-8,12))

#         plt.savefig("img/data.png")
        plt.show()


#     x_control = {"s1": x_control} # all the sensitive features are stored in a dictionary
#     X = ut.add_intercept(X)


    return X,y,x_control
def Synth_data1():
    """
    Build the data_type=1 synthetic dataset (plotting enabled).

    Returns a tuple (X, y) where X holds the generated feature columns
    followed by the sensitive attribute as the last column, and y holds
    the class labels.
    """
    features, labels, sensitive = generate_synthetic_data(1, True)
    # append the sensitive attribute as an extra (last) feature column
    features_with_group = np.column_stack((features, sensitive))
    return features_with_group, labels
def Synth_data2():
    """
    Build the data_type=2 synthetic dataset (plotting enabled).

    Returns a tuple (X, y) where X holds the generated feature columns
    followed by the sensitive attribute as the last column, and y holds
    the class labels.
    """
    features, labels, sensitive = generate_synthetic_data(2, True)
    # append the sensitive attribute as an extra (last) feature column
    features_with_group = np.column_stack((features, sensitive))
    return features_with_group, labels


In [None]:
x, y = Synth_data1()

# For each sensitive group (column 2 of x), print the number of negative (-1)
# and positive (+1) labels: first for group 1, then for group 0.
for group_value in (1, 0):
    in_group = x[:, 2] == group_value
    c1 = int(np.count_nonzero(in_group & (y == -1)))
    c2 = int(np.count_nonzero(in_group & (y == 1)))
    print(c1, c2)



In [None]:
x, y = Synth_data2()

# For each sensitive group (column 2 of x), print the number of negative (-1)
# and positive (+1) labels: first for group 1, then for group 0.
for group_value in (1, 0):
    in_group = x[:, 2] == group_value
    c1 = int(np.count_nonzero(in_group & (y == -1)))
    c2 = int(np.count_nonzero(in_group & (y == 1)))
    print(c1, c2)


In [None]:
# Three y-axes sharing one x-axis in a single figure

import matplotlib.pyplot as plt


def make_patch_spines_invisible(ax):
    """
    Turn the frame of `ax` on, then hide its background patch and every spine.

    Callers can afterwards re-enable just the spine(s) they want drawn.
    """
    ax.set_frame_on(True)
    # background patch first, then each spine, all switched to invisible
    for artist in (ax.patch, *ax.spines.values()):
        artist.set_visible(False)


# One figure, one shared x-axis, three independent y-axes.
fig, host = plt.subplots()
fig.subplots_adjust(right=0.75)  # leave room on the right for the detached axis

par1 = host.twinx()
par2 = host.twinx()

# twinx already put par2's ticks and label on the right; move its right spine
# outward so it does not sit on top of par1's.
par2.spines["right"].set_position(("axes", 1.2))
# An axes created by twinx has its frame off, so the detached spine would not
# be drawn: enable the frame with patch and spines hidden, then show only the
# right spine.
make_patch_spines_invisible(par2)
par2.spines["right"].set_visible(True)

p1, = host.plot([0, 1, 2], [0, 1, 2], "b-", label="Density")
p2, = par1.plot([0, 1, 2], [0, 3, 2], "r-", label="Temperature")
p3, = par2.plot([0, 1, 2], [50, 30, 15], "g-", label="Velocity")

host.set_xlim(0, 2)
for axis, y_limits in ((host, (0, 2)), (par1, (0, 4)), (par2, (1, 65))):
    axis.set_ylim(*y_limits)

host.set_xlabel("Distance")

tkw = dict(size=4, width=1.5)
lines = [p1, p2, p3]

# Label every y-axis after its curve and colour the label and ticks to match.
for axis, line in zip((host, par1, par2), lines):
    axis.set_ylabel(line.get_label())
    axis.yaxis.label.set_color(line.get_color())
    axis.tick_params(axis='y', colors=line.get_color(), **tkw)
host.tick_params(axis='x', **tkw)

host.legend(lines, [line.get_label() for line in lines])

plt.show()



In [8]:
##############Synthetic data disparate impact################

import math
import numpy as np
import matplotlib.pyplot as plt # for plotting stuff
from random import seed, shuffle
from scipy.stats import multivariate_normal # generating synthetic data
SEED = 1122334455
seed(SEED) # set the random seed so that the random permutations can be reproduced again
np.random.seed(SEED)

def generate_synthetic_data(plot_data=False, n_samples=1000000):

    """
        Code for generating the synthetic data.
        We will have two non-sensitive features and one sensitive feature.
        A sensitive feature value of 0.0 means the example is considered to be in protected group (e.g., female) and 1.0 means it's in non-protected group (e.g., male).

        One gaussian cluster is drawn per class, the two are merged and shuffled,
        and the sensitive feature of each point is then sampled from the relative
        densities of the two class gaussians evaluated at a rotated copy of the point.

        Parameters
        ----------
        plot_data : bool
            When True, scatter-plot the first 200 points with matplotlib
            (the figure is neither shown nor saved explicitly here).
        n_samples : int
            Number of points to generate per class (default 1000000, the value
            that used to be hard-coded).

        Returns
        -------
        X : ndarray of shape (2 * n_samples, 2), the non-sensitive features
        y : ndarray of shape (2 * n_samples,), class labels (+1.0 / -1.0)
        x_control : ndarray of shape (2 * n_samples,), sensitive feature (0.0 / 1.0)
    """

    disc_factor = math.pi / 4.0 # this variable determines the initial discrimination in the data -- decrease it to generate more discrimination

    def gen_gaussian(mean_in, cov_in, class_label):
        nv = multivariate_normal(mean = mean_in, cov = cov_in)
        # reshape keeps a 2-D (n_samples, 2) layout even when n_samples == 1
        X = np.reshape(nv.rvs(n_samples), (n_samples, 2))
        y = np.ones(n_samples, dtype=float) * class_label
        return nv,X,y

    # Generate the non-sensitive features randomly.
    # We will generate one gaussian cluster for each class
    mu1, sigma1 = [2, 2], [[5, 1], [1, 5]]
    mu2, sigma2 = [-2,-2], [[10, 1], [1, 3]]
    nv1, X1, y1 = gen_gaussian(mu1, sigma1, 1) # positive class
    nv2, X2, y2 = gen_gaussian(mu2, sigma2, -1) # negative class

    # join the positive and negative class clusters
    X = np.vstack((X1, X2))
    y = np.hstack((y1, y2))

    # shuffle the data
    perm = list(range(0, n_samples*2))
    shuffle(perm)
    X = X[perm]
    y = y[perm]

    rotation_mult = np.array([[math.cos(disc_factor), -math.sin(disc_factor)], [math.sin(disc_factor), math.cos(disc_factor)]])
    X_aux = np.dot(X, rotation_mult)

    # Generate the sensitive feature here.
    # Density of every rotated point under each class gaussian, evaluated for
    # all points at once instead of one Python-level pdf call per point.
    p1 = np.atleast_1d(nv1.pdf(X_aux))
    p2 = np.atleast_1d(nv2.pdf(X_aux))

    # normalize so that p1 is the probability of belonging to the first cluster
    p1 = p1 / (p1 + p2)

    # One uniform draw per point. A single sized draw consumes the global
    # numpy random stream in the same order as repeated scalar draws.
    r = np.random.uniform(size=len(X))

    # the first cluster is the positive class: r < p1 -> 1.0 (male), otherwise 0.0 (female)
    x_control = np.where(r < p1, 1.0, 0.0)

    # Show the data
    if plot_data:
        num_to_draw = 200 # we will only draw a small number of points to avoid clutter
        x_draw = X[:num_to_draw]
        y_draw = y[:num_to_draw]
        x_control_draw = x_control[:num_to_draw]

        X_s_0 = x_draw[x_control_draw == 0.0]
        X_s_1 = x_draw[x_control_draw == 1.0]
        y_s_0 = y_draw[x_control_draw == 0.0]
        y_s_1 = y_draw[x_control_draw == 1.0]
        plt.scatter(X_s_0[y_s_0==1.0][:, 0], X_s_0[y_s_0==1.0][:, 1], color='green', marker='x', s=30, linewidth=1.5, label= "Prot. +ve")
        plt.scatter(X_s_0[y_s_0==-1.0][:, 0], X_s_0[y_s_0==-1.0][:, 1], color='red', marker='x', s=30, linewidth=1.5, label = "Prot. -ve")
        plt.scatter(X_s_1[y_s_1==1.0][:, 0], X_s_1[y_s_1==1.0][:, 1], color='green', marker='o', facecolors='none', s=30, label = "Non-prot. +ve")
        plt.scatter(X_s_1[y_s_1==-1.0][:, 0], X_s_1[y_s_1==-1.0][:, 1], color='red', marker='o', facecolors='none', s=30, label = "Non-prot. -ve")

        # dont need the ticks to see the data distribution.
        # Booleans are required here: the string 'off' is truthy and leaves the ticks visible.
        plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
        plt.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)
        plt.legend(loc=2, fontsize=15)
        plt.xlim((-15,10))
        plt.ylim((-10,15))
       #         plt.show()
#       plt.savefig("img/data.png")

#     x_control = {"s1": x_control} # all the sensitive features are stored in a dictionary
    return X,y,x_control

def Synth_data():
    """
    Build the synthetic dataset with plotting enabled.

    Returns a tuple (X, y, x_control): X holds the generated feature columns
    followed by the sensitive attribute as the last column, y holds the class
    labels, and x_control is the sensitive attribute on its own.
    """
    features, labels, sensitive = generate_synthetic_data(True)
    # append the sensitive attribute as an extra (last) feature column
    features_with_group = np.column_stack((features, sensitive))
    return features_with_group, labels, sensitive

In [None]:

# NOTE: min_max_lp_all is defined in a later cell of this notebook; that cell
# has to be executed before this one, otherwise the call below fails.
xx, yy, x_control = Synth_data()
print(len(yy))       # number of samples
print(xx.shape[1])   # number of feature columns (including the sensitive one)
gamma = 0.2
u1, u2 = min_max_lp_all(x_control, gamma)

2000000
3
2 2000000


In [7]:
############################---LP3-with epsilon-with accuracy-########################
#############----Synthetic with 5D MIN Max COMMAND----###################### 

import pulp as p 
def min_max_lp_all(data,gamma):
    """
    Assign a binary label to every sample by solving an integer program with PuLP.

    Samples are split into two groups: entries of `data` equal to 1, and all
    other entries. One 0/1 integer variable x_j is created per sample, plus a
    continuous variable z >= 0. Writing
        S_g = sum over samples j in group g of 2 * (x_j - 0.5)
    the program is
        minimize    z
        subject to  z >= S_g,  z >= -S_g,  S_g >= |2*gamma - 1| * size(g)   for each group g
                    z <= 1000000

    The group count, sample count, solver status and objective value are printed.

    Parameters
    ----------
    data : array-like with one entry per sample
        Group indicator; entries equal to 1 form the first group, everything
        else the second.
    gamma : float
        Enters the per-group lower bound through |2*gamma - 1|.

    Returns
    -------
    (dict, dict)
        The first dict maps sample index -> 1 if its variable solved to 1, else -1.
        The second dict holds the opposite sign for every index.
    """
    group_flags = np.asarray(data)
    n = group_flags.shape[0]
    in_first = (group_flags == 1).reshape(n)

    # Indices of the members of each group (group 0: data == 1, group 1: the rest).
    members = [np.flatnonzero(in_first), np.flatnonzero(~in_first)]
    m = len(members)
    print(m,n)

    Lp_prob = p.LpProblem('Problem', p.LpMinimize)

    # one 0/1 variable per sample, named after its index, plus the bound variable z
    x_vars = [p.LpVariable(str(i), lowBound=0, upBound=1, cat='Integer') for i in range(n)]
    z = p.LpVariable("z1", lowBound=0)

    #########objective function#####################

    Lp_prob += z

    ##############constraint#################
    # S_g is built once per group from its members only. Summing over all n
    # samples with a 0/1 mask defines the same constraints but adds n - size(g)
    # zero-coefficient terms to each one.
    balances = []
    for idx in members:
        size = int(idx.size)
        if size == 0:
            # An empty group only yields trivial constraints (z >= 0, 0 >= 0).
            continue
        balance = 2 * p.lpSum([x_vars[j] for j in idx]) - size
        balances.append(balance)
        Lp_prob += z >= balance
        Lp_prob += balance >= abs(2*gamma-1)*size
    for balance in balances:
        Lp_prob += z >= -1 * balance

    z_upper_bound = 1000000  # explicit cap on z
    Lp_prob += z <= z_upper_bound

    #####################################
    status = Lp_prob.solve()   # Solver
    print(p.LpStatus[status])
    print("discripency is:")
    print(p.value(Lp_prob.objective))

    Synth1={}
    Synth2={}
    # Read back the solution. Solvers return floats, so an integer variable may
    # come back as e.g. 0.9999999; threshold instead of testing == 1. A missing
    # value (None) is treated as "not selected", as the equality test did.
    for i, var in enumerate(x_vars):
        solved = p.value(var)
        if solved is not None and solved > 0.5:
            Synth1[i]=1
            Synth2[i]=-1
        else:
            Synth1[i]=-1
            Synth2[i]=1
    Synthu1=Synth1
    Synthu2=Synth2
    return Synthu1,Synthu2


  