In [1]:
from sklearn.feature_extraction import FeatureHasher
import pandas as pd
from sklearn import preprocessing
import numpy as np
import cvxpy as cp
from load_data import load_adult
from sklearn import svm
from sklearn.metrics import accuracy_score
from measures import equalized_odds_measure_TP
from sklearn.model_selection import GridSearchCV
from cvxopt import matrix
import numpy as np
from numpy import linalg
import cvxopt
import cvxopt.solvers
from sklearn.base import BaseEstimator
from sklearn.metrics.pairwise import rbf_kernel

In [2]:
def get_sensitive_attr(data_train, data_test, col_name):
    train_sensitive_attr = {}
    test_sensitive_attr = {}
    for i in range(data_train.shape[0]):
        try:
            train_sensitive_attr[data_train.loc[i, col_name]].append(i)
        except:
            train_sensitive_attr[data_train.loc[i, col_name]] = [i]
            
    for i in range(data_test.shape[0]):
        try:
            test_sensitive_attr[data_test.loc[i, col_name]].append(i)
        except:
            test_sensitive_attr[data_test.loc[i, col_name]] = [i]
            
    return train_sensitive_attr, test_sensitive_attr

In [3]:
def shared_loss(W, X, Y):
    s = 0
    sensitive_group_keys = list(train_sensitive_attr.keys())
    for x in range(len(sensitive_group_keys)):
        L_s = 0
        L_t_hat = 0
        for i in train_sensitive_attr[sensitive_group_keys[x]]:
            L_t_hat += cp.maximum(0, 1 - Y[i]*(cp.matmul(W, X[i, :]))) #hinge_loss(i)
        L_s = L_t_hat / float(len(train_sensitive_attr[sensitive_group_keys[x]]))
        s += L_s
    return s

In [4]:
def sum_squared_weights(W):
    s = 0
    for i in range(W.shape[0]):
        for j in range(W.shape[1]): 
            s += cp.norm(W[i, j], p=2)**2
    return s

In [5]:
def sum_group_losses(W, X, Y):
    s = 0
    sensitive_group_keys = list(train_sensitive_attr.keys())
    for x in range(len(sensitive_group_keys)):
        L_s = 0
        L_t_hat = 0
        for i in train_sensitive_attr[sensitive_group_keys[x]]:
            L_t_hat += cp.maximum(0, 1 - Y[i]*(cp.matmul(W[x, :], X[i, :]))) #hinge_loss(i)
        L_s = L_t_hat / float(len(train_sensitive_attr[sensitive_group_keys[x]]))
        s += L_s
    return s

In [6]:
def predict(X, Y, W, b):
    preds = []
    for i in range(X.shape[0]):
        if W[0] * X[i, 0] + W[1] * X[i, 1] + b >= 0:
            preds.append(1)
        else:
            preds.append(-1)
    return preds

In [7]:
#Only for STL and MTL

def make_SM_constraints(train_sensitive_attr, W0, X, Y):
    constraints = []
    dict_keys = list(train_sensitive_attr.keys())
    u_1_pos = np.zeros(X.shape[1])
    u_1_pos_count = 0
    u_1_neg = np.zeros(X.shape[1])
    u_1_neg_count = 0
    for i in train_sensitive_attr[dict_keys[0]]:
        if Y[i] == 1:
            u_1_pos += X[i, :]
            u_1_pos_count += 1
        else:
            u_1_neg += X[i, :]
            u_1_neg_count += 1
    u_1_pos = u_1_pos / u_1_pos_count
    u_1_neg = u_1_neg / u_1_neg_count
    
    for i in range(1, len(dict_keys)):
        temp_u_pos = np.zeros(X.shape[1])
        temp_u_pos_count = 0
        temp_u_neg = np.zeros(X.shape[1])
        temp_u_neg_count = 0
        for x in train_sensitive_attr[dict_keys[i]]:
            if Y[x] == 1:
                temp_u_pos += X[i, :]
                temp_u_pos_count += 1 
            else:
                temp_u_neg += X[i, :]
                temp_u_neg_count += 1
        temp_u_pos = temp_u_pos / temp_u_pos_count
        temp_u_neg = temp_u_neg / temp_u_neg_count
        constraints.append(cp.matmul(W0, u_1_pos - temp_u_pos) == 0)
        constraints.append(cp.matmul(W0, u_1_neg - temp_u_neg) == 0)
    return constraints
        

In [8]:
train_data = pd.read_csv('final_hashed_train_data.csv')
test_data = pd.read_csv('final_hashed_test_data.csv')

In [9]:
train_data_X_with_race = train_data.drop('income', axis=1)
test_data_X_with_race = test_data.drop('income', axis=1)

In [10]:
train_data_X_no_race = train_data_X_with_race.drop('race', axis=1)
test_data_X_no_race = test_data_X_with_race.drop('race', axis=1)

In [11]:
train_data_Y = train_data.loc[:, 'income']
test_data_Y = test_data.loc[:, 'income']

In [12]:
data_train = pd.read_csv(
        "./datasets/adult/adult.data",
        names=[
            "Age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
            "occupation", "relationship", "race", "gender", "capital gain", "capital loss",
            "hours per week", "native-country", "income"],
            # dtype=object,
            # sep=r'\s*,\s*',
            # engine='python',
            #na_values="?"
            )

In [13]:
data_test = pd.read_csv(
        "./datasets/adult/adult.test",
        names=[
            "Age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
            "occupation", "relationship", "race", "gender", "capital gain", "capital loss",
            "hours per week", "native-country", "income"],
            # dtype=object,
            # sep=r'\s*,\s*',
            # engine='python',
            #na_values="?"
            )

In [14]:
train_sensitive_attr, test_sensitive_attr = get_sensitive_attr(data_train, data_test, 'race')

In [15]:
num_sensitive_features = len(train_sensitive_attr.keys())
num_bin
s = 5

In [16]:
train_sensitive_attr.keys()

dict_keys([' White', ' Black', ' Asian-Pac-Islander', ' Amer-Indian-Eskimo', ' Other'])

In [17]:
train_data_X_no_race.shape

(32561, 5)

In [93]:
constraints

[Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ()))]

In [96]:
W = cp.Variable((num_sensitive_features + 1, num_bins))
theta = 0.7
Lambda = 0.6
#rho = cp.Variable(1)
rho = 0.8 #cp.Parameter(shape=1, sign="positive", value=0.5)

temp_mat = train_data_X_no_race.as_matrix()
temp_Y = train_data_Y.as_matrix()
#sum_group_losses(W, temp_mat, train_data_Y)
constraints = make_SM_constraints(train_sensitive_attr, W[0, :], temp_mat, temp_Y)

obj = cp.Minimize(rho * Lambda * cp.norm(W[0, :], p =2)**2 + 
            theta * shared_loss(W[0, :], temp_mat, temp_Y) + 
            (rho * (1 - Lambda) / float(num_sensitive_features)) * sum_squared_weights(W[1:, :]) + 
            (1 - theta) / float(num_sensitive_features) * sum_group_losses(W[1:, :], temp_mat, temp_Y))
prob = cp.Problem(obj, constraints)
prob.solve()
print("status:", prob.status)
print("optimal value", W.value)


  import sys
  


status: optimal
optimal value [[ 1.85311391e-14 -3.01526883e-14  2.87753991e-14 -9.16510686e-15
   3.43138225e-16]
 [ 6.99455220e-02 -1.54252792e-01 -6.33727918e-02  9.03456445e-02
  -2.14339263e-01]
 [ 6.20408559e-02 -1.71430805e-01 -8.36727085e-02  1.09389948e-01
  -2.60406700e-01]
 [ 5.38102543e-02 -9.89257498e-02  3.21147136e-02 -7.87465913e-02
  -2.62038318e-01]
 [ 4.78146957e-02 -1.47195427e-01  5.72749196e-02  4.78146959e-02
  -3.03114208e-01]
 [ 1.38376378e-02 -2.23131917e-01  3.11346864e-02  3.80535049e-02
  -2.78482472e-01]]


In [19]:
constraints

[Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ())),
 Equality(Expression(AFFINE, UNKNOWN, ()), Constant(CONSTANT, ZERO, ()))]

In [88]:
sensitive_keys = list(train_sensitive_attr.keys())
def predict(X, Y, W, sensitive_keys, sensitive_dict):
    preds = []
    for i in range(len(sensitive_keys)):
        #print(sensitive_keys[i])
        for j in range(len(sensitive_dict[sensitive_keys[i]])):
            #print(sensitive_dict[sensitive_keys[i]][j])
            if np.matmul(W[0, :], X[sensitive_dict[sensitive_keys[i]][j]]) + np.matmul(W[i, :], X[sensitive_dict[sensitive_keys[i]][j]]) >= 0:
                preds.append(1)
            else:
                preds.append(-1)
    return preds

In [97]:
temp_W = np.array(W.value)
predictions = predict(temp_mat, temp_Y, temp_W, sensitive_keys, train_sensitive_attr)

In [98]:
def accuracy(Y, predictions, sensitive_keys, sensitive_dict):
    correct = 0
    predict_count = 0
    for i in range(len(sensitive_keys)):
        #print(sensitive_keys[i])
        for j in range(len(sensitive_dict[sensitive_keys[i]])):
            #print(sensitive_dict[sensitive_keys[i]][j])
            if Y[sensitive_dict[sensitive_keys[i]][j]] == predictions[predict_count]:
                correct += 1
            predict_count += 1
            
    print('Accuracy :', correct / len(predictions))

In [99]:
accuracy(train_data_Y, predictions, sensitive_keys, train_sensitive_attr)

Accuracy : 0.587789072817174


In [49]:
train_sensitive_attr

{' White': [0,
  1,
  2,
  5,
  7,
  8,
  9,
  12,
  16,
  17,
  18,
  19,
  20,
  23,
  24,
  25,
  26,
  28,
  29,
  30,
  32,
  33,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  46,
  47,
  48,
  49,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  94,
  95,
  96,
  97,
  98,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  118,
  119,
  120,
  121,
  123,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  135,
  136,
  137,
  138,
  140,
  141,
  142,
  143,
  144,
  146,
  147,
  148,
  149,
  150,
  151,
  153,
  154,
  155,
  156,
  158,
  160,
  161,
  164,
  165,
  166,
  167,
  168,
  169,
  170,
  171,
  172,
  173,
  174,
  175,
  178,
  179,
  181,
  183,
  184,
  186,
  187,
  188,


In [71]:
len(predictions)

32561

In [76]:
pcount = 0
ncount = 0
for i in range(len(train_data_Y)):
    if train_data_Y[i] == -1:
        ncount += 1
    else:
        pcount += 1

In [78]:
ncount

24720

In [79]:
pcount

7841