In [4]:
from numba.decorators import jit
import numpy as np

In [5]:
#@jit
def distances(X, v, alpha, N, P, k):
    dists = np.zeros((N, P))
    for i in range(N):
        for p in range(P):
            for j in range(k):    
                dists[i, j] += (X[i, p] - v[j, p]) * (X[i, p] - v[j, p]) * alpha[p]
    return dists

In [6]:
#@jit
def M_nk(dists, N, k):
    M_nk = np.zeros((N, k))
    exp = np.zeros((N, k))
    denom = np.zeros(N)
    for i in range(N):
        for j in range(k):
            exp[i, j] = np.exp(-1 * dists[i, j])
            denom[i] += exp[i, j]
        for j in range(k):
            if denom[i]:
                M_nk[i, j] = exp[i, j] / denom[i]
            else:
                M_nk[i, j] = exp[i, j] / 1e-6
    return M_nk

In [7]:
#@jit    
def M_k(M_nk, N, k):
    M_k = np.zeros(k)
    for j in range(k):
        for i in range(N):
            M_k[j] += M_nk[i, j]
        M_k[j] /= N 
    return M_k

In [8]:
#@jit        
def x_n_hat(X, M_nk, v, N, P, k):
    x_n_hat = np.zeros((N, P))
    L_x = 0.0
    for i in range(N):
        for p in range(P):
            for j in range(k):
                x_n_hat[i, p] += M_nk[i, j] * v[j, p]
            L_x += (X[i, p] - x_n_hat[i, p]) * (X[i, p] - x_n_hat[i, p])
    return x_n_hat, L_x

In [9]:
#@jit
def yhat(M_nk, y, w, N, k):
    yhat = np.zeros(N)
    L_y = 0.0
    for i in range(N):
        for j in range(k):
            yhat[i] += M_nk[i, j] * w[j]
        yhat[i] = 1e-6 if yhat[i] <= 0 else yhat[i]
        yhat[i] = 0.999 if yhat[i] >= 1 else yhat[i]
        L_y += -1 * y[i] * np.log(yhat[i]) - (1.0 - y[i]) * np.log(1.0 - yhat[i])
    return yhat, L_y

In [51]:
data_sensitive.shape

(191, 11)

In [55]:
data_nonsensitive.shape

(1666, 11)

In [58]:
my_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
my_list[:2]

[1, 2]

In [57]:
my_list[2:7]

[3, 4, 5, 6, 7]

In [10]:

#@jit
def LFR(params, data_sensitive, data_nonsensitive, y_sensitive, 
        y_nonsensitive,  k=10, A_x = 1e-4, A_y = 0.1, A_z = 1000, results=0):
    
    LFR.iters += 1 
    Ns, P = data_sensitive.shape
    Nns, _ = data_nonsensitive.shape
    
    alpha0 = params[:P]
    alpha1 = params[P : 2 * P]
    w = params[2 * P : (2 * P) + k]
    v = np.matrix(params[(2 * P) + k:]).reshape((k, P))
        
    dists_sensitive = distances(data_sensitive, v, alpha1, Ns, P, k)
    dists_nonsensitive = distances(data_nonsensitive, v, alpha0, Nns, P, k)

    M_nk_sensitive = M_nk(dists_sensitive, Ns, k)
    M_nk_nonsensitive = M_nk(dists_nonsensitive, Nns, k)
    
    M_k_sensitive = M_k(M_nk_sensitive, Ns, k)
    M_k_nonsensitive = M_k(M_nk_nonsensitive, Nns, k)
    
    L_z = 0.0
    for j in range(k):
        L_z += abs(M_k_sensitive[j] - M_k_nonsensitive[j])

    x_n_hat_sensitive, L_x1 = x_n_hat(data_sensitive, M_nk_sensitive, v, Ns, P, k)
    x_n_hat_nonsensitive, L_x2 = x_n_hat(data_nonsensitive, M_nk_nonsensitive, v, Nns, P, k)
    L_x = L_x1 + L_x2

    yhat_sensitive, L_y1 = yhat(M_nk_sensitive, y_sensitive, w, Ns, k)
    yhat_nonsensitive, L_y2 = yhat(M_nk_nonsensitive, y_nonsensitive, w, Nns, k)
    L_y = L_y1 + L_y2

    criterion = A_x * L_x + A_y * L_y + A_z * L_z

    if LFR.iters % 250 == 0:
        print(LFR.iters, criterion)
      
    if results:
        return yhat_sensitive, yhat_nonsensitive, M_nk_sensitive, M_nk_nonsensitive
    else:
        return criterion
LFR.iters = 0

In [11]:
#del helpers

In [12]:
import pickle
import os
import numpy as np
import csv
import scipy.optimize as optim
#from helpers import *
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd

In [13]:
# k = number of propotypes
k = 12
firstLine = True # assume csv has header column

In [14]:

#with open('', 'rb') as f:
 #   filk = csv.reader(f)
  #  dat = []
   # for row in filk:
    #    if firstLine:
     #       firstLine = False
      #      continue
       # dat.append([float(r) for r in row])
#
#print('finished reading data')

In [72]:
data = pd.read_csv("wage_num_header.csv")

In [73]:
data.head()

Unnamed: 0.1,Unnamed: 0,year,age,maritl,education,region,jobclass,health,health_ins,logwage,wage,race_cat,high_wage
0,1,2006,18,1,1,2,1,1,2,4.318063,75.043154,0,1
1,2,2004,24,1,4,2,2,2,2,4.255273,70.47602,0,1
2,3,2003,45,2,3,2,1,1,1,4.875061,130.982177,0,2
3,5,2005,50,4,2,2,2,1,1,4.318063,75.043154,0,1
4,6,2008,54,2,4,2,2,2,1,4.845098,127.115744,0,1


In [74]:
data = np.array(data)

In [75]:
y = np.array(data[:,-1]).flatten()
y

array([1., 1., 2., ..., 1., 1., 1.])

In [76]:
data = data[:,:-1]
data

array([[1.00000000e+00, 2.00600000e+03, 1.80000000e+01, ...,
        4.31806333e+00, 7.50431540e+01, 0.00000000e+00],
       [2.00000000e+00, 2.00400000e+03, 2.40000000e+01, ...,
        4.25527251e+00, 7.04760196e+01, 0.00000000e+00],
       [3.00000000e+00, 2.00300000e+03, 4.50000000e+01, ...,
        4.87506126e+00, 1.30982177e+02, 0.00000000e+00],
       ...,
       [2.99800000e+03, 2.00500000e+03, 2.70000000e+01, ...,
        4.19312460e+00, 6.62294083e+01, 1.00000000e+00],
       [2.99900000e+03, 2.00500000e+03, 2.70000000e+01, ...,
        4.47712125e+00, 8.79810328e+01, 0.00000000e+00],
       [3.00000000e+03, 2.00900000e+03, 5.50000000e+01, ...,
        4.50514998e+00, 9.04819134e+01, 0.00000000e+00]])

In [77]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.33, random_state=42)

In [78]:
sensitive_train = X_train[:,-1]
sensitive_train.size

1857

In [79]:
sensitive_test = X_test[:,-1]
sensitive_test.size

916

In [80]:
sensitive_train_idx = np.array(np.where(sensitive_train==1))[0].flatten()
nonsensitive_train_idx = np.array(np.where(sensitive_train==0))[0].flatten()

In [81]:
sensitive_test_idx = np.array(np.where(sensitive_test==1))[0].flatten()
nonsensitive_test_idx = np.array(np.where(sensitive_test==0))[0].flatten()

In [82]:
X_train = X_train[:,:-1]

In [83]:
data_sensitive = X_train[sensitive_train_idx,:]
data_nonsensitive = X_train[nonsensitive_train_idx,:]
y_sensitive = y_train[sensitive_train_idx]
y_nonsensitive = y_train[nonsensitive_train_idx]

In [112]:
src = 'wage_num.csv'
if os.path.isfile(src):
    with open(src, 'r') as f:
        rez = f.read().split('\n')[:-1]

list_of_lists = []
for row in rez:
    row_list = row.split(',')
    row_list = row_list[:-2]
    list_of_lists.append(row_list)

flat_list = []
for sublist in list_of_lists:
    for item in sublist:
        flat_list.append(item)
    
    rez = np.array([float(i) for i in flat_list])
    print(LFR(rez, data_sensitive, data_nonsensitive, y_sensitive, 
              y_nonsensitive, k, 1e-4, 0.1, 1000, 0))
else:
    print('not loading')
    rez = np.random.uniform(size=data.shape[1] * 2 + k + data.shape[1] * k)

bnd = []
for i, k2 in enumerate(rez):
    if i < data.shape[1] * 2 or i >= data.shape[1] * 2 + k:
        bnd.append((None, None))
    else:
        bnd.append((0, 1))

rez = optim.fmin_l_bfgs_b(LFR, x0=rez, epsilon=1e-5, 
                          args=(training_sensitive, training_nonsensitive, 
                                ytrain_sensitive, ytrain_nonsensitive, k, 1e-4,
                                0.1, 1000, 0),
                          bounds = bnd, approx_grad=True, maxfun=150000, 
                          maxiter=150000)

ValueError: cannot reshape array of size 0 into shape (12,11)

In [133]:
src = 'wage_num.csv'
if os.path.isfile(src):
    with open(src, 'r') as f:
        rez = f.read().split('\n')[:-1]

In [134]:
rez[1]

'2,2004,24,1,4,2,2,2,2,4.25527250510331,70.4760196469445,0,1'

In [135]:
len(rez)

2772

In [136]:
list_of_lists = []
for row in rez:
    row_list = row.split(',')
    row_list = row_list[:-2]
    list_of_lists.append(row_list)

In [137]:
list_of_lists

[['1',
  '2006',
  '18',
  '1',
  '1',
  '2',
  '1',
  '1',
  '2',
  '4.31806333496276',
  '75.0431540173515'],
 ['2',
  '2004',
  '24',
  '1',
  '4',
  '2',
  '2',
  '2',
  '2',
  '4.25527250510331',
  '70.4760196469445'],
 ['3',
  '2003',
  '45',
  '2',
  '3',
  '2',
  '1',
  '1',
  '1',
  '4.8750612633917',
  '130.982177377461'],
 ['5',
  '2005',
  '50',
  '4',
  '2',
  '2',
  '2',
  '1',
  '1',
  '4.31806333496276',
  '75.0431540173515'],
 ['6',
  '2008',
  '54',
  '2',
  '4',
  '2',
  '2',
  '2',
  '1',
  '4.84509804001426',
  '127.115743812184'],
 ['9',
  '2006',
  '41',
  '1',
  '3',
  '2',
  '2',
  '2',
  '1',
  '4.77815125038364',
  '118.884359339886'],
 ['10',
  '2004',
  '52',
  '2',
  '2',
  '2',
  '2',
  '2',
  '1',
  '4.85733249643127',
  '128.680488220624'],
 ['11',
  '2007',
  '45',
  '4',
  '3',
  '2',
  '2',
  '1',
  '1',
  '4.76342799356294',
  '117.146816914805'],
 ['12',
  '2007',
  '34',
  '2',
  '2',
  '2',
  '1',
  '2',
  '2',
  '4.39794000867204',
  '81.2832532

In [138]:
flat_list = []
for sublist in list_of_lists:
    for item in sublist:
        flat_list.append(item)

In [139]:
flat_list

['1',
 '2006',
 '18',
 '1',
 '1',
 '2',
 '1',
 '1',
 '2',
 '4.31806333496276',
 '75.0431540173515',
 '2',
 '2004',
 '24',
 '1',
 '4',
 '2',
 '2',
 '2',
 '2',
 '4.25527250510331',
 '70.4760196469445',
 '3',
 '2003',
 '45',
 '2',
 '3',
 '2',
 '1',
 '1',
 '1',
 '4.8750612633917',
 '130.982177377461',
 '5',
 '2005',
 '50',
 '4',
 '2',
 '2',
 '2',
 '1',
 '1',
 '4.31806333496276',
 '75.0431540173515',
 '6',
 '2008',
 '54',
 '2',
 '4',
 '2',
 '2',
 '2',
 '1',
 '4.84509804001426',
 '127.115743812184',
 '9',
 '2006',
 '41',
 '1',
 '3',
 '2',
 '2',
 '2',
 '1',
 '4.77815125038364',
 '118.884359339886',
 '10',
 '2004',
 '52',
 '2',
 '2',
 '2',
 '2',
 '2',
 '1',
 '4.85733249643127',
 '128.680488220624',
 '11',
 '2007',
 '45',
 '4',
 '3',
 '2',
 '2',
 '1',
 '1',
 '4.76342799356294',
 '117.146816914805',
 '12',
 '2007',
 '34',
 '2',
 '2',
 '2',
 '1',
 '2',
 '2',
 '4.39794000867204',
 '81.2832532842527',
 '13',
 '2005',
 '35',
 '1',
 '2',
 '2',
 '2',
 '2',
 '1',
 '4.49415459401844',
 '89.4924795180001

In [143]:
flat_list = [float(i) for i in flat_list]

In [144]:
type(flat_list[1])

float

In [146]:
array = np.array(flat_list)

In [147]:
len(array)

30492

In [113]:
Ns, P = data_sensitive.shape

In [114]:
Nns, _ = data_nonsensitive.shape

In [148]:
params = np.array([float(i) for i in flat_list])
len(params)

30492

In [149]:
alpha0 = params[:P]
alpha0

array([1.00000000e+00, 2.00600000e+03, 1.80000000e+01, 1.00000000e+00,
       1.00000000e+00, 2.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       2.00000000e+00, 4.31806333e+00, 7.50431540e+01])

In [150]:
alpha1 = params[P : 2 * P]
alpha1[2]

24.0

In [151]:
len(params)

30492

In [152]:
alpha1 = params[1:4]
alpha1

array([2.006e+03, 1.800e+01, 1.000e+00])

In [153]:
w = params[2 * P : (2 * P) + k] 

In [154]:
v = np.matrix(params[(2 * P) + k:]).reshape((k, P))

ValueError: cannot reshape array of size 30458 into shape (12,11)

In [None]:
#with open('wage_num.csv', 'rb') as f:
#  indices = pickle.load(f)

In [28]:
#idx=indices[0]
#training_sensitive = data_sensitive[idx,:]
#ytrain_sensitive = y_sensitive[idx]
#idx2=indices[1]
#test_sensitive = data_sensitive[idx2,:]
#ytest_sensitive = y_sensitive[idx2]
#indices.append(idx)
#indices.append(idx2)

#idx=indices[2]
#training_nonsensitive = data_nonsensitive[idx,:]
#ytrain_nonsensitive = y_nonsensitive[idx]
#idx2=indices[3]
#test_nonsensitive = data_nonsensitive[idx2,:]
#ytest_nonsensitive = y_nonsensitive[idx2]

In [29]:
#indices = []
#
#idx=np.array(list(set(np.random.randint(0, data_sensitive.shape[0], 3000))))
#training_sensitive = data_sensitive[idx,:]
#ytrain_sensitive = y_sensitive[idx]
#idx2=np.array([i for i in range(data_sensitive.shape[0]) if i not in idx])
#test_sensitive = data_sensitive[idx2,:]
#ytest_sensitive = y_sensitive[idx2]
#indices.append(idx)
#indices.append(idx2)
#
#idx=np.array(list(set(np.random.randint(0, data_nonsensitive.shape[0], 6000))))
#training_nonsensitive = data_nonsensitive[idx,:]
#ytrain_nonsensitive = y_nonsensitive[idx]
#idx2=np.array([i for i in range(data_nonsensitive.shape[0]) if i not in idx])
#test_nonsensitive = data_nonsensitive[idx2,:]
#ytest_nonsensitive = y_nonsensitive[idx2]
#indices.append(idx)
#indices.append(idx2)
# 
##with open('d:/dropbox/crime_lab_ny/fair_algorithms/data/indices_zemel.csv', 'wb') as f:
##  pickle.dump(indices, f)

In [30]:
#training = np.concatenate((training_sensitive, training_nonsensitive))
#ytrain = np.concatenate((ytrain_sensitive, ytrain_nonsensitive))

#test = np.concatenate((test_sensitive, test_nonsensitive))
#ytest = np.concatenate((ytest_sensitive, ytest_nonsensitive))

array([0.76682394, 0.62261533, 0.95983169, 0.22081032, 0.24562696,
       0.8356629 , 0.6614533 , 0.95853011, 0.82994536, 0.37458413,
       0.6863825 , 0.89890159])

(2772, 12)

12

In [34]:
rez = np.random.uniform(size=data.shape[1] * 2 + k + data.shape[1] * k)

result = LFR(rez, data_sensitive, data_nonsensitive, y_sensitive, 
              y_nonsensitive, k, 1e-4, 0.1, 1000, 0)

print(result)
    

ValueError: cannot reshape array of size 146 into shape (12,11)

In [35]:
src = 'wage_num.csv'
if os.path.isfile(src):
    with open(src, 'rb') as f:
        rez = str(f.read()).split('\n')[:-1]
    rez = np.array([float(r) for r in rez])
    result = LFR(rez, data_sensitive, data_nonsensitive, y_sensitive, 
              y_nonsensitive, k, 1e-4, 0.1, 1000, 0)
    print(result)
else:
    print('not loading')
    rez = np.random.uniform(size=data.shape[1] * 2 + k + data.shape[1] * k)


ValueError: cannot reshape array of size 0 into shape (12,11)

In [36]:
src = 'wage_num.csv'
if os.path.isfile(src):
    with open(src, 'rb') as f:
        obj = f.read()

In [37]:
str(obj).split('\n')[0]

"b'1,2006,18,1,1,2,1,1,2,4.31806333496276,75.0431540173515,0,1\\r\\n2,2004,24,1,4,2,2,2,2,4.25527250510331,70.4760196469445,0,1\\r\\n3,2003,45,2,3,2,1,1,1,4.8750612633917,130.982177377461,0,2\\r\\n5,2005,50,4,2,2,2,1,1,4.31806333496276,75.0431540173515,0,1\\r\\n6,2008,54,2,4,2,2,2,1,4.84509804001426,127.115743812184,0,1\\r\\n9,2006,41,1,3,2,2,2,1,4.77815125038364,118.884359339886,1,1\\r\\n10,2004,52,2,2,2,2,2,1,4.85733249643127,128.680488220624,0,1\\r\\n11,2007,45,4,3,2,2,1,1,4.76342799356294,117.146816914805,0,1\\r\\n12,2007,34,2,2,2,1,2,2,4.39794000867204,81.2832532842527,0,1\\r\\n13,2005,35,1,2,2,2,2,1,4.49415459401844,89.4924795180001,0,1\\r\\n14,2003,39,2,4,2,1,2,1,4.90308998699194,134.705375118879,0,2\\r\\n15,2009,54,2,2,2,2,2,1,4.90308998699194,134.705375118879,0,2\\r\\n16,2009,51,2,3,2,1,2,1,4.50514997831991,90.4819133566401,0,1\\r\\n18,2006,50,2,5,2,2,2,2,5.36055176170483,212.842352315711,0,2\\r\\n19,2007,56,2,4,2,1,1,1,4.86102634169657,129.156693004704,0,2\\r\\n20,2003,37,1,3

In [38]:
print('hej')

hej


In [39]:

bnd = []
for i, k2 in enumerate(rez):
    if i < data.shape[1] * 2 or i >= data.shape[1] * 2 + k:
        bnd.append((None, None))
    else:
        bnd.append((0, 1))

rez = optim.fmin_l_bfgs_b(LFR, x0=rez, epsilon=1e-5, 
                          args=(training_sensitive, training_nonsensitive, 
                                ytrain_sensitive, ytrain_nonsensitive, k, 1e-4,
                                0.1, 1000, 0),
                          bounds = bnd, approx_grad=True, maxfun=150000, 
                          maxiter=150000)



NameError: name 'training_sensitive' is not defined