In [176]:
# Mount Google Drive into the Colab runtime at /content/drive; the data-loading
# cells below read and write the COMPAS CSV through a path under "drive/".
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
from __future__ import division
import os,sys
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn import feature_extraction
from sklearn import preprocessing
from random import seed, shuffle
import urllib

Dataset preparation code was taken from https://github.com/mbilalzafar/fair-classification/tree/master/disparate_mistreatment

In [0]:
def split_into_train_test(x_all, y_all, x_control_all, train_fold_size):
    """Cut the data into a leading train part and a trailing test part.

    No shuffling happens here: the first ``round(n_rows * train_fold_size)``
    rows become the train fold and the remaining rows the test fold.

    Parameters
    ----------
    x_all : array with a ``shape`` attribute; rows are examples.
    y_all : sliceable sequence of labels, aligned with ``x_all``.
    x_control_all : dict mapping a name to a sliceable sequence aligned with ``x_all``.
    train_fold_size : fraction of the rows that goes into the train fold.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_control_train, x_test, y_test, x_control_test)``
    """
    n_rows = x_all.shape[0]
    cut = int(round(float(n_rows) * train_fold_size))

    # One slice object per fold keeps every container cut at the same row.
    head = slice(None, cut)
    tail = slice(cut, None)

    x_control_all_train = {name: values[head] for name, values in x_control_all.items()}
    x_control_all_test = {name: values[tail] for name, values in x_control_all.items()}

    return x_all[head], y_all[head], x_control_all_train, x_all[tail], y_all[tail], x_control_all_test


In [0]:
def add_intercept(x):
    """Return a copy of the 2-d matrix ``x`` with a leading column of ones.

    The extra column plays the role of the constant (bias) term for a
    linear classifier.
    """
    n_rows, _ = x.shape  # the unpacking only succeeds for a 2-d input
    bias_column = np.ones((n_rows, 1))  # the constant b
    return np.concatenate([bias_column, x], axis=1)

In [0]:
#sys.path.insert(0, '../../fair_classification/') # the code for fair classification is in this directory


# Seed Python's `random` module and NumPy's global RNG so that the row shuffle
# done while loading the data is repeatable across runs.
# NOTE(review): no torch seed is set in the cells visible here — confirm
# whether the network initialisation should be seeded as well.
SEED = 1234
seed(SEED)
np.random.seed(SEED)

"""
    The COMPAS dataset can be obtained from: https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv
    The code will look for the data file in the Google Drive data directory; if it is not found, it will be downloaded from GitHub.
"""

def check_data_file(fname, data_dir="drive/My Drive/KAIST/Lab/COMPAS/"):
    """Make sure the data file ``fname`` exists in ``data_dir``, downloading it if missing.

    Parameters
    ----------
    fname : str
        Name of the CSV file (a bare file name, not a path).
    data_dir : str, optional
        Directory that is searched and, if needed, written to. Defaults to
        the Google Drive folder this notebook has always used.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If the file is missing and the download fails.
    OSError
        If the downloaded file cannot be written.
    """
    # Imported locally: a bare `import urllib` does not guarantee that the
    # `urllib.request` submodule has been loaded.
    import urllib.request

    target = os.path.join(data_dir, fname)
    print ("Looking for file '%s' in the current directory..." % fname)

    if os.path.isfile(target):
        print( "File found in current directory..")
        return

    print("'%s' not found! Downloading from GitHub..." % fname)
    addr = "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"
    # Context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(addr) as response:
        payload = response.read()

    # Create the folder on first use instead of failing on a missing directory.
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)
    # Write the raw bytes as received; the payload is deliberately not
    # printed, since that would dump the whole CSV into the cell output.
    with open(target, "wb") as file_out:
        file_out.write(payload)
    print( "'%s' download and saved locally.." % fname)
    

def load_compas_data():
	"""Load the COMPAS CSV, filter its rows and build the classifier inputs.

	Steps, in order: make sure the CSV exists (check_data_file), read it with
	pandas, drop rows whose days_b_screening_arrest is missing, apply the row
	filters below, encode the features, shuffle the rows and prepend an
	intercept column. Class counts and the final feature names are printed.

	Returns
	-------
	X : 2-d array
		Intercept column followed by the encoded features, one row per example.
	y : 1-d array
		Values of the CLASS_FEATURE column with 0 replaced by -1.
	x_control : dict
		Maps each name in SENSITIVE_ATTRS to the flattened 1-d array of its
		binarized values, in the same row order as X.
	"""


	FEATURES_CLASSIFICATION = ["age","race", "sex", "juv_fel_count", "juv_misd_count", "juv_other_count", "priors_count", "c_charge_degree"] #features to be used for classification
	# Every feature NOT listed here (including age and the juvenile counts) goes through LabelBinarizer below.
	CONT_VARIABLES = ["priors_count"] # continuous features, will need to be handled separately from categorical features, categorical features will be encoded using one-hot
	CLASS_FEATURE = "two_year_recid" # the decision variable
	SENSITIVE_ATTRS = ["race"]


	COMPAS_INPUT_FILE = "compas-scores-two-years3.csv"
	check_data_file(COMPAS_INPUT_FILE)

	# load the data and get some stats
	df = pd.read_csv("drive/My Drive/KAIST/Lab/COMPAS/" + COMPAS_INPUT_FILE)
	df = df.dropna(subset=["days_b_screening_arrest"]) # dropping missing vals
	
	# convert to np array
	data = df.to_dict('list')
	for k in data.keys():
		data[k] = np.array(data[k])


	""" Filtering the data """

	# These filters are the same as propublica (refer to https://github.com/propublica/compas-analysis)
	# If the charge date of a defendants Compas scored crime was not within 30 days from when the person was arrested, we assume that because of data quality reasons, that we do not have the right offense. 
	idx = np.logical_and(data["days_b_screening_arrest"]<=30, data["days_b_screening_arrest"]>=-30)


	# We coded the recidivist flag -- is_recid -- to be -1 if we could not find a compas case at all.
	idx = np.logical_and(idx, data["is_recid"] != -1)

	# In a similar vein, ordinary traffic offenses -- those with a c_charge_degree of 'O' -- will not result in Jail time are removed (only two of them).
	idx = np.logical_and(idx, data["c_charge_degree"] != "O") # F: felony, M: misdemeanor

	# We filtered the underlying data from Broward county to include only those rows representing people who had either recidivated in two years, or had at least two years outside of a correctional facility.
	# NOTE(review): pandas' default NA parsing may already have turned the text "NA" into NaN,
	# in which case this comparison removes nothing — confirm.
	idx = np.logical_and(idx, data["score_text"] != "NA")

	# we will only consider blacks and whites for this analysis
	idx = np.logical_and(idx, np.logical_or(data["race"] == "African-American", data["race"] == "Caucasian"))

	# select the examples that satisfy this criteria
	for k in data.keys():
		data[k] = data[k][idx]



	""" Feature normalization and one hot encoding """

	# convert class label 0 to -1
	# (y is the same array object as data[CLASS_FEATURE], so the dict entry is rewritten too)
	y = data[CLASS_FEATURE]
	y[y==0] = -1

	
	
	print( "\nNumber of people recidivating within two years")
	print( pd.Series(y).value_counts())
	print( "\n")


	X = np.array([]).reshape(len(y), 0) # empty array with num rows same as num examples, will hstack the features to it
	x_control = defaultdict(list)

	feature_names = []
	for attr in FEATURES_CLASSIFICATION:
		vals = data[attr]
		if attr in CONT_VARIABLES:
			vals = [float(v) for v in vals]
			vals = preprocessing.scale(vals) # 0 mean and 1 variance  
			vals = np.reshape(vals, (len(y), -1)) # convert from 1-d arr to a 2-d arr with one col

		else: # for binary categorical variables, the label binarizer uses just one var instead of two
			lb = preprocessing.LabelBinarizer()
			lb.fit(vals)
			vals = lb.transform(vals)

		# add to sensitive features dict
		if attr in SENSITIVE_ATTRS:
			x_control[attr] = vals


		# add to learnable features
		# (the sensitive attribute is stored in x_control above AND kept as a model input here)
		X = np.hstack((X, vals))

		if attr in CONT_VARIABLES: # continuous feature, just append the name
			feature_names.append(attr)
		else: # categorical features
			if vals.shape[1] == 1: # binary features that passed through label binarizer
				feature_names.append(attr)
			else:
				# `lb` is the binarizer fitted for this attr in the else-branch above
				for k in lb.classes_: # non-binary categorical features, need to add the names for each cat
					feature_names.append(attr + "_" + str(k))


	# convert the sensitive feature to 1-d array
	x_control = dict(x_control)
	for k in x_control.keys():
		assert(x_control[k].shape[1] == 1) # make sure that the sensitive feature is binary after one hot encoding
		x_control[k] = np.array(x_control[k]).flatten()

	# sys.exit(1)

	"""permute the date randomly"""
	# shuffle() comes from Python's `random` module, so the row order depends on that module's seed;
	# the same permutation is applied to X, y and every x_control array to keep them aligned.
	perm = list(range(0,X.shape[0]))
	shuffle(perm)
	X = X[perm]
	y = y[perm]
	for k in x_control.keys():
		x_control[k] = x_control[k][perm]


	X = add_intercept(X)

	feature_names = ["intercept"] + feature_names
	assert(len(feature_names) == X.shape[1])
	print( "Features we will be using for classification are:", feature_names, "\n")


	return X, y, x_control

In [181]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Two-layer fully connected classifier: Linear -> ReLU -> Linear -> softmax.

    Parameters
    ----------
    input_size : int
        Number of input features per example.
    hidden_size : int
        Width of the hidden layer.
    num_classes : int
        Number of output classes (size of the probability vector).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(input_size, hidden_size)
        # activation function ReLU
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class probabilities for ``x`` whose last axis has ``input_size`` entries.

        The output has the same leading axes as ``x`` and ``num_classes``
        entries on the last axis; those entries sum to 1.
        """
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        # Name the class axis explicitly: calling softmax without `dim` is
        # deprecated, emits a warning, and picks axis 0 for 3-d input.
        return nn.functional.softmax(x, dim=-1)

#device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Everything is pinned to the CPU; the CUDA auto-selection above is left disabled.
device = torch.device("cpu")
# NOTE(review): input_size=94 is hard-coded; presumably it has to equal the
# number of columns of the feature matrix (intercept included) — confirm
# against the loaded data before changing the feature list.
model = Net(input_size=94, hidden_size=100, num_classes=2).to(device)

print(model)

Net(
  (fc1): Linear(in_features=94, out_features=100, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=100, out_features=2, bias=True)
)


In [0]:
# Two optimizers are built over the same model parameters; only the one bound
# to `optimizer` is stepped by the training loop.
# NOTE(review): weight_decay=1 is a very strong L2 penalty — confirm it is intended.
adam = torch.optim.Adam(model.parameters(), lr=0.1, weight_decay=1)
sgd = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=1)
optimizer = adam

In [0]:
# Module-level call counter; the loss closure built by my_torch_loss increments it.
# (A `global` statement at module level is a no-op, so plain assignment suffices.)
my_counter = 0


def helper_loss(out, trg, alpha):
    """Sum the per-example "benefit" of the predictions in ``out``.

    For every row the coefficients ``(c, d)`` depend on the target:
    ``(0.25, 1.25)`` when ``trg[i, 0] > trg[i, 1]``, otherwise ``(0.5, 0.5)``.
    The benefit is ``(d - c * out[i, 0]) ** alpha`` when column 0 of the
    prediction is the larger one, otherwise ``(d + c * out[i, 1]) ** alpha``.

    Parameters
    ----------
    out : tensor of shape (n, 2)
        Model outputs; gradients flow through the returned tensor.
    trg : tensor of shape (n, 2)
        Targets, compared column 0 against column 1.
    alpha : float
        Exponent applied to each benefit.

    Returns
    -------
    tuple
        ``(utility, n, tens_utility)``: the summed benefit as a Python float,
        the number of rows, and the same sum as a differentiable tensor.
    """
    n = out.shape[0]

    # Per-row coefficients, created with the dtype and device of `out` so the
    # computation also works when the model does not live on the CPU.
    trg_is_first = trg[:, 0] > trg[:, 1]
    c = torch.where(trg_is_first, out.new_tensor(0.25), out.new_tensor(0.5))
    d = torch.where(trg_is_first, out.new_tensor(1.25), out.new_tensor(0.5))

    # Pick the branch per row before raising to `alpha`, so only the selected
    # expression contributes to the value and to the gradient.
    pred_is_first = out[:, 0] > out[:, 1]
    base = torch.where(pred_is_first, d - c * out[:, 0], d + c * out[:, 1])

    tens_utility = (base ** alpha).sum()
    # The float is read from the tensor so the two values always agree; the
    # old scalar path computed `c ** out` where the tensor path used `c * out`.
    utility = tens_utility.item()
    return utility, n, tens_utility

def my_torch_loss(alpha, tau, lgrng_mult = 100):
    """Build a loss function combining squared error with a utility term.

    The returned ``utility_loss(output, target)`` computes the summed squared
    error and subtracts ``sign * lgrng_mult * (utility - tau * n)``, where
    ``utility`` and ``n`` come from ``helper_loss(output, target, alpha)`` and
    ``sign`` is ``-0.1`` when the utility exceeds ``tau * n`` and ``1``
    otherwise. Each call increments the module-level ``my_counter`` and
    prints the individual terms.
    """
    def utility_loss(output, target):
        global my_counter

        _, batch_n, util_tensor = helper_loss(output, target, alpha)
        my_counter += 1

        squared_err = torch.sum((output - target)**2)

        # Gap between the achieved utility and the tau*n threshold.
        threshold = tau*batch_n
        gap = util_tensor - threshold
        sign = -0.1 if gap > 0 else 1

        if my_counter%1 == 0:
            print("\n\n\t\tUTILITY: ", util_tensor, "\tTAU*n: ", threshold, " Regularizer loss: ", -sign*lgrng_mult*gap, "SQ loss: ",squared_err)

        return squared_err - sign*lgrng_mult*gap

    return utility_loss

In [0]:
# Loss used for training: alpha=0.5, tau=0.95, lgrng_mult left at its default of 100.
loss_fn = my_torch_loss(0.5, 0.95)

In [0]:
#WITH BATCH SIZE = dataset size.
from torch.utils import data
def _count_correct(features, one_hot_labels):
    """Return ``(correct, total)`` for the global ``model`` on one data split.

    Predictions and true classes are the arg-max over axis 1 of the model
    output and of ``one_hot_labels``; both index tensors are printed.
    """
    x_t = torch.from_numpy(features).to(device)
    y_t = torch.from_numpy(one_hot_labels).to(device)
    with torch.no_grad():
        outputs = model(x_t.float())
        _, pred = torch.max(outputs, 1)
        _, tr = torch.max(y_t, 1)
        print(pred)
        print(tr)
        correct = (pred == tr).sum().item()
    return correct, outputs.shape[0]


def train_test_classifier(x_train, y_true, x_test, y_test, batch_size=4222, num_epochs=50):
    """Train the global ``model`` and report accuracy on the train and test splits.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn`` and ``device``.

    Parameters
    ----------
    x_train, x_test : numpy arrays of features (rows are examples).
    y_true, y_test : numpy arrays of one-hot labels aligned with the features.
    batch_size : int, optional
        Mini-batch size; the default matches the previously hard-coded value.
    num_epochs : int, optional
        Number of passes over the training data; default as before.

    Returns
    -------
    tuple
        ``(score, "correct")`` where ``score`` is the NUMBER of correctly
        classified test examples (a count, not a ratio).
    """
    x = torch.from_numpy(x_train)
    y = torch.from_numpy(y_true)
    print("TRAIN ON:\t", device, '\n')

    my_dataset = data.TensorDataset(x,y) # create your datset
    my_dataloader = data.DataLoader(my_dataset, batch_size=batch_size,
                                          shuffle=True, num_workers=2) # create your dataloader
    # len(dataloader) is the true batch count, including a final partial batch.
    print("Training DATASET SIZE: ", len(my_dataset), "\tNUM OF BATCHES: ", len(my_dataloader))
    for t in range(num_epochs):
        # Decay the learning rate once per epoch. Doing it inside the batch
        # loop divided it by 10 for EVERY batch of a matching epoch.
        # NOTE(review): the condition is also true at t == 0, so the rate is
        # cut before the first step, and again on each call because the
        # optimizer is global — confirm that is intended.
        if t % 50 == 0:
            for g in optimizer.param_groups:
                g['lr'] = g['lr'] / 10

        running_loss = 0.0
        for inputs, labels in my_dataloader:
            x_cur = inputs.to(device)
            y_cur = labels.to(device)

            # Forward pass: compute predicted y by passing x to the model.
            y_pred = model(x_cur.float())
            loss = loss_fn(y_pred, y_cur.long())
            running_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print("Epoch: ", t+1, "\tLoss: ", running_loss)

    correct, total = _count_correct(x_train, y_true)
    print('Accuracy of the network on the train set: %f total: %d' % (
        correct/total, total))

    correct, total = _count_correct(x_test, y_test)
    print('\n\nAccuracy of the network on test set: %f total: %d' % (
        correct/total, total))

    score = correct

    return score, "correct" #[score, model.metrics_names]

In [165]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
def test_compas_data():
  """Load COMPAS, split it 80/20, one-hot encode the labels and train/evaluate the network.

  Returns
  -------
  tuple
      ``(y_train, score1)``: the one-hot encoded training labels and the
      value returned by ``train_test_classifier``.
  """
  # Get the data
  X, y, x_control = load_compas_data()

  # Split the data into train and test
  train_fold_size = 0.8
  x_train, y_train, _, x_test, y_test, _ = split_into_train_test(X, y, x_control, train_fold_size)

  # Fit the label mapping on the training labels only and reuse it for the
  # test labels, so both splits share one class -> index mapping.
  lb = LabelEncoder()
  y_train_idx = lb.fit_transform(y_train)
  y_test_idx = lb.transform(y_test)

  # Fix the one-hot width so both splits get the same number of columns even
  # if one of them happens to miss a class.
  num_classes = len(lb.classes_)
  y_train = np_utils.to_categorical(y_train_idx, num_classes=num_classes)
  y_test = np_utils.to_categorical(y_test_idx, num_classes=num_classes)
  print(x_train.shape, y_test.shape)
  print("classes: ", lb.classes_, " turned into ", list(range(num_classes)))
  score1 = train_test_classifier(x_train, y_train, x_test, y_test)

  # NOTE: score1 is (number of correct test predictions, "correct"), not a ratio.
  print("\n\nTEST ACCURACY:", score1)
  return y_train, score1


def main():
  """Entry point of the experiment: run test_compas_data and return its result."""
  return test_compas_data()

# NOTE: main() returns the tuple (y_train, score1), so despite its name
# `y_train` holds that whole tuple here.
y_train = main()

Looking for file 'compas-scores-two-years3.csv' in the current directory...
File found in current directory..

Number of people recidivating within two years
-1    2795
 1    2483
dtype: int64


Features we will be using for classification are: ['intercept', 'age_18', 'age_19', 'age_20', 'age_21', 'age_22', 'age_23', 'age_24', 'age_25', 'age_26', 'age_27', 'age_28', 'age_29', 'age_30', 'age_31', 'age_32', 'age_33', 'age_34', 'age_35', 'age_36', 'age_37', 'age_38', 'age_39', 'age_40', 'age_41', 'age_42', 'age_43', 'age_44', 'age_45', 'age_46', 'age_47', 'age_48', 'age_49', 'age_50', 'age_51', 'age_52', 'age_53', 'age_54', 'age_55', 'age_56', 'age_57', 'age_58', 'age_59', 'age_60', 'age_61', 'age_62', 'age_63', 'age_64', 'age_65', 'age_66', 'age_67', 'age_68', 'age_69', 'age_70', 'age_71', 'age_72', 'age_73', 'age_74', 'age_75', 'age_77', 'age_78', 'age_79', 'age_80', 'race', 'sex', 'juv_fel_count_0', 'juv_fel_count_1', 'juv_fel_count_2', 'juv_fel_count_3', 'juv_fel_count_4', 'juv_fel_co





		UTILITY:  tensor(3672.5283, grad_fn=<AddBackward0>) 	TAU*n:  4010.8999999999996  Regularizer loss:  tensor(33837.1562, grad_fn=<MulBackward0>) SQ loss:  tensor(2083.3782, grad_fn=<SumBackward0>)
Epoch:  1 	Loss:  35920.53515625


		UTILITY:  tensor(4421.4761, grad_fn=<AddBackward0>) 	TAU*n:  4010.8999999999996  Regularizer loss:  tensor(4105.7617, grad_fn=<MulBackward0>) SQ loss:  tensor(2194.2744, grad_fn=<SumBackward0>)
Epoch:  2 	Loss:  6300.0361328125


		UTILITY:  tensor(4466.0972, grad_fn=<AddBackward0>) 	TAU*n:  4010.8999999999996  Regularizer loss:  tensor(4551.9727, grad_fn=<MulBackward0>) SQ loss:  tensor(2328.4280, grad_fn=<SumBackward0>)
Epoch:  3 	Loss:  6880.400390625


		UTILITY:  tensor(4488.3491, grad_fn=<AddBackward0>) 	TAU*n:  4010.8999999999996  Regularizer loss:  tensor(4774.4922, grad_fn=<MulBackward0>) SQ loss:  tensor(2412.3767, grad_fn=<SumBackward0>)
Epoch:  4 	Loss:  7186.869140625


		UTILITY:  tensor(4497.0156, grad_fn=<AddBackward0>) 	TAU*n:  4010.8999

In [110]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
def test_compas_data():
  """Load COMPAS, split it 80/20, one-hot encode the labels and train/evaluate the network.

  Returns
  -------
  tuple
      ``(y_train, score1)``: the one-hot encoded training labels and the
      value returned by ``train_test_classifier``.
  """
  # Get the data
  X, y, x_control = load_compas_data()

  # Split the data into train and test
  train_fold_size = 0.8
  x_train, y_train, _, x_test, y_test, _ = split_into_train_test(X, y, x_control, train_fold_size)

  # Fit the label mapping on the training labels only and reuse it for the
  # test labels, so both splits share one class -> index mapping.
  lb = LabelEncoder()
  y_train_idx = lb.fit_transform(y_train)
  y_test_idx = lb.transform(y_test)

  # Fix the one-hot width so both splits get the same number of columns even
  # if one of them happens to miss a class.
  num_classes = len(lb.classes_)
  y_train = np_utils.to_categorical(y_train_idx, num_classes=num_classes)
  y_test = np_utils.to_categorical(y_test_idx, num_classes=num_classes)
  print(x_train.shape, y_test.shape)
  print("classes: ", lb.classes_, " turned into ", list(range(num_classes)))
  score1 = train_test_classifier(x_train, y_train, x_test, y_test)

  # NOTE: score1 is (number of correct test predictions, "correct"), not a ratio.
  print("\n\nTEST ACCURACY:", score1)
  return y_train, score1


def main():
  """Entry point of the experiment: run test_compas_data and return its result."""
  return test_compas_data()

# NOTE: main() returns the tuple (y_train, score1), so despite its name
# `y_train` holds that whole tuple here.
y_train = main()

Looking for file 'compas-scores-two-years3.csv' in the current directory...
File found in current directory..

Number of people recidivating within two years
-1    2795
 1    2483
dtype: int64


Features we will be using for classification are: ['intercept', 'age_18', 'age_19', 'age_20', 'age_21', 'age_22', 'age_23', 'age_24', 'age_25', 'age_26', 'age_27', 'age_28', 'age_29', 'age_30', 'age_31', 'age_32', 'age_33', 'age_34', 'age_35', 'age_36', 'age_37', 'age_38', 'age_39', 'age_40', 'age_41', 'age_42', 'age_43', 'age_44', 'age_45', 'age_46', 'age_47', 'age_48', 'age_49', 'age_50', 'age_51', 'age_52', 'age_53', 'age_54', 'age_55', 'age_56', 'age_57', 'age_58', 'age_59', 'age_60', 'age_61', 'age_62', 'age_63', 'age_64', 'age_65', 'age_66', 'age_67', 'age_68', 'age_69', 'age_70', 'age_71', 'age_72', 'age_73', 'age_74', 'age_75', 'age_77', 'age_78', 'age_79', 'age_80', 'race', 'sex', 'juv_fel_count_0', 'juv_fel_count_1', 'juv_fel_count_2', 'juv_fel_count_3', 'juv_fel_count_4', 'juv_fel_co





		UTILITY:  tensor(3760.5740, grad_fn=<SumBackward0>) 	TAU*n:  3588.7  Regularizer loss:  tensor(1718.7402, grad_fn=<MulBackward0>) SQ loss:  tensor(2129.2515, grad_fn=<SumBackward0>)
Epoch:  1 	Loss:  3847.99169921875


		UTILITY:  tensor(3201.8022, grad_fn=<SumBackward0>) 	TAU*n:  3588.7  Regularizer loss:  tensor(38689.7695, grad_fn=<MulBackward0>) SQ loss:  tensor(2200.7029, grad_fn=<SumBackward0>)
Epoch:  2 	Loss:  40890.47265625


		UTILITY:  tensor(3308.7671, grad_fn=<SumBackward0>) 	TAU*n:  3588.7  Regularizer loss:  tensor(27993.2852, grad_fn=<MulBackward0>) SQ loss:  tensor(2121.5513, grad_fn=<SumBackward0>)
Epoch:  3 	Loss:  30114.8359375


		UTILITY:  tensor(4375.9434, grad_fn=<SumBackward0>) 	TAU*n:  3588.7  Regularizer loss:  tensor(7872.4341, grad_fn=<MulBackward0>) SQ loss:  tensor(2154.5322, grad_fn=<SumBackward0>)
Epoch:  4 	Loss:  10026.966796875


		UTILITY:  tensor(4424.0371, grad_fn=<SumBackward0>) 	TAU*n:  3588.7  Regularizer loss:  tensor(8353.3711, grad_fn=<M