In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import random
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import pickle
from torch.utils.data.sampler import (SubsetRandomSampler,
                                      RandomSampler)
from torchvision import datasets, transforms
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Read Data

### COMPAS DATASET

In [3]:
compas = pd.read_csv("COMPAS_ProPublica/propublicaCompassRecividism_data_fairml.csv/propublica_data_for_fairml.csv")

### ADULT DATASET

In [4]:
adult = pd.read_csv("UCIAdult/adult.csv",
                   dtype={0:int, 1:str, 2:int, 3:str, 4:int, 5: str, 6:str , 7:str ,8:str ,9: str, 10:int, 11:int, 12:int, 13:str,14: str})

# Keep only United States samples, due to class imbalance
adult = adult[adult["native-country"] == 'United-States']

# Then drop Column
adult = adult.drop(['native-country'], axis=1)

# get rid of NAN vales
full_data = adult

str_list=[]

for data in [full_data]:
    for colname, colvalue in data.items(): 
        if type(colvalue[1]) == str:
            str_list.append(colname) 
num_list = data.columns.difference(str_list)

full_size = full_data.shape[0]
print('Dataset size Before pruning: ', full_size)
for data in [full_data]:
    for i in full_data:
        data[i].replace('nan', np.nan, inplace=True)
    data.dropna(inplace=True)
real_size = full_data.shape[0]
print('Dataset size after pruning: ', real_size)
print('We eliminated ', (full_size-real_size), ' datapoints')

### make prediction labels ### 
full_labels = full_data['income'].copy()
print(full_labels.shape[0])

full_data = full_data.drop(['income'], axis=1)
print(full_data.shape[0])

# Label Encode Labels
label_encoder = LabelEncoder()
full_labels = label_encoder.fit_transform(full_labels)


### Deal with categorical data ###

cat_data = full_data.select_dtypes(include=['object']).copy()
other_data = full_data.select_dtypes(include=['int']).copy()

newcat_data = pd.get_dummies(cat_data, columns = ['workclass', 'education',
       'marital-status', 'occupation', 'relationship', 'race'], dtype=int)

full_data = pd.concat([other_data, newcat_data], axis=1)
adult = full_data
adult_labels = full_labels

# one_hot = F.one_hot(torch.Tensor(adult_labels).to(torch.int).long(), 2)
# adult_labels = one_hot
# adult_labels = adult_labels.numpy()

adult = adult.drop(['gender'], axis=1)

Dataset size Before pruning:  43832
Dataset size after pruning:  43832
We eliminated  0  datapoints
43832
43832


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[i].replace('nan', np.nan, inplace=True)


### DataLoaders

In [5]:
from torch.utils.data import Dataset

In [6]:
class MyDataset(Dataset):    
    def __init__(self, dataframe, targets, transform=None):
        self.data_frame = dataframe
        self.targets = targets
        self.transform = transform

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        m_data = self.data_frame.iloc[idx, :].values
        m_target = self.targets[idx]
        m_data = m_data.astype('float')
        m_target = m_target.astype('float')
        sample = {'target': np.array([m_target]), 'data': np.array(m_data)}

        if self.transform:
            sample = self.transform(sample)

        return sample

In [7]:
adultDataset = MyDataset(adult, adult_labels)

In [8]:
indices = list(range(len(adultDataset)))
random.shuffle(indices)

bs = 1028
batch_size_eval = 128

# Split dataset into train and Test sets
adult_train_loader = DataLoader(
    adultDataset,
    batch_size=bs,
    sampler=SubsetRandomSampler(indices[:30001]),
    num_workers=1,
)

adult_valid_loader = DataLoader(
    adultDataset,
    batch_size=batch_size_eval,
    sampler=SubsetRandomSampler(indices[30001:38001]),
    num_workers=1,
)

adult_test_loader = DataLoader(
    adultDataset,
    batch_size=batch_size_eval,
    sampler=SubsetRandomSampler(indices[38001:]),
    num_workers=1,
)

## Model

In [9]:
class Wide(nn.Module):
    def __init__(self, num_features_in, num_classes):
        super().__init__()
        self.hidden = nn.Linear(num_features_in, 180)
        self.relu = nn.ReLU()
        self.hidden1 = nn.Linear(180, 90)
        self.output = nn.Linear(90, num_classes)
        self.dropout = nn.Dropout(0.25)
 
    def forward(self, x):
        x = self.relu(self.hidden(x))
        x = self.relu(self.hidden1(x))
        x = self.dropout(x)
        x = self.output(x)
        return x

In [10]:
class Deep(nn.Module):
    def __init__(self,num_features_in, num_classes):
        super().__init__()
        self.layer1 = nn.Linear(num_features_in, 60)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(60, 60)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(60, 60)
        self.act3 = nn.ReLU()
        self.output = nn.Linear(60, num_classes)
        self.sigmoid = nn.Sigmoid()
 
    def forward(self, x):
        x = self.act1(self.layer1(x))
        x = self.act2(self.layer2(x))
        x = self.act3(self.layer3(x))
        x = self.sigmoid(self.output(x))
        return x

## ENGINE

In [11]:
def train(model, train_loader, optimizer, epoch):
    model.train()
    
    for data in train_loader:
        inputs = data["data"]
        target = data["target"]
        inputs, target = inputs.to(torch.float32), target.to(torch.float32)
        target = target.type(torch.LongTensor)
        inputs, target = inputs.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(inputs)

        target = torch.squeeze(target, dim=1)

        loss = loss_fn(output, target)

        # Backprop
        loss.backward()
        optimizer.step()
        ###

def test(model, test_loader):
    model.eval()
    
    test_loss = 0
    correct = 0
    test_size = 0
    
    with torch.no_grad():
      
        for data in test_loader:
            inputs = data["data"]
            target = data["target"]
            inputs, target = inputs.to(torch.float32), target.to(torch.float32)
            target = target.type(torch.LongTensor)
            inputs, target = inputs.to(device), target.to(device)
            
            output = model(inputs)
            test_size += len(inputs)

            target = torch.squeeze(target, dim=1)
            
            test_loss += test_loss_fn(output, target).item() 
            # indicies of max pred
            pred = output.max(1, keepdim=True)[1]
            
            correct += pred.eq(target.view_as(pred)).sum().item()
    
    test_loss /= test_size
    accuracy = correct / test_size
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, test_size,
        100. * accuracy))
    
    return test_loss, accuracy

## Vanilla Accuracy Run Code

In [12]:
num_features = 64
model = Wide(num_features, 2).to(device)
test_accuracy = []
train_loss = []
nbr_epochs = 50
lr = 0.0001
weight_decay = 0



# Surrogate loss used for training
loss_fn = nn.CrossEntropyLoss()
test_loss_fn = nn.CrossEntropyLoss(reduction='sum')

optimizer = optim.Adam(model.parameters(), lr=lr,weight_decay=weight_decay)
#optimizer = optim.SGD(model.parameters(), lr=lr ,weight_decay=weight_decay)
#optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)


print('Training beginning...')
start_time = time.time()

for epoch in range(1, nbr_epochs+1):
    print('Epoch ', epoch, ':')
    train(model, adult_train_loader, optimizer, epoch)
    loss, acc = test(model, adult_valid_loader)
    
    # save results every epoch
    test_accuracy.append(acc)
    train_loss.append(loss)
    
end_time = time.time()
print('Training on ' + str(nbr_epochs) + ' epochs done in ', str(end_time-start_time),' seconds')

test_loss, test_acc = test(model, adult_test_loader)
print(test_loss, test_acc)

Training beginning...
Epoch  1 :

Test set: Average loss: 287.2973, Accuracy: 6048/8000 (76%)

Epoch  2 :

Test set: Average loss: 147.8070, Accuracy: 6085/8000 (76%)

Epoch  3 :

Test set: Average loss: 95.1159, Accuracy: 6126/8000 (77%)

Epoch  4 :

Test set: Average loss: 54.0652, Accuracy: 6238/8000 (78%)

Epoch  5 :

Test set: Average loss: 23.9191, Accuracy: 6302/8000 (79%)

Epoch  6 :

Test set: Average loss: 4.5424, Accuracy: 6269/8000 (78%)

Epoch  7 :

Test set: Average loss: 0.7455, Accuracy: 6272/8000 (78%)

Epoch  8 :

Test set: Average loss: 0.6880, Accuracy: 6326/8000 (79%)

Epoch  9 :

Test set: Average loss: 0.6493, Accuracy: 6333/8000 (79%)

Epoch  10 :

Test set: Average loss: 0.6296, Accuracy: 6312/8000 (79%)

Epoch  11 :

Test set: Average loss: 0.6207, Accuracy: 6327/8000 (79%)

Epoch  12 :

Test set: Average loss: 0.6226, Accuracy: 6300/8000 (79%)

Epoch  13 :

Test set: Average loss: 0.6009, Accuracy: 6313/8000 (79%)

Epoch  14 :

Test set: Average loss: 0.6018,

In [15]:
torch.save({'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()}, "modelDicts/vanilla.pt")

## Demographic Parity

In [14]:
## define the combination of accuracy and demographic parity regularization
def demoParityLoss(outputs, targets, samples, parityCoef):
    ## accuracy
    entropyLoss = loss_fn(outputs, targets)

    loss = entropyLoss
    return loss

SyntaxError: incomplete input (1669807239.py, line 2)

In [None]:
demModel = Wide(num_features, 2).to(device)

# Surrogate loss used for training
loss_fn = nn.CrossEntropyLoss()
test_loss_fn = nn.CrossEntropyLoss(reduction='sum')

optimizer = optim.Adam(model.parameters(), lr=lr,weight_decay=weight_decay)
#optimizer = optim.SGD(model.parameters(), lr=lr ,weight_decay=weight_decay)
#optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)


print('Training beginning...')
start_time = time.time()

for epoch in range(1, nbr_epochs+1):
    print('Epoch ', epoch, ':')
    train(demModel, adult_train_loader, optimizer, epoch)
    loss, acc = test(demModel, adult_valid_loader)
    
    # save results every epoch
    test_accuracy.append(acc)
    train_loss.append(loss)
    
end_time = time.time()
print('Training on ' + str(nbr_epochs) + ' epochs done in ', str(end_time-start_time),' seconds')

test_loss, test_acc = test(model, adult_test_loader)
print(test_loss, test_acc)