In [1]:
import os
import torch
import random
import sklearn
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

In [2]:
'''
Initialize the parameters
'''
DATASET = 'income' # {"income", "crim-recid", "health"}
MODEL = 'main'
DATA_LOC = './../data/'

In [3]:
class IncomeDataset(Dataset):
    """Income dataset."""

    def __init__(self, file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.root_dir = root_dir
        self.all_data = np.load(self.root_dir+file)
        self.x = self.all_data['x']
        self.y = self.all_data['y']
        self.a = self.all_data['a']
        self.transform = transform
        
        # Complete all the dataset specific processing here
        print('Income dataset (x) dims: {}'.format(self.x.shape))
        print('Income dataset (y) dims: {}'.format(self.y.shape))
        print('Income dataset (a) dims: {}'.format(self.a.shape))

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, idx):
        #if torch.is_tensor(idx):
        #    idx = idx.tolist()
        
        sample_x, sample_y, sample_a = np.array(self.x[idx]), np.array(self.y[idx]), np.array(self.a[idx])
        sample_x, sample_y, sample_a = torch.tensor(sample_x, dtype=torch.float32), torch.tensor(sample_y, dtype=torch.long), torch.tensor(sample_a, dtype=torch.float32)

        #print('sample_x.shape: {}'.format(sample_x.shape))
        #print('sample_y.shape: {}'.format(sample_y.shape))
        #print('sample_a.shape: {}'.format(sample_a.shape))
        
        return sample_x, sample_y, sample_a

In [4]:
'''
Define the model
'''
class NNet(torch.nn.Module):
    def __init__(self):
        """
        Explicit layer definition
        """
        super(NNet, self).__init__()
        self.fc1 = nn.Linear(113, 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 25)
        self.fc4 = nn.Linear(25, 2)

    def forward(self, x):
        """
        Explicit model definition
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        
        return x

# Construct our model by instantiating the class defined above
model = NNet()

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [5]:
'''
Load the datasets
'''
if (DATASET=='income'):
    dataset = IncomeDataset(file='adult_train.npz',
                                    root_dir=DATA_LOC)
    dataloader = DataLoader(dataset, batch_size=256,
                        shuffle=True, num_workers=10)    
elif (DATASET=='health'):
    dataset = HealthDataset(file='processed.switzerland.data',
                                    root_dir=DATA_LOC)
    dataloader = DataLoader(dataset, batch_size=256,
                        shuffle=True, num_workers=10)

Income dataset (x) dims: (32561, 113)
Income dataset (y) dims: (32561, 1)
Income dataset (a) dims: (32561, 1)


In [6]:
model

NNet(
  (fc1): Linear(in_features=113, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=25, bias=True)
  (fc4): Linear(in_features=25, out_features=2, bias=True)
)

In [7]:
'''
Train the model
'''

epochs = 100
total = 0
correct = 0
running_loss = 0
    
for t in range(epochs):
    for i_batch, sample_batched in enumerate(dataloader):
        x, y, a = sample_batched[0], sample_batched[1].squeeze(), sample_batched[2]
        
        # Forward pass: Compute predicted y by passing x to the model
        y_pred = model(x)
        
        # Compute and print loss
        loss = criterion(y_pred, y)
        
        _, predicted = torch.max(y_pred.data, 1)
        total += y.size(0)
        correct += (predicted == y).sum().item()
        running_loss += loss
        
        # Zero gradients, perform a backward pass, and update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if (t%10==0):
        print('epoch: {}, loss: {:.5f}, accuracy: {:.2f}%'.format(t, running_loss/total, 100*correct/total))
        running_loss = 0
        total = 0
        correct = 0

epoch: 0, loss: 0.00233, accuracy: 77.63%
epoch: 10, loss: 0.00160, accuracy: 81.49%
epoch: 20, loss: 0.00129, accuracy: 84.94%
epoch: 30, loss: 0.00126, accuracy: 85.25%
epoch: 40, loss: 0.00125, accuracy: 85.38%
epoch: 50, loss: 0.00124, accuracy: 85.43%
epoch: 60, loss: 0.00124, accuracy: 85.50%
epoch: 70, loss: 0.00123, accuracy: 85.53%
epoch: 80, loss: 0.00123, accuracy: 85.52%
epoch: 90, loss: 0.00122, accuracy: 85.64%


In [8]:
'''
Train the model (Regularize with different fairness metrics)
'''


'\nTrain the model (Regularize with different fairness metrics)\n'