# UCI Adult Two Layer NN

Here we train a simple two layer neural network to predict whether someone's income is >= or < $50K. The results are saved to a file that can be preprocessed and loaded into FairVis.

The data is from UCI - https://archive.ics.uci.edu/ml/datasets/Adult

The model is trained using PyTorch

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.utils.data

In [5]:
train_og = pd.read_csv("data/adult/adult.csv", sep=", ", engine="python")

N = train_og.shape[0]
print(N, "rows")

train_og.head()

32561 rows


Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,country,salary
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


Change categorical features into one-hot

In [6]:
cols = list(train_og.select_dtypes(include=['object']).columns)
train_oh = train_og.copy()

for c in cols:
    one_hot = pd.get_dummies(train_oh[c])

    for new_col_name in one_hot.columns:
        one_hot.rename(columns={new_col_name : c + "_" + new_col_name}, inplace=True)

    train_oh = train_oh.drop(c, axis = 1)
    train_oh = train_oh.join(one_hot)

In [7]:
train_oh.head()

Unnamed: 0,age,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,...,country_Scotland,country_South,country_Taiwan,country_Thailand,country_Trinadad&Tobago,country_United-States,country_Vietnam,country_Yugoslavia,salary_<=50K,salary_>50K
0,39,40,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
1,50,13,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
2,38,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,53,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,28,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


Normalize all features

In [8]:
train_oh = train_oh.drop('salary_>50K', axis=1)
train_oh = (train_oh-train_oh.min())/(train_oh.max()-train_oh.min())
train_oh.head()

Unnamed: 0,age,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,...,country_Puerto-Rico,country_Scotland,country_South,country_Taiwan,country_Thailand,country_Trinadad&Tobago,country_United-States,country_Vietnam,country_Yugoslavia,salary_<=50K
0,0.30137,0.397959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.452055,0.122449,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.287671,0.397959,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.493151,0.397959,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.150685,0.397959,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


PyTorch model and help function

In [9]:
def train(loader, opt, model, crit):
    t_loss = 0
    for batch in loader:
        x = batch[:, 0:-1].float()
        y = batch[:, -1].view(-1, 1).float()

        opt.zero_grad()
        out = model(x)
        loss = crit(out, y)
        t_loss += loss
        loss.backward()
        optimizer.step()
    return t_loss/N

def get_acc(model, loader, items):
    correct = 0
    for batch in loader:
        x = batch[:, 0:-1].float()
        y = batch[:, -1].view(-1, 1).int()
        out = model(x).round().int()
        correct += (y == out).sum()
    return (correct.item() * 1.0 /items)

Train the model!

In [10]:
train_t = torch.tensor(train_oh.values)
print(train_t.shape)

train_loader = torch.utils.data.DataLoader(train_t, batch_size=1024, shuffle=True)

model = torch.nn.Sequential(
          torch.nn.Linear(train_t.shape[1] - 1, 300),
          torch.nn.ReLU(),
          torch.nn.Linear(300, 100),
          torch.nn.ReLU(),
          torch.nn.Linear(100, 1),
          torch.nn.Sigmoid()
        ).to(device)

criterion = torch.nn.functional.binary_cross_entropy
optimizer = torch.optim.Adam(model.parameters())

for i in range(0, 20):
    loss = train(train_loader, optimizer, model, criterion)
    print("Loss at epoch", i+1, " is ", loss.data.item(), ". Train accuracy is ", get_acc(model, train_loader, N))

torch.Size([32561, 105])
Loss at epoch 1  is  0.0005221187020651996 . Train accuracy is  0.7591904425539756
Loss at epoch 2  is  0.00038147508166730404 . Train accuracy is  0.8322840207610331
Loss at epoch 3  is  0.0003544554638210684 . Train accuracy is  0.8353858910967108
Loss at epoch 4  is  0.00034626200795173645 . Train accuracy is  0.8375664138079297
Loss at epoch 5  is  0.0003418916603550315 . Train accuracy is  0.8391634163569915
Loss at epoch 6  is  0.00033904609153978527 . Train accuracy is  0.8414667854181382
Loss at epoch 7  is  0.00033658603206276894 . Train accuracy is  0.8415589201805841
Loss at epoch 8  is  0.0003335931396577507 . Train accuracy is  0.8445993673412979
Loss at epoch 9  is  0.0003319533134344965 . Train accuracy is  0.8443536746414422
Loss at epoch 10  is  0.00033036552486009896 . Train accuracy is  0.8427259605048985
Loss at epoch 11  is  0.00032810153788886964 . Train accuracy is  0.8449371948035994
Loss at epoch 12  is  0.0003270760935265571 . Train ac

Save the data with output labels

In [12]:
train_loader = torch.utils.data.DataLoader(train_t, batch_size=N, shuffle=False)
for batch in train_loader:
    x = batch[:, 0:-1].float()
    y = batch[:, -1].view(-1, 1).float()
    train_og["class"] = train_oh["salary_<=50K"]
    train_og["out"] = model(x).data.numpy()
    train_oh["out"] = model(x).data.numpy()

train_og.drop("salary", axis=1)
train_og.to_csv("processed/adult_out.csv", index=False)