In [0]:
'''
*    Author: Olufemi Onimole
*    Date: 2019
*    Code version: 0.1

Dataset Source:
Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

@misc{Dua:2019 ,
author = "Dua, Dheeru and Graff, Casey",
year = "2017",
title = "{UCI} Machine Learning Repository",
url = "http://archive.ics.uci.edu/ml",
institution = "University of California, Irvine, School of Information and Computer Sciences" }
'''

!pip install --upgrade tensorflow
from __future__ import absolute_import, division, print_function, unicode_literals
import functools

import pandas as pd
import numpy as np
import tensorflow as tf

csv file path

In [0]:
train_file_path = "/content/drive/My Drive/Colab Notebooks/Projects/Census1994/adult.csv"
test_file_path = "/content/drive/My Drive/Colab Notebooks/Projects/Census1994/adult_test.csv"

read csv as dataframe

In [0]:
df = pd.read_csv(train_file_path)
df_test = pd.read_csv(test_file_path)

examine dataframe head

In [0]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


examine dataframe datatypes

In [0]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object

remove unneeded features

In [0]:
df.pop('fnlwgt')
df_test.pop('fnlwgt')

0        226802
1         89814
2        336951
3        160323
4        103497
          ...  
16276    215419
16277    321403
16278    374983
16279     83891
16280    182148
Name: fnlwgt, Length: 16281, dtype: int64

examine dataframe

In [0]:
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


convert categories to numerical values

In [0]:
columns = ["workclass","education","marital-status","occupation","relationship","race","sex","capital-loss","native-country", "income"
]

for column in columns:
  df[column] = pd.Categorical(df[column])
  df[column] = df[column].cat.codes
  df_test[column] = pd.Categorical(df_test[column])
  df_test[column] = df_test[column].cat.codes

df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,9,13,2,10,5,2,0,0,0,40,5,0


shuffle data

In [0]:
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,24,4,11,9,2,3,0,4,1,0,0,45,39,0
1,36,4,15,10,2,3,0,4,1,0,0,40,0,1
2,33,4,11,9,0,12,1,4,1,0,0,40,39,0
3,23,4,11,9,4,7,3,4,1,0,0,40,39,0
4,21,4,11,9,2,1,0,4,1,0,0,40,39,0


balance data

In [0]:
target_counts = df['income'].value_counts()
print(target_counts)
df = df.groupby('income').head(target_counts.min())
balanced_target_counts = df['income'].value_counts()
print(balanced_target_counts)


0    24720
1     7841
Name: income, dtype: int64
1    7841
0    7841
Name: income, dtype: int64


split training and validation

In [0]:
dflen = len(df.index)
split_amount = int(dflen * .8)
df_train = df[:split_amount]
df_val = df[split_amount:]

separate targets from data

In [0]:
target_train = df_train.pop('income')
target_val = df_val.pop('income')
target_test = df_test.pop('income')

convert dataframe to dataset

In [0]:
train_dataset = tf.data.Dataset.from_tensor_slices((df_train.values, target_train.values))
print(train_dataset)
val_dataset = tf.data.Dataset.from_tensor_slices((df_val.values, target_val.values))
print(val_dataset)
test_dataset = tf.data.Dataset.from_tensor_slices((df_test.values, target_test.values))
print(test_dataset)

<TensorSliceDataset shapes: ((13,), ()), types: (tf.int64, tf.int8)>
<TensorSliceDataset shapes: ((13,), ()), types: (tf.int64, tf.int8)>
<TensorSliceDataset shapes: ((13,), ()), types: (tf.int64, tf.int8)>


examine dataset

In [0]:
for feat, targ in train_dataset.take(5):
  print ('Features: {}, Target: {}'.format(feat, targ))

Features: [24  4 11  9  2  3  0  4  1  0  0 45 39], Target: 0
Features: [36  4 15 10  2  3  0  4  1  0  0 40  0], Target: 1
Features: [33  4 11  9  0 12  1  4  1  0  0 40 39], Target: 0
Features: [23  4 11  9  4  7  3  4  1  0  0 40 39], Target: 0
Features: [21  4 11  9  2  1  0  4  1  0  0 40 39], Target: 0


shuffle and batch the dataset

In [0]:
train_dataset = train_dataset.shuffle(len(df)).batch(16)
val_dataset = val_dataset.batch(16)
test_dataset = test_dataset.batch(16)

create and train a model

In [0]:
def get_compiled_model():
  model = tf.keras.Sequential([
                               tf.keras.layers.Dense(50, activation='relu'),
                               tf.keras.layers.Dense(50, activation='relu'),
                               tf.keras.layers.Dense(100, activation='relu'),
                               tf.keras.layers.Dense(100, activation='relu'),
                               tf.keras.layers.Dense(200, activation='relu'),
                               tf.keras.layers.Dense(200, activation='relu'),
                               tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
  return model

In [0]:
model = get_compiled_model()
model.fit(train_dataset, validation_data=val_dataset, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f00c9e962b0>

In [0]:
model.evaluate(test_dataset, verbose=2)

1018/1018 - 1s - loss: 0.3621 - accuracy: 0.8221


[0.3620566149518509, 0.82212394]