# Tensorflow: California Census Data 'Income' Classification 

In [27]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

%matplotlib inline

In [2]:
census = pd.read_csv('data/census_data.csv')
census.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
census['income_bracket'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [4]:
def label_preprocess(label):
    if label == ' <=50K':
        return 0
    else:
        return 1

In [5]:
census['income_bracket'] = census['income_bracket'].apply(label_preprocess)

In [6]:
census.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [8]:
X_data = census.drop('income_bracket', axis=1)
y_labels = census['income_bracket']

X_train, X_test, y_train, y_test = train_test_split(X_data, y_labels, test_size=0.3, random_state=101)

print('Shape of X_train: ', X_train.shape)
print('Shape of y_train: ', y_train.shape)
print('Shape of X_test: ', X_test.shape)
print('Shape or y_test: ', y_test.shape)

Shape of X_train:  (22792, 13)
Shape of y_train:  (22792,)
Shape of X_test:  (9769, 13)
Shape or y_test:  (9769,)


In [9]:
census.columns

Index(['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'gender', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country', 'income_bracket'],
      dtype='object')

In [11]:
gender = tf.feature_column.categorical_column_with_vocabulary_list("gender", ["Female", "Male"])
occupation = tf.feature_column.categorical_column_with_hash_bucket("occupation", hash_bucket_size=1000)
marital_status = tf.feature_column.categorical_column_with_hash_bucket("marital_status", hash_bucket_size=1000)
relationship = tf.feature_column.categorical_column_with_hash_bucket("relationship", hash_bucket_size=1000)
education = tf.feature_column.categorical_column_with_hash_bucket("education", hash_bucket_size=1000)
workclass = tf.feature_column.categorical_column_with_hash_bucket("workclass", hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket("native_country", hash_bucket_size=1000)

In [13]:
age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education_num")
capital_gain = tf.feature_column.numeric_column("capital_gain")
capital_loss= tf.feature_column.numeric_column("capital_loss")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

In [16]:
feature_columns = [gender, occupation, marital_status, relationship, education, workclass, native_country, age, education_num,
capital_gain, capital_loss, hours_per_week]

In [17]:
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train, batch_size=100, num_epochs=None, shuffle=True)

In [18]:
model = tf.estimator.LinearClassifier(feature_columns=feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp7oemlzrg', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [20]:
model.train(input_fn=input_func, steps=5000)


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/tmp7oemlzrg/model.ckpt-5000
INFO:tensorflow:Saving checkpoints for 5001 into /tmp/tmp7oemlzrg/model.ckpt.
INFO:tensorflow:loss = 54.124603, step = 5001
INFO:tensorflow:global_step/sec: 375.655
INFO:tensorflow:loss = 60.371967, step = 5101 (0.268 sec)
INFO:tensorflow:global_step/sec: 345.959
INFO:tensorflow:loss = 43.085876, step = 5201 (0.289 sec)
INFO:tensorflow:global_step/sec: 373.113
INFO:tensorflow:loss = 32.791237, step = 5301 (0.269 sec)
INFO:tensorflow:global_step/sec: 296.583
INFO:tensorflow:loss = 43.14972, step = 5401 (0.335 sec)
INFO:tensorflow:global_step/sec: 298.128
INFO:tensorflow:loss = 40.020203, step = 5501 (0.340 sec)
INFO:tensorflow:global_step/sec: 320.93
INFO:tensorflow:loss = 63.66205, step = 5601 (0.309 sec)
INFO:tensorflow:global_step/sec: 320.181
INFO:tensorflow:loss = 43.798668, step = 5701 (0.312 sec)
INFO:tensorflow:global_step/sec: 294.544
INFO:tensorflow:loss = 40.

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x7ff961fd49e8>

In [21]:
pred_func = tf.estimator.inputs.pandas_input_fn(x=X_test, batch_size=len(X_test), shuffle=False)

In [22]:
predictions = list(model.predict(input_fn=pred_func))

INFO:tensorflow:Restoring parameters from /tmp/tmp7oemlzrg/model.ckpt-10000


In [23]:
predictions[0]

{'class_ids': array([0]),
 'classes': array([b'0'], dtype=object),
 'logistic': array([0.3001432], dtype=float32),
 'logits': array([-0.8466159], dtype=float32),
 'probabilities': array([0.69985676, 0.3001432 ], dtype=float32)}

In [26]:
results = []
for pred in predictions:
    results.append(pred['class_ids'][0])
results[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [28]:
print(classification_report(y_test, results))

             precision    recall  f1-score   support

          0       0.87      0.93      0.90      7436
          1       0.71      0.55      0.62      2333

avg / total       0.83      0.84      0.83      9769

