In [1]:
import tensorflow as tf
import tensorflow.keras as keras

In [2]:
census_dir = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/'
train_path = keras.utils.get_file('adult.data', census_dir + 'adult.data')
test_path = keras.utils.get_file('adult.test', census_dir + 'adult.test')

In [3]:
import pandas as pd

columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race',
           'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_bracket']

train_data = pd.read_csv(train_path, header=None, names=columns)
test_data = pd.read_csv(test_path, header=None, names=columns, skiprows=1)

In [4]:
predictors = ['age', 'workclass', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'gender']

y_train = (train_data.income_bracket==' >50K').astype(int)
y_test = (test_data.income_bracket==' >50K.').astype(int)

In [5]:
train_data = train_data[predictors]
test_data = test_data[predictors]

In [6]:
train_data[['age', 'education_num']] = train_data[['age', 'education_num']].fillna(train_data[['age', 'education_num']].mean())
test_data[['age', 'education_num']] = test_data[['age', 'education_num']].fillna(train_data[['age', 'education_num']].mean())

In [7]:
def define_feature_columns(data_df, numeric_cols, categorical_cols, categorical_embeds, dimension=30):
    numeric_columns = []
    categorical_columns = []
    embeddings = []

    for feature_name in numeric_cols:
        numeric_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

    for feature_name in categorical_cols:
        vocabolary = data_df[feature_name].unique()
        categorical_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabolary))

    for feature_name in categorical_embeds:
        vocabolary = data_df[feature_name].unique()
        to_categorical = tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabolary)

    embeddings.append(tf.feature_column.embedding_column(to_categorical, dimension=dimension))

    return numeric_columns, categorical_columns, embeddings

In [8]:
def create_interactions(interactions_list, buckets=10):
    feature_columns = []

    for (a, b) in interactions_list:
        crossed_feature = tf.feature_column.crossed_column([a, b], hash_bucket_size=buckets)
        crossed_feature_one_hot = tf.feature_column.indicator_column(crossed_feature)
        feature_columns.append(crossed_feature_one_hot)

    return feature_columns

In [9]:
numeric_columns, categorical_columns, embeddings = define_feature_columns(train_data,
                                                                          numeric_cols=['age', 'education_num'],
                                                                          categorical_cols=['gender'],
                                                                          categorical_embeds=['workclass', 'education', 'marital_status', 'occupation', 'relationship'],
                                                                          dimension=32)
interactions = create_interactions([['education', 'occupation']], buckets=10)

In [10]:
estimator = tf.estimator.DNNLinearCombinedClassifier(linear_feature_columns=numeric_columns+categorical_columns+interactions,
                                                     linear_optimizer=keras.optimizers.Ftrl(learning_rate=0.0002),
                                                     dnn_feature_columns=embeddings,
                                                     dnn_hidden_units=[1024, 256, 128, 64],
                                                     dnn_optimizer=keras.optimizers.Adam(learning_rate=0.0001))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpv0fdmd_q', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [11]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=256):
    def input_function():
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(1000)
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds
    return input_function

In [12]:
train_input_fn = make_input_fn(train_data, y_train, num_epochs=100, batch_size=256)
test_input_fn = make_input_fn(test_data, y_test, num_epochs=1, shuffle=False)

In [13]:
estimator.train(input_fn=train_input_fn, steps=1500)
result = estimator.evaluate(input_fn=test_input_fn)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpv0fdmd_q/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 0.68948835, step = 0
INFO:tensorflow:global_step/sec: 90.6044
INFO:tensorflow:loss = 0.470763

In [14]:
print(result)

{'accuracy': 0.76377374, 'accuracy_baseline': 0.76377374, 'auc': 0.8287723, 'auc_precision_recall': 0.53418684, 'average_loss': 0.4301044, 'label/mean': 0.23622628, 'loss': 0.43032768, 'precision': 0.0, 'prediction/mean': 0.23563127, 'recall': 0.0, 'global_step': 1500}


In [15]:
import numpy as np

def predict_proba(predictor):
    preds = list()
    for pred in predictor:
        preds.append(pred['probabilities'])
    return np.array(preds)

In [16]:
predictions = predict_proba(estimator.predict(input_fn=test_input_fn))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpv0fdmd_q/model.ckpt-1500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [17]:
predictions

array([[0.9846005 , 0.01539955],
       [0.5616371 , 0.43836293],
       [0.5580712 , 0.4419288 ],
       ...,
       [0.55534965, 0.44465035],
       [0.98383856, 0.01616144],
       [0.5548155 , 0.44518456]], dtype=float32)