First we will do a linear classifier to classify whether or not someone has diabetes. We will use the estimator API.

In [None]:
# tf.estimator.LinearClassifier constructs a linear classification model.
# tf.estimator.LinearRegressor constructs a linear regression model 
# tf.estimator.DNNClassifier constructs a neural network classification model.
# tf.estimator.DNNClassifier constructs a neural network regressor

In general, to use the Estimator API we do the following:
- Define a list of feature columns
- Create the estimator Model
- Create a Data input function
- Call train, evaluate, and predict methods on the estimator object

In [1]:
import pandas as pd
diabetes = pd.read_csv('pima-indians-diabetes.csv')

In [2]:
diabetes.head()

Unnamed: 0,Number_pregnant,Glucose_concentration,Blood_pressure,Triceps,Insulin,BMI,Pedigree,Age,Class,Group
0,6,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,50,1,B
1,1,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,31,0,C
2,8,0.919598,0.52459,0.0,0.0,0.347243,0.253629,32,1,B
3,1,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,21,0,B
4,0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,33,1,C


In [3]:
diabetes.columns

Index([u'Number_pregnant', u'Glucose_concentration', u'Blood_pressure',
       u'Triceps', u'Insulin', u'BMI', u'Pedigree', u'Age', u'Class',
       u'Group'],
      dtype='object')

In [5]:
# need to normalize/scale columns because data is very spread apart
cols_to_norm = ['Number_pregnant', 'Glucose_concentration', 'Blood_pressure',
       'Triceps', 'Insulin', 'BMI', 'Pedigree']
# scale with pandas .apply function
diabetes[cols_to_norm] = diabetes[cols_to_norm].apply(lambda x: (x-x.min()) / (x.max() - x.min()))

In [6]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [7]:
# create tensorflow feature columns
# to pass into estimator API
num_preg = tf.feature_column.numeric_column('Number_pregnant')
plasm_gluc = tf.feature_column.numeric_column('Glucose_concentration')
dias_press = tf.feature_column.numeric_column('Blood_pressure')
tricep = tf.feature_column.numeric_column('Triceps')
insulin = tf.feature_column.numeric_column('Insulin')
bmi = tf.feature_column.numeric_column('BMI')
diabetes_pedigree = tf.feature_column.numeric_column('Pedigree')
age = tf.feature_column.numeric_column('Age')

In [8]:
# one way of dealing with categorical columns
# must pass in list of possible values
# works better when there are only a few categorical labels
assigned_group = tf.feature_column.categorical_column_with_vocabulary_list('Group',['A','B','C','D'])

In [9]:
# let's say there are a lot of categorical possibilities
# we can use hash_bucket
# hash_bucket_size represents the most possibilities we estimate there are
#assigned_group = tf.feature_column.categorical_column_with_hash_bucket('Group'hash_bucket_size=10)

# now we will convert a continuous value to a categorical value
# in this case it is age

age_bucket = tf.feature_column.bucketized_column(age,boundaries=[20,30,40,50,60,70,80])

# now create a list of the tf feature columns
feat_cols = [num_preg,plasm_gluc,dias_press,tricep,insulin,diabetes_pedigree,age_bucket]

In [10]:
# now create the test train split
x_data = diabetes.drop('Class',axis=1)
labels = diabetes['Class']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x_data,labels,test_size=0.3,random_state=101)

In [11]:
# now create an input function
# using an estimator
# using a pandas input function
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,batch_size=20,num_epochs=1000,shuffle=True)

In [12]:
# use a linear classifier function
# essentially the same thing as
# multiple linear regression
# note that we must pass in a list of the feature cols
model = tf.estimator.LinearClassifier(feature_columns=feat_cols,n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a2ba13e90>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_device_fn': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/var/folders/g7/8vxnts7s1j7fxnn7zwtb830w0000gn/T/tmp7dyda4', '_train_distribute': None, '_save_summary_steps': 100}


In [13]:
# train the model
model.train(input_fn=input_func,steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /var/folders/g7/8vxnts7s1j7fxnn7zwtb830w0000gn/T/tmp7dyda4/model.ckpt.
INFO:tensorflow:loss = 13.862944, step = 1
INFO:tensorflow:global_step/sec: 137.524
INFO:tensorflow:loss = 11.839209, step = 101 (0.729 sec)
INFO:tensorflow:global_step/sec: 230.218
INFO:tensorflow:loss = 9.041603, step = 201 (0.435 sec)
INFO:tensorflow:global_step/sec: 202.066
INFO:tensorflow:loss = 8.777344, step = 301 (0.498 sec)
INFO:tensorflow:global_step/sec: 186.178
INFO:tensorflow:loss = 8.714936, step = 401 (0.534 sec)
INFO:tensorflow:global_step/sec: 183.82
INFO:tensorflow:loss = 10.74392, step = 501 (0.545 sec)
INFO:tensorflow:global_step/sec: 167.043
INFO:tensorflow:loss = 9.83637, step = 601 (0.605 sec)
INFO:tensorflow:global

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1a2ba35350>

In [14]:
# pass in evaluation function
eval_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test,y=y_test,batch_size=10,num_epochs=1,
                                                     shuffle=False)

In [15]:
results = model.evaluate(eval_input_func)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-11-12-02:29:06
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/g7/8vxnts7s1j7fxnn7zwtb830w0000gn/T/tmp7dyda4/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-11-12-02:29:07
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.7316017, accuracy_baseline = 0.64935064, auc = 0.80296296, auc_precision_recall = 0.66587216, average_loss = 0.52137256, global_step = 1000, label/mean = 0.35064936, loss = 5.018211, precision = 0.63013697, prediction/mean = 0.37936035, recall = 0.56790125
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: /var/folders/g7/8vxnts7s1j7fxnn7zwtb830w0000gn/T/tmp7dyda4/model.ckpt-1000


In [16]:
results

{'accuracy': 0.7316017,
 'accuracy_baseline': 0.64935064,
 'auc': 0.80296296,
 'auc_precision_recall': 0.66587216,
 'average_loss': 0.52137256,
 'global_step': 1000,
 'label/mean': 0.35064936,
 'loss': 5.018211,
 'precision': 0.63013697,
 'prediction/mean': 0.37936035,
 'recall': 0.56790125}

In [17]:
# make predictions
# don't pass in a y
pred_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test,
                                                     batch_size=10,
                                                     num_epochs=1,
                                                     shuffle=False)

In [18]:
predictions = model.predict(pred_input_func)

In [19]:
my_pred = list(predictions)
my_pred

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/g7/8vxnts7s1j7fxnn7zwtb830w0000gn/T/tmp7dyda4/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[{'class_ids': array([1]),
  'classes': array(['1'], dtype=object),
  'logistic': array([0.5118053], dtype=float32),
  'logits': array([0.04723012], dtype=float32),
  'probabilities': array([0.4881946, 0.5118053], dtype=float32)},
 {'class_ids': array([1]),
  'classes': array(['1'], dtype=object),
  'logistic': array([0.69231635], dtype=float32),
  'logits': array([0.8109708], dtype=float32),
  'probabilities': array([0.30768368, 0.69231635], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array(['0'], dtype=object),
  'logistic': array([0.41585582], dtype=float32),
  'logits': array([-0.33980918], dtype=float32),
  'probabilities': array([0.5841442 , 0.41585585], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array(['0'], dtype=object),
  'logistic': array([0.30436346], dtype=float32),
  'logits': array([-0.8266047], dtype=float32),
  'probabilities': array([0.6956365 , 0.30436346], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array(['0'], dtype=object),

In [21]:
# now do a deep neural network
# hidden_units provides a list with how many neurons are
# in the layer
# hidden_units = [10,10,10] means that we have three 10-neuron layers
dnn_model = tf.estimator.DNNClassifier(hidden_units=[10,10,10],
                                      feature_columns=feat_cols,
                                      n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a2dd5f290>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_device_fn': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/var/folders/g7/8vxnts7s1j7fxnn7zwtb830w0000gn/T/tmpW9EMEV', '_train_distribute': None, '_save_summary_steps': 100}


In [23]:
# need to embed categorical columns, specifically group column
embedding_group_col = tf.feature_column.embedding_column(assigned_group,
                                                        dimension=4)

In [24]:
# redefine feat_cols
feat_cols = [num_preg,plasm_gluc,dias_press,tricep,insulin,
             diabetes_pedigree,embedding_group_col,age_bucket]

In [25]:
# must create our input function
input_func = tf.estimator.inputs.pandas_input_fn(X_train,y_train,
                                                batch_size=10,
                                                num_epochs=1000,
                                                shuffle=True)

In [26]:
# note that default activation function is 'relu'
dnn_model = tf.estimator.DNNClassifier(hidden_units=[10,10,10],
                                      feature_columns=feat_cols,
                                      n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a2deecf50>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_device_fn': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/var/folders/g7/8vxnts7s1j7fxnn7zwtb830w0000gn/T/tmprcKUDx', '_train_distribute': None, '_save_summary_steps': 100}


In [29]:
# train DNN model first
dnn_model.train(input_fn=input_func,steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /var/folders/g7/8vxnts7s1j7fxnn7zwtb830w0000gn/T/tmprcKUDx/model.ckpt.
INFO:tensorflow:loss = 7.307909, step = 1
INFO:tensorflow:global_step/sec: 172.782
INFO:tensorflow:loss = 5.037978, step = 101 (0.585 sec)
INFO:tensorflow:global_step/sec: 224.038
INFO:tensorflow:loss = 4.754703, step = 201 (0.446 sec)
INFO:tensorflow:global_step/sec: 178.176
INFO:tensorflow:loss = 7.062928, step = 301 (0.559 sec)
INFO:tensorflow:global_step/sec: 174.673
INFO:tensorflow:loss = 4.528586, step = 401 (0.582 sec)
INFO:tensorflow:global_step/sec: 165.524
INFO:tensorflow:loss = 6.1241517, step = 501 (0.594 sec)
INFO:tensorflow:global_step/sec: 180.679
INFO:tensorflow:loss = 4.473522, step = 601 (0.560 sec)
INFO:tensorflow:globa

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1a2dd5f390>

In [30]:
eval_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test,y=y_test,
                                                     batch_size=10,
                                                     num_epochs=1,
                                                     shuffle=False)

In [31]:
dnn_model.evaluate(eval_input_func)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-11-12-03:00:05
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/g7/8vxnts7s1j7fxnn7zwtb830w0000gn/T/tmprcKUDx/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-11-12-03:00:06
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.74891776, accuracy_baseline = 0.64935064, auc = 0.8271605, auc_precision_recall = 0.69825613, average_loss = 0.48577192, global_step = 1000, label/mean = 0.35064936, loss = 4.6755548, precision = 0.6666667, prediction/mean = 0.3344637, recall = 0.56790125
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: /var/folders/g7/8vxnts7s1j7fxnn7zwtb830w0000gn/T/tmprcKUDx/model.ckpt-1000


{'accuracy': 0.74891776,
 'accuracy_baseline': 0.64935064,
 'auc': 0.8271605,
 'auc_precision_recall': 0.69825613,
 'average_loss': 0.48577192,
 'global_step': 1000,
 'label/mean': 0.35064936,
 'loss': 4.6755548,
 'precision': 0.6666667,
 'prediction/mean': 0.3344637,
 'recall': 0.56790125}