# Classification on Structured Data

### This tutorial demonstrates how to classify structured data (e.g. tabular data in a CSV). We will use Keras to define the model, and feature columns as a bridge to map from columns in a CSV to features used to train the model. 
This tutorial contains complete code to:
- Load a CSV file using Pandas.
- Build an input pipeline to batch and shuffle the rows using tf.data.
- Map from columns in the CSV to features used to train the model using feature columns.
- Build, train, and evaluate a model using Keras.

#### We will use a small dataset provided by the Cleveland Clinic Foundation for Heart Disease. There are several hundred rows in the CSV. Each row describes a patient, and each column describes an attribute. We will use this information to predict whether a patient has heart disease, which in this dataset is a binary classification task.

In [1]:
from __future__ import absolute_import, division, print_function
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [2]:
# Load the Cleveland Clinic heart-disease CSV (hosted on Google Cloud Storage)
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
df = pd.read_csv(URL)
# Preview the first rows to sanity-check the columns
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


### Prepare the data. Split the dataframe into train, validation and test data sets

In [3]:
# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
# A fixed random_state makes the split identical on every "Restart & Run All";
# without it each run trains and evaluates on different rows.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)
print(len(train), 'train samples')
print(len(val), 'validation samples')
print(len(test), 'test samples')

193 train samples
49 validation samples
61 test samples


### Create an Input Pipeline Using tf.data

#### We will wrap the dataframes with tf.data. This will enable us to use feature columns as a bridge to map from the columns in the Pandas dataframe to features used to train the model. If we were working with a very large CSV file (so large that it does not fit into memory), we would use tf.data to read it from disk directly.


In [4]:
# A utility function to create a tf.data Dataset from a Pandas DataFrame
def df_to_dataset(df, shuffle=True, batch_size=32, label_col='target'):
    """Wrap a DataFrame in a batched ``tf.data.Dataset`` of (features, labels).

    Args:
        df: DataFrame holding the feature columns plus the label column.
            It is copied first, so the caller's frame is left untouched.
        shuffle: When True, shuffle the rows using a buffer as large as the frame.
        batch_size: Number of rows per batch.
        label_col: Name of the column to split off as the labels
            (defaults to 'target', the label column used in this notebook).

    Returns:
        A dataset yielding ``(feature_dict, labels)`` pairs, where
        ``feature_dict`` maps each remaining column name to a batch of values.

    Raises:
        KeyError: If ``label_col`` is not a column of ``df``.
    """
    # Work on a copy: pop() below mutates the frame it is called on.
    features = df.copy()
    labels = features.pop(label_col)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        # The buffer holds every row, so the whole dataset is shuffled.
        ds = ds.shuffle(buffer_size=len(features))
    ds = ds.batch(batch_size)
    return ds

In [5]:
# A small batch size keeps the example batches printed below short
batch_size = 5 # For demo purposes
train_ds = df_to_dataset(train, batch_size=batch_size)
# Validation and test data are not shuffled
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [6]:
# Peek at a single batch to see what the input pipeline yields:
# a dict of feature tensors keyed by column name, plus a tensor of labels
for feature_batch, label_batch in train_ds.take(1):
    print('Every feature: ', list(feature_batch))
    print('A batch of ages: ', feature_batch['age'])
    print('A batch of targets: ', label_batch)

Every feature:  ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
A batch of ages:  tf.Tensor([49 39 50 59 42], shape=(5,), dtype=int32)
A batch of targets:  tf.Tensor([1 1 0 1 0], shape=(5,), dtype=int32)


In [7]:
# We will use this batch to demonstrate several types of feature columns.
# Each dataset element is a (features, labels) pair; [0] keeps only the features dict.
example_batch = next(iter(train_ds))[0]

In [8]:
# Utility function that applies a feature column to the example batch and prints the result
def demo(feature_column):
    """Print the dense representation of ``example_batch`` under a feature column.

    Args:
        feature_column: The feature column to apply. Note that inside this
            function the parameter shadows the imported ``feature_column`` module.

    Reads the notebook-level ``example_batch``; returns nothing.
    """
    dense_layer = layers.DenseFeatures(feature_column)
    transformed = dense_layer(example_batch)
    print(transformed.numpy())

In [9]:
# Numeric Columns
# `age` is reused later as the source column for the bucketized age column
age = feature_column.numeric_column("age")
demo(age)

W0407 08:54:41.128037 13452 deprecation.py:323] From C:\Users\sriniman\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:2758: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.


[[49.]
 [39.]
 [50.]
 [59.]
 [42.]]


In [10]:
# Bucketized Columns: split the numeric age into ranges at the given boundaries
age_buckets = feature_column.bucketized_column(
    age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65],
)
demo(age_buckets)

W0407 08:54:41.164082 13452 deprecation.py:323] From C:\Users\sriniman\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:2902: to_int64 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.


[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


In [11]:
# Categorical Columns: map the `thal` strings onto a fixed vocabulary,
# then wrap the result in an indicator column so it can be fed to a dense layer
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal',
    ['fixed', 'normal', 'reversible'],
)
thal_one_hot = feature_column.indicator_column(thal)
demo(thal_one_hot)

W0407 08:54:41.192149 13452 deprecation.py:323] From C:\Users\sriniman\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:4307: IndicatorColumn._variable_shape (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
W0407 08:54:41.196133 13452 deprecation.py:323] From C:\Users\sriniman\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:4362: VocabularyListCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


In [12]:
# Embedding of categorical columns, for demonstration purposes:
# each `thal` category is represented by an 8-dimensional vector
thal_embedding = feature_column.embedding_column(thal, dimension=8)
demo(thal_embedding)

[[-0.18937786 -0.32382473  0.08509588 -0.06288299 -0.0772744  -0.60210246
   0.4659765  -0.03980123]
 [-0.18937786 -0.32382473  0.08509588 -0.06288299 -0.0772744  -0.60210246
   0.4659765  -0.03980123]
 [ 0.4419626   0.3060164   0.38913035  0.29026836 -0.03838386  0.23689228
   0.349915   -0.19267416]
 [-0.18937786 -0.32382473  0.08509588 -0.06288299 -0.0772744  -0.60210246
   0.4659765  -0.03980123]
 [ 0.4419626   0.3060164   0.38913035  0.29026836 -0.03838386  0.23689228
   0.349915   -0.19267416]]


In [13]:
# Hashed Columns: hash the `thal` strings into 1000 buckets instead of listing a vocabulary
thal_hashed = feature_column.categorical_column_with_hash_bucket(
    'thal',
    hash_bucket_size=1000,
)
demo(feature_column.indicator_column(thal_hashed))

W0407 08:54:41.252217 13452 deprecation.py:323] From C:\Users\sriniman\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:4362: HashedCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [14]:
# Crossed Columns: combine the age bucket and `thal` into one feature, hashed into 1000 buckets
crossed_feature = feature_column.crossed_column(
    [age_buckets, thal],
    hash_bucket_size=1000,
)
demo(feature_column.indicator_column(crossed_feature))

W0407 08:54:41.276254 13452 deprecation.py:323] From C:\Users\sriniman\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:4362: CrossedColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [15]:
# Collect every feature column for the model in this list
feature_columns = []

In [16]:
# Numeric Columns: one numeric feature column per listed CSV header
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    feature_columns.append(feature_column.numeric_column(header))

In [17]:
# Bucketized Columns: `age` is the numeric column created in the demo above
age_buckets = feature_column.bucketized_column(
    age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65],
)
feature_columns.append(age_buckets)

In [18]:
# Indicator columns: `thal` over a three-entry vocabulary, wrapped as an indicator column
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal',
    ['fixed', 'normal', 'reversible'],
)
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

In [19]:
# Embedding Columns: the same `thal` categories as 8-dimensional vectors
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

In [20]:
# Crossed Columns: age bucket x thal, hashed into 1000 buckets.
# The crossed column must be wrapped in an indicator column before it is
# appended. Previously the wrapper was assigned to a misspelled name
# (`crosses_feature`), so the raw crossed column was appended instead.
crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)

In [21]:
# Start from an empty list so re-running this cell never appends duplicates
feature_columns = []

# Numeric columns: one numeric feature column per listed CSV header
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    feature_columns.append(feature_column.numeric_column(header))

# Bucketized column: `age` (the numeric column created earlier in the notebook)
# split at the given boundaries
age_buckets = feature_column.bucketized_column(
    age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65],
)
feature_columns.append(age_buckets)

# Indicator column over the three `thal` vocabulary entries
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal',
    ['fixed', 'normal', 'reversible'],
)
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# Embedding column: the same `thal` categories as 8-dimensional vectors
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# Crossed column: age bucket x thal, hashed into 1000 buckets,
# then wrapped in an indicator column
crossed_feature = feature_column.indicator_column(
    feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
)
feature_columns.append(crossed_feature)

In [22]:
# Build the model's input layer from the collected feature columns
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [23]:
# Rebuild the datasets with a larger batch size than the 5 used for the demos
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

### Create, Compile and Train the Model

In [24]:
# Feature layer, two 128-unit ReLU hidden layers, and a single sigmoid output unit
model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

In [25]:
# Binary cross-entropy matches the sigmoid output; accuracy is tracked as the metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# Train for 5 epochs, evaluating on the validation set after each one
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2d5bba9e828>

In [27]:
# Score the trained model on the held-out test set
loss, accuracy = model.evaluate(test_ds)
print('Accuracy', accuracy)

Accuracy 0.72131145
