In [101]:
"""https://www.tensorflow.org/alpha/tutorials/keras/feature_columns"""

#import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

## Fetch the heart-disease CSV over HTTP and parse it into a table.
dataURL = 'https://storage.googleapis.com/applied-dl/heart.csv'
dataframe = pd.read_csv(filepath_or_buffer=dataURL)  # -> pandas.DataFrame

The created `pandas.DataFrame` object has the following structure:

In [108]:
# Print a short structural summary of the frame and of one of its columns.
ageColumn = dataframe['age']
summary = (
    ("Type:", type(dataframe)),
    ("Shape:", dataframe.shape),
    ("Attributes:", list(dataframe.columns)),
    ("Column type:", type(ageColumn)),
    ("Column size:", ageColumn.shape),
)
for label, value in summary:
    print(label, value)

Type: <class 'pandas.core.frame.DataFrame'>
Shape: (303, 14)
Attributes: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
Column type: <class 'pandas.core.series.Series'>
Column size: (303,)


Each *row* corresponds to a patient (or a data point), and each *column* corresponds to an attribute.

Note that column values can be accessed by giving a column name as either an *attribute* or a *key*, i.e., `dataframe.age` or `dataframe['age']` for the age values.

We split the dataframe into three sub-dataframes for training, validation, and testing:

In [102]:
# Seed both splits so that the train/validation/test membership is the same
# on every "Restart & Run All"; unseeded, each run draws a different split.
splitSeed = 42

# Hold out 20% of all rows for testing, then 20% of the remainder for
# validation; whatever is left is the training set. The intermediate frame
# gets its own name so that no variable is rebound from itself.
trainValidateFrame, testFrame = train_test_split(
    dataframe, test_size=0.2, random_state=splitSeed)
trainFrame, validateFrame = train_test_split(
    trainValidateFrame, test_size=0.2, random_state=splitSeed)
print(trainFrame.shape)
print(validateFrame.shape)
print(testFrame.shape)

(193, 14)
(49, 14)
(61, 14)


Next, we wrap each (sub-)dataframe into a `tensorflow.data.Dataset` object. The latter becomes a bridge that maps the dataframe to feature columns, which will be used to train the model.

In [113]:
def dataframe2dataset(dataframe, shuffle=True, batchSize=32,
                      labelName='target', seed=None):
    """Wrap a `pandas.DataFrame` into a batched `tf.data.Dataset`.

    Args:
        dataframe: Frame holding the feature columns plus the label column.
            It is copied first, so the caller's frame is not modified.
        shuffle: If true, shuffle the rows using a buffer as large as the
            whole frame.
        batchSize: Number of rows per batch.
        labelName: Name of the column to split off as the labels. Defaults
            to 'target' (the 1/0 heart-disease diagnosis in this notebook).
        seed: Optional seed forwarded to `Dataset.shuffle`. The default
            `None` keeps the previous, unseeded behaviour.

    Returns:
        A batched dataset whose elements are `(features, labels)` tuples;
        `features` is a dict mapping each remaining column name to its
        values.

    Raises:
        KeyError: If `labelName` is not a column of `dataframe`.
    """
    features = dataframe.copy()
    labels = features.pop(labelName)
    # dict(features).keys()   -> the data attributes (column names).
    # dict(features).values() -> the data values (one Series per column).
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(features), seed=seed)
    dataset = dataset.batch(batchSize)  # Dataset -> BatchDataset
    return dataset

# Deliberately tiny batches, so that a whole batch can be printed below.
batchSize = 5

# Only the training rows are shuffled; validation and test keep row order.
trainSet = dataframe2dataset(trainFrame, shuffle=True, batchSize=batchSize)
validateSet = dataframe2dataset(validateFrame, shuffle=False, batchSize=batchSize)
testSet = dataframe2dataset(testFrame, shuffle=False, batchSize=batchSize)

`trainSet`, `validateSet` and `testSet` are `BatchDataset` objects. Iterating over one of them yields one **batch** of data rows at a time. Each batch is a tuple of a *feature batch* and a *label batch*. The feature batch is a dict mapping the column names to their values.

In [114]:
# Pull a single batch from the training set: a (features, labels) tuple.
exampleBatch = next(iter(trainSet))
print("Type and length:", type(exampleBatch), ",", len(exampleBatch))
print("batch[0] keys:", list(exampleBatch[0].keys()))
# The 'age' values live in the feature dict, i.e. batch[0]; the label used to
# say batch[1], which is the label tensor printed on the next line.
print("batch[0] value example:", exampleBatch[0]['age'])
print("batch[1]:", exampleBatch[1])

Type and length: <class 'tuple'> , 2
batch[0] keys: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
batch[1] value example: tf.Tensor([44 43 70 57 40], shape=(5,), dtype=int32)
batch[1]: tf.Tensor([0 0 0 0 0], shape=(5,), dtype=int32)


Our original data has different types of features, e.g., numerical, categorical or binary. `tensorflow.feature_column` provides various types of feature columns.

We will use the following helper function to see some examples.

In [150]:
def inspect(featureColumn, featureBatch=None):
    """Print how a feature column transforms a batch of raw features.

    Args:
        featureColumn: Feature column passed to
            `tf.keras.layers.DenseFeatures`.
        featureBatch: Dict of raw features to transform. When omitted, the
            feature dict of the notebook-level `exampleBatch` is used, which
            is what this helper always did before the parameter existed.

    The transformed values and their shape are printed; nothing is returned.
    """
    if featureBatch is None:
        # Resolved at call time, so `exampleBatch` only has to exist by then.
        featureBatch = exampleBatch[0]
    featureLayer = tf.keras.layers.DenseFeatures(featureColumn)
    transformedBatch = featureLayer(featureBatch)
    print(transformedBatch.numpy(), ", shape:", transformedBatch.shape)

1. Numeric columns

In [151]:
# A numeric column passes the raw 'age' values straight through.
agesExample = tf.feature_column.numeric_column(key='age')
inspect(agesExample)

# The untransformed ages from the same batch, for comparison.
rawAges = exampleBatch[0]['age']
print(rawAges)

[[44.]
 [43.]
 [70.]
 [57.]
 [40.]] , shape: (5, 1)
tf.Tensor([44 43 70 57 40], shape=(5,), dtype=int32)


2. Bucketized columns

In [152]:
# One-hot encode each age by the interval it falls into. The 10 boundaries
# below split the number line into 11 buckets.
ageBoundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
ageBuckets = tf.feature_column.bucketized_column(
    source_column=agesExample, boundaries=ageBoundaries)
inspect(ageBuckets)

[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]] , shape: (5, 11)


3. Categorical columns

In [155]:
# Map the 'thal' strings onto a fixed, three-entry vocabulary.
thalVocabulary = ['fixed', 'normal', 'reversible']
thalExample = tf.feature_column.categorical_column_with_vocabulary_list(
    key='thal', vocabulary_list=thalVocabulary)

# Wrap the categorical column in an indicator column to view it as one-hot rows.
thalOneHot = tf.feature_column.indicator_column(thalExample)
inspect(thalOneHot)

# The raw 'thal' strings from the same batch, for comparison.
print(exampleBatch[0]['thal'])

[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]] , shape: (5, 3)
tf.Tensor([b'normal' b'normal' b'normal' b'normal' b'reversible'], shape=(5,), dtype=string)


4. Embedding columns.
<br>A dense, low-dimensional embedding of a categorical feature. It is preferable to a one-hot encoding when the number of categories is large.

In [154]:
# Represent each 'thal' category by a dense vector of 8 floats.
thalEmbedding = tf.feature_column.embedding_column(
    categorical_column=thalExample, dimension=8)
inspect(thalEmbedding)

[[-0.46359766 -0.03813995 -0.06968783 -0.4653723  -0.14574529  0.3870377
  -0.65910083  0.1292486 ]
 [-0.46359766 -0.03813995 -0.06968783 -0.4653723  -0.14574529  0.3870377
  -0.65910083  0.1292486 ]
 [-0.46359766 -0.03813995 -0.06968783 -0.4653723  -0.14574529  0.3870377
  -0.65910083  0.1292486 ]
 [-0.46359766 -0.03813995 -0.06968783 -0.4653723  -0.14574529  0.3870377
  -0.65910083  0.1292486 ]
 [ 0.6605061  -0.19259916  0.00831191 -0.48197597  0.32038784  0.26782286
   0.03419415  0.07020086]] , shape: (5, 8)


5. Hashed feature columns.
<br>Hash each category string into one of `hash_bucket_size` buckets. `hash_bucket_size` can be much smaller than the vocabulary size, at the cost that different categories may collide in the same bucket.

In [157]:
# Hash the 'thal' strings into 1000 buckets instead of listing a vocabulary.
thalHashed = tf.feature_column.categorical_column_with_hash_bucket(
    key='thal', hash_bucket_size=1000)

# View the hashed column as one-hot rows, one slot per hash bucket.
thalHashedOneHot = tf.feature_column.indicator_column(thalHashed)
inspect(thalHashedOneHot)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] , shape: (5, 1000)


6. Crossed feature columns.
<br>Hash encoding of **feature crosses**. The example below crosses the two features, age and thal.

In [159]:
# Cross the bucketized age with the 'thal' category and hash each
# combination into one of 1000 buckets.
crossedFeatures = [ageBuckets, thalExample]
featureCrossExample = tf.feature_column.crossed_column(
    keys=crossedFeatures, hash_bucket_size=1000)
inspect(tf.feature_column.indicator_column(featureCrossExample))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] , shape: (5, 1000)
