<a href="https://colab.research.google.com/github/sourcecode369/TensorFlow-2.0/blob/master/tensorflow_2.0_docs/TensorFlow%20Core/Tutorials/Structured%20Data/Feature%20Columns/Notebook1_Classify_structured_data_with_feature_columns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install TensorFlow 2.x

In [1]:
!pip install --upgrade tensorflow

Collecting tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/46/0f/7bd55361168bb32796b360ad15a25de6966c9c1beb58a8e30c01c8279862/tensorflow-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (86.3MB)
[K     |████████████████████████████████| 86.3MB 1.3MB/s 
Collecting tensorflow-estimator<2.1.0,>=2.0.0 (from tensorflow)
[?25l  Downloading https://files.pythonhosted.org/packages/95/00/5e6cdf86190a70d7382d320b2b04e4ff0f8191a37d90a422a2f8ff0705bb/tensorflow_estimator-2.0.0-py2.py3-none-any.whl (449kB)
[K     |████████████████████████████████| 450kB 32.0MB/s 
Collecting tensorboard<2.1.0,>=2.0.0 (from tensorflow)
[?25l  Downloading https://files.pythonhosted.org/packages/9b/a6/e8ffa4e2ddb216449d34cfcb825ebb38206bee5c4553d69e7bc8bc2c5d64/tensorboard-2.0.0-py3-none-any.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 28.1MB/s 
Installing collected packages: tensorflow-estimator, tensorboard, tensorflow
  Found existing installation: tensorflow-estimator 1.15.1
    

### Import TensorFlow and other Libraries

In [2]:
%%time
%reload_ext autoreload
%autoreload 2
%reload_ext tensorboard

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = "retina"

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
tfds.disable_progress_bar()
from tensorflow import feature_column
from tensorflow.keras import layers
keras = tf.keras

# Report the library versions this run is using.
print(f"TensorFlow version {tf.__version__}, Keras version {keras.__version__}")
# The message is printed only when at least one GPU device is listed.
if tf.config.experimental.list_physical_devices("GPU"):
    print("GPU is available.")
print(f"TensorFlow is executing eagerly: {tf.executing_eagerly()}")

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

TensorFlow version 2.0.0, Keras version 2.2.4-tf
TensorFlow is executing eagerly: True
CPU times: user 2.14 s, sys: 235 ms, total: 2.38 s
Wall time: 2.73 s


### Use Pandas to create a DataFrame

In [3]:
# Location of the heart.csv file used throughout this notebook.
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
# Read the CSV at URL into a DataFrame and preview its first rows.
dataframe = pd.read_csv(URL)
dataframe.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [4]:
# Inspect the column dtypes; in the recorded output `thal` is the only object (non-numeric) column.
dataframe.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal         object
target        int64
dtype: object

### Split the data into train, validation, and test sets

In [5]:
# Fixed seed so the same rows land in each split on every run.
SEED = 42

# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
train, test = train_test_split(dataframe, test_size=0.2, random_state=SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SEED)
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Validation shape: {val.shape}")

Train shape: (193, 14)
Test shape: (61, 14)
Validation shape: (49, 14)


### Create an input pipeline with tf.data

In [0]:
def df_to_dataset(dataframe, shuffle=True, batch_size=64):
  """Turn a DataFrame into a batched tf.data.Dataset of (features, label) pairs.

  Args:
    dataframe: DataFrame containing a "target" column; it is copied, so the
      caller's frame is left unchanged.
    shuffle: when true, shuffle with a buffer as large as the frame.
    batch_size: number of rows per batch.

  Returns:
    A batched dataset whose elements are (dict of column name -> values, labels).
  """
  features = dataframe.copy()
  # pop() removes the label column from the copy, leaving only feature columns.
  labels = features.pop("target")
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
  if not shuffle:
    return dataset.batch(batch_size)
  return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [0]:
# Batch size shared by the three datasets built below.
batch_size = 64

# Training data uses df_to_dataset's default shuffle=True.
train_ds = df_to_dataset(train, batch_size=batch_size)

# Validation and test data are built with shuffle=False.
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)

test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [8]:
# Print each dataset's element_spec: a (features dict, label) pair of TensorSpecs in the recorded output.
print(f"Training dataset: {train_ds.element_spec}")

print(f"Test dataset: {test_ds.element_spec}")

print(f"Validation dataset: {val_ds.element_spec}")

Training dataset: ({'age': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'sex': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'cp': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'trestbps': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'chol': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'fbs': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'restecg': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'thalach': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'exang': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'oldpeak': TensorSpec(shape=(None,), dtype=tf.float64, name=None), 'slope': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'ca': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'thal': TensorSpec(shape=(None,), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))
Test dataset: ({'age': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'sex': TensorSpec(shape=(None,)

### Understanding the input pipeline

In [9]:
# Pull a single batch from the training dataset and show its labels,
# its feature tensors, and the feature names.
for feature_batch, label_batch in train_ds.take(1):
  print(f"Label Batch: {label_batch}")
  print(f"A batch of feature values: {feature_batch.values()}")
  print(f"All features: {feature_batch.keys()}")

Label Batch: [1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0]
A batch of feature values: dict_values([<tf.Tensor: id=68, shape=(64,), dtype=int32, numpy=
array([59, 39, 57, 58, 71, 40, 58, 58, 57, 42, 55, 52, 60, 63, 45, 56, 45,
       58, 54, 45, 37, 41, 60, 48, 53, 58, 45, 51, 52, 44, 59, 76, 35, 47,
       65, 56, 61, 50, 42, 59, 46, 45, 74, 43, 39, 70, 41, 60, 54, 70, 66,
       41, 58, 59, 56, 44, 55, 43, 44, 68, 45, 66, 71, 49], dtype=int32)>, <tf.Tensor: id=76, shape=(64,), dtype=int32, numpy=
array([1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1],
      dtype=int32)>, <tf.Tensor: id=71, shape=(64,), dtype=int32, numpy=
array([4, 3, 4, 3, 3, 4, 4, 4, 4, 1, 2, 2, 4, 4, 4, 2, 2, 3, 3, 4, 3, 2,
       4, 3, 4, 4, 2, 3, 3, 2, 0, 3, 4, 4, 3, 2, 4, 

### Demonstrate several types of feature columns

In [0]:
# Keep the features dict (element 0 of the (features, labels) pair) of one
# training batch; the demo cells below transform this batch.
example_batch = next(iter(train_ds))[0]

In [0]:
def demo(feature_column):
    """Print what a feature column produces for the notebook's example batch.

    Wraps `feature_column` in a DenseFeatures layer, applies it to the
    module-level `example_batch`, and prints the result as a NumPy array.

    Note: inside this function the parameter name hides the imported
    `feature_column` module.
    """
    dense_layer = layers.DenseFeatures(feature_column)
    transformed = dense_layer(example_batch)
    print(transformed.numpy())

In [12]:
# Numeric column over "age"; the recorded output shows the ages as one float per row.
age = feature_column.numeric_column("age")
demo(age)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[[65.]
 [63.]
 [57.]
 [58.]
 [41.]
 [42.]
 [53.]
 [62.]
 [60.]
 [55.]
 [60.]
 [62.]
 [48.]
 [66.]
 [59.]
 [44.]
 [63.]
 [58.]
 [68.]
 [44.]
 [41.]
 [51.]
 [55.]
 [63.]
 [42.]
 [47.]
 [46.]
 [54.]
 [49.]
 [44.]
 [71.]
 [41.]
 [67.]
 [54.]
 [57.]
 [63.]
 [59.]
 [45.]
 [54.]
 [65.]
 [66.]
 [54.]
 [59.]
 [52.]
 [65.]
 [51.]
 [65.]
 [45.]
 [53.]
 [29.]
 [58.]
 [57.]
 [58.]
 [64.]
 [59.]
 [54.]
 [56.]
 [40.]
 [71.]
 [49.]
 [35.]
 [57.]
 [57.]
 [39.]]


In [0]:
# Bucketize age at these 10 boundaries; the recorded demo output has 11 one-hot slots per row.
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 40, 45, 50, 55, 65, 70, 85])

In [14]:
# Show the bucketized encoding of age for the example batch.
demo(age_buckets)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1

In [15]:
# Categorical column over the three `thal` strings listed here, wrapped in an
# indicator column so it can be shown as a one-hot row per example.
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible']
)
thal_one_hot = feature_column.indicator_column(thal)
demo(thal_one_hot)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
[[0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0.

In [16]:
# Represent each thal category as a 10-dimensional dense vector; in the recorded
# output, rows with the same category share the same vector.
thal_embedding = feature_column.embedding_column(thal, dimension=10)
demo(thal_embedding)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[[ 0.07883181  0.4935395   0.30147004 -0.4979467  -0.1619393   0.09534717
   0.02093381  0.31305107  0.21107933  0.39731863]
 [-0.241402   -0.27333358  0.3484615  -0.07261305 -0.08289272  0.14960644
  -0.18578492 -0.12193766  0.303454    0.22694093]
 [-0.241402   -0.27333358  0.3484615  -0.07261305 -0.08289272  0.14960644
  -0.18578492 -0.12193766  0.303454    0.22694093]
 [ 0.07883181  0.4935395   0.30147004 -0.4979467  -0.1619393   0.09534717
   0.02093381  0.31305107  0.21107933  0.39731863]
 [-0.241402   -0.27333358  0.3484615  -0.07261305 -0.08289272  0.14960644
  -0.18578492 -0.12193766  0.303454    0.22694093]
 [-0.241402   -0.27333358  0.3484615  -0.07261305 -0.08289272  0.14960644


In [17]:
# Hash the thal strings into 1000 buckets instead of listing a vocabulary,
# then wrap in an indicator column for display.
thal_hashed = feature_column.categorical_column_with_hash_bucket(
    'thal', hash_bucket_size=1000
)
demo(feature_column.indicator_column(thal_hashed))



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [18]:
# Cross the age buckets with thal, hashed into 1000 buckets, and wrap the
# result in an indicator column for display.
crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
demo(feature_column.indicator_column(crossed_feature))



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Build a model to train

#### Create features

In [0]:
# Collect every feature column the model will consume.
feature_columns = []

# numeric cols
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
  feature_columns.append(feature_column.numeric_column(header))

# bucketized cols
# Build the source numeric column here instead of relying on the `age`
# variable left behind by the earlier demo cell.
age = feature_column.numeric_column('age')
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

# indicator cols
thal = feature_column.categorical_column_with_vocabulary_list(
      'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# embedding cols
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# crossed cols
# The raw cross gets its own name so `crossed_feature` is only ever bound to
# the indicator-wrapped column that is added to the model.
age_thal_cross = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(age_thal_cross)
feature_columns.append(crossed_feature)

In [0]:
# One DenseFeatures layer built from every column collected in feature_columns;
# it is used as the first layer of the model below.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [0]:
# Rebuild the three datasets with a batch size of 32 (the earlier cell used 64).
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

#### Build the model

In [0]:
# Feature layer, then two 128-unit ReLU layers, then a single sigmoid output unit.
model = tf.keras.Sequential([
                             feature_layer,
                             layers.Dense(128, activation=tf.nn.relu),
                             layers.Dense(128,activation=tf.nn.relu),
                             layers.Dense(1, activation=tf.nn.sigmoid)
])
# Adam optimizer with binary cross-entropy loss; accuracy is tracked as the metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

#### Train the model

In [37]:
# Train for 10 epochs on train_ds, with val_ds supplied as the validation data.
model.fit(train_ds, validation_data=val_ds, epochs=10)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f10a47de518>

#### Evaluate the model

In [38]:
# Evaluate on the test dataset; the two returned values are unpacked as loss and accuracy.
loss, accuracy = model.evaluate(test_ds)

