In [1]:
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_schema

tf.__version__

'1.13.1'

### High Performance Input Tensors

In [2]:
# Parsing spec for the TFRecord examples: every column is a fixed-length
# feature of shape [1]; only the dtype differs per column.
feature_spec = {
    column: tf.io.FixedLenFeature([1], dtype)
    for column, dtype in (
        ('beta1', tf.float32),
        ('beta2', tf.float32),
        ('weekday', tf.int64),
        ('hour', tf.int64),
        ('humidity', tf.float32),
    )
}
# Dataset schema derived from the feature spec above.
schema = dataset_schema.from_feature_spec(feature_spec)

In [3]:
def make_tfr_input_fn(filename_pattern, batch_size, options):
    """Build a parameterless input function that reads batched TFRecord features.

    The returned function closes over the arguments given here, so it can be
    called without any parameters later on. It parses the records with the
    module-level ``feature_spec`` and uses 'humidity' as the label column.

    Args:
        filename_pattern: File pattern of the TFRecord files to read.
        batch_size: Number of records combined into one batch.
        options: Dict with the keys 'shuffle_buffer_size',
            'prefetch_buffer_size', 'reader_num_threads',
            'parser_num_threads', 'sloppy_ordering' and 'num_epochs', which
            are forwarded unchanged to ``make_batched_features_dataset``.
            The optional key 'distribute' (default: False) selects what the
            input function returns.

    Returns:
        A function taking no arguments. When ``options['distribute']`` is
        truthy it returns the dataset itself; otherwise it returns the
        result of ``get_next()`` on a one-shot iterator over that dataset
        (a features/labels pair, because a label key is set).
    """

    def _input_fn():
        dataset = tf.data.experimental.make_batched_features_dataset(
            file_pattern=filename_pattern,
            batch_size=batch_size,
            features=feature_spec,
            shuffle_buffer_size=options['shuffle_buffer_size'],
            prefetch_buffer_size=options['prefetch_buffer_size'],
            reader_num_threads=options['reader_num_threads'],
            parser_num_threads=options['parser_num_threads'],
            sloppy_ordering=options['sloppy_ordering'],
            num_epochs=options['num_epochs'],
            label_key='humidity')

        # 'distribute' is a mode flag rather than a tuning parameter, so a
        # missing key falls back to the plain tensor-returning behaviour.
        if options.get('distribute', False):
            return dataset

        # Dataset.make_one_shot_iterator() is deprecated as a method; the
        # compat function is the supported replacement.
        iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
        return iterator.get_next()

    return _input_fn

Take the pattern from ```Beam_Pipelines.ipynb```:

In [4]:
import os

# temp_dir.txt holds the directory containing the TFRecord files
# (presumably written by Beam_Pipelines.ipynb -- confirm).
with open('temp_dir.txt') as file:
    # strip() guards against a trailing newline in the text file, which
    # would otherwise become part of the path and break the glob pattern.
    temp_dir = file.read().strip()

# Glob pattern matching all shards of the training TFRecord files.
file_pattern = os.path.join(temp_dir, "training.tfr-*")
file_pattern

'..\\temp\\training.tfr-*'

In [5]:
# Input function for training: batches of 5 records drawn from the files
# matching file_pattern, returned as tensors (distribute=False).
train_input_fn = make_tfr_input_fn(
    filename_pattern=file_pattern,
    batch_size=5,
    options=dict(
        num_epochs=None,  # None means: repeat infinitely
        shuffle_buffer_size=1000,
        prefetch_buffer_size=1000,
        reader_num_threads=10,
        parser_num_threads=10,
        sloppy_ordering=True,
        distribute=False,
    ),
)

This design pattern (a closure) lets us pass parameters to a function that is not allowed to take any. The outer function essentially hands its parameters to an inner (*daughter*) function as constants.

Later, we will provide this ```train_input_fn``` to the so-called ```estimator```. It is then up to the ```estimator``` to call ```train_input_fn``` and thereby create the input-generating computational sub-graph within its own graph and session context.

For demonstration purposes, we call the function ourselves and see what it returns.

In [6]:
# Calling the input function adds the input ops to the current graph and
# returns the feature tensors together with the label tensor.
samples, labels = train_input_fn()

In [7]:
# Display the dict of feature tensors (symbolic; nothing is evaluated yet).
samples

{'beta1': <tf.Tensor 'IteratorGetNext:0' shape=(5, 1) dtype=float32>,
 'beta2': <tf.Tensor 'IteratorGetNext:1' shape=(5, 1) dtype=float32>,
 'hour': <tf.Tensor 'IteratorGetNext:2' shape=(5, 1) dtype=int64>,
 'weekday': <tf.Tensor 'IteratorGetNext:3' shape=(5, 1) dtype=int64>}

In [8]:
# Display the label tensor (the 'humidity' column).
labels

<tf.Tensor 'IteratorGetNext:4' shape=(5, 1) dtype=float32>

Now, each time we evaluate ```samples``` and ```labels```, we'll get a new batch of 5 samples (the ```batch_size``` chosen above) with the associated 'humidity' labels.

In [9]:
# Evaluate both tensors in a single run() call so that the features and
# the labels come from the same batch.
with tf.Session() as sess:
    s, l = sess.run([samples, labels])

In [10]:
# Display the evaluated batch: s holds the feature arrays, l the labels.
s, l

({'beta1': array([[0.13788432],
         [0.5533913 ],
         [0.82273513],
         [0.7386045 ],
         [0.5238801 ]], dtype=float32), 'beta2': array([[0.42998454],
         [0.5715966 ],
         [0.72879624],
         [0.07653333],
         [0.15327226]], dtype=float32), 'hour': array([[ 5],
         [ 4],
         [ 5],
         [16],
         [ 1]], dtype=int64), 'weekday': array([[5],
         [4],
         [6],
         [3],
         [4]], dtype=int64)}, array([[12.803208],
        [19.632471],
        [25.337847],
        [27.787937],
        [20.085531]], dtype=float32))