## Imports

In [1]:
import tensorflow as tf
import time

# Display the TensorFlow version in use so readers can match the environment.
tf.__version__

'2.0.0'

## Data gathering

In [2]:
# Load the Fashion-MNIST train/test splits that ship with tf.keras.datasets.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# Sanity check: show the shape of every array, in load order.
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

((60000, 28, 28), (60000,), (10000, 28, 28), (10000,))

## Prepare `tf.data` datasets

In [3]:
# https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensor_slices
# Wrap each (images, labels) pair of arrays in its own tf.data.Dataset.
train_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices(pair)
    for pair in ((X_train, y_train), (X_test, y_test))
)

## Common data streaming steps

- Shuffle
- Batching
- Prefetching, so that batches are available as soon as possible

Reference: https://www.tensorflow.org/tutorials/load_data/images

In [4]:
# Build the pipeline from the raw arrays (as the later benchmark cells do) so
# that re-running this cell does not stack a second shuffle/batch on top of an
# already-batched `train_dataset`.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=1000)   # sample elements from a 1000-element shuffle buffer
    .repeat()                    # loop over the data indefinitely
    .batch(256)                  # group examples into batches of 256
    .prefetch(buffer_size=1000)  # prepare up to 1000 batches ahead of the consumer
)
type(train_dataset)

tensorflow.python.data.ops.dataset_ops.PrefetchDataset

## Inspection of the batches

In [5]:
# Pull a single batch out of the pipeline to look at its layout.
images, labels = next(iter(train_dataset.take(1)))

images.shape

TensorShape([256, 28, 28])

## Speed comparison

In [6]:
# Keras generator baseline: pixel values are rescaled by 1/255 on the fly.
gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1 / 255.)

# The image arrays are (N, 28, 28); append a trailing channel axis of size 1
# so that `flow` receives (N, 28, 28, 1) arrays.
train_flow = gen.flow(
    X_train[..., None], y_train,
    batch_size=256, shuffle=True,
)
test_flow = gen.flow(X_test[..., None], y_test, batch_size=256)

In [7]:
# Function courtesy: https://www.tensorflow.org/tutorials/load_data/images#performance
default_timeit_steps = 1000

def timeit(ds, steps=default_timeit_steps, batch_size=256):
    """Time how long it takes to draw up to `steps` batches from `ds`.

    Args:
        ds: Any iterable of batches (e.g. a `tf.data.Dataset` or a Keras
            data iterator).
        steps: Maximum number of batches to draw.
        batch_size: Number of images per batch. Only used to convert the
            batch rate into an images-per-second figure.

    Prints a progress dot every 10 batches, then the number of batches
    drawn, the elapsed seconds and the throughput. Returns None.
    """
    # perf_counter is a monotonic, high-resolution clock meant for intervals;
    # time.time() is wall-clock time and can jump.
    start = time.perf_counter()
    drawn = 0
    # zip() stops at whichever side runs out first, so a dataset with fewer
    # than `steps` batches ends the loop instead of raising StopIteration.
    # Creating the iterator happens inside the timed region on purpose.
    for i, _ in zip(range(steps), ds):
        drawn = i + 1
        if i % 10 == 0:
            print('.', end='')
    print()
    duration = time.perf_counter() - start

    print("{} batches: {} s".format(drawn, duration))
    # Guard against a zero-length interval (e.g. steps=0 on a coarse clock).
    rate = batch_size * drawn / duration if duration > 0 else float('inf')
    print("{:0.5f} Images/s".format(rate))

In [8]:
# Keras ImageDataGenerator
# Baseline measurement: draw the default number of batches from `train_flow`.
timeit(train_flow)

....................................................................................................
1000 batches: 4.16782808303833 s
61422.87899 Images/s


In [9]:
# `tf.data`
# Same measurement on the `tf.data` pipeline.
# NOTE(review): this pipeline does not apply the 1/255 rescale that `gen`
# does, so the two timings do not cover exactly the same work.
timeit(train_dataset)

....................................................................................................
1000 batches: 0.9977211952209473 s
256584.70646 Images/s


## With `AUTOTUNE` prefetching

In [10]:
# Same pipeline as before, except that the prefetch depth is left to tf.data
# (AUTOTUNE) instead of being fixed at 1000 batches.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=1000)
    .repeat()
    .batch(256)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

timeit(train_dataset)

....................................................................................................
1000 batches: 0.9096441268920898 s
281428.73947 Images/s


## With caching

In [11]:
# AUTOTUNE pipeline with a cache() stage placed ahead of shuffle/repeat/batch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .cache()
    .shuffle(buffer_size=1000)
    .repeat()
    .batch(256)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

timeit(train_dataset)

....................................................................................................
1000 batches: 0.6213550567626953 s
412002.76269 Images/s


## Modeling with `tf.data` datasets

In [12]:
# Define and compile a model

model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)),
  # The tf.data pipelines in this notebook feed raw 0-255 pixel values (only
  # the Keras generator path rescales by 1/255). Scale to [0, 1] inside the
  # model so training and validation inputs are normalised identically.
  # Do not feed this model already-rescaled inputs such as `train_flow`.
  tf.keras.layers.Lambda(lambda pixels: tf.cast(pixels, tf.float32) / 255.0),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(10, activation='softmax')
])

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [13]:
# Batch the test split for validation. It is rebuilt from the raw arrays so
# that re-running this cell does not batch an already-batched dataset.
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(256)

In [14]:
# Train for 5 epochs. `train_dataset` repeats indefinitely, so the length of
# an epoch is set explicitly to one pass over the training images; the batched
# test split is used as validation data.
model.fit(
    x=train_dataset,
    epochs=5,
    steps_per_epoch=len(X_train) // 256,
    validation_data=test_dataset,
)

Train for 234 steps, validate for 40 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x147bb8cc0>