# CH 13 Loading and Preprocessing Data with TensorFlow


In [1]:
import tensorflow as tf

### The tf.data API


In [3]:
# Print the built-in help text for the tf.data package.
help(tf.data)

Help on package tensorflow._api.v2.data in tensorflow._api.v2:

NAME
    tensorflow._api.v2.data - `tf.data.Dataset` API for input pipelines.

DESCRIPTION
    See [Importing Data](https://tensorflow.org/guide/data) for an overview.

PACKAGE CONTENTS
    experimental (package)

DATA
    AUTOTUNE = -1
    INFINITE_CARDINALITY = -1
    UNKNOWN_CARDINALITY = -2

FILE
    c:\users\sayed\.conda\envs\homl3\lib\site-packages\tensorflow\_api\v2\data\__init__.py




In [4]:
# A 1-D int32 tensor holding the integers 0 through 9.
tf.range(10)

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])>

In [6]:
# Slice the range tensor along its first axis: each dataset element is one scalar.
dataset = tf.data.Dataset.from_tensor_slices(tf.range(10))
dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [9]:
# Datasets cannot be indexed; catch the resulting exception and show its message.
try:
    dataset[0]
except Exception as err:
    print(err)

'_TensorSliceDataset' object is not subscriptable


In [10]:
# Element count of the dataset built above.
len(dataset)

10

In [15]:
# Pull the first three elements by hand from an explicit iterator
# (arguments are evaluated left to right, so the order is preserved).
it = iter(dataset)
print(next(it), next(it), next(it), sep="\n")

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)


In [16]:
# A dataset is iterable: the loop yields every element as a scalar tf.Tensor.
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [17]:
# Nested containers are sliced in parallel: every element keeps the same
# dict / tuple layout, holding one value from each leaf list.
X_nested = {
    "a": ([1, 2, 3], [4, 5, 6]),
    "b": [7, 8, 9],
}
dataset = tf.data.Dataset.from_tensor_slices(X_nested)

for item in dataset:
    print(item)

{'a': (<tf.Tensor: shape=(), dtype=int32, numpy=1>, <tf.Tensor: shape=(), dtype=int32, numpy=4>), 'b': <tf.Tensor: shape=(), dtype=int32, numpy=7>}
{'a': (<tf.Tensor: shape=(), dtype=int32, numpy=2>, <tf.Tensor: shape=(), dtype=int32, numpy=5>), 'b': <tf.Tensor: shape=(), dtype=int32, numpy=8>}
{'a': (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(), dtype=int32, numpy=6>), 'b': <tf.Tensor: shape=(), dtype=int32, numpy=9>}


In [25]:
# Deliberately unbalanced input: the lists under "a" have 3 entries, "b" has 4.
# Slicing needs matching leading dimensions, so this is expected to fail.
X_nested = {"a": ([1, 2, 3], [4, 5, 6]), "b": [7, 8, 9, 10]}
try:
    dataset = tf.data.Dataset.from_tensor_slices(X_nested)
    for item in dataset:
        print(item)
except Exception as err:
    print('ERROR: ', err)

ERROR:  Dimensions 3 and 4 are not compatible


### Chaining Transformations


In [34]:
# Build the 0..9 dataset and replay it three times in a single chained expression.
dataset = tf.data.Dataset.from_tensor_slices(tf.range(10)).repeat(3)
print(list(dataset.as_numpy_iterator()))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [38]:
# Group consecutive elements into batches of 8; the 30 elements leave a shorter final batch of 6.
list(dataset.batch(batch_size=8).as_numpy_iterator())

[array([0, 1, 2, 3, 4, 5, 6, 7]),
 array([8, 9, 0, 1, 2, 3, 4, 5]),
 array([6, 7, 8, 9, 0, 1, 2, 3]),
 array([4, 5, 6, 7, 8, 9])]

In [79]:
# Replay the 0..9 dataset 200 times (2000 elements in total).
dataset = tf.data.Dataset.from_tensor_slices(tf.range(10)).repeat(200)
# Preview only the first 20 elements: take() stops the pipeline early instead of
# materializing all 2000 elements into a Python list just to slice it.
print(list(dataset.take(20).as_numpy_iterator()))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [80]:
# 10 elements repeated 200 times.
len(dataset)

2000

In [81]:
# Batches of exactly 8 elements; drop_remainder=True discards any shorter trailing batch.
batched_data = dataset.batch(batch_size=8,drop_remainder=True)

`num_parallel_calls` (argument of `Dataset.batch`)
(Optional.) A tf.int64 scalar tf.Tensor, representing the number of batches to compute asynchronously in parallel. If not specified, batches will be computed sequentially. If the value tf.data.AUTOTUNE is used, then the number of parallel calls is set dynamically based on available resources.


In [82]:
# Same batching as batched_data, but tf.data.AUTOTUNE lets tf.data decide how many
# batches to compute in parallel.
parallel_batched_data = dataset.batch(batch_size=8,drop_remainder=True,
                            num_parallel_calls=tf.data.AUTOTUNE)

In [85]:
# Preview the first four batches; take(4) avoids computing every batch of the
# dataset only to throw all but four of them away.
list(parallel_batched_data.take(4).as_numpy_iterator())

[array([0, 1, 2, 3, 4, 5, 6, 7]),
 array([8, 9, 0, 1, 2, 3, 4, 5]),
 array([6, 7, 8, 9, 0, 1, 2, 3]),
 array([4, 5, 6, 7, 8, 9, 0, 1])]

#### map


In [93]:
# Multiply every element of every batch by 3.
mapped_data = batched_data.map(lambda x: x * 3)
# Show only the first four batches; take(4) stops the pipeline early rather than
# materializing the whole mapped dataset before slicing.
list(mapped_data.take(4).as_numpy_iterator())

[array([ 0,  3,  6,  9, 12, 15, 18, 21]),
 array([24, 27,  0,  3,  6,  9, 12, 15]),
 array([18, 21, 24, 27,  0,  3,  6,  9]),
 array([12, 15, 18, 21, 24, 27,  0,  3])]

In [95]:
def exp_func(x):
    """Return the element-wise exponential of ``x`` as a float32 tensor.

    ``x`` is cast to float32 before ``tf.exp`` is applied. ``tf.cast`` is used
    instead of Python's ``float()`` because ``float()`` on a tensor only works
    inside ``Dataset.map`` thanks to AutoGraph's rewriting of the builtin;
    calling the function directly on a multi-element tensor would fail.
    ``tf.cast`` behaves the same in both settings.

    Args:
        x: A numeric tensor (or value convertible to one) of any shape.

    Returns:
        A float32 tensor of the same shape holding ``e ** x`` for each element.
    """
    return tf.exp(tf.cast(x, tf.float32))

# Apply exp_func to every batch.
mapped_data = batched_data.map(exp_func)
# Show only the first four batches; take(4) stops the pipeline early rather than
# materializing the whole mapped dataset before slicing.
list(mapped_data.take(4).as_numpy_iterator())

[array([1.0000000e+00, 2.7182817e+00, 7.3890562e+00, 2.0085537e+01,
        5.4598148e+01, 1.4841316e+02, 4.0342877e+02, 1.0966332e+03],
       dtype=float32),
 array([2.9809580e+03, 8.1030835e+03, 1.0000000e+00, 2.7182817e+00,
        7.3890562e+00, 2.0085537e+01, 5.4598148e+01, 1.4841316e+02],
       dtype=float32),
 array([4.0342877e+02, 1.0966332e+03, 2.9809580e+03, 8.1030835e+03,
        1.0000000e+00, 2.7182817e+00, 7.3890562e+00, 2.0085537e+01],
       dtype=float32),
 array([5.4598148e+01, 1.4841316e+02, 4.0342877e+02, 1.0966332e+03,
        2.9809580e+03, 8.1030835e+03, 1.0000000e+00, 2.7182817e+00],
       dtype=float32)]

#### filter


In [122]:
# Keep only the individual elements greater than 5 (the predicate sees one scalar at a time).
filtered_data = dataset.filter(lambda x: x > 5)
# Preview the first five survivors; take(5) stops as soon as five elements pass
# the filter instead of filtering the entire dataset and slicing the list.
list(filtered_data.take(5).as_numpy_iterator())

[6, 7, 8, 9, 6]

In [124]:
# On batched data the comparison yields one boolean per batch entry rather than a
# single boolean, so filter() is expected to reject this predicate.
try:
    filtered_data = batched_data.filter(lambda batch: batch > 5)
    list(filtered_data.as_numpy_iterator())[:5]
except Exception as err:
    print("ERROR", err)


Invalid `predicate`. `predicate` must return a `tf.bool` scalar tensor, but its return type is TensorSpec(shape=(8,), dtype=tf.bool, name=None).


In [134]:
# Reduce each batch to a single boolean: keep the batch when its smallest value is 0.
filtered_data = batched_data.filter(lambda x: tf.reduce_min(x) == 0)
# Preview the first five matching batches; take(5) stops early instead of
# filtering every batch and slicing the resulting list.
list(filtered_data.take(5).as_numpy_iterator())

[array([0, 1, 2, 3, 4, 5, 6, 7]),
 array([8, 9, 0, 1, 2, 3, 4, 5]),
 array([6, 7, 8, 9, 0, 1, 2, 3]),
 array([4, 5, 6, 7, 8, 9, 0, 1]),
 array([0, 1, 2, 3, 4, 5, 6, 7])]

In [140]:
# take(3) keeps only the first three batches of the filtered dataset.
list(filtered_data.take(3).as_numpy_iterator())

[array([0, 1, 2, 3, 4, 5, 6, 7]),
 array([8, 9, 0, 1, 2, 3, 4, 5]),
 array([6, 7, 8, 9, 0, 1, 2, 3])]

#### Shuffle


`buffer_size` (argument of `Dataset.shuffle`)
A tf.int64 scalar tf.Tensor, representing the number of elements from this dataset from which the new dataset will sample. To uniformly shuffle the entire dataset, use buffer_size=dataset.cardinality().

Randomly shuffles the elements of this dataset.

This dataset fills a buffer with buffer_size elements, then randomly samples elements from this buffer, replacing the selected elements with new elements. For perfect shuffling, a buffer size greater than or equal to the full size of the dataset is required.

For instance, if your dataset contains 10,000 elements but buffer_size is set to 1,000, then shuffle will initially select a random element from only the first 1,000 elements in the buffer. Once an element is selected, its space in the buffer is replaced by the next (i.e. 1,001-st) element, maintaining the 1,000 element buffer.


In [157]:
# 0..9 replayed twice, shuffled through a 4-element buffer with a fixed seed
# (and no reshuffling between iterations), then grouped into batches of 7.
dataset = (
    tf.data.Dataset.range(10)
    .repeat(2)
    .shuffle(buffer_size=4, seed=42, reshuffle_each_iteration=False)
    .batch(7)
)
list(dataset.as_numpy_iterator())

[array([0, 1, 3, 4, 5, 2, 6], dtype=int64),
 array([0, 9, 1, 7, 2, 3, 8], dtype=int64),
 array([5, 8, 7, 9, 4, 6], dtype=int64)]

### Interleaving Lines from Multiple Files
