In [112]:
import tensorflow as tf
import numpy as np

In [113]:
X = tf.constant([[2013, 2013], [2014,2014], [2015,2015], [2016,2016], [2017,2017]])
Y = tf.constant([12000, 14000, 15000, 16500, 17500])
# 也可以使用NumPy数组，效果相同
# X = np.array([2013, 2014, 2015, 2016, 2017])
# Y = np.array([12000, 14000, 15000, 16500, 17500])
# 当提供多个张量作为输入时，张量的第 0 维大小必须相同，且必须将多个张量作为元组 (Tuple，即使用 Python 中的小括号) 拼接并作为输入。
dataset = tf.data.Dataset.from_tensor_slices((X, Y))
for x, y in dataset:
    print(x.numpy(), y.numpy()) 

    
dataset = tf.data.Dataset.from_tensor_slices({'a': ([1, 2], [3, 4]),'b': [5, 6]})
print(list(dataset.as_numpy_iterator()))
print(list(dataset.as_numpy_iterator())[0]['a'])
print(list(dataset.as_numpy_iterator())[1]['b'])


# Two tensors can be combined into one Dataset object.
features = tf.constant([[1, 3], [2, 1], [3, 3]]) # ==> 3x2 tensor
labels = tf.constant(['A', 'B', 'A']) # ==> 3x1 tensor
dataset = tf.data.Dataset.from_tensor_slices((features, labels))
# Both the features and the labels tensors can be converted
# to a Dataset object separately and combined after.
features_dataset = tf.data.Dataset.from_tensor_slices(features)
labels_dataset = tf.data.Dataset.from_tensor_slices(labels)
dataset = tf.data.Dataset.zip((features_dataset, labels_dataset))
# A batched feature and label set can be converted to a Dataset
# in similar fashion.
batched_features = tf.constant([[[1, 3], [2, 3]],
                                [[2, 1], [1, 2]],
                                [[3, 3], [3, 2]]], shape=(3, 2, 2))
batched_labels = tf.constant([['A', 'A'],
                              ['B', 'B'],
                              ['A', 'B']], shape=(3, 2, 1))
dataset = tf.data.Dataset.from_tensor_slices((batched_features, batched_labels))
for element in dataset.as_numpy_iterator():
    print(element)

[2013 2013] 12000
[2014 2014] 14000
[2015 2015] 15000
[2016 2016] 16500
[2017 2017] 17500
[{'a': (1, 3), 'b': 5}, {'a': (2, 4), 'b': 6}]
(1, 3)
6
(array([[1, 3],
       [2, 3]]), array([[b'A'],
       [b'A']], dtype=object))
(array([[2, 1],
       [1, 2]]), array([[b'B'],
       [b'B']], dtype=object))
(array([[3, 3],
       [3, 2]]), array([[b'A'],
       [b'B']], dtype=object))


In [114]:
dataset = tf.data.TextLineDataset(["tfDatasetFile/file1.txt", "tfDatasetFile/file2.txt"])
for element in dataset:
    print(element)
print('------------------------')

dataset = tf.data.TFRecordDataset(["tfDatasetFile/file1.tfrecords", "tfDatasetFile/file2.tfrecords"])
for element in dataset:
    print(element)
print('------------------------')
   
    
dataset = tf.data.Dataset.list_files("tfDatasetFile/*.txt")  # doctest: +SKIP
for element in dataset:
    print(element)
print('------------------------')

tf.Tensor(b'file1\t1', shape=(), dtype=string)
tf.Tensor(b'file1\t2', shape=(), dtype=string)
------------------------
------------------------
tf.Tensor(b'tfDatasetFile\\file1.txt', shape=(), dtype=string)
tf.Tensor(b'tfDatasetFile\\file2.txt', shape=(), dtype=string)
------------------------


In [115]:
# flat_map(
#     map_func
# )
dataset = tf.data.Dataset.from_tensor_slices([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
list(dataset.as_numpy_iterator())
dataset = dataset.flat_map(lambda x: tf.data.Dataset.from_tensor_slices(x))
list(dataset.as_numpy_iterator())

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [116]:
# map(
#     map_func, num_parallel_calls=None, deterministic=None
# )
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
print(dataset.element_spec)
dataset = dataset.map(lambda x: x*2)
print(list(dataset.as_numpy_iterator()))

# Each element is a tuple containing two `tf.Tensor` objects.
elements = [(1, "foo"), (2, "bar"), (3, "baz")]
dataset = tf.data.Dataset.from_generator(
    lambda: elements, (tf.int32, tf.string))
# `map_func` takes two arguments of type `tf.Tensor`. This function
# projects out just the first component.
result = dataset.map(lambda x_int, y_str: x_int)
print(list(result.as_numpy_iterator()))

# Each element is a dictionary mapping strings to `tf.Tensor` objects.
elements =  ([{"a": 1, "b": "foo"},
              {"a": 2, "b": "bar"},
              {"a": 3, "b": "baz"}])
dataset = tf.data.Dataset.from_generator(
    lambda: elements, {"a": tf.int32, "b": tf.string})
# `map_func` takes a single argument of type `dict` with the same keys
# as the elements.
result = dataset.map(lambda d: str(d["a"]) + d["b"])
print(list(result.as_numpy_iterator()))


# Note that tf.py_function accepts tf.Tensor 
# whereas tf.numpy_function accepts numpy arrays and returns only numpy arrays
d = tf.data.Dataset.from_tensor_slices(['hello', 'world'])
# transform a string tensor to upper case string using a Python function
def upper_case_fn(t: tf.Tensor):
    return t.numpy().decode('utf-8').upper()
d = d.map(lambda x: tf.py_function(func=upper_case_fn, inp=[x], Tout=tf.string))
print(list(d.as_numpy_iterator()))

d = tf.data.Dataset.from_tensor_slices(['hello', 'world'])
def upper_case_fn(t: np.ndarray):
    return t.decode('utf-8').upper()
d = d.map(lambda x: tf.numpy_function(func=upper_case_fn,inp=[x], Tout=tf.string))
print(list(d.as_numpy_iterator()))


TensorSpec(shape=(), dtype=tf.int32, name=None)
[2, 4, 6]
[1, 2, 3]
[b'Tensor("args_0:0", dtype=int32)foo', b'Tensor("args_0:0", dtype=int32)bar', b'Tensor("args_0:0", dtype=int32)baz']
[b'HELLO', b'WORLD']
[b'HELLO', b'WORLD']


In [117]:
# apply(
#     transformation_func
# ) 
# transformation_func	A function that takes one Dataset argument and returns a Dataset.
dataset = tf.data.Dataset.range(100)
def dataset_fn(ds):
    return ds.filter(lambda x: x < 5)
def dataset_tn(ds):
    print(list(ds.as_numpy_iterator()))
    return ds
dataset = dataset.apply(dataset_fn)
print(list(dataset.as_numpy_iterator()))
for element in dataset.as_numpy_iterator():
    print(element)

dataset = dataset.apply(dataset_tn)
print(list(dataset.as_numpy_iterator()))

[0, 1, 2, 3, 4]
0
1
2
3
4
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]


In [118]:
# batch(
#     batch_size, drop_remainder=False
# )
dataset = tf.data.Dataset.range(8)
dataset = dataset.batch(3, drop_remainder=True)
list(dataset.as_numpy_iterator())

[array([0, 1, 2], dtype=int64), array([3, 4, 5], dtype=int64)]

In [119]:
# concatenate(
#     dataset
# )

a = tf.data.Dataset.range(1, 4)  # ==> [ 1, 2, 3 ]
b = tf.data.Dataset.range(4, 8)  # ==> [ 4, 5, 6, 7 ]
ds = a.concatenate(b)
print(list(ds.as_numpy_iterator()))

# The input dataset and dataset to be concatenated should have the same
# nested structures and output types.
c = tf.data.Dataset.zip((a, b))
print(list(c.as_numpy_iterator()))
# a.concatenate(c)   Two datasets to concatenate have different types <dtype: 'int64'> and (tf.int64, tf.int64)


d = tf.data.Dataset.from_tensor_slices(["a", "b", "c"])
# a.concatenate(d) # Two datasets to concatenate have different types <dtype: 'int64'> and <dtype: 'string'>

[1, 2, 3, 4, 5, 6, 7]
[(1, 4), (2, 5), (3, 6)]


In [120]:
# filter(
#     predicate
# )
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
dataset = dataset.filter(lambda x: x < 3)
print(list(dataset.as_numpy_iterator()))

# `tf.math.equal(x, y)` is required for equality comparison
def filter_fn(x):
    return tf.math.equal(x, 1)
dataset = dataset.filter(filter_fn)
print(list(dataset.as_numpy_iterator()))

[1, 2]
[1]


In [121]:
# interleave(
#     map_func, cycle_length=None, block_length=None, num_parallel_calls=None,
#     deterministic=None
# )
# The cycle_length and block_length arguments control the order in which elements are produced. 
# cycle_length controls the number of input elements that are processed concurrently. 
# If you set cycle_length to 1, this transformation will handle one input element at a time, 
# and will produce identical results to tf.data.Dataset.flat_map.

dataset = tf.data.Dataset.range(1, 6)  # ==> [ 1, 2, 3, 4, 5 ]
# NOTE: New lines indicate "block" boundaries.
dataset = dataset.interleave(lambda x: tf.data.Dataset.from_tensors(x).repeat(4), cycle_length=2, block_length=4)
list(dataset.as_numpy_iterator())

[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5]

In [122]:
# padded_batch(
#     batch_size, padded_shapes=None, padding_values=None, drop_remainder=False
# )
A = (tf.data.Dataset.range(1, 5, output_type=tf.int32).map(lambda x: tf.fill([x], x)))
print(A.as_numpy_iterator)
print('-------------------------------')


# Pad to the smallest per-batch size that fits all elements.
B = A.padded_batch(2)
for element in B.as_numpy_iterator():
    print(element)
print('-------------------------------')

# Pad to a fixed size.
C = A.padded_batch(2, padded_shapes=5)
for element in C.as_numpy_iterator():
    print(element)
print('-------------------------------')
    
# Pad with a custom value.
D = A.padded_batch(2, padded_shapes=5, padding_values=-1)
for element in D.as_numpy_iterator():
    print(element)
print('-------------------------------')
    
# Components of nested elements can be padded independently.
elements = [([1, 2, 3], [10]),
            ([4, 5], [11, 12])]
datasetc = tf.data.Dataset.from_generator(lambda: iter(elements), (tf.int32, tf.int32))
print(list(datasetc.as_numpy_iterator()))
# Pad the first component of the tuple to length 4, and the second
# component to the smallest size that fits.
dataset = datasetc.padded_batch(2,
    padded_shapes=([4], [None]),
    padding_values=(-1, 100))
print(list(dataset.as_numpy_iterator()))

dataset = datasetc.padded_batch(2,
    padded_shapes=([4], [3]),
    padding_values=(-1, 100))
print(list(dataset.as_numpy_iterator()))
print('-------------------------------')

# Pad with a single value and multiple components.
E = tf.data.Dataset.zip((A, A)).padded_batch(2, padding_values=-1)
for element in E.as_numpy_iterator():
    print(element)


<bound method DatasetV2.as_numpy_iterator of <MapDataset shapes: (None,), types: tf.int32>>
-------------------------------
[[1 0]
 [2 2]]
[[3 3 3 0]
 [4 4 4 4]]
-------------------------------
[[1 0 0 0 0]
 [2 2 0 0 0]]
[[3 3 3 0 0]
 [4 4 4 4 0]]
-------------------------------
[[ 1 -1 -1 -1 -1]
 [ 2  2 -1 -1 -1]]
[[ 3  3  3 -1 -1]
 [ 4  4  4  4 -1]]
-------------------------------
[(array([1, 2, 3]), array([10])), (array([4, 5]), array([11, 12]))]
[(array([[ 1,  2,  3, -1],
       [ 4,  5, -1, -1]]), array([[ 10, 100],
       [ 11,  12]]))]
[(array([[ 1,  2,  3, -1],
       [ 4,  5, -1, -1]]), array([[ 10, 100, 100],
       [ 11,  12, 100]]))]
-------------------------------
(array([[ 1, -1],
       [ 2,  2]]), array([[ 1, -1],
       [ 2,  2]]))
(array([[ 3,  3,  3, -1],
       [ 4,  4,  4,  4]]), array([[ 3,  3,  3, -1],
       [ 4,  4,  4,  4]]))


In [123]:
# reduce(
#     initial_state, reduce_func
# )
# A reduce_func that maps (old_state, input_element) to new_state. 
# It must take two arguments and return a new element
# The structure of new_state must match the structure of initial_state.
print(tf.data.Dataset.range(5).reduce(np.int64(0), lambda x, _: x + 1).numpy())
print(tf.data.Dataset.range(5).reduce(np.int64(0), lambda x, y: x + y).numpy())

5
10


In [124]:
# shuffle(
#     buffer_size, seed=None, reshuffle_each_iteration=None
# )
# reshuffle_each_iteration controls whether the shuffle order should be different for each epoch
dataset = tf.data.Dataset.range(3)
dataset = dataset.shuffle(3, reshuffle_each_iteration=True)
dataset = dataset.repeat(2)  # doctest: +SKIP
print(list(dataset.as_numpy_iterator()))

dataset = tf.data.Dataset.range(3)
dataset = dataset.shuffle(3, reshuffle_each_iteration=False)
dataset = dataset.repeat(2)  # doctest: +SKIP
print(list(dataset.as_numpy_iterator()))

dataset = tf.data.Dataset.range(3)
dataset = dataset.shuffle(3, reshuffle_each_iteration=True)
print(list(dataset.as_numpy_iterator()))  # doctest: +SKIP
print(list(dataset.as_numpy_iterator()))  # doctest: +SKIP

dataset = tf.data.Dataset.range(3)
dataset = dataset.shuffle(3, reshuffle_each_iteration=False)
print(list(dataset.as_numpy_iterator()))  # doctest: +SKIP
print(list(dataset.as_numpy_iterator()))  # doctest: +SKIP

[2, 0, 1, 0, 1, 2]
[1, 2, 0, 1, 2, 0]
[2, 1, 0]
[2, 0, 1]
[2, 1, 0]
[2, 1, 0]


In [125]:
# skip(
#     count
# )
# Creates a Dataset that skips count elements from this dataset.
dataset = tf.data.Dataset.range(10)
dataset = dataset.skip(7)
print(list(dataset.as_numpy_iterator()))

# take(
#     count
# )
# Creates a Dataset with at most count elements from this dataset.
dataset = tf.data.Dataset.range(10)
dataset = dataset.take(3)
print(list(dataset.as_numpy_iterator()))

[7, 8, 9]
[0, 1, 2]


In [126]:
# unbatch() Splits elements of a dataset into multiple elements.
elements = [ [1, 2, 3], [1, 2], [1, 2, 3, 4] ]
dataset = tf.data.Dataset.from_generator(lambda: elements, tf.int64)
print(list(dataset.as_numpy_iterator()))
dataset = dataset.unbatch()
print(list(dataset.as_numpy_iterator()))

[array([1, 2, 3], dtype=int64), array([1, 2], dtype=int64), array([1, 2, 3, 4], dtype=int64)]
[1, 2, 3, 1, 2, 1, 2, 3, 4]


In [127]:
# window(
#     size, shift=None, stride=1, drop_remainder=False
# )
dataset = tf.data.Dataset.range(7).window(2)
for window in dataset:
    print(list(window.as_numpy_iterator()))

dataset = tf.data.Dataset.range(7).window(3, 2, 1, True)
for window in dataset:
    print(list(window.as_numpy_iterator()))
print('-------------------------------')
    
dataset = tf.data.Dataset.range(7).window(3, 1, 2, True)
for window in dataset:
    print(list(window.as_numpy_iterator()))
print('-------------------------------')
    
nested = ([1, 2, 3, 4], [5, 6, 7, 8])
dataset = tf.data.Dataset.from_tensor_slices(nested).window(2)
for window in dataset:
    def to_numpy(ds):
        return list(ds.as_numpy_iterator())
    print(tuple(to_numpy(component) for component in window))
print('-------------------------------')
    
dataset = tf.data.Dataset.from_tensor_slices({'a': [1, 2, 3, 4]})
dataset = dataset.window(2)
for window in dataset:
    def to_numpy(ds):
        return list(ds.as_numpy_iterator())
    print({'a': to_numpy(window['a'])})

[0, 1]
[2, 3]
[4, 5]
[6]
[0, 1, 2]
[2, 3, 4]
[4, 5, 6]
-------------------------------
[0, 2, 4]
[1, 3, 5]
[2, 4, 6]
-------------------------------
([1, 2], [5, 6])
([3, 4], [7, 8])
-------------------------------
{'a': [1, 2]}
{'a': [3, 4]}
