In [38]:
import tensorflow as tf
import tensorflow.keras as keras

# Display the installed TensorFlow version so readers know which API the
# rest of the notebook was run against.
tf.__version__

'2.0.0'

In [5]:
# The integers 0..9 as a 1-D tensor.
X = tf.range(0, 10)

# Build a dataset held entirely in memory; it yields one scalar element
# per entry of X.
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [6]:
# A dataset is iterable: each step yields one element as a tensor.
for element in dataset:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


**Chaining transformations**

In [8]:
# repeat(count=2) returns a new dataset that walks through the original
# elements twice; the original dataset itself is left untouched.
for element in dataset.repeat(count=2):
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [11]:
# Chain transformations: repeat twice, then group the elements six at a time.
# The last batch is shorter when the element count is not a multiple of six.
for batch in dataset.repeat(count=2).batch(batch_size=6):
    print(batch)

tf.Tensor([0 1 2 3 4 5], shape=(6,), dtype=int32)
tf.Tensor([6 7 8 9 0 1], shape=(6,), dtype=int32)
tf.Tensor([2 3 4 5 6 7], shape=(6,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [19]:
# batch
# drop_remainder=True discards the trailing batch when it holds fewer than
# six elements, so every batch has the same shape.
batches = dataset.repeat(count=2).batch(batch_size=6, drop_remainder=True)

for batch in batches:
    print(batch)

tf.Tensor([0 1 2 3 4 5], shape=(6,), dtype=int32)
tf.Tensor([6 7 8 9 0 1], shape=(6,), dtype=int32)
tf.Tensor([2 3 4 5 6 7], shape=(6,), dtype=int32)


In [20]:
# unbatch
# Dataset.unbatch() splits every batch back into its individual elements.
# It is the core-API replacement for tf.data.experimental.unbatch(), which
# is deprecated as of TensorFlow 2.0.
for item in batches.unbatch():
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)


**The TFRecord format**

In [26]:
# Path of the TFRecord file to write (".tfrecord" extension; the previous
# value "data.ftrecord" was a typo).
file_name = 'data.tfrecord'

# Write the file GZIP-compressed; the cell that reads it back passes the
# same compression type.
options = tf.io.TFRecordOptions(compression_type='GZIP')

# The context manager closes the writer when the block exits. Each write()
# call appends one record, given here as raw bytes.
with tf.io.TFRecordWriter(file_name, options) as f:
    f.write(b'First record')
    f.write(b'Second record')

In [29]:
# TFRecordDataset takes a list of file paths; here only the file written above.
filepaths = [file_name]

# NOTE: this rebinds `dataset`, which earlier cells bound to the integer
# range dataset. Re-run those cells before re-running the earlier examples.
dataset = tf.data.TFRecordDataset(filenames=filepaths, compression_type='GZIP')

# Each element is one record, yielded as a scalar string tensor.
for record in dataset:
    print(record)

tf.Tensor(b'First record', shape=(), dtype=string)
tf.Tensor(b'Second record', shape=(), dtype=string)


**Word embeddings**

In [34]:
vocab = ['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND']

# Each category is mapped to its position in `vocab`, stored as an int64 id.
indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(keys=vocab, values=indices)

# OOV = out of vocabulary.
# When a looked-up category does not exist in the vocabulary, the lookup
# table computes a hash of that category and uses it to assign the unknown
# category to one of the OOV buckets.
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(
    initializer=table_init,
    num_oov_buckets=num_oov_buckets,
)

In [35]:
# 'DESERT' is not part of the vocabulary, so the table assigns it to an
# OOV bucket instead of one of the ids 0..4.
categories = tf.constant(['NEAR BAY', 'DESERT', 'INLAND', 'INLAND'])
cat_indices = table.lookup(keys=categories)
cat_indices

<tf.Tensor: id=360, shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1])>

In [36]:
# One-hot depth = number of vocabulary entries plus the OOV buckets, so
# every id the lookup table can produce gets its own column.
cat_one_hot = tf.one_hot(
    indices=cat_indices,
    depth=num_oov_buckets + len(vocab),
)
cat_one_hot

<tf.Tensor: id=366, shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>

In [31]:
# Size of each embedding vector.
embeding_dim = 2

# Initialize the 2D vectors randomly; they are meant to be improved gradually.
# One row per category id (vocabulary entries plus OOV buckets).
embed_init = tf.random.uniform(
    shape=(num_oov_buckets + len(vocab), embeding_dim)
)

# Wrapped in a tf.Variable so the values can be updated later.
embedding_matrix = tf.Variable(initial_value=embed_init)
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.7127887 , 0.44220078],
       [0.6015724 , 0.47879028],
       [0.11142731, 0.7646463 ],
       [0.14125073, 0.09847701],
       [0.02038884, 0.5837337 ],
       [0.4999746 , 0.74209213],
       [0.37292337, 0.7908884 ]], dtype=float32)>

In [37]:
# Pick the row of the embedding matrix that corresponds to each category id.
tf.nn.embedding_lookup(params=embedding_matrix, ids=cat_indices)

<tf.Tensor: id=369, shape=(4, 2), dtype=float32, numpy=
array([[0.14125073, 0.09847701],
       [0.4999746 , 0.74209213],
       [0.6015724 , 0.47879028],
       [0.6015724 , 0.47879028]], dtype=float32)>

In [40]:
# Keras layer version of the manual lookup: it creates its own randomly
# initialized embedding matrix with one row per possible category id.
embedding = keras.layers.Embedding(
    input_dim=num_oov_buckets + len(vocab),
    output_dim=embeding_dim,
)

# Calling the layer on the ids returns the matching embedding vectors.
embedding(cat_indices)

<tf.Tensor: id=389, shape=(4, 2), dtype=float32, numpy=
array([[-0.03567611,  0.00554707],
       [ 0.03444651,  0.04278381],
       [-0.02629913,  0.03428841],
       [-0.02629913,  0.03428841]], dtype=float32)>