In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [3]:
# Example training data, of dtype `string`.
training_data = np.array([["This is the 1st sample."], ["And here's the 2nd sample."]])
# Create a TextVectorization layer instance. It can be configured to either
# return integer token indices, or a dense token representation (e.g. multi-hot
# or TF-IDF). The text standardization and text splitting algorithms are fully
# configurable.
vectorizer = TextVectorization(output_mode="int")


In [4]:
# Example training data, of dtype `string`.
training_data = np.array([["This is the 1st sample."], ["And here's the 2nd sample."]])
# Calling `adapt` on an array or dataset makes the layer generate a vocabulary
# index for the data, which can then be reused when seeing new data.
vectorizer.adapt(training_data)

# After calling adapt, the layer is able to encode any n-gram it has seen before
# in the `adapt()` data. Unknown n-grams are encoded via an "out-of-vocabulary"
# token.
integer_data = vectorizer(training_data)
print(integer_data)

tf.Tensor(
[[4 5 2 9 3]
 [7 6 2 8 3]], shape=(2, 5), dtype=int64)


In [5]:
# Create a TextVectorization layer instance. It can be configured to either
# return integer token indices, or a dense token representation (e.g. multi-hot
# or TF-IDF). The text standardization and text splitting algorithms are fully
# configurable.
vectorizer = TextVectorization(output_mode="binary", ngrams=2)

# Calling `adapt` on an array or dataset makes the layer generate a vocabulary
# index for the data, which can then be reused when seeing new data.
vectorizer.adapt(training_data)

# After calling adapt, the layer is able to encode any n-gram it has seen before
# in the `adapt()` data. Unknown n-grams are encoded via an "out-of-vocabulary"
# token.
integer_data = vectorizer(training_data)
print(integer_data)


tf.Tensor(
[[0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0.]], shape=(2, 17), dtype=float32)


In [6]:
len(vectorizer.get_vocabulary())

17

In [7]:
# Get the data as Numpy arrays
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()


In [8]:
# Build a simple model
inputs = keras.Input(shape=(28, 28))
x = layers.experimental.preprocessing.Rescaling(1.0 / 255)(inputs)
x = layers.Flatten()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dense(128, activation="relu")(x)
outputs = layers.Dense(10, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.summary()

# Compile the model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 28, 28)]          0         
_________________________________________________________________
rescaling (Rescaling)        (None, 28, 28)            0         
_________________________________________________________________
flatten (Flatten)            (None, 784)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               100480    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1290      
Total params: 118,282
Trainable params: 118,282
Non-trainable params: 0
_______________________________________________________

In [9]:
# Train the model for 1 epoch from Numpy data
batch_size = 64
print("Fit on NumPy data")
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=1)

# Train the model for 1 epoch using a dataset
dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size)

print("Fit on Dataset")
history = model.fit(dataset, epochs=1)

Fit on NumPy data
Fit on Dataset


In [10]:
history.history

{'loss': [0.11334305256605148]}

In [11]:
# You can pass a list of metric objects to compile(), like this:
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],
)
history = model.fit(dataset, epochs=1)




In [12]:
history.history

{'loss': [0.08001427352428436], 'acc': [0.975766658782959]}

In [13]:
# You can pass validation data to fit() to monitor your validation loss & validation metrics.
#  Validation metrics get reported at the end of each epoch.
val_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size)
history = model.fit(dataset, validation_data=val_dataset,validation_steps=1)



## Callback

In [14]:
callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath='path/to/my/model_{epoch}',
        save_freq='epoch')
]
model.fit(dataset, epochs=2, callbacks=callbacks)


Epoch 1/2
INFO:tensorflow:Assets written to: path/to/my\model_1\assets
Epoch 2/2
INFO:tensorflow:Assets written to: path/to/my\model_2\assets


<tensorflow.python.keras.callbacks.History at 0x2425b622508>

In [18]:
class CustomModel(keras.Model):
  def train_step(self, data):
    # Unpack the data. Its structure depends on your model and
    # on what you pass to `fit()`.
    x, y = data
    with tf.GradientTape() as tape:
      y_pred = self(x, training=True)  # Forward pass
      # Compute the loss value
      # (the loss function is configured in `compile()`)
      loss = self.compiled_loss(y, y_pred,
                                regularization_losses=self.losses)
    # Compute gradients
    trainable_vars = self.trainable_variables
    gradients = tape.gradient(loss, trainable_vars)
    # Update weights
    self.optimizer.apply_gradients(zip(gradients, trainable_vars))
    # Update metrics (includes the metric that tracks the loss)
    self.compiled_metrics.update_state(y, y_pred)
    # Return a dict mapping metric names to current value
    return {m.name: m.result() for m in self.metrics}

# Construct and compile an instance of CustomModel
inputs = keras.Input(shape=(28,28))
outputs = keras.layers.Dense(10)(inputs)
model = CustomModel(inputs, outputs)
model.compile(optimizer='adam', loss='mse', metrics=[...])

# Just use `fit` as usual
model.fit(dataset, epochs=3)


Epoch 1/3


ValueError: in user code:

    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:805 train_function  *
        return step_function(self, iterator)
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:788 run_step  **
        outputs = model.train_step(data)
    C:\conda_tmp\ipykernel_17744\3219816401.py:18 train_step
        self.compiled_metrics.update_state(y, y_pred)
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:387 update_state
        self.build(y_pred, y_true)
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:318 build
        self._metrics, y_true, y_pred)
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\util\nest.py:1163 map_structure_up_to
        **kwargs)
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\util\nest.py:1258 map_structure_with_tuple_paths_up_to
        func(*args, **kwargs) for args in zip(flat_path_gen, *flat_value_gen)
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\util\nest.py:1258 <listcomp>
        func(*args, **kwargs) for args in zip(flat_path_gen, *flat_value_gen)
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\util\nest.py:1161 <lambda>
        lambda _, *values: func(*values),  # Discards the path arg.
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:418 _get_metric_objects
        return [self._get_metric_object(m, y_t, y_p) for m in metrics]
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:418 <listcomp>
        return [self._get_metric_object(m, y_t, y_p) for m in metrics]
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:437 _get_metric_object
        metric_obj = metrics_mod.get(metric)
    d:\anaconda\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\metrics.py:3495 get
        'Could not interpret metric function identifier: {}'.format(identifier))

    ValueError: Could not interpret metric function identifier: Ellipsis


To do asynchronous preprocessing, simply use dataset.map to inject a preprocessing operation into your data pipeline:

In [20]:
# Example training data, of dtype `string`.
samples = np.array([["This is the 1st sample."], ["And here's the 2nd sample."]])
labels = [[0], [1]]

# Prepare a TextVectorization layer.
vectorizer = TextVectorization(output_mode="int")
vectorizer.adapt(samples)

# Asynchronous preprocessing: the text vectorization is part of the tf.data pipeline.
# First, create a dataset
dataset = tf.data.Dataset.from_tensor_slices((samples, labels)).batch(2)
# Apply text vectorization to the samples
dataset = dataset.map(lambda x, y: (vectorizer(x), y))
# Prefetch with a buffer size of 2 batches
dataset = dataset.prefetch(2)

# Our model should expect sequences of integers as inputs
inputs = keras.Input(shape=(None,), dtype="int64")
x = layers.Embedding(input_dim=10, output_dim=32)(inputs)
outputs = layers.Dense(1)(x)
model = keras.Model(inputs, outputs)

model.compile(optimizer="adam", loss="mse", run_eagerly=True)
model.fit(dataset)




<tensorflow.python.keras.callbacks.History at 0x24264fa9d88>