In [3]:
import tensorflow as tf

# Small example corpus; the layer's vocabulary is learned from these sentences.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

# Create a text-vectorization layer with default options and fit it to the corpus.
vectorize_layer = tf.keras.layers.TextVectorization()
vectorize_layer.adapt(sentences)

# List of tokens; a token's position in the list is its integer id.
vocabulary = vectorize_layer.get_vocabulary()

# Print one "id token" pair per line.
for index, word in enumerate(vocabulary):
    print(f"{index} {word}")

0 
1 [UNK]
2 my
3 love
4 dog
5 you
6 i
7 think
8 is
9 do
10 cat
11 amazing


In [4]:
# A single raw string to run through the layer adapted in an earlier cell.
sample_input = "I love my dog"

# Calling the layer on one string produces a 1-D tensor of token ids
# (the recorded result has four ids, one per word of the input).
sequence = vectorize_layer(sample_input)

# Bare last expression so the notebook displays the tensor.
sequence

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([6, 3, 2, 4])>

In [13]:
# Wrap the list of sentences in a tf.data.Dataset, one string per element.
sequences_dataset = tf.data.Dataset.from_tensor_slices(sentences)

# Apply the layer to each element separately; in the recorded output every
# sequence keeps its own length, i.e. nothing is padded here.
sequences = sequences_dataset.map(vectorize_layer)

# Show each original sentence next to its token-id sequence.
# NOTE: `sequence` is a notebook-global name and is rebound by this loop.
for sentence, sequence in zip(sentences, sequences):
    print(f'{sentence} ----> {sequence}')

I love my dog ----> [6 3 2 4]
I love my cat ----> [ 6  3  2 10]
You love my dog! ----> [5 3 2 4]
Do you think my dog is amazing? ----> [ 9  5  7  2  4  8 11]


In [14]:
# Hand the whole list to the layer in one call. In the recorded run this gives
# a single dense tensor whose shorter rows are filled with trailing zeros.
sequences_post = vectorize_layer(sentences)

# sep="\n" places each argument on its own line; the empty string produces the
# blank line that separates the two parts of the report.
print("INPUT:", sentences, "", sep="\n")
print("OUTPUT:", sequences_post, sep="\n")

INPUT:
['I love my dog', 'I love my cat', 'You love my dog!', 'Do you think my dog is amazing?']

OUTPUT:
tf.Tensor(
[[ 6  3  2  4  0  0  0]
 [ 6  3  2 10  0  0  0]
 [ 5  3  2  4  0  0  0]
 [ 9  5  7  2  4  8 11]], shape=(4, 7), dtype=int64)


In [11]:
# Pad at the front ("pre"); in the recorded output every row ends up as long
# as the longest sequence, with zeros filling the leading positions.
sequences_pre = tf.keras.utils.pad_sequences(sequences, padding="pre")

# Unpadded sequences, one per line, converted to NumPy for a compact printout.
print("INPUT:")
for sequence in sequences:
    print(sequence.numpy())

# Leading empty string emits the blank separator line before the result.
print("", "OUTPUT:", sequences_pre, sep="\n")

INPUT:
[6 3 2 4]
[ 6  3  2 10]
[5 3 2 4]
[ 9  5  7  2  4  8 11]

OUTPUT:
[[ 0  0  0  6  3  2  4]
 [ 0  0  0  6  3  2 10]
 [ 0  0  0  5  3  2  4]
 [ 9  5  7  2  4  8 11]]


2025-02-01 12:02:00.705170: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-02-01 12:02:00.718709: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-02-01 12:02:00.730851: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [16]:
# Pad/truncate every sequence to exactly five ids.
# NOTE(review): despite the "post_trunc" in the variable name, this call pads
# at the front (padding="pre") and also drops ids from the front of sequences
# that are too long. truncating="pre" is the Keras default and is spelled out
# here only to make that visible. Confirm which behaviour was intended.
sequences_post_trunc = tf.keras.utils.pad_sequences(
    sequences,
    maxlen=5,
    padding="pre",
    truncating="pre",
)

# Unpadded sequences, one per line, converted to NumPy for a compact printout.
print("INPUT:")
for sequence in sequences:
    print(sequence.numpy())

# Leading empty string emits the blank separator line before the result.
print("", "OUTPUT:", sequences_post_trunc, sep="\n")

INPUT:
[6 3 2 4]
[ 6  3  2 10]
[5 3 2 4]
[ 9  5  7  2  4  8 11]

OUTPUT:
[[ 0  6  3  2  4]
 [ 0  6  3  2 10]
 [ 0  5  3  2  4]
 [ 7  2  4  8 11]]


2025-02-01 12:06:52.188185: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-02-01 12:06:52.201772: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-02-01 12:06:52.213152: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [17]:
# Build a new layer with ragged=True; in the recorded output a batched call
# then returns a tf.RaggedTensor in which each row keeps its own length.
# NOTE(review): this rebinds `vectorize_layer`, a name other cells also call.
# Re-running those cells after this one would presumably use the ragged layer
# instead - verify, or consider a distinct name for this layer.
vectorize_layer = tf.keras.layers.TextVectorization(ragged=True)

# Learn the vocabulary from the same sentences for the new layer instance.
vectorize_layer.adapt(sentences)

# Vectorize the whole list in one call.
ragged_sequences = vectorize_layer(sentences)

print(ragged_sequences)

<tf.RaggedTensor [[6, 3, 2, 4], [6, 3, 2, 10], [5, 3, 2, 4], [9, 5, 7, 2, 4, 8, 11]]>


In [20]:
# Convert the ragged tensor with .numpy() and pad it. No `padding` argument is
# given; in the displayed result the zeros sit at the front of shorter rows.
# NOTE(review): `sequences_pre` is also assigned by another cell in this
# notebook; running this cell overwrites that value.
sequences_pre = tf.keras.utils.pad_sequences(ragged_sequences.numpy())

# Bare last expression so the notebook displays the padded array.
sequences_pre

array([[ 0,  0,  0,  6,  3,  2,  4],
       [ 0,  0,  0,  6,  3,  2, 10],
       [ 0,  0,  0,  5,  3,  2,  4],
       [ 9,  5,  7,  2,  4,  8, 11]], dtype=int32)

In [21]:
# Same conversion as for pre-padding, but with padding="post": in the displayed
# result the zeros are appended after the ids of shorter rows.
# NOTE(review): `sequences_post` is also assigned by another cell, where it is
# the tensor returned by the layer; here it becomes a NumPy int32 array.
sequences_post = tf.keras.utils.pad_sequences(ragged_sequences.numpy(), padding="post")

# Bare last expression so the notebook displays the padded array.
sequences_post

array([[ 6,  3,  2,  4,  0,  0,  0],
       [ 6,  3,  2, 10,  0,  0,  0],
       [ 5,  3,  2,  4,  0,  0,  0],
       [ 9,  5,  7,  2,  4,  8, 11]], dtype=int32)

In [22]:
# Sentences containing words that were not in the corpus the layer was adapted
# on ("really", "loves", "manatee").
sentences_with_oov = [
    "i really love my dog",
    "my dog loves my manatee"
]

# Vectorize with the existing vocabulary; in the recorded output the unseen
# words come out as id 1, the id listed for the [UNK] token.
sequences_with_oov = vectorize_layer(sentences_with_oov)

# Show each sentence next to its token-id sequence.
# NOTE: `sentence` and `sequence` are notebook-global names rebound by this loop.
for sentence, sequence in zip(sentences_with_oov, sequences_with_oov):
    print(f"{sentence} ----> {sequence}")

i really love my dog ----> [6 1 3 2 4]
my dog loves my manatee ----> [2 4 1 2 1]
