In [1]:
import tensorflow as tf

# Toy corpus reused by the cells below: four short sentences with mixed
# casing and punctuation.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]



In [2]:
from tensorflow.keras.layers import TextVectorization

# Build a text-vectorization layer with default settings and let it learn its
# vocabulary from the example sentences.
vectorize_layer = TextVectorization()
vectorize_layer.adapt(sentences)

# The learned vocabulary as a list; a token's position in this list is the
# integer the layer emits for that token.
vocabulary = vectorize_layer.get_vocabulary()

# Show every (index, token) pair, one per line.
for entry in enumerate(vocabulary):
    print(*entry)

0 
1 [UNK]
2 my
3 love
4 dog
5 you
6 i
7 think
8 is
9 do
10 cat
11 amazing


In [2]:
from tensorflow.data import Dataset

# Wrap the list of strings in a tf.data pipeline so the layer can be applied
# to one sentence at a time.
sentences_dataset = Dataset.from_tensor_slices(sentences)

# Each element is vectorized on its own, so every resulting sequence keeps the
# length of its sentence (no padding happens at this stage).
sequences = sentences_dataset.map(vectorize_layer)

# Show each original sentence next to its integer sequence.
for sentence, sequence in zip(sentences, sequences):
    print(f'{sentence} ---> {sequence}')



NameError: name 'sentences' is not defined

In [5]:
from tensorflow.keras.utils import pad_sequences

# Calling the layer on the whole list at once yields one dense tensor:
# shorter sentences are filled with trailing zeros up to the longest one.
sequences_post = vectorize_layer(sentences)

# Bare expression so the notebook displays the tensor.
sequences_post

<tf.Tensor: shape=(4, 7), dtype=int64, numpy=
array([[ 6,  3,  2,  4,  0,  0,  0],
       [ 6,  3,  2, 10,  0,  0,  0],
       [ 5,  3,  2,  4,  0,  0,  0],
       [ 9,  5,  7,  2,  4,  8, 11]])>

In [6]:
# Left-pad ("pre") every variable-length sequence with zeros so that all rows
# end up as long as the longest sequence.
sequences_pre = pad_sequences(sequences, padding="pre")

# Print the results
print('INPUT:')
# Plain loop rather than a comprehension: printing is a side effect, so there
# is no list worth building.
for sequence in sequences:
    print(sequence.numpy())
print()

print('OUTPUT:')
print(sequences_pre)

INPUT:
[6 3 2 4]
[ 6  3  2 10]
[5 3 2 4]
[ 9  5  7  2  4  8 11]

OUTPUT:
[[ 0  0  0  6  3  2  4]
 [ 0  0  0  6  3  2 10]
 [ 0  0  0  5  3  2  4]
 [ 9  5  7  2  4  8 11]]


2024-12-31 17:31:18.519490: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-12-31 17:31:18.535299: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-12-31 17:31:18.547659: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [8]:
# Right-pad ("post") every variable-length sequence with zeros so that all
# rows end up as long as the longest sequence.
sequences_post = pad_sequences(sequences, padding="post")

# Print the results
print('INPUT:')
# Plain loop rather than a comprehension: printing is a side effect, so there
# is no list worth building.
for sequence in sequences:
    print(sequence.numpy())
print()

print('OUTPUT:')
print(sequences_post)

INPUT:
[6 3 2 4]
[ 6  3  2 10]
[5 3 2 4]
[ 9  5  7  2  4  8 11]

OUTPUT:
[[ 6  3  2  4  0  0  0]
 [ 6  3  2 10  0  0  0]
 [ 5  3  2  4  0  0  0]
 [ 9  5  7  2  4  8 11]]


2024-12-31 17:31:33.260797: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-12-31 17:31:33.279064: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-12-31 17:31:33.300756: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
# Recreate the layer with ragged=True. The layer is then expected to return
# variable-length rows instead of a zero-padded tensor (the following cells
# pad the result explicitly).
# NOTE: this rebinds `vectorize_layer`, so any cell executed after this one
# uses the ragged layer rather than the one created earlier.
vectorize_layer = TextVectorization(ragged=True)

# Learn the vocabulary for the new layer instance.
vectorize_layer.adapt(sentences)

# Vectorize the whole list in one call.
ragged_sequences = vectorize_layer(sentences)

# Bare expression so the notebook displays the result.
ragged_sequences

NameError: name 'TextVectorization' is not defined

In [11]:
# Convert the ragged result to NumPy and left-pad ("pre") each row with zeros
# to a common length.
sequences_pre = pad_sequences(ragged_sequences.numpy(), padding="pre")
sequences_pre

array([[ 0,  0,  0,  6,  3,  2,  4],
       [ 0,  0,  0,  6,  3,  2, 10],
       [ 0,  0,  0,  5,  3,  2,  4],
       [ 9,  5,  7,  2,  4,  8, 11]], dtype=int32)

In [12]:
# Convert the ragged result to NumPy and right-pad ("post") each row with
# zeros to a common length.
sequences_post = pad_sequences(ragged_sequences.numpy(), padding="post")

sequences_post

array([[ 6,  3,  2,  4,  0,  0,  0],
       [ 6,  3,  2, 10,  0,  0,  0],
       [ 5,  3,  2,  4,  0,  0,  0],
       [ 9,  5,  7,  2,  4,  8, 11]], dtype=int32)

In [13]:
# Sentences that include words missing from the learned vocabulary
sentences_with_oov = [
    'i really love my dog',
    'my dog loves my manatee',
]

# Vectorize them with the layer that has already been adapted
sequences_with_oov = vectorize_layer(sentences_with_oov)

# Show each sentence beside its integer sequence; unknown words come out as
# index 1, the [UNK] entry of the vocabulary
for sentence, sequence in zip(sentences_with_oov, sequences_with_oov):
    print(f'{sentence} ---> {sequence}')

i really love my dog ---> [6 1 3 2 4]
my dog loves my manatee ---> [2 4 1 2 1]
