<a href="https://colab.research.google.com/github/reitezuz/18NES2-2025/blob/main/week_10/text_processing_layers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TextVectorization layer
https://keras.io/api/layers/preprocessing_layers/text/text_vectorization/

In [None]:
import os
# Select JAX as the Keras backend. Keras reads this environment variable when
# it is first imported, so this cell has to run before any `import keras`.
os.environ["KERAS_BACKEND"] = "jax"

In [1]:
# Text vectorization with default standardization and split
from keras.layers import TextVectorization

'''
keras.layers.TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation", # default standardization
    split="whitespace",                        # default split
    ngrams=None,                               # None, 2, 3,...
    output_mode="int",                         # "int", "multi_hot", "count" or "tf_idf"
    output_sequence_length=None,
    pad_to_max_tokens=False,
    vocabulary=None,
    idf_weights=None,
    sparse=False,
    ragged=False,
    encoding="utf-8",
    name=None,
    **kwargs
)
'''

# Build the layer with the default pipeline spelled out explicitly:
# lowercase + strip punctuation, split on whitespace, emit integer token indices.
# The vocabulary is still empty here; it is filled later by `adapt(...)`.
text_vectorization = TextVectorization(
    output_mode="int",  # "int", "multi_hot", "count" or "tf_idf"
    standardize="lower_and_strip_punctuation", # standardization
    split="whitespace", # tokenization
   )


In [None]:
# Text vectorization with custom standardization and custom split
import re
import string
import tensorflow as tf
from keras.layers import TextVectorization

def custom_standardization_fn(string_tensor):
    """Lowercase the input strings and delete every ASCII punctuation character.

    Args:
        string_tensor: string tensor (or anything `tf.strings` accepts) to clean.

    Returns:
        The lowercased input with all characters from `string.punctuation` removed.
    """
    # Character class matching any single punctuation mark.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    lowered = tf.strings.lower(string_tensor)
    stripped = tf.strings.regex_replace(lowered, punctuation_pattern, "")
    return stripped

def custom_split_fn(string_tensor):
    """Tokenize the input strings with `tf.strings.split` and its default separator.

    Args:
        string_tensor: string tensor to split into tokens.

    Returns:
        Whatever `tf.strings.split(string_tensor)` yields (whitespace-separated tokens).
    """
    tokens = tf.strings.split(string_tensor)
    return tokens

# Same integer-index layer as before, but with the standardization and
# tokenization steps supplied as the callables defined above instead of the
# built-in string options. This rebinds `text_vectorization`.
text_vectorization = TextVectorization(
    output_mode="int",
    standardize=custom_standardization_fn,
    split=custom_split_fn,
)

### Create the vocabulary:

In [2]:
# Tiny three-sentence corpus used to build the vocabulary in this notebook.
dataset = [
    "The weather today is surprisingly warm.",
    "Tomorrow will be much colder, according to the forecast.",
    "Warm days in winter are unusual, but not impossible.",
]
# Learn the vocabulary (token -> index mapping) from the corpus.
text_vectorization.adapt(dataset)


### Vocabulary:
- words are sorted by descending frequency; index 0 is reserved for padding (`''`) and index 1 for out-of-vocabulary words (`[UNK]`):

In [3]:
# Display the learned vocabulary; a token's position in the list is its integer index.
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 np.str_('warm'),
 np.str_('the'),
 np.str_('winter'),
 np.str_('will'),
 np.str_('weather'),
 np.str_('unusual'),
 np.str_('tomorrow'),
 np.str_('today'),
 np.str_('to'),
 np.str_('surprisingly'),
 np.str_('not'),
 np.str_('much'),
 np.str_('is'),
 np.str_('in'),
 np.str_('impossible'),
 np.str_('forecast'),
 np.str_('days'),
 np.str_('colder'),
 np.str_('but'),
 np.str_('be'),
 np.str_('are'),
 np.str_('according')]

### Vectorize text:
- OOV (out-of-vocabulary) words have index 1

In [4]:
# "it" and "raining" do not occur in `dataset`, so they are encoded as 1 ([UNK]).
test_sentence = "The weather today is warm, but it is raining. Unusual weather today."
# Calling the layer on a raw string returns the sequence of token indices.
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

tf.Tensor([ 3  6  9 14  2 20  1 14  1  7  6  9], shape=(12,), dtype=int64)


## Comparison of tokenization techniques

### Tokenize by words

In [6]:
import keras

# Download (and cache) the plain-text "Moby Dick" from Project Gutenberg;
# `get_file` returns the local path of the downloaded copy.
filename = keras.utils.get_file(
    origin="https://www.gutenberg.org/files/2701/old/moby10b.txt",
)
# Read the book as a list of lines. The context manager closes the file handle
# as soon as reading is done instead of leaving it open for the kernel's lifetime.
with open(filename, "r") as book_file:
    mobydick = book_file.readlines()

# word vocabulary:
text_vectorization = keras.layers.TextVectorization(
    output_mode="multi_hot",  # "int", "multi_hot", "count" or "tf_idf"
)
text_vectorization.adapt(mobydick)
vocabulary_mobydick = text_vectorization.get_vocabulary()

print("Vocabulary length:", len(vocabulary_mobydick))
print("Vocabulary start:", vocabulary_mobydick[:20])
print("Vocabulary end:", vocabulary_mobydick[-20:])
# NOTE: in "multi_hot" mode the layer returns one entry per vocabulary token,
# so this length equals the vocabulary size, not the number of words in
# `test_sentence` (which is defined in an earlier cell).
print("Processed Sentence length", len(text_vectorization(test_sentence)))



Vocabulary length: 20187
Vocabulary start: ['[UNK]', np.str_('the'), np.str_('of'), np.str_('and'), np.str_('a'), np.str_('to'), np.str_('in'), np.str_('that'), np.str_('his'), np.str_('it'), np.str_('i'), np.str_('but'), np.str_('he'), np.str_('is'), np.str_('as'), np.str_('with'), np.str_('was'), np.str_('for'), np.str_('all'), np.str_('this')]
Vocabulary end: [np.str_('115'), np.str_('114'), np.str_('113'), np.str_('112'), np.str_('111'), np.str_('110'), np.str_('11'), np.str_('109'), np.str_('108'), np.str_('107'), np.str_('106'), np.str_('105'), np.str_('10440'), np.str_('104'), np.str_('103'), np.str_('102'), np.str_('101'), np.str_('100000000'), np.str_('10000'), np.str_('100')]
Processed Sentence length 20187


### Tokenize by characters



In [8]:
# tokenize by characters
# Same multi-hot layer as the word-level example, but every single character
# (after the default lowercasing / punctuation stripping) becomes a token.
text_vectorization = TextVectorization(
    output_mode="multi_hot",  # "int", "multi_hot", "count" or "tf_idf"
    split="character",
)
# `mobydick` is the list of book lines loaded in the word-level cell.
text_vectorization.adapt(mobydick)
vocabulary = text_vectorization.get_vocabulary()

print("Vocabulary length:", len(vocabulary))
print("Vocabulary start:", vocabulary[:20])
print("Vocabulary end:", vocabulary[-20:])
# NOTE: in "multi_hot" mode the output has one entry per vocabulary token,
# so this length equals the vocabulary size, not the number of characters
# in `test_sentence`.
print("Processed Sentence length", len(text_vectorization(test_sentence)))

Vocabulary length: 39
Vocabulary start: ['[UNK]', np.str_(' '), np.str_('e'), np.str_('t'), np.str_('a'), np.str_('o'), np.str_('n'), np.str_('i'), np.str_('s'), np.str_('h'), np.str_('r'), np.str_('l'), np.str_('d'), np.str_('u'), np.str_('m'), np.str_('\n'), np.str_('c'), np.str_('w'), np.str_('f'), np.str_('g')]
Vocabulary end: [np.str_('g'), np.str_('p'), np.str_('y'), np.str_('b'), np.str_('v'), np.str_('k'), np.str_('q'), np.str_('x'), np.str_('j'), np.str_('z'), np.str_('0'), np.str_('1'), np.str_('2'), np.str_('8'), np.str_('3'), np.str_('7'), np.str_('5'), np.str_('9'), np.str_('4'), np.str_('6')]
Processed Sentence length 39


## Different vectorization modes

### Fixed-length sequences of indices (truncated / with padding)

In [None]:

'''
keras.layers.TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation", # default standardization
    split="whitespace",                        # default split, can be "character"
    ngrams=None,                               # None, 2, 3,...
    output_mode="int",                         # "int", "multi_hot", "count" or "tf_idf"
    output_sequence_length=None,
    pad_to_max_tokens=False,
    vocabulary=None,
    idf_weights=None,
    sparse=False,
    ragged=False,
    encoding="utf-8",
    name=None,
    **kwargs
)
'''
from keras import layers


max_length = 5    # Maximum length of each sequence (longer sequences will be truncated)
max_tokens = 6    # Number of (most important) tokens
# In "int" mode the 6 vocabulary slots include '' (padding, index 0) and
# '[UNK]' (index 1), which leaves room for only the 4 most frequent words.

text_vectorization_1 = layers.TextVectorization(
    max_tokens=max_tokens,
    split="whitespace",
    output_mode="int",                 # Convert text to sequences of integer indices
    output_sequence_length=max_length, # Ensure sequences have the given fixed length
)

# `dataset` is the three-sentence corpus defined in an earlier cell.
text_vectorization_1.adapt(dataset)
vocabulary = text_vectorization_1.get_vocabulary()
print(vocabulary, "\n", len(vocabulary))

# Longer than max_length: only the first 5 token indices are kept.
test_sentence = "The weather today is warm, but it is raining. Unusual weather today."
encoded_sentence = text_vectorization_1(test_sentence)
print(encoded_sentence)

# Shorter than max_length: the sequence is padded with 0 up to length 5.
test_sentence = "Nice weather."
encoded_sentence = text_vectorization_1(test_sentence)
print(encoded_sentence)


['', '[UNK]', np.str_('warm'), np.str_('the'), np.str_('winter'), np.str_('will')] 
 6
tf.Tensor([3 1 1 1 2], shape=(5,), dtype=int64)
tf.Tensor([1 1 0 0 0], shape=(5,), dtype=int64)


### Bag-of-Words

In [None]:
# multi_hot
rom keras import layers

max_tokens = 6    # Number of (most important) tokens

text_vectorization_1 = layers.TextVectorization(
    max_tokens=max_tokens,
    split="whitespace",
    output_mode="multi_hot",
)

text_vectorization_1.adapt(dataset)
vocabulary = text_vectorization_1.get_vocabulary()
print(vocabulary, "\n", len(vocabulary))

test_sentence = "The weather today is warm, but it is raining. Unusual weather today."
encoded_sentence = text_vectorization_1(test_sentence)
print(encoded_sentence)

test_sentence = "Nice weather."
encoded_sentence = text_vectorization_1(test_sentence)
print(encoded_sentence)

['[UNK]', np.str_('warm'), np.str_('the'), np.str_('winter'), np.str_('will'), np.str_('weather')] 
 6
tf.Tensor([1 1 1 0 0 1], shape=(6,), dtype=int64)
tf.Tensor([1 0 0 0 0 1], shape=(6,), dtype=int64)


In [None]:
# count
# Bag-of-words with frequencies: each slot holds how many times the
# corresponding vocabulary token occurs in the text (slot 0 counts [UNK] tokens).
from keras import layers

max_tokens = 6    # Number of (most important) tokens

text_vectorization_1 = layers.TextVectorization(
    max_tokens=max_tokens,
    split="whitespace",
    output_mode="count",
)

# `dataset` is the three-sentence corpus defined in an earlier cell.
text_vectorization_1.adapt(dataset)
vocabulary = text_vectorization_1.get_vocabulary()
print(vocabulary, "\n", len(vocabulary))

test_sentence = "The weather today is warm, but it is raining. Unusual weather today."
encoded_sentence = text_vectorization_1(test_sentence)
print(encoded_sentence)

test_sentence = "Nice weather."
encoded_sentence = text_vectorization_1(test_sentence)
print(encoded_sentence)

['[UNK]', np.str_('warm'), np.str_('the'), np.str_('winter'), np.str_('will'), np.str_('weather')] 
 6
tf.Tensor([8 1 1 0 0 2], shape=(6,), dtype=int64)
tf.Tensor([1 0 0 0 0 1], shape=(6,), dtype=int64)


In [None]:
# TF-IDF
# Same vocabulary slots as the count example, but the outputs are float
# TF-IDF weights instead of integer counts.
from keras import layers

max_tokens = 6    # Number of (most important) tokens

text_vectorization_1 = layers.TextVectorization(
    max_tokens=max_tokens,
    split="whitespace",
    output_mode="tf_idf"
)

# `dataset` is the three-sentence corpus defined in an earlier cell.
text_vectorization_1.adapt(dataset)
vocabulary = text_vectorization_1.get_vocabulary()
print(vocabulary, "\n", len(vocabulary))

test_sentence = "The weather today is warm, but it is raining. Unusual weather today."
encoded_sentence = text_vectorization_1(test_sentence)
print(encoded_sentence)

test_sentence = "Nice weather."
encoded_sentence = text_vectorization_1(test_sentence)
print(encoded_sentence)

['[UNK]', np.str_('warm'), np.str_('the'), np.str_('winter'), np.str_('will'), np.str_('weather')] 
 6
tf.Tensor([6.6162667 0.6931472 0.6931472 0.        0.        1.8325815], shape=(6,), dtype=float32)
tf.Tensor([0.82703334 0.         0.         0.         0.         0.91629076], shape=(6,), dtype=float32)


### Bag-of-words: Bigrams

In [None]:
# multi-hot + bigrams
# With ngrams=2 the vocabulary mixes single words and two-word sequences
# (e.g. "will be"), so the multi-hot vector also captures some word order.
from keras import layers

max_tokens = 6    # Number of (most important) tokens

text_vectorization_1 = layers.TextVectorization(
    max_tokens=max_tokens,
    split="whitespace",
    output_mode="multi_hot",
    ngrams = 2

)

# `dataset` is the three-sentence corpus defined in an earlier cell.
text_vectorization_1.adapt(dataset)
vocabulary = text_vectorization_1.get_vocabulary()
print(vocabulary, "\n", len(vocabulary))

test_sentence = "The weather today is warm, but it is raining. Unusual weather today."
encoded_sentence = text_vectorization_1(test_sentence)
print(encoded_sentence)

# Contains the bigram "will be", which is part of the learned vocabulary.
test_sentence = "The weather will be nice."
encoded_sentence = text_vectorization_1(test_sentence)
print(encoded_sentence)

['[UNK]', np.str_('warm'), np.str_('the'), np.str_('winter are'), np.str_('winter'), np.str_('will be')] 
 6
tf.Tensor([1 1 1 0 0 0], shape=(6,), dtype=int64)
tf.Tensor([1 0 1 0 0 1], shape=(6,), dtype=int64)
