<a href="https://colab.research.google.com/github/reitezuz/18NES2-2025/blob/main/week_10/text_processing_layers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TextVectorization layer
https://keras.io/api/layers/preprocessing_layers/text/text_vectorization/

In [26]:
import os
# Select the Keras backend. Keras reads KERAS_BACKEND when it is first imported,
# so this assignment has to run before any `import keras` / `from keras ...`
# statement in the notebook.
os.environ["KERAS_BACKEND"] = "jax"

In [27]:
# Text vectorization with default standardization and split
from keras.layers import TextVectorization

# Constructor signature, kept here for reference (default values shown):
#
#   keras.layers.TextVectorization(
#       max_tokens=None,
#       standardize="lower_and_strip_punctuation", # default standardization
#       split="whitespace",                        # default split
#       ngrams=None,                               # None, 2, 3,...
#       output_mode="int",                         # "int", "multi_hot", "count" or "tf_idf"
#       output_sequence_length=None,
#       pad_to_max_tokens=False,
#       vocabulary=None,
#       idf_weights=None,
#       sparse=False,
#       ragged=False,
#       encoding="utf-8",
#       name=None,
#       **kwargs
#   )

# Layer with the default standardization and split. output_mode is spelled out
# only to make the choice visible; the alternatives are "multi_hot", "count"
# and "tf_idf".
text_vectorization = TextVectorization(output_mode="int")


In [28]:
# Text vectorization with custom standardization and custom split
import re
import string
import tensorflow as tf
from keras.layers import TextVectorization

def custom_standardization_fn(string_tensor):
    """Lowercase the input text and delete ASCII punctuation characters.

    Intended to be passed as the ``standardize`` callable of a
    ``TextVectorization`` layer.

    Args:
        string_tensor: The text to normalize (presumably a ``tf.string``
            tensor supplied by the layer -- confirm against the caller).

    Returns:
        The result of ``tf.strings.regex_replace`` applied to the lowercased
        input, with every character from ``string.punctuation`` removed.
    """
    lowered = tf.strings.lower(string_tensor)
    # One character class containing every punctuation mark; re.escape keeps
    # regex metacharacters such as "]" and "\" literal inside the brackets.
    punctuation_class = "[" + re.escape(string.punctuation) + "]"
    return tf.strings.regex_replace(lowered, punctuation_class, "")

def custom_split_fn(string_tensor):
    """Tokenize the input text on whitespace.

    Intended to be passed as the ``split`` callable of a
    ``TextVectorization`` layer.

    Args:
        string_tensor: The (already standardized) text to tokenize.

    Returns:
        Whatever ``tf.strings.split`` yields for the input when called with
        its default separator, i.e. the whitespace-delimited tokens.
    """
    tokens = tf.strings.split(string_tensor)
    return tokens

# Same integer-index output as before, but the standardization and the
# tokenization steps are supplied as the callables defined above in this cell.
text_vectorization = TextVectorization(
    standardize=custom_standardization_fn,
    split=custom_split_fn,
    output_mode="int",
)

### Create the vocabulary:

In [29]:
# Toy corpus: three short sentences from which the vocabulary is built.
dataset = [
    "The weather today is surprisingly warm.",
    "Tomorrow will be much colder, according to the forecast.",
    "Warm days in winter are unusual, but not impossible.",
]
# adapt() scans the corpus and builds the layer's vocabulary from its tokens.
text_vectorization.adapt(dataset)


### Vocabulary:
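- Index 0 is reserved for padding (`''`) and index 1 for out-of-vocabulary tokens (`[UNK]`); the remaining words are ordered by descending frequency: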

In [30]:
# Show the learned vocabulary; as the last expression of the cell, its value
# is displayed by the notebook.
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 np.str_('warm'),
 np.str_('the'),
 np.str_('winter'),
 np.str_('will'),
 np.str_('weather'),
 np.str_('unusual'),
 np.str_('tomorrow'),
 np.str_('today'),
 np.str_('to'),
 np.str_('surprisingly'),
 np.str_('not'),
 np.str_('much'),
 np.str_('is'),
 np.str_('in'),
 np.str_('impossible'),
 np.str_('forecast'),
 np.str_('days'),
 np.str_('colder'),
 np.str_('but'),
 np.str_('be'),
 np.str_('are'),
 np.str_('according')]

### Vectorize text:
- OOV (out-of-vocabulary) words are mapped to index 1

In [31]:
# NOTE(review): `vocabulary` is not read anywhere in this cell; it is
# reassigned in the multi-hot cell before its next use.
vocabulary = text_vectorization.get_vocabulary()
# "it" and "raining" do not occur in `dataset`, so they are out-of-vocabulary.
test_sentence = "The weather today is warm, but it is raining. Unusual weather today."
# Calling the layer on a raw string yields one integer vocabulary index per token.
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

[ 3  6  9 14  2 20  1 14  1  7  6  9]


In [32]:

# Constructor signature, kept here for reference (default values shown):
#
#   keras.layers.TextVectorization(
#       max_tokens=None,
#       standardize="lower_and_strip_punctuation", # default standardization
#       split="whitespace",                        # default split
#       ngrams=None,                               # None, 2, 3,...
#       output_mode="int",                         # "int", "multi_hot", "count" or "tf_idf"
#       output_sequence_length=None,
#       pad_to_max_tokens=False,
#       vocabulary=None,
#       idf_weights=None,
#       sparse=False,
#       ragged=False,
#       encoding="utf-8",
#       name=None,
#       **kwargs
#   )

# Multi-hot variant: instead of a sequence of token indices, the layer emits
# one 0/1 flag per vocabulary entry. The alternatives for output_mode are
# "int", "count" and "tf_idf".
text_vectorization_1 = TextVectorization(output_mode="multi_hot")
text_vectorization_1.adapt(dataset)

# Print the vocabulary together with its size, which is also the length of
# the encoded vector printed below.
vocabulary = text_vectorization_1.get_vocabulary()
print(vocabulary, "\n", len(vocabulary))

# Encode the same sentence as in the integer-index example.
test_sentence = "The weather today is warm, but it is raining. Unusual weather today."
encoded_sentence = text_vectorization_1(test_sentence)
print(encoded_sentence)


['[UNK]', np.str_('warm'), np.str_('the'), np.str_('winter'), np.str_('will'), np.str_('weather'), np.str_('unusual'), np.str_('tomorrow'), np.str_('today'), np.str_('to'), np.str_('surprisingly'), np.str_('not'), np.str_('much'), np.str_('is'), np.str_('in'), np.str_('impossible'), np.str_('forecast'), np.str_('days'), np.str_('colder'), np.str_('but'), np.str_('be'), np.str_('are'), np.str_('according')] 
 23
[1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0]
