<a href="https://colab.research.google.com/github/srv96/AI-ML-TensorFlow/blob/main/Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import tensorflow as tf

In [51]:
# Toy corpus for the vectorization demos in this notebook. The sentences mix
# upper/lower case and end with periods.
sentences = [
    "This is a sentence.",
    "TensorFlow is a powerful library.",
    "Python is popular for machine learning.",
    "This code imports TensorFlow.",
]

In [52]:
# Cap on vocabulary size; passed as `max_tokens` to each TextVectorization
# layer created in this notebook.
MAX_VOCAB_SIZE = 20_000

In [53]:
# Text-to-integer vectorizer for the demo corpus. The three settings after
# `max_tokens` are written out at their documented default values, so this
# layer behaves the same as one built with `max_tokens` alone.
vectorization_layers = tf.keras.layers.TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    output_mode="int",
)

In [54]:
# Fit the layer on the corpus so it has a vocabulary before it is applied to text.
vectorization_layers.adapt(sentences)

In [55]:
# Convert every sentence into a row of integer token ids and show the tensor.
# In the recorded output, shorter rows are filled with trailing 0s up to the
# longest sentence (6 tokens).
sequences = vectorization_layers(sentences)
print(sequences)

tf.Tensor(
[[ 3  2  5  6  0  0]
 [ 4  2  5  8 11  0]
 [ 7  2  9 14 10 12]
 [ 3 15 13  4  0  0]], shape=(4, 6), dtype=int64)


In [56]:
# Show the learned vocabulary; a token's position in this list is the integer
# id that appears in the vectorized sequences.
vectorization_layers.get_vocabulary()

['',
 '[UNK]',
 'is',
 'this',
 'tensorflow',
 'a',
 'sentence',
 'python',
 'powerful',
 'popular',
 'machine',
 'library',
 'learning',
 'imports',
 'for',
 'code']

In [57]:
# Reverse lookup: token string -> integer id (its position in the vocabulary).
word2idx = {
    word: idx
    for idx, word in enumerate(vectorization_layers.get_vocabulary())
}
print(word2idx)

{'': 0, '[UNK]': 1, 'is': 2, 'this': 3, 'tensorflow': 4, 'a': 5, 'sentence': 6, 'python': 7, 'powerful': 8, 'popular': 9, 'machine': 10, 'library': 11, 'learning': 12, 'imports': 13, 'for': 14, 'code': 15}


In [58]:
# Fixed-length output that is shorter than the sentences: with
# output_sequence_length=3 each row keeps only its first three token ids.
vectorization_layer_truncate = tf.keras.layers.TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    output_sequence_length=3,
)
vectorization_layer_truncate.adapt(sentences)
sequences = vectorization_layer_truncate(sentences)
print(sequences)

tf.Tensor(
[[ 3  2  5]
 [ 4  2  5]
 [ 7  2  9]
 [ 3 15 13]], shape=(4, 3), dtype=int64)


In [59]:
# Fixed-length output that is longer than any sentence: with
# output_sequence_length=10 each row is filled with trailing 0s up to ten ids.
vectorization_layer_pad = tf.keras.layers.TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    output_sequence_length=10,
)
vectorization_layer_pad.adapt(sentences)
sequences = vectorization_layer_pad(sentences)
print(sequences)

tf.Tensor(
[[ 3  2  5  6  0  0  0  0  0  0]
 [ 4  2  5  8 11  0  0  0  0  0]
 [ 7  2  9 14 10 12  0  0  0  0]
 [ 3 15 13  4  0  0  0  0  0  0]], shape=(4, 10), dtype=int64)


In [60]:
# Variable-length output: with ragged=True the layer returns a RaggedTensor in
# which each row holds exactly the token ids of its sentence, with no padding.
vectorization_layers_ragged = tf.keras.layers.TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True,
)
vectorization_layers_ragged.adapt(sentences)
sequences = vectorization_layers_ragged(sentences)
print(sequences)

<tf.RaggedTensor [[3, 2, 5, 6], [4, 2, 5, 8, 11], [7, 2, 9, 14, 10, 12], [3, 15, 13, 4]]>


In [63]:
# Vectorize with the ragged layer here rather than reading the shared
# `sequences` name: several cells rebind `sequences`, and only the ragged
# result has `.to_list()`, so relying on it made this cell depend on which
# cell happened to run last.
ragged_sequences = vectorization_layers_ragged(sentences)

# padding="pre" is the pad_sequences default, written out because it puts the
# zeros at the front of each row, whereas the TextVectorization layers with
# output_sequence_length put them at the end.
padded = tf.keras.utils.pad_sequences(
    ragged_sequences.to_list(),
    maxlen=10,
    padding="pre",
)
print(padded)

[[ 0  0  0  0  0  0  3  2  5  6]
 [ 0  0  0  0  0  4  2  5  8 11]
 [ 0  0  0  0  7  2  9 14 10 12]
 [ 0  0  0  0  0  0  3 15 13  4]]
