In [1]:
import tensorflow as tf
import keras 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# tf.data input

In [4]:
# A one-sentence corpus wrapped in a tf.data.Dataset.
text_dataset = tf.data.Dataset.from_tensor_slices(
  ["You gotta do it till you're through it"]
)

# Integer-index output. max_tokens caps the vocabulary size (the padding and
# [UNK] entries count toward it); output_sequence_length fixes how many token
# ids each output holds.
vectorize_layer = keras.layers.TextVectorization(
  max_tokens=5,
  output_mode='int',
  output_sequence_length=4,
)


In [5]:

# Build the vocabulary from the corpus, then show which tokens were kept.
vectorize_layer.adapt(text_dataset)
vocabulary = vectorize_layer.get_vocabulary()
print(vocabulary)


['', '[UNK]', 'it', 'youre', 'you']


In [6]:

# Run every sentence through the adapted layer and print its token ids.
text_vectorized = text_dataset.map(vectorize_layer)
for token_ids in text_vectorized:
  print(token_ids)

tf.Tensor([4 1 1 2], shape=(4,), dtype=int64)


2024-11-13 09:09:43.793338: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# Out-of-vocab words

In [20]:
# Vocabulary source: only "foo", "bar" and "baz" occur in these sentences.
text_dataset = tf.data.Dataset.from_tensor_slices(
  ["foo bar", "foo foo bar baz"]
)
# Probe sentence: "bif" never appears in text_dataset.
new_dataset = tf.data.Dataset.from_tensor_slices(["foo bif bar"])

vectorize_layer = keras.layers.TextVectorization(
  max_tokens=5,
  output_mode='int',
  output_sequence_length=4,
)
vectorize_layer.adapt(text_dataset)

# The unseen word comes out as index 1 ([UNK]); the three-word sentence is
# padded with 0 up to the sequence length of 4.
text_vectorized = new_dataset.map(vectorize_layer)
for token_ids in text_vectorized:
  print(token_ids)



tf.Tensor([2 1 3 0], shape=(4,), dtype=int64)


In [21]:
# Reuse the layer adapted in the previous cell: every "bif" is
# out-of-vocabulary, and the six-word sentence is cut to its first four tokens.
new_dataset = tf.data.Dataset.from_tensor_slices([
  "bif bif",
  "bif bar bif",
  "bif bif bif bif bif bar",
])

text_vectorized = new_dataset.map(vectorize_layer)
for token_ids in text_vectorized:
  print(token_ids)



tf.Tensor([1 1 0 0], shape=(4,), dtype=int64)
tf.Tensor([1 3 1 0], shape=(4,), dtype=int64)
tf.Tensor([1 1 1 1], shape=(4,), dtype=int64)


# multi-hot encoding

In [22]:
text_dataset = tf.data.Dataset.from_tensor_slices([
  "foo bar",
  "bar baz",
  "baz foo",
  "foo foo bar baz",
])
vectorize_layer = keras.layers.TextVectorization(
  max_tokens=5,
  output_mode='multi_hot',   # aka binary encoding
)
vectorize_layer.adapt(text_dataset)

# Encode the same sentences the layer was adapted on. Note that the first
# slot is 0 in every output row.
text_mh = text_dataset.map(vectorize_layer)
for encoded in text_mh:
  print(encoded)


tf.Tensor([0 1 0 1], shape=(4,), dtype=int64)
tf.Tensor([0 0 1 1], shape=(4,), dtype=int64)
tf.Tensor([0 1 1 0], shape=(4,), dtype=int64)
tf.Tensor([0 1 1 1], shape=(4,), dtype=int64)


In [25]:
# "bif" was not in the adaptation corpus, so it sets slot 0; repeating a word
# does not change a multi-hot vector.
new_dataset = tf.data.Dataset.from_tensor_slices(
  ["bif bif", "bif bar bif", "bif bif bif bif bif bar", "foo bar"]
)

text_vectorized = new_dataset.map(vectorize_layer)
for encoded in text_vectorized:
  print(encoded)



tf.Tensor([1 0 0 0], shape=(4,), dtype=int64)
tf.Tensor([1 0 0 1], shape=(4,), dtype=int64)
tf.Tensor([1 0 0 1], shape=(4,), dtype=int64)
tf.Tensor([0 1 0 1], shape=(4,), dtype=int64)


# tf-idf


In [26]:
text_dataset = tf.data.Dataset.from_tensor_slices(["foo bar", "bar baz", "baz foo", "foo foo bar baz"])
vectorize_layer = keras.layers.TextVectorization(
  # 'tf_idf' is the documented spelling of this mode; the hyphenated
  # 'tf-idf' is only a legacy alias.
  output_mode='tf_idf',   # term frequency - inverse document frequency
  max_tokens=5
)

# adapt() builds the vocabulary from text_dataset.
vectorize_layer.adapt(text_dataset)
# Vectorize the text -- note that the first slot is 0 in every row, since
# every word here was seen during adapt().
text_tfidf = text_dataset.map(vectorize_layer)
for text in text_tfidf:
  print(text)


tf.Tensor([0.        0.6931472 0.        0.6931472], shape=(4,), dtype=float32)
tf.Tensor([0.        0.        0.6931472 0.6931472], shape=(4,), dtype=float32)
tf.Tensor([0.        0.6931472 0.6931472 0.       ], shape=(4,), dtype=float32)
tf.Tensor([0.        1.3862944 0.6931472 0.6931472], shape=(4,), dtype=float32)


In [28]:
# "bif" was not in the adaptation corpus, so its weight accumulates in slot 0
# and grows with each repeat; the empty string yields an all-zero vector.
new_dataset = tf.data.Dataset.from_tensor_slices(
  ["bif bif", "bif bar bif", "bif bif bif bif bif bar", "foo bar", ""]
)

text_vectorized = new_dataset.map(vectorize_layer)
for weights in text_vectorized:
  print(weights)



tf.Tensor([1.3862944 0.        0.        0.       ], shape=(4,), dtype=float32)
tf.Tensor([1.3862944 0.        0.        0.6931472], shape=(4,), dtype=float32)
tf.Tensor([3.465736  0.        0.        0.6931472], shape=(4,), dtype=float32)
tf.Tensor([0.        0.6931472 0.        0.6931472], shape=(4,), dtype=float32)
tf.Tensor([0. 0. 0. 0.], shape=(4,), dtype=float32)


# bigrams


In [29]:
text_ds = tf.data.Dataset.from_tensor_slices(["the cat sat on the mat"])

# ngrams=2 puts each pair of adjacent words into the vocabulary alongside the
# single words. Uses keras.layers like the other cells in this notebook.
vectorize_layer = keras.layers.TextVectorization(
  ngrams=2,
  max_tokens=20000,
  output_mode='multi_hot'
)

# Build the vocabulary from the sentence and show the resulting entries.
vectorize_layer.adapt(text_ds)
print(vectorize_layer.get_vocabulary())


['[UNK]', 'the', 'the mat', 'the cat', 'sat on', 'sat', 'on the', 'on', 'mat', 'cat sat', 'cat']


In [30]:


# Multi-hot encode the sentence against the unigram + bigram vocabulary.
text_bg = text_ds.map(vectorize_layer)
for encoded in text_bg:
  print(encoded)


tf.Tensor([0 1 1 1 1 1 1 1 1 1 1], shape=(11,), dtype=int64)
