In [1]:
import tensorflow as tf
import keras 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# tf.data input

In [2]:
# A one-sentence corpus wrapped in a tf.data pipeline.
text_dataset = tf.data.Dataset.from_tensor_slices(
    ["You gotta do it till you're through it"]
)

# Integer-index vectorizer: each token is replaced by its vocabulary id.
vectorize_layer = keras.layers.TextVectorization(
    max_tokens=5,              # cap on the vocabulary size
    output_sequence_length=4,  # every output holds exactly 4 ids
    output_mode="int",
)


2024-11-13 10:55:34.956495: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Max
2024-11-13 10:55:34.956522: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-11-13 10:55:34.956534: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-11-13 10:55:34.956552: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-13 10:55:34.956565: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:

# Learn the vocabulary from the corpus, then display the learned tokens.
vectorize_layer.adapt(text_dataset)
vocabulary = vectorize_layer.get_vocabulary()
print(vocabulary)


['', '[UNK]', 'it', 'youre', 'you']


2024-11-13 10:55:35.188788: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [4]:

# Push each sentence through the adapted layer and show the resulting ids.
text_vectorized = text_dataset.map(vectorize_layer)
for token_ids in text_vectorized:
    print(token_ids)

tf.Tensor([4 1 1 2], shape=(4,), dtype=int64)


2024-11-13 10:55:35.329850: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# Out-of-vocab words

In [5]:
# Corpus the vocabulary is learned from.
text_dataset = tf.data.Dataset.from_tensor_slices(
    ["foo bar", "foo foo bar baz"]
)

vectorize_layer = keras.layers.TextVectorization(
    max_tokens=5,
    output_sequence_length=4,
    output_mode="int",
)
vectorize_layer.adapt(text_dataset)

# "bif" never occurs in text_dataset, so the layer has no vocabulary id for it.
new_dataset = tf.data.Dataset.from_tensor_slices(["foo bif bar"])

text_vectorized = new_dataset.map(vectorize_layer)
for token_ids in text_vectorized:
    print(token_ids)



tf.Tensor([2 1 3 0], shape=(4,), dtype=int64)


2024-11-13 10:55:35.432681: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [6]:
# More inputs dominated by the unseen word "bif", of varying lengths.
new_dataset = tf.data.Dataset.from_tensor_slices(
    [
        "bif bif",
        "bif bar bif",
        "bif bif bif bif bif bar",
    ]
)

# Reuses the layer adapted in the previous cell.
text_vectorized = new_dataset.map(vectorize_layer)
for token_ids in text_vectorized:
    print(token_ids)



tf.Tensor([1 1 0 0], shape=(4,), dtype=int64)
tf.Tensor([1 3 1 0], shape=(4,), dtype=int64)
tf.Tensor([1 1 1 1], shape=(4,), dtype=int64)


# multi-hot encoding

In [3]:
text_dataset = tf.data.Dataset.from_tensor_slices(
    ["foo bar", "bar baz", "baz foo", "foo foo bar baz"]
)

# Multi-hot (aka binary) encoding: one slot per vocabulary entry.
vectorize_layer = keras.layers.TextVectorization(
    max_tokens=5,
    output_mode="multi_hot",
)
vectorize_layer.adapt(text_dataset)

# Encode the same corpus the layer was adapted on. Note the 0 in the first
# slot of every row: per the vocabulary printed below, that slot is [UNK].
text_mh = text_dataset.map(vectorize_layer)
for encoded in text_mh:
    print(encoded)

print(vectorize_layer.get_vocabulary())


tf.Tensor([0 1 0 1], shape=(4,), dtype=int64)
tf.Tensor([0 0 1 1], shape=(4,), dtype=int64)
tf.Tensor([0 1 1 0], shape=(4,), dtype=int64)
tf.Tensor([0 1 1 1], shape=(4,), dtype=int64)
['[UNK]', 'foo', 'baz', 'bar']


2024-11-18 09:01:27.868812: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [8]:
# Inputs mixing the unseen word "bif" with known vocabulary words.
new_dataset = tf.data.Dataset.from_tensor_slices(
    ["bif bif", "bif bar bif", "bif bif bif bif bif bar", "foo bar"]
)

# Reuses the layer adapted in the previous cell.
text_vectorized = new_dataset.map(vectorize_layer)
for encoded in text_vectorized:
    print(encoded)



tf.Tensor([1 0 0 0], shape=(4,), dtype=int64)
tf.Tensor([1 0 0 1], shape=(4,), dtype=int64)
tf.Tensor([1 0 0 1], shape=(4,), dtype=int64)
tf.Tensor([0 1 0 1], shape=(4,), dtype=int64)


2024-11-13 10:55:35.644605: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# tf-idf


In [9]:
text_dataset = tf.data.Dataset.from_tensor_slices(
    ["foo bar", "bar baz", "baz foo", "foo foo bar baz"]
)

# TF-IDF = term frequency - inverse document frequency.
# "tf_idf" is the documented spelling of this output mode; the hyphenated
# "tf-idf" used previously is only a deprecated alias.
vectorize_layer = keras.layers.TextVectorization(
    output_mode="tf_idf",
    max_tokens=5,
)

# adapt() learns both the vocabulary and the per-token IDF weights.
vectorize_layer.adapt(text_dataset)

# Encode the corpus the layer was adapted on. The first column stays 0
# because every word here is in the learned vocabulary.
text_tfidf = text_dataset.map(vectorize_layer)
for weights in text_tfidf:
    print(weights)


tf.Tensor([0.        0.6931472 0.        0.6931472], shape=(4,), dtype=float32)
tf.Tensor([0.        0.        0.6931472 0.6931472], shape=(4,), dtype=float32)
tf.Tensor([0.        0.6931472 0.6931472 0.       ], shape=(4,), dtype=float32)
tf.Tensor([0.        1.3862944 0.6931472 0.6931472], shape=(4,), dtype=float32)


In [10]:
# Unseen word "bif", known words, and an empty document as an edge case.
new_dataset = tf.data.Dataset.from_tensor_slices(
    ["bif bif", "bif bar bif", "bif bif bif bif bif bar", "foo bar", ""]
)

# Reuses the layer adapted in the previous cell.
text_vectorized = new_dataset.map(vectorize_layer)
for weights in text_vectorized:
    print(weights)



tf.Tensor([1.3862944 0.        0.        0.       ], shape=(4,), dtype=float32)
tf.Tensor([1.3862944 0.        0.        0.6931472], shape=(4,), dtype=float32)
tf.Tensor([3.465736  0.        0.        0.6931472], shape=(4,), dtype=float32)
tf.Tensor([0.        0.6931472 0.        0.6931472], shape=(4,), dtype=float32)
tf.Tensor([0. 0. 0. 0.], shape=(4,), dtype=float32)


# bigrams


In [11]:
text_ds = tf.data.Dataset.from_tensor_slices(["the cat sat on the mat"])

# Built through `keras.layers`, like every other vectorizer in this notebook
# (previously `tf.keras.layers`).
# With ngrams=2 the vocabulary holds single words as well as adjacent word
# pairs, as the printed vocabulary shows.
vectorize_layer = keras.layers.TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="multi_hot",
)

vectorize_layer.adapt(text_ds)
print(vectorize_layer.get_vocabulary())


['[UNK]', 'the', 'the mat', 'the cat', 'sat on', 'sat', 'on the', 'on', 'mat', 'cat sat', 'cat']


In [12]:


# Multi-hot encode the sentence over the n-gram vocabulary learned above.
text_bg = text_ds.map(vectorize_layer)
for encoded in text_bg:
    print(encoded)


tf.Tensor([0 1 1 1 1 1 1 1 1 1 1], shape=(11,), dtype=int64)
