In [6]:
import string

class Vectorizer:
    """Minimal word-level text vectorizer.

    Text is lowercased, stripped of every character in ``string.punctuation``
    and split on whitespace. Each distinct token is assigned an integer index
    in order of first appearance.

    Attributes:
        vocabulary: dict mapping ``token -> [index, count]``. Index 0 is
            reserved for the empty string and index 1 for ``'[UNK]'``; both
            reserved entries start with a count of 0.
        inverse_vocabulary: dict mapping ``index -> token``.
    """

    # Token and index used for any word that is not in the vocabulary.
    UNKNOWN_TOKEN = '[UNK]'
    UNKNOWN_INDEX = 1

    # Translation table that deletes every punctuation character; built once
    # instead of testing each character against string.punctuation.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        # Start with only the reserved entries so encode()/decode() are usable
        # (everything maps to '[UNK]') before make_vocabulary() has been called.
        self.vocabulary = self._reserved_vocabulary()
        self.inverse_vocabulary = self._invert(self.vocabulary)

    def _reserved_vocabulary(self):
        """Return a fresh vocabulary holding only the two reserved entries."""
        return {'': [0, 0], self.UNKNOWN_TOKEN: [self.UNKNOWN_INDEX, 0]}

    @staticmethod
    def _invert(vocabulary):
        """Build the ``index -> token`` mapping for a ``token -> [index, count]`` dict."""
        return {index_count[0]: token for token, index_count in vocabulary.items()}

    def standarize(self, text):
        """Return ``text`` lowercased with all ``string.punctuation`` characters removed.

        The misspelled name is kept for backward compatibility; ``standardize``
        is the correctly spelled equivalent.
        """
        return text.lower().translate(self._PUNCTUATION_TABLE)

    def standardize(self, text):
        """Correctly spelled alias of :meth:`standarize`."""
        return self.standarize(text)

    def tokenize(self, text):
        """Standardize ``text`` and split it on whitespace into a list of tokens."""
        return self.standarize(text).split()

    def make_vocabulary(self, dataset):
        """Rebuild ``vocabulary`` and ``inverse_vocabulary`` from an iterable of strings.

        Any previously built vocabulary is discarded. Tokens receive indices in
        order of first appearance, starting after the reserved entries, and
        their occurrences are counted.
        """
        # Build into a local dict and publish both mappings together at the
        # end, so the two attributes never disagree with each other.
        vocabulary = self._reserved_vocabulary()
        for text in dataset:
            # tokenize() already standardizes, so no separate standarize() call.
            for token in self.tokenize(text):
                entry = vocabulary.get(token)
                if entry is None:
                    vocabulary[token] = [len(vocabulary), 1]
                else:
                    entry[1] += 1
        self.vocabulary = vocabulary
        self.inverse_vocabulary = self._invert(vocabulary)

    def encode(self, text):
        """Return the list of vocabulary indices for the tokens of ``text``.

        Tokens missing from the vocabulary are encoded as ``UNKNOWN_INDEX``.
        """
        fallback = (self.UNKNOWN_INDEX, 0)
        return [self.vocabulary.get(token, fallback)[0] for token in self.tokenize(text)]

    def decode(self, index_seq):
        """Return the space-joined tokens for ``index_seq``.

        Indices missing from the inverse vocabulary are rendered as ``'[UNK]'``.
        """
        return ' '.join(self.inverse_vocabulary.get(i, self.UNKNOWN_TOKEN) for i in index_seq)

In [10]:
# Three short lines of text used as the training corpus for the vocabulary.
dataset = [
    'I write, erase, rewrite',
    'Erase again, and then',
    'A poppy blooms.',
]

# Fit the hand-written vectorizer on the corpus.
vectorizer = Vectorizer()
vectorizer.make_vocabulary(dataset)

# Show both mappings: token -> [index, count] and index -> token.
(vectorizer.vocabulary, vectorizer.inverse_vocabulary)

({'': [0, 0],
  '[UNK]': [1, 0],
  'i': [2, 1],
  'write': [3, 1],
  'erase': [4, 2],
  'rewrite': [5, 1],
  'again': [6, 1],
  'and': [7, 1],
  'then': [8, 1],
  'a': [9, 1],
  'poppy': [10, 1],
  'blooms': [11, 1]},
 {0: '',
  1: '[UNK]',
  2: 'i',
  3: 'write',
  4: 'erase',
  5: 'rewrite',
  6: 'again',
  7: 'and',
  8: 'then',
  9: 'a',
  10: 'poppy',
  11: 'blooms'})

In [20]:
# 'still' does not occur in the dataset, so it exercises the unknown-token path.
test_sentence = 'I write, rewrite and still rewrite again'

# Round-trip the sentence through the hand-written vectorizer.
encoded_sentence = vectorizer.encode(test_sentence)
decoded_sentence = vectorizer.decode(encoded_sentence)

print(encoded_sentence)
decoded_sentence

[2, 3, 5, 7, 1, 5, 6]


'i write rewrite and [UNK] rewrite again'

In [15]:
import re
import string
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

def custom_standardization_fn(string_tensor):
    """Lowercase `string_tensor` and delete every `string.punctuation` character."""
    # Character class matching any single punctuation character; re.escape
    # keeps the regex metacharacters among them literal.
    punctuation_pattern = f'[{re.escape(string.punctuation)}]'
    lowered = tf.strings.lower(string_tensor)
    return tf.strings.regex_replace(lowered, punctuation_pattern, '')

def custom_split_fn(string_tensor):
    """Split `string_tensor` with `tf.strings.split`, passing no explicit separator."""
    tokens = tf.strings.split(string_tensor)
    return tokens

# TextVectorization layer wired to the custom standardization and split
# callables defined above, configured with output_mode='int'.
text_vectorization = TextVectorization(
    standardize=custom_standardization_fn,
    split=custom_split_fn,
    output_mode='int',
)

2023-09-20 15:34:30.527570: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-20 15:34:30.527883: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [27]:
# Fit the layer on the same dataset used for the hand-written Vectorizer.
text_vectorization.adapt(dataset)

# The position of each token in the vocabulary list is its integer index.
vocabulary = text_vectorization.get_vocabulary()
inverse_vocabulary = dict(enumerate(vocabulary))

# Encode the test sentence, then map each index back to its token.
encoded_sentence = text_vectorization(test_sentence)
decoded_sentence = ' '.join(
    inverse_vocabulary[int(token_id)] for token_id in encoded_sentence
)

(encoded_sentence.numpy(), decoded_sentence)


(array([ 7,  3,  5,  9,  1,  5, 10]),
 'i write rewrite and [UNK] rewrite again')