In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

# 1. Tokenization

In [6]:
# Toy corpus: three short sentences that share most of their vocabulary.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
]

In [7]:
# Build a vocabulary from the corpus. `num_words` caps how many of the most
# frequent words are used when texts are later converted to sequences.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Mapping of each word seen during fitting to its integer index.
word_index = tokenizer.word_index
print(word_index)


# Note: punctuation such as '!' is stripped by the Tokenizer's default filters, so it never appears in the word index.

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


# 2. Sequencing

In [8]:
# Extended corpus: the original three sentences plus a longer one that
# introduces new words.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

In [11]:
# Start from a fresh Tokenizer: fit_on_texts() adds to the word counts of an
# already-fitted tokenizer, so re-fitting the earlier instance would count the
# first three sentences twice and make this cell's state depend on how many
# times a fit has already been run in the kernel.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
print(tokenizer.word_index)

# Replace every word with its index, producing one list of integers per sentence.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


In [13]:
# Sentences containing words the tokenizer never saw while fitting
# ("really", "loves", "manatee").
test_data = [
    "I really love my dog",
    "my dog loves my manatee",
]

# The tokenizer used here was created without an OOV token, so words that are
# missing from the word index are dropped from the generated sequences.
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[4, 2, 1, 3], [1, 3, 1]]


In [14]:
# Re-create the tokenizer with an out-of-vocabulary (OOV) placeholder. The
# placeholder gets its own entry in the word index, and any word that is not
# in the word index is encoded with the placeholder's index instead of being
# dropped. (Note: the token text below is spelled with zeros, "<00V>".)
tokenizer = Tokenizer(
    num_words=100,
    oov_token="<00V>",
)
tokenizer.fit_on_texts(sentences)

In [16]:
# The word index of the OOV-aware tokenizer includes an entry for the OOV token.
print(tokenizer.word_index)

# Encode the test sentences again; words outside the word index are now
# represented by the OOV token's index rather than being skipped.
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

{'<00V>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]
