In [0]:
!pip install tensorflow==2.0.0

In [0]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [0]:
# Toy training corpus: four short sentences with overlapping words, mixed
# capitalisation and trailing punctuation.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

In [5]:
# Fit a word-level tokenizer on the training sentences.
# The resulting vocabulary is lower-cased and punctuation-free, and word
# indices start at 1 (see the printed word index below).
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Vocabulary lookup (word -> integer id) and the sentences encoded with it.
word_index = tokenizer.word_index
text_sequence = tokenizer.texts_to_sequences(sentences)

# Report both results, vocabulary first.
print("Word Index:\n{}".format(word_index))

print("\nText Sequence:\n{}".format(text_sequence))

Word Index:
{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}

Text Sequence:
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


In [6]:
# To run inference we must encode the test sentences with the tokenizer that
# was fitted on the training sentences. Words that never appeared during
# fitting ('really', 'loves', 'shoes') have no index, so they are silently
# dropped and the encoded sequences come out shorter than the input sentences.

test_sentences = [
    'i really love my dog',
    'My dog loves my shoes',
]

test_sequence = tokenizer.texts_to_sequences(test_sentences)
print("Test Sequence: \n{}".format(test_sequence))


Test Sequence: 
[[4, 2, 1, 3], [1, 3, 1]]


In [7]:
# Avoid losing unseen words at test time by reserving an out-of-vocabulary
# token: any word that was not present when the tokenizer was fitted is
# encoded as '<OOV>' (index 1 in the printed word index) instead of vanishing.

# Fit on the training sentences only.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode the training sentences.
text_sequence = tokenizer.texts_to_sequences(sentences)

# Encode sentences containing words the tokenizer has never seen.
test_sentences = [
    'i really love my dog',
    'My dog loves my shoes',
]
test_sequence = tokenizer.texts_to_sequences(test_sentences)

# Report: vocabulary, encoded training set, encoded test set.
print("Word Index:\n{}".format(word_index))

print("\nText Sequence:\n{}".format(text_sequence))

print("\nTest Sequence: \n{}".format(test_sequence))

Word Index:
{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

Text Sequence:
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

Test Sequence: 
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


In [10]:
# Training needs every example to have the same length, much like images are
# resized to one common shape. For text this is done by padding the encoded
# sequences.

# Fit the tokenizer (with an out-of-vocabulary token) on the training sentences.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode the sentences, then pad them to a common length.
# padding='post' puts the zeros after the real tokens rather than before them.
text_sequence = tokenizer.texts_to_sequences(sentences)
text_padded = pad_sequences(text_sequence, padding='post')

# Report: vocabulary, raw sequences, padded matrix.
print("Word Index:\n{}".format(word_index))

print("\nText Sequence:\n{}".format(text_sequence))

print("\nText Padded: \n{}".format(text_padded))

Word Index:
{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

Text Sequence:
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

Text Padded: 
[[ 5  3  2  4  0  0  0]
 [ 5  3  2  7  0  0  0]
 [ 6  3  2  4  0  0  0]
 [ 8  6  9  2  4 10 11]]
