In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Training corpus: the short sentences the tokenizer is fitted on.
# They differ in case and punctuation ('dog' vs 'dog!') and in length.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# Held-out sentences. They contain words ('really', 'loves', 'manatee')
# that never occur in `sentences`.
test_data = [
    'i really love my dog',
    'my dog loves my manatee',
]

In [18]:
# Create the tokenizer.
#   num_words=100 -> upper bound on how many of the most frequent words are used
#                    when texts are converted to sequences (this toy corpus has
#                    far fewer distinct words, so the bound is never reached).
#   oov_token     -> placeholder for Out-Of-Vocabulary words, i.e. words that
#                    were not seen while fitting. Any string works as the
#                    'oov_token' value as long as it cannot collide with a
#                    real word from the corpus.
tokenizer = Tokenizer(
    oov_token='<OOV>',
    num_words=100,
)

# Learn the vocabulary from the training sentences.
tokenizer.fit_on_texts(sentences)

# Dictionary mapping each known word (plus the OOV token) to its integer index.
word_index = tokenizer.word_index

# Encode every sentence as a list of word indices; the result is a list of
# lists, one inner list per sentence. The test sentences are encoded with the
# same fitted tokenizer.
sequences = tokenizer.texts_to_sequences(sentences)
sequences_test = tokenizer.texts_to_sequences(test_data)

# Sequences have different lengths, so pad them into a rectangular matrix in
# which every row has the same length. With default arguments the row length
# is that of the longest sequence and the zeros are put *before* each sentence.
padded = pad_sequences(sequences)

# The same call with explicit parameters:
#   maxlen=5          -> every row is exactly 5 entries long
#   padding='post'    -> zeros are appended at the end of short sentences
#   truncating='post' -> sentences longer than 'maxlen' lose their extra
#                        words from the end
padded_with_param = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)

# Each sentence is represented as a list of word indices; all sentences
# together form a plain Python list of lists.
print(type(sequences), '\n')
print(sequences, '\n')

# Without parameters the padded matrix takes the length of the longest row as
# the row size and fills the shorter rows with leading zeros up to that length.
print("### This is the padded sequence matrix where each row indicates a sentence in the sequence",
      "(without param): ")
print(padded, '\n')

# With padding='post', truncating='post' and maxlen=5 the zeros go to the end
# and the longest sentence is cut down to its first five words.
print("### This is the padded sequence matrix where each row indicates a sentence in the sequence",
      "(with param): ")
print("**** Well this looks much nicer ****")
print(padded_with_param, '\n')

# Only words present in word_index get their own index. Because the tokenizer
# was created with an oov_token, words that were unseen during fitting are NOT
# dropped from the sequence: each of them is encoded with the index of the OOV
# token, so the encoded sentence keeps its original length.
print('Test sequence: ')
print(sequences_test, '\n')
print('Padded with test sequence: ')
print(pad_sequences(sequences_test, maxlen=10), '\n')

# The OOV token comes first in word_index; after it, words are numbered by how
# often they appear in the fitted sentences (most frequent word first).
print(word_index)

<class 'list'> 

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]] 

### This is the padded sequence matrix where each row indicates a sentence in the sequence (without param): 
[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]] 

### This is the padded sequence matrix where each row indicates a sentence in the sequence (with param): 
**** Well this looks much nicer ****
[[5 3 2 4 0]
 [5 3 2 7 0]
 [6 3 2 4 0]
 [8 6 9 2 4]] 

Test sequence: 
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]] 

Padded with test sequence: 
[[0 0 0 0 0 5 1 3 2 4]
 [0 0 0 0 0 2 4 1 2 1]] 

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
