In [84]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [85]:
# Tiny training corpus: mixed casing and punctuation on purpose, so the
# tokenizer's normalisation is visible in the resulting vocabulary.
sentences = [
    "i love my dog",
    "I, love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

# Word-level tokenizer. oov_token='<OOV>' reserves a vocabulary entry that is
# used for any word not seen while fitting.
# NOTE(review): num_words=100 is the vocabulary-size cap handed to Keras; it is
# far above the handful of distinct words in this corpus — confirm the exact
# cut-off semantics against the Keras Tokenizer docs before relying on it.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')

In [86]:
# Build the vocabulary from the training sentences, then read the resulting
# word -> integer-index mapping off the fitted tokenizer.
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

In [87]:
# Inspect the learned mapping: keys are lower-cased words with punctuation
# removed, and the '<OOV>' token sits at index 1.
word_index

{'<OOV>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'cat': 7,
 'do': 8,
 'think': 9,
 'is': 10,
 'amazing': 11}

In [88]:
# Encode every training sentence as a list of word indices. The lists have
# different lengths because the sentences do.
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

In [89]:
# Default padding: shorter sequences get zeros added at the FRONT so that all
# rows match the length of the longest sequence.
padded = pad_sequences(sequences)
padded

array([[ 0,  0,  0,  5,  3,  2,  4],
       [ 0,  0,  0,  5,  3,  2,  7],
       [ 0,  0,  0,  6,  3,  2,  4],
       [ 8,  6,  9,  2,  4, 10, 11]], dtype=int32)

In [90]:
# padding="post": zeros are appended at the END of shorter sequences instead
# of the front.
padded1 = pad_sequences(sequences, padding="post")
padded1

array([[ 5,  3,  2,  4,  0,  0,  0],
       [ 5,  3,  2,  7,  0,  0,  0],
       [ 6,  3,  2,  4,  0,  0,  0],
       [ 8,  6,  9,  2,  4, 10, 11]], dtype=int32)

In [91]:
# maxlen=5 fixes the row width. Truncation is left at its default here, which
# drops tokens from the FRONT of sequences longer than maxlen (the last
# sentence loses its first two tokens).
padded2 = pad_sequences(sequences, padding="post", maxlen=5)
padded2

array([[ 5,  3,  2,  4,  0],
       [ 5,  3,  2,  7,  0],
       [ 6,  3,  2,  4,  0],
       [ 9,  2,  4, 10, 11]], dtype=int32)

In [92]:
# truncating="post": sequences longer than maxlen now lose tokens from the END,
# so the first five tokens of the long sentence are kept.
padded3 = pad_sequences(sequences, padding="post", truncating="post", maxlen=5)
padded3

array([[5, 3, 2, 4, 0],
       [5, 3, 2, 7, 0],
       [6, 3, 2, 4, 0],
       [8, 6, 9, 2, 4]], dtype=int32)

In [93]:
# Held-out sentences containing many words the tokenizer was never fitted on,
# used to demonstrate how unknown words are encoded.
test_data = [
    "i really love my dog",
    "my dog loves my manatee",
    "my mouse loves my notebook so much!",
    "i am trying to crack the tokenizer with unknown words. Let's see what I can do!",
]

In [94]:
# Encode the test sentences with the already-fitted tokenizer. Words that were
# not in the training vocabulary come out as 1, the '<OOV>' index.
test_seq = tokenizer.texts_to_sequences(test_data)
test_seq

[[5, 1, 3, 2, 4],
 [2, 4, 1, 2, 1],
 [2, 1, 1, 2, 1, 1, 1],
 [5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 8]]

In [100]:
# Fixed width of 10 with zeros appended at the end; the 16-token last sentence
# is cut from the end, keeping only its first 10 tokens.
test_padded = pad_sequences(test_seq, padding="post", truncating="post", maxlen=10)
test_padded

array([[5, 1, 3, 2, 4, 0, 0, 0, 0, 0],
       [2, 4, 1, 2, 1, 0, 0, 0, 0, 0],
       [2, 1, 1, 2, 1, 1, 1, 0, 0, 0],
       [5, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)

In [101]:
# Invert word_index (word -> index) into index -> word so that encoded
# sequences can be decoded back into text. A dict comprehension is used so no
# throwaway list of tuples is built first.
reverse_dict = {index: word for word, index in word_index.items()}
reverse_dict

{1: '<OOV>',
 2: 'my',
 3: 'love',
 4: 'dog',
 5: 'i',
 6: 'you',
 7: 'cat',
 8: 'do',
 9: 'think',
 10: 'is',
 11: 'amazing'}

In [102]:
# Print the vocabulary, then each test sentence followed by the text decoded
# from its padded encoding.
print(word_index)
print()
for num, sent in enumerate(test_data):
    print(sent)
    # Index 0 is the padding value, not a vocabulary entry, so it is skipped.
    # The loop variable is `idx` (not `word_index`) so it does not shadow the
    # module-level `word_index` dict printed above.
    print(' '.join(reverse_dict[idx] for idx in test_padded[num] if idx != 0))
    print()

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

i really love my dog
i <OOV> love my dog

my dog loves my manatee
my dog <OOV> my <OOV>

my mouse loves my notebook so much!
my <OOV> <OOV> my <OOV> <OOV> <OOV>

i am trying to crack the tokenizer with unknown words. Let's see what I can do!
i <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>

