In [1]:
#importing libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Small example corpus used throughout the notebook to build the vocabulary.
sentences = [
    "My favorite food is ice cream",
    "do you like ice cream too?",
    "My dog likes ice cream!",
    "your favorite flavor of icecream is chocolate",
    "chocolate isn't good for dogs",
    "your dog, your cat, and your parrot prefer broccoli",
    "My mother is really religious",
    "My father is a clean freak",
    "Sister is a supporter",
]
print(sentences)

['My favorite food is ice cream', 'do you like ice cream too?', 'My dog likes ice cream!', 'your favorite flavor of icecream is chocolate', "chocolate isn't good for dogs", 'your dog, your cat, and your parrot prefer broccoli', 'My mother is really religious', 'My father is a clean freak', 'Sister is a supporter']


In [3]:
# Create a tokenizer that reserves an "<OOV>" entry, used to encode words
# that were not seen when the tokenizer was fitted.
# num_words caps the vocabulary size used for encoding (see the Keras
# Tokenizer docs); this corpus yields far fewer than 100 entries.
tokenizer = Tokenizer(
    num_words=100,
    oov_token="<OOV>",
)

In [4]:
# Fit the tokenizer on the raw sentences to build its vocabulary, then show
# the resulting word -> integer index mapping.
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'is': 2, 'my': 3, 'your': 4, 'ice': 5, 'cream': 6, 'favorite': 7, 'dog': 8, 'chocolate': 9, 'a': 10, 'food': 11, 'do': 12, 'you': 13, 'like': 14, 'too': 15, 'likes': 16, 'flavor': 17, 'of': 18, 'icecream': 19, "isn't": 20, 'good': 21, 'for': 22, 'dogs': 23, 'cat': 24, 'and': 25, 'parrot': 26, 'prefer': 27, 'broccoli': 28, 'mother': 29, 'really': 30, 'religious': 31, 'father': 32, 'clean': 33, 'freak': 34, 'sister': 35, 'supporter': 36}


In [5]:
# Encode every sentence as a list of integer word ids taken from the
# tokenizer's word index.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[3, 7, 11, 2, 5, 6], [12, 13, 14, 5, 6, 15], [3, 8, 16, 5, 6], [4, 7, 17, 18, 19, 2, 9], [9, 20, 21, 22, 23], [4, 8, 4, 24, 25, 4, 26, 27, 28], [3, 29, 2, 30, 31], [3, 32, 2, 10, 33, 34], [35, 2, 10, 36]]


In [6]:
# A neural network needs inputs of uniform size, so the variable-length
# sequences have to be brought to a common length by padding them with zeros
# and/or truncating them.
#
# tf.keras.preprocessing.sequence.pad_sequences does this:
#   * by default every sequence is padded to the length of the longest one;
#   * by default the zeros go at the start of each sequence, but padding at
#     the end can be requested instead;
#   * an optional maximum length can be given, in which case longer sequences
#     are truncated, by default from the beginning.
#
# For all the options when padding and truncating sequences, see
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences

# Make the sequences all the same length.
padded = pad_sequences(sequences)
print("\nWord Index = ", word_index)
print("\nSequences = ", sequences)
print("\nPadded Sequences:")
print(padded)



Word Index =  {'<OOV>': 1, 'is': 2, 'my': 3, 'your': 4, 'ice': 5, 'cream': 6, 'favorite': 7, 'dog': 8, 'chocolate': 9, 'a': 10, 'food': 11, 'do': 12, 'you': 13, 'like': 14, 'too': 15, 'likes': 16, 'flavor': 17, 'of': 18, 'icecream': 19, "isn't": 20, 'good': 21, 'for': 22, 'dogs': 23, 'cat': 24, 'and': 25, 'parrot': 26, 'prefer': 27, 'broccoli': 28, 'mother': 29, 'really': 30, 'religious': 31, 'father': 32, 'clean': 33, 'freak': 34, 'sister': 35, 'supporter': 36}

Sequences =  [[3, 7, 11, 2, 5, 6], [12, 13, 14, 5, 6, 15], [3, 8, 16, 5, 6], [4, 7, 17, 18, 19, 2, 9], [9, 20, 21, 22, 23], [4, 8, 4, 24, 25, 4, 26, 27, 28], [3, 29, 2, 30, 31], [3, 32, 2, 10, 33, 34], [35, 2, 10, 36]]

Padded Sequences:
[[ 0  0  0  3  7 11  2  5  6]
 [ 0  0  0 12 13 14  5  6 15]
 [ 0  0  0  0  3  8 16  5  6]
 [ 0  0  4  7 17 18 19  2  9]
 [ 0  0  0  0  9 20 21 22 23]
 [ 4  8  4 24 25  4 26 27 28]
 [ 0  0  0  0  3 29  2 30 31]
 [ 0  0  0  3 32  2 10 33 34]
 [ 0  0  0  0  0 35  2 10 36]]


In [7]:
# Pad to an explicit width of 15 instead of the length of the longest
# sequence.
padded = pad_sequences(sequences, maxlen=15)
print(padded)

[[ 0  0  0  0  0  0  0  0  0  3  7 11  2  5  6]
 [ 0  0  0  0  0  0  0  0  0 12 13 14  5  6 15]
 [ 0  0  0  0  0  0  0  0  0  0  3  8 16  5  6]
 [ 0  0  0  0  0  0  0  0  4  7 17 18 19  2  9]
 [ 0  0  0  0  0  0  0  0  0  0  9 20 21 22 23]
 [ 0  0  0  0  0  0  4  8  4 24 25  4 26 27 28]
 [ 0  0  0  0  0  0  0  0  0  0  3 29  2 30 31]
 [ 0  0  0  0  0  0  0  0  0  3 32  2 10 33 34]
 [ 0  0  0  0  0  0  0  0  0  0  0 35  2 10 36]]


In [8]:
# Same width, but with the zeros appended after the tokens rather than
# placed in front of them.
padded = pad_sequences(sequences, maxlen=15, padding="post")
print(padded)

[[ 3  7 11  2  5  6  0  0  0  0  0  0  0  0  0]
 [12 13 14  5  6 15  0  0  0  0  0  0  0  0  0]
 [ 3  8 16  5  6  0  0  0  0  0  0  0  0  0  0]
 [ 4  7 17 18 19  2  9  0  0  0  0  0  0  0  0]
 [ 9 20 21 22 23  0  0  0  0  0  0  0  0  0  0]
 [ 4  8  4 24 25  4 26 27 28  0  0  0  0  0  0]
 [ 3 29  2 30 31  0  0  0  0  0  0  0  0  0  0]
 [ 3 32  2 10 33 34  0  0  0  0  0  0  0  0  0]
 [35  2 10 36  0  0  0  0  0  0  0  0  0  0  0]]


In [9]:
# With a maximum length shorter than the sequences, tokens are dropped from
# the front, so only the last three tokens of each sequence are kept.
padded = pad_sequences(sequences, maxlen=3)
print(padded)

[[ 2  5  6]
 [ 5  6 15]
 [16  5  6]
 [19  2  9]
 [21 22 23]
 [26 27 28]
 [ 2 30 31]
 [10 33 34]
 [ 2 10 36]]


In [10]:
# Encode sentences that contain words missing from the word index.
# Add your own sentences to test_data to experiment.
test_data = [
    "my best friend's favorite ice cream flavor is strawberry",
    "my dog's best friend is a manatee",
]
print(test_data)

# Reminder of which integer stands for the out-of-vocabulary token
# in the word index.
print("<OOV> has the number", word_index['<OOV>'], "in the word index.")

# Encode the test sentences with the already-fitted tokenizer.
test_seq = tokenizer.texts_to_sequences(test_data)
print("\nTest Sequence = ", test_seq)

# Pad the new sequences to a width of 10.
padded = pad_sequences(test_seq, maxlen=10)
print("\nPadded Test Sequence: ")

# Every word that is not in the word index shows up as 1. Note that
# "dog's" is encoded as 1 too, even though "dog" is in the word index.
print(padded)

["my best friend's favorite ice cream flavor is strawberry", "my dog's best friend is a manatee"]
<OOV> has the number 1 in the word index.

Test Sequence =  [[3, 1, 1, 7, 5, 6, 17, 2, 1], [3, 1, 1, 1, 2, 10, 1]]

Padded Test Sequence: 
[[ 0  3  1  1  7  5  6 17  2  1]
 [ 0  0  0  3  1  1  1  2 10  1]]
