In [None]:
# Uncomment to install the pinned TensorFlow version into this kernel's environment:
# %pip install -q tensorflow==2.5.0

In [13]:

import tensorflow as tf
from tensorflow import keras


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
# Toy training corpus: four short sentences with mixed casing and punctuation,
# used to build the tokenizer's vocabulary.
sentences = [
    "i love my dog",
    "I, love my cat",
    "You love my dog!",
    "Do you actually love my dog too?",
]


In [7]:
# Learn a vocabulary from the training corpus and show the word -> index map.
# As the printed index shows, words are lowercased, punctuation is stripped,
# and indices start at 1.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(texts=sentences)

word_index = tokenizer.word_index
print(word_index)

{'love': 1, 'my': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'actually': 8, 'too': 9}


In [8]:
# Encode every training sentence as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(texts=sentences)


In [9]:
# Show the encoded training sentences: one list of word indices per sentence.
print(sequences)

[[4, 1, 2, 3], [4, 1, 2, 6], [5, 1, 2, 3], [7, 5, 8, 1, 2, 3, 9]]


`texts_to_sequences` can be called on any set of sentences; it encodes them using the vocabulary that the tokenizer learned from the corpus passed to `fit_on_texts` (here, `sentences`).
If you train a neural network on a corpus of texts, and a word index was generated from that corpus, then when you want to run inference with the trained model you must encode the new text with the same word index; otherwise the integers fed to the model would not correspond to the words it was trained on, and the result would be meaningless.

In [10]:
# Held-out sentences for encoding. They contain words ("really", "loves",
# "manatee") that do not appear in the training corpus.
test_data = [
    "i really love my dog",
    "my dog loves my manatee",
]

In [11]:
# Encode the held-out sentences with the vocabulary learned from `sentences`.
# With a tokenizer that has no OOV token, words it never saw are silently
# dropped, so an encoded sequence can be shorter than its sentence.
test_seq = tokenizer.texts_to_sequences(texts=test_data)
print(test_seq)

[[4, 1, 2, 3], [2, 3, 2]]


In [14]:
# Force every training sequence to length 5. Shorter sequences are zero-padded
# at the front; longer ones are cut from the front, so the 7-token sentence
# keeps only its last five tokens.
padded = pad_sequences(sequences, maxlen=5)

print("\nWord Index = ", word_index)
print("\nSequences = ", sequences)
print("\nPadded Sequences:")
print(padded)


Word Index =  {'love': 1, 'my': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'actually': 8, 'too': 9}

Sequences =  [[4, 1, 2, 3], [4, 1, 2, 6], [5, 1, 2, 3], [7, 5, 8, 1, 2, 3, 9]]

Padded Sequences:
[[0 4 1 2 3]
 [0 4 1 2 6]
 [0 5 1 2 3]
 [8 1 2 3 9]]


In [15]:
# Zero-pad the encoded test sentences at the front up to length 10.
# NOTE: this rebinds `padded`, replacing the padded training sequences.
padded = pad_sequences(test_seq, maxlen=10)

print("\nPadded Test Sequence: ")
print(padded)


Padded Test Sequence: 
[[0 0 0 0 0 0 4 1 2 3]
 [0 0 0 0 0 0 0 2 3 2]]


In [16]:
# Rebuild the tokenizer with an out-of-vocabulary (OOV) token, so that words
# not seen while fitting can be encoded as "<OOV>" instead of being dropped.
# NOTE: this rebinds `tokenizer` and `word_index`; any earlier cell re-run
# after this one will use the OOV-aware vocabulary.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(texts=sentences)

word_index = tokenizer.word_index

In [19]:
# Show the current word -> index map held in `word_index`.
print(word_index)


{'<OOV>': 1, 'love': 2, 'my': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'actually': 9, 'too': 10}


In [20]:
# Re-encode the held-out sentences with whichever tokenizer is currently bound.
# Assuming the "<OOV>"-aware tokenizer cell has been run, unseen words are
# encoded as the OOV index, so each sequence keeps one entry per word.
test_seq = tokenizer.texts_to_sequences(texts=test_data)
print(test_seq)

[[5, 1, 2, 3, 4], [3, 4, 1, 3, 1]]


Add the padding at the end of each sequence with `padding='post'`. By default, padding is added at the beginning (`padding='pre'`).

In [24]:
# Pad the encoded test sentences to length 10, appending the zeros after the
# tokens (padding="post") rather than placing them in front.
padded = pad_sequences(test_seq, maxlen=10, padding="post")

print("\nPadded Test Sequence: ")
print(padded)


Padded Test Sequence: 
[[5 1 2 3 4 0 0 0 0 0]
 [3 4 1 3 1 0 0 0 0 0]]
