In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus: casing and punctuation vary between sentences on purpose.
sentences = [
    'i like my house',
    'I, like my car',
    'You like my car!',
    'Do you think my car is expensive?',
]

# num_words is an upper bound on the vocabulary used when encoding;
# oov_token is the placeholder substituted for words not seen during fitting.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

# Encode each sentence as a list of word indices, then pad them into a
# rectangular array (default: zeros in front, up to the longest sentence).
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='pre')
word_index = tokenizer.word_index

print("\nWord Index = ", word_index)
print("\nSequences = ", sequences)
print("\nPadded Sequences:", padded, sep="\n")


Word Index =  {'<OOV>': 1, 'my': 2, 'like': 3, 'car': 4, 'i': 5, 'you': 6, 'house': 7, 'do': 8, 'think': 9, 'is': 10, 'expensive': 11}

Sequences =  [[5, 3, 2, 7], [5, 3, 2, 4], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

Padded Sequences:
[[ 0  0  0  5  3  2  7]
 [ 0  0  0  5  3  2  4]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]


In [3]:
# Encode sentences containing words the tokenizer was never fitted on
# ('really', 'friend', 'likes'); each of those is mapped to the OOV index.
test_data = [
    'i really like my house',
    'my friend likes my car',
]

test_seq = tokenizer.texts_to_sequences(test_data)
print("\nTest Sequence = ", test_seq)


Test Sequence =  [[5, 1, 3, 2, 7], [2, 1, 1, 2, 4]]


In [4]:
# No maxlen given: rows are padded only up to the longest sequence.
# Both test sequences already hold five tokens, so no zeros are added here.
padded = pad_sequences(test_seq, padding='pre')
print("\nPadded Test Sequence: ", padded, sep="\n")


Padded Test Sequence: 
[[5 1 3 2 7]
 [2 1 1 2 4]]


In [5]:
# maxlen=7 forces every row to length 7. With the default 'pre' padding the
# zeros are inserted in front of the tokens.
padded = pad_sequences(test_seq, maxlen=7, padding='pre')
print("\nPadded Test Sequence: ", padded, sep="\n")


Padded Test Sequence: 
[[0 0 5 1 3 2 7]
 [0 0 2 1 1 2 4]]


In [6]:
# Same target length of 7, but padding='post' appends the zeros after the
# tokens instead of placing them in front.
padded = pad_sequences(test_seq, maxlen=7, padding='post')
print("\nPadded Test Sequence: ", padded, sep="\n")


Padded Test Sequence: 
[[5 1 3 2 7 0 0]
 [2 1 1 2 4 0 0]]


In [7]:
# maxlen=3 is shorter than the five-token sequences, so they get truncated.
# Truncation defaults to 'pre': leading tokens are dropped and the last three
# are kept. padding='post' has no visible effect because no row needs padding.
padded = pad_sequences(test_seq, maxlen=3, padding='post', truncating='pre')
print("\nPadded Test Sequence: ", padded, sep="\n")


Padded Test Sequence: 
[[3 2 7]
 [1 2 4]]


In [8]:
# truncating='post' drops tokens from the end instead, so the first three
# tokens of each sequence are kept.
padded = pad_sequences(test_seq, maxlen=3, padding='post', truncating='post')
print("\nPadded Test Sequence: ", padded, sep="\n")


Padded Test Sequence: 
[[5 1 3]
 [2 1 1]]
