## Tokenization

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [7]:
# Two tiny example sentences; note the mixed casing ("Love" / "love") and the
# trailing "!!", both of which are absent from the vocabulary displayed later.
sentences = [
    "I Love my Dog",
    "I love my Giraffe!!",
]

In [8]:
# Fit a tokenizer on the example sentences and keep its word -> index mapping.
# num_words=100 is far larger than the handful of distinct words used here.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(texts=sentences)
word_index = tokenizer.word_index

In [9]:
# Display the word -> index mapping built by the tokenizer; the keys come out
# lowercase and without the "!!" from the input sentences.
word_index

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'giraffe': 5}

## Word Index / Sequences

In [26]:
# Larger example corpus: the two original sentences plus two new ones that
# introduce extra words and a longer sentence.
sentences = [
    "I Love my Dog",
    "I love my Giraffe!!",
    "you love my dog",
    "do you think my dog is amazing?",
]

In [27]:
# Re-fit a fresh tokenizer on the larger corpus so the vocabulary covers the
# newly added sentences.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(texts=sentences)
word_index = tokenizer.word_index

In [28]:
# Encode every sentence as a list of integer word indices (one list per sentence).
sequences = tokenizer.texts_to_sequences(texts=sentences)

In [29]:
# Display the encoded sentences; the lists differ in length because the
# sentences contain different numbers of words.
sequences

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]

In [30]:
# Held-out sentences containing words the tokenizer was not fitted on
# ("really", "loves", "manatee").
Test_data = [
    "I really love my dog",
    "my dog loves my manatee",
]

In [31]:
# "really", "loves" and "manatee" are not in the fitted vocabulary. Without an
# OOV token they are silently dropped, so the encoded sentences come out
# shorter than the originals.
test_seq = tokenizer.texts_to_sequences(texts=Test_data)
test_seq

[[4, 2, 1, 3], [1, 3, 1]]

In [32]:
# Rebuild the tokenizer with an out-of-vocabulary (OOV) placeholder: unknown
# words are then encoded as the "<OOV>" token instead of being dropped.
tokenizer = Tokenizer(
    num_words=100,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(texts=sentences)
word_index = tokenizer.word_index

In [33]:
# Display the new mapping; '<OOV>' now occupies index 1 and every real word is
# numbered from 2 upwards.
word_index

{'<OOV>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'giraffe': 7,
 'do': 8,
 'think': 9,
 'is': 10,
 'amazing': 11}

In [34]:
# With the OOV token in place, the unseen words ("really", "loves", "manatee")
# are encoded as index 1 instead of being dropped, so each encoded sentence
# keeps its original length.
test_seq = tokenizer.texts_to_sequences(texts=Test_data)
test_seq

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]

## Padding

In [35]:
# To train a neural network, every input sequence must have the same length, so shorter sequences are padded.
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [36]:
# Rebuild the OOV-aware tokenizer so the padding example is self-contained.
tokenizer = Tokenizer(
    num_words=100,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(texts=sentences)
word_index = tokenizer.word_index

# Integer-encode the sentences; the resulting lists are still of unequal length.
sequences = tokenizer.texts_to_sequences(texts=sentences)

In [38]:
# Pad at the front ("pre") with zeros so that every row is as long as the
# longest encoded sentence.
padded = pad_sequences(sequences, padding="pre")
padded

array([[ 0,  0,  0,  5,  3,  2,  4],
       [ 0,  0,  0,  5,  3,  2,  7],
       [ 0,  0,  0,  6,  3,  2,  4],
       [ 8,  6,  9,  2,  4, 10, 11]])