<a href="https://colab.research.google.com/github/sieun-Bae/deep-learning/blob/master/Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Tokenizer

In [0]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [0]:
# Two-sentence toy corpus; mixed case and a comma are included on purpose
# so the tokenizer's normalisation is visible in the vocabulary.
train_data = ["I love Deep Learning", "i, am learning deep"]

In [0]:
# Fresh, unfitted Keras tokenizer. num_words=100 is well above the handful
# of distinct words in this toy corpus.
tokenizer = Tokenizer(num_words=100)

In [0]:
# Build the vocabulary from the training sentences.
tokenizer.fit_on_texts(train_data)
# word_index is the resulting word -> integer mapping (printed in the next cell).
word_index = tokenizer.word_index

In [58]:
# Words come out lower-cased with punctuation stripped ("i," -> "i"),
# and indexes start at 1.
print(word_index)

{'i': 1, 'deep': 2, 'learning': 3, 'love': 4, 'am': 5}


In [0]:
# Same corpus plus a third sentence; the extra occurrences change word
# frequencies and therefore the index assignment shown below.
train_data = [
    "I love Deep Learning",
    "i, am learning deep",
    "You love deep learning?",
]

In [0]:
# Start from a new tokenizer so the vocabulary reflects only the
# three-sentence corpus defined above.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(train_data)
word_index = tokenizer.word_index

In [61]:
# "deep" and "learning" now appear most often, so they take indexes 1 and 2.
print(word_index)

{'deep': 1, 'learning': 2, 'i': 3, 'love': 4, 'am': 5, 'you': 6}


In [0]:
# Held-out sentences containing words the tokenizer was never fitted on
# ("really", "my", "dog", "loves", "python").
test_data = [
    "i really love deep learning",
    "my dog loves python",
]

#### sequences

In [0]:
# Encode every training sentence as a list of word indexes.
sequences = tokenizer.texts_to_sequences(train_data)

In [64]:
# One inner list per sentence, one integer per word.
print(sequences)

[[3, 4, 1, 2], [3, 5, 2, 1], [6, 4, 1, 2]]


In [0]:
# Encode the held-out sentences with the vocabulary fitted on train_data only.
test_seq = tokenizer.texts_to_sequences(test_data)

In [66]:
# Words missing from the vocabulary simply vanish: the first sentence loses
# "really" and the second sentence encodes to an empty list.
print(test_seq)

[[3, 4, 1, 2], []]


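==> 
None of the words in the second test sentence ("my", "dog", "loves", "python") exist in the fitted vocabulary, so they are silently dropped and the sequence comes back empty. "really" is dropped from the first sentence for the same reason.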

#### OOV token

In [70]:
# Rebuild the tokenizer with an out-of-vocabulary placeholder so that unseen
# words are encoded as a dedicated index instead of being dropped.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data)
word_index = tokenizer.word_index
# Print the vocabulary as well: "<OOV>" takes index 1 and every real word
# shifts up by one, which is needed to read the sequences below.
print(word_index)
sequences = tokenizer.texts_to_sequences(train_data)
print(sequences)

{'<OOV>': 1, 'deep': 2, 'learning': 3, 'i': 4, 'love': 5, 'am': 6, 'you': 7}
[[4, 5, 2, 3], [4, 6, 3, 2], [7, 5, 2, 3]]


In [0]:
# Re-encode the held-out sentences with the OOV-aware tokenizer.
test_seq = tokenizer.texts_to_sequences(test_data)

In [71]:
# Every unseen word is now encoded as 1, the index assigned to "<OOV>",
# so sentence lengths are preserved.
print(test_seq) #<OOV> : 1

[[4, 1, 5, 2, 3], [1, 1, 1, 1]]


### Padding

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [0]:
# Corpus for the padding demo, with sentences of different lengths.
# Rebinding the list (instead of appending to the earlier one) keeps this
# cell idempotent: re-running it no longer adds duplicate sentences.
# These four sentences are exactly the ones the recorded outputs below
# were produced from.
train_data = [
    "I love Deep Learning",
    "i, am learning deep",
    "Do you think deep learning is amazing?",
    "My dog is still deep learning",
]

In [45]:
# Refit an OOV-aware tokenizer on the current train_data and encode it;
# the resulting sequences have different lengths, which motivates padding.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(train_data)
print(sequences)

[[4, 6, 2, 3], [4, 7, 3, 2], [8, 9, 10, 2, 3, 5, 11], [12, 13, 5, 14, 2, 3]]


In [0]:
# With default arguments, shorter sequences are filled with leading zeros
# up to the length of the longest sequence in the batch.
train_padded = pad_sequences(sequences)

In [49]:
# The result is a rectangular 2-D array, one row per sentence.
print(train_padded)

[[ 0  0  0  4  6  2  3]
 [ 0  0  0  4  7  3  2]
 [ 8  9 10  2  3  5 11]
 [ 0 12 13  5 14  2  3]]


In [50]:
# Encode and pad the held-out sentences. The padded width is derived from
# this batch alone, so it is 5 here while the training matrix is 7 wide;
# pass maxlen explicitly when both must share a shape.
test_seq = tokenizer.texts_to_sequences(test_data)
test_padded = pad_sequences(test_seq)
print(test_padded)

[[ 4  1  6  2  3]
 [ 0 12 13  1  1]]


In [52]:
# padding='post' moves the zero fill from the front of each row to the end.
train_padded = pad_sequences(sequences, padding='post')
test_padded = pad_sequences(test_seq, padding='post')
# A single print with a newline separator emits the same text as two prints.
print(train_padded, test_padded, sep="\n")

[[ 4  6  2  3  0  0  0]
 [ 4  7  3  2  0  0  0]
 [ 8  9 10  2  3  5 11]
 [12 13  5 14  2  3  0]]
[[ 4  1  6  2  3]
 [12 13  1  1  0]]


In [53]:
# Fix the width at 5 for both splits: shorter rows get trailing zeros and
# longer rows lose their trailing tokens (truncating='post').
train_padded = pad_sequences(
    sequences, padding='post', truncating='post', maxlen=5
)
test_padded = pad_sequences(
    test_seq, padding='post', truncating='post', maxlen=5
)
# A single print with a newline separator emits the same text as two prints.
print(train_padded, test_padded, sep="\n")

[[ 4  6  2  3  0]
 [ 4  7  3  2  0]
 [ 8  9 10  2  3]
 [12 13  5 14  2]]
[[ 4  1  6  2  3]
 [12 13  1  1  0]]
