<a href="https://colab.research.google.com/github/souparnabose99/Text-Classification-RNN-Tensorflow/blob/main/Tensorflow_Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
tf.__version__

'2.5.0'

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Toy corpus: three short sentences that are tokenized and padded in the
# cells that follow.
sentences = [
    "I like to play badminton.",
    "I like watching football.",
    "I am a Data Scientist.",
]

In [4]:
# Upper bound on the vocabulary the tokenizer will index. The original note
# claims ~3,000 words cover about 90% of typical search queries (an unverified
# rule of thumb); 20,000 is used here to leave plenty of headroom.
MAX_VOCAB_SIZE = 20_000

# Learn the word index from the corpus, then encode every sentence as a list
# of integer word ids.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [5]:
# Index 0 is reserved for padding, so the word ids produced by the tokenizer
# start at 1. Each inner list is one sentence encoded as word ids.
print(sequences)

[[1, 2, 3, 4, 5], [1, 2, 6, 7], [1, 8, 9, 10, 11]]


In [6]:
# Word -> integer id mapping learned by fit_on_texts. The keys shown in the
# output are lower-cased and have the trailing punctuation stripped.
tokenizer.word_index

{'a': 9,
 'am': 8,
 'badminton': 5,
 'data': 10,
 'football': 7,
 'i': 1,
 'like': 2,
 'play': 4,
 'scientist': 11,
 'to': 3,
 'watching': 6}

In [7]:
# Default behaviour, with the defaults written out explicitly: no fixed
# length (rows are padded to the longest sequence) and zeros are added at the
# front ('pre') of shorter sequences.
data = pad_sequences(
    sequences,
    maxlen=None,
    padding='pre',
    truncating='pre',
)
print(data)

[[ 1  2  3  4  5]
 [ 0  1  2  6  7]
 [ 1  8  9 10 11]]


In [8]:
# Fix the row length explicitly. 5 equals the longest sequence in the corpus,
# so the result matches the default-padding output.
MAX_SEQUENCE_LENGTH = 5
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='pre',
    truncating='pre',
)
print(data)

[[ 1  2  3  4  5]
 [ 0  1  2  6  7]
 [ 1  8  9 10 11]]


In [9]:
# padding='post' appends the zeros after the tokens instead of placing them
# in front (compare the second row with the pre-padded output).
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='pre',
)
print(data)

[[ 1  2  3  4  5]
 [ 1  2  6  7  0]
 [ 1  8  9 10 11]]


In [10]:
# maxlen larger than the longest sequence (6 > 5): every row receives at
# least one leading zero.
data = pad_sequences(
    sequences,
    maxlen=6,
    padding='pre',
    truncating='pre',
)
print(data)

[[ 0  1  2  3  4  5]
 [ 0  0  1  2  6  7]
 [ 0  1  8  9 10 11]]


In [11]:
# maxlen shorter than some sequences (4 < 5): the default truncating='pre'
# drops tokens from the START of the over-long rows.
data = pad_sequences(
    sequences,
    maxlen=4,
    padding='pre',
    truncating='pre',
)
print(data)

[[ 2  3  4  5]
 [ 1  2  6  7]
 [ 8  9 10 11]]


In [12]:
# padding='post' only controls where zeros are added. No row is shorter than
# 4 here, so nothing is padded and truncation still removes tokens from the
# front -- the output is the same as with the default padding.
data = pad_sequences(
    sequences,
    maxlen=4,
    padding='post',
    truncating='pre',
)
print(data)

[[ 2  3  4  5]
 [ 1  2  6  7]
 [ 8  9 10 11]]


In [13]:
# `padding` and `truncating` accept the same values ('pre' / 'post') but are
# independent options: `padding` decides where zeros are added, `truncating`
# decides which end is cut. truncating='post' drops tokens from the END of
# the over-long rows.
data = pad_sequences(
    sequences,
    maxlen=4,
    padding='pre',
    truncating='post',
)
print(data)

[[ 1  2  3  4]
 [ 1  2  6  7]
 [ 1  8  9 10]]
