

---
**Basics of Tokenization**


In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
sentences = [
    'Today is sunny day',
    'today is sunny day',
    'Today is rainy day'
]

tokenizer = Tokenizer(num_words=100, oov_token='<UKN>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

print(word_index)
print(tokenizer.texts_to_sequences(sentences))

{'<UKN>': 1, 'today': 2, 'is': 3, 'day': 4, 'sunny': 5, 'rainy': 6}
[[2, 3, 5, 4], [2, 3, 5, 4], [2, 3, 6, 4]]


In [3]:
test_data = [
             "today is snowy day",
             "will it be rainy tomorrow?"
]

test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[2, 3, 1, 4], [1, 1, 1, 6, 1]]


In [4]:
sentences = [
'Today is a sunny day',
'Today is a rainy day',
'Is it sunny today?',
'I really enjoyed walking in the snow today'
]
test_seq = tokenizer.texts_to_sequences(sentences)
print(test_seq)



[[2, 3, 1, 5, 4], [2, 3, 1, 6, 4], [3, 1, 5, 2], [1, 1, 1, 1, 1, 1, 1, 2]]


In [5]:
padded = pad_sequences(test_seq, padding='post', maxlen=5, truncating='post')
print(padded)

[[2 3 1 5 4]
 [2 3 1 6 4]
 [3 1 5 2 0]
 [1 1 1 1 1]]




---
**Text Processing on IMDB Review Dataset**


In [6]:
import tensorflow as tf 
import tensorflow_datasets as tfds

In [7]:
imdb_sentences = []
train_data = tfds.as_numpy(tfds.load('imdb_reviews', split='train'))
for item in train_data:
  imdb_sentences.append(str(item['text']))
print(imdb_sentences[:5])

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete876L2F/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete876L2F/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete876L2F/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m
['b"This was an absolutely terrible movie. Don\'t be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie\'s ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor\'s like Christopher Walken\'s good name. I could barely sit through it."', "b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, 

In [8]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(imdb_sentences)
sequences = tokenizer.texts_to_sequences(imdb_sentences)

In [9]:
print(tokenizer.word_index)



In [11]:
from bs4 import BeautifulSoup
import string
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stopwords = stopwords.words("english")
print(stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'ea

In [15]:
table = str.maketrans('','', string.punctuation)

imdb_sentences = []
train_data = tfds.as_numpy(tfds.load('imdb_reviews', split="train"))
for i,item in enumerate(train_data):
  if i % 1000 == 0:
    print("Progress:", i)
  sentence = str(item['text'].decode('UTF-8').lower())

  sentence = sentence.replace(",", " , ")
  sentence = sentence.replace(".", " . ")
  sentence = sentence.replace("-", " - ")
  sentence = sentence.replace("/", " / ")
  
  soup = BeautifulSoup(sentence)
  sentence = soup.get_text()

  words = sentence.split()
  filtered_sentence = ""
  for word in words:
    if word not in stopwords:
      filtered_sentence += word + " "
  imdb_sentences.append(filtered_sentence)

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=25000)
tokenizer.fit_on_texts(imdb_sentences)
sequences = tokenizer.texts_to_sequences(imdb_sentences)
print(tokenizer.word_index)

Progress: 0
Progress: 1000
Progress: 2000
Progress: 3000
Progress: 4000
Progress: 5000
Progress: 6000
Progress: 7000
Progress: 8000
Progress: 9000
Progress: 10000
Progress: 11000
Progress: 12000
Progress: 13000
Progress: 14000
Progress: 15000
Progress: 16000
Progress: 17000
Progress: 18000
Progress: 19000
Progress: 20000
Progress: 21000
Progress: 22000
Progress: 23000
Progress: 24000


In [16]:
sentences = [
'Today is a sunny day',
'Today is a rainy day',
'Is it sunny today?'
]
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[517, 1007, 357, 5305, 145], [517, 1007, 357, 6599, 145], [1007, 209, 5305, 517]]


In [17]:
reverse_word_index = dict([(value, key) for (key, value) in tokenizer.word_index.items()])
decoded_review = ' '.join([reverse_word_index.get(i, '?') for i in sequences[0]])
print(decoded_review)

today is a sunny day


In [18]:
(train_data, test_data), info = tfds.load(
  'imdb_reviews/subwords8k',
  split = (tfds.Split.TRAIN, tfds.Split.TEST),
  as_supervised=True,
  with_info=True
)
print(info)



[1mDownloading and preparing dataset imdb_reviews/subwords8k/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0.incomplete7R1MGC/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0.incomplete7R1MGC/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0.incomplete7R1MGC/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0. Subsequent calls will reuse this data.[0m
tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{

In [19]:
encoder = info.features['text'].encoder
print("Vocab Sie:", encoder.vocab_size)

Vocab Sie: 8185


In [20]:
print(encoder.subwords)



In [21]:
sample_string = 'Today is a sunny day'
encoded_string = encoder.encode(sample_string)
print ('Encoded string is {}'.format(encoded_string))

Encoded string is [6427, 4869, 9, 4, 2365, 1361, 606]


In [23]:
print(encoder.subwords[6426])

Tod


In [25]:
encoded_string = encoder.encode(sample_string)
original_string = encoder.decode(encoded_string)
test_string = encoder.decode([6427, 4869, 9, 4, 2365, 1361, 606])
print(test_string)

Today is a sunny day


In [32]:
from bs4 import BeautifulSoup
import string
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stopwords = stopwords.words("english")
print(stopwords)

table = str.maketrans('','', string.punctuation)

imdb_sentences = []
labels = []
train_data = tfds.as_numpy(tfds.load('imdb_reviews', split="train", as_supervised=True))

for i,item in enumerate(train_data):
  if i % 1000 == 0:
    print("Progress:", i)

  sentence = str(item[0].decode('UTF-8').lower())
  labels.append(item[1])

  sentence = sentence.replace(",", " , ")
  sentence = sentence.replace(".", " . ")
  sentence = sentence.replace("-", " - ")
  sentence = sentence.replace("/", " / ")
  
  soup = BeautifulSoup(sentence)
  sentence = soup.get_text()

  words = sentence.split()
  filtered_sentence = ""
  for word in words:
    if word not in stopwords:
      filtered_sentence += word + " "
  imdb_sentences.append(filtered_sentence)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'bo

In [33]:
training_size = 25000
x_train = imdb_sentences[:training_size]
y_train = labels[:training_size]

x_test = imdb_sentences[training_size:]
y_test = labels[training_size:]

In [34]:
vocab_size = 20000
maxlen = 10
trunc_type = 'post'
pad_type = 'post'
oov_token = '<ukn>'

tokenizer = tf.keras.preprocessing.text.Tokenizer(vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(x_train)

In [35]:
def create_vector(data):
  seq = tokenizer.texts_to_sequences(data)
  pad = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen, padding=pad_type, truncating=trunc_type)
  return pad

new_x_train = create_vector(x_train)
new_x_test = create_vector(x_test)

In [36]:
print(new_x_train[:5])

[[  313   283     2 10508  1246  3504   372 10991    20    62]
 [  453   681  2263    30   509   565  2117    83   467    11]
 [ 4309  6040     1  4729  3898   774  1485  1898  1224  2239]
 [  139     3  8728  2610  2556   250    82    51  1292   846]
 [  296   923   261    51  2413     3   546   313  1376    35]]
