In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
tf.random.set_seed(42)

In [3]:
import tensorflow_datasets as tfds

In [4]:
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)



In [5]:
print(datasets.keys())

dict_keys(['test', 'train', 'unsupervised'])


In [6]:
# Number of labelled examples in each split, read from the dataset metadata.
train_size, test_size = (
    info.splits[split_name].num_examples for split_name in ("train", "test")
)

print(train_size, test_size)

25000 25000


We iterate over the dataset in batches and, for the samples in the first batch, show each review (first 200 characters) together with its label:

In [7]:
for x,y in datasets["train"].batch(2).take(1):
    print(x)

tf.Tensor(
[b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
 b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot de

In [8]:
# dataset.batch(batch_size).take(num_batches): group examples into batches of batch_size, then keep only the first num_batches batches

In [9]:
# Preview the first batch (2 examples): first 200 characters of each review
# plus its label. X_batch / y_batch stay defined after the loop and are
# reused by the preprocess(X_batch, y_batch) demo cell further down.
for X_batch, y_batch in datasets["train"].batch(2).take(1):
    reviews, labels = X_batch.numpy(), y_batch.numpy()
    for review, label in zip(reviews, labels):
        sentiment = "= Positive" if label else "= Negative"
        print("Review:", review.decode("utf-8")[:200], "...")
        print("Label:", label, sentiment)
        print()

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative



In [10]:
def preprocess(X_batch, y_batch):
    """Tokenize a batch of raw review strings.

    Steps, in order:
      1. Keep only the first 300 bytes of each review (this is a character
         cut, not a word cut, so the last token may be a partial word).
      2. Replace HTML line breaks (``<br>``, ``<br/>``, ``<br />``) with a space.
      3. Replace every character that is not an ASCII letter or an apostrophe
         with a space.
      4. Split on whitespace into tokens.
      5. Pad every review with ``b"<pad>"`` up to the longest review in the
         batch so the result is a dense tensor.

    Args:
        X_batch: string tensor holding the raw reviews of one batch.
        y_batch: labels of the batch; returned unchanged.

    Returns:
        A ``(tokens, y_batch)`` tuple where ``tokens`` is a dense string
        tensor of padded tokens.
    """
    text = tf.strings.substr(X_batch, 0, 300)
    text = tf.strings.regex_replace(text, rb"<br\s*/?>", b" ")
    text = tf.strings.regex_replace(text, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(text)
    padded_tokens = tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch

In [11]:
preprocess(X_batch,y_batch)

(<tf.Tensor: shape=(2, 53), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
         b'on', b'the', b'sette', b'and', b'having', b'j

In [12]:
# Creating Vocabulary

In [13]:
from collections import Counter

In [14]:
vocabulary = Counter()

In [15]:
# Count every token of the preprocessed training set. Because preprocess pads
# each batch to its longest review, the b"<pad>" count depends on the batch size.
for X_batch, y_batch in datasets["train"].batch(2).map(preprocess):
    # Flatten the (reviews, tokens) array row by row and count it in one call.
    vocabulary.update(X_batch.numpy().ravel().tolist())

In [16]:
vocabulary.most_common()[:5]

[(b'<pad>', 63155),
 (b'the', 61137),
 (b'a', 38564),
 (b'of', 33983),
 (b'and', 33431)]

In [17]:
# Truncating Vocabulary

In [18]:
len(vocabulary)

53893

In [19]:
vocab_size = 10000

In [20]:
# Keep the vocab_size most frequent tokens, with b"<pad>" pinned to index 0.
# The Embedding layer below is built with mask_zero=True, i.e. id 0 is treated
# as padding, so the pad token must get id 0 explicitly rather than by virtue
# of happening to be the most frequent token (its count depends on batch size).
truncated_vocabulary = [b"<pad>"] + [
    word for word, _ in vocabulary.most_common() if word != b"<pad>"
][:vocab_size - 1]

In [21]:
# Creating a Look-Up Table

In [22]:
words = tf.constant(truncated_vocabulary)

In [23]:
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)

In [24]:
vocab_init = tf.lookup.KeyValueTensorInitializer(words,word_ids)

In [25]:
# Tokens missing from the truncated vocabulary are hashed into one of these
# extra out-of-vocabulary buckets, whose ids follow the known-word ids.
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [26]:
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

In [27]:
# Creating final train & test sets

In [28]:
def encode_words(X_batch, y_batch):
    """Replace each token in the batch by its integer id from ``table``.

    Args:
        X_batch: tensor of string tokens (as produced by ``preprocess``).
        y_batch: labels of the batch; returned unchanged.

    Returns:
        A ``(token_ids, y_batch)`` tuple.
    """
    token_ids = table.lookup(X_batch)
    return token_ids, y_batch

In [29]:
# Endless stream of tokenized training batches (32 reviews each); because of
# repeat(), model.fit must be given steps_per_epoch.
train_set = (
    datasets["train"]
    .repeat()
    .batch(32)
    .map(preprocess)
)

In [30]:
# Turn tokens into integer ids and prefetch one batch ahead.
# NOTE: this rebinds train_set to a dataset derived from itself, so the cell
# must be run exactly once after the cell that creates train_set.
train_set = (
    train_set
    .map(encode_words)
    .prefetch(1)
)

In [34]:
# Tokenized test batches of 1000 reviews; no repeat(), so one pass covers the split once.
test_set = (
    datasets["test"]
    .batch(1000)
    .map(preprocess)
)

In [35]:
# Turn tokens into integer ids and prefetch one batch ahead.
# NOTE: this rebinds test_set to a dataset derived from itself, so the cell
# must be run exactly once after the cell that creates test_set.
test_set = (
    test_set
    .map(encode_words)
    .prefetch(1)
)

In [36]:
# Sanity check: show the encoded ids and the labels of one training batch.
for X_batch, y_batch in train_set.take(1):
    print(X_batch, y_batch, sep="\n")

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


In [41]:
# Build Model

In [42]:
embed_size = 128

In [43]:
# Embedding -> two stacked GRU layers -> single sigmoid unit (binary sentiment).
model = keras.models.Sequential()
# One embedding row per vocabulary id plus one per OOV bucket. mask_zero=True
# makes the following layers skip time steps whose id is 0 (the padding id).
model.add(keras.layers.Embedding(
    input_dim=vocab_size + num_oov_buckets,
    output_dim=embed_size,
    mask_zero=True,
    input_shape=[None],
))
# return_sequences=True so the second GRU receives the full sequence.
model.add(keras.layers.GRU(4, return_sequences=True))
model.add(keras.layers.GRU(2))
model.add(keras.layers.Dense(1, activation="sigmoid"))

In [44]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [47]:
# Train and evaluate

In [48]:
import time

In [49]:
start = time.time()

In [50]:
# train_set repeats forever, so the epoch length is given explicitly:
# 32 here must match the batch size used when train_set was built.
model.fit(
    x=train_set,
    steps_per_epoch=train_size // 32,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc62fe59438>

In [51]:
end = time.time()

In [52]:
print("Time of execution:", end-start)

Time of execution: 191.6604356765747


In [53]:
model.evaluate(test_set)



[0.5337878465652466, 0.7559599876403809]