## Setup


In [1]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

tfds.disable_progress_bar()

In [2]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
  """Plot a training metric and its validation counterpart against epochs.

  Args:
    history: object exposing a `history` mapping from metric name to a
      per-epoch sequence of values (e.g. the value returned by `model.fit`).
    metric: name of the training metric; the validation series is looked up
      under the key 'val_' + metric.
  """
  val_metric = 'val_'+metric
  curves = history.history
  plt.plot(curves[metric])
  plt.plot(curves[val_metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])

In [None]:
# Load the IMDB reviews dataset. `as_supervised=True` makes every element a
# (text, label) pair; `with_info=True` additionally returns dataset metadata.
dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)
train_dataset = dataset['train']
test_dataset = dataset['test']

# Display the structure of one training element.
train_dataset.element_spec

Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


In [4]:
# Peek at one raw (text, label) pair from the training split before it is
# shuffled and batched.
for example, label in train_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


In [5]:
# Size of the shuffle buffer used when shuffling the training set.
BUFFER_SIZE = 10_000
# Number of reviews grouped into each batch.
BATCH_SIZE = 64

In [6]:
# Shuffle the training reviews with a BUFFER_SIZE-element buffer, then group
# them into batches of BATCH_SIZE. (The batch size was previously BUFFER_SIZE,
# which produced 10000-review batches and left BATCH_SIZE unused for training.)
# NOTE: this cell rebinds `train_dataset` / `test_dataset`; re-running it
# without re-running the loading cell would batch already-batched data.
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
# The test split is only batched (no shuffle) with the same batch size.
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [11]:
# After batching, each element is a batch of reviews with their labels; show
# only the first three of one batch.
# NOTE: `example` and `label` remain bound after the loop, and `example` is
# reused by the encoding cells further down.
for example, label in train_dataset.take(1):
  print('texts: ', example.numpy()[:3])
  print()
  print('label: ', label.numpy()[:3])

texts:  [b'This movie\'s origins are a mystery to me, as I only know as much as IMDB did before I rented it. I assume that before "Starship Troopers", "Killshot" was one of the countless unaired pilots that never made it to network, cable, or otherwise. The new title of "Kill Shot" is comically thrown into the opening sequence, the first of many quick clues that this was not ever intended for the cinema. The quick cuts, cheesy "Melrose Place" music, and short 2-second close-up candid shots of the main actors let you know what you\'re in for.<br /><br />And I don\'t mind at all. I rented this movie seeing the repackaging that puts Casper Van Dien and Denise Richards on the cover in front of a volleyball net thinking it would be funny to see them in a movie besides the SciFi travesty of Starship Troopers (an excellent book, in my opinion, not so hot a movie - but that\'s another review). After looking it up on IMDB, my roommate and I surmised that the pilot was dragged up after the appar

## Create the text encoder

In [12]:
# Upper bound on the vocabulary size learned by the text encoder.
VOCAB_SIZE = 1000

encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

# `adapt` only needs the review text, so drop the labels first.
text_only_dataset = train_dataset.map(lambda text, label: text)
encoder.adapt(text_only_dataset)

In [13]:
# Keep the learned vocabulary as a NumPy array so that token ids can be mapped
# back to strings by fancy indexing.
vocabulary_terms = encoder.get_vocabulary()
vocab = np.array(vocabulary_terms)

# Show the first 20 entries of the vocabulary.
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [14]:
# Encode the batch left over from the preview loop and keep the first three
# rows as a NumPy array of token ids.
encoded_batch = encoder(example)
encoded_example = encoded_batch[:3].numpy()
encoded_example

array([[11, 93,  1, ...,  0,  0,  0],
       [11,  7,  4, ...,  0,  0,  0],
       [ 1,  1,  1, ...,  0,  0,  0]])

In [15]:
# Compare each of the three original reviews with the text rebuilt from its
# token ids via the vocabulary lookup.
for n in range(3):
  original_text = example[n].numpy()
  print("Original: ", original_text)
  decoded_tokens = vocab[encoded_example[n]]
  print("Round-trip: ", " ".join(decoded_tokens))
  print()

Original:  b'This movie\'s origins are a mystery to me, as I only know as much as IMDB did before I rented it. I assume that before "Starship Troopers", "Killshot" was one of the countless unaired pilots that never made it to network, cable, or otherwise. The new title of "Kill Shot" is comically thrown into the opening sequence, the first of many quick clues that this was not ever intended for the cinema. The quick cuts, cheesy "Melrose Place" music, and short 2-second close-up candid shots of the main actors let you know what you\'re in for.<br /><br />And I don\'t mind at all. I rented this movie seeing the repackaging that puts Casper Van Dien and Denise Richards on the cover in front of a volleyball net thinking it would be funny to see them in a movie besides the SciFi travesty of Starship Troopers (an excellent book, in my opinion, not so hot a movie - but that\'s another review). After looking it up on IMDB, my roommate and I surmised that the pilot was dragged up after the app

In [16]:
# Sentiment classifier assembled layer by layer:
# raw strings -> token ids -> embeddings -> bidirectional LSTM -> dense head.
model = tf.keras.Sequential()
model.add(encoder)
model.add(tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
# Single output unit with no activation; the loss configured at compile time
# uses from_logits=True.
model.add(tf.keras.layers.Dense(1))

In [17]:
# Run the model on a single sample review, on its own in the batch, so the
# encoder does not need to pad it.

sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
single_review_batch = np.array([sample_text])
predictions = model.predict(single_review_batch)
print(predictions[0])

[-0.0098788]


In [18]:
# Run the model on the same sample review, this time batched together with a
# much longer string, so the shorter review has to be padded.

padding = "the " * 2000
padded_review_batch = np.array([sample_text, padding])
predictions = model.predict(padded_review_batch)
# Only the prediction for the sample review (row 0) is printed.
print(predictions[0])

[-0.0098788]


In [19]:
# The model's last layer has no activation, so the loss is told to expect
# logits.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(1e-4)
model.compile(loss=loss_fn, optimizer=optimizer, metrics=['accuracy'])

In [None]:
# Train for 10 epochs on the batched training set, validating against the
# test set. `history` keeps the value returned by `fit` for later use.
# NOTE(review): validation_steps=30 presumably limits each validation pass to
# 30 batches of `test_dataset` rather than the full split — confirm against
# the Keras `fit` documentation.
history = model.fit(train_dataset, epochs=10,
                    validation_data=test_dataset,
                    validation_steps=30)

Epoch 1/10
