<a href="https://colab.research.google.com/github/schmuecker/transfer-learning/blob/main/natural_language/text_classification_from_scratch/rnn_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

# import os
# os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"


tfds.disable_progress_bar()

In [2]:
tf.__version__

'2.9.2'

In [3]:
devices = tf.config.experimental.list_physical_devices('GPU')
devices

[]

In [4]:
physical_devices = tf.config.list_physical_devices('GPU')

try:
    tf.config.experimental.set_memory_growth(devices[0], True)
    print("Success")
except:
    print("Exception occured")
    pass

Exception occured


**Read more about this dataset here: https://ai.stanford.edu/~amaas/data/sentiment/
As per this article:
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.** 

In [5]:
dataset, info = tfds.load('imdb_reviews', data_dir='./datasets', with_info=True, as_supervised=True)

[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to ./datasets/imdb_reviews/plain_text/1.0.0...[0m
[1mDataset imdb_reviews downloaded and prepared to ./datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [6]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='./datasets/imdb_reviews/plain_text/1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train': <Sp

In [7]:
dataset

{Split('train'): <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 Split('test'): <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 Split('unsupervised'): <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>}

In [8]:
train_dataset, test_dataset = dataset['train'], dataset['test']

In [9]:
type(train_dataset)

tensorflow.python.data.ops.dataset_ops.PrefetchDataset

In [10]:
len(train_dataset)

25000

In [11]:
len(test_dataset)

25000

In [12]:
for sample in train_dataset:
    print(sample[0].numpy())
    print(sample[1].numpy())
    break

b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
0


In [13]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

In [14]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [15]:
for example, label in train_dataset.take(1):
    print('texts: ', example.numpy()[:3])
    print()
    print('labels: ', label.numpy()[:3])

texts:  [b'What an ambitious project Kenneth Branagh undertook here and how well it was realized! This is the first filmed version of \'Hamlet\' to use the full text of Shakespeare\'s play, but Branagh didn\'t do it just because "it was there." His intention, I believe, was to make the play accessible and understandable to the general viewer without dumbing it down, so to speak. In return he asks viewers to put in a little work themselves, a fair enough proposition and one that\'s a bargain.<br /><br />The setting is a generic 19th century European one and this does more than work well, it keeps a modern or ancient look from possibly distracting from the work itself. The production design and cinematography and both outstanding, which helps immensely when you\'re watching a four-hour movie. Branagh\'s casting once again is inspired and the acting is likewise. The direction accomplishes the heavy task of making this a movie rather than a deluxe version of a play. Since so much of \'Haml

In [16]:
e = tf.keras.layers.experimental.preprocessing.TextVectorization()
e.adapt([
    "I love samosas and jalebi",
    "I love biking and yoga",
    "I love tensorflow"
])

In [17]:
e.get_vocabulary()

['',
 '[UNK]',
 'love',
 'i',
 'and',
 'yoga',
 'tensorflow',
 'samosas',
 'jalebi',
 'biking']

In [18]:
e(["I love pizza"]).numpy()

array([[3, 2, 1]])

In [19]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))

In [20]:
vocab = np.array(encoder.get_vocabulary())
vocab[:25]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but',
       'film', 'on', 'not', 'you', 'are'], dtype='<U14')

In [21]:
example[:2]

<tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'What an ambitious project Kenneth Branagh undertook here and how well it was realized! This is the first filmed version of \'Hamlet\' to use the full text of Shakespeare\'s play, but Branagh didn\'t do it just because "it was there." His intention, I believe, was to make the play accessible and understandable to the general viewer without dumbing it down, so to speak. In return he asks viewers to put in a little work themselves, a fair enough proposition and one that\'s a bargain.<br /><br />The setting is a generic 19th century European one and this does more than work well, it keeps a modern or ancient look from possibly distracting from the work itself. The production design and cinematography and both outstanding, which helps immensely when you\'re watching a four-hour movie. Branagh\'s casting once again is inspired and the acting is likewise. The direction accomplishes the heavy task of making this a movie rather than a deluxe

In [22]:
encoded_example = encoder(example)[:3].numpy()
encoded_example

array([[ 49,  34,   1, ...,   0,   0,   0],
       [ 51,   4, 465, ...,   0,   0,   0],
       [  6,  28,   1, ...,   0,   0,   0]])

In [23]:
for n in range(3):
    print("Original: ", example[n].numpy())
    print("Round-trip: ", " ".join(vocab[encoded_example[n]]))
    print()

Original:  b'What an ambitious project Kenneth Branagh undertook here and how well it was realized! This is the first filmed version of \'Hamlet\' to use the full text of Shakespeare\'s play, but Branagh didn\'t do it just because "it was there." His intention, I believe, was to make the play accessible and understandable to the general viewer without dumbing it down, so to speak. In return he asks viewers to put in a little work themselves, a fair enough proposition and one that\'s a bargain.<br /><br />The setting is a generic 19th century European one and this does more than work well, it keeps a modern or ancient look from possibly distracting from the work itself. The production design and cinematography and both outstanding, which helps immensely when you\'re watching a four-hour movie. Branagh\'s casting once again is inspired and the acting is likewise. The direction accomplishes the heavy task of making this a movie rather than a deluxe version of a play. Since so much of \'Ha

In [24]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [25]:
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
sample_text = ('awesome movie, I loved it so much')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

[-0.0040582]


In [26]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [27]:

model.fit(train_dataset, epochs=10,
                    validation_data=test_dataset,
                    validation_steps=30)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2f69822050>

In [28]:
import sys
print(sys.version)

3.7.15 (default, Oct 12 2022, 19:14:55) 
[GCC 7.5.0]
