In [103]:
import os
import shutil
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices("GPU") 
      else "NOT AVAILABLE")

Version:  2.6.0-dev20210510
Eager mode:  True
Hub version:  0.8.0
GPU is NOT AVAILABLE


In [2]:
# Downloading the IMDB dataset
# Split the training set into 60% and 40% to end up with 15,000 examples
# for training, 10,000 examples for validation and 25,000 examples for testing.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

train_dir = os.path.join(dataset_dir, 'train')

remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

In [85]:
# Preparing train, test and validation datasets
batch_size = 32
seed = 42

train_data = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='training', 
    seed=seed)

validation_data = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='validation', 
    seed=seed)

test_data = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/test', 
    batch_size=batch_size)

AUTOTUNE = tf.data.AUTOTUNE

train_data = train_data.cache().prefetch(buffer_size=AUTOTUNE)
validation_data = validation_data.cache().prefetch(buffer_size=AUTOTUNE)
test_data = test_data.cache().prefetch(buffer_size=AUTOTUNE)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [86]:
# Exploring the data
train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))
print("Train example batch: \n")
print(train_examples_batch)
print('\n\n\n\nLabels:\n')
print(train_labels_batch)

Train example batch: 

tf.Tensor(
[[b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
  b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose perspective of life changes a

  b'Not only was this movie better than all the final season of H:LOTS. But it was better than any movie made for TV I have ever seen!<br /><br />Looking at the "Top 250" I see that only one small screen movie has made it: How the Grinch Stole Christmas. I think it is time to increase that group to 2.<br /><br />I will admit that the original series had several shows that were better than this, but I didn\'t mind. I just LOVED being able to enter the world of the Baltimore Homicide Squad again!']], shape=(10, 32), dtype=string)




Labels:

tf.Tensor(
[[0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 1 1 1 0 1 0 0 1 1 0 0 0 1 0 0 1 1]
 [1 1 1 1 1 1 1 1 0 0 1 1 0 1 0 0 1 1 0 1 0 1 1 1 0 0 1 1 1 1 1 0]
 [0 1 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 1 0 1]
 [1 0 1 1 1 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 1 0 1 0 1 1]
 [0 1 0 1 1 1 0 0 0 1 1 0 0 1 1 1 1 0 0 1 0 0 0 0 1 1 1 1 0 0 0 1]
 [0 0 1 0 1 0 1 1 0 1 0 0 1 0 0 0 1 0 1 1 0 0 0 1 1 1 1 1 1 0 0 0]
 [0 1 1 1 1 0 1 0 1 0 0 0 1 1 0 0 0 1 1 

In [87]:
train_examples_batch = train_examples_batch.numpy()[0]
train_labels_batch = train_labels_batch.numpy()[0]

validation_data_batch = next(iter(validation_data.batch(32))) 
train_data_batch = next(iter(train_data.batch(32)))


In [88]:
next(iter(validation_data.batch(32))) 

(<tf.Tensor: shape=(32, 32), dtype=string, numpy=
 array([[b"(spoilers)<br /><br />I shoulda figured. The dvd didn't even show the time or how long it was. I thought Wild Cardz was the worst anime movie I'd ever seen. This one is much worse. Makes no sense. Thrown together plot. All so we can see oversized breasts on a figure that doesn't support them. <br /><br />It had to of been a student film. That musta gotten a C grade<br /><br />2/10<br /><br />Quality: 1/10 Entertainment: 6/10 (until I found out it wasn't going to end.) the ending made no darn sense<br /><br />Replayable: 0/10",
         b"What this movie fails from answering is how wrong this war is (and most US wars recently made only to get some oil).<br /><br />How many innocent civilian casualties there has been, how many lives perished and how blatantly stupid the perpetrators are.<br /><br />So, let me ask you - if American soldier kills women and children apart from enemy, its OK, but if government accidentally kills th

In [89]:
# Building the model
# First embedding layer
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
hub_layer = hub.KerasLayer(embedding, input_shape=[],
                          dtype=tf.string, trainable=True)
hub_layer(train_examples_batch[:3])

<tf.Tensor: shape=(3, 50), dtype=float32, numpy=
array([[ 0.964519  ,  0.12235046, -0.35730854,  0.48033378,  0.18034822,
        -0.18373251, -0.03941955, -0.22600542, -0.32746387,  0.3865721 ,
         0.2286384 ,  0.00236568, -0.07970965,  0.06908172, -0.273917  ,
        -0.25015512, -0.17116444,  0.10235601,  0.13415377, -0.6560994 ,
        -0.01646905, -0.18063518,  0.7175045 ,  0.15740581, -0.57366866,
         0.02249735, -1.2577188 ,  0.21991247,  0.5483787 , -0.2534604 ,
        -0.5942718 , -0.05421532,  0.2442439 , -0.02258026, -0.22258581,
         0.60274124, -0.02160914,  0.07075842,  0.09879221, -0.44222328,
        -0.02986133,  0.19645615, -0.61093885, -0.18569276, -0.23887902,
        -0.3891516 , -0.35830313, -0.25282302,  0.3660045 ,  0.06032449],
       [ 0.76805687,  0.37268633,  0.5389303 ,  0.38905075, -0.08719452,
        -0.40928397, -0.09052341, -0.35153955, -0.58467203, -0.08505731,
        -0.25557598,  0.06093939, -0.147848  ,  0.17880681,  0.25342456,
 

In [90]:
# Full model
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_8 (KerasLayer)   (None, 50)                48190600  
_________________________________________________________________
dense_4 (Dense)              (None, 16)                816       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
Total params: 48,191,433
Trainable params: 48,191,433
Non-trainable params: 0
_________________________________________________________________


In [91]:
# compiling model
model.compile(optimizer='adam',
             loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=['accuracy'])

In [95]:
# Training the model
history = model.fit(train_data,
                   epochs=5,
                   validation_data=validation_data,
                   verbose=1)

Epoch 1/5








Epoch 2/5








Epoch 3/5








Epoch 4/5








Epoch 5/5










In [98]:
# Evaluating the model
loss, accuracy = model.evaluate(test_data)

print("Loss: ", loss)
print("Accuracy: ", accuracy)



Loss:  0.6545483469963074
Accuracy:  0.8464000225067139
