In [1]:
!pip install -q tf-nightly

[K     |████████████████████████████████| 323.0MB 43kB/s 
[K     |████████████████████████████████| 460kB 47.5MB/s 
[K     |████████████████████████████████| 6.8MB 45.4MB/s 
[?25h

In [2]:
# import libraries
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np
import string
import os
import re
import shutil
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import preprocessing
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [3]:
# Print the TensorFlow version that was actually imported (the notebook
# installs the tf-nightly build in its first cell).
print(f"tensorflow version {tf.__version__}")

tensorflow version 2.4.0-dev20200719


In [4]:
# Download the IMDB review archive and extract it (untar=True), using the
# current directory as the cache location.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [6]:
# Path of the "aclImdb" folder located next to the path returned by get_file.
archive_parent = os.path.dirname(dataset)
dataset_dir = os.path.join(archive_parent, 'aclImdb')

In [8]:
# Show the top-level entries of the dataset folder.
os.listdir(dataset_dir)

['imdbEr.txt', 'README', 'train', 'imdb.vocab', 'test']

In [9]:
# Locate the "train" folder of the dataset and show what it contains.
train_dir = os.path.join(dataset_dir, "train")
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'urls_neg.txt',
 'unsup',
 'unsupBow.feat',
 'urls_unsup.txt',
 'urls_pos.txt',
 'pos']

In [10]:
# Print one positive training review to see what the raw text looks like.
sample_file = os.path.join(train_dir, 'pos', '1181_9.txt')
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(sample_file, encoding='utf-8') as f:
  print(f.read())

Rachel Griffiths writes and directs this award winning short film. A heartwarming story about coping with grief and cherishing the memory of those we've loved and lost. Although, only 15 minutes long, Griffiths manages to capture so much emotion and truth onto film in the short space of time. Bud Tingwell gives a touching performance as Will, a widower struggling to cope with his wife's death. Will is confronted by the harsh reality of loneliness and helplessness as he proceeds to take care of Ruth's pet cow, Tulip. The film displays the grief and responsibility one feels for those they have loved and lost. Good cinematography, great direction, and superbly acted. It will bring tears to all those who have lost a loved one, and survived.


In [11]:
# Remove the "unsup" folder from the train directory so that only the "neg"
# and "pos" folders remain as class directories.
remove_dir = os.path.join(train_dir, 'unsup')
# Guarded so that re-running this cell after the folder is already gone does
# not raise FileNotFoundError.
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

In [16]:
# Create the training dataset: 80% of the files under the train directory.
# The same seed and validation_split must be used for the validation subset
# so that the two subsets are drawn from one consistent split.
batch_size = 32
seed = 42

raw_train_dataset = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,  # reuse the path computed earlier instead of a hard-coded relative path
    batch_size = batch_size,
    seed = seed,
    validation_split = 0.2,
    subset = "training"
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [17]:
# Peek at the first three reviews (and their labels) of one training batch.
for text_batch, label_batch in raw_train_dataset.take(1):
  # Convert each tensor to a NumPy array once, then index into the arrays.
  reviews = text_batch.numpy()
  labels = label_batch.numpy()
  for i in range(3):
    print("Review", reviews[i])
    print("Label", labels[i])

Review b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
Label 0
Review b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose perspective of life changes as they get into 

In [18]:
# The dataset returned by text_dataset_from_directory exposes the label names
# through its class_names attribute.
label_names = raw_train_dataset.class_names
print(label_names)

['neg', 'pos']


In [19]:
# Create the validation dataset: the remaining 20% of the train directory.
# It reuses train_dir, batch_size and seed from the training split so the two
# subsets stay consistent with each other.
raw_valid_dataset = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size = batch_size,
    validation_split = 0.2,
    subset = "validation",
    seed = seed
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [20]:
# Create the test dataset from the "test" folder of the downloaded data,
# reusing the batch size configured for the training split.
raw_test_dataset = tf.keras.preprocessing.text_dataset_from_directory(
    directory = os.path.join(dataset_dir, "test"),
    batch_size = batch_size
)

Found 25000 files belonging to 2 classes.


# ***Preprocessing***
Next, you will standardize, tokenize, and vectorize the data using the helpful preprocessing.TextVectorization layer.

In [38]:
def custom_standardization(input_data):
  """Lowercase the text, replace `<br />` tags with a space, strip punctuation.

  Args:
    input_data: value passed to `tf.strings.lower` (string tensor expected).

  Returns:
    The result of applying the three `tf.strings` operations in that order.
  """
  # Character class matching any single character from string.punctuation.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  text = tf.strings.regex_replace(text, punctuation_pattern, '')
  return text

In [39]:
# Cap on the vocabulary size, and the fixed output length of each vectorized review.
max_features = 10000
sequence_length = 250

# Layer that standardizes the text with custom_standardization and outputs
# integer token ids.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)

In [40]:
# adapt() is given text only, so drop the labels from the training set first.
train_text = raw_train_dataset.map(lambda text, _label: text)
vectorize_layer.adapt(train_text)

In [43]:
#Let's create a function to see the result of using this layer to preprocess some data.
def vectorize_text(text, label):
  """Run `text` through `vectorize_layer` and pass `label` through unchanged.

  A trailing axis is added to `text` before the layer is called.

  Returns:
    A `(vectorized_text, label)` tuple.
  """
  expanded = tf.expand_dims(text, axis=-1)
  vectorized = vectorize_layer(expanded)
  return vectorized, label

In [49]:
# Take one batch from the training set and pick its first review/label pair.
text_batch , label_batch = next(iter(raw_train_dataset))
text, label = text_batch[0], label_batch[0]
print(f"text :- \n{text}")
print(f"Label :- \n {label}")
# vectorize_text returns a (vectorized text, label) tuple; print it as a whole
# and then each element separately.
vector_result = vectorize_text(text, label)
print(f"Result :-\n {vector_result}")
print(f"text part from Result :-\n {vector_result[0]}")
print(f"label part from Result :-\n {vector_result[1]}")

text :- 
b'If you want Scream or anything like the big-studio horror product that we get forced on us these days don\'t bother. This well-written film kept me up thinking about all it had to say. Importance of myth in our lives to make it make sense, how children interpret the world (and the violence in it), our ransacking of the environment and ignorance of its history and legends.. all here, but not flatly on the surface. You could technically call it a "monster movie" even though the Wendigo does not take physical form until the end, and then it\'s even up to you and your beliefs as to what\'s happening with the legendary spirit/beast. Some standard thriller elements for those looking just for the basics and the film never bores, though in fact the less you see of the creature, the better. Fessenden successfully continues George Romero\'s tradition of using the genre as parable and as a discussion forum while still keeping us creeped out.'
Label :- 
 1
Result :-
 (<tf.Tensor: shape=

In [51]:
# Look up the token (string) that an integer id stands for via
# .get_vocabulary() on the layer. The vocabulary list is fetched once and
# reused instead of being rebuilt for every lookup.
vocabulary = vectorize_layer.get_vocabulary()
print(f"540 ----> {vocabulary[540]}")
print(f"950 ----> {vocabulary[950]}")
# vocabulary size
print(f"size of vocabulary {len(vocabulary)}")

540 ----> highly
950 ----> 20
soze of vocabulary 10000


In [52]:
# Apply the text vectorization step to every split (train, validation, test).
train_dataset = raw_train_dataset.map(vectorize_text)
valid_dataset = raw_valid_dataset.map(vectorize_text)
test_dataset = raw_test_dataset.map(vectorize_text)

In [56]:
AUTOTUNE = tf.data.experimental.AUTOTUNE


def _cache_and_prefetch(ds):
  """Return `ds` with cache() applied, followed by prefetch() sized by AUTOTUNE."""
  return ds.cache().prefetch(buffer_size=AUTOTUNE)


train_dataset = _cache_and_prefetch(train_dataset)
valid_dataset = _cache_and_prefetch(valid_dataset)
test_dataset = _cache_and_prefetch(test_dataset)

In [57]:
# Output dimension passed to the Embedding layer of the model.
embedding_dim = 16

In [58]:
# Layer stack: token embedding -> dropout -> average over the sequence axis
# -> dropout -> single-unit dense output with no activation.
model_layers = [
    layers.Embedding(max_features + 1, embedding_dim),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1),
]
model = tf.keras.Sequential(model_layers)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout (Dropout)            (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [59]:
# The final Dense layer has no activation, so the loss is told to expect
# logits and accuracy is thresholded at 0.0.
model.compile(
    loss=losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=tf.metrics.BinaryAccuracy(threshold=0.0),
)

In [None]:
# Train for a fixed number of epochs, passing the validation split so its
# metrics are computed alongside training.
epochs = 20
history = model.fit(
    x=train_dataset,
    epochs=epochs,
    validation_data=valid_dataset,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
111/625 [====>.........................] - ETA: 2s - loss: 0.0309 - binary_accuracy: 0.9938

In [67]:
# Evaluate on the test split; the result is unpacked into the loss and the
# single metric configured in model.compile.
loss, accuracy = model.evaluate(test_dataset)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.470775306224823
Accuracy:  0.8516799807548523
