# IMDB Binary Classification

## Load Packages

In [4]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [5]:
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.3.1


## Load data

In [12]:
# Source archive of the IMDB review dataset.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Fetch the archive into ./dataset and unpack it there (untar=True).
dataset = tf.keras.utils.get_file(
    "aclImdb.tar.gz",
    url,
    untar=True,
    cache_dir='./dataset',
    cache_subdir='',
)

# The reviews live in an 'aclImdb' folder next to the path returned by get_file.
download_root = os.path.dirname(dataset)
dataset_dir = os.path.join(download_root, 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [15]:
# Display the resolved path of the extracted dataset directory.
dataset_dir

'./dataset/aclImdb'

In [16]:
# List the top-level entries of the extracted dataset directory.
os.listdir(dataset_dir)

['imdbEr.txt', 'test', 'imdb.vocab', 'README', 'train']

In [17]:
# Path of the 'train' folder inside the dataset; list it to see what it contains.
train_dir = os.path.join(dataset_dir,'train')
os.listdir(train_dir)

['urls_unsup.txt',
 'neg',
 'urls_pos.txt',
 'unsup',
 'urls_neg.txt',
 'pos',
 'unsupBow.feat',
 'labeledBow.feat']

In [18]:
# The pos and neg directories contain the positive and negative reviews.
# Print one positive example to see what a raw review looks like.
sample_file = os.path.join(train_dir, 'pos', '12276_7.txt')

# Pass the encoding explicitly so the read does not depend on the platform's
# default locale encoding.
with open(sample_file, encoding='utf-8') as f:
    print(f.read())

Can such an ambient production have failed its primary goal, which was to correctly adapt Allende's novel? Obviously yes. Bille August managed to make a superficial, shallow film where basic elements of South American mentality are presented simply as side events, resulting in total incoherency. I can't believe there was a whole production team that could not understand the book! There is of course technical quality in this film and I think the actors did their best with what they had in their hands, but something is missing. And this something was the most important part.


In [19]:
# Remove the 'unsup' directory from the training data.
remove_dir = os.path.join(train_dir,'unsup')
# Guard the delete so that re-running this cell (or Restart & Run All after the
# directory is already gone) does not fail with FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [22]:
# Split the labelled training reviews into training and validation subsets.
batch_size = 32
seed = 42  # the validation subset below is created with this same seed

# Training subset (validation_split=0.2 holds out 20% of the files).
# Read from train_dir instead of repeating the path as a hard-coded string.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed,
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [23]:
# Check which dataset type text_dataset_from_directory returned.
type(raw_train_ds)

tensorflow.python.data.ops.dataset_ops.BatchDataset

In [24]:
# Peek at the first three reviews and labels of a single batch from 'tf.data'.

for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor to NumPy once, then index into the arrays.
    reviews = text_batch.numpy()
    labels = label_batch.numpy()
    for idx in range(3):
        print("Review", reviews[idx])
        print("Label", labels[idx])

Review b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
Label 0
Review b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose perspective of life changes as they get into 

In [25]:
# Validation subset: same directory, split fraction and seed as the training
# subset, but selecting subset='validation'.
# Read from train_dir instead of repeating the path as a hard-coded string.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed,
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [26]:
# Test dataset, loaded from the 'test' folder of the extracted dataset.
# Derive the path from dataset_dir instead of a hard-coded string.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'),
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.


In [28]:
# Text clean-up applied before tokenization.
def custom_standardization(input_data):
    """Lowercase the text, replace '<br />' with a space and drop punctuation.

    Args:
        input_data: string tensor of raw review text.

    Returns:
        String tensor with the three clean-up steps applied in that order.
    """
    # Character class matching every character in string.punctuation.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')

In [29]:
# Vectorization settings

max_features = 10000   # passed as max_tokens (vocabulary size cap)
sequence_length = 250  # passed as output_sequence_length

# Layer that standardizes text with custom_standardization and emits integer token ids.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length,
)

In [35]:
# Keep only the text from each (text, label) pair of the training dataset.
train_text = raw_train_ds.map(lambda text, label: text)

# Adapt (fit) on the training text only: this builds the string -> integer index.
vectorize_layer.adapt(train_text)

In [36]:
def vectorize_text(text, label):
    """Run the text through vectorize_layer and pass the label through unchanged.

    Args:
        text: string tensor holding review text.
        label: label belonging to the text.

    Returns:
        Tuple of (output of vectorize_layer, label).
    """
    # Add a trailing axis before handing the text to the layer.
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label


In [37]:
# Pull one batch (32 reviews with their labels) and inspect its first example.
text_batch, label_batch = next(iter(raw_train_ds))
first_review = text_batch[0]
first_label = label_batch[0]

# Human-readable class name and the vectorized form of the example.
label_name = raw_train_ds.class_names[first_label]
vectorized_example = vectorize_text(first_review, first_label)

print("Review", first_review)
print("Label", label_name)
print("Vectorized review", vectorized_example)

Review tf.Tensor(b'Belmondo is a tough cop. He goes after a big-time drug dealer (played by Henry Silva, normally a great villain - see "Sharky\'s Machine"; but here he is clearly dubbed, and because of that he lacks his usual charisma). He goes to the scuzziest places of Paris and Marseilles, asks for some names, beats up some people, gets the names, goes to more scuzzy places, asks for more names, beats up more people, etc. The whole movie is punch after punch after punch. It seems that the people who made it had no other ambition than to create the French equivalent of "Dirty Harry". Belmondo, who was 50 here, does perform some good stunts at the beginning; apart from those, "Le Marginal" is a violent, episodic, trite, shallow and forgettable cop movie. (*1/2)', shape=(), dtype=string)
Label neg
Vectorized review (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[   1,    7,    4, 1233, 1021,   27,  261,  101,    4,    1, 1525,
        6992,  248,   32, 1488,    1, 1659,    4,

In [39]:
# Look up which tokens two of the integer ids stand for.
# Fetch the vocabulary once and index it, instead of calling get_vocabulary()
# again for every lookup.
vocabulary = vectorize_layer.get_vocabulary()
print ("6992 ------->", vocabulary[6992])
print ("2851 ------->", vocabulary[2851])

6992 -------> dealer
2851 -------> punch


In [40]:
# Map vectorize_text over the train, validation and test datasets (in that order).
train_ds, val_ds, test_ds = (
    raw_ds.map(vectorize_text)
    for raw_ds in (raw_train_ds, raw_val_ds, raw_test_ds)
)

**Performance**

`.cache()` keeps the data in memory after it has been loaded from disk, so that the dataset does not become a bottleneck during training.

`.prefetch()` overlaps data preprocessing and model execution while training.


In [41]:
# Buffer-size setting handed to prefetch().
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Add cache() followed by prefetch() to each of the three datasets.
train_ds, val_ds, test_ds = (
    ds.cache().prefetch(buffer_size=AUTOTUNE)
    for ds in (train_ds, val_ds, test_ds)
)

## Create Model

In [42]:
# Size of each embedding vector; passed to layers.Embedding when the model is built.
embedding_dim = 16

In [45]:
# Embedding -> pooling -> dense classifier, assembled one layer at a time.
model = tf.keras.Sequential()
model.add(layers.Embedding(max_features + 1, embedding_dim))  # token id -> embedding vector
model.add(layers.Dropout(0.2))  # regularization
model.add(layers.GlobalAveragePooling1D())  # average over the sequence dimension -> fixed-length vector per example
model.add(layers.Dropout(0.2))  # regularization
model.add(layers.Dense(1))  # one output

In [46]:
# Print the layers, their output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 16)          160016    
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [47]:
# Loss, metric and optimizer.
# from_logits=True tells the loss that the model output is a raw logit;
# the accuracy metric correspondingly thresholds at 0.0.
logit_loss = losses.BinaryCrossentropy(from_logits=True)
accuracy_at_zero = tf.metrics.BinaryAccuracy(threshold=0.0)

model.compile(loss=logit_loss, optimizer='adam', metrics=accuracy_at_zero)

In [48]:
# Train on train_ds for `epochs` passes, evaluating on val_ds after each one;
# the returned History object is kept in `history`.
epochs = 10
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [49]:
# Score the trained model on the test dataset.
test_scores = model.evaluate(test_ds)
loss, accuracy = test_scores

# Report both numbers, loss first.
for caption, value in (("Loss:", loss), ("Accuracy:", accuracy)):
    print(caption, value)

Loss: 0.30973944067955017
Accuracy: 0.8737599849700928
