In [1]:
import tensorflow as tf

In [2]:
import numpy as np

In [3]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  35.3M      0  0:00:02  0:00:02 --:--:-- 35.3M


In [4]:
# List the extracted archive: the top level first, then the test and train splits.
!ls aclImdb
!ls aclImdb/test
!ls aclImdb/train


imdbEr.txt  imdb.vocab	README	test  train
labeledBow.feat  neg  pos  urls_neg.txt  urls_pos.txt
labeledBow.feat  pos	unsupBow.feat  urls_pos.txt
neg		 unsup	urls_neg.txt   urls_unsup.txt


In [5]:
# Number of reviews per batch for every dataset built from the directories.
batch_size = 32

In [6]:
# Training subset: 80% of the files under aclImdb/train (validation_split=0.2).
# Class labels are inferred from the subdirectory names.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=1337,
)

Found 75000 files belonging to 3 classes.
Using 60000 files for training.


In [7]:
# Validation subset: the remaining 20% of aclImdb/train, selected with the
# same validation_split and seed as the training subset.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=1337,
)

Found 75000 files belonging to 3 classes.
Using 15000 files for validation.


In [8]:
# Test set: every file under aclImdb/test, with no validation split.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/test",
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.


In [9]:
# Report how many batches each raw dataset contains.
print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}")
print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}")
print(f"Number of batches in raw_test_ds: {raw_test_ds.cardinality()}")

Number of batches in raw_train_ds: 1875
Number of batches in raw_val_ds: 469
Number of batches in raw_test_ds: 782


In [10]:
# Preview the first five reviews and their labels from one training batch.
for text_batch, label_batch in raw_train_ds.take(1):
  # Convert each tensor to NumPy once instead of on every loop iteration.
  texts = text_batch.numpy()
  labels = label_batch.numpy()
  for i in range(5):
    print(texts[i])
    print(labels[i])

b'SPOILERS: We sit through ten minutes of AWFUL clich\xc3\xa9d dialog at the beginning from two completely unoriginal characters with bad twangs (ripped off from Kalifornia and Natural Born Killers - there isn\'t an original thing about these two) and you\'re going "either they\'re about to kill everyone in the diner or already have" and lo and behold guess what happens.<br /><br />I can\'t stand all the Tarantino wannabes out there and this guy is one of the worst. I got maybe 25-30 minutes into the thing when I just couldn\'t take it and stopped watching. Miner\'s really bad acting was unbearable - I couldn\'t take it. That, and the terrible script. After reading some of these comments I see there was a big twist - well guess what? No one cares. When you create completely uninteresting, unoriginal and unlikeable character like these two clich\xc3\xa9s, no one cares what big "twist" may have happened. I hope this is the end of these types of movies.'
2
b'This movie is horrible- in a \

In [11]:
from tensorflow.keras.layers import TextVectorization

In [12]:
import string

In [13]:
import re

In [14]:
def custom_standardization(input_data):
  """Lowercase the text, replace "<br />" tags with a space, strip punctuation.

  Args:
    input_data: raw text handed to the tf.strings ops (a string tensor).

  Returns:
    The text after the three clean-up steps, in that order.
  """
  # Regex character class matching any single ASCII punctuation character.
  punctuation_class = f"[{re.escape(string.punctuation)}]"
  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, "<br />", " ")
  text = tf.strings.regex_replace(text, punctuation_class, "")
  return text

In [15]:
# Vocabulary cap for TextVectorization; also the Embedding layer's input size.
max_features = 20000
# Length of each token's embedding vector.
embedding_dim = 128
# Fixed output length (in tokens) produced by the vectorization layer.
sequence_length = 500

In [16]:
# Text -> integer token ids, using the custom standardizer, a vocabulary capped
# at max_features, and a fixed output length of sequence_length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [17]:
# Text-only view of the training data: drop the label from each (text, label) pair.
text_ds = raw_train_ds.map(lambda text, _label: text)

In [18]:
# Display the dataset to confirm it now yields only string batches.
text_ds

<MapDataset shapes: (None,), types: tf.string>

In [19]:
# Fit the vectorization layer's vocabulary on the training text only.
vectorize_layer.adapt(text_ds)

In [20]:
def vectorize_text(text, label):
  """Convert one (text, label) element into (token ids, label).

  Args:
    text: text tensor from the raw dataset; a trailing axis is added before
      it is passed to vectorize_layer.
    label: label tensor, returned unchanged.

  Returns:
    A tuple of (vectorize_layer output, label).
  """
  text_column = tf.expand_dims(text, -1)
  token_ids = vectorize_layer(text_column)
  return token_ids, label

In [21]:
# Vectorize the data

In [22]:
# Convert every training batch from raw strings to integer token ids.
train_ds = raw_train_ds.map(vectorize_text)

In [23]:
# Convert every validation batch from raw strings to integer token ids.
val_ds = raw_val_ds.map(vectorize_text)

In [24]:
# Convert every test batch from raw strings to integer token ids.
test_ds = raw_test_ds.map(vectorize_text)

In [25]:
# Cache the datasets and prefetch asynchronously for best performance on GPU

In [26]:
# Cache the vectorized training batches and prefetch with a buffer of 10.
train_ds = train_ds.cache().prefetch(buffer_size=10)

In [27]:
# Cache the vectorized validation batches and prefetch with a buffer of 10.
val_ds = val_ds.cache().prefetch(buffer_size=10)

In [28]:
# Cache the vectorized test batches and prefetch with a buffer of 10.
test_ds = test_ds.cache().prefetch(buffer_size=10)

In [29]:
from tensorflow.keras import layers

In [30]:
# Model input: variable-length sequences of int64 values (the token ids).
inputs = tf.keras.Input(shape=(None,), dtype="int64")

In [31]:
# Map each token id (vocabulary of max_features) to an embedding_dim-sized vector.
x = layers.Embedding(max_features, embedding_dim)(inputs)

In [32]:
# Dropout with rate 0.5 applied to the embeddings.
x = layers.Dropout(0.5)(x)

In [33]:
# Two stacked Conv1D layers, each with 128 filters, kernel size 7, stride 3,
# "valid" padding and ReLU activation.
x = layers.Conv1D(128, 7, padding="valid", activation='relu', strides=3)(x)
x = layers.Conv1D(128, 7, padding='valid', activation='relu', strides=3)(x)

In [34]:
# Global max pooling over the sequence axis, leaving one value per filter.
x = layers.GlobalMaxPool1D()(x)

In [35]:
# Fully connected hidden layer with 128 ReLU units.
x = layers.Dense(128, activation='relu')(x)

In [36]:
# Dropout with rate 0.5 ahead of the output layer.
x = layers.Dropout(0.5)(x)

In [37]:
# Single sigmoid unit: outputs a value in (0, 1), so this head assumes binary (0/1) labels.
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

In [38]:
# Assemble the functional model from the input tensor to the sigmoid output.
model = tf.keras.Model(inputs, predictions)

In [39]:
# Binary cross-entropy pairs with the single sigmoid output; optimize with Adam and track accuracy.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [40]:
# Train for a fixed number of epochs, checking the validation set after each one.
epochs=3
model.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f25537c08d0>

In [41]:
# Evaluate on the test set; the result lists the loss followed by the compiled accuracy metric.
model.evaluate(test_ds)



[154091346984960.0, 0.5]