In [49]:
!pip install keras_nlp

In [50]:
import keras_nlp
import random
import tensorflow as tf
import os

from tensorflow import keras
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

# Seed the random number generators through Keras so that repeated runs of the
# notebook are comparable.
keras.utils.set_random_seed(42)

In [51]:
# --- Data pipeline / training settings ---
BATCH_SIZE = 64               # batch size used when the datasets are built
EPOCHS = 3                    # presumably the number of training epochs (used outside this section)
MAX_SEQUENCE_LENGTH = 512     # sequence_length handed to the WordPiece tokenizer
VOCAB_SIZE = 15_000           # target size requested when learning the vocabulary

# --- Model dimensions (consumed by cells outside this section) ---
EMBED_DIM = 128
INTERMEDIATE_DIM = 512

### Prepare Data

In [52]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz

--2022-12-17 08:47:00--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.2’


2022-12-17 08:47:02 (49.1 MB/s) - ‘aclImdb_v1.tar.gz.2’ saved [84125825/84125825]



In [53]:
# The extracted aclImdb/ directory contains train/ and test/ folders whose
# pos/ and neg/ subfolders hold the reviews as .txt files.
# Print one positive training review to see what the raw text looks like.
!cat aclImdb/train/pos/11826_10.txt

I never attended the midnight showing of a movie before "Dick Tracy" came out.<br /><br />I still have the "t-shirt ticket" I had to wear to get admitted to the showing around here somewhere and, like that shirt, "Dick Tracy" has stuck with me ever since.<br /><br />If you've seen the movie, the sharp visuals, bright primary colors and strong characters have no doubt been etched into your brain. It's a wonder to behold.<br /><br />As director/star/co-writer/producer, Beatty knows what works in a film and shows it here, taking a familiar American icon and re-creating him for a whole new era. Still set in the '30s, "Tracy" has a kind of timeless quality like all good films do. I've lost track of how many times I've watched "Tracy" and I still catch something new every time I do.<br /><br />The others are all top notch, starting with Pacino's Big Boy Caprice (a reminder that he can do comedy with the best of them), even Madonna's Breathless Mahoney is a relevation in that under the right 

In [54]:
# Show the contents of the dataset root and of both split folders.
for folder in ("./aclImdb", "./aclImdb/train", "./aclImdb/test"):
    print(os.listdir(folder))

['imdbEr.txt', 'test', 'train', 'imdb.vocab', 'README']
['urls_unsup.txt', 'pos', 'urls_pos.txt', 'labeledBow.feat', 'unsupBow.feat', 'urls_neg.txt', 'unsup', 'neg']
['pos', 'urls_pos.txt', 'labeledBow.feat', 'urls_neg.txt', 'neg']


In [55]:
# Drop the unlabelled reviews so that only the labelled pos/ and neg/ folders
# remain under aclImdb/train when the datasets are built from that directory.
!rm -rf aclImdb/train/unsup

### Define Dataset

In [None]:
# Build the datasets directly from the review folders.
# Training and validation are both carved out of aclImdb/train: the two calls
# share validation_split=0.2 and seed=42 and differ only in `subset`.
train_ds, val_ds = (
    keras.utils.text_dataset_from_directory(
        "aclImdb/train",
        batch_size=BATCH_SIZE,
        validation_split=0.2,
        subset=subset,
        seed=42,
    )
    for subset in ("training", "validation")
)

# The test split is loaded in full from its own folder.
test_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/test",
    batch_size=BATCH_SIZE,
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [None]:
# Class names found for the training folder.
# NOTE(review): presumably integer label i corresponds to class_names[i] — confirm.
train_ds.class_names

['neg', 'pos']

### Basic Text Normalisation

In [None]:
# Lower-case the review text of every split; labels pass through unchanged.
train_ds, val_ds, test_ds = (
    split.map(lambda text, label: (tf.strings.lower(text), label))
    for split in (train_ds, val_ds, test_ds)
)

In [57]:
# Take a single batch from the training split and show its first
# review together with the matching label.
for text_batch, label_batch in train_ds.take(1):
    print(f'text: {text_batch[0].numpy()}')
    print(f'label: {label_batch[0].numpy()}')

text: b"this movie is not as good as all think. the actors are lowlevel and the story is very comic-like. i respect fantasy but lord of the rings is fantasy...conan..is fantasy...this is just normal hk-lowprice-entertainment...why did they include this splatter-tongue, it makes everything worse. the only good thing is the cinematography and the cutter's job."
label: 0


### Tokenisation

The vocabulary below is built following the TensorFlow Text [subword tokeniser guide](https://www.tensorflow.org/text/guide/subwords_tokenizer).

In [59]:
'''

Generate Vocabulary for WordPieceTokenizer

'''

def train_word_piece(ds, vocab_size, reserved_tokens):
    """Learn a WordPiece vocabulary from the text of a labelled, batched dataset.

    Args:
        ds: Batched dataset yielding ``(text, label)`` pairs. It is unbatched
            and the labels are discarded before the vocabulary is learned.
        vocab_size: Target vocabulary size passed to the vocabulary builder.
        reserved_tokens: Tokens that must be included in the vocabulary.

    Returns:
        The result of ``bert_vocab.bert_vocab_from_dataset`` for the text.
    """
    # Keep only the text component of each (text, label) example.
    text_only_ds = ds.unbatch().map(lambda text, label: text)

    # Feed the text to the vocabulary builder in chunks of 1000 examples.
    return bert_vocab.bert_vocab_from_dataset(
        text_only_ds.batch(1000).prefetch(2),
        vocab_size=vocab_size,
        reserved_tokens=reserved_tokens,
        # Arguments forwarded to `text.BertTokenizer`.
        bert_tokenizer_params={"lower_case": True},
    )

# Tokens the learned vocabulary must contain.
reserved_tokens = ["[PAD]", "[UNK]"]

# One entry per batch: the text tensor of every (text, label) element of train_ds.
# NOTE(review): not used in the visible part of the notebook, and building it
# iterates the whole training split — confirm it is still needed.
train_sentences = [texts for texts, _labels in train_ds]

# Learn the vocabulary (a list of unique tokens) from the training split.
vocab = train_word_piece(train_ds, VOCAB_SIZE, reserved_tokens)

In [64]:
# Inspect a slice of the learned vocabulary and its total size.
# The learned vocabulary can come out smaller than the requested VOCAB_SIZE
# (compare the printed length with the target).
print("Tokens: ", vocab[100:110])
print(f"Vocabulary length: {len(vocab)}")

Tokens:  ['in', 'this', 'that', 'was', 'as', 'for', 'movie', 'with', 'but', 'film']
Vocabulary length: 14339


In [70]:
'''

WordPieceTokenizer (requires vocabulary list)

'''


# Wrap the learned vocabulary in a KerasNLP WordPiece tokenizer.
# lowercase=False because the datasets were already lower-cased with
# tf.strings.lower; sequence_length is set to MAX_SEQUENCE_LENGTH.
# NOTE(review): the vocabulary was learned with BertTokenizer lower_case=True,
# which may also strip accents — confirm the tokenizer settings match.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    sequence_length=MAX_SEQUENCE_LENGTH,
    lowercase=False,
)

# Sample tokenisation: take one batch, select its text component ([0]) and
# then the review at index 1 of that batch.
input_sentence_ex = train_ds.take(1).get_single_element()[0][1]
print(input_sentence_ex)

# Tokenise the sample review.
input_tokens_ex = tokenizer(input_sentence_ex)

# Show the text, its token ids and the round trip through detokenize.
# NOTE(review): the sentence is printed twice (above and here) — confirm both are wanted.
print("Sentence: ", input_sentence_ex)
print("Tokens: ", input_tokens_ex)
print("Recovered text after detokenizing: ", tokenizer.detokenize(input_tokens_ex))

tf.Tensor(b'ah, lucio fulci, rest in peace. this infamous italian is most<br /><br />famous for "zombie," and the absolutely unwatchable "the<br /><br />psychic" and "manhattan baby." well, add this to the unwatchable<br /><br />list.<br /><br />the plot, as it were, concerns a nekkid woman who wears a gold<br /><br />mask and a g-string. she wants the power of a young dubbed<br /><br />stud who has a set of magic arrows and a bow. they are magic<br /><br />because they glow. arrow boy teams up with a guy in a bad wig,<br /><br />and they spend most of the movie rescuing each other from flat<br /><br />action sequences. in the end, the nekkid chick is defeated, but not<br /><br />before taking the mask off and reminding me why i broke up with<br /><br />my high school girlfriend.<br /><br />fulci bathes every shot in an orange glow and fills the screen with<br /><br />smoke. nothing like a smoky orange action sequence to make you<br /><br />crave sunny delight and a cigarette. the spec