In [1]:
extensions_loaded = False
if not extensions_loaded:
    %load_ext autoreload
    %load_ext tensorboard
    extensions_loaded = True

%autoreload 2
%matplotlib inline

In [2]:
import sys
import os
import random
import collections

import so_ml_tools as soml

import tensorflow as tf
import tensorflow_text as tf_text
import tensorflow_datasets as tfds
import tensorflow_hub as hub

import pandas as pd

import opendatasets as od

from keras import Model
from keras import optimizers
from keras import layers
from keras import losses
from keras import utils



# StackOverfloat
## Download the StackOverflow Dataset

In [3]:
data_url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'
data_dir = './data'

soml.util.io.download_file(source=data_url, filepath="./data/stack_overflow_16k.tar.gz")
soml.util.io.extract_tgz(filepath='./data/stack_overflow_16k.tar.gz', folder='./data')

Download of https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz completed.
Extraction of ./data/stack_overflow_16k.tar.gz started.
Extraction of ./data/stack_overflow_16k.tar.gz completed.


## Inspect StackOverflow Dataset

In [4]:
soml.util.io.list_dir_summary(data_dir)

There are 2 directories, and 2 in ./data
There are 4 directories, and 0 in ./data/test
There are 0 directories, and 2000 in ./data/test/python
There are 0 directories, and 2000 in ./data/test/java
There are 0 directories, and 2000 in ./data/test/javascript
There are 0 directories, and 2000 in ./data/test/csharp
There are 4 directories, and 0 in ./data/train
There are 0 directories, and 2000 in ./data/train/python
There are 0 directories, and 2000 in ./data/train/java
There are 0 directories, and 2000 in ./data/train/javascript
There are 0 directories, and 2000 in ./data/train/csharp


In [5]:
train_dir = f"{data_dir}/train"
test_dir = f"{data_dir}/test"

# Show contents of a random file
random_file = random.choice(os.listdir(f"{train_dir}/java"))
with open(f"{data_dir}/train/java/{random_file}") as f:
  print(f.read())

"how do i use a command on multiple objects? i would like to use button1.settext(""test""); multiple times without repeating the .settext(""test""); for every button. something like this button1, button2, button3.settext(""test"");..is this a normal question or am i just lazy...(sorry for my english)"



## Load StackOverflow Dataset

In [6]:
# Create the training and validation datasets, whereas the training set is 80% of the data.

batch_size = 32
seed = 42

raw_train_ds, raw_validation_ds  = utils.text_dataset_from_directory(
    directory=train_dir,
    batch_size=batch_size,
    seed=seed,
    validation_split=0.2,
    subset='both' # return both the train and validation set.
)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.
Using 1600 files for validation.


In [7]:
# Sample the training set

for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(10):
        print(f"Label: {label_batch.numpy()[i]}")
        print(f"Question: {text_batch.numpy()[i]}")
        print("----")

Label: 1
Question: b'"my tester is going to the wrong constructor i am new to programming so if i ask a question that can be easily fixed, please forgive me. my program has a tester class with a main. when i send that to my regularpolygon class, it sends it to the wrong constructor. i have two constructors. 1 without perameters..public regularpolygon().    {.       mynumsides = 5;.       mysidelength = 30;.    }//end default constructor...and my second, with perameters. ..public regularpolygon(int numsides, double sidelength).    {.        mynumsides = numsides;.        mysidelength = sidelength;.    }// end constructor...in my tester class i have these two lines:..regularpolygon shape = new regularpolygon(numsides, sidelength);.        shape.menu();...numsides and sidelength were declared and initialized earlier in the testing class...so what i want to happen, is the tester class sends numsides and sidelength to the second constructor and use it in that class. but it only uses the def

In [8]:
# Get the label information
class_names = raw_train_ds.class_names
for i, class_name in enumerate(class_names):
    print(f"Label {i} corresponds {class_name}")

Label 0 corresponds csharp
Label 1 corresponds java
Label 2 corresponds javascript
Label 3 corresponds python


In [9]:
# Load the test data
raw_test_ds = utils.text_dataset_from_directory(
    directory=test_dir,
    batch_size=batch_size
)

Found 8000 files belonging to 4 classes.


## Prepare the StackOverflow dataset for training

In this step we will:

1. Standardize - Involves removing punctuation, HTML elements, etc...
2. Tokenize - Splitting strings up into individual words (or characters, ngrams)
3. Vectorize - Convert words into numbers

### Build 'binary' vectorization mode to build a bag-of-words model.

In [10]:
VOCAB_SIZE = 10_000 # Maximum number of words in our vocabulary.
binary_vectorize_layer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    output_mode="binary"
)

### build 'int' mode with a 1D ConvNet.

In [11]:
MAX_SEQUENCE_LENGTH = 250 # explicit maximum sequence length, which will cause the layer to pad or truncate sequences to exactly output_sequence_length

int_vectorize_layer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    output_mode="int",
    output_sequence_length=MAX_SEQUENCE_LENGTH
)

In [12]:
# Adapt the text vectorizers to the data (build up the vocabulary)
# IMPORTANT: Only adapt on training data and not the test-data. Otherwise it would leak test data into the training data.
train_text = raw_train_ds.map(lambda text, labels: text)
binary_vectorize_layer.adapt(train_text)
int_vectorize_layer.adapt(train_text)

In [13]:
# Print the results of using these layers to preprocess data:
def binary_vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return binary_vectorize_layer(text), label

def int_vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return int_vectorize_layer(text), label

In [14]:
text_batch, label_batch = next(iter(raw_train_ds))
first_question, first_label = text_batch[0], label_batch[0]
print(f"Label: {first_label}")
print(f"Question: {first_question}")

Label: 2
Question: b'"what is the difference between these two ways to create an element? var a = document.createelement(\'div\');..a.id = ""mydiv"";...and..var a = document.createelement(\'div\').id = ""mydiv"";...what is the difference between them such that the first one works and the second one doesn\'t?"\n'


In [15]:
# Show the output created when using 'binary' (bag-of-words) model. As you can see we get a total of 10.000 values for the sentence.
print("'binary' vectorized question:", binary_vectorize_text(first_question, first_label)[0])

'binary' vectorized question: tf.Tensor([[1. 1. 0. ... 0. 0. 0.]], shape=(1, 10000), dtype=float32)


In [16]:
# Show the output created using 'int' model.
_ = int_vectorize_text(first_question, first_label)[0]
print("'int' vectorized question:", _)

'int' vectorized question: tf.Tensor(
[[ 55   6   2 410 211 229 121 895   4 124  32 245  43   5   1   1   5   1
    1   6   2 410 211 191 318  14   2  98  71 188   8   2 199  71 178   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0


## Re-create the text using the 'int' TextVectorizer

Some things to point out:

- Punctuation and markup (HTML for example) are all gone
- [UNK] are used to donate words that are not part of the vocabulary, for example the word 'regularpolygon' is not part of the vocabulary.
- Some weird words are in the text for example 'systemoutprintlnwelcome' which actually was code: 'System.out.println("Welcome").
- The length of the text has been cut-off at 250 words or when the text was shorter the tensor was filled out to a maximum of 250 words. This was part of the configuration of the 'int' TextVectorizer.

In [17]:
# Re-create the text from the TextVectorizer
" ".join([int_vectorize_layer.get_vocabulary()[val] for val in tf.squeeze(_)])

'what is the difference between these two ways to create an element var a [UNK] [UNK] a [UNK] [UNK] is the difference between them such that the first one works and the second one doesnt                                                                                                                                                                                                                       '

In [18]:
# Apply both TextVectorizers on the training, validation and test data.
binary_train_ds = raw_train_ds.map(binary_vectorize_text)
binary_validation_ds = raw_validation_ds.map(binary_vectorize_text)
binary_test_ds = raw_test_ds.map(binary_vectorize_text)

int_train_ds = raw_train_ds.map(int_vectorize_text)
int_validation_ds = raw_validation_ds.map(int_vectorize_text)
int_test_ds = raw_test_ds.map(int_vectorize_text)

## Configure the dataset for performance

These are two important methods you should use when loading data to make sure that I/O does not become blocking.

1) Dataset.cache keeps data in memory after it's loaded off disk. This will ensure the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, which is more efficient to read than many small files.
2) Dataset.prefetch overlaps data preprocessing and model execution while training. You can learn more about both methods, as well as how to cache data to disk in the Prefetching section of the Better performance with the tf.data API guide.

In [19]:
AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
    return dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [20]:
# Confiure all datasets to use caching with autotune buffer size
binary_train_ds = configure_dataset(binary_train_ds)
binary_validation_ds = configure_dataset(binary_validation_ds)
binary_test_ds = configure_dataset(binary_test_ds)

int_train_ds = configure_dataset(int_train_ds)
int_validation_ds = configure_dataset(int_validation_ds)
int_test_ds = configure_dataset(int_test_ds)

## Train the model

In [21]:
binary_model = tf.keras.Sequential([
    layers.Dense(units=4)
])

# from_logits = True means that the last layer in the model does not have a softmax function (probability between 0 and 1 distributed over all the possible outcomes)
# from_logits = False means that the last layer in the model does have a softmax function (probability between 0 and 1 distributed over all the possible outcomes)
binary_model.compile(loss=losses.SparseCategoricalCrossentropy(from_logits=True),
                     optimizer=optimizers.Adam(),
                     metrics=["accuracy"])

history = binary_model.fit(binary_train_ds, validation_data=binary_validation_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
def create_model(vocab_size, num_labels):
    model = tf.keras.Sequential([
        layers.Embedding(input_dim=vocab_size, output_dim=64, mask_zero=True),
        layers.Conv1D(filters=64, kernel_size=5, padding='valid', activation='relu', strides=2),
        layers.GlobalMaxPooling1D(),
        layers.Dense(units=num_labels)
    ])
    return model

In [23]:
# `vocab_size` is `VOCAB_SIZE + 1` since `0` is used additionally for padding.
int_model = create_model(vocab_size=VOCAB_SIZE + 1, num_labels=len(class_names))
int_model.compile(loss=losses.SparseCategoricalCrossentropy(from_logits=True),
                  optimizer=optimizers.Adam(),
                  metrics=["accuracy"])
history = int_model.fit(int_train_ds, validation_data=int_validation_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Compare the two models

In [24]:
print("Linear model on binary vectorized data:")
print(binary_model.summary())

Linear model on binary vectorized data:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 4)                 40004     
                                                                 
Total params: 40004 (156.27 KB)
Trainable params: 40004 (156.27 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [25]:
print("ConvNet model on int vectorized data:")
print(int_model.summary())

ConvNet model on int vectorized data:
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          640064    
                                                                 
 conv1d (Conv1D)             (None, None, 64)          20544     
                                                                 
 global_max_pooling1d (Glob  (None, 64)                0         
 alMaxPooling1D)                                                 
                                                                 
 dense_1 (Dense)             (None, 4)                 260       
                                                                 
Total params: 660868 (2.52 MB)
Trainable params: 660868 (2.52 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


## Evaluate both models

In [26]:
binary_loss, binary_accuracy = binary_model.evaluate(binary_test_ds)
int_loss, int_accuracy = int_model.evaluate(int_test_ds)

print(f"Binary model accuracy: {binary_accuracy:2.2%}")
print(f"Int model accuracy: {int_accuracy:2.2%}")

Binary model accuracy: 81.45%
Int model accuracy: 80.84%


## Export the model

There is a performance difference to keep in mind when choosing where to apply tf.keras.layers.TextVectorization.

Using it outside of your model enables you to do asynchronous CPU processing and buffering of your data when training on GPU.

So, if you're training your model on the GPU, you probably want to go with this option to get the best performance while developing your model, then switch to including the TextVectorization layer inside your model when you're ready to prepare for deployment.

In [27]:
export_model = tf.keras.Sequential(
    [binary_vectorize_layer, binary_model,
     layers.Activation('sigmoid')])

export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=['accuracy'])

# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy = export_model.evaluate(raw_test_ds)
print("Accuracy: {accuracy:2.2%}")

Accuracy: {accuracy:2.2%}


In [28]:
def get_string_labels(predicted_scores_batch):
  predicted_int_labels = tf.math.argmax(predicted_scores_batch, axis=1)
  predicted_labels = tf.gather(raw_train_ds.class_names, predicted_int_labels)
  return predicted_labels

## Run inference on new data

In [29]:
inputs = [
    "how do I extract keys from a dict into a list?",  # 'python'
    "debug public static void main(string[] args) {...}",  # 'java'
]

predicted_scores = export_model.predict(inputs)
predicted_labels = get_string_labels(predicted_scores)

for input, label in zip(inputs, predicted_labels):
  print("Question: ", input)
  print("Predicted label: ", label.numpy())

Question:  how do I extract keys from a dict into a list?
Predicted label:  b'python'
Question:  debug public static void main(string[] args) {...}
Predicted label:  b'java'


In [30]:
predicted_scores

array([[0.3774232 , 0.25327504, 0.21751395, 0.79875696],
       [0.7013015 , 0.78789496, 0.09141721, 0.09403575]], dtype=float32)

# Example 2: Predict the author of Iliad translations

In [31]:
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

for name in FILE_NAMES:
    soml.util.io.download_file(f"{DIRECTORY_URL}{name}", f'./data2/{name}')

soml.util.io.list_dir_summary('./data2')

Download of https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt completed.
Download of https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt completed.
Download of https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt completed.
There are 0 directories, and 3 in ./data2


## Load the dataset

In [32]:
def labeler(example, index):
    return example, tf.cast(index, tf.int64)

In [33]:
labeled_data_sets = []

for i, filename in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(filenames=f"./data2/{filename}")
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)

## Next, combine these labeled datasets into a single dataset using Dataset.concatenate, and shuffle it with Dataset.shuffle

In [34]:
BUFFER_SIZE = 50_000
BATCH_SIZE = 64
VALIDATION_SIZE = 5000

In [35]:
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
  all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

all_labeled_data = all_labeled_data.shuffle(BUFFER_SIZE, reshuffle_each_iteration=False)

## Inspect some examples

In [36]:
for text, label in all_labeled_data.take(10):
  print("Sentence: ", text.numpy())
  print("Label:", label.numpy())

Sentence:  b'He said, and at his word instant arose'
Label: 0
Sentence:  b'Hear ye the words I speak, for they are true:'
Label: 1
Sentence:  b'scales and placed a doom in each of them, one for Achilles and the'
Label: 2
Sentence:  b"Well pleas'd, the monarch Agamemnon saw,"
Label: 1
Sentence:  b'bring him into the city of Priam, the Argives would readily give up the'
Label: 2
Sentence:  b'my mouth," said he, "Trojans and Achaeans, the saying of Alexandrus,'
Label: 2
Sentence:  b'cloister of the outer court and in the inner court at the doors of the'
Label: 2
Sentence:  b'For Menelaus, Atreus son spear-famed,'
Label: 0
Sentence:  b'armour. Thrice did he spring forward with might and main to slay him,'
Label: 2
Sentence:  b'He said, and from his chariot to the plain'
Label: 0


## Prepare the dataset for training (https://www.tensorflow.org/tutorials/load_data/text#prepare_the_dataset_for_training_2)

In [37]:
tokenizer = tf_text.UnicodeScriptTokenizer()

In [38]:
def tokenize(text, unused_label):
  lower_case = tf_text.case_fold_utf8(text)
  return tokenizer.tokenize(lower_case)

In [39]:
tokenized_ds = all_labeled_data.map(tokenize)

## Inspect some samples after the tokenization

In [40]:
for text_batch in tokenized_ds.take(5):
  print("Tokens: ", text_batch.numpy())

Tokens:  [b'he' b'said' b',' b'and' b'at' b'his' b'word' b'instant' b'arose']
Tokens:  [b'hear' b'ye' b'the' b'words' b'i' b'speak' b',' b'for' b'they' b'are'
 b'true' b':']
Tokens:  [b'scales' b'and' b'placed' b'a' b'doom' b'in' b'each' b'of' b'them' b','
 b'one' b'for' b'achilles' b'and' b'the']
Tokens:  [b'well' b'pleas' b"'" b'd' b',' b'the' b'monarch' b'agamemnon' b'saw'
 b',']
Tokens:  [b'bring' b'him' b'into' b'the' b'city' b'of' b'priam' b',' b'the'
 b'argives' b'would' b'readily' b'give' b'up' b'the']


## Build a vocabulary by sorting tokens by frequency and keeping the top VOCAB_SIZE tokens

In [41]:
tokenized_ds = configure_dataset(tokenized_ds)

vocab_dict = collections.defaultdict(lambda: 0)
for toks in tokenized_ds.as_numpy_iterator():
  for tok in toks:
    vocab_dict[tok] += 1

# Code above results in a dictionary of 14.262 tokens, now sort the dictionaries
# based on the number of occurances column.

vocab = sorted(vocab_dict.items(), key=lambda x: x[1], reverse=True)
vocab = [token for token, count in vocab]
vocab = vocab[:VOCAB_SIZE]
vocab_size = len(vocab)
print("Vocab size: ", vocab_size)
print("First five vocab entries:", vocab[:5])

Vocab size:  10000
First five vocab entries: [b',', b'the', b'and', b"'", b'of']


## Convert the tokens into integers.

Use the vocab set to create a tf.lookup.StaticVocabularyTable. You will map tokens to integers in the range [2, vocab_size + 2]. As with the TextVectorization layer, 0 is reserved to denote padding and 1 is reserved to denote an out-of-vocabulary (OOV) token.

In [42]:
keys = vocab
values = range(2, len(vocab) + 2)  # Reserve `0` for padding, `1` for OOV tokens.

init = tf.lookup.KeyValueTensorInitializer(
    keys, values, key_dtype=tf.string, value_dtype=tf.int64)

num_oov_buckets = 1 # num_oov_buckets = number of buvkets to use for out-of-vocabulary keys.
vocab_table = tf.lookup.StaticVocabularyTable(initializer=init, num_oov_buckets=num_oov_buckets)

## Define a function to standardize, tokenize and vectorize the dataset using the tokenizer and lookup table

In [43]:
def preprocess_text(text: str, label: str):
  standardized = tf_text.case_fold_utf8(text)
  tokenized = tokenizer.tokenize(standardized)
  vectorized = vocab_table.lookup(tokenized)
  return vectorized, label

In [44]:
example_text, example_label = next(iter(all_labeled_data))
print("Sentence: ", example_text.numpy())
vectorized_text, example_label = preprocess_text(example_text, example_label)
print("Vectorized sentence: ", vectorized_text.numpy())

Sentence:  b'He said, and at his word instant arose'
Vectorized sentence:  [  12   83    2    4   34   11  567 1102  658]


## Run the preprocess function on the dataset using Dataset.map

In [45]:
all_encoded_data = all_labeled_data.map(preprocess_text)

## Split the dataset into training and test sets

The Keras TextVectorization layer also batches and pads the vectorized data. Padding is required because the examples inside of a batch need to be the same size and shape, but the examples in these datasets are not all the same size—each line of text has a different number of words.

tf.data.Dataset supports splitting and padded-batching datasets:

In [46]:
train_data = all_encoded_data.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE)
validation_data = all_encoded_data.take(VALIDATION_SIZE)

In [47]:
train_data = train_data.padded_batch(BATCH_SIZE)
validation_data = validation_data.padded_batch(BATCH_SIZE)

## Inspect train_data and validation_data

validation_data and train_data are not collections of (example, label) pairs, but collections of batches. Each batch is a pair of (many examples, many labels) represented as arrays.

In [48]:
sample_text, sample_labels = next(iter(validation_data))
print("Text batch shape: ", sample_text.shape)
print("Label batch shape: ", sample_labels.shape)
print("First text example: ", sample_text[0])
print("First label example: ", sample_labels[0])

Text batch shape:  (64, 17)
Label batch shape:  (64,)
First text example:  tf.Tensor(
[  12   83    2    4   34   11  567 1102  658    0    0    0    0    0
    0    0    0], shape=(17,), dtype=int64)
First label example:  tf.Tensor(0, shape=(), dtype=int64)


## Adjust vocab_size due to padding.

Since you use 0 for padding and 1 for out-of-vocabulary (OOV) tokens, the vocabulary size has increased by two

In [49]:
vocab_size += 2

## Add prefetch and caching to the datasets (function from Example 1)

In [50]:
train_data = configure_dataset(train_data)
validation_data = configure_dataset(validation_data)

## Train the model

In [51]:
model = create_model(vocab_size=vocab_size, num_labels=len(FILE_NAMES))

model.compile(loss=losses.SparseCategoricalCrossentropy(),
              optimizer=optimizers.Adam(),
              metrics=["accuracy"])

history = model.fit(train_data, validation_data=validation_data, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [52]:
loss, accuracy = model.evaluate(validation_data)

print(f"Loss: {loss}")
print(f"Accuracy: {accuracy:2.2%}")

Loss: 1.0986120700836182
Accuracy: 27.90%


## Export the model

To make the model capable of taking raw strings as input, you will create a Keras TextVectorization layer that performs the same steps as your custom preprocessing function. Since you have already trained a vocabulary, you can use TextVectorization.set_vocabulary (instead of TextVectorization.adapt), which trains a new vocabulary.

In [53]:
preprocess_layer = layers.TextVectorization(
    max_tokens=vocab_size,
    standardize=tf_text.case_fold_utf8,
    split=tokenizer.tokenize,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)

preprocess_layer.set_vocabulary(vocab)

  return bool(asarray(a1 == a2).all())
  if self.mask_token is not None and self.mask_token in tokens:


In [54]:
export_model = tf.keras.Sequential(
    [preprocess_layer, model,
     layers.Activation('sigmoid')])

export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=optimizers.Adam(),
    metrics=['accuracy'])

In [55]:
# Create a test dataset of raw strings.
test_ds = all_labeled_data.take(VALIDATION_SIZE).batch(BATCH_SIZE)
test_ds = configure_dataset(test_ds)

loss, accuracy = export_model.evaluate(test_ds)

print(f"Loss: {loss}", loss)
print(f"Accuracy: {accuracy:2.2%}")

Loss: 1.1022253036499023 1.1022253036499023
Accuracy: 26.98%


## Run inference on new data

In [56]:
inputs = [
    "Join'd to th' Ionians with their flowing robes,",  # Label: 1
    "the allies, and his armour flashed about him so that he seemed to all",  # Label: 2
    "And with loud clangor of his arms he fell.",  # Label: 0
]

predicted_scores = export_model.predict(inputs)
predicted_labels = tf.math.argmax(predicted_scores, axis=1)

for input, label in zip(inputs, predicted_labels):
  print("Question: ", input)
  print("Predicted label: ", label.numpy())

Question:  Join'd to th' Ionians with their flowing robes,
Predicted label:  2
Question:  the allies, and his armour flashed about him so that he seemed to all
Predicted label:  2
Question:  And with loud clangor of his arms he fell.
Predicted label:  2


# Download more datasets using TensorFlow Datasets (TFDS, IMDB)

You can download many more datasets from TensorFlow Datasets.

In this example, we will use the IMDB Large Movie Review dataset to train a model for sentiment classification

In [57]:
# Training set.
train_ds = tfds.load(
    'imdb_reviews',
    split='train[:80%]',
    batch_size=BATCH_SIZE,
    shuffle_files=True,
    as_supervised=True)

2023-09-20 09:12:56.895516: W tensorflow/tsl/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata.google.internal".


[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /home/sodeso/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /home/sodeso/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteEET2L7/imdb_reviews-train.t…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /home/sodeso/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteEET2L7/imdb_reviews-test.tf…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /home/sodeso/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteEET2L7/imdb_reviews-unsuper…

[1mDataset imdb_reviews downloaded and prepared to /home/sodeso/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [58]:
# Validation set.
val_ds = tfds.load(
    'imdb_reviews',
    split='train[80%:]',
    batch_size=BATCH_SIZE,
    shuffle_files=True,
    as_supervised=True)

In [59]:
for review_batch, label_batch in val_ds.take(1):
  for i in range(5):
    print("Review: ", review_batch[i].numpy())
    print("Label: ", label_batch[i].numpy())

Review:  b"Instead, go to the zoo, buy some peanuts and feed 'em to the monkeys. Monkeys are funny. People with amnesia who don't say much, just sit there with vacant eyes are not all that funny.<br /><br />Black comedy? There isn't a black person in it, and there isn't one funny thing in it either.<br /><br />Walmart buys these things up somehow and puts them on their dollar rack. It's labeled Unrated. I think they took out the topless scene. They may have taken out other stuff too, who knows? All we know is that whatever they took out, isn't there any more.<br /><br />The acting seemed OK to me. There's a lot of unfathomables tho. It's supposed to be a city? It's supposed to be a big lake? If it's so hot in the church people are fanning themselves, why are they all wearing coats?"
Label:  0
Review:  b'Well, was Morgan Freeman any more unusual as God than George Burns? This film sure was better than that bore, "Oh, God". I was totally engrossed and LMAO all the way through. Carrey was

2023-09-20 09:23:56.497124: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Prepare the dataset for training

In [60]:
vectorize_layer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)

# Make a text-only dataset (without labels), then call `TextVectorization.adapt`.
train_text = train_ds.map(lambda text, labels: text)
vectorize_layer.adapt(train_text)

In [61]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [62]:
train_ds = train_ds.map(vectorize_text)
val_ds = val_ds.map(vectorize_text)

In [63]:
# Configure datasets for performance as before (caching and prefetching)
train_ds = configure_dataset(train_ds)
val_ds = configure_dataset(val_ds)

In [64]:
model = create_model(vocab_size=VOCAB_SIZE + 1, num_labels=1)
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 64)          640064    
                                                                 
 conv1d_2 (Conv1D)           (None, None, 64)          20544     
                                                                 
 global_max_pooling1d_2 (Gl  (None, 64)                0         
 obalMaxPooling1D)                                               
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 660673 (2.52 MB)
Trainable params: 660673 (2.52 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [65]:
model.compile(
    loss=losses.BinaryCrossentropy(from_logits=True), # from_logits is True since we don't have an activation function on last layer.
    optimizer=optimizers.Adam(),
    metrics=['accuracy'])

In [66]:
history = model.fit(train_ds, validation_data=val_ds, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [67]:
loss, accuracy = model.evaluate(val_ds)

print(f"Loss: {loss}", loss)
print(f"Accuracy: {accuracy:2.2%}")

Loss: 0.3268670439720154 0.3268670439720154
Accuracy: 86.22%


In [68]:
export_model = tf.keras.Sequential(
    [vectorize_layer, model,
     layers.Activation('sigmoid')])

export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=False), # from_logits = False since we have sigmoid activation as last layer.
    optimizer=optimizers.Adam(),
    metrics=['accuracy'])

In [69]:
# 0 --> negative review
# 1 --> positive review
inputs = [
    "This is a fantastic movie.", # 1
    "This is a bad movie.", # 0
    "This movie was so bad that it was good.", # 1
    "I will never say yes to watching this movie.", # 0
]

predicted_scores = export_model.predict(inputs)
predicted_labels = [int(round(x[0])) for x in predicted_scores]

for input, label in zip(inputs, predicted_labels):
  print("Question: ", input)
  print("Predicted label: ", label)

Question:  This is a fantastic movie.
Predicted label:  1
Question:  This is a bad movie.
Predicted label:  0
Question:  This movie was so bad that it was good.
Predicted label:  0
Question:  I will never say yes to watching this movie.
Predicted label:  1
