##### Copyright 2019 The TensorFlow Authors.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/tutorials/tensorflow_text/subwords_tokenizer"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/text/blob/master/examples/subwords_tokenizer.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/text/blob/master/examples/subwords_tokenizer.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/text/examples/subwords_tokenizer.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

The `tensorflow_text` package includes a TensorFlow implementation of a subword tokenizer in `text.BertTokenizer`. This tutorial demonstrates how to generate a subword vocabulary from a dataset, and build a `text.BertTokenizer` from the vocabulary.

The main advantage of a subword tokenizer is that it interpolates between word-based and character-based tokenization. Common words get a slot in the vocabulary, but the tokenizer can fall back to word pieces and individual characters for unknown words.

## Setup

In [None]:
!pip install -q tensorflow_datasets
!pip install -q tensorflow_text

In [None]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf

In [None]:
tf.get_logger().setLevel('ERROR')
pwd = pathlib.Path.cwd()

## Download the dataset

Fetch the Portuguese/English translation dataset from [tfds](https://tensorflow.org/datasets):

In [None]:
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
                               as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']  

This dataset produces Portuguese/English sentence pairs:

In [None]:
# Show the first sentence pair. `.numpy()` yields raw bytes here, so decode
# them as UTF-8 before printing.
for pt, en in train_examples.take(1):
  print("Portuguese: ", pt.numpy().decode('utf-8'))
  print("English:    ", en.numpy().decode('utf-8'))

Note a few things about the example sentences above:
* They're lower case.
* There are spaces around the punctuation.
* It's not clear if or what unicode normalization is being used.

## Text standardization

The standardization function below converts raw text to the same format as the dataset, and normalizes it using [Compatibility decomposition `NFKD`](https://unicode.org/reports/tr15/).

In [None]:
def standardize_text(sentences):
  """Converts raw text to the lower-case, space-separated dataset format.

  The steps are: lowercase, put spaces around ASCII punctuation, apply NFKD
  unicode normalization, collapse whitespace runs, and trim the ends.

  Args:
    sentences: UTF-8 encoded string(s); anything accepted by the
      `tf.strings` ops used below.

  Returns:
    The standardized string tensor.
  """
  # Make everything lowercase. The default `encoding=''` only lowercases
  # ASCII letters, so request UTF-8 to also handle accented capitals.
  sentences = tf.strings.lower(sentences, encoding='utf-8')
  # Insert spaces around each ASCII punctuation character (`\0` is the match).
  sentences = tf.strings.regex_replace(
      sentences, '[%s]' % re.escape(string.punctuation), r' \0 ')
  # Normalize unicode.
  sentences = text.normalize_utf8(sentences, 'NFKD')
  # Collapse each run of whitespace into a single space. Raw strings keep
  # `\s` from being parsed as an (invalid) Python escape sequence.
  sentences = tf.strings.regex_replace(sentences, r'\s+', ' ')
  # Drop the single space that may remain at the start or end.
  sentences = tf.strings.regex_replace(sentences, r'(^\s|\s$)', '')

  return sentences

In [None]:
example = "Da impressão,".encode('utf-8')

print("Before: ", example)
print("After:  ", standardize_text(example).numpy())

## Word counts

To generate the vocabulary for the `BertTokenizer` you'll need to extract the word-counts from the dataset for each language.

Use a Python [collections.Counter](https://docs.python.org/3/library/collections.html#collections.Counter) to do the counting:

In [None]:
def count_words(ds):
  """Counts how often each space-separated word occurs in `ds`.

  Args:
    ds: A dataset of sentences; it is batched and passed through
      `standardize_text` before the words are counted.

  Returns:
    A `collections.Counter` mapping each word (decoded to `str`) to its count.
  """
  byte_counts = collections.Counter()
  # Standardize in large batches for efficiency.
  batches = ds.batch(1024).map(standardize_text).prefetch(1)
  for batch in batches:
    # Splitting on spaces gives a RaggedTensor; `to_list` turns it into one
    # Python list of byte-string words per sentence.
    for words in tf.strings.split(batch, sep=" ").to_list():
      byte_counts.update(words)

  # The keys are still bytes at this point: decode them.
  decoded_counts = collections.Counter()
  for word, count in byte_counts.items():
    decoded_counts[word.decode('utf-8')] = count
  return decoded_counts

In [None]:
def select_pt(pt, en):
  return pt

counts = count_words(train_examples.map(select_pt))

Now you have word counts for the Portuguese part of the dataset:

In [None]:
# Summarize the number of distinct words and list the 20 most frequent ones.
print(f"Unique words: {len(counts)}\n")
print("Most common Portuguese words and counts:")
for word, count in counts.most_common(20):
  print(f"    {word}: {count}")

## Subwords vocabulary generation

The vocabulary generation code is not included in the `tensorflow_text` pip package. So clone the repository:

In [None]:
!git clone --depth 1 https://github.com/tensorflow/text/

The code of interest is in the `text/tools` directory:

In [None]:
!ls text/tools

So add the tools directory to the python path, so you can import modules from there:

In [None]:
if str(pwd/"text/tools") not in sys.path:
  sys.path.append(str(pwd/"text/tools"))

from wordpiece_vocab import wordpiece_tokenizer_learner_lib as vocab_learner

The `learner` takes a bunch of parameters as input.

The `upper_thresh` and `lower_thresh` parameters set the initial limits of the binary search as it searches for a vocabulary near the requested `vocab_size`.

In [None]:
default_params = dict(    
    upper_thresh=10000000,
    lower_thresh=10,
    num_iterations=4,
    max_input_tokens=5000000,
    max_token_length=50,
    max_unique_chars=1000,
    slack_ratio=0.05,
    include_joiner_token=True, 
    joiner="##")

reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]
params = vocab_learner.Params(
    vocab_size=8000, 
    reserved_tokens=reserved_tokens,
    **default_params)

This takes about 2 minutes.

In [None]:
%%time
pt_vocab = vocab_learner.learn(counts.items(), params)

Here are some slices of the resulting vocabulary.

In [None]:
print(pt_vocab[:10])
print(pt_vocab[100:110])
print(pt_vocab[1000:1010])
print(pt_vocab[-10:])

Write a vocabulary file:

In [None]:
def write_vocab_file(filepath, vocab):
  """Writes `vocab` to `filepath` as UTF-8 text, one token per line.

  Args:
    filepath: Path of the file to create or overwrite.
    vocab: Iterable of tokens; the n-th token is written to the n-th line.
  """
  # Be explicit about encoding and line endings: the vocabulary contains
  # non-ASCII tokens, so the platform defaults must not decide the bytes.
  with open(filepath, 'w', encoding='utf-8', newline='\n') as f:
    for token in vocab:
      print(token, file=f)

In [None]:
write_vocab_file('pt_vocab.txt', pt_vocab)

Now put that all together into a single function:

In [None]:
def gen_subwords_vocab(ds, filepath, vocab_size=8000):
  """Learns a subword vocabulary from `ds` and writes it to `filepath`.

  Args:
    ds: Dataset of sentences, passed to `count_words`.
    filepath: Where the vocabulary file is written.
    vocab_size: Vocabulary size requested from the learner. Defaults to
      8000, the value that used to be hard-coded.

  Returns:
    The vocabulary returned by `vocab_learner.learn`.
  """
  counts = count_words(ds)

  params = vocab_learner.Params(
    vocab_size=vocab_size,
    reserved_tokens=reserved_tokens,
    **default_params)

  vocab = vocab_learner.learn(counts.items(), params)
  write_vocab_file(filepath, vocab)
  return vocab

Use that function to generate a vocabulary from the English data:

In [15]:
%%time
def select_en(pt, en):
  return en

en_examples = train_examples.map(select_en)
en_vocab = gen_subwords_vocab(en_examples, 'en_vocab.txt')

NameError: ignored

In [None]:
print(en_vocab[:10])
print(en_vocab[100:110])
print(en_vocab[1000:1010])
print(en_vocab[-10:])

Here are the two vocabulary files:

In [16]:
!ls *.txt

ls: cannot access *.txt: No such file or directory


## Build the BertTokenizer

The `text.BertTokenizer` can be initialized by passing the vocabulary file's path as the first argument: 

In [None]:
pt_tokenizer = text.BertTokenizer('pt_vocab.txt')
en_tokenizer = text.BertTokenizer('en_vocab.txt')

Now you can use it to encode some text. Initially this returns a `tf.RaggedTensor` with axes `(batch, word, word-piece)`:

In [None]:
for pt_examples, en_examples in train_examples.batch(3).take(1):
  for ex in en_examples:
    print(ex.numpy())

In [None]:
enc = standardize_text(en_examples)
enc = en_tokenizer.tokenize(enc)

for ex in enc.to_list():
  print(ex)

Merge the `word` and `word-piece` axes into a single `word-piece` axis for the whole sentence, using `RaggedTensor.merge_dims`:

In [None]:
enc = enc.merge_dims(1,2)

for ex in enc.to_list():
  print(ex)

Add the `[START]` and `[END]` tokens (they're at the same indexes for both languages):

In [None]:
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
  """Wraps every row of `ragged` with the `START` and `END` token IDs.

  Args:
    ragged: A RaggedTensor of token IDs whose first axis is the batch.

  Returns:
    The input with one `START` column prepended and one `END` column
    appended along axis 1.
  """
  num_rows = ragged.bounding_shape()[0]
  # One `[num_rows, 1]` column per marker, concatenated around the tokens.
  column_shape = [num_rows, 1]
  return tf.concat(
      [tf.fill(column_shape, START), ragged, tf.fill(column_shape, END)],
      axis=1)

In [17]:
enc = add_start_end(enc)

for ex in enc.to_list():
  print(ex)

NameError: ignored

Many models can't handle `tf.ragged` directly but you can convert to a padded dense tensor using `RaggedTensor.to_tensor`:

In [None]:
enc.to_tensor()

Finally, put those steps together into a single module:

In [18]:
class TextEncoder(tf.Module):
  """Standardizes and tokenizes strings into sequences of token IDs."""

  def __init__(self, vocab):
    """Builds the tokenizer.

    Args:
      vocab: Vocabulary passed straight to `text.BertTokenizer`; this
        notebook passes the path of a vocabulary file.
    """
    # Let `tf.Module` initialize itself (name, name scope) before any
    # attributes are attached.
    super().__init__()
    self.tokenizer = text.BertTokenizer(vocab)

  @tf.function(input_signature=[
      tf.TensorSpec(shape=[None], dtype=tf.string)])
  def encode(self, strings):
    """Encodes a batch of strings.

    Args:
      strings: A 1-D string tensor.

    Returns:
      The output of `add_start_end` applied to the tokenized, standardized
      strings: one row of token IDs per input string.
    """
    enc = standardize_text(strings)
    enc = self.tokenizer.tokenize(enc)
    # Merge the word and word-piece axes into one axis per sentence.
    enc = enc.merge_dims(1,2)
    enc = add_start_end(enc)
    return enc

Build one of these text-encoders for each language:

In [None]:
encoders = tf.Module()
encoders.pt = TextEncoder("pt_vocab.txt")
encoders.en = TextEncoder("en_vocab.txt")

Try the encode function on the batch of English examples:

In [None]:
encoders.en.encode(en_examples)

Since `text.BertTokenizer` is entirely implemented in TensorFlow, it can be exported and restored as part of a `saved_model`:

In [None]:
tf.saved_model.save(encoders, 'encoders')

In [None]:
reloaded_encoders = tf.saved_model.load('encoders')
reloaded_encoders.en.encode(en_examples)

## Text decoding

Sometimes you may want to generate text, and need to convert back from token IDs to human-readable text.

For that you need an inverse vocabulary table. There are lots of ways you could implement this. If you stick with a pure TensorFlow implementation you can export the text decoding with your model too.

Since the IDs are dense, you can just pack the tokens into a string tensor, and use `tf.gather` to look them up. Otherwise see `tf.lookup.StaticHashTable`.

In [None]:
# Read the vocabulary back explicitly as UTF-8 instead of relying on the
# platform default encoding; line n of the file is the token with ID n.
en_vocab = pathlib.Path('en_vocab.txt').read_text(encoding='utf-8').splitlines()
en_vocab = tf.Variable(en_vocab)
# Look up the token string for every ID in `enc`.
tokens = tf.gather(en_vocab, enc)
for line in tokens.to_list():
  print(line)


With a little more processing, it can be made human readable again:

In [None]:
def decode_text(vocab, encoded):
  """Turns token IDs back into readable strings.

  Args:
    vocab: String tensor (or variable) holding the token for each ID.
    encoded: Token IDs to look up in `vocab`.

  Returns:
    One string per row: the row's tokens joined by spaces, with the reserved
    tokens other than "[UNK]" removed and "##" word pieces re-attached.
  """
  # Pattern matching every reserved token except "[UNK]".
  dropped_pattern = "|".join(
      re.escape(tok) for tok in reserved_tokens if tok != "[UNK]")

  # Look up the token string for each ID.
  tokens = tf.gather(vocab, encoded)

  # Keep only the cells that are not one of the dropped reserved tokens.
  keep = ~tf.strings.regex_full_match(tokens, dropped_pattern)
  tokens = tf.ragged.boolean_mask(tokens, keep)

  # Join each row into a single space-separated string.
  joined = tf.strings.reduce_join(tokens, separator=' ', axis=-1)

  # Glue the "##" continuation pieces back onto the preceding piece.
  return tf.strings.regex_replace(joined, ' ##', '')

In [None]:
for line in decode_text(en_vocab, enc):
  print(line.numpy().decode('utf-8'))

Putting it all together, you can build a 100% self contained encoder and decoder:

In [None]:
class TextConverter(TextEncoder):
  """A `TextEncoder` that can also map token IDs back to tokens and text."""

  def __init__(self, vocab):
    """Builds the tokenizer and loads the vocabulary for decoding.

    Args:
      vocab: Path of the vocabulary file, one token per line.
    """
    super().__init__(vocab)
    # Decode the file explicitly as UTF-8 rather than with the platform
    # default encoding; line n holds the token with ID n.
    vocab_lines = pathlib.Path(vocab).read_text(encoding='utf-8').splitlines()
    self.vocab = tf.Variable(vocab_lines)

    # Include signatures for both Tensor and RaggedTensor inputs.
    self.decode.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.decode.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.get_tokens.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.get_tokens.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

  @tf.function
  def decode(self, encoded):
    """Returns the result of `decode_text` for `encoded` token IDs."""
    return decode_text(self.vocab, encoded)

  @tf.function
  def get_tokens(self, ids):
    """Returns the vocabulary token for each ID in `ids`."""
    return tf.gather(self.vocab, ids)

Build an encoder/decoder for each language:

In [None]:
converters = tf.Module()
converters.pt = TextConverter('pt_vocab.txt')
converters.en = TextConverter('en_vocab.txt')

Export them as a `saved_model`:

In [19]:
model_name = 'ted_hrlr_translate_pt_en_converter'
tf.saved_model.save(converters, model_name)

NameError: ignored

Reload the `saved_model` and test the methods:

In [20]:
reloaded_converters = tf.saved_model.load(model_name)
enc = reloaded_converters.en.encode(['Hello TensorFlow!'])
enc.numpy()

array([[4014, 2364,  697, 1199, 2372,    4]])

In [None]:
tokens = reloaded_converters.en.get_tokens(enc)
tokens.numpy()

In [None]:
round_trip = reloaded_converters.en.decode(enc)

print(round_trip.numpy()[0].decode('utf-8'))

Archive it for the [translation tutorials](https://tensorflow.org/tutorials/text/transformer):

In [None]:
!zip -r {model_name}.zip {model_name}

In [None]:
!du -sh *.zip

## Optional: The algorithm

It's worth noting here that there are two versions of the WordPiece algorithm: Bottom-up and top-down. In both cases the goal is the same: "Given a training corpus and a number of desired
tokens D, the optimization problem is to select D wordpieces such that the resulting corpus is minimal in the
number of wordpieces when segmented according to the chosen wordpiece model."

The original [bottom-up WordPiece algorithm](https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf) is based on [byte-pair encoding](https://towardsdatascience.com/byte-pair-encoding-the-dark-horse-of-modern-nlp-eb36c7df4f10). Like BPE, it starts with the alphabet, and iteratively combines common bigrams to form word-pieces and words.

TensorFlow Text's vocabulary generator follows the top-down implementation from [BERT](https://arxiv.org/pdf/1810.04805.pdf). It starts with words and breaks them down into smaller components until they hit the frequency threshold, or can't be broken down further. The next section describes this in detail.

### Choosing the vocabulary

The top-down WordPiece generation algorithm takes in a set of (word, count) pairs and a threshold `T`, and returns a vocabulary `V`.

The algorithm is iterative. It is run for `k` iterations, where typically `k = 4`, but only the first two are really important. The third and fourth (and beyond) are just identical to the second. Note that each step of the binary search runs the algorithm from scratch for `k` iterations.

The iterations are described below:



#### First iteration

1.  Iterate over every word and count pair in the input, denoted as `(w, c)`.
2.  For each word `w`, generate every substring, denoted as `s`. E.g., for the
    word `human`, we generate `{h, hu, hum, huma,
    human, ##u, ##um, ##uma, ##uman, ##m, ##ma, ##man, ##a, ##an, ##n}`.
3.  Maintain a substring-to-count hash map, and increment the count of each `s`
    by `c`. E.g., if we have `(human, 113)` and `(humas, 3)` in our input, the
    count of `s = huma` will be `113+3=116`.
4.  Once we've collected the counts of every substring, iterate over the `(s,
    c)` pairs *starting with the longest `s` first*.
5.  Keep any `s` that has a `c > T`. E.g., if `T = 100` and we have `(pers,
    231); (dogs, 259); (##rint, 76)`, then we would keep `pers` and `dogs`.
6.  When an `s` is kept, subtract off its count from all of its prefixes. This
    is the reason for sorting all of the `s` by length in step 4. This is a
    critical part of the algorithm, because otherwise words would be double
    counted. For example, let's say that we've kept `human` and we get to
    `(huma, 116)`. We know that `113` of those `116` came from `human`, and `3`
    came from `humas`. However, now that `human` is in our vocabulary, we know
    we will never segment `human` into `huma ##n`. So once `human` has been
    kept, then `huma` only has an *effective* count of `3`.

This algorithm will generate a set of word pieces `s` (many of which will be
whole words `w`), which we *could* use as our WordPiece vocabulary.

However, there is a problem: This algorithm will severely overgenerate word
pieces. The reason is that we only subtract off counts of prefix tokens.
Therefore, if we keep the word `human`, we will subtract off the count for `h,
hu, hum, huma`, but not for `##u, ##um, ##uma, ##uman` and so on. So we might
generate both `human` and `##uman` as word pieces, even though `##uman` will
never be applied.

So why not subtract off the counts for every *substring*, not just every
*prefix*? Because then we could end up subtracting off the counts multiple
times. Let's say that we're processing `s` of length 5 and we keep both
`(##denia, 129)` and `(##eniab, 137)`, where `65` of those counts came from the
word `undeniable`. If we subtract off from *every* substring, we would subtract
`65` from the substring `##enia` twice, even though we should only subtract
once. However, if we only subtract off from prefixes, it will correctly only be
subtracted once.



#### Second (and third ...) iteration

To solve the overgeneration issue mentioned above, we perform multiple
iterations of the algorithm.

Subsequent iterations are identical to the first, with one important
distinction: In step 2, instead of considering *every* substring, we apply the
WordPiece tokenization algorithm using the vocabulary from the previous
iteration, and only consider substrings which *start* on a split point.

For example, let's say that we're performing step 2 of the algorithm and
encounter the word `undeniable`. In the first iteration, we would consider every
substring, e.g., `{u, un, und, ..., undeniable, ##n, ##nd, ..., ##ndeniable,
...}`.

Now, for the second iteration, we will only consider a subset of these. Let's
say that after the first iteration, the relevant word pieces are:

`un, ##deni, ##able, ##ndeni, ##iable`

The WordPiece algorithm will segment this into `un ##deni ##able` (see the
section [Applying WordPiece](#applying-wordpiece) for more information). In this
case, we will only consider substrings that *start* at a segmentation point. We
will still consider every possible *end* position. So during the second
iteration, the set of `s` for `undeniable` is:

`{u, un, und, unden, undeni, undenia, undeniab, undeniabl,
undeniable, ##d, ##de, ##den, ##deni, ##denia, ##deniab, ##deniabl
, ##deniable, ##a, ##ab, ##abl, ##able}`

The algorithm is otherwise identical. In this example, in the first iteration,
the algorithm produces the spurious tokens `##ndeni` and `##iable`. Now, these
tokens are never considered, so they will not be generated by the second
iteration. We perform several iterations just to make sure the results converge
(although there is no literal convergence guarantee).


### Applying WordPiece

Once a WordPiece vocabulary has been generated, we need to be able to apply it
to new data. The algorithm is a simple greedy longest-match-first application.

For example, consider segmenting the word `undeniable`.

We first lookup `undeniable` in our WordPiece dictionary, and if it's present,
we're done. If not, we decrement the end point by one character, and repeat,
e.g., `undeniabl`.

Eventually, we will either find a subtoken in our vocabulary, or get down to a
single character subtoken. (In general, we assume that every character is in our
vocabulary, although this might not be the case for rare Unicode characters. If
we encounter a rare Unicode character that's not in the vocabulary we simply map
the entire word to `<unk>`).

In this case, we find `un` in our vocabulary. So that's our first word piece.
Then we jump to the end of `un` and repeat the processing, e.g., try to find
`##deniable`, then `##deniabl`, etc. This is repeated until we've segmented the
entire word.

### Intuition

Intuitively, WordPiece tokenization is trying to satisfy two different
objectives:

1.  Tokenize the data into the *fewest* pieces possible. It is
    important to keep in mind that the WordPiece algorithm does not "want" to
    split words. Otherwise, it would just split every word into its characters,
    e.g., `human -> {h, ##u, ##m, ##a, ##n}`. This is one critical thing that
    makes WordPiece different from morphological splitters, which will split
    linguistic morphemes even for common words (e.g., `unwanted -> {un, want,
    ed}`).

2.  When a word does have to be split into pieces, split it into pieces that
    have maximal counts in the training data. For example, the reason why the
    word `undeniable` would be split into `{un, ##deni, ##able}` rather than
    alternatives like `{unde, ##niab, ##le}` is that the counts for `un` and
    `##able` in particular will be very high, since these are common prefixes
    and suffixes. Even though the count for `##le` must be higher than `##able`,
    the low counts of `unde` and `##niab` will make this a less "desirable"
    tokenization to the algorithm.

## Optional: tf.lookup

If you need access to, or more control over the vocabulary it's worth noting that you can build the lookup table yourself and pass that to `BertTokenizer`.

When you pass a string, `BertTokenizer` does the following:

In [None]:
pt_lookup = tf.lookup.StaticVocabularyTable(
    num_oov_buckets=1,
    initializer=tf.lookup.TextFileInitializer(
        filename='pt_vocab.txt',
        key_dtype=tf.string,
        key_index = tf.lookup.TextFileIndex.WHOLE_LINE,
        value_dtype = tf.int64,
        value_index=tf.lookup.TextFileIndex.LINE_NUMBER)) 
pt_tokenizer = text.BertTokenizer(pt_lookup)

Now you have direct access to the lookup table used in the tokenizer.

In [None]:
pt_lookup.lookup(tf.constant(['é', 'um', 'uma', 'para', 'não']))

You don't need to use a vocabulary file, `tf.lookup` has other initializer options. If you have the vocabulary in memory you can use `lookup.KeyValueTensorInitializer`:

In [None]:
pt_lookup = tf.lookup.StaticVocabularyTable(
    num_oov_buckets=1,
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=pt_vocab,
        values=tf.range(len(pt_vocab), dtype=tf.int64))) 
pt_tokenizer = text.BertTokenizer(pt_lookup)

## Optional: Command line script

Note that there is a script that allows you to run the wordpiece_vocab learner from the command line if you have a counts file.

The `text/tools/` directory needs to be in your python path for the script to work.

Here's the builtin help:

In [None]:
%%bash
export PYTHONPATH=$PYTHONPATH:$PWD/text/tools

python -m wordpiece_vocab.wordpiece_tokenizer_learner --help


There is a `wordpiece_tokenizer_learner.py` script that will take a counts file as input and produce a vocabulary file.

In [None]:
pt_counts = count_words(train_examples.map(lambda pt, en: pt))

In [None]:
def write_counts(counts, filepath):
  """Writes word counts to `filepath` as UTF-8, one `word count` pair per line.

  Args:
    counts: A `collections.Counter` of words; lines are written in
      `most_common()` order (highest count first).
    filepath: Path of the file to create or overwrite.
  """
  # The words contain non-ASCII characters, so fix the encoding (and line
  # ending) instead of depending on the platform defaults.
  with open(filepath, 'w', encoding='utf-8', newline='\n') as f:
    for word, count in counts.most_common():
      print(word, count, file=f)

In [None]:
write_counts(pt_counts, 'pt_counts.txt')

It takes about 2 minutes to generate a vocabulary:

In [None]:
%%bash
export PYTHONPATH=$PYTHONPATH:$PWD/text/tools

# Runs the vocabulary learner on the counts file and times it.
# NOTE: `--output_path pt_vocab.txt` overwrites the `pt_vocab.txt` written
# earlier in this notebook, and the reserved tokens passed here are not the
# same list as the notebook's `reserved_tokens`, so the resulting file may
# differ from the one generated through the Python API.
time python -m wordpiece_vocab.wordpiece_tokenizer_learner \
  --vocab_size 8000 \
  --num_pad_tokens 1 \
  --reserved_tokens "[START],[END]" \
  --input_path pt_counts.txt \
  --output_path pt_vocab.txt

In [None]:
!head pt_vocab.txt

In [None]:
!wc -l pt_vocab.txt