In [84]:
import logging
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pyarrow.parquet as pa

import tensorflow as tf
import tensorflow.keras as k
import ShapeChecker
import einops
import tensorflow_text as text
import tensorflow_datasets as tfds
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [85]:
tf.get_logger().setLevel('ERROR')
pwd = pathlib.Path.cwd()

In [86]:
# Read the to_swedish parquet file and keep only its first 350,000 rows.
table = pa.read_table('./data/to_swedish.parquet')
df = table.to_pandas().iloc[:350000, :]
len(df)

350000

In [87]:
# Read the to_spanish parquet file and keep only its first 350,000 rows.
table_val = pa.read_table('./data/to_spanish.parquet')
df_val = table_val.to_pandas().iloc[:350000, :]
len(df_val)

350000

In [88]:
# Split each `translation` record into its 'en' and 'sv' strings.
# dtype=object stores the Python strings directly. Without it NumPy builds a
# fixed-width unicode array in which every entry is padded to the length of
# the longest sentence (4 bytes per character), which is wasteful for a
# corpus of this size with a few very long sentences.
en_train_ds = np.array([row['en'] for row in df['translation']], dtype=object)
sv_train_ds = np.array([row['sv'] for row in df['translation']], dtype=object)

In [89]:
# Split each `translation` record into its 'en' and 'es' strings.
# dtype=object stores the Python strings directly. Without it NumPy builds a
# fixed-width unicode array in which every entry is padded to the length of
# the longest sentence (4 bytes per character).
en_val_ds = np.array([row['en'] for row in df_val['translation']], dtype=object)
es_val_ds = np.array([row['es'] for row in df_val['translation']], dtype=object)

In [90]:
# Datasets of (english, swedish) and (english, spanish) sentence pairs,
# one pair per element.
train_raw = tf.data.Dataset.from_tensor_slices((en_train_ds, sv_train_ds))
val_raw = tf.data.Dataset.from_tensor_slices((en_val_ds, es_val_ds))

In [91]:
# Single-language views of the paired datasets (first / second element).
train_en = train_raw.map(lambda first, second: first)
train_sv = train_raw.map(lambda first, second: second)
# Note: despite its name, the Spanish text is taken from `val_raw`.
train_es = val_raw.map(lambda first, second: second)

In [92]:
# Keyword arguments shared by every `text.BertTokenizer` in this notebook.
bert_tokenizer_params = {"lower_case": True}
# Special tokens that every learned vocabulary must contain.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = {
    # The target vocabulary size
    "vocab_size": 12000,
    # Reserved tokens that must be included in the vocabulary
    "reserved_tokens": reserved_tokens,
    # Arguments for `text.BertTokenizer`
    "bert_tokenizer_params": bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    "learn_params": {},
}

In [93]:
sv_vocab = bert_vocab.bert_vocab_from_dataset(
    train_sv.batch(1000).prefetch(2),
    **bert_vocab_args
)

In [94]:
print(sv_vocab[:10])
print(sv_vocab[100:110])
print(sv_vocab[1000:1010])
print(sv_vocab[-10:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', '&']
['γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ']
['storsta', 'forsokte', 'sla', '2011', 'medlemsstaternas', 'situationen', 'san', 'ihag', 'lakare', 'maj']
['##➥', '##土', '##魯', '##\uf0ac', '##\uf0b3', '##\uf0b7', '##\uf106', '##\uf8e8', '##\uf8e9', '##�']


In [95]:
def write_vocab_file(filepath, vocab):
  """Write `vocab` to `filepath`, one token per line.

  The file is always written as UTF-8 with "\n" line endings, independent of
  the platform's locale encoding and newline convention. The learned
  vocabularies contain non-ASCII tokens (Greek letters, CJK characters,
  emoji) that many locale encodings cannot represent.

  Args:
    filepath: Destination path. Missing parent directories are created.
    vocab: Iterable of tokens; each is written with `print`, so its `str()`
      form ends up on its own line.
  """
  path = pathlib.Path(filepath)
  # Lets the notebook run from a fresh checkout where `vocab/` does not exist.
  path.parent.mkdir(parents=True, exist_ok=True)
  with open(path, 'w', encoding='utf-8', newline='\n') as f:
    for token in vocab:
      print(token, file=f)

In [96]:
write_vocab_file('vocab/sv_vocab.txt', sv_vocab)

In [97]:
en_vocab = bert_vocab.bert_vocab_from_dataset(
    train_en.batch(1000).prefetch(2),
    **bert_vocab_args
)

In [98]:
print(en_vocab[:10])
print(en_vocab[100:110])
print(en_vocab[1000:1010])
print(en_vocab[-10:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', '&']
['γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ']
['inside', 'per', 'dr', 'fire', 'particularly', 'questions', 'employment', 'role', 'weeks', 'sent']
['##视', '##魯', '##\uf020', '##\uf066', '##\uf074', '##\uf0b3', '##\uf0b7', '##\uf106', '##\uf8e9', '##�']


In [99]:
write_vocab_file('vocab/en_vocab.txt', en_vocab)

In [100]:
es_vocab = bert_vocab.bert_vocab_from_dataset(
    train_es.batch(1000).prefetch(2),
    **bert_vocab_args
)

In [101]:
print(es_vocab[:10])
print(es_vocab[100:110])
print(es_vocab[1000:1010])
print(es_vocab[-10:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', '&']
['β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ']
['puesto', 'cuerpo', 'eh', 'mision', '##ina', 'dificil', '##v', 'necesidad', 'propuesta', 'sociales']
['##\ue0e7', '##\uf03c', '##\uf0b7', '##\uf106', '##\uf8e7', '##�', '##🇬', '##🇭', '##🎉', '##😂']


In [102]:
write_vocab_file('vocab/es_vocab.txt', es_vocab)

In [103]:
sv_tokenizer = text.BertTokenizer('vocab/sv_vocab.txt', **bert_tokenizer_params)
en_tokenizer = text.BertTokenizer('vocab/en_vocab.txt', **bert_tokenizer_params)
es_tokenizer = text.BertTokenizer('vocab/es_vocab.txt', **bert_tokenizer_params)

In [104]:
for  en_examples, sv_examples in train_raw.batch(3).take(1):
  for ex in en_examples:
    print(ex.numpy())

b'The Icelandic authorities submitted comments on this Decision by letter dated 24 February 2005 (Event No 311243).'
b'Are you coming or not?'
b"That's not good."


In [105]:
# Tokenize the examples -> (batch, word, word-piece), then merge the word and
# word-piece axes -> (batch, tokens).
token_batch = en_tokenizer.tokenize(en_examples).merge_dims(-2, -1)

# Show the token ids of each example on its own line.
for ex in token_batch.to_list():
  print(ex)

[211, 5488, 523, 558, 1353, 1994, 221, 222, 396, 240, 1576, 4338, 1011, 1591, 691, 11, 1654, 232, 940, 5440, 672, 630, 12, 17]
[230, 214, 540, 242, 227, 34]
[217, 10, 60, 227, 293, 17]


In [106]:
# Lookup each token id in the vocabulary.
# NOTE(review): in the recorded output below, this lookup does not reproduce
# the input sentences, whereas `en_tokenizer.detokenize` on the same ids does.
# The in-memory `en_vocab` order and the ids produced by `en_tokenizer`
# appear to disagree - confirm that `vocab/en_vocab.txt` matches `en_vocab`
# line for line.
txt_tokens = tf.gather(en_vocab, token_batch)
# Join with spaces.
tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'the silly real matter sort 70 on this ##t by relations ##ied ##f issued subject ( coffee ##s stuff adolescents mother action ) .',
       b'me you conditions or what ?', b"that ' s what us ."],
      dtype=object)>

In [107]:
words = en_tokenizer.detokenize(token_batch)
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'the icelandic authorities submitted comments on this decision by letter dated 24 february 2005 ( event no 311243 ) .',
       b'are you coming or not ?', b"that ' s not good ."], dtype=object)>

In [108]:
# Positions of the "[START]" and "[END]" markers within `reserved_tokens`.
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
  """Return `ragged` with a START id prepended and an END id appended to each row."""
  # One marker per row: a [rows, 1] column that is concatenated along axis 1.
  column_shape = [ragged.bounding_shape()[0], 1]
  return tf.concat(
      [tf.fill(column_shape, START), ragged, tf.fill(column_shape, END)],
      axis=1)

In [109]:
words = en_tokenizer.detokenize(add_start_end(token_batch))
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'[START] the icelandic authorities submitted comments on this decision by letter dated 24 february 2005 ( event no 311243 ) . [END]',
       b'[START] are you coming or not ? [END]',
       b"[START] that ' s not good . [END]"], dtype=object)>

In [110]:
def cleanup_text(reserved_tokens, token_txt):
  """Strip reserved tokens from `token_txt` and join each row into one string.

  Every entry of `reserved_tokens` other than "[UNK]" is removed; what is
  left in each row is joined with single spaces along the last axis.
  """
  # Alternation of the regex-escaped reserved tokens; "[UNK]" is kept in the text.
  reserved_pattern = "|".join(
      re.escape(tok) for tok in reserved_tokens if tok != "[UNK]")

  # True wherever a cell consists entirely of one of the reserved tokens.
  is_reserved = tf.strings.regex_full_match(token_txt, reserved_pattern)
  kept = tf.ragged.boolean_mask(token_txt, ~is_reserved)

  return tf.strings.reduce_join(kept, separator=' ', axis=-1)

In [111]:
en_examples.numpy()

array([b'The Icelandic authorities submitted comments on this Decision by letter dated 24 February 2005 (Event No 311243).',
       b'Are you coming or not?', b"That's not good."], dtype=object)

In [112]:
token_batch = en_tokenizer.tokenize(en_examples).merge_dims(-2,-1)
words = en_tokenizer.detokenize(token_batch)
words

<tf.RaggedTensor [[b'the', b'icelandic', b'authorities', b'submitted', b'comments', b'on',
  b'this', b'decision', b'by', b'letter', b'dated', b'24', b'february',
  b'2005', b'(', b'event', b'no', b'311243', b')', b'.']                 ,
 [b'are', b'you', b'coming', b'or', b'not', b'?'],
 [b'that', b"'", b's', b'not', b'good', b'.']]>

In [113]:
cleanup_text(reserved_tokens, words).numpy()

array([b'the icelandic authorities submitted comments on this decision by letter dated 24 february 2005 ( event no 311243 ) .',
       b'are you coming or not ?', b"that ' s not good ."], dtype=object)

In [114]:
class CustomTokenizer(tf.Module):
  """Exportable wrapper around a lower-casing `text.BertTokenizer`.

  Besides tokenizing and detokenizing, the module exposes the vocabulary
  (`lookup`, `get_vocab_size`, `get_vocab_path`) and the reserved tokens.
  Concrete functions for all of these are traced in `__init__` so they are
  available on the module after `tf.saved_model.save`.

  Args:
    reserved_tokens: List of special token strings; all except "[UNK]" are
      stripped by `detokenize`.
    vocab_path: Path to a UTF-8 vocabulary file with one token per line.
  """

  def __init__(self, reserved_tokens, vocab_path):
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    # Token ids are positions in this list, so it must have exactly one entry
    # per "\n"-terminated line of the file. Decode as UTF-8 explicitly (not
    # the locale default) and split on "\n" only: `str.splitlines()` also
    # breaks on Unicode separators such as U+2028, U+0085 or "\x1c", which
    # would split a token containing one of them and shift every later id.
    vocab = pathlib.Path(vocab_path).read_text(encoding='utf-8').split('\n')
    if vocab and vocab[-1] == '':
      # The final newline yields one trailing empty entry that is not a token.
      vocab.pop()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shapes [tokens] and [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    """Tokenize a batch of strings into rows of ids wrapped in START/END."""
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    """Turn rows of token ids back into strings, dropping reserved tokens."""
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    """Return the vocabulary entry for each id in `token_ids`."""
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    """Return the number of entries in the vocabulary."""
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    """Return the vocabulary file tracked as a SavedModel asset."""
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    """Return the reserved tokens as a string tensor."""
    return tf.constant(self._reserved_tokens)

In [115]:
tokenizers = tf.Module()
tokenizers.sv = CustomTokenizer(reserved_tokens, 'vocab/sv_vocab.txt')
tokenizers.en = CustomTokenizer(reserved_tokens, 'vocab/en_vocab.txt')
tokenizers.es = CustomTokenizer(reserved_tokens, 'vocab/es_vocab.txt')


In [116]:
model_name = 'tokenizer'
tf.saved_model.save(tokenizers, model_name)

In [117]:
reloaded_tokenizers = tf.saved_model.load(model_name)
reloaded_tokenizers.en.get_vocab_size().numpy()

11490

In [118]:
tokens = reloaded_tokenizers.en.tokenize(['Hello TensorFlow!'])
tokens.numpy()

array([[   2,  763, 1184, 6074,  372,  765, 2686, 1070,    4,    3]],
      dtype=int64)

In [119]:
text_tokens = reloaded_tokenizers.en.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'hard', b'land', b'implications', b'before', b'government',
  b'repeat', b'lead', b'!', b'[END]']]>

In [120]:
round_trip = reloaded_tokenizers.en.detokenize(tokens)

print(round_trip.numpy()[0].decode('utf-8'))

hello tensorflow !
