In [6]:
!pip install tensorflow-text

Collecting tensorflow-text
  Using cached https://files.pythonhosted.org/packages/50/35/b87b84648f84b9021df3407422d3cf9afdcbb9b087d2bf293fe72557b7a5/tensorflow_text-2.1.1-cp36-cp36m-manylinux1_x86_64.whl
Collecting tensorflow<2.2,>=2.1.0
  Using cached https://files.pythonhosted.org/packages/85/d4/c0cd1057b331bc38b65478302114194bd8e1b9c2bbc06e300935c0e93d90/tensorflow-2.1.0-cp36-cp36m-manylinux2010_x86_64.whl
Collecting tensorboard<2.2.0,>=2.1.0
  Using cached https://files.pythonhosted.org/packages/d9/41/bbf49b61370e4f4d245d4c6051dfb6db80cec672605c91b1652ac8cc3d38/tensorboard-2.1.1-py3-none-any.whl
Processing /root/.cache/pip/wheels/5c/2e/7e/a1d4d4fcebe6c381f378ce7743a3ced3699feb89bcfbdadadd/gast-0.2.2-cp36-none-any.whl
[31mERROR: tensorflow-probability 0.10.0rc0 has requirement gast>=0.3.2, but you'll have gast 0.2.2 which is incompatible.[0m
Installing collected packages: tensorboard, gast, tensorflow, tensorflow-text
  Found existing installation: tensorboard 2.2.1
    Uninstalli

In [0]:
import tensorflow as tf
import tensorflow_text as text

In [11]:
# Two sample documents, deliberately encoded as UTF-16-BE so the transcode
# step below has real work to do.
docs = tf.constant([
    u'Everything not saved will be lost.'.encode('UTF-16-BE'),
    u'Sad☹'.encode('UTF-16-BE'),
])

# Re-encode the whole batch from UTF-16-BE to UTF-8 in one op.
utf8_docs = tf.strings.unicode_transcode(
    docs,
    input_encoding='UTF-16-BE',
    output_encoding='UTF-8',
)
utf8_docs

<tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'Everything not saved will be lost.', b'Sad\xe2\x98\xb9'],
      dtype=object)>

In [0]:
# tokenize: split each string on whitespace

In [14]:
# Split every UTF-8 document on whitespace. The documents produce different
# numbers of tokens, so the result is converted to nested Python lists with
# to_list() before printing.
tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(utf8_docs)
print(
    tokens.to_list(),
    sep=' ',
    end='\n',
)

Instructions for updating:
`tf.batch_gather` is deprecated, please use `tf.gather` with `batch_dims=-1` instead.
[[b'Everything', b'not', b'saved', b'will', b'be', b'lost.'], [b'Sad\xe2\x98\xb9']]


In [0]:
# tokenize based on Unicode script boundaries

In [16]:
# Same documents, but split wherever the Unicode script changes. In the
# recorded output this separates the trailing '.' from 'lost' and the emoji
# from 'Sad', which the whitespace tokenizer did not do.
tokenizer = text.UnicodeScriptTokenizer()
tokens = tokenizer.tokenize(utf8_docs)

# Bare last expression: shown as the cell's output.
tokens.to_list()

[[b'Everything', b'not', b'saved', b'will', b'be', b'lost', b'.'],
 [b'Sad', b'\xe2\x98\xb9']]

In [0]:
# for languages like Chinese there is no whitespace between words,
# so it's better to split the string character by character

In [20]:
# Break the UTF-8 string into one token per character.
tokens = tf.strings.unicode_split(
    [u"仅今年前".encode('UTF-8')],
    input_encoding='UTF-8',
)
tokens.to_list()

[[b'\xe4\xbb\x85', b'\xe4\xbb\x8a', b'\xe5\xb9\xb4', b'\xe5\x89\x8d']]

In [21]:
# Tokenize and also get, for each token, where it starts and where it stops
# in the source string. In the recorded output the emoji token spans 3..6,
# i.e. the offsets count bytes of the UTF-8 input, not characters.
tokenizer = text.UnicodeScriptTokenizer()
tokens, offset_starts, offset_limits = tokenizer.tokenize_with_offsets(utf8_docs)

# Print tokens, then start offsets, then limit offsets, one line each.
for ragged in (tokens, offset_starts, offset_limits):
    print(ragged.to_list())

[[b'Everything', b'not', b'saved', b'will', b'be', b'lost', b'.'], [b'Sad', b'\xe2\x98\xb9']]
[[0, 11, 15, 21, 26, 29, 33], [0, 3]]
[[10, 14, 20, 25, 28, 33, 34], [3, 6]]


In [0]:
# the tokenizers can also be used with tf.data


In [24]:
# Build a two-element dataset; each element is a length-1 string tensor.
docs = tf.data.Dataset.from_tensor_slices(
    [['Never tell me the odds.'], ["It's a trap!"]]
)
tokenizer = text.WhitespaceTokenizer()

# Show the raw elements before tokenization.
for element in docs:
    print(element)

# Apply the tokenizer as a Dataset.map step and peek at the first result.
tokenized_docs = docs.map(lambda sentence: tokenizer.tokenize(sentence))
first_tokenized = next(iter(tokenized_docs))
print(first_tokenized)

tf.Tensor([b'Never tell me the odds.'], shape=(1,), dtype=string)
tf.Tensor([b"It's a trap!"], shape=(1,), dtype=string)
<tf.RaggedTensor [[b'Never', b'tell', b'me', b'the', b'odds.']]>


In [25]:
tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

# Word-shape patterns to test each token against, in the order they are
# printed below:
#   f1 - Is capitalized?
#   f2 - Are all letters uppercased?
#   f3 - Does the token contain punctuation (or a symbol)?
#   f4 - Is the token a number?
shape_patterns = (
    text.WordShape.HAS_TITLE_CASE,
    text.WordShape.IS_UPPERCASE,
    text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL,
    text.WordShape.IS_NUMERIC_VALUE,
)
f1, f2, f3, f4 = [text.wordshape(tokens, pattern) for pattern in shape_patterns]

# One line per feature: a nested list with one boolean per token.
for feature in (f1, f2, f3, f4):
    print(feature.to_list())

[[True, False, False, False, False, False], [True]]
[[False, False, False, False, False, False], [False]]
[[False, False, False, False, False, True], [True]]
[[False, False, False, False, False, False], [False]]


In [0]:
# ngrams can be applied to the tokens

In [27]:
tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

# Slide a window of two tokens (bi-grams) over each document and join every
# window into a single string. A document with fewer than two tokens yields
# no bi-grams, which is why the second row of the recorded output is empty.
bigrams = text.ngrams(
    tokens,
    width=2,
    reduction_type=text.Reduction.STRING_JOIN,
)

print(bigrams.to_list())

[[b'Everything not', b'not saved', b'saved will', b'will be', b'be lost.'], []]
