In [0]:
import tensorflow as tf


In [2]:
# A Python str passed to tf.constant is stored as UTF-8 encoded bytes by default.
tf.constant("Thanks 😊")

<tf.Tensor: shape=(), dtype=string, numpy=b'Thanks \xf0\x9f\x98\x8a'>

In [4]:
# Each string is a single element of the tensor, so the shape counts the
# strings (two here) and does not depend on how long each string is.
tf.constant(["You're", "welcome!"]).shape

TensorShape([2])

In [13]:
# The text as a scalar string tensor holding its UTF-8 encoded bytes.
text_utf8 = tf.constant("语言处理")
text_utf8

<tf.Tensor: shape=(), dtype=string, numpy=b'\xe8\xaf\xad\xe8\xa8\x80\xe5\xa4\x84\xe7\x90\x86'>

In [15]:
# Encode the same text as UTF-16-BE in Python, then wrap the resulting bytes in a tensor.
text_utf16be = tf.constant("语言处理".encode("UTF-16-BE"))
text_utf16be

<tf.Tensor: shape=(), dtype=string, numpy=b'\x8b\xed\x8a\x00Y\x04t\x06'>

In [16]:
# The Unicode string as an integer tensor of code points, one per character.
text_chars = tf.constant(list(map(ord, "语言处理")))
text_chars

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([35821, 35328, 22788, 29702], dtype=int32)>

In [18]:
# Decode the UTF-8 string tensor into its Unicode code points.
tf.strings.unicode_decode(text_utf8, input_encoding="UTF-8")

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([35821, 35328, 22788, 29702], dtype=int32)>

In [19]:
# Encode the tensor of Unicode code points back into a UTF-8 string tensor.
tf.strings.unicode_encode(text_chars, output_encoding="UTF-8")

<tf.Tensor: shape=(), dtype=string, numpy=b'\xe8\xaf\xad\xe8\xa8\x80\xe5\xa4\x84\xe7\x90\x86'>

In [20]:
# Transcode the string tensor directly from UTF-8 to UTF-16-BE.
tf.strings.unicode_transcode(
    text_utf8,
    input_encoding="UTF-8",
    output_encoding="UTF-16-BE",
)

<tf.Tensor: shape=(), dtype=string, numpy=b'\x8b\xed\x8a\x00Y\x04t\x06'>

In [0]:
# When decoding multiple strings in a batch, the number of characters in each string may not be equal.

In [25]:
# Encode each Python string in the batch as UTF-8 bytes.
batch_utf8 = [
    text.encode("UTF-8")
    for text in ("hÃllo", "What is the weather tomorrow", "Göödnight", "😊")
]

# Decode the whole batch into Unicode code points. The strings have different
# numbers of characters, so each decoded row has a different length.
batch_chars_ragged = tf.strings.unicode_decode(batch_utf8, input_encoding="UTF-8")

# Print one row of code points per input string.
for chars in batch_chars_ragged.to_list():
    print(chars)

[104, 195, 108, 108, 111]
[87, 104, 97, 116, 32, 105, 115, 32, 116, 104, 101, 32, 119, 101, 97, 116, 104, 101, 114, 32, 116, 111, 109, 111, 114, 114, 111, 119]
[71, 246, 246, 100, 110, 105, 103, 104, 116]
[128522]


In [28]:
# Pad the variable-length rows to a common length, filling the gaps with -1,
# to obtain a regular dense tensor.
batch_padded_chars = batch_chars_ragged.to_tensor(default_value=-1)
batch_padded_chars

<tf.Tensor: shape=(4, 28), dtype=int32, numpy=
array([[   104,    195,    108,    108,    111,     -1,     -1,     -1,
            -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1],
       [    87,    104,     97,    116,     32,    105,    115,     32,
           116,    104,    101,     32,    119,    101,     97,    116,
           104,    101,    114,     32,    116,    111,    109,    111,
           114,    114,    111,    119],
       [    71,    246,    246,    100,    110,    105,    103,    104,
           116,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1],
       [128522,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1

In [30]:
# Alternatively, convert the variable-length batch to a sparse tensor with to_sparse().
batch_chars_sparse = batch_chars_ragged.to_sparse()

# Display a value derived from batch_chars_sparse rather than batch_padded_chars
# from the previous cell; densifying with the same -1 fill makes it directly
# comparable with the padded tensor.
tf.sparse.to_dense(batch_chars_sparse, default_value=-1)

<tf.Tensor: shape=(4, 28), dtype=int32, numpy=
array([[   104,    195,    108,    108,    111,     -1,     -1,     -1,
            -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1],
       [    87,    104,     97,    116,     32,    105,    115,     32,
           116,    104,    101,     32,    119,    101,     97,    116,
           104,    101,    114,     32,    116,    111,    109,    111,
           114,    114,    111,    119],
       [    71,    246,    246,    100,    110,    105,    103,    104,
           116,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1],
       [128522,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
            -1,     -1,     -1,     -1