# Chapter 13: Subword Segmentation
## Subword Tokenizers: WordPiece
Tokenizers using WordPiece (Schuster and Nakajima, 2012): https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6289079

Programs from the book: [_Python for Natural Language Processing_](https://link.springer.com/book/9783031575488)

__Author__: Pierre Nugues

In [1]:
import os
import regex as re
from collections import Counter
from math import log

## Corpus Files

We read the files and we store the corpus in a string

In [2]:
PATH = '../datasets/'

In [3]:
CORPUS = 'HOMER'  # 'DICKENS'

In [4]:
# Folder that holds the files of the selected corpus
if CORPUS == 'DICKENS':
    folder = PATH + 'dickens/'
elif CORPUS == 'HOMER':
    folder = PATH + 'classics/'
else:
    # Fail here rather than with a NameError on `folder` further down
    raise ValueError('Unknown corpus: {}'.format(CORPUS))

In [5]:
def get_files(dir, suffix):
    """
    Lists the entries of a folder whose names end with a given suffix
    :param dir: path of the folder to scan
    :param suffix: ending that the names must have, for instance 'txt'
    :return: the list of matching names, without the folder prefix and
        in the order returned by os.listdir()
    """
    return [name for name in os.listdir(dir) if name.endswith(suffix)]

In [6]:
# Names of the files that make up the selected corpus
if CORPUS == 'DICKENS':
    # os.listdir() returns the names in arbitrary order:
    # we sort them so that the corpus is the same at each run
    files = sorted(get_files(folder, 'txt'))
elif CORPUS == 'HOMER':
    files = ['iliad.txt', 'odyssey.txt']
else:
    # Fail here rather than with a NameError on `files` further down
    raise ValueError('Unknown corpus: {}'.format(CORPUS))
files

['iliad.txt', 'odyssey.txt']

In [7]:
files = [folder + file for file in files]
files

['../datasets/classics/iliad.txt', '../datasets/classics/odyssey.txt']

In [8]:
# We read each file, strip the leading and trailing white spaces, and
# concatenate the contents, each one preceded by a space.
# The pieces are joined once instead of growing the string with +=
contents = []
for file in files:
    with open(file, encoding='utf8') as f:
        contents.append(' ' + f.read().strip())
text = ''.join(contents)

In [9]:
text[:100]

' BOOK I\n\nSing, O goddess, the anger of Achilles son of Peleus, that brought\ncountless ills upon the '

## Pretokenization

We pretokenize the text using the spaces as delimiters.
BERT simply uses `split()`: https://github.com/google-research/bert/blob/master/tokenization.py#L300-L359. Here we use a regex that also sets the punctuation signs apart from the words.

In [10]:
pattern = r'\p{P}|[^\s\p{P}]+'

In [11]:
# Each pretoken with its (start, end) character offsets in the text
words = [(m.group(), m.span()) for m in re.finditer(pattern, text)]

In [12]:
text.split()[:8]

['BOOK', 'I', 'Sing,', 'O', 'goddess,', 'the', 'anger', 'of']

In [13]:
words[:8]

[('BOOK', (1, 5)),
 ('I', (6, 7)),
 ('Sing', (9, 13)),
 (',', (13, 14)),
 ('O', (15, 16)),
 ('goddess', (17, 24)),
 (',', (24, 25)),
 ('the', (26, 29))]

In [14]:
def pretokenize(pattern, text):
    """
    Extracts the pretokens of a text
    :param pattern: regular expression describing a pretoken
    :param text: the string to split
    :return: the list of non-overlapping matches of pattern in text,
        as returned by findall()
    """
    tokenizer = re.compile(pattern)
    return tokenizer.findall(text)

In [15]:
words = pretokenize(pattern, text)

In [16]:
word_cnts = Counter(words)

In [17]:
word_cnts.most_common(5)

[(',', 19920), ('the', 15258), ('and', 11467), ('of', 8640), ('.', 8108)]

In [18]:
word_cnts['her']

1145

## The Class
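Same as with BPE. We create a second dictionary, `words_wp`, to keep track of the subword tokens: its keys are the words and, for each word, it stores the word frequency and the current list of subtokens, which is updated at each iteration. We set a ▁ symbol (U+2581) to mark the start of a word.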

In [19]:
class WordPiece:
    """
    WordPiece tokenizer, first step: pretokenization and initialization
    of the dictionary that maps the words to their subwords.
    """

    def __init__(self, merge_cnt=200):
        """
        :param merge_cnt: number of merge operations; only stored here
        """
        self.merge_cnt = merge_cnt
        # Either one punctuation sign or a run of characters that are
        # neither white spaces nor punctuation
        self.pattern = r'\p{P}|[^\s\p{P}]+'

    def pretokenize(self, text):
        """
        Splits a text with the pattern and marks the start of each
        pretoken with '▁'
        :param text: the string to split
        :return: the list of pretokens, each one prefixed with '▁'
        """
        return ['▁' + word for word in re.findall(self.pattern, text)]

    def _wp_init(self, text):
        """
        Creates words_wp, that maps each distinct pretoken to its
        frequency and to its list of characters, and vocab, the list of
        the distinct characters
        :param text: the training corpus
        """
        word_cnts = Counter(self.pretokenize(text))
        self.words_wp = {}
        for word, freq in word_cnts.items():
            self.words_wp[word] = {'freq': freq, 'swords': list(word)}
        chars = set()
        for entry in self.words_wp.values():
            chars.update(entry['swords'])
        self.vocab = list(chars)

In [20]:
wp = WordPiece()
wp._wp_init(text)

In [21]:
wp.words_wp

{'▁BOOK': {'freq': 48, 'swords': ['▁', 'B', 'O', 'O', 'K']},
 '▁I': {'freq': 3194, 'swords': ['▁', 'I']},
 '▁Sing': {'freq': 2, 'swords': ['▁', 'S', 'i', 'n', 'g']},
 '▁,': {'freq': 19920, 'swords': ['▁', ',']},
 '▁O': {'freq': 77, 'swords': ['▁', 'O']},
 '▁goddess': {'freq': 112, 'swords': ['▁', 'g', 'o', 'd', 'd', 'e', 's', 's']},
 '▁the': {'freq': 15258, 'swords': ['▁', 't', 'h', 'e']},
 '▁anger': {'freq': 74, 'swords': ['▁', 'a', 'n', 'g', 'e', 'r']},
 '▁of': {'freq': 8640, 'swords': ['▁', 'o', 'f']},
 '▁Achilles': {'freq': 440,
  'swords': ['▁', 'A', 'c', 'h', 'i', 'l', 'l', 'e', 's']},
 '▁son': {'freq': 1246, 'swords': ['▁', 's', 'o', 'n']},
 '▁Peleus': {'freq': 145, 'swords': ['▁', 'P', 'e', 'l', 'e', 'u', 's']},
 '▁that': {'freq': 2558, 'swords': ['▁', 't', 'h', 'a', 't']},
 '▁brought': {'freq': 208, 'swords': ['▁', 'b', 'r', 'o', 'u', 'g', 'h', 't']},
 '▁countless': {'freq': 8,
  'swords': ['▁', 'c', 'o', 'u', 'n', 't', 'l', 'e', 's', 's']},
 '▁ills': {'freq': 5, 'swords': ['▁

In [22]:
wp.words_wp['▁her']

{'freq': 1145, 'swords': ['▁', 'h', 'e', 'r']}

In [23]:
wp.vocab[:10]

['Q', 'P', 'd', 'j', ')', 'U', 'Z', '[', 'q', 'm']

## Quality of a Language Model

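We compute the gain
$$
C(xy) \cdot (\log P(xy) - \log P(x) - \log P(y))
$$
for all the bigrams $xy$ of adjacent subwords, where $C(xy)$ is the count of the bigram, $P(xy)$ its relative frequency among the bigrams, and $P(x)$ and $P(y)$ the relative frequencies of the subwords.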

In [24]:
class WordPiece:
    """
    WordPiece tokenizer, second step: initialization of the subword
    dictionary and computation of the gain of each pair of adjacent
    subwords.
    """

    def __init__(self, merge_cnt=200):
        """
        :param merge_cnt: number of merge operations; only stored here
        """
        self.merge_cnt = merge_cnt
        # Either one punctuation sign or a run of characters that are
        # neither white spaces nor punctuation
        self.pattern = r'\p{P}|[^\s\p{P}]+'

    def pretokenize(self, text):
        """
        Splits a text with the pattern and marks the start of each
        pretoken with '▁'
        :param text: the string to split
        :return: the list of pretokens, each one prefixed with '▁'
        """
        return ['▁' + word for word in re.findall(self.pattern, text)]

    def _wp_init(self, text):
        """
        Creates words_wp, that maps each distinct pretoken to its
        frequency and to its list of characters, and vocab, the list of
        the distinct characters
        :param text: the training corpus
        """
        word_cnts = Counter(self.pretokenize(text))
        self.words_wp = {}
        for word, freq in word_cnts.items():
            self.words_wp[word] = {'freq': freq, 'swords': list(word)}
        chars = set()
        for entry in self.words_wp.values():
            chars.update(entry['swords'])
        self.vocab = list(chars)

    def _calc_pair_gains(self):
        """
        Stores in pair_gains, for each pair (x, y) of adjacent subwords,
        C(xy) * (log P(xy) - log P(x) - log P(y)).
        The counts are weighted by the word frequencies.
        """
        sword_cnts = Counter()
        self.pair_gains = Counter()
        for entry in self.words_wp.values():
            swords, freq = entry['swords'], entry['freq']
            for sword in swords:
                sword_cnts[sword] += freq
            for pair in zip(swords, swords[1:]):
                self.pair_gains[pair] += freq
        pair_total = sum(self.pair_gains.values())
        sword_total = sum(sword_cnts.values())
        # The counts are replaced in place with the gains
        for (left, right), cnt in self.pair_gains.items():
            self.pair_gains[(left, right)] = cnt * (
                log(cnt/pair_total)
                - log(sword_cnts[left]/sword_total)
                - log(sword_cnts[right]/sword_total))

In [25]:
wp = WordPiece()
wp._wp_init(text)
wp._calc_pair_gains()

In [26]:
sorted(wp.pair_gains, key=wp.pair_gains.get, reverse=True)[:4]

[('t', 'h'), ('h', 'e'), ('a', 'n'), ('▁', 't')]

## Constructing the Subwords
We merge a pair in a sequence of subwords. The structure of the pair is a list as in: `['h', 'e']`. `swords` is also a list.

In [27]:
class WordPiece:
    """
    WordPiece tokenizer, third step: learning of the merge operations
    from a corpus and construction of the vocabulary.
    """

    def __init__(self, merge_cnt=200):
        """
        :param merge_cnt: maximal number of merge operations in fit()
        """
        # Either one punctuation sign or a run of characters that are
        # neither white spaces nor punctuation
        self.pattern = r'\p{P}|[^\s\p{P}]+'
        self.merge_cnt = merge_cnt

    def pretokenize(self, text):
        """
        Splits a text with the pattern and marks the start of each
        pretoken with '▁'
        :param text: the string to split
        :return: the list of pretokens, each one prefixed with '▁'
        """
        return ['▁' + word for word in re.findall(self.pattern, text)]

    def _wp_init(self, text):
        """
        Creates words_wp, that maps each distinct pretoken to its
        frequency and to its list of characters, and vocab, the list of
        the distinct characters
        :param text: the training corpus
        """
        word_cnts = Counter(self.pretokenize(text))
        self.words_wp = {
            word: {'freq': freq, 'swords': list(word)}
            for word, freq in word_cnts.items()}
        self.vocab = list(
            {char for word_dict in self.words_wp.values()
             for char in word_dict['swords']})

    def _calc_pair_gains(self):
        """
        Stores in pair_gains, for each pair (x, y) of adjacent subwords,
        C(xy) * (log P(xy) - log P(x) - log P(y)).
        The counts are weighted by the word frequencies.
        """
        sword_cnts = Counter()
        self.pair_gains = Counter()
        for word_dict in self.words_wp.values():
            swords, freq = word_dict['swords'], word_dict['freq']
            for sword in swords:
                sword_cnts[sword] += freq
            for pair in zip(swords, swords[1:]):
                self.pair_gains[pair] += freq
        pair_cnt = sum(self.pair_gains.values())
        sword_cnt = sum(sword_cnts.values())
        # The counts are replaced in place with the gains
        for (left, right), cnt in self.pair_gains.items():
            self.pair_gains[(left, right)] = cnt * (
                log(cnt/pair_cnt)
                - log(sword_cnts[left]/sword_cnt)
                - log(sword_cnts[right]/sword_cnt))

    def _merge_pair(self, pair, swords):
        """
        Merges the occurrences of a pair in a list of subwords,
        from left to right
        :param pair: the two subwords to merge, as in ['h', 'e']
        :param swords: the list of subwords of a word
        :return: a new list, where each occurrence of the pair is
            replaced with its concatenation
        """
        # A tuple never equals a list slice: we convert the pair so that
        # ('h', 'e') is merged just as ['h', 'e']
        pair = list(pair)
        pair_str = ''.join(pair)
        merged = []
        i = 0
        while i < len(swords):
            if swords[i:i + 2] == pair:
                merged.append(pair_str)
                i += 2
            else:
                merged.append(swords[i])
                i += 1
        return merged

    def fit(self, text):
        """
        Learns at most merge_cnt merge operations from a corpus.
        At each iteration, the pair with the highest gain is merged in
        all the words. The operations are stored in merge_ops.
        :param text: the training corpus
        """
        self._wp_init(text)

        self.merge_ops = []
        for _ in range(self.merge_cnt):
            self._calc_pair_gains()
            if not self.pair_gains:
                # Each word is reduced to one subword: nothing to merge.
                # max() would raise a ValueError on the empty counter
                break
            self.best_pair = max(self.pair_gains,
                                 key=self.pair_gains.get)
            merge_op = list(self.best_pair)
            self.merge_ops.append(merge_op)
            for word_dict in self.words_wp.values():
                word_dict['swords'] = self._merge_pair(merge_op,
                                                       word_dict['swords'])

        self._build_vocab()

    def _build_vocab(self):
        """
        Adds the subwords resulting from the merge operations to the
        vocabulary
        """
        swords = [''.join(pair) for pair in self.merge_ops]
        # Two merge operations may yield the same string:
        # we keep the first occurrence only
        self.vocab = list(dict.fromkeys(self.vocab + swords))

In [28]:
wp = WordPiece()
wp.fit(text)

In [29]:
wp.merge_ops[:4]

[['t', 'h'], ['th', 'e'], ['a', 'n'], ['an', 'd']]

In [30]:
wp.vocab

['Q',
 'P',
 'd',
 'j',
 ')',
 'U',
 'Z',
 '[',
 'q',
 'm',
 'p',
 'l',
 '-',
 'W',
 'A',
 'E',
 'M',
 'y',
 'X',
 'w',
 ']',
 'T',
 'L',
 'B',
 'i',
 'b',
 'I',
 'O',
 'v',
 'k',
 'Y',
 ':',
 "'",
 'c',
 's',
 'x',
 'N',
 'R',
 '?',
 'g',
 'S',
 'h',
 '&',
 'K',
 'a',
 'e',
 'o',
 'D',
 '!',
 'r',
 'V',
 '.',
 'n',
 't',
 '(',
 '▁',
 '"',
 'u',
 ',',
 'G',
 'C',
 'z',
 ';',
 'H',
 'J',
 'f',
 'F',
 'th',
 'the',
 'an',
 'and',
 'in',
 '▁the',
 '▁,',
 'ou',
 '▁w',
 '▁h',
 'ing',
 've',
 '▁and',
 '▁b',
 'on',
 'of',
 '▁f',
 '▁of',
 '▁s',
 '▁.',
 '▁hi',
 'to',
 'll',
 '▁to',
 'you',
 '▁he',
 'en',
 're',
 '▁wh',
 'ch',
 '▁a',
 '▁m',
 '▁ha',
 '▁wi',
 '▁with',
 '▁-',
 '▁th',
 '▁you',
 '▁c',
 'gh',
 '▁sh',
 'ow',
 'or',
 'om',
 '▁him',
 '▁for',
 '▁his',
 'us',
 '▁g',
 '▁I',
 'Th',
 '▁A',
 '▁in',
 '▁"',
 'ed',
 'at',
 '▁that',
 'ar',
 '▁d',
 '▁n',
 'ther',
 'le',
 '▁no',
 'ld',
 'er',
 '▁wa',
 'ght',
 '▁p',
 '▁;',
 '▁be',
 'ly',
 'es',
 'is',
 '▁was',
 '▁go',
 '▁will',
 '▁l',
 '▁Th',
 'ould'

## Vocabulary

In [31]:
wp.vocab.sort(key=lambda x: -len(x))

In [32]:
len(wp.vocab)

267

In [33]:
wp.vocab[:10]

['▁Trojans',
 '▁should',
 '▁their',
 '▁which',
 '▁would',
 '▁about',
 '▁shall',
 '▁with',
 '▁that',
 '▁will']

## Subword Tokenization

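We write an encode function that breaks a word into subwords. We use a regex, built as the disjunction of the subwords of the vocabulary, sorted from the longest to the shortest.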

In [34]:
class WordPiece:
    """
    WordPiece tokenizer, fourth step: learning of the merge operations
    and construction of the regex that matches the subwords.
    """

    def __init__(self, merge_cnt=200):
        """
        :param merge_cnt: maximal number of merge operations in fit()
        """
        # Either one punctuation sign or a run of characters that are
        # neither white spaces nor punctuation
        self.pattern = r'\p{P}|[^\s\p{P}]+'
        self.merge_cnt = merge_cnt

    def pretokenize(self, text):
        """
        Splits a text with the pattern and marks the start of each
        pretoken with '▁'
        :param text: the string to split
        :return: the list of pretokens, each one prefixed with '▁'
        """
        return ['▁' + word for word in re.findall(self.pattern, text)]

    def _wp_init(self, text):
        """
        Creates words_wp, that maps each distinct pretoken to its
        frequency and to its list of characters, and vocab, the list of
        the distinct characters
        :param text: the training corpus
        """
        word_cnts = Counter(self.pretokenize(text))
        self.words_wp = {
            word: {'freq': freq, 'swords': list(word)}
            for word, freq in word_cnts.items()}
        self.vocab = list(
            {char for word_dict in self.words_wp.values()
             for char in word_dict['swords']})

    def _calc_pair_gains(self):
        """
        Stores in pair_gains, for each pair (x, y) of adjacent subwords,
        C(xy) * (log P(xy) - log P(x) - log P(y)).
        The counts are weighted by the word frequencies.
        """
        sword_cnts = Counter()
        self.pair_gains = Counter()
        for word_dict in self.words_wp.values():
            swords, freq = word_dict['swords'], word_dict['freq']
            for sword in swords:
                sword_cnts[sword] += freq
            for pair in zip(swords, swords[1:]):
                self.pair_gains[pair] += freq
        pair_cnt = sum(self.pair_gains.values())
        sword_cnt = sum(sword_cnts.values())
        # The counts are replaced in place with the gains
        for (left, right), cnt in self.pair_gains.items():
            self.pair_gains[(left, right)] = cnt * (
                log(cnt/pair_cnt)
                - log(sword_cnts[left]/sword_cnt)
                - log(sword_cnts[right]/sword_cnt))

    def _merge_pair(self, pair, swords):
        """
        Merges the occurrences of a pair in a list of subwords,
        from left to right
        :param pair: the two subwords to merge, as in ['h', 'e']
        :param swords: the list of subwords of a word
        :return: a new list, where each occurrence of the pair is
            replaced with its concatenation
        """
        # A tuple never equals a list slice: we convert the pair so that
        # ('h', 'e') is merged just as ['h', 'e']
        pair = list(pair)
        pair_str = ''.join(pair)
        merged = []
        i = 0
        while i < len(swords):
            if swords[i:i + 2] == pair:
                merged.append(pair_str)
                i += 2
            else:
                merged.append(swords[i])
                i += 1
        return merged

    def fit(self, text):
        """
        Learns at most merge_cnt merge operations from a corpus.
        At each iteration, the pair with the highest gain is merged in
        all the words. The operations are stored in merge_ops.
        :param text: the training corpus
        """
        self._wp_init(text)

        self.merge_ops = []
        for _ in range(self.merge_cnt):
            self._calc_pair_gains()
            if not self.pair_gains:
                # Each word is reduced to one subword: nothing to merge.
                # max() would raise a ValueError on the empty counter
                break
            self.best_pair = max(self.pair_gains,
                                 key=self.pair_gains.get)
            merge_op = list(self.best_pair)
            self.merge_ops.append(merge_op)
            for word_dict in self.words_wp.values():
                word_dict['swords'] = self._merge_pair(merge_op,
                                                       word_dict['swords'])

        self._build_vocab()

    def _build_vocab(self):
        """
        Adds the subwords resulting from the merge operations to the
        vocabulary and creates the subword regex
        """
        swords = [''.join(pair) for pair in self.merge_ops]
        # Two merge operations may yield the same string:
        # we keep the first occurrence only
        self.vocab = list(dict.fromkeys(self.vocab + swords))
        self._create_regex()

    def _create_regex(self):
        """
        Sorts the vocabulary from the longest to the shortest subword
        and stores in sword_regex the disjunction of the subwords
        """
        # The alternatives are tried from left to right:
        # the longest subwords must come first
        self.vocab.sort(key=lambda x: -len(x))
        # We escape metachars as for '.', but only in the regex:
        # the vocabulary keeps the subwords as they are
        self.sword_regex = '|'.join(re.escape(word) for word in self.vocab)

In [35]:
wp = WordPiece()
wp.fit(text)

In [36]:
wp.sword_regex

'▁Trojans|▁should|▁their|▁which|▁would|▁about|▁shall|▁with|▁that|▁will|▁have|▁from|▁them|▁were|▁Troj|▁they|▁your|▁this|▁when|▁hand|other|▁upon|▁said|▁ship|▁went|▁Jove|▁into|▁the|▁and|▁you|▁him|▁for|▁his|ther|▁was|ould|ight|▁had|▁but|▁not|▁all|▁com|ound|▁Ach|▁Tro|▁son|ough|▁man|▁who|▁whi|▁her|bout|▁The|▁shi|▁god|▁did|fore|self|▁she|▁now|▁str|the|and|ing|▁of|▁hi|▁to|you|▁he|▁wh|▁ha|▁wi|▁th|▁sh|▁in|▁no|▁wa|ght|▁be|▁go|▁Th|▁on|▁it|▁we|ver|▁bu|▁up|▁as|ill|oun|out|ans|▁fr|ent|▁st|▁sp|ven|own|ong|our|▁my|▁me|ain|Uly|▁He|▁is|tor|aid|▁by|all|▁so|▁li|ear|▁se|ore|ard|red|ind|sel|▁an|ome|ame|ove|▁br|th|an|in|▁,|ou|▁w|▁h|ve|▁b|on|of|▁f|▁s|▁\\.|to|ll|en|re|ch|▁a|▁m|▁\\-|▁c|gh|ow|or|om|us|▁g|▁I|Th|▁A|▁"|ed|at|ar|▁d|▁n|le|ld|er|▁p|▁;|ly|es|is|▁l|id|it|ay|ck|up|▁H|▁t|▁T|▁M|▁P|ir|un|ro|st|se|am|ad|ke|ce|ri|▁\'|ur|▁r|▁S|od|▁W|▁e|▁k|▁J|Q|P|d|j|\\)|U|Z|\\[|q|m|p|l|\\-|W|A|E|M|y|X|w|\\]|T|L|B|i|b|I|O|v|k|Y|:|\'|c|s|x|N|R|\\?|g|S|h|\\&|K|a|e|o|D|!|r|V|\\.|n|t|\\(|▁|"|u|,|G|C|z|;|H|J|f|F'

In [37]:
re.findall(wp.sword_regex, '▁Therefore')

['▁The', 're', 'fore']

In [38]:
re.findall(wp.sword_regex, '▁touché')

['▁to', 'u', 'ch']

In [39]:
class WordPiece:
    """
    WordPiece tokenizer, fifth step: learning of the merge operations
    and segmentation of a word into subwords.
    """

    def __init__(self, merge_cnt=200):
        """
        :param merge_cnt: maximal number of merge operations in fit()
        """
        # Either one punctuation sign or a run of characters that are
        # neither white spaces nor punctuation
        self.pattern = r'\p{P}|[^\s\p{P}]+'
        self.merge_cnt = merge_cnt
        # Symbol returned for the words that cannot be segmented
        self.unk_word = '[UNK]'

    def pretokenize(self, text):
        """
        Splits a text with the pattern and marks the start of each
        pretoken with '▁'
        :param text: the string to split
        :return: the list of pretokens, each one prefixed with '▁'
        """
        return ['▁' + word for word in re.findall(self.pattern, text)]

    def _wp_init(self, text):
        """
        Creates words_wp, that maps each distinct pretoken to its
        frequency and to its list of characters, and vocab, the list of
        the distinct characters
        :param text: the training corpus
        """
        word_cnts = Counter(self.pretokenize(text))
        self.words_wp = {
            word: {'freq': freq, 'swords': list(word)}
            for word, freq in word_cnts.items()}
        self.vocab = list(
            {char for word_dict in self.words_wp.values()
             for char in word_dict['swords']})

    def _calc_pair_gains(self):
        """
        Stores in pair_gains, for each pair (x, y) of adjacent subwords,
        C(xy) * (log P(xy) - log P(x) - log P(y)).
        The counts are weighted by the word frequencies.
        """
        sword_cnts = Counter()
        self.pair_gains = Counter()
        for word_dict in self.words_wp.values():
            swords, freq = word_dict['swords'], word_dict['freq']
            for sword in swords:
                sword_cnts[sword] += freq
            for pair in zip(swords, swords[1:]):
                self.pair_gains[pair] += freq
        pair_cnt = sum(self.pair_gains.values())
        sword_cnt = sum(sword_cnts.values())
        # The counts are replaced in place with the gains
        for (left, right), cnt in self.pair_gains.items():
            self.pair_gains[(left, right)] = cnt * (
                log(cnt/pair_cnt)
                - log(sword_cnts[left]/sword_cnt)
                - log(sword_cnts[right]/sword_cnt))

    def _merge_pair(self, pair, swords):
        """
        Merges the occurrences of a pair in a list of subwords,
        from left to right
        :param pair: the two subwords to merge, as in ['h', 'e']
        :param swords: the list of subwords of a word
        :return: a new list, where each occurrence of the pair is
            replaced with its concatenation
        """
        # A tuple never equals a list slice: we convert the pair so that
        # ('h', 'e') is merged just as ['h', 'e']
        pair = list(pair)
        pair_str = ''.join(pair)
        merged = []
        i = 0
        while i < len(swords):
            if swords[i:i + 2] == pair:
                merged.append(pair_str)
                i += 2
            else:
                merged.append(swords[i])
                i += 1
        return merged

    def fit(self, text):
        """
        Learns at most merge_cnt merge operations from a corpus.
        At each iteration, the pair with the highest gain is merged in
        all the words. The operations are stored in merge_ops.
        :param text: the training corpus
        """
        self._wp_init(text)

        self.merge_ops = []
        for _ in range(self.merge_cnt):
            self._calc_pair_gains()
            if not self.pair_gains:
                # Each word is reduced to one subword: nothing to merge.
                # max() would raise a ValueError on the empty counter
                break
            self.best_pair = max(self.pair_gains,
                                 key=self.pair_gains.get)
            merge_op = list(self.best_pair)
            self.merge_ops.append(merge_op)
            for word_dict in self.words_wp.values():
                word_dict['swords'] = self._merge_pair(merge_op,
                                                       word_dict['swords'])

        self._build_vocab()

    def _build_vocab(self):
        """
        Adds the subwords resulting from the merge operations to the
        vocabulary and creates the subword regex
        """
        swords = [''.join(pair) for pair in self.merge_ops]
        # Two merge operations may yield the same string:
        # we keep the first occurrence only
        self.vocab = list(dict.fromkeys(self.vocab + swords))
        self._create_regex()

    def _create_regex(self):
        """
        Sorts the vocabulary from the longest to the shortest subword
        and stores in sword_regex the disjunction of the subwords
        """
        # The alternatives are tried from left to right:
        # the longest subwords must come first
        self.vocab.sort(key=lambda x: -len(x))
        # We escape metachars as for '.', but only in the regex:
        # the vocabulary keeps the subwords as they are
        self.sword_regex = '|'.join(re.escape(word) for word in self.vocab)

    def encode(self, word):
        """
        Segments a word into subwords of the vocabulary
        :param word: a pretoken, starting with '▁'
        :return: the list of subwords, or [unk_word] if a part of the
            word is not covered by the vocabulary
        """
        subwords = re.findall(self.sword_regex, word)
        # findall() skips the characters it cannot match:
        # the concatenation then differs from the word
        if ''.join(subwords) != word:
            # some subwords are not in the vocabulary
            subwords = [self.unk_word]
        return subwords

In [40]:
wp = WordPiece()
wp.fit(text)

In [41]:
wp.encode('▁Therefore')

['▁The', 're', 'fore']

In [42]:
wp.encode('▁touché')

['[UNK]']

We can finally write the complete subword tokenization function. We use a cache to speed up the search: a word that has already been encoded is not encoded again.

In [43]:
class WordPiece:
    """
    WordPiece tokenizer: learning of the merge operations from a
    corpus with fit() and segmentation of a text into subwords with
    tokenize().
    """

    def __init__(self, merge_cnt=200):
        """
        :param merge_cnt: maximal number of merge operations in fit()
        """
        # Either one punctuation sign or a run of characters that are
        # neither white spaces nor punctuation
        self.pattern = r'\p{P}|[^\s\p{P}]+'
        self.merge_cnt = merge_cnt
        # Symbol returned for the words that cannot be segmented
        self.unk_word = '[UNK]'

    def pretokenize(self, text):
        """
        Splits a text with the pattern and marks the start of each
        pretoken with '▁'
        :param text: the string to split
        :return: the list of pretokens, each one prefixed with '▁'
        """
        return ['▁' + word for word in re.findall(self.pattern, text)]

    def _wp_init(self, text):
        """
        Creates words_wp, that maps each distinct pretoken to its
        frequency and to its list of characters, and vocab, the list of
        the distinct characters
        :param text: the training corpus
        """
        word_cnts = Counter(self.pretokenize(text))
        self.words_wp = {
            word: {'freq': freq, 'swords': list(word)}
            for word, freq in word_cnts.items()}
        self.vocab = list(
            {char for word_dict in self.words_wp.values()
             for char in word_dict['swords']})

    def _calc_pair_gains(self):
        """
        Stores in pair_gains, for each pair (x, y) of adjacent subwords,
        C(xy) * (log P(xy) - log P(x) - log P(y)).
        The counts are weighted by the word frequencies.
        """
        sword_cnts = Counter()
        self.pair_gains = Counter()
        for word_dict in self.words_wp.values():
            swords, freq = word_dict['swords'], word_dict['freq']
            for sword in swords:
                sword_cnts[sword] += freq
            for pair in zip(swords, swords[1:]):
                self.pair_gains[pair] += freq
        pair_cnt = sum(self.pair_gains.values())
        sword_cnt = sum(sword_cnts.values())
        # The counts are replaced in place with the gains
        for (left, right), cnt in self.pair_gains.items():
            self.pair_gains[(left, right)] = cnt * (
                log(cnt/pair_cnt)
                - log(sword_cnts[left]/sword_cnt)
                - log(sword_cnts[right]/sword_cnt))

    def _merge_pair(self, pair, swords):
        """
        Merges the occurrences of a pair in a list of subwords,
        from left to right
        :param pair: the two subwords to merge, as in ['h', 'e']
        :param swords: the list of subwords of a word
        :return: a new list, where each occurrence of the pair is
            replaced with its concatenation
        """
        # A tuple never equals a list slice: we convert the pair so that
        # ('h', 'e') is merged just as ['h', 'e']
        pair = list(pair)
        pair_str = ''.join(pair)
        merged = []
        i = 0
        while i < len(swords):
            if swords[i:i + 2] == pair:
                merged.append(pair_str)
                i += 2
            else:
                merged.append(swords[i])
                i += 1
        return merged

    def fit(self, text):
        """
        Learns at most merge_cnt merge operations from a corpus.
        At each iteration, the pair with the highest gain is merged in
        all the words. The operations are stored in merge_ops.
        :param text: the training corpus
        """
        self._wp_init(text)

        self.merge_ops = []
        for _ in range(self.merge_cnt):
            self._calc_pair_gains()
            if not self.pair_gains:
                # Each word is reduced to one subword: nothing to merge.
                # max() would raise a ValueError on the empty counter
                break
            self.best_pair = max(self.pair_gains,
                                 key=self.pair_gains.get)
            merge_op = list(self.best_pair)
            self.merge_ops.append(merge_op)
            for word_dict in self.words_wp.values():
                word_dict['swords'] = self._merge_pair(merge_op,
                                                       word_dict['swords'])

        self._build_vocab()

    def _build_vocab(self):
        """
        Adds the subwords resulting from the merge operations to the
        vocabulary and creates the subword regex
        """
        swords = [''.join(pair) for pair in self.merge_ops]
        # Two merge operations may yield the same string:
        # we keep the first occurrence only
        self.vocab = list(dict.fromkeys(self.vocab + swords))
        self._create_regex()

    def _create_regex(self):
        """
        Sorts the vocabulary from the longest to the shortest subword
        and stores in sword_regex the disjunction of the subwords
        """
        # The alternatives are tried from left to right:
        # the longest subwords must come first
        self.vocab.sort(key=lambda x: -len(x))
        # We escape metachars as for '.', but only in the regex:
        # the vocabulary keeps the subwords as they are
        self.sword_regex = '|'.join(re.escape(word) for word in self.vocab)

    def encode(self, word):
        """
        Segments a word into subwords of the vocabulary
        :param word: a pretoken, starting with '▁'
        :return: the list of subwords, or [unk_word] if a part of the
            word is not covered by the vocabulary
        """
        subwords = re.findall(self.sword_regex, word)
        # findall() skips the characters it cannot match:
        # the concatenation then differs from the word
        if ''.join(subwords) != word:
            # some subwords are not in the vocabulary
            subwords = [self.unk_word]
        return subwords

    def tokenize(self, text):
        """
        Segments a text into subwords
        :param text: the string to tokenize
        :return: the list of the subwords of all the pretokens of text
        """
        # Segmentations already computed in this call, indexed by word
        cache = {}
        tokenized_text = []
        for word in self.pretokenize(text):
            if word not in cache:
                cache[word] = self.encode(word)
            tokenized_text.extend(cache[word])
        return tokenized_text

In [44]:
wp = WordPiece()
wp.fit(text)

In [45]:
wp.tokenize(text)

['▁',
 'B',
 'O',
 'O',
 'K',
 '▁I',
 '▁S',
 'ing',
 '▁,',
 '▁',
 'O',
 '▁god',
 'd',
 'es',
 's',
 '▁,',
 '▁the',
 '▁an',
 'g',
 'er',
 '▁of',
 '▁Ach',
 'ill',
 'es',
 '▁son',
 '▁of',
 '▁P',
 'e',
 'le',
 'us',
 '▁,',
 '▁that',
 '▁br',
 'ough',
 't',
 '▁c',
 'oun',
 't',
 'le',
 's',
 's',
 '▁',
 'ill',
 's',
 '▁upon',
 '▁the',
 '▁Ach',
 'a',
 'e',
 'ans',
 '▁.',
 '▁M',
 'an',
 'y',
 '▁a',
 '▁br',
 'a',
 've',
 '▁so',
 'u',
 'l',
 '▁did',
 '▁it',
 '▁se',
 'n',
 'd',
 '▁h',
 'ur',
 'r',
 'y',
 'ing',
 '▁d',
 'own',
 '▁to',
 '▁H',
 'ad',
 'es',
 '▁,',
 '▁and',
 '▁man',
 'y',
 '▁a',
 '▁her',
 'o',
 '▁did',
 '▁it',
 '▁',
 'y',
 'i',
 'e',
 'ld',
 '▁a',
 '▁p',
 're',
 'y',
 '▁to',
 '▁d',
 'o',
 'g',
 's',
 '▁and',
 '▁',
 'v',
 'u',
 'l',
 't',
 'ur',
 'es',
 '▁,',
 '▁for',
 '▁so',
 '▁were',
 '▁the',
 '▁c',
 'oun',
 'sel',
 's',
 '▁of',
 '▁Jove',
 '▁f',
 'u',
 'l',
 'f',
 'ill',
 'ed',
 '▁from',
 '▁the',
 '▁d',
 'ay',
 '▁on',
 '▁which',
 '▁the',
 '▁son',
 '▁of',
 '▁A',
 't',
 're',
 'us',
 

In [46]:
swords = wp.tokenize('Sit careless in the shade!')
swords

['▁S',
 'it',
 '▁c',
 'ar',
 'e',
 'le',
 's',
 's',
 '▁in',
 '▁the',
 '▁sh',
 'ad',
 'e',
 '▁',
 '!']

## BERT's WordPiece

In [47]:
def bert_wp(swords, unk_word='[UNK]'):
    """
    Converts a list of subwords using the '▁' start-of-word mark into
    BERT's notation, where the subwords that continue a word are
    prefixed with '##'
    :param swords: the list of subwords, for instance the output of
        WordPiece.tokenize()
    :param unk_word: symbol of the unknown words; it is left unchanged
        instead of being turned into '##[UNK]'
    :return: the list of subwords in BERT's notation
    """
    # A '▁' standing alone is attached to the subword that follows it
    joined = []
    i = 0
    while i < len(swords):
        if swords[i] == '▁' and i + 1 < len(swords):
            joined.append(swords[i] + swords[i + 1])
            i += 2
        else:
            joined.append(swords[i])
            i += 1

    bert_swords = []
    for sword in joined:
        if sword == unk_word:
            bert_swords.append(sword)
        elif sword.startswith('▁'):
            # Start of a word: we remove the mark
            bert_swords.append(sword[1:])
        else:
            bert_swords.append('##' + sword)
    return bert_swords

In [48]:
bert_wp(swords)

['S',
 '##it',
 'c',
 '##ar',
 '##e',
 '##le',
 '##s',
 '##s',
 'in',
 'the',
 'sh',
 '##ad',
 '##e',
 '!']