In [7]:
%cd ..
!pwd

/Users/sieunpark/Documents/GitHub/simple-tokenization
/Users/sieunpark/Documents/GitHub/simple-tokenization


In [19]:
from copy import deepcopy, copy
from typing import Union, Optional
from dataclasses import dataclass, field

@dataclass
class WordPieceVocab:
    token_to_ids: dict = field(default_factory=lambda: {})

    @classmethod
    def from_file(cls, filepath):
        token_to_ids = dict()
        with open(filepath, "r", encoding="utf-8") as reader:
            tokens = reader.readlines()
        for index, token in enumerate(tokens):
            token = token.rstrip("\n")
            token_to_ids[token] = index
        return cls(token_to_ids=token_to_ids) 

    @classmethod
    def from_dict(cls, token_to_ids: dict):
        return cls(token_to_ids=token_to_ids)

    @property
    def vocab_size(self):
        return len(self.token_to_ids)

def add_vocab(vocab, tokens: Union[str, list[str]]):
    if isinstance(tokens, str):
        tokens = [tokens]

    new_token_to_ids = copy(vocab.token_to_ids)
    for token in tokens:
        new_token_to_ids[token] = len(new_token_to_ids)        

    return WordPieceVocab.from_dict(new_token_to_ids)

def add_special_tokens(
    vocab,
    )
    
vocab = WordPieceVocab.from_file("sample-vocab.txt")
print(vocab)

vocab = add_vocab(vocab, tokens=["EOS", "BOS", "SEP"])
print(vocab)

WordPieceVocab(token_to_ids={'a': 0, 'b': 1, 'c': 2, 'aaa': 3, 'abc': 4})
WordPieceVocab(token_to_ids={'a': 0, 'b': 1, 'c': 2, 'aaa': 3, 'abc': 4, 'EOS': 5, 'BOS': 6, 'SEP': 7})


In [None]:
# normailzers

def unicode_normalize(text, format: str)

In [13]:
@dataclass
class WordPieceConfig:
    lowercase: bool = 

def wordpiece_tokenize(text: str, vocab: WordPieceVocab, config: WordPieceConfig)

WordPieceVocab(vocab={'a': 0, 'b': 1, 'c': 2, 'aaa': 3, 'abc': 4, 'j': 10})

In [2]:
# invalid character removal and whitespace cleanup on text

from tokenizers import SentencePieceBPETokenizer



In [6]:
tk = SentencePieceBPETokenizer()

tk.train(
    files=s,
    vocab_size=10,
)
tk

Exception: File name too long (os error 63)

In [3]:
from transformers import AutoTokenizer

tk = AutoTokenizer.from_pretrained("baseten/Meta-Llama-3-tokenizer")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
enc = tk.encode("Hello, my name is Llama.")
enc

[128000, 9906, 11, 856, 836, 374, 445, 81101, 13]

In [None]:
dec = tk.decode(enc)
dec

'<|begin_of_text|>Hello, my name is Llama.'

In [10]:
tk.decode([856])

' my'

In [11]:
text = "Hello, my name is Llama."
tokens = tk.encode(text, add_special_tokens=False)
decoded_tokens = tk.decode(tokens).split()

tokens, decoded_tokens

([9906, 11, 856, 836, 374, 445, 81101, 13],
 ['Hello,', 'my', 'name', 'is', 'Llama.'])

In [20]:
tk.decode(856).replace(" ", "_")

'_my'

In [13]:
decoded_tokens = tk.decode([[t] for t in tokens])

TypeError: argument 'ids': 'list' object cannot be interpreted as an integer

In [14]:
[[t] for t in tokens]

[[9906], [11], [856], [836], [374], [445], [81101], [13]]

TypeError: argument 'ids': 'list' object cannot be interpreted as an integer

In [21]:
tk.encode("ðŸš€")


[128000, 9468, 248, 222]

In [24]:
tk.decode(9468)

'ï¿½'

In [25]:
tokenizer = tk# Token ID
token_id = 9468

# Decode the token ID to a string
decoded_string = tokenizer.decode([token_id])

# Convert the decoded string to bytes
decoded_bytes = decoded_string.encode('utf-8')


In [28]:
print(f"Decoded string: {decoded_string}")
print(f"Decoded bytes: {decoded_bytes}")


Decoded string: ï¿½
Decoded bytes: b'\xef\xbf\xbd'


In [32]:
from transformers import LlamaTokenizer

# Initialize the tokenizer

# Get the class name of the tokenizer
print("Tokenizer class name:", tokenizer.__class__.__name__)

# Get the parent (base) classes of the tokenizer class
parent_classes = [base.__name__ for base in tokenizer.__class__.__bases__]
print("Parent classes:", parent_classes)


Tokenizer class name: PreTrainedTokenizerFast
Parent classes: ['PreTrainedTokenizerBase']


In [36]:
import re
import collections

def get_stats(vocab):
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i],symbols[i+1]] += freq
    return pairs

def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out

def get_vocab(text):
    words = text.split()
    vocab = collections.defaultdict(int)
    for word in words:
        vocab[' '.join(word)] += 1
    return vocab

def bpe_tokenization(text, num_merges):
    vocab = get_vocab(text)
    print("Initial Vocabulary:")
    for word, freq in vocab.items():
        print(f"{word}: {freq}")
    
    for i in range(num_merges):
        print(f"\n--- Merge Operation {i+1} ---")
        pairs = get_stats(vocab)
        if not pairs:
            print("No more pairs to merge.")
            break
        
        best = max(pairs, key=pairs.get)
        print(f"Most frequent pair: {best}")
        print(f"Pair frequency: {pairs[best]}")
        
        vocab = merge_vocab(best, vocab)
        print("Updated Vocabulary:")
        for word, freq in vocab.items():
            print(f"{word}: {freq}")
    
    return vocab

text = "this is a test. we are testing BPE tokenization. BPE is really cool."
num_merges = 10

print("Final BPE Vocabulary:")
final_vocab = bpe_tokenization(text, num_merges)

Final BPE Vocabulary:
Initial Vocabulary:
t h i s: 1
i s: 2
a: 1
t e s t .: 1
w e: 1
a r e: 1
t e s t i n g: 1
B P E: 2
t o k e n i z a t i o n .: 1
r e a l l y: 1
c o o l .: 1

--- Merge Operation 1 ---
Most frequent pair: ('i', 's')
Pair frequency: 3
Updated Vocabulary:
t h is: 1
is: 2
a: 1
t e s t .: 1
w e: 1
a r e: 1
t e s t i n g: 1
B P E: 2
t o k e n i z a t i o n .: 1
r e a l l y: 1
c o o l .: 1

--- Merge Operation 2 ---
Most frequent pair: ('t', 'e')
Pair frequency: 2
Updated Vocabulary:
t h is: 1
is: 2
a: 1
te s t .: 1
w e: 1
a r e: 1
te s t i n g: 1
B P E: 2
t o k e n i z a t i o n .: 1
r e a l l y: 1
c o o l .: 1

--- Merge Operation 3 ---
Most frequent pair: ('te', 's')
Pair frequency: 2
Updated Vocabulary:
t h is: 1
is: 2
a: 1
tes t .: 1
w e: 1
a r e: 1
tes t i n g: 1
B P E: 2
t o k e n i z a t i o n .: 1
r e a l l y: 1
c o o l .: 1

--- Merge Operation 4 ---
Most frequent pair: ('tes', 't')
Pair frequency: 2
Updated Vocabulary:
t h is: 1
is: 2
a: 1
test .: 1
w e: 1
a r e

In [40]:
import re
import collections
from transformers import GPT2Tokenizer

class BPETokenizer:
    def __init__(self, text, num_merges):
        """
        Initialize BPE Tokenizer
        
        Args:
            text (str): Training text for initial vocabulary
            num_merges (int): Number of merge operations
        """
        self.num_merges = num_merges
        self.vocab = self.get_vocab(text)
        self.merge_rules = []
        self.perform_bpe()

    def get_vocab(self, text):
        """Create initial vocabulary of characters"""
        words = text.split()
        vocab = collections.defaultdict(int)
        for word in words:
            vocab[' '.join(word)] += 1
        return vocab

    def get_stats(self):
        """Calculate pair frequencies"""
        pairs = collections.defaultdict(int)
        for word, freq in self.vocab.items():
            symbols = word.split()
            for i in range(len(symbols)-1):
                pairs[symbols[i], symbols[i+1]] += freq
        return pairs

    def merge_vocab(self, pair):
        """Merge most frequent pair in vocabulary"""
        new_vocab = {}
        bigram = re.escape(' '.join(pair))
        p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
        
        for word in self.vocab:
            w_out = p.sub(''.join(pair), word)
            new_vocab[w_out] = self.vocab[word]
        
        return new_vocab

    def perform_bpe(self):
        """Perform Byte Pair Encoding"""
        for _ in range(self.num_merges):
            pairs = self.get_stats()
            if not pairs:
                break
            
            best = max(pairs, key=pairs.get)
            self.merge_rules.append(best)
            self.vocab = self.merge_vocab(best)

    def tokenize(self, text):
        """Tokenize new text using learned merge rules"""
        words = text.split()
        tokenized_words = []
        
        for word in words:
            # Start with characters
            chars = ' '.join(word)
            
            # Apply merge rules
            for merge in self.merge_rules:
                bigram = ' '.join(merge)
                chars = chars.replace(bigram, ''.join(merge))
            
            tokenized_words.append(chars.split())
        
        return tokenized_words

def verify_with_huggingface(bpe_tokens, text):
    """
    Verify BPE tokenization against HuggingFace GPT-2 Tokenizer
    
    Args:
        bpe_tokens (list): Tokens from custom BPE tokenizer
        text (str): Original text
    """
    # Use GPT-2 tokenizer as reference
    hf_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    
    # Tokenize with HuggingFace
    hf_tokens = hf_tokenizer.tokenize(text)
    
    print("Custom BPE Tokens:")
    for word_tokens in bpe_tokens:
        print(word_tokens)
    
    print("\nHuggingFace Tokens:")
    print(hf_tokens)
    
    # Basic comparison
    print("\nComparison:")
    print(f"Custom BPE unique tokens: {len(set(sum(bpe_tokens, [])))}") 
    print(f"HuggingFace unique tokens: {len(set(hf_tokens))}")

# Example usage
training_text = "If a language model is not available in the language you are interested in, or if your corpus is very different from the one your language model was trained on, you will most likely want to retrain the model from scratch using a tokenizer adapted to your data. That will require training a new tokenizer on your dataset. But what exactly does that mean? When we first looked at tokenizers in Chapter 2, we saw that most Transformer models use a subword tokenization algorithm. To identify which subwords are of interest and occur most frequently in the corpus at hand, the tokenizer needs to take a hard look at all the texts in the corpus â€” a process we call training. The exact rules that govern this training depend on the type of tokenizer used, and weâ€™ll go over the three main algorithms later in this chapter."
inference_text = "BPE tokenization is awesome for natural language processing."

# Create BPE tokenizer
bpe_tokenizer = BPETokenizer(training_text, num_merges=20)

# Tokenize inference text
bpe_tokens = bpe_tokenizer.tokenize(inference_text)

# Verify with HuggingFace
verify_with_huggingface(bpe_tokens, inference_text)

Custom BPE Tokens:
['B', 'P', 'E']
['tokeni', 'z', 'at', 'i', 'o', 'n']
['i', 's']
['a', 'w', 'e', 's', 'o', 'm', 'e']
['f', 'or']
['n', 'at', 'u', 'r', 'a', 'l']
['l', 'an', 'g', 'u', 'a', 'g', 'e']
['p', 'r', 'o', 'c', 'e', 's', 's', 'in', 'g', '.']

HuggingFace Tokens:
['B', 'PE', 'Ä token', 'ization', 'Ä is', 'Ä awesome', 'Ä for', 'Ä natural', 'Ä language', 'Ä processing', '.']

Comparison:
Custom BPE unique tokens: 25
HuggingFace unique tokens: 11


In [42]:
training_text

'If a language model is not available in the language you are interested in, or if your corpus is very different from the one your language model was trained on, you will most likely want to retrain the model from scratch using a tokenizer adapted to your data. That will require training a new tokenizer on your dataset. But what exactly does that mean? When we first looked at tokenizers in Chapter 2, we saw that most Transformer models use a subword tokenization algorithm. To identify which subwords are of interest and occur most frequently in the corpus at hand, the tokenizer needs to take a hard look at all the texts in the corpus â€” a process we call training. The exact rules that govern this training depend on the type of tokenizer used, and weâ€™ll go over the three main algorithms later in this chapter.'

In [43]:
bpe_tokenizer = BPETokenizer(training_text, num_merges=20)

In [46]:
bpe_tokenizer.merge_rules

[('i', 'n'),
 ('t', 'h'),
 ('e', 'r'),
 ('a', 't'),
 ('e', 'n'),
 ('a', 'n'),
 ('th', 'e'),
 ('o', 'r'),
 ('t', 'o'),
 ('m', 'o'),
 ('r', 'e'),
 ('y', 'o'),
 ('yo', 'u'),
 ('s', 't'),
 ('e', 'd'),
 ('u', 's'),
 ('a', 'in'),
 ('to', 'k'),
 ('tok', 'en'),
 ('token', 'i')]

In [57]:
import re
from collections import Counter, defaultdict
from typing import List, Dict, Tuple

class BPETokenizer:
    def __init__(self, vocab_size: int = 10000):
        """
        Initialize BPE Tokenizer
        
        Args:
            vocab_size (int): Maximum number of tokens in the vocabulary
        """
        self.vocab_size = vocab_size
        self.vocab = {}
        self.merges = {}
        self.token_freqs = None
    
    def _get_stats(self, tokens: List[str]) -> Counter:
        """
        Count pair frequencies in the tokens
        
        Args:
            tokens (List[str]): List of tokens
        
        Returns:
            Counter of token pair frequencies
        """
        pairs = Counter()
        for token in tokens:
            symbols = token.split()
            for i in range(len(symbols) - 1):
                pairs[tuple(symbols[i:i+2])] += 1
        return pairs
    
    def _merge_tokens(self, tokens: List[str], pair: Tuple[str, str]) -> List[str]:
        """
        Merge the most frequent pair of tokens
        
        Args:
            tokens (List[str]): List of tokens
            pair (Tuple[str, str]): Pair to merge
        
        Returns:
            List of merged tokens
        """
        new_tokens = []
        i = 0
        while i < len(tokens):
            # Find indices where the pair occurs
            try:
                j = tokens.index(pair[0], i)
                if j + 1 < len(tokens) and tokens[j + 1] == pair[1]:
                    # Merge the pair
                    new_tokens.append(pair[0] + pair[1])
                    i = j + 2
                else:
                    new_tokens.append(tokens[i])
                    i += 1
            except ValueError:
                new_tokens.append(tokens[i])
                i += 1
        return new_tokens
    
    def train(self, corpus: List[str]):
        """
        Train BPE tokenizer on the given corpus
        
        Args:
            corpus (List[str]): List of training texts
        """
        # Preprocess and initialize tokens
        tokens = [' '.join(list(text)) for text in corpus]
        
        # Initialize token frequencies
        self.token_freqs = Counter(' '.join(tokens).split())
        
        # Merge until reaching vocab size
        while len(self.vocab) < self.vocab_size:
            # Get pair frequencies
            pairs = self._get_stats(tokens)
            
            # If no more pairs can be merged, break
            if not pairs:
                break
            
            # Get the most frequent pair
            best_pair = max(pairs, key=pairs.get)
            
            # Merge tokens
            tokens = [' '.join(self._merge_tokens(token.split(), best_pair)) 
                      for token in tokens]
            
            # Update vocabulary and merges
            merged_token = best_pair[0] + best_pair[1]
            self.vocab[merged_token] = len(self.vocab)
            self.merges[best_pair] = merged_token
        
        # Create reverse vocabulary
        self.id_to_token = {v: k for k, v in self.vocab.items()}
    
    def encode(self, text: str) -> List[int]:
        """
        Encode text into token ids
        
        Args:
            text (str): Input text to encode
        
        Returns:
            List of token ids
        """
        # Convert to list of characters
        tokens = list(text)
        
        # Apply merge rules
        while len(tokens) > 1:
            pairs = self._get_stats([' '.join(tokens)])
            
            # If no mergeable pairs, break
            if not pairs:
                break
            
            # Find the first pair that exists in merges
            mergeable_pair = next((pair for pair in pairs if pair in self.merges), None)
            
            # If no mergeable pair found, break
            if mergeable_pair is None:
                break
            
            # Merge the pair
            tokens = self._merge_tokens(tokens, mergeable_pair)
        
        # Convert to token ids
        return [self.vocab.get(token, self.vocab.get('<unk>', 0)) for token in tokens]
    
    def decode(self, token_ids: List[int]) -> str:
        """
        Decode token ids back to text
        
        Args:
            token_ids (List[int]): List of token ids to decode
        
        Returns:
            Decoded text
        """
        # Convert ids to tokens
        tokens = [self.id_to_token.get(tid, '<unk>') for tid in token_ids]
        
        # Join tokens
        return ''.join(tokens)

# Example usage
def main():
    # Training corpus
    corpus = [
        "Hello world!",
        "Natural language processing is fascinating.",
        "Tokenization breaks text into meaningful pieces."
    ]
    
    # Initialize and train tokenizer
    tokenizer = BPETokenizer(vocab_size=100)
    tokenizer.train(corpus)
    
    # Test encoding and decoding
    test_text = "Hello world of NLP!"
    encoded = tokenizer.encode(test_text)
    decoded = tokenizer.decode(encoded)
    
    print("Original Text:", test_text)
    print("Encoded Tokens:", encoded)
    print("Decoded Text:", decoded)
    
    # Print vocabulary
    print("\nVocabulary:")
    for token, token_id in list(tokenizer.vocab.items())[:10]:
        print(f"{token}: {token_id}")

if __name__ == "__main__":
    main()

KeyboardInterrupt: 

In [75]:
text = "If a language model is not available in the language you are interested in, or if your corpus is very different from the one your language model was trained on, you will most likely want to retrain the model from scratch using a tokenizer adapted to your data. That will require training a new tokenizer on your dataset. But what exactly does that mean? When we first looked at tokenizers in Chapter 2, we saw that most Transformer models use a subword tokenization algorithm. To identify which subwords are of interest and occur most frequently in the corpus at hand, the tokenizer needs to take a hard look at all the texts in the corpus â€” a process we call training. The exact rules that govern this training depend on the type of tokenizer used, and weâ€™ll go over the three main algorithms later in this chapter."
vocab_size = 100

In [None]:
# Preprocess the text into words and count frequencies
words = text.split()
word_freq = defaultdict(int)
for word in words:
    chars = list(word) + ['</w>']
    word_freq[tuple(chars)] += 1

In [None]:
from collections import defaultdict


def train_bpe(text, vocab_size):
    # Preprocess the text into words and count frequencies
    words = text.split()
    word_freq = defaultdict(int)
    for word in words:
        chars = list(word) + ['</w>']
        word_freq[tuple(chars)] += 1

    # Initialize vocabulary with individual characters
    vocab = set()
    for word_seq in word_freq:
        for char in word_seq:
            vocab.add(char)
    vocab = sorted(list(vocab))
    merge_rules = []

    while len(vocab) < vocab_size:
        # Count frequency of each adjacent pair
        pair_counts = defaultdict(int)
        for seq, freq in word_freq.items():
            for i in range(len(seq) - 1):
                pair = (seq[i], seq[i+1])
                pair_counts[pair] += freq

        if not pair_counts:
            break  # No more pairs to merge

        # Select the most frequent pair
        most_frequent_pair = max(pair_counts, key=lambda x: (pair_counts[x], x))

        # Merge the most frequent pair
        new_token = ''.join(most_frequent_pair)
        merge_rules.append((most_frequent_pair, new_token))
        vocab.append(new_token)

        # Update word_freq with merged pairs
        new_word_freq = defaultdict(int)
        for seq, freq in word_freq.items():
            new_seq = []
            i = 0
            while i < len(seq):
                if i < len(seq)-1 and (seq[i], seq[i+1]) == most_frequent_pair:
                    new_seq.append(new_token)
                    i += 2
                else:
                    new_seq.append(seq[i])
                    i += 1
            new_word_freq[tuple(new_seq)] += freq
        word_freq = new_word_freq

    return merge_rules, vocab

def tokenize(word, merge_rules):
    tokens = list(word) + ['</w>']
    for pair, merged in merge_rules:
        new_tokens = []
        i = 0
        while i < len(tokens):
            if i < len(tokens)-1 and tokens[i] == pair[0] and tokens[i+1] == pair[1]:
                new_tokens.append(merged)
                i += 2
            else:
                new_tokens.append(tokens[i])
                i += 1
        tokens = new_tokens
    return tokens

# Training
training_text = "If a language model is not available in the language you are interested in, or if your corpus is very different from the one your language model was trained on, you will most likely want to retrain the model from scratch using a tokenizer adapted to your data. That will require training a new tokenizer on your dataset. But what exactly does that mean? When we first looked at tokenizers in Chapter 2, we saw that most Transformer models use a subword tokenization algorithm. To identify which subwords are of interest and occur most frequently in the corpus at hand, the tokenizer needs to take a hard look at all the texts in the corpus â€” a process we call training. The exact rules that govern this training depend on the type of tokenizer used, and weâ€™ll go over the three main algorithms later in this chapter."
text = "low lower lowest"
vocab_size = 100
merge_rules, vocab = train_bpe(training_text, vocab_size)
print("Merge Rules:", merge_rules)
print("Vocabulary:", vocab)

# Tokenization
test_word = "BPE training starts by computing the unique set of words used in the corpus (after the normalization and pre-tokenization steps are completed), then building the vocabulary by taking all the symbols used to write those words. As a very simple example, letâ€™s say our corpus uses these five words:"
tokenized = tokenize(test_word, merge_rules)
print(f"Tokenized '{test_word}':\n", "/".join(tokenized))

Merge Rules: [(('e', '</w>'), 'e</w>'), (('t', '</w>'), 't</w>'), (('i', 'n'), 'in'), (('t', 'h'), 'th'), (('s', '</w>'), 's</w>'), (('e', 'r'), 'er'), (('e', 'n'), 'en'), (('th', 'e</w>'), 'the</w>'), (('t', 'o'), 'to'), (('o', 'r'), 'or'), (('d', '</w>'), 'd</w>'), (('a', 'n'), 'an'), (('l', '</w>'), 'l</w>'), (('er', '</w>'), 'er</w>'), (('a', 't</w>'), 'at</w>'), (('m', 'o'), 'mo'), (('in', '</w>'), 'in</w>'), (('y', 'o'), 'yo'), (('yo', 'u'), 'you'), (('to', 'k'), 'tok'), (('tok', 'en'), 'token'), (('token', 'i'), 'tokeni'), (('tokeni', 'z'), 'tokeniz'), (('r', 'a'), 'ra'), (('a', '</w>'), 'a</w>'), (('y', '</w>'), 'y</w>'), (('t', 'ra'), 'tra'), (('s', 't</w>'), 'st</w>'), (('r', '</w>'), 'r</w>'), (('o', 'n'), 'on'), (('l', 'l</w>'), 'll</w>'), (('d', 'e'), 'de'), (('.', '</w>'), '.</w>'), ((',', '</w>'), ',</w>'), (('you', 'r</w>'), 'your</w>'), (('tra', 'in'), 'train'), (('tokeniz', 'er</w>'), 'tokenizer</w>'), (('r', 'e'), 're'), (('mo', 'de'), 'mode'), (('in', 'g'), 'ing'), 

In [71]:
tokenized

['B',
 'P',
 'E',
 ' ',
 't',
 'ra',
 'in',
 'in',
 'g',
 ' ',
 's',
 't',
 'a',
 'r',
 't',
 's',
 ' ',
 'b',
 'y',
 ' ',
 'c',
 'o',
 'm',
 'p',
 'u',
 't',
 'in',
 'g',
 ' ',
 'th',
 'e',
 ' ',
 'u',
 'n',
 'i',
 'q',
 'u',
 'e',
 ' ',
 's',
 'e',
 't',
 ' ',
 'o',
 'f',
 ' ',
 'w',
 'or',
 'd',
 's',
 ' ',
 'u',
 's',
 'e',
 'd',
 ' ',
 'in',
 ' ',
 'th',
 'e',
 ' ',
 'c',
 'or',
 'p',
 'u',
 's',
 ' ',
 '(',
 'a',
 'f',
 't',
 'er',
 ' ',
 'th',
 'e',
 ' ',
 'n',
 'or',
 'm',
 'a',
 'l',
 'i',
 'z',
 'a',
 't',
 'i',
 'o',
 'n',
 ' ',
 'an',
 'd',
 ' ',
 'p',
 'r',
 'e',
 '-',
 'tokeniz',
 'a',
 't',
 'i',
 'o',
 'n',
 ' ',
 's',
 't',
 'e',
 'p',
 's',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 'c',
 'o',
 'm',
 'p',
 'l',
 'e',
 't',
 'e',
 'd',
 ')',
 ',',
 ' ',
 'th',
 'en',
 ' ',
 'b',
 'u',
 'i',
 'l',
 'd',
 'in',
 'g',
 ' ',
 'th',
 'e',
 ' ',
 'v',
 'o',
 'c',
 'a',
 'b',
 'u',
 'l',
 'a',
 'r',
 'y',
 ' ',
 'b',
 'y',
 ' ',
 't',
 'a',
 'k',
 'in',
 'g',
 ' ',
 'a',
 'l',
 'l',
 ' '

In [68]:
def tokenize(word, merge_rules):
    tokens = list(word) + ['</w>']
    for pair, merged in merge_rules:
        new_tokens = []
        i = 0
        while i < len(tokens):
            if i < len(tokens)-1 and tokens[i] == pair[0] and tokens[i+1] == pair[1]:
                new_tokens.append(merged)
                i += 2
            else:
                new_tokens.append(tokens[i])
                i += 1
        tokens = new_tokens
    return tokens
tokenized = tokenize(test_word, merge_rules)

In [69]:
tokenized

['B',
 'P',
 'E',
 ' ',
 't',
 'ra',
 'in',
 'in',
 'g',
 ' ',
 's',
 't',
 'a',
 'r',
 't',
 's',
 ' ',
 'b',
 'y',
 ' ',
 'c',
 'o',
 'm',
 'p',
 'u',
 't',
 'in',
 'g',
 ' ',
 'th',
 'e',
 ' ',
 'u',
 'n',
 'i',
 'q',
 'u',
 'e',
 ' ',
 's',
 'e',
 't',
 ' ',
 'o',
 'f',
 ' ',
 'w',
 'or',
 'd',
 's',
 ' ',
 'u',
 's',
 'e',
 'd',
 ' ',
 'in',
 ' ',
 'th',
 'e',
 ' ',
 'c',
 'or',
 'p',
 'u',
 's',
 ' ',
 '(',
 'a',
 'f',
 't',
 'er',
 ' ',
 'th',
 'e',
 ' ',
 'n',
 'or',
 'm',
 'a',
 'l',
 'i',
 'z',
 'a',
 't',
 'i',
 'o',
 'n',
 ' ',
 'an',
 'd',
 ' ',
 'p',
 'r',
 'e',
 '-',
 'tokeniz',
 'a',
 't',
 'i',
 'o',
 'n',
 ' ',
 's',
 't',
 'e',
 'p',
 's',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 'c',
 'o',
 'm',
 'p',
 'l',
 'e',
 't',
 'e',
 'd',
 ')',
 ',',
 ' ',
 'th',
 'en',
 ' ',
 'b',
 'u',
 'i',
 'l',
 'd',
 'in',
 'g',
 ' ',
 'th',
 'e',
 ' ',
 'v',
 'o',
 'c',
 'a',
 'b',
 'u',
 'l',
 'a',
 'r',
 'y',
 ' ',
 'b',
 'y',
 ' ',
 't',
 'a',
 'k',
 'in',
 'g',
 ' ',
 'a',
 'l',
 'l',
 ' '