In [10]:
from __future__ import annotations

import string
import re
from collections import defaultdict

# Tokenization Steps
In this exercise, we will code our own tokenizer from scratching using base Python. This exercise will help us get to know see some of the tokenization steps better.

## Define Sample Text
Let's first define some sample text we will use to test our tokenization steps.

In [11]:
sample_text = '''Mr. Louis continued to say, "Penguins are important,
but we mustn't forget the nuumber 1 priority: the READER!"
'''

print(sample_text)

Mr. Louis continued to say, "Penguins are important,
but we must not forget the number 1 priority: the READER!"



## Normalization
- convert text into lowercase
- remove accented characters

In [19]:
def normalize_text(text: str) -> str:
    acceptable_characters = (
        string.ascii_letters
        + string.digits
        + string.punctuation
        + string.whitespace
    )
    normalized_text = ''.join(
        filter(lambda letter: letter in  acceptable_characters, text)
    )
    # Make text lower-case
    normalized_text = normalized_text.lower()
    return normalized_text

In [20]:
# Test out the normalization
normalize_text(sample_text)

'mr. louis continued to say, "penguins are important,\nbut we must not forget the number 1 priority: the reader!"\n'

## Pretokenization
This step will take in the normalized text and pretokenize the text into a list of smaller pieces.

In [21]:
def pretokenize_text(text: str) -> list[str]:
    smaller_pieces = text.split()
    return smaller_pieces

In [22]:
# Test out our pretokenization step (after normalizing the text)
normalized_text = normalize_text(sample_text)
pretokenize_text(normalized_text)

['mr.',
 'louis',
 'continued',
 'to',
 'say,',
 '"penguins',
 'are',
 'important,',
 'but',
 'we',
 'must',
 'not',
 'forget',
 'the',
 'number',
 '1',
 'priority:',
 'the',
 'reader!"']

## Tokenization
This step will take in the list of pretokenized pieces (after the text has been normalized) into the tokens that will be used.

In [23]:
# Combine normalization and pretokenization steps before breaking things further
def tokenize_text(text: str) -> list[str]:
    normalized_text: str = normalize_text(text)
    pertokenized_text: list[str] = pretokenize_text(normalized_text)
    tokens = []
    for word in pertokenized_text:
        tokens.extend(
            re.findall(
                f'[\w]+|[{string.punctuation}]', # Split word at punctuations
                word,
            )
        )
    return tokens

In [24]:
# Test out our tokenization (that uses normalizing & pretokenizing functions)
tokenize_text(sample_text)

['mr',
 '.',
 'louis',
 'continued',
 'to',
 'say',
 ',',
 '"',
 'penguins',
 'are',
 'important',
 ',',
 'but',
 'we',
 'must',
 'not',
 'forget',
 'the',
 'number',
 '1',
 'priority',
 ':',
 'the',
 'reader',
 '!',
 '"']

## Postprocessing
This final step will take in the list of tokens from the original text and add any special tokens to the text.

In [26]:
# Useful for some tasks
def postprocess_tokens(tokens: list[str]) -> list[str]:
  # Add beginning and end of sequence tokens to our tokenized text
    bos_token = '[BOS]'
    eos_token = '[EOS]'
    updated_tokens = (
        [bos_token]
        + tokens
        + [eos_token]
    )
    return updated_tokens

In [27]:
# Test full pipeline (normalizing, pretokenizing, tokenizing & postprocessing)
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)

print(tokens)

['[BOS]', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'must', 'not', 'forget', 'the', 'number', '1', 'priority', ':', 'the', 'reader', '!', '"', '[EOS]']


# Encoding & Decoding
## Encoding Text to Token IDs
Create an encoder (`encode()`) that will encode the token strings to integer IDs by defining how to map each token to a unique ID.

In [29]:
# Sample corpus (normally this would be much bigger)
sample_corpus = (
    '''Mr. Louis continued to say, "Penguins are important, \nbut we mustn't forget the nuumber 1 priority: the READER!"''',
    '''BRUTUS:\nHe's a lamb indeed, that baes like a bear.''',
    '''Both by myself and many other friends:\mBut he, his own affections' counsellor,\nIs to himself--I will not say how true--\nBut to himself so secret and so close,'''
)

In [32]:
# Create an encoder to transform token strings to IDs using the sample
# corpus as the basis of your encoding

unique_tokens = set()
for text in sample_corpus:
    tokens_from_text = tokenize_text(text)
    tokens_from_text = postprocess_tokens(tokens_from_text)
    unique_tokens.update(tokens_from_text)

# Create mapping (dictionary) for unique tokens using arbitraty & unique IDs
token2id = defaultdict(lambda : 0) # Allow for unknown tokens to map to 0
token2id |= {
    token: idx
    for idx, token in enumerate(unique_tokens, 1) # Skip 0 (represents unknown)
}

# A mapping for IDs to convert back to token
id2token = defaultdict(lambda: '[UNK]') # Allow for unknown token ('[UNK]')
id2token |= {
    idx: token
    for token, idx in token2id.items()
}


def encode(tokens: list[str]) -> list[int]:
    encoded_tokens = [token2id[token] for token in tokens]
    return encoded_tokens


## Test `encode()`

In [30]:
# Use sample text for testing
sample_text = sample_corpus[0]
# Create tokens (to be fed to encode())
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)
print(f'Tokens:\n{tokens}\n')

Tokens:
['[BOS]', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '[EOS]']



In [33]:
# Test encode()
encoded_tokens = encode(tokens)
print(f'Encoded Tokens:\n{encoded_tokens}\n')

Encoded Tokens:
[32, 5, 38, 48, 20, 21, 3, 40, 36, 26, 25, 8, 40, 39, 24, 49, 1, 4, 6, 50, 41, 34, 37, 29, 50, 19, 12, 36, 13]



## Decoding Token IDs to Text
Based on the encoder we created (`encode()`), create a decoder(`decode()`) to take a list of token IDs and map them to their associated token.

In [39]:
def decode(ids: list[int]) -> list[str]:
    token_strings = [id2token[idx] for idx in ids]
    return token_strings

## Test `decode()`

In [35]:
# Use sample text for testing
sample_text = sample_corpus[0]
# Create tokens
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)
print(f'Tokens:\n{tokens}\n')

# Create token IDs (to be fed to decode())
encoded_tokens = encode(tokens)
print(f'Encoded Tokens:\n{encoded_tokens}\n')

Tokens:
['[BOS]', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '[EOS]']

Encoded Tokens:
[32, 5, 38, 48, 20, 21, 3, 40, 36, 26, 25, 8, 40, 39, 24, 49, 1, 4, 6, 50, 41, 34, 37, 29, 50, 19, 12, 36, 13]



In [40]:
# Test out decode()
decoded_tokens = decode(encoded_tokens)
print(f'Decoded Tokens:\n{decoded_tokens}\n')

Decoded Tokens:
['[BOS]', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '[EOS]']

