# Tokenization

## Character tokenization

In [36]:
# Lowercase first so that upper- and lower-case variants of a letter count as the same character.
sentence = "I would like to work than machine lerning engineer at Google!".lower()
print(sentence)

# Drop the spaces: only the visible characters should become tokens.
sentence = "".join(sentence.split(" "))
print(sentence)

# A string is already a sequence of characters, so list() yields one token per character.
chars = list(sentence)
print(chars)

# Collapsing the tokens into a set keeps each distinct character once (the character vocabulary).
chars = set(chars)
print(chars)

i would like to work than machine lerning engineer at google!
iwouldliketoworkthanmachinelerningengineeratgoogle!
['i', 'w', 'o', 'u', 'l', 'd', 'l', 'i', 'k', 'e', 't', 'o', 'w', 'o', 'r', 'k', 't', 'h', 'a', 'n', 'm', 'a', 'c', 'h', 'i', 'n', 'e', 'l', 'e', 'r', 'n', 'i', 'n', 'g', 'e', 'n', 'g', 'i', 'n', 'e', 'e', 'r', 'a', 't', 'g', 'o', 'o', 'g', 'l', 'e', '!']
{'l', 't', 'c', 'g', 'o', 'w', 'a', 'u', 'i', 'h', '!', 'n', 'r', 'e', 'k', 'd', 'm'}


## Word tokenization

In [None]:
pip install tensorflow

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy corpus; note the mixed casing and punctuation across the sentences.
sentences = ["i love my dog", "I, love my cat", "You love my dog!"]

# Fit the tokenizer on the corpus so it builds its word index.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# word_index maps every word seen in the corpus to an integer id.
print(tokenizer.word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


In [None]:
pip install nltk

In [8]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs NLTK's Punkt tokenizer data, which is not shipped with the
# nltk package itself. Download it here so the cell also runs on a fresh
# environment. Newer NLTK releases look for "punkt_tab", older ones for "punkt".
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# Tokenize the first sentence of the `sentences` list defined in the Keras cell.
word_tokens = word_tokenize(sentences[0])

print(word_tokens)

['i', 'love', 'my', 'dog']


In [None]:
# Hugging Face Tokenizers documentation: https://huggingface.co/docs/tokenizers/python/latest/

In [None]:
pip install transformers

In [9]:
# NOTE: this import rebinds the name `Tokenizer`, which referred to the Keras class until now.
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Wrap a BPE model in a tokenizer; it is still untrained at this point.
# "[UNK]" is the token used for input the model does not know.
bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)
print(tokenizer)

<tokenizers.Tokenizer object at 0x0000022C59AFE7C0>


In [10]:
from tokenizers.trainers import BpeTrainer

# Special tokens registered with the trainer. Later outputs show "[UNK]" with
# id 0 and "[SEP]" with id 2, which matches their position in this list.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
trainer = BpeTrainer(special_tokens=special_tokens)
print(trainer)

<tokenizers.trainers.BpeTrainer object at 0x0000022C59665110>


In [11]:
from tokenizers.pre_tokenizers import Whitespace

# Configure the pre-tokenization step before training.
# NOTE(review): Whitespace() presumably splits raw text into word/punctuation
# pieces before the BPE model sees it — confirm against the tokenizers docs.
whitespace_splitter = Whitespace()
tokenizer.pre_tokenizer = whitespace_splitter

In [12]:
# Raw WikiText-103 files used as the training corpus, one file per split.
splits = ["test", "train", "valid"]
files = [f"data/lecture-03/wikitext-103-raw/wiki.{split}.raw" for split in splits]

# Train the tokenizer on those files with the trainer configured earlier.
tokenizer.train(files, trainer)

In [13]:
# Persist the trained tokenizer to a single JSON file so it can be reloaded later.
tokenizer_path = "data/lecture-03/tokenizer-wiki.json"
tokenizer.save(tokenizer_path)

In [14]:
# Reload the tokenizer from the JSON file written by the save step.
tokenizer_path = "data/lecture-03/tokenizer-wiki.json"
tokenizer = Tokenizer.from_file(tokenizer_path)

In [15]:
# Encode one sentence. The emoji ends up as "[UNK]" in the tokens printed in the next cell.
text = "Hello, y'all! How are you 😁 ?"
output = tokenizer.encode(text)
print(output)

Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [16]:
# Tokens and ids are parallel lists: ids[i] is the vocabulary id of tokens[i].
print(output.tokens)
print(output.ids)

# Locate the "[UNK]" token instead of hard-coding its position, then print its
# offsets, i.e. the (start, end) span it covers in the original input text.
unk_index = output.tokens.index("[UNK]")
print(output.offsets[unk_index])

['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?']
[27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
(26, 27)


In [17]:
# Look up the vocabulary id of the "[SEP]" special token (displayed as the cell result).
tokenizer.token_to_id("[SEP]")

2

In [18]:
from tokenizers.processors import TemplateProcessing

# Resolve the ids of the special tokens used by the templates.
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")

# Post-processing templates: a single sentence becomes "[CLS] A [SEP]", a pair
# becomes "[CLS] A [SEP] B [SEP]". The ":1" suffix gives the second sentence and
# its closing "[SEP]" type id 1 (see the type_ids printed further down).
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)

In [19]:
# `output` still holds the encoding made before the post-processor was set,
# so it contains no "[CLS]" / "[SEP]" tokens.
print(output.tokens)

# Re-encode as a sentence pair; the post-processor now adds the special tokens.
first_sentence = "Hello, y'all!"
second_sentence = "How are you 😁 ?"
output = tokenizer.encode(first_sentence, second_sentence)
print(output.tokens)

['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?']
['[CLS]', 'Hello', ',', 'y', "'", 'all', '!', '[SEP]', 'How', 'are', 'you', '[UNK]', '?', '[SEP]']


In [20]:
# type_ids mark the segment of each token: 0 for "[CLS]", the first sentence and
# its "[SEP]"; 1 for the second sentence and the closing "[SEP]".
print(output.type_ids)

[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]


## Encoding multiple sentences in a batch

In [21]:
# Encode several independent sentences in one call; the result holds one encoding per sentence.
batch_sentences = ["Hello, y'all!", "How are you 😁 ?"]
output = tokenizer.encode_batch(batch_sentences)

In [22]:
# A batch may also consist of sentence pairs: each entry is a [first, second] list.
sentence_pairs = [
    ["Hello, y'all!", "How are you 😁 ?"],
    ["Hello to you too!", "I'm fine, thank you!"],
]
output = tokenizer.encode_batch(sentence_pairs)

In [23]:
# Pad every encoding in a batch with "[PAD]". The id is looked up from the
# vocabulary rather than hard-coded, so it stays correct if the special-token
# order changes (same approach as the "[CLS]" / "[SEP]" lookups above).
pad_token = "[PAD]"
tokenizer.enable_padding(pad_id=tokenizer.token_to_id(pad_token), pad_token=pad_token)

In [24]:
# Re-encode the batch now that padding is enabled, then inspect the second
# sentence: its tokens end with a "[PAD]" entry.
batch_sentences = ["Hello, y'all!", "How are you 😁 ?"]
output = tokenizer.encode_batch(batch_sentences)
print(output[1].tokens)

['[CLS]', 'How', 'are', 'you', '[UNK]', '?', '[SEP]', '[PAD]']


In [25]:
# The attention mask is 1 for real tokens and 0 for the "[PAD]" position added by padding.
print(output[1].attention_mask)

[1, 1, 1, 1, 1, 1, 1, 0]


## Most commonly used tokenizer models (we will cover all of them in the advanced NLP lectures)

In [26]:
from tokenizers.models import BPE
from tokenizers.models import Unigram
from tokenizers.models import WordLevel
from tokenizers.models import WordPiece

## Using a pretrained tokenizer

In [27]:
from tokenizers import BertWordPieceTokenizer

# Build a WordPiece tokenizer from an existing BERT vocabulary file instead of
# training one; lowercase=True matches the "uncased" vocabulary.
vocab_path = "data/lecture-03/vocabs/bert-base-uncased-vocab.txt"
tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=True)

In [28]:
# Encode a sentence pair with the pretrained tokenizer; the printed tokens are
# lowercased and already wrapped in "[CLS]" / "[SEP]".
first_sentence = "Hello, y'all!"
second_sentence = "How are you 😁 ?"
output = tokenizer.encode(first_sentence, second_sentence)
print(output.tokens)

['[CLS]', 'hello', ',', 'y', "'", 'all', '!', '[SEP]', 'how', 'are', 'you', '[UNK]', '?', '[SEP]']
