# Tokenization

## Character tokenization

In [1]:
# Character-level tokenization: lowercase the text, strip the spaces, split what
# is left into single characters, then keep the distinct ones as the vocabulary.
sentence = "I would like to work than machine lerning engineer at Google!"
sentence = sentence.lower()
print(sentence)

# Splitting on " " and re-joining drops every space character.
sentence = "".join(sentence.split(" "))
print(sentence)

# A string iterates character by character, so list() yields one token per char.
chars = list(sentence)
print(chars)

# Deduplicate; a set is unordered, so the printed order is arbitrary.
chars = {*chars}
print(chars)

i would like to work than machine lerning engineer at google!
iwouldliketoworkthanmachinelerningengineeratgoogle!
['i', 'w', 'o', 'u', 'l', 'd', 'l', 'i', 'k', 'e', 't', 'o', 'w', 'o', 'r', 'k', 't', 'h', 'a', 'n', 'm', 'a', 'c', 'h', 'i', 'n', 'e', 'l', 'e', 'r', 'n', 'i', 'n', 'g', 'e', 'n', 'g', 'i', 'n', 'e', 'e', 'r', 'a', 't', 'g', 'o', 'o', 'g', 'l', 'e', '!']
{'w', '!', 'n', 'd', 'u', 'l', 't', 'r', 'k', 'e', 'a', 'm', 'o', 'i', 'g', 'c', 'h'}


## Word tokenization

In [2]:
pip install tensorflow

Collecting tensorflow
  Using cached tensorflow-2.4.1-cp38-cp38-win_amd64.whl (370.7 MB)
Collecting astunparse~=1.6.3
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting tensorflow-estimator<2.5.0,>=2.4.0
  Using cached tensorflow_estimator-2.4.0-py2.py3-none-any.whl (462 kB)
Collecting typing-extensions~=3.7.4
  Using cached typing_extensions-3.7.4.3-py3-none-any.whl (22 kB)
Collecting google-pasta~=0.2
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting wrapt~=1.12.1
  Using cached wrapt-1.12.1-py3-none-any.whl
Collecting flatbuffers~=1.12.0
  Using cached flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting grpcio~=1.32.0
  Using cached grpcio-1.32.0-cp38-cp38-win_amd64.whl (2.6 MB)
Collecting opt-einsum~=3.3.0
  Using cached opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting gast==0.3.3
Note: you may need to restart the kernel to use updated packages.
  Using cached gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
Collecting h5py~=2.10.0
  Using 

In [3]:
# Import under an alias and keep a dedicated variable name: later cells bind the
# names `Tokenizer` / `tokenizer` to the Hugging Face `tokenizers` classes, and
# sharing the names would make the notebook depend on cell execution order.
from tensorflow.keras.preprocessing.text import Tokenizer as KerasTokenizer

sentences = [
    'i love my dog',
    'I, love my cat',
    'You love my dog!'
]

# num_words=100 is far above the six distinct words in `sentences`.
keras_tokenizer = KerasTokenizer(num_words=100)
keras_tokenizer.fit_on_texts(sentences)

# word_index maps each word to an integer id. As the recorded output shows,
# words are lowercased, punctuation is dropped, and more frequent words get
# smaller ids.
print(keras_tokenizer.word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


In [4]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [5]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs NLTK's Punkt tokenizer data, which `pip install nltk` does
# not ship; without it the call raises LookupError. Fetch the data on demand so
# the cell also works on a fresh environment (Restart & Run All).
try:
    word_tokens = word_tokenize(sentences[0])
except LookupError:
    # Older NLTK releases look for "punkt", newer ones for "punkt_tab".
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)
    word_tokens = word_tokenize(sentences[0])

print(word_tokens)

['i', 'love', 'my', 'dog']


In [6]:
# Hugging Face Tokenizers documentation: https://huggingface.co/docs/tokenizers/python/latest/

In [7]:
pip install transformers

Collecting transformers
  Downloading transformers-4.3.3-py3-none-any.whl (1.9 MB)
Collecting packaging
  Downloading packaging-20.9-py2.py3-none-any.whl (40 kB)
Collecting filelock
  Using cached filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting tokenizers<0.11,>=0.10.1
  Using cached tokenizers-0.10.1-cp38-cp38-win_amd64.whl (2.0 MB)
Collecting sacremoses
  Using cached sacremoses-0.0.43-py3-none-any.whl
Installing collected packages: tokenizers, sacremoses, packaging, filelock, transformers
Successfully installed filelock-3.0.12 packaging-20.9 sacremoses-0.0.43 tokenizers-0.10.1 transformers-4.3.3
Note: you may need to restart the kernel to use updated packages.


In [8]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Start from an untrained BPE model; "[UNK]" is the token it is configured to
# use for unknown input.
bpe_model = BPE(unk_token="[UNK]")

# From here on `Tokenizer` / `tokenizer` refer to the Hugging Face objects.
tokenizer = Tokenizer(bpe_model)
print(tokenizer)

<tokenizers.Tokenizer object at 0x0000012F8ED669C0>


In [9]:
from tokenizers.trainers import BpeTrainer

# Special tokens handed to the trainer. Later cells rely on the ids they end up
# with ("[UNK]" -> 0 and "[SEP]" -> 2 in the recorded outputs, pad_id=3 for
# "[PAD]"), so keep this order stable.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

trainer = BpeTrainer(special_tokens=special_tokens)
print(trainer)

<tokenizers.trainers.BpeTrainer object at 0x0000012FE6905AB0>


In [10]:
from tokenizers.pre_tokenizers import Whitespace

# Attach a Whitespace pre-tokenizer so the text is split into word-level pieces
# before the BPE model sees it. In the encodings recorded further down,
# punctuation such as "," and "!" comes out as separate tokens.
tokenizer.pre_tokenizer = Whitespace()

In [13]:
# Training corpus: the raw WikiText-103 files. Download and unpack the archive
# so that the three split files listed below exist under data/ before running
# this cell.
# https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
files = [f"data/wiki.{split}.raw" for split in ["test", "train", "valid"]]

# Pass both arguments by keyword: the positional order of `files` and `trainer`
# has not been the same across tokenizers releases (older ones took the trainer
# first), while the keyword form is unambiguous.
tokenizer.train(files=files, trainer=trainer)

In [14]:
# Write the trained tokenizer to a JSON file; the next cell reloads it from
# this path, so training does not have to be repeated to get it back.
tokenizer.save("data/tokenizer-wiki.json")

In [15]:
# Rebuild the tokenizer from the file saved above. `Tokenizer` must be the
# class imported from `tokenizers` for `from_file` to be available.
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")

In [16]:
# Encode one sentence. The result is an Encoding object; its repr (below) lists
# the attributes it carries: ids, tokens, offsets, attention_mask, ...
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output)

Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [17]:
# The token strings and their matching vocabulary ids.
print(output.tokens)
print(output.ids)
# Index 9 is the "[UNK]" token in the recorded output. Its offsets (26, 27)
# point back at the emoji in the input string, which is how the original text
# behind an unknown token can be recovered.
print(output.offsets[9])

['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?']
[27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
(26, 27)


In [18]:
# Look up the vocabulary id of the "[SEP]" special token (2 in the recorded
# output below).
tokenizer.token_to_id("[SEP]")

2

In [19]:
from tokenizers.processors import TemplateProcessing

# Resolve the ids of the two special tokens the templates refer to.
template_special_tokens = [
    (token, tokenizer.token_to_id(token)) for token in ("[CLS]", "[SEP]")
]

# single: wrap one sequence as [CLS] A [SEP].
# pair:   [CLS] A [SEP] B [SEP], where ":1" gives B and its closing [SEP]
#         type id 1.
bert_style_template = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=template_special_tokens,
)
tokenizer.post_processor = bert_style_template

In [20]:
# `output` still holds the encoding produced before the post-processor was
# attached, so this first print has no [CLS]/[SEP] tokens.
print(output.tokens)
# Re-encode as a sentence pair; the post-processor now inserts the special
# tokens around and between the two sequences.
output = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
print(output.tokens)

['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?']
['[CLS]', 'Hello', ',', 'y', "'", 'all', '!', '[SEP]', 'How', 'are', 'you', '[UNK]', '?', '[SEP]']


In [21]:
# One type id per token: 0 for [CLS], the first sequence and its [SEP]; 1 for
# the second sequence and the final [SEP] (see the recorded output below).
print(output.type_ids)

[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]


## Encoding multiple sentences in a batch

In [22]:
# Encode several independent sentences in one call. `output` is now a sequence
# of encodings, one per input sentence (it is indexed as output[1] further down).
output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])

In [23]:
# Batch of sentence pairs: each inner list holds the two sentences of one pair.
sentence_pairs = [
    ["Hello, y'all!", "How are you 😁 ?"],
    ["Hello to you too!", "I'm fine, thank you!"],
]
output = tokenizer.encode_batch(sentence_pairs)

In [24]:
# Pad every encoding in a batch to a common length. The pad id is looked up
# from the vocabulary instead of being hard-coded, so it stays correct even if
# the trainer's special-token list is reordered.
tokenizer.enable_padding(pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]")

In [25]:
# Same batch as before, now with padding enabled: the second sentence is
# printed with a trailing "[PAD]" token (see the recorded output below).
output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
print(output[1].tokens)

['[CLS]', 'How', 'are', 'you', '[UNK]', '?', '[SEP]', '[PAD]']


In [26]:
# The attention mask is 1 for real tokens and 0 at the padded position, so the
# padding can be ignored downstream.
print(output[1].attention_mask)

[1, 1, 1, 1, 1, 1, 1, 0]


## Most commonly used tokenizer models (we will cover all of them in advanced NLP)

In [27]:
# The four model classes exposed by tokenizers.models, imported together.
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece

## Using a pretrained tokenizer

In [29]:
from tokenizers import BertWordPieceTokenizer

# Build a ready-made BERT WordPiece tokenizer from an existing vocabulary file
# instead of training one. The vocab file must already be present at this path.
# NOTE(review): nothing in this notebook downloads it — confirm where it comes from.
# This rebinds `tokenizer`, replacing the BPE tokenizer trained above.
tokenizer = BertWordPieceTokenizer("data/bert-base-uncased-vocab.txt", lowercase=True)

In [30]:
# Encode the same sentence pair with the pretrained vocabulary. With
# lowercase=True the tokens come out lowercased ('hello', 'how') in the
# recorded output below.
output = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
print(output.tokens)

['[CLS]', 'hello', ',', 'y', "'", 'all', '!', '[SEP]', 'how', 'are', 'you', '[UNK]', '?', '[SEP]']
