# Tokenization

## Character tokenization

In [None]:
# Character-level tokenization: lowercase the sentence, remove the spaces,
# split what is left into single characters and finally reduce them to the
# set of distinct characters. Every intermediate step is printed.
sentence = "I would like to work than machine lerning engineer at Google!".lower()
print(sentence)

# Remove the spaces so that only the visible characters are tokenized.
sentence = sentence.replace(" ", "")
print(sentence)

# A str is already an iterable of its characters, so list() is all that is
# needed to split it; no element-by-element comprehension required.
chars = list(sentence)
print(chars)

# Collapse repeated characters. A set is unordered, so the printed order
# is not guaranteed to follow the order of the sentence.
chars = set(chars)
print(chars)

## Word tokenization

In [None]:
# Install TensorFlow; the next cell imports the Keras Tokenizer from it.
!pip install tensorflow

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Small example corpus; the sentences differ in casing and punctuation.
sentences = ['i love my dog', 'I, love my cat', 'You love my dog!']

# Create a Keras tokenizer configured with num_words=100 and fit it on the corpus.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Display the word -> index mapping produced by fitting.
vocabulary = tokenizer.word_index
print(vocabulary)

In [None]:
# Install NLTK, which provides the word_tokenize function used in the following cells.
!pip install nltk

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Display the first example sentence (the `sentences` list comes from the
# Keras tokenizer cell earlier in the notebook).
sentences[0]

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data. Download it here so this cell
# does not fail with a LookupError when it runs before the later cell that
# performs the download.
# NOTE(review): recent NLTK releases appear to look for 'punkt_tab' instead;
# confirm against the installed NLTK version.
nltk.download('punkt')

# Sample text with a price, embedded line breaks and several sentences.
s = '''Good muffins cost $3.88\nin New York.  Please buy me two of them.\n\nThanks.'''

# Last expression of the cell: the list of word tokens is displayed.
word_tokenize(s)

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the Punkt data before tokenizing.
nltk.download('punkt')

# Split the first example sentence into word tokens and show them.
first_sentence = sentences[0]
word_tokens = word_tokenize(first_sentence)
print(word_tokens)

In [None]:
# Install the transformers package.
# NOTE(review): the cells below import from `tokenizers`, not `transformers`;
# presumably it is installed as a dependency of this package. Confirm.
!pip install transformers

### Special tokens
- [UNK] marks an unknown token
- [CLS] a token that represents the whole sentence
- [SEP] sentence separator token
- [PAD] padding token used to fill the input up to a fixed length
- [MASK] masking token, e.g. "Hello I'm a [MASK] model."

In [None]:
# NOTE: from here on `Tokenizer` and `tokenizer` refer to the `tokenizers`
# library objects and no longer to the Keras ones bound earlier in the notebook.
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Build a BPE model that uses "[UNK]" as its unknown token and wrap it in a Tokenizer.
bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)
print(tokenizer)

In [None]:
from tokenizers.trainers import BpeTrainer

# Special tokens handed to the trainer, in this exact order.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

trainer = BpeTrainer(special_tokens=special_tokens)
print(trainer)

In [None]:
from tokenizers.pre_tokenizers import Whitespace

# Use the Whitespace pre-tokenizer for this tokenizer.
tokenizer.pre_tokenizer = Whitespace()

In [None]:
# Training corpus (WikiText-103 raw), downloadable from:
# https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
# The three split files are expected under ../data/wikitext-103-raw/.
splits = ["test", "train", "valid"]
files = [f"../data/wikitext-103-raw/wiki.{split}.raw" for split in splits]

# Train the tokenizer on all three files using the configured trainer.
tokenizer.train(files, trainer)

In [None]:
# Persist the trained tokenizer to a JSON file under ../data.
tokenizer.save("../data/tokenizer-wiki.json")

In [None]:
# Reload the tokenizer from the JSON file written by the save cell.
tokenizer = Tokenizer.from_file("../data/tokenizer-wiki.json")

In [None]:
# Encode one sentence (which includes an emoji) and print the returned encoding object.
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output)

In [None]:
# Inspect the encoding: the token strings, their ids, and the offsets entry
# of the token at index 9.
# NOTE(review): index 9 is presumably the emoji token; confirm against the printed tokens.
print(output.tokens)
print(output.ids)
print(output.offsets[9])

In [None]:
# Look up the id assigned to the "[SEP]" special token (displayed as the cell result).
tokenizer.token_to_id("[SEP]")

In [None]:
from tokenizers.processors import TemplateProcessing

# Resolve the ids of the special tokens referenced by the templates.
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")

# Post-processor that wraps a single sentence as "[CLS] ... [SEP]" and a
# sentence pair as "[CLS] A [SEP] B [SEP]"; the ":1" suffixes in the pair
# template tag the second segment and its closing [SEP].
template = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)
tokenizer.post_processor = template

In [None]:
# `output` still holds the encoding created before the post-processor was
# attached, so this first print shows the tokens without [CLS]/[SEP].
print(output.tokens)

# Re-encode the same single sentence so that the "single" template is applied.
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)

# Encode a sentence pair; the "pair" template is applied here.
output = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
print(output.tokens)

In [None]:
# Print the type id of every token of the encoded sentence pair.
# NOTE(review): given the ":1" markers in the pair template, the second
# sentence and its [SEP] presumably get type id 1; confirm in the output.
print(output.type_ids)

## Encoding multiple sentences in a batch

In [None]:
# Encode two independent sentences with a single call; `output` now holds the batch result.
output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])

In [None]:
# Each inner list is one (first sentence, second sentence) pair.
sentence_pairs = [
    ["Hello, y'all!", "How are you 😁 ?"],
    ["Hello to you too!", "I'm fine, thank you!"],
]

# Encode all pairs in one batch call.
output = tokenizer.encode_batch(sentence_pairs)

In [None]:
# Enable padding with the "[PAD]" token. Its id is looked up in the vocabulary
# rather than hard-coded, the same way the [CLS]/[SEP] ids are resolved for the
# post-processor. With the special-token order given to the trainer this is 3.
tokenizer.enable_padding(pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]")

In [None]:
# Encode the two sentences again, now that padding is enabled, and print the
# tokens of each encoding.
output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
for index in (0, 1):
    print(output[index].tokens)

In [None]:
# Print the attention mask of the first and of the second encoding in the batch.
for index in (0, 1):
    print(output[index].attention_mask)

## Most commonly used tokenizer models (we will cover all of them in advanced NLP)

In [None]:
from tokenizers.models import BPE
from tokenizers.models import Unigram
from tokenizers.models import WordLevel
from tokenizers.models import WordPiece

## Using a pretrained tokenizer

In [None]:
from tokenizers import BertWordPieceTokenizer

# Vocabulary file of the pretrained model, expected under ../data.
vocab_path = "../data/bert-base-uncased-vocab.txt"

# Build a BERT WordPiece tokenizer from that vocabulary, with lowercasing enabled.
tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=True)

In [None]:
# Encode a sentence pair with the BERT WordPiece tokenizer and print its tokens.
output = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
print(output.tokens)