# WordPiece and BPE Tokenizers with Normalising and Training

In [None]:
from tokenizers import Tokenizer
#importing the tokenisation models (BPE and WordPiece)
from tokenizers.models import BPE, WordPiece
#importing the trainers for the tokenisers
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

# Train a BPE tokenizer on the local corpus and tokenise one sample sentence.
#
# unk_token is set to "[OOV]" so that characters missing from the learned
# vocabulary show up as "[OOV]" in the output; without an unk_token they are
# left out of the token list with no indication that anything was lost.
tokenizer = Tokenizer(BPE(unk_token="[OOV]"))
# Pre-tokenise the string by splitting at whitespace (split it into words).
tokenizer.pre_tokenizer = Whitespace()
# These tokens are never split or merged by training and are always included
# in the final vocabulary. "[OOV]" must be listed here because the model above
# uses it as its unknown token.
trainer = BpeTrainer(special_tokens=["[PAD]", "[OOV]"])
tokenizer.train(["random_text_corpus.txt"], trainer)

# Encode the sample string and show the resulting sub-word tokens.
output = tokenizer.encode("Uncharacteristically, she pre-determined the outcome before initiating the process")
print(output.tokens)




['Un', 'char', 'ac', 'ter', 'ist', 'ically', ',', 'she', 'pre', '-', 'determined', 'the', 'out', 'come', 'before', 'in', 'it', 'i', 'ating', 'the', 'pro', 'c', 'ess']


## Increasingly complicated storytelling:
['nch', 'ar', 'ac', 'ter', 'ist', 'ically', ',', 'she', 'pre', '-', 'de', 'ter', 'min', 'ed', 'the', 'out', 'come', 'before', 'in', 'it', 'i', 'ating', 'the', 'pro', 'cess']

['U', 'nch', 'ar', 'ac', 'ter', 'ist', 'ically', ',', 'she', 'pre', '-', 'det', 'er', 'min', 'ed', 'the', 'out', 'come', 'before', 'in', 'it', 'i', 'ating', 'the', 'p', 'ro', 'c', 'ess']

['U', 'nch', 'ar', 'ac', 'ter', 'ist', 'ically', ',', 'she', 'pre', '-', 'det', 'er', 'min', 'ed', 'the', 'out', 'come', 'before', 'in', 'it', 'i', 'ating', 'the', 'pro', 'c', 'ess']

## Adding some scientific text:
['Un', 'ch', 'ar', 'ac', 'ter', 'ist', 'ically', ',', 'she', 'pre', '-', 'determined', 'the', 'out', 'come', 'before', 'in', 'it', 'i', 'ating', 'the', 'pro', 'c', 'ess']

['Un', 'ch', 'ar', 'ac', 'ter', 'ist', 'ically', ',', 'she', 'pre', '-', 'determined', 'the', 'out', 'come', 'before', 'in', 'it', 'i', 'ating', 'the', 'pro', 'c', 'ess']

## Adding some random conversational tones and dialogues:
['Un', 'char', 'ac', 'ter', 'ist', 'ically', ',', 'she', 'pre', '-', 'determined', 'the', 'out', 'come', 'before', 'in', 'it', 'i', 'ating', 'the', 'pro', 'c', 'ess']

In [None]:
from transformers import AutoTokenizer

# Load the pretrained GPT-2 tokenizer (GPT-2 uses BPE) for comparison.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokens = tokenizer.tokenize(
    "Uncharacteristically, she pre-determined the outcome before initiating the process"
)
print(tokens)

# A "Ġ" prefix on a token indicates leading whitespace. It can be removed if needed.

['Un', 'character', 'istically', ',', 'Ġshe', 'Ġpre', '-', 'd', 'etermined', 'Ġthe', 'Ġoutcome', 'Ġbefore', 'Ġinitiating', 'Ġthe', 'Ġprocess']


In [None]:
from tokenizers.normalizers import Lowercase, NFD, StripAccents, Sequence

# Train a WordPiece tokenizer on the local corpus and tokenise one sample sentence.

# Special tokens the trainer must keep in the vocabulary.
trainer = WordPieceTrainer(
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
)

# Anything the trained model cannot tokenise is reported as [UNK].
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
# Pre-tokenise by splitting the text into words.
tokenizer.pre_tokenizer = Whitespace()
# Normalising helps reduce the vocabulary: NFD separates accents from their
# base characters, Lowercase folds case, and StripAccents removes the accents.
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

tokenizer.train(["random_text_corpus.txt"], trainer)

output = tokenizer.encode(
    "Uncharacteristically, she pre-determined the outcome before initiating the process"
)
print(output.tokens)





['unc', '##h', '##ar', '##act', '##er', '##istic', '##ally', ',', 'she', 'pre', '-', 'determined', 'the', 'outcom', '##e', 'before', 'in', '##it', '##ia', '##ting', 'the', 'pr', '##oc', '##ess']
