<a href="https://colab.research.google.com/github/preetamjumech/LLM/blob/main/Normalization_Pre_tokenization_02_11_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents, Lowercase  # NFD = Unicode Normalization Form D (canonical decomposition)


In [4]:
# Baseline pipeline: lowercasing only, so the accented "é" is left as it is.
lowercase_only = [Lowercase()]
normalizer = normalizers.Sequence(lowercase_only)

cafe_sentence = "Café culture is prominent in many cities around the world."
normalizer.normalize_str(cafe_sentence)

'café culture is prominent in many cities around the world.'

In [5]:
# Full pipeline, applied in order: decompose (NFD), strip the accent marks, lowercase.
normalization_steps = [
    NFD(),
    StripAccents(),
    Lowercase(),
]
normalizer = normalizers.Sequence(normalization_steps)

cafe_sentence = "Café culture is prominent in many cities around the world."
normalizer.normalize_str(cafe_sentence)

'cafe culture is prominent in many cities around the world.'

In [6]:
# Run the current `normalizer` on a sentence with two different accents ("é" and "à").
deja_vu_sentence = "The protagonist had déjà vu when he entered the old mansion."
normalizer.normalize_str(deja_vu_sentence)

'the protagonist had deja vu when he entered the old mansion.'

In [7]:
# Denser case: acute, grave, circumflex and diaeresis marks in one short string.
accented_greeting = "Héllò hôw are ü?"
normalizer.normalize_str(accented_greeting)

'hello how are u?'

Pre-tokenization

In [8]:
from tokenizers.pre_tokenizers import Whitespace
# Whitespace pre-tokenizer: words and punctuation come out as separate pieces, each
# paired with its (start, end) character offsets; "can't" is split around the apostrophe.
pre_tokenizer = Whitespace()

contraction_sentence = "She can't attend the meeting due to prior commitments."
pre_tokenizer.pre_tokenize_str(contraction_sentence)

[('She', (0, 3)),
 ('can', (4, 7)),
 ("'", (7, 8)),
 ('t', (8, 9)),
 ('attend', (10, 16)),
 ('the', (17, 20)),
 ('meeting', (21, 28)),
 ('due', (29, 32)),
 ('to', (33, 35)),
 ('prior', (36, 41)),
 ('commitments', (42, 53)),
 ('.', (53, 54))]

In [9]:
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Digits

# Chain two pre-tokenizers: Whitespace first, then Digits.
# With individual_digits=False the multi-digit number stays together as one piece.
pre_tokenization_steps = [
    Whitespace(),
    Digits(individual_digits=False),
]
pre_tokenizer = pre_tokenizers.Sequence(pre_tokenization_steps)

phone_sentence = "I am calling you on 93457654"
pre_tokenizer.pre_tokenize_str(phone_sentence)

[('I', (0, 1)),
 ('am', (2, 4)),
 ('calling', (5, 12)),
 ('you', (13, 16)),
 ('on', (17, 19)),
 ('93457654', (20, 28))]

In [10]:
from transformers import AutoTokenizer

# BERT uses a WordPiece tokenizer; a "##" prefix marks a piece that continues the
# previous piece within the same word.
bert_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)

product_sentence = "I have a new SAMSUNG GLITE"
tokenizer.tokenize(product_sentence)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



['i', 'have', 'a', 'new', 'samsung', 'g', '##lite']

In [11]:
# Hugging Face tokenizers mainly use one of three subword algorithms:
# Byte-Pair Encoding (BPE), WordPiece, and Unigram.
# Here the current tokenizer is tried on punctuation, repeated spaces and an emoji.
noisy_sentence = "Hello, y'all! How   are you 😁 ?"
tokenizer.tokenize(noisy_sentence)

['hello', ',', 'y', "'", 'all', '!', 'how', 'are', 'you', '[UNK]', '?']

In [12]:
# Display the `tokenizers.Tokenizer` object behind the transformers tokenizer; the
# following cells read its `pre_tokenizer` and `normalizer` attributes.
tokenizer.backend_tokenizer

<tokenizers.Tokenizer at 0x7d8490995e30>

In [13]:
# Run only the pre-tokenization stage of the current tokenizer: the text is split
# into pieces with offsets, and the original casing is still intact at this stage.
backend_pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
product_sentence = "I have a new SAMSUNG GLITE"
backend_pre_tokenizer.pre_tokenize_str(product_sentence)

[('I', (0, 1)),
 ('have', (2, 6)),
 ('a', (7, 8)),
 ('new', (9, 12)),
 ('SAMSUNG', (13, 20)),
 ('GLITE', (21, 26))]

In [14]:
# Pre-tokenization stage only, on noisier text: punctuation marks and the emoji come
# out as separate pieces, and the run of spaces yields no piece of its own.
backend_pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
noisy_sentence = "Hello, y'all! How   are you 😁 ?"
backend_pre_tokenizer.pre_tokenize_str(noisy_sentence)

[('Hello', (0, 5)),
 (',', (5, 6)),
 ('y', (7, 8)),
 ("'", (8, 9)),
 ('all', (9, 12)),
 ('!', (12, 13)),
 ('How', (14, 17)),
 ('are', (20, 23)),
 ('you', (24, 27)),
 ('😁', (28, 29)),
 ('?', (30, 31))]

In [15]:
# Run only the normalizer bundled with the current tokenizer: the result is
# lowercased and has its accents removed.
backend_normalizer = tokenizer.backend_tokenizer.normalizer
deja_vu_sentence = "The protagonist had déjà vu when he entered the old mansion."
backend_normalizer.normalize_str(deja_vu_sentence)

'the protagonist had deja vu when he entered the old mansion.'

In [16]:
# Bundled normalizer again, on a string where nearly every word carries an accent.
backend_normalizer = tokenizer.backend_tokenizer.normalizer
accented_greeting = "Héllò hôw are ü?"
backend_normalizer.normalize_str(accented_greeting)

'hello how are u?'

In [17]:
# GPT-2 uses Byte-Pair Encoding (BPE), as do the GPT series, RoBERTa, BART and DeBERTa.
# NOTE: this rebinds `tokenizer`, so every cell below now inspects the GPT-2 tokenizer.
gpt2_checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(gpt2_checkpoint)

product_sentence = "I have a new SAMSUNG GLITE"
tokenizer.tokenize(product_sentence)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

['I', 'Ġhave', 'Ġa', 'Ġnew', 'ĠSAM', 'S', 'UN', 'G', 'ĠGL', 'ITE']

In [18]:
# Noisy sentence through the current tokenizer: no [UNK] appears in the output; the
# emoji is returned as two pieces and the extra spaces are kept as "Ġ" tokens.
noisy_sentence = "Hello, y'all! How   are you 😁 ?"
tokenizer.tokenize(noisy_sentence)

['Hello',
 ',',
 'Ġy',
 "'",
 'all',
 '!',
 'ĠHow',
 'Ġ',
 'Ġ',
 'Ġare',
 'Ġyou',
 'ĠðŁĺ',
 'ģ',
 'Ġ?']

In [19]:
# Pre-tokenization stage only: each space is attached to the word that follows it
# and shown as "Ġ", so the offsets of those pieces start at the space.
backend_pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
product_sentence = "I have a new SAMSUNG GLITE"
backend_pre_tokenizer.pre_tokenize_str(product_sentence)

[('I', (0, 1)),
 ('Ġhave', (1, 6)),
 ('Ġa', (6, 8)),
 ('Ġnew', (8, 12)),
 ('ĠSAMSUNG', (12, 20)),
 ('ĠGLITE', (20, 26))]

In [20]:
# Pre-tokenization stage only, on the noisy sentence: the two surplus spaces come out
# as a piece of their own ("ĠĠ") and the emoji is shown in its byte-level spelling.
backend_pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
noisy_sentence = "Hello, y'all! How   are you 😁 ?"
backend_pre_tokenizer.pre_tokenize_str(noisy_sentence)

[('Hello', (0, 5)),
 (',', (5, 6)),
 ('Ġy', (6, 8)),
 ("'", (8, 9)),
 ('all', (9, 12)),
 ('!', (12, 13)),
 ('ĠHow', (13, 17)),
 ('ĠĠ', (17, 19)),
 ('Ġare', (19, 23)),
 ('Ġyou', (23, 27)),
 ('ĠðŁĺģ', (27, 29)),
 ('Ġ?', (29, 31))]

In [22]:
# T5 uses a Unigram tokenizer built with SentencePiece; "▁" marks where a word starts.
# NOTE: this rebinds `tokenizer`, so every cell below now inspects the T5 tokenizer.
t5_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(t5_checkpoint)

product_sentence = "I have a new SAMSUNG GLITE"
tokenizer.tokenize(product_sentence)

['▁I', '▁have', '▁', 'a', '▁new', '▁S', 'AMS', 'UNG', '▁', 'GL', 'ITE']

In [23]:
# Noisy sentence through the current tokenizer: the repeated spaces leave no pieces
# of their own in the output, and the emoji is returned as a piece.
noisy_sentence = "Hello, y'all! How   are you 😁 ?"
tokenizer.tokenize(noisy_sentence)

['▁Hello',
 ',',
 '▁',
 'y',
 "'",
 'all',
 '!',
 '▁How',
 '▁are',
 '▁you',
 '▁',
 '😁',
 '▁',
 '?']

In [24]:
# Pre-tokenization stage only: every word is prefixed with "▁", while the reported
# offsets cover just the word itself.
backend_pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
product_sentence = "I have a new SAMSUNG GLITE"
backend_pre_tokenizer.pre_tokenize_str(product_sentence)

[('▁I', (0, 1)),
 ('▁have', (2, 6)),
 ('▁a', (7, 8)),
 ('▁new', (9, 12)),
 ('▁SAMSUNG', (13, 20)),
 ('▁GLITE', (21, 26))]

In [25]:
# Pre-tokenization stage only, on the noisy sentence: the split happens at whitespace,
# so punctuation stays attached to its word ("▁Hello,", "▁y'all!").
backend_pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
noisy_sentence = "Hello, y'all! How   are you 😁 ?"
backend_pre_tokenizer.pre_tokenize_str(noisy_sentence)

[('▁Hello,', (0, 6)),
 ("▁y'all!", (7, 13)),
 ('▁How', (14, 17)),
 ('▁are', (20, 23)),
 ('▁you', (24, 27)),
 ('▁😁', (28, 29)),
 ('▁?', (30, 31))]