[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1FOuOnpnnFRxTChVkei2oE1xHTV5fK7ma?usp=sharing)

# Tokenizers

In [1]:
!pip install transformers[sentencepiece] 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece!=0.1.92,>=0.1.91
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint whose tokenizer files are fetched.
checkpoint = "bert-base-uncased"

# Build the tokenizer that matches the checkpoint; later cells reuse `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
# Read the vocabulary mapping (token -> id) once and reuse it below.
# NOTE(review): on "fast" tokenizers the `vocab` property appears to build a
# fresh dict on every access, so reading it once avoids redundant work.
vocab = tokenizer.vocab

# Show only a handful of entries: printing the full mapping would flood the
# output with tens of thousands of items and bury the rest of the notebook.
print(dict(list(vocab.items())[:10]))
print(f'The vocabulary size is {len(vocab)}')

The vocabulary size is 30522


In [5]:
sentence = "I like NLP"

# Sub-word pieces of the sentence as strings.
tokens = tokenizer.tokenize(sentence)
# Integer ids for the same sentence; the special [CLS]/[SEP] ids are added here.
ids = tokenizer.encode(sentence)

# One item per line: raw text, tokens, ids, and the text recovered from the ids.
print(sentence, tokens, ids, tokenizer.decode(ids), sep="\n")


I like NLP
['i', 'like', 'nl', '##p']
[101, 1045, 2066, 17953, 2361, 102]
[CLS] i like nlp [SEP]


In [6]:
# Display each special token next to the id it maps to.
print(
    f"{tokenizer.cls_token} -> {tokenizer.cls_token_id}",
    f"{tokenizer.sep_token} -> {tokenizer.sep_token_id}",
    sep="\n",
)

[CLS] -> 101
[SEP] -> 102


In [7]:
# Membership test: is the emoji itself an entry of the tokenizer's vocabulary?
"😀" in tokenizer.vocab

False

In [8]:
# Same sentence with an emoji glued to the last word; the emoji is not in the
# vocabulary, so watch how the tokenizer handles the piece that contains it.
sentence = "I like NLP😀"
tokenizer.tokenize(sentence)

['i', 'like', '[UNK]']

In [9]:
first_sentence = "I like NLP."
second_sentence = "What about you?"

# Two positional texts are encoded together as a single sentence pair, and
# return_tensors="pt" yields PyTorch tensors.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the cells that follow index into it.
input = tokenizer(
    first_sentence,
    second_sentence,
    return_tensors="pt",
)
input

{'input_ids': tensor([[  101,  1045,  2066, 17953,  2361,  1012,   102,  2054,  2055,  2017,
          1029,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [10]:
# Token ids of the encoded pair, special tokens included.
input["input_ids"]

tensor([[  101,  1045,  2066, 17953,  2361,  1012,   102,  2054,  2055,  2017,
          1029,   102]])

In [11]:
# Segment ids: they tell the model which of the two sentences each position belongs to.
input["token_type_ids"]

tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]])

In [12]:
# Attention mask of the encoded pair; no padding was requested for this input.
input["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [17]:
first_sentence = "I like NLP."
second_sentence = "What are your thoughts on the subject?"

# A list of texts is encoded as a batch of independent sequences; padding=True
# pads the shorter one so both rows have the same length.
batch = [first_sentence, second_sentence]
input = tokenizer(batch, padding=True, return_tensors="pt")

# In the mask, padded positions are the ones marked with 0.
input["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])