In [1]:
# Word-level tokenization: every distinct word is assigned its own ID.
# Any word missing from the vocabulary is "out of vocabulary", so the
# information it carried is lost.
sentence = "Karpathy is a legit researcher!"
tokenized_sentence = sentence.split()  # whitespace split; punctuation stays glued to the word
print(tokenized_sentence)

['Karpathy', 'is', 'a', 'legit', 'researcher!']


In [13]:
# Two ways to cut down on unknown tokens:
#   * character-based tokenization, and
#   * subword-based tokenization, where frequently used words are kept whole
#     while rare words are decomposed into meaningful subwords.
from transformers import BertTokenizer

cased_checkpoint = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(cased_checkpoint)

# Calling the tokenizer on raw text yields `input_ids`, `token_type_ids`
# and `attention_mask`.
sample_text = "trying bert felt cute. might delete later"
tokenizer(sample_text)

{'input_ids': [101, 1774, 1129, 3740, 1464, 10509, 119, 1547, 3687, 16618, 1224, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [25]:
# Architecture-agnostic alternative: load through AutoTokenizer and the
# checkpoint name instead of importing a model-specific tokenizer class.
from transformers import AutoTokenizer

uncased_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(uncased_checkpoint)

sample_text = "i'm trying the tokenizer right now. can't talk baby!"
tokenizer(sample_text)

{'input_ids': [101, 1045, 1005, 1049, 2667, 1996, 19204, 17629, 2157, 2085, 1012, 2064, 1005, 1056, 2831, 3336, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Write the files of whichever tokenizer `tokenizer` currently refers to into
# a local directory. The path is relative, so it resolves against the
# notebook's current working directory.
save_directory = "desktop/weights-in-bio"
tokenizer.save_pretrained(save_directory)

In [26]:
# What happens under the hood?
# Step 1: break the raw text into tokens.
uncased_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(uncased_checkpoint)

sample_text = "i'm trying the tokenizer right now. can't talk baby!"
tokens = tokenizer.tokenize(sample_text)

# In the printed list, "tokenizer" appears as two pieces: 'token' and '##izer'.
print(tokens)

['i', "'", 'm', 'trying', 'the', 'token', '##izer', 'right', 'now', '.', 'can', "'", 't', 'talk', 'baby', '!']


In [27]:
# 2nd step: map each token string from the previous cell to its integer ID.
# Note: the printed list has no 101 / 102 at its ends, unlike the `input_ids`
# produced when the tokenizer is called directly on the same sentence.
inputs_ids = tokenizer.convert_tokens_to_ids(tokens)
print(inputs_ids)

[1045, 1005, 1049, 2667, 1996, 19204, 17629, 2157, 2085, 1012, 2064, 1005, 1056, 2831, 3336, 999]


In [36]:
# Round trip: encode a sentence, then decode its IDs back into text.
uncased_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(uncased_checkpoint)

sample_text = "i'm trying the tokenizer right now. can't talk baby!"
inputs = tokenizer(sample_text)

# The decoded string is wrapped in [CLS] ... [SEP], matching the extra IDs
# at both ends of `input_ids`.
decoded_text = tokenizer.decode(inputs["input_ids"])
print(decoded_text)


[CLS] i'm trying the tokenizer right now. can't talk baby! [SEP]


In [37]:
# Handling multiple sequences