<a href="https://colab.research.google.com/github/sukcsie/NLP-with-Python/blob/main/Introduction_to_BERTTokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is a quick tutorial to get started with BERT tokenizer and get your text data ready for your Deep learning models. I have used `transformers` library which is the State-of-the-art Natural Language Processing for *Pytorch* and *TensorFlow 2.0*.

In [33]:
# if the package is not installed, then install it
try:
    import transformers
except:
    !pip install transformers
    import transformers


In [34]:
# BERT tokenizer
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.tokenize("I am trying to get sentences ready for BERT!")

['i', 'am', 'trying', 'to', 'get', 'sentences', 'ready', 'for', 'bert', '!']

In [44]:
max_length_test = 50
test_sentence = 'I am trying out different options to learn BERT tokenizers. This is an example. Feel free to use this code as you wish!'
tokenized = tokenizer.tokenize(test_sentence)
print('tokenized', tokenized)


tokenized ['i', 'am', 'trying', 'out', 'different', 'options', 'to', 'learn', 'bert', 'token', '##izer', '##s', '.', 'this', 'is', 'an', 'example', '.', 'feel', 'free', 'to', 'use', 'this', 'code', 'as', 'you', 'wish', '!']


In [45]:
# adding special tokens
# [CLS] is used at the beginning of a sentence
# [SEP] is used at the end of a sentence
test_sentence_with_special_tokens = '[CLS]' + test_sentence + '[SEP]'
tokenized = tokenizer.tokenize(test_sentence_with_special_tokens)
print('tokenized', tokenized)

tokenized ['[CLS]', 'i', 'am', 'trying', 'out', 'different', 'options', 'to', 'learn', 'bert', 'token', '##izer', '##s', '.', 'this', 'is', 'an', 'example', '.', 'feel', 'free', 'to', 'use', 'this', 'code', 'as', 'you', 'wish', '!', '[SEP]']


In [46]:
# convert tokens to ids in WordPiece
input_ids = tokenizer.convert_tokens_to_ids(tokenized)
print('input IDs', input_ids)

input IDs [101, 1045, 2572, 2667, 2041, 2367, 7047, 2000, 4553, 14324, 19204, 17629, 2015, 1012, 2023, 2003, 2019, 2742, 1012, 2514, 2489, 2000, 2224, 2023, 3642, 2004, 2017, 4299, 999, 102]


In [47]:
# precalculation of pad length, so that we can reuse it later on
padding_length = max_length_test - len(input_ids)
print('padding length', padding_length)

padding length 20


In [48]:
# map tokens to WordPiece dictionary and add pad token for those text shorter than our max length
input_ids = input_ids + ([0] * padding_length)
print('padded input ids', input_ids)

padded input ids [101, 1045, 2572, 2667, 2041, 2367, 7047, 2000, 4553, 14324, 19204, 17629, 2015, 1012, 2023, 2003, 2019, 2742, 1012, 2514, 2489, 2000, 2224, 2023, 3642, 2004, 2017, 4299, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [49]:
# attention should focus just on sequence with non padded tokens
attention_mask = [1] * len(input_ids)
print('attention mask', attention_mask)

attention mask [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [50]:
# do not focus attention on padded tokens
attention_mask = attention_mask + ([0] * padding_length)
print('attention mask not focusing on padded tokens', attention_mask)

attention mask not focusing on padded tokens [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [51]:
# token types, needed for example for question answering, for our purpose we will just set 0 as we have just one sequence
token_type_ids = [0] * max_length_test
bert_input = {
    "token_ids": input_ids,
    "token_type_ids": token_type_ids,
    "attention_mask": attention_mask
} 
print(bert_input)


{'token_ids': [101, 1045, 2572, 2667, 2041, 2367, 7047, 2000, 4553, 14324, 19204, 17629, 2015, 1012, 2023, 2003, 2019, 2742, 1012, 2514, 2489, 2000, 2224, 2023, 3642, 2004, 2017, 4299, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [54]:
# Now in the practical coding we will use just encode_plus function, which does all of those steps for us

bert_input = tokenizer.encode_plus(
                        test_sentence,                      
                        add_special_tokens = True, # add [CLS], [SEP]
                        max_length = max_length_test, # max length of the text that can go to BERT
                        pad_to_max_length = True, # add [PAD] tokens
                        return_attention_mask = True, # add attention mask to not focus on pad tokens
              )
print('Encoded', bert_input)


Encoded {'input_ids': [101, 1045, 2572, 2667, 2041, 2367, 7047, 2000, 4553, 14324, 19204, 17629, 2015, 1012, 2023, 2003, 2019, 2742, 1012, 2514, 2489, 2000, 2224, 2023, 3642, 2004, 2017, 4299, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


