In [5]:
from transformers import BertTokenizer

# Load the tokenizer that ships with the pretrained 'bert-base-uncased'
# checkpoint; every cell below reuses this one `tokenizer` object.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## Inspect tokenizer properties

In [6]:
# List every attribute and method name available on the tokenizer object.
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_added_tokens_decoder',
 '_added_tokens_encoder',
 '_auto_class',
 '_batch_encode_plus',
 '_batch_prepare_for_model',
 '_call_one',
 '_convert_id_to_token',
 '_convert_token_to_id',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager',
 '_pad',
 '_pad_token_

In [7]:
# Peek at a 100-entry window of the vocabulary.
# Iterating a dict yields its keys, so list(...) collects every token string.
all_tokens = list(tokenizer.get_vocab())
all_tokens[20000:20100]

['chunk',
 'rigorous',
 'blaine',
 '198',
 'peabody',
 'slayer',
 'dismay',
 'brewers',
 'nz',
 '##jer',
 'det',
 '##glia',
 'glover',
 'postwar',
 'int',
 'penetration',
 'sylvester',
 'imitation',
 'vertically',
 'airlift',
 'heiress',
 'knoxville',
 'viva',
 '##uin',
 '390',
 'macon',
 '##rim',
 '##fighter',
 '##gonal',
 'janice',
 '##orescence',
 '##wari',
 'marius',
 'belongings',
 'leicestershire',
 '196',
 'blanco',
 'inverted',
 'preseason',
 'sanity',
 'sobbing',
 '##due',
 '##elt',
 '##dled',
 'collingwood',
 'regeneration',
 'flickering',
 'shortest',
 '##mount',
 '##osi',
 'feminism',
 '##lat',
 'sherlock',
 'cabinets',
 'fumbled',
 'northbound',
 'precedent',
 'snaps',
 '##mme',
 'researching',
 '##akes',
 'guillaume',
 'insights',
 'manipulated',
 'vapor',
 'neighbour',
 'sap',
 'gangster',
 'frey',
 'f1',
 'stalking',
 'scarcely',
 'callie',
 'barnett',
 'tendencies',
 'audi',
 'doomed',
 'assessing',
 'slung',
 'panchayat',
 'ambiguous',
 'bartlett',
 '##etto',
 'distri

In [9]:
# Print the vocabulary size, then look up the ID of a single in-vocab word
# (shown as the cell's result).
print(tokenizer.vocab_size)
tokenizer.get_vocab()['science']

30522


2671

# Tokenizing a word

In [10]:
word = 'science'

# Two equivalent ways to turn a single token into its ID:
# the conversion helper and a direct vocabulary lookup. Both give the same ID.
res1 = tokenizer.convert_tokens_to_ids(word)
res2 = tokenizer.get_vocab()[word]

print(res1, res2, sep='\n')

2671
2671


# Encoding a text

In [12]:
text = 'science is great'

# convert_tokens_to_ids() treats the whole string as ONE token. That string is
# not a vocabulary entry, so the unknown-token ID comes back instead of one ID
# per word.
res1 = tokenizer.convert_tokens_to_ids(text)
print(res1)

# A direct vocabulary lookup has no such fallback and raises KeyError for the
# same string. The failure is the point of this cell, so catch and display it
# instead of leaving a cell that aborts "Restart & Run All".
try:
    res2 = tokenizer.get_vocab()[text]
except KeyError as err:
    print(f'KeyError: {err}')
else:
    print(res2)

100


KeyError: 'science is great'

In [19]:
# get_vocab() only maps single tokens (words / subwords) to IDs, never a whole
# text; encode() is the better entry point for a full string.
res3 = tokenizer.encode(text)

# Decode every ID on its own to see which token it stands for.
# [CLS] = classification, [SEP] = sentence separation.
for token_id in res3:
    print(f'Token {token_id} is "{tokenizer.decode(token_id)}"')

print('')

# Decode the full sequence twice: first without, then with the special tokens.
for skip_special in (True, False):
    print(tokenizer.decode(res3, skip_special_tokens=skip_special))

Token 101 is "[CLS]"
Token 2671 is "science"
Token 2003 is "is"
Token 2307 is "great"
Token 102 is "[SEP]"

science is great
[CLS] science is great [SEP]


In [21]:
# encode() wraps its input in [CLS] ... [SEP] on every call, and decode() keeps
# those markers as literal text -- so a second encode/decode round trip wraps
# the already-wrapped string again and the special tokens appear twice.
tokenizer.decode(tokenizer.encode(tokenizer.decode(tokenizer.encode(text))))

'[CLS] [CLS] science is great [SEP] [SEP]'

# Calling the tokenizer object directly


In [22]:
# Calling the tokenizer object itself also encodes a text, without going
# through the encode() method. The result is a mapping with 'input_ids',
# 'token_type_ids' and 'attention_mask' rather than a bare list of IDs.
tokenizer(text)

{'input_ids': [101, 2671, 2003, 2307, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [23]:
# For comparison: encode() returns only the ID list -- the same values as the
# 'input_ids' entry produced by tokenizer(text).
tokenizer.encode(text)

[101, 2671, 2003, 2307, 102]

In [35]:
## More on tokenizing

sentence = 'AI is both exciting and terrifying'

print('Original snetnce', f' {sentence}\n', sep='\n')

# Step 1: segment the text into tokens (strings, not token IDs yet).
tokenized = tokenizer.tokenize(sentence)
print('Tokenised (segmented) sentence:', f' {tokenized}', sep='\n')

# Step 2: map those tokens to IDs -- no special tokens are added on this path.
ids_from_tokens = tokenizer.convert_tokens_to_ids(tokenized)
print(f' {ids_from_tokens}\n')

# Step 3: encode straight from the original sentence for comparison.
encodedText = tokenizer.encode(sentence)
print('Encoded from original text:', f' {encodedText}\n\n', sep='\n')

# Step 4: decode both ID sequences back to text.
print('decoded from token-wise encoding:',
      f' {tokenizer.decode(ids_from_tokens)}\n',
      sep='\n')
print('decoded from text encoding:',
      f' {tokenizer.decode(encodedText)}\n',
      sep='\n')



Original snetnce
 AI is both exciting and terrifying

Tokenised (segmented) sentence:
 ['ai', 'is', 'both', 'exciting', 'and', 'terrifying']
 [9932, 2003, 2119, 10990, 1998, 17082]

Encoded from original text:
 [101, 9932, 2003, 2119, 10990, 1998, 17082, 102]


decoded from token-wise encoding:
 ai is both exciting and terrifying

decoded from text encoding:
 [CLS] ai is both exciting and terrifying [SEP]



In [27]:
# Naive split on single spaces, for comparison with tokenizer.tokenize():
# the words keep their original casing.
s = sentence.split(' ')
s

['AI', 'is', 'both', 'exciting', 'and', 'terrifying']

In [28]:
# Convert both token lists to IDs: `tokenized` comes from tokenizer.tokenize(),
# `s` from the plain str.split() on spaces.
i1 = tokenizer.convert_tokens_to_ids(tokenized)
i2 = tokenizer.convert_tokens_to_ids(s)

In [29]:
# IDs for the tokenizer-produced tokens.
i1

[9932, 2003, 2119, 10990, 1998, 17082]

In [30]:
# IDs for the whitespace-split words. Only the first differs from i1 --
# presumably because cased 'AI' is not an entry in this uncased vocab, so it
# gets the same fallback ID (100) seen earlier for 'science is great'.
i2

[100, 2003, 2119, 10990, 1998, 17082]

In [31]:
# The whitespace-split words, original casing preserved.
s

['AI', 'is', 'both', 'exciting', 'and', 'terrifying']

In [32]:
# The tokenizer's own tokens for the same sentence, lowercased.
tokenized

['ai', 'is', 'both', 'exciting', 'and', 'terrifying']