In [27]:
from tokenizers import (BertWordPieceTokenizer,
                        SentencePieceBPETokenizer,
                        ByteLevelBPETokenizer,
                        CharBPETokenizer)

from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

# Load the pretrained BERT WordPiece vocabulary (one token per line).
tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)

# NOTE: the BPE-based implementations cannot load a WordPiece ``vocab.txt``;
# they need a ``vocab.json`` + ``merges.txt`` pair. Handing them the BERT
# vocabulary file silently left them with vocabulary_size=0, so they are
# created explicitly untrained here. Until they are trained (``.train``) or
# given proper vocab/merges files they have nothing to encode with
# (e.g. the SentencePiece one reports num_tokens=0 for the sample below).
tokenizer1 = SentencePieceBPETokenizer()
tokenizer2 = ByteLevelBPETokenizer()
tokenizer3 = CharBPETokenizer()

print('tokenizer', tokenizer)
print('tokenizer1', tokenizer1)
print('tokenizer2', tokenizer2)
print('tokenizer3', tokenizer3)
# Expected for the BERT tokenizer:
# Tokenizer(vocabulary_size=30522, model=BertWordPiece, unk_token=[UNK],
# sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK],
# clean_text=True, handle_chinese_chars=True, strip_accents=True,
# lowercase=True, wordpieces_prefix=##)

# Tokenizers provide exhaustive outputs: tokens, mapping to the original
# string, and attention/special-token masks. They also handle the model's
# maximum input length as well as padding (to encode directly in padded
# batches).
sample_text = "Hello, y'all! How are you?"
output = tokenizer.encode(sample_text)
output1 = tokenizer1.encode(sample_text)
output2 = tokenizer2.encode(sample_text)
output3 = tokenizer3.encode(sample_text)

print('output', output)
print('output1', output1)
print('output2', output2)
print('output3', output3)

# Expected for the BERT tokenizer:
# Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets,
#          attention_mask, special_tokens_mask, overflowing])

# [101, 7592, 1010, 1061, 1005, 2035, 999, 2129, 2024, 2017, 1029, 102]
print(f"ids: {output.ids}")
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
print(f"type_ids: {output.type_ids}")
# ['[CLS]', 'hello', ',', 'y', "'", 'all', '!', 'how', 'are', 'you', '?', '[SEP]']
print(f"tokens: {output.tokens}")
# [(0, 0), (0, 5), (5, 6), (7, 8), (8, 9), (9, 12), (12, 13),
#  (14, 17), (18, 21), (22, 25), (25, 26), (0, 0)]
print(f"offsets: {output.offsets}")
# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
print(f"attention_mask: {output.attention_mask}")
# [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
print(f"special_tokens_mask: {output.special_tokens_mask}")
# []
print(f"overflowing: {output.overflowing}")

# Provided tokenizers:
# CharBPETokenizer: the original BPE
# ByteLevelBPETokenizer: the byte-level version of BPE
# SentencePieceBPETokenizer: a BPE implementation compatible with the one used by SentencePiece
# BertWordPieceTokenizer: the famous BERT tokenizer, using WordPiece



tokenizer Tokenizer(vocabulary_size=30522, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True, wordpieces_prefix=##)
tokenizer1 Tokenizer(vocabulary_size=0, model=SentencePieceBPE, unk_token=<unk>, replacement=▁, add_prefix_space=True, dropout=None)
tokenizer2 Tokenizer(vocabulary_size=0, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)
tokenizer3 Tokenizer(vocabulary_size=0, model=BPE, unk_token=<unk>, suffix=</w>, dropout=None, lowercase=False, unicode_normalizer=None, bert_normalizer=True, split_on_whitespace_only=False)
output Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
output1 Encoding(num_tokens=0, attributes=[ids, type_ids, tokens, offsets, atten

In [28]:
# Plain-text corpus every tokenizer in this notebook is trained on.
DATAFILE = "pg16457.txt"
# Target directory for saved models (empty string; only referenced by the
# disabled save calls — presumably meaning the working directory, confirm).
MODELDIR = ""

# Sentence each trained tokenizer encodes, so their outputs can be compared.
input_text = "This is a test"

# --- Training the tokenizers -------------------------------------------------

# CharBPETokenizer
print("========= CharBPETokenizer ==========")
tokenizer = CharBPETokenizer()
tokenizer.train(files=[DATAFILE], vocab_size=500)





In [29]:
# Encode the sample sentence with the CharBPETokenizer trained in the
# previous cell.
output = tokenizer.encode(input_text)
print(output.tokens)    # ['T', 'his</w>', 'is</w>', 'a</w>', 't', 'est</w>']

# The remaining implementations all follow the same recipe: print a banner,
# build a fresh tokenizer, train it on DATAFILE with a 500-entry vocabulary,
# then show how it splits the sample sentence.
#
# Saving the trained models is disabled; it was sketched as
# tokenizer.save(MODELDIR, <name>) with the names 'char_bpe', 'byte_bpe',
# 'tok_sp_bpe' and 'bert_bpe'.
#
# Expected tokens:
#   ByteLevelBPETokenizer:     ['T', 'h', 'is', 'Ġis', 'Ġa', 'Ġt', 'est']
#   SentencePieceBPETokenizer: ['▁T', 'h', 'is', '▁is', '▁a', '▁t', 'est']
#   BertWordPieceTokenizer:    ['this', 'is', 'a', 't', '##est']
for banner, tokenizer_cls in (
    ("========= ByteLevelBPETokenizer ==========", ByteLevelBPETokenizer),
    ("========= SentencePieceBPETokenizer ==========", SentencePieceBPETokenizer),
    ("========= BertWordPieceTokenizer ==========", BertWordPieceTokenizer),
):
    print(banner)
    # Rebind the notebook-level names so that, after the loop, `tokenizer`
    # and `output` refer to the last (BERT WordPiece) run.
    tokenizer = tokenizer_cls()
    tokenizer.train([DATAFILE], vocab_size=500)
    output = tokenizer.encode(input_text)
    print(output.tokens)

['T', 'his</w>', 'is</w>', 'a</w>', 't', 'est</w>']
['T', 'h', 'is', 'Ġis', 'Ġa', 'Ġt', 'est']
['▁T', 'h', 'is', '▁is', '▁a', '▁t', 'est']
['this', 'is', 'a', 't', '##est']


In [30]:
%history

from tokenizers import (BertWordPieceTokenizer,
                        SentencePieceBPETokenizer,
                        ByteLevelBPETokenizer,
                        CharBPETokenizer)

from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

tokenizer = BertWordPieceTokenizer("../data/bert-base-uncased-vocab.txt", lowercase=True)
print(tokenizer)
# Tokenizer(vocabulary_size=30522, model=BertWordPiece, unk_token=[UNK],
# sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK],
# clean_text=True, handle_chinese_chars=True, strip_accents=True,
# lowercase=True, wordpieces_prefix=##)

# Tokenizers provide exhaustive outputs: tokens, mapping to original string, attention/special token masks.
# They also handle model's max input lengths as well as padding (to directly encode in padded batches)
output = tokenizer.encode("Hello, y'all! How are you?")

print(output)   # Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attentio