In [1]:
from datasets import load_dataset

dataset = load_dataset("wanhin/vi_text", split='train')

texts = dataset['text']

print(texts[:2])

  from .autonotebook import tqdm as notebook_tqdm


['Anh ta cần xác định tính chất của các mẫu này', 'Nhiều mưa hơn có nghĩa là nhiều nước mưa đi qua đất hơn']


In [2]:
from tokenizers import BertWordPieceTokenizer

tokenizer_wordpiece = BertWordPieceTokenizer(lowercase=False)
tokenizer_wordpiece.train_from_iterator(texts, vocab_size=10000, min_frequency=2)

tokenizer_wordpiece.pad_token = '[PAD]'
tokenizer_wordpiece.cls_token = '[CLS]'
tokenizer_wordpiece.sep_token = '[SEP]'
tokenizer_wordpiece.unk_token = '[UNK]'
tokenizer_wordpiece.mask_token = '[MASK]'
print(f"PAD Token: {tokenizer_wordpiece.pad_token}")
print(f"CLS Token: {tokenizer_wordpiece.cls_token}")
print(f"SEP Token: {tokenizer_wordpiece.sep_token}")
print(f"UNK Token: {tokenizer_wordpiece.unk_token}")
print(f"MASK Token: {tokenizer_wordpiece.mask_token}")

vocab = tokenizer_wordpiece.get_vocab()
print("Vocabulary size:", len(vocab))

sample_text = texts[0]
encoded_sample_wordpiece = tokenizer_wordpiece.encode(sample_text)
print(f"Tokens (WordPiece): {encoded_sample_wordpiece.tokens}")
print(f"Token IDs (WordPiece): {encoded_sample_wordpiece.ids}")

decoded_sample_wordpiece = tokenizer_wordpiece.decode(encoded_sample_wordpiece.ids)
print(f"Decoded (WordPiece): {decoded_sample_wordpiece}")




PAD Token: [PAD]
CLS Token: [CLS]
SEP Token: [SEP]
UNK Token: [UNK]
MASK Token: [MASK]
Vocabulary size: 10000
Tokens (WordPiece): ['Anh', 'ta', 'cần', 'xác', 'định', 'tính', 'chất', 'của', 'các', 'mẫu', 'này']
Token IDs (WordPiece): [1100, 1120, 1314, 1339, 1149, 1367, 1468, 959, 974, 2332, 996]
Decoded (WordPiece): Anh ta cần xác định tính chất của các mẫu này


In [3]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from tokenizers.normalizers import Lowercase

tokenizer_bpe = Tokenizer(models.BPE())
tokenizer_bpe.normalizer = Lowercase()
tokenizer_bpe.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.BpeTrainer(vocab_size=10000, min_frequency=2)
tokenizer_bpe.train_from_iterator(texts, trainer=trainer)

tokenizer_bpe.pad_token = '[PAD]'
tokenizer_bpe.cls_token = '[CLS]'
tokenizer_bpe.sep_token = '[SEP]'
tokenizer_bpe.unk_token = '[UNK]'
tokenizer_bpe.mask_token = '[MASK]'
print(f"PAD Token: {tokenizer_wordpiece.pad_token}")
print(f"CLS Token: {tokenizer_wordpiece.cls_token}")
print(f"SEP Token: {tokenizer_wordpiece.sep_token}")
print(f"UNK Token: {tokenizer_wordpiece.unk_token}")
print(f"MASK Token: {tokenizer_wordpiece.mask_token}")

vocab = tokenizer_wordpiece.get_vocab()
print("Vocabulary size:", len(vocab))

encoded_sample_bpe = tokenizer_bpe.encode(sample_text)
print(f"Tokens (BPE): {encoded_sample_bpe.tokens}")
print(f"Token IDs (BPE): {encoded_sample_bpe.ids}")

decoded_sample_bpe = tokenizer_bpe.decode(encoded_sample_bpe.ids)
print(f"Decoded (BPE): {decoded_sample_bpe}")





PAD Token: [PAD]
CLS Token: [CLS]
SEP Token: [SEP]
UNK Token: [UNK]
MASK Token: [MASK]
Vocabulary size: 10000
Tokens (BPE): ['anh', 'ta', 'cần', 'xác', 'định', 'tính', 'chất', 'của', 'các', 'mẫu', 'này']
Token IDs (BPE): [450, 537, 782, 766, 619, 823, 919, 428, 448, 1681, 467]
Decoded (BPE): anh ta cần xác định tính chất của các mẫu này


In [5]:
from tokenizers import SentencePieceBPETokenizer

tokenizer_sentencepiece = SentencePieceBPETokenizer()

tokenizer_sentencepiece.train_from_iterator(texts, vocab_size=10000, min_frequency=2)

tokenizer_sentencepiece.pad_token = '[PAD]'
tokenizer_sentencepiece.cls_token = '[CLS]'
tokenizer_sentencepiece.sep_token = '[SEP]'
tokenizer_sentencepiece.unk_token = '[UNK]'
tokenizer_sentencepiece.mask_token = '[MASK]'
print(f"PAD Token: {tokenizer_sentencepiece.pad_token}")
print(f"CLS Token: {tokenizer_sentencepiece.cls_token}")
print(f"SEP Token: {tokenizer_sentencepiece.sep_token}")
print(f"UNK Token: {tokenizer_sentencepiece.unk_token}")
print(f"MASK Token: {tokenizer_sentencepiece.mask_token}")

vocab = tokenizer_wordpiece.get_vocab()
print("Vocabulary size:", len(vocab))

encoded_sample_sentencepiece = tokenizer_sentencepiece.encode(sample_text)
print(f"Tokens (SentencePiece): {encoded_sample_sentencepiece.tokens}")
print(f"Token IDs (SentencePiece): {encoded_sample_sentencepiece.ids}")

decoded_sample_sentencepiece = tokenizer_sentencepiece.decode(encoded_sample_sentencepiece.ids)
print(f"Decoded (SentencePiece): {decoded_sample_sentencepiece}")





PAD Token: [PAD]
CLS Token: [CLS]
SEP Token: [SEP]
UNK Token: [UNK]
MASK Token: [MASK]
Vocabulary size: 10000
Tokens (SentencePiece): ['▁Anh', '▁ta', '▁cần', '▁xác', '▁định', '▁tính', '▁chất', '▁của', '▁các', '▁mẫu', '▁này']
Token IDs (SentencePiece): [708, 728, 930, 956, 757, 986, 1092, 544, 565, 1969, 591]
Decoded (SentencePiece): Anh ta cần xác định tính chất của các mẫu này
