# SentencePiece 演示
本笔记本通过若干示例介绍如何设置和使用 SentencePiece 子词分词库。

In [1]:
#@title 导入我们的库
!pip install tokenizers
import sentencepiece as spm
import re
import os
import pandas as pd



In [10]:
#@title 检查数据集

from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

# Collect the path of every .txt file found under the current working directory.
files = [str(x) for x in Path(".").glob("**/*.txt")]

# Preview at most the first two paths. Slicing (instead of indexing 0 and 1)
# avoids an IndexError when fewer than two .txt files are present.
for path in files[:2]:
    print(path)

botchan.txt
qq.txt


In [12]:
#@title 提取数据
text = []
# Any number of files could be read here; only the first two are used as an example.
# Slicing keeps this safe when `files` holds fewer than two entries.
for path in files[:2]:
    # `files` is built by globbing from ".", so its entries are already valid
    # relative to the working directory; they are opened as-is rather than
    # being re-rooted under a hard-coded "/content/" prefix.
    # The context manager guarantees the handle is closed after reading.
    with open(path, encoding="latin-1") as source:
        for line in source:
            # Drop lines containing "<" and lines shorter than 5 characters
            # (the length includes the trailing newline).
            if "<" in line or len(line) < 5:
                continue
            text.append(line)

In [13]:
#@title 将数据写入文件
# Write the collected lines to a single corpus file.
# The encoding is set explicitly: the lines were decoded as latin-1 and may hold
# non-ASCII characters, and SentencePiece reads its training input as UTF-8, so
# relying on the platform's default encoding could fail or produce a bad corpus.
with open('blog_test.txt', 'w', encoding='utf-8') as fw:
    # Each line still carries its own newline, so the lines are written verbatim.
    fw.writelines(text)

In [16]:
#@title 训练 BPE 模型
# Train a SentencePiece BPE model on our corpus (blog_test.txt).
# The flags request a 500-piece vocabulary and use "bpe" as the output file prefix.
bpe_train_args = '--model_type=bpe --input=blog_test.txt --model_prefix=bpe --vocab_size=500'
spm.SentencePieceTrainer.train(bpe_train_args)

In [17]:
#@title 训练 Unigram 模型
# Train a SentencePiece Unigram model on the same corpus (blog_test.txt).
# The flags request a 500-piece vocabulary and use "uni" as the output file prefix.
uni_train_args = '--model_type=unigram --input=blog_test.txt --model_prefix=uni --vocab_size=500'
spm.SentencePieceTrainer.train(uni_train_args)

In [18]:
#@title 加载新训练的模型
# Create a processor instance and load the trained BPE model file into it.
# The result of load() is the cell's last expression, so it is displayed.
bpe_model_file = 'bpe.model'
sp_bpe = spm.SentencePieceProcessor()
sp_bpe.load(bpe_model_file)

True

In [19]:
# Create a processor instance and load the trained Unigram model file (uni.model) into it.
# The result of load() is the cell's last expression, so it is displayed.
uni_model_file = 'uni.model'
sp_uni = spm.SentencePieceProcessor()
sp_uni.load(uni_model_file)

True

In [20]:
#@title 查看一些示例令牌
# Segment the same sentence with both models to compare the resulting pieces.
sample = 'This is a test'
print("BPE: {}".format(sp_bpe.encode_as_pieces(sample)))
print("UNI: {}".format(sp_uni.encode_as_pieces(sample)))

BPE: ['▁Th', 'is', '▁is', '▁a', '▁t', 'est']
UNI: ['▁This', '▁is', '▁a', '▁', 'te', 'st']


In [21]:
# Same sentence as before, this time with a leading space.
padded_sample = ' This is a test'
print("BPE: {}".format(sp_bpe.encode_as_pieces(padded_sample)))
print("UNI: {}".format(sp_uni.encode_as_pieces(padded_sample)))

BPE: ['▁Th', 'is', '▁is', '▁a', '▁t', 'est']
UNI: ['▁This', '▁is', '▁a', '▁', 'te', 'st']


In [22]:
BPE: ['▁This', '▁is', '▁a', '▁t', 'est']
UNI: ['▁Thi', 's', '▁is', '▁a', '▁t', 'est']


In [None]:
#@title 获取所有 BPE 的列表
# Look up every piece in the BPE vocabulary by id, then order them longest-first.
vocabs = [sp_bpe.id_to_piece(piece_id) for piece_id in range(sp_bpe.get_piece_size())]
bpe_tokens = sorted(vocabs, key=len, reverse=True)
bpe_tokens

In [None]:
# Look up every piece in the Unigram vocabulary by id, then order them longest-first.
# The result gets its own name, `uni_tokens`, so the BPE list held in
# `bpe_tokens` is not overwritten with Unigram pieces.
vocabs = [sp_uni.id_to_piece(piece_id) for piece_id in range(sp_uni.get_piece_size())]
uni_tokens = sorted(vocabs, key=len, reverse=True)
uni_tokens

In [25]:
#@title 逆转流程
# encode: text => pieces, and text => ids, for each model in turn
sentence = 'This is a test'

print("BPE {}".format(sp_bpe.encode_as_pieces(sentence)))
print("BPE {}".format(sp_bpe.encode_as_ids(sentence)))

print("UNI {}".format(sp_uni.encode_as_pieces(sentence)))
print("UNI {}".format(sp_uni.encode_as_ids(sentence)))

BPE ['▁Th', 'is', '▁is', '▁a', '▁t', 'est']
BPE [388, 24, 97, 5, 3, 264]
UNI ['▁This', '▁is', '▁a', '▁', 'te', 'st']
UNI [299, 61, 13, 3, 99, 43]


In [29]:
# decode: pieces => text, and ids => text
# The pieces and ids below are literal values, so they are tied to the
# specific models this notebook was run with.
bpe_pieces = ['▁Th', 'is', '▁is', '▁a', '▁t', 'est']
bpe_ids = [388, 24, 97, 5, 3, 264]
uni_pieces = ['▁This', '▁is', '▁a', '▁', 'te', 'st']
uni_ids = [299, 61, 13, 3, 99, 43]

print("BPE {}".format(sp_bpe.decode_pieces(bpe_pieces)))
print("BPE {}".format(sp_bpe.decode_ids(bpe_ids)))

print("UNI {}".format(sp_uni.decode_pieces(uni_pieces)))
print("UNI {}".format(sp_uni.decode_ids(uni_ids)))

BPE This is a test
BPE This is a test
UNI This is a test
UNI This is a test


In [30]:
#@title 获取令牌列表
# Map every id in the BPE vocabulary to its piece, then sort longest-first.
vocabs = list(map(sp_bpe.id_to_piece, range(sp_bpe.get_piece_size())))
bpe_list = sorted(vocabs, key=len, reverse=True)

In [31]:
# Map every id in the Unigram vocabulary to its piece, then sort longest-first.
uni_vocab_size = sp_uni.get_piece_size()
vocabs = [sp_uni.id_to_piece(idx) for idx in range(uni_vocab_size)]
uni_list = sorted(vocabs, key=len, reverse=True)

In [32]:
# Pieces that appear in the Unigram vocabulary but not in the BPE vocabulary.
# Membership is checked against a set, so the scan is linear rather than
# quadratic; iterating over uni_list keeps the original ordering of the result.
bpe_lookup = set(bpe_list)
uni_tok_diff = [u for u in uni_list if u not in bpe_lookup]
print(len(uni_tok_diff))

218


In [33]:
# Pieces that appear in the BPE vocabulary but not in the Unigram vocabulary.
# Membership is checked against a set, so the scan is linear rather than
# quadratic; iterating over bpe_list keeps the original ordering of the result.
uni_lookup = set(uni_list)
bpe_tok_diff = [b for b in bpe_list if b not in uni_lookup]
print(len(bpe_tok_diff))

218


In [None]:
# Put the two difference lists side by side in a table.
# Rows are paired purely by position (zip stops at the shorter list); a row
# does not imply any relationship between its two pieces.
diff_columns = ["Unigram tokens not in BPE", "BPE tokens not in Unigram"]
diff_pairs = [pair for pair in zip(uni_tok_diff, bpe_tok_diff)]
diff_df = pd.DataFrame(diff_pairs, columns=diff_columns)
diff_df.head()

In [49]:
#@title 比较tokens
# How does each tokenizer handle pieces that are missing from its vocabulary?
# Start with the pieces that are in the BPE vocabulary but not in the Unigram one.
bpe_only_column = diff_df['BPE tokens not in Unigram']
bpe_only_column.values.tolist()

['orcupine',
 'utenberg',
 '▁princip',
 'cupine',
 '▁start',
 'chool',
 'acher',
 'ought',
 '▁stud',
 '▁prin',
 'other',
 'ould',
 'ight',
 'hing',
 '▁wor',
 '▁whi',
 'llow',
 '▁hou',
 '▁com',
 '....',
 '▁bec',
 '▁int',
 'ject',
 'ents',
 '▁app',
 'nder',
 '▁Pro',
 '▁loo',
 'berg',
 'lown',
 'uten',
 'very',
 'here',
 'ving',
 'ated',
 'self',
 '▁res',
 'ared',
 'omet',
 '▁mat',
 '▁bet',
 'hat',
 '▁th',
 '▁st',
 'ght',
 'ith',
 'ion',
 'her',
 '▁wh',
 'ver',
 '▁su',
 'out',
 'red',
 'ard',
 '▁te',
 '▁se',
 'ine',
 '▁ab',
 '▁fr',
 '▁sa',
 '▁fe',
 '▁li',
 'ect',
 'irt',
 'ked',
 'hen',
 '▁Sh',
 'ill',
 'art',
 'ess',
 'ore',
 '▁ne',
 'own',
 'ame',
 'ain',
 'all',
 'ool',
 'ong',
 '▁ro',
 'cup',
 'ake',
 'ind',
 '▁fa',
 'ood',
 'ble',
 'ack',
 '▁al',
 'ang',
 '▁af',
 'ust',
 'ers',
 '▁le',
 '▁tr',
 'han',
 'one',
 'ist',
 'est',
 '▁pr',
 'ven',
 'ass',
 'ber',
 '▁kn',
 '▁tw',
 'res',
 '▁qu',
 '▁ag',
 'hed',
 '▁wr',
 'ree',
 'ide',
 '▁am',
 'ast',
 'ite',
 'ime',
 'ice',
 '▁It',
 '▁po',
 

In [50]:
# Segment a few BPE-only pieces with both models and print the results together.
test_list = ["orcupine", "utenberg", "cupine", "chool"]
report_template = "Unigram token {} \nBPE token {}\n"
for fragment in test_list:
    uni_pieces = sp_uni.encode_as_pieces(fragment)
    bpe_pieces = sp_bpe.encode_as_pieces(fragment)
    print(report_template.format(uni_pieces, bpe_pieces))

Unigram token ['▁or', 'c', 'u', 'p', 'in', 'e'] 
BPE token ['▁or', 'cupine']

Unigram token ['▁', 'ut', 'en', 'b', 'er', 'g'] 
BPE token ['▁', 'utenberg']

Unigram token ['▁c', 'u', 'p', 'in', 'e'] 
BPE token ['▁c', 'u', 'p', 'ine']

Unigram token ['▁', 'ch', 'o', 'o', 'l'] 
BPE token ['▁c', 'h', 'ool']



In [51]:
# Encode the same text ten times with the Unigram model, printing each result.
repeat_count = 10
for _attempt in range(repeat_count):
    print(sp_uni.encode_as_pieces('hello world'))

['▁he', 'll', 'o', '▁w', 'or', 'l', 'd']
['▁he', 'll', 'o', '▁w', 'or', 'l', 'd']
['▁he', 'll', 'o', '▁w', 'or', 'l', 'd']
['▁he', 'll', 'o', '▁w', 'or', 'l', 'd']
['▁he', 'll', 'o', '▁w', 'or', 'l', 'd']
['▁he', 'll', 'o', '▁w', 'or', 'l', 'd']
['▁he', 'll', 'o', '▁w', 'or', 'l', 'd']
['▁he', 'll', 'o', '▁w', 'or', 'l', 'd']
['▁he', 'll', 'o', '▁w', 'or', 'l', 'd']
['▁he', 'll', 'o', '▁w', 'or', 'l', 'd']


In [52]:
# Sampling can return a different segmentation on each request.
# There are two hyperparameters for sampling (nbest_size and inverse temperature);
# see the paper [kudo18] for details.
nbest_size = -1
inverse_temperature = 0.1
for _draw in range(10):
    print(sp_uni.sample_encode_as_pieces('remembers', nbest_size, inverse_temperature))

['▁', 'r', 'e', 'm', 'e', 'm', 'b', 'er', 's']
['▁re', 'm', 'e', 'm', 'b', 'er', 's']
['▁', 're', 'm', 'e', 'm', 'b', 'e', 'r', 's']
['▁', 'r', 'e', 'm', 'e', 'm', 'b', 'er', 's']
['▁', 're', 'm', 'e', 'm', 'b', 'e', 'r', 's']
['▁re', 'm', 'e', 'm', 'b', 'e', 'r', 's']
['▁', 're', 'm', 'e', 'm', 'b', 'er', 's']
['▁re', 'm', 'e', 'm', 'b', 'er', 's']
['▁', 're', 'm', 'e', 'm', 'b', 'e', 'r', 's']
['▁', 're', 'm', 'e', 'm', 'b', 'er', 's']


In [53]:
# Ask the Unigram model for its 10 best segmentations and print one per line.
best_seg = sp_uni.nbest_encode_as_pieces('remembers', 10)
for segmentation in best_seg:
    print(segmentation)

['▁re', 'm', 'e', 'm', 'b', 'er', 's']
['▁', 're', 'm', 'e', 'm', 'b', 'er', 's']
['▁re', 'm', 'e', 'm', 'b', 'e', 'r', 's']
['▁', 'r', 'e', 'm', 'e', 'm', 'b', 'er', 's']
['▁', 're', 'm', 'e', 'm', 'b', 'e', 'r', 's']
['▁', 'r', 'e', 'm', 'e', 'm', 'b', 'e', 'r', 's']


In [54]:
#@title HuggingFace 分词器
from tokenizers import (ByteLevelBPETokenizer,
                            SentencePieceBPETokenizer,
                            BertWordPieceTokenizer)

# Train a HuggingFace SentencePiece-style BPE tokenizer on the same corpus.
# The corpus path is relative, matching how blog_test.txt is written and how
# the SentencePiece trainers read it, so this cell does not depend on the
# working directory being /content.
tokenizer = SentencePieceBPETokenizer()
tokenizer.train(["blog_test.txt"], vocab_size=500, min_frequency=2)

# Encode a sample sentence and show the resulting tokens.
output = tokenizer.encode("This is a test")
print(output.tokens)

['▁Th', 'is', '▁is', '▁a', '▁t', 'est']
