Merge pull request #66 from tiandiweizun/main
Fix the issue where newly added Chinese tokens do not take effect.
stanleylsx committed Oct 26, 2023
2 parents 75cdd7b + edbac7d commit 0466951
Showing 1 changed file with 2 additions and 1 deletion.
engines/utils/expand_vocab.py (3 changes: 2 additions & 1 deletion)

@@ -9,6 +9,7 @@
 import shutil
 import sentencepiece as sp
 from transformers import AutoTokenizer, AutoModel
+from tokenizers import AddedToken


 cur_dir = os.path.dirname(os.path.abspath(__file__))

@@ -67,7 +68,7 @@ def add_new_tokens(logger, tokenizer, save_path):
     raw_vocab = [sp_bpe.id_to_piece(id) for id in range(sp_bpe.get_piece_size())]
     clean_vocab = list(set(filter(is_chinese, raw_vocab)))

-    tokenizer.add_tokens(clean_vocab)
+    tokenizer.add_tokens([AddedToken(token, normalized=False) for token in clean_vocab])
     tokenizer.save_pretrained(save_path)
     logger.info(f'New tokens added, new tokenizer is saved to {save_path}.')
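Background on why this works (not stated in the commit itself, so treat it as a likely explanation rather than the author's wording): when plain strings are passed to tokenizer.add_tokens, transformers wraps them as AddedToken objects with normalized=True, so the tokenizer's normalizer is applied when matching the added token against input text; depending on that normalizer, the added Chinese pieces may never match. Passing normalized=False makes the tokenizer match the raw token content verbatim. A minimal sketch of the two calls, using a Hugging Face fast tokenizer (the checkpoint and example token are illustrative, not the ones this repository uses):

# Sketch: adding a token with and without normalization of the added token.
# Checkpoint and token are illustrative; behaviour depends on the normalizer.
from tokenizers import AddedToken
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

# Before the fix: a plain string defaults to AddedToken(..., normalized=True),
# so the normalizer can rewrite the text before the added token is matched.
tokenizer.add_tokens(['中文词'])

# After the fix: normalized=False matches the raw input text verbatim,
# so the added token reliably takes effect.
tokenizer.add_tokens([AddedToken('中文词', normalized=False)])

print(tokenizer.tokenize('中文词'))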
