## 1. Hugging Face Tokenizers库

In [None]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

# Train a byte-level BPE tokenizer from a plain-text corpus and save it as JSON.

# Start from an empty BPE model; its vocabulary and merges are learned by the trainer.
tokenizer = Tokenizer(models.BPE())
# Split input at the byte level before BPE is applied.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)

# Trainer configuration.
trainer = trainers.BpeTrainer(
    vocab_size=30000,
    special_tokens=["<unk>", "<pad>", "<s>", "</s>"],
    # Seed the vocabulary with the complete byte-level alphabet. Without this,
    # byte symbols that never occur in the training corpus are missing from
    # the vocabulary and cannot be represented when encoding new text.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

# Learn the vocabulary from the corpus file(s).
tokenizer.train(files=["corpus.txt"], trainer=trainer)

# Persist the trained tokenizer to a single JSON file.
tokenizer.save("custom_bpe.json")
# Reload it from that file whenever the tokenizer is needed again.
Tokenizer.from_file("custom_bpe.json")

## 2. SentencePiece库

In [ ]:
import sentencepiece as spm

# Train a SentencePiece BPE model on the corpus, then load it and tokenize a sample.

# Training options gathered in one place; ids 0-3 are assigned to the
# pad / unk / bos / eos tokens respectively.
train_options = dict(
    input='corpus.txt',
    model_prefix='custom_sp',
    vocab_size=20000,
    model_type='bpe',
    character_coverage=0.9995,
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)
spm.SentencePieceTrainer.Train(**train_options)

# Load the trained model file for use.
sp = spm.SentencePieceProcessor()
sp.load("custom_sp.model")

# Encoding example: print the subword pieces of a sample sentence.
sample_text = "自然语言处理"
sample_pieces = sp.encode_as_pieces(sample_text)
print(sample_pieces)

## 3. 扩展预训练Tokenizer
针对LLM模型添加新token的方法（以LLaMA为例）

In [ ]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Extend a pretrained tokenizer with custom marker tokens and resize the
# model's embedding matrix to match.

# 1. Load the base tokenizer and model.
# NOTE: the identifier below is a placeholder and may require a correct path
# or access rights; replace it with a model name you can actually load.
model_name = "meta-llama/Llama-3-2B"  # replace with an available model name
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
except Exception as e:
    print(f"加载模型或tokenizer失败: {e}")
    # Stop with a non-zero status. The bare `exit()` helper is meant for the
    # interactive shell, reports success (status 0), and shuts down the
    # kernel when run inside IPython/Jupyter.
    raise SystemExit(1) from e

# 2. Register the custom marker tokens; add_tokens returns how many were new.
new_tokens = ["<think>", "</think>", "<answer>", "</answer>"]
num_new_tokens = tokenizer.add_tokens(new_tokens)
print(f"添加了 {num_new_tokens} 个新标记")

# 3. Resize the model's token embeddings so their size matches the
#    tokenizer's (now larger) vocabulary.
model.resize_token_embeddings(len(tokenizer))
print(f"模型嵌入层已调整为: {model.config.vocab_size}")

# 4. Check how text containing the new markers is encoded.
test_text = "<think>推理过程</think><answer>最终答案</answer>"
encoded = tokenizer.encode(test_text)
print(f"编码结果: {encoded}")

# 5. (Optional) Decode the ids again as a round-trip check.
decoded = tokenizer.decode(encoded)
print(f"解码结果: {decoded}")

# 6. (Optional) Save the modified tokenizer and model to the same directory
#    so they can be reloaded together.
output_dir = "./custom_model"
tokenizer.save_pretrained(output_dir)
model.save_pretrained(output_dir)
print(f"已保存到: {output_dir}")