In [None]:
import os
import platform

import torch
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2Tokenizer
from icecream import ic
# Configure icecream for this notebook:
# - includeContext=True prefixes every ic() line with "<file>:<line> in <scope>".
# - argToStringFunction=str renders values with str(), so multi-line strings
#   (e.g. the chat-template prompt) are shown with real line breaks.
ic.configureOutput(includeContext=True, argToStringFunction=str)
# Widen ic's line-wrap width to 120 (presumably characters — confirm against the icecream docs).
ic.lineWrapWidth = 120


# Hugging Face repo id of the model used throughout the notebook.
REPO_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# Local directory the tokenizer/model are loaded from. Windows keeps models on
# the D: drive; every other platform (e.g. Linux) uses /data/models.
_models_root = "D:/models" if platform.system() == "Windows" else "/data/models"
SAVE_DIR = f"{_models_root}/{REPO_ID}"

# --- Device selection ---------------------------------------------------------
# Set `use_cpu` to False to run on a GPU. `GPU_INDEX` is the preferred CUDA
# device; it is checked against the devices that are actually visible so that
# (a) the logged device name is the one really used and (b) a machine with fewer
# GPUs does not fail later with an "invalid device ordinal" error.
use_cpu = True
GPU_INDEX = 2

if use_cpu:
    logger.info("Using CPU for inference")
    device = "cpu"
else:
    logger.info("Using GPU for inference")
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        if 0 <= GPU_INDEX < gpu_count:
            gpu_index = GPU_INDEX
        else:
            # Preferred GPU does not exist on this host: use the first one instead.
            logger.warning(
                f"cuda:{GPU_INDEX} is not available ({gpu_count} CUDA device(s) visible), using cuda:0"
            )
            gpu_index = 0
        device = f"cuda:{gpu_index}"
        logger.info(f"CUDA available, using {torch.cuda.get_device_name(gpu_index)} ({device})")
    else:
        logger.error("CUDA not available, falling back to CPU")
        device = "cpu"
        # Keep the flag in sync so later cells that test `use_cpu` take the CPU path.
        use_cpu = True

# Load the tokenizer from the local model directory.
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)

# Two probe sentences used to compare character count with token count.
# Noted from an earlier run: len(raw_prompt_ids) is 99 for len(cn_sentence) == 132;
# "中华人民共和国" is only 2 tokens, whereas "12.12301" is 8 tokens.
cn_sentence = "中华人民共和国在中文文本中，汉字数量与大语言模型（LLM）分词后 token 数量之间的比例，取决于模型所使用的（分词器），尤其是：是中英混合的 tokenizer（如 GPT-4 使用的 BPE 或 tiktoken）还是 专为中文设计的下面是一个总体参考估算："
# Noted from an earlier run: len(raw_prompt_ids) is 105 for len(cn_sentence1) == 134.
cn_sentence1 = "12.12301 在中文文本中，汉字数量与大语言模型（LLM）分词后 token 数量之间的比例，取决于模型所使用的（分词器），尤其是：是中英混合的 tokenizer（如 GPT-4 使用的 BPE 或 tiktoken）还是 专为中文设计的下面是一个总体参考估算："

# Single-turn conversation. An explicit system message could be added first, e.g.
# {"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate"}
messages = [dict(role="user", content=cn_sentence)]

# Token counts of the bare sentences (no chat template, no special tokens).
# Expected: len(raw_ids): 70, len(cn_sentence): 132
raw_ids = tokenizer.encode(cn_sentence, add_special_tokens=False)
ic(len(raw_ids), len(cn_sentence))
# Expected: len(raw_ids): 76, len(cn_sentence1): 134
raw_ids = tokenizer.encode(cn_sentence1, add_special_tokens=False)
ic(len(raw_ids), len(cn_sentence1))

# Render the conversation through the chat template as plain text (a str).
raw_prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
ic(raw_prompt, type(raw_prompt))

# Passing a list of conversations renders a batch: one prompt string per conversation.
raw_prompt2 = tokenizer.apply_chat_template(
    [messages] * 2,
    add_generation_prompt=True,
    tokenize=False,
)
ic(raw_prompt2, type(raw_prompt2))

# Encode the rendered prompt; the result is a plain list of integer token ids.
raw_prompt_ids = tokenizer.encode(raw_prompt, add_special_tokens=False)
ic(len(raw_prompt_ids), len(cn_sentence))
ic(raw_prompt_ids, type(raw_prompt_ids))

# Template + tokenize in one call, returned as a PyTorch tensor and moved to the target device.
tokenized_chat = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
).to(device)
ic(tokenized_chat, type(tokenized_chat))
# Round-trip back to text: first row only, then the whole batch.
ic(tokenizer.decode(tokenized_chat[0]))
ic(tokenizer.batch_decode(tokenized_chat))


[32m2025-08-05 22:43:55.104[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mUsing CPU for inference[0m


ic| 2485995145.py:44 in <module>- len(raw_ids): 70, len(cn_sentence): 132
ic| 2485995145.py:47 in <module>- len(raw_ids): 76, len(cn_sentence1): 134
ic| 2485995145.py:51 in <module>
    raw_prompt: <|im_start|>system
                You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
                <|im_start|>user
                中华人民共和国在中文文本中，汉字数量与大语言模型（LLM）分词后 token 数量之间的比例，取决于模型所使用的（分词器），尤其是：是中英混合的 tokenizer（如 GPT-4 使用的 BPE 或 tiktoken）还是 专为中文设计的下面是一个总体参考估算：<|im_end|>
                <|im_start|>assistant
    type(raw_prompt): <class 'str'>
ic| 2485995145.py:56 in <module>
    raw_prompt2: ['<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n中华人民共和国在中文文本中，汉字数量与大语言模型（LLM）分词后 token 数量之间的比例，取决于模型所使用的（分词器），尤其是：是中英混合的 tokenizer（如 GPT-4 使用的 BPE 或 tiktoken）还是 专为中文设计的下面是一个总体参考估算：<|im_end|>\n<|im_start|>assistant\n', '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful ass

['<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n中华人民共和国在中文文本中，汉字数量与大语言模型（LLM）分词后 token 数量之间的比例，取决于模型所使用的（分词器），尤其是：是中英混合的 tokenizer（如 GPT-4 使用的 BPE 或 tiktoken）还是 专为中文设计的下面是一个总体参考估算：<|im_end|>\n<|im_start|>assistant\n']

In [None]:
# Load the causal LM from the local directory in bfloat16, placed on the selected device.
model = AutoModelForCausalLM.from_pretrained(
    SAVE_DIR,
    torch_dtype=torch.bfloat16,
    device_map=device,
)

# Tokenize the already-templated prompt text; special tokens are not added again
# because the template text already contains them.
model_inputs = tokenizer(
    raw_prompt,
    add_special_tokens=False,
    return_tensors="pt",
)
ic(model_inputs)

# Generation is only run when a GPU is in use; it is skipped on CPU.
if not use_cpu:
    outputs = model.generate(tokenized_chat, max_new_tokens=128)
    ic(outputs, type(outputs))
    ic(tokenizer.decode(outputs[0]))

print("Done")

In [9]:
# Prefill demo: the final assistant message holds the opening of a JSON object,
# and continue_final_message=True leaves that message open so the model
# continues it instead of starting a new turn.
chat = [
    dict(role="user", content="Can you format the answer in JSON?"),
    dict(role="assistant", content='{"name": "'),
]

formatted_chat = tokenizer.apply_chat_template(
    chat,
    continue_final_message=True,
    tokenize=True,
    return_tensors="pt",
)
# Show the prompt token ids before moving them to the target device.
print(formatted_chat)
formatted_chat = formatted_chat.to(device)

# Generate the continuation and print the full decoded sequence (prompt + completion).
outputs = model.generate(formatted_chat, max_new_tokens=800)
print(tokenizer.decode(outputs[0]))

tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,   6713,    498,   3561,
            279,   4226,    304,   4718,     30, 151645,    198, 151644,  77091,
            198,   4913,    606,    788,    330]])
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Can you format the answer in JSON?<|im_end|>
<|im_start|>assistant
{"name": "formatting_answer", "input": {"text": "Yes"}, "output": {"answer": "Yes"} }<|im_end|>
