In [12]:
import os
import platform

import torch
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer


# Model identifier; also reused as the sub-path under the local model root.
REPO_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# Local checkpoint directory: a D: drive path on Windows, /data/model on any
# other platform (the fallback branch was written with Linux in mind).
SAVE_DIR = (
    f"D:/models/{REPO_ID}"
    if platform.system() == "Windows"
    else f"/data/model/{REPO_ID}"
)

# Set to 1 to force CPU inference even when a GPU is present.
use_cpu = 0
# Preferred CUDA device ordinal; only used when that device actually exists.
GPU_INDEX = 2

if use_cpu:
    logger.info("Using CPU for inference")
    device = "cpu"
else:
    logger.info("Using GPU for inference")
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        # Fall back to the first GPU instead of failing later with an invalid
        # device ordinal when fewer GPUs are visible than GPU_INDEX assumes.
        gpu_index = GPU_INDEX if 0 <= GPU_INDEX < gpu_count else 0
        if gpu_index != GPU_INDEX:
            logger.warning(
                f"cuda:{GPU_INDEX} not present ({gpu_count} GPU(s) visible), "
                f"using cuda:{gpu_index}"
            )
        # Report the device that is actually selected, not always device 0.
        logger.info(
            f"CUDA available, using {torch.cuda.get_device_name(gpu_index)}"
        )
        device = f"cuda:{gpu_index}"
    else:
        logger.error("CUDA not available, falling back to CPU")
        device = "cpu"
        # Record the fallback so later steps guarded by `use_cpu` see it.
        use_cpu = 1


# Tokenizer files are read from the same local directory as the weights.
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)

# Load the causal LM in bfloat16, placed on whatever `device` currently names.
model = AutoModelForCausalLM.from_pretrained(
    SAVE_DIR,
    torch_dtype=torch.bfloat16,
    device_map=device,
)
# Single-turn conversation used by every example below.
# NOTE(review): `ic` is not imported in this cell — presumably icecream's `ic`,
# made available by an earlier cell or installed as a builtin; confirm.
messages = [
    # Optional system message, currently disabled:
    # {"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate",},
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]

# tokenize=False renders the chat template to a plain string;
# add_generation_prompt=True appends the opening of the assistant turn.
raw_prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
ic(raw_prompt, type(raw_prompt))

# Passing a list of conversations renders each one, yielding a list of strings.
raw_prompt2 = tokenizer.apply_chat_template(
    [messages, messages], tokenize=False, add_generation_prompt=True
)
ic(raw_prompt2, type(raw_prompt2))

# Token ids of the rendered prompt as a plain list of integers.
# add_special_tokens=False: the rendered prompt already carries the template's
# special tokens, so presumably none should be added again — TODO confirm.
raw_prompt_ids = tokenizer.encode(raw_prompt, add_special_tokens=False)
ic(raw_prompt_ids, type(raw_prompt_ids))
# Same conversation, but templated and tokenized in one call, returned as a
# PyTorch tensor (return_tensors="pt").
tokenized_chat = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
)
# Move the ids to the selected device before they are passed to the model.
tokenized_chat = tokenized_chat.to(device)
ic(tokenized_chat, type(tokenized_chat))
# Decode the first (only) row back to text to inspect the templated prompt.
ic(tokenizer.decode(tokenized_chat[0]))

# Alternative path: tokenize the already-rendered string by calling the
# tokenizer directly. Only inspected here; it is not moved to `device` and is
# not passed to generate() below.
model_inputs = tokenizer(raw_prompt, return_tensors="pt", add_special_tokens=False)
ic(model_inputs)

# Generation is skipped whenever `use_cpu` is truthy.
if not use_cpu:
    outputs = model.generate(tokenized_chat, max_new_tokens=128)
    ic(outputs, type(outputs))
    # Decodes the first returned sequence in full.
    ic(tokenizer.decode(outputs[0]))
print("Done")

[32m2025-06-08 20:49:52.650[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m25[0m - [1mUsing GPU for inference[0m
[32m2025-06-08 20:49:52.652[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mCUDA available, using NVIDIA A100-SXM4-80GB[0m
ic| 2652017993.py:48 in <module>
    raw_prompt: <|im_start|>system
                You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
                <|im_start|>user
                How many helicopters can a human eat in one sitting?<|im_end|>
                <|im_start|>assistant
    type(raw_prompt): <class 'str'>
ic| 2652017993.py:53 in <module>
    raw_prompt2: ['<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nHow many helicopters can a human eat in one sitting?<|im_end|>\n<|im_start|>assistant\n', '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>

Done


In [9]:
# Prefill demo: the conversation ends with a partial assistant message, and
# continue_final_message=True asks the template to leave that turn open so the
# model continues from the opening JSON fragment.
chat = [
    dict(role="user", content="Can you format the answer in JSON?"),
    dict(role="assistant", content='{"name": "'),
]

# Template and tokenize in one step, returning a PyTorch tensor of token ids.
formatted_chat = tokenizer.apply_chat_template(
    chat,
    tokenize=True,
    return_tensors="pt",
    continue_final_message=True,
)
print(formatted_chat)

# Move the ids to `device`, generate, and print the first decoded sequence.
formatted_chat = formatted_chat.to(device)
outputs = model.generate(formatted_chat, max_new_tokens=800)
print(tokenizer.decode(outputs[0]))

tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,   6713,    498,   3561,
            279,   4226,    304,   4718,     30, 151645,    198, 151644,  77091,
            198,   4913,    606,    788,    330]])
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Can you format the answer in JSON?<|im_end|>
<|im_start|>assistant
{"name": "formatting_answer", "input": {"text": "Yes"}, "output": {"answer": "Yes"} }<|im_end|>
