In [4]:
import os
import platform

import torch
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer


# Model identifier in "<org>/<name>" form; it is reused verbatim as the tail of the
# local checkpoint directory below.
REPO_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# Local checkpoint directory. Windows hosts read from the D: drive; every other
# platform (the notebook was written with Linux in mind) reads from /data/model.
SAVE_DIR = (
    f"D:/models/{REPO_ID}"
    if platform.system() == "Windows"
    else f"/data/model/{REPO_ID}"
)

# Device that holds the model and every input tensor. The notebook targets the second
# GPU ('cuda:1'); fall back to the first GPU and then to the CPU so the cell still runs
# on hosts that do not expose two CUDA devices instead of failing at load time.
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    device = 'cuda:1'
elif torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Load the checkpoint from the local directory in bfloat16, placed directly on `device`.
model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map=device, torch_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)
messages = [
    {"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate",},
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]

# return_dict=True returns input_ids together with the matching attention_mask, so
# generate() receives an explicit mask instead of having to infer one from pad tokens.
# add_generation_prompt=True appends the opening of the assistant turn to the prompt.
chat_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(device)

# Keep `tokenized_chat` as the plain input_ids tensor (already on `device`).
tokenized_chat = chat_inputs["input_ids"]
print(tokenizer.decode(tokenized_chat[0]))

# outputs[0] holds the prompt tokens followed by up to 128 newly generated tokens.
outputs = model.generate(**chat_inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0]))

# Batch-tokenize strings of different lengths (including the empty string) to inspect
# how padding fills the shorter rows.
# The batch has exactly three entries. A further '' literal placed after 'help me'
# without a comma would be merged into it by implicit string concatenation rather than
# adding a fourth row, so it is not written here.
ss = ['', 'help', 'help me']
r = tokenizer(
    ss,
    padding='longest',  # pad every row up to the longest sequence in the batch
    return_tensors='pt',
    add_special_tokens=False,  # Prevents adding special tokens
)['input_ids']
print(f'{r = }')

<|im_start|>system
You are a friendly chatbot who always responds in the style of a pirate<|im_end|>
<|im_start|>user
How many helicopters can a human eat in one sitting?<|im_end|>
<|im_start|>assistant

<|im_start|>system
You are a friendly chatbot who always responds in the style of a pirate<|im_end|>
<|im_start|>user
How many helicopters can a human eat in one sitting?<|im_end|>
<|im_start|>assistant
A human cannot eat more than 3 helicopters at once.<|im_end|>
r = tensor([[151643, 151643],
        [  8653, 151643],
        [  8653,    752]])


In [9]:
# Prefill demo: the conversation ends with a partial assistant message so the model
# continues the JSON text instead of starting a fresh reply.
chat = [
    {"role": "user", "content": "Can you format the answer in JSON?"},
    {"role": "assistant", "content": '{"name": "'},
]

# continue_final_message=True leaves the last (assistant) turn open-ended.
# return_dict=True also returns the attention_mask so generate() gets an explicit mask
# rather than inferring one from the input ids.
prefill_inputs = tokenizer.apply_chat_template(
    chat,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
    continue_final_message=True,
)

# Print the token ids before moving them, so the tensor repr carries no device suffix.
formatted_chat = prefill_inputs["input_ids"]
print(formatted_chat)

prefill_inputs = prefill_inputs.to(device)
formatted_chat = prefill_inputs["input_ids"]

# outputs[0] holds the prompt tokens followed by up to 800 newly generated tokens.
outputs = model.generate(**prefill_inputs, max_new_tokens=800)
print(tokenizer.decode(outputs[0]))

tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,   6713,    498,   3561,
            279,   4226,    304,   4718,     30, 151645,    198, 151644,  77091,
            198,   4913,    606,    788,    330]])
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Can you format the answer in JSON?<|im_end|>
<|im_start|>assistant
{"name": "formatting_answer", "input": {"text": "Yes"}, "output": {"answer": "Yes"} }<|im_end|>
