In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.generation.utils import GenerationConfig
from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


[2023-07-31 20:34:07,049] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Filesystem locations used by the later cells: the base checkpoint directory
# and a LoRA adapter path (left empty here).
model_path, lora_path = "/data/lzj/LLaMA-Efficient-Tuning/llama-2-70b-chat-hf", ""

In [3]:
# Load the tokenizer that ships with the checkpoint directory. The non-fast
# implementation is requested explicitly, and code bundled with the checkpoint
# is allowed to run (trust_remote_code).
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
    use_fast=False,
)

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [4]:
# Load the causal LM from the checkpoint directory with 4-bit bitsandbytes
# quantization, fp16 compute, and automatic device placement.
# NOTE(review): the llm_int8_* options look like 8-bit settings; confirm they
# are still wanted alongside load_in_4bit=True.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
    ),
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

Loading checkpoint shards: 100%|██████████| 15/15 [02:05<00:00,  8.34s/it]


In [5]:
# Start from the generation defaults stored with the checkpoint, then override
# the settings used for this experiment.
generation_config = GenerationConfig.from_pretrained(model_path)
generation_config.max_new_tokens = 128  # cap on newly generated tokens

# temperature / top_k / top_p are sampling parameters: they are only applied
# when do_sample is True. Enable it explicitly instead of relying on whatever
# the checkpoint's generation_config.json happens to contain; otherwise the
# three overrides below can be silently ignored in favour of greedy decoding.
generation_config.do_sample = True
generation_config.temperature = 0.7
generation_config.top_k = 50
generation_config.top_p = 0.95

In [None]:
# Wrap the already-loaded base model with the PEFT weights stored in the
# directory below (presumably the SFT LoRA adapter — confirm). `model` is
# rebound to the wrapped model, so later cells generate through the adapter.
model = PeftModel.from_pretrained(
    model,
    "/data/lzj/LLaMA-Efficient-Tuning/modeltest0731_sft",
)

In [9]:
# Prompt text in the "<s>Human: ... </s><s>Assistant: " layout. It is written
# as adjacent string literals so that every newline and trailing space is
# explicit (note the space after "Human:" and after "Assistant:").
message = (
    "<s>Human: \n"
    "We now have the following financial data:\n"
    "\n"
    "moneyflow: Individual stock fund flow\n"
    "trade_daily_data: Daily line chart data\n"
    "income: Financial Income statement data\n"
    "balancesheet: balance sheet\n"
    "\n"
    "What data do you think selecting from above can help us intuitively analyze this user question? When selecting data, please consider the requirements in the user question: 我想要查看平安银行的债务情况。\n"
    "\n"
    "Your answer should meet the following requirements:\n"
    '1. Strictly output in the format ["Your Choice A", ...]\n'
    "2. You only need to reply according to the output format without giving any additional information.\n"
    "</s><s>Assistant: "
)
# Encode the prompt as PyTorch tensors. add_special_tokens=False keeps the
# tokenizer from adding its own special tokens (presumably because the prompt
# already spells out the <s>/</s> markers itself — confirm).
inputs = tokenizer(
    message,
    add_special_tokens=False,
    return_tensors="pt",
)

In [10]:
# Move every tensor in the encoding to the GPU, not just input_ids. Moving
# only input_ids leaves the other tensors the tokenizer returned (e.g. the
# attention mask) on the CPU, so generate() would receive inputs spread
# across devices. BatchEncoding.to() moves all tensors and returns the same
# object, so re-running this cell is harmless.
inputs = inputs.to("cuda")

In [11]:
# Run generation on the encoded prompt using the settings configured in
# generation_config; the tokenizer outputs are forwarded as keyword arguments.
response = model.generate(
    generation_config=generation_config,
    **inputs,
)

In [12]:
# Decode the first returned sequence back to text and show it as the cell
# result. Special tokens are kept in the decoded string.
tokenizer.decode(response[0], skip_special_tokens=False)

'<s> Human: \nWe now have the following financial data:\n\nmoneyflow: Individual stock fund flow\ntrade_daily_data: Daily line chart data\nincome: Financial Income statement data\nbalancesheet: balance sheet\n\nWhat data do you think selecting from above can help us intuitively analyze this user question? When selecting data, please consider the requirements in the user question: 我想要查看平安银行的债务情况。\n\nYour answer should meet the following requirements:\n1. Strictly output in the format ["Your Choice A", ...]\n2. You only need to reply according to the output format without giving any additional information.\n</s><s> Assistant: \n["trade_daily_data", "balancesheet"]</s>'