In [None]:
# !pip install transformers peft accelerate

In [1]:
import os
import os.path as osp
import sys
import json

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from typing import Union

import torch
import transformers

import accelerate
import bitsandbytes

from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/envs/Python-3.9-CUDA/lib/python3.9/site-packages/bitsandbytes-0.40.0-py3.9.egg/bitsandbytes/libbitsandbytes_cuda112.so
CUDA SETUP: CUDA runtime path found: /opt/conda/envs/Python-3.9-CUDA/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 112
CUDA SETUP: Loading binary /opt/conda/envs/Python-3.9-CUDA/lib/python3.9/site-packages/bitsandbytes-0.40.0-py3.9.egg/bitsandbytes/libbitsandbytes_cuda112.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [2]:
class Prompter(object):
    __slots__ = ("template", "_verbose")

    def __init__(self, template_name: str = "", verbose: bool = False):
        self._verbose = verbose
        if not template_name:
            # Enforce the default here, so the constructor can be called with '' and will not break.
            template_name = "alpaca"
        file_name = osp.join("./templates", f"{template_name}.json")
        if not osp.exists(file_name):
            raise ValueError(f"Can't read {file_name}")
        with open(file_name) as fp:
            self.template = json.load(fp)
        if self._verbose:
            print(
                f"Using prompt template {template_name}: {self.template['description']}"
            )

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
            res = self.template["prompt_input"].format(
                instruction=instruction, input=input
            )
        else:
            res = self.template["prompt_no_input"].format(
                instruction=instruction
            )
        if label:
            res = f"{res}{label}"
        if self._verbose:
            print(res)
        return res

    def get_response(self, output: str) -> str:
        return output.split(self.template["response_split"])[1].strip()

In [3]:
cache_directory = "/mnts/llm3"

# params
base_model = "huggyllama/llama-65b"
# base_model = "huggyllama/llama-30b"
device = "cuda:0"
lora_weights = "chansung/alpaca-lora-65b"
# lora_weights = "baseten/alpaca-30b"

prompter = Prompter("")

tokenizer = LlamaTokenizer.from_pretrained(base_model)

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


In [4]:
# load_8bit = True
# to run this with load_8bit=True, you need to run the cp commands from the below two issues:
# https://github.com/tloen/alpaca-lora/issues/46
# https://github.com/tloen/alpaca-lora/issues/294

load_in_4bit = True

bnb_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = LlamaForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    cache_dir=cache_directory,
)
model = PeftModel.from_pretrained(
    model,
    lora_weights,
    # torch_dtype=torch.float16,
    # cache_dir=cache_directory,
)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

In [5]:
import sys

model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
model.config.bos_token_id = 1
model.config.eos_token_id = 2

model.eval()

if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)

In [6]:
model

OptimizedModule(
  (_orig_mod): PeftModelForCausalLM(
    (base_model): LoraModel(
      (model): LlamaForCausalLM(
        (model): LlamaModel(
          (embed_tokens): Embedding(32000, 8192, padding_idx=0)
          (layers): ModuleList(
            (0-79): 80 x LlamaDecoderLayer(
              (self_attn): LlamaAttention(
                (q_proj): Linear(
                  in_features=8192, out_features=8192, bias=False
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=8192, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=8192, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                )
                (k_proj): Line

In [7]:
with torch.autocast("cuda"): # fix from https://github.com/tloen/alpaca-lora/issues/203
    instruction = "What are the signs of pneumonia on a chest X-ray?"
    prompt = prompter.generate_prompt(instruction, None)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    print(input_ids)

    # eval config
    temperature = 0.1
    top_p = 0.75
    top_k = 40
    num_beams = 4
    max_new_tokens = 128

    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
    )
    print(generation_config)

    # generate_params = {
    #     "input_ids": input_ids,
    #     "generation_config": generation_config,
    #     "return_dict_in_generate": True,
    #     "output_scores": True,
    #     "max_new_tokens": max_new_tokens,
    # }

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    print(output)

tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889,
         14350,   263,  2933,   393,  7128,  2486,  1614,  2167,   278,  2009,
         29889,    13,    13,  2277, 29937,  2799,  4080, 29901,    13,  5618,
           526,   278, 18906,   310,   282, 29765,  6405,   373,   263,   521,
           342,  1060, 29899,   764, 29973,    13,    13,  2277, 29937, 13291,
         29901,    13]], device='cuda:0')
GenerationConfig {
  "num_beams": 4,
  "temperature": 0.1,
  "top_k": 40,
  "top_p": 0.75,
  "transformers_version": "4.30.2"
}



RuntimeError: mat1 and mat2 shapes cannot be multiplied (208x8192 and 1x33554432)