In [None]:
# !pip install torch transformers peft accelerate

In [1]:
import os
import os.path as osp
import sys
import json

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from typing import Union

import torch
import transformers

import accelerate
import bitsandbytes

from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/envs/Python-3.10-CUDA/lib/python3.10/site-packages/bitsandbytes-0.39.1-py3.10.egg/bitsandbytes/libbitsandbytes_cuda114.so
CUDA SETUP: CUDA runtime path found: /opt/conda/envs/Python-3.10-CUDA/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 114
CUDA SETUP: Loading binary /opt/conda/envs/Python-3.10-CUDA/lib/python3.10/site-packages/bitsandbytes-0.39.1-py3.10.egg/bitsandbytes/libbitsandbytes_cuda114.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [3]:
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

In [9]:
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  32982 MiB |  33102 MiB | 160531 MiB | 127548 MiB |
|       from large pool |  32525 MiB |  32751 MiB | 159912 MiB | 127387 MiB |
|       from small pool |    457 MiB |    577 MiB |    618 MiB |    160 MiB |
|---------------------------------------------------------------------------|
| Active memory         |  32982 MiB |  33102 MiB | 160531 MiB | 127548 MiB |
|       from large pool |  32525 MiB |  32751 MiB | 159912 MiB | 127387 MiB |
|       from small pool |    457 MiB |    577 MiB |    618 MiB |    160 MiB |
|---------------------------------------------------------------

In [5]:
class Prompter(object):
    __slots__ = ("template", "_verbose")

    def __init__(self, template_name: str = "", verbose: bool = False):
        self._verbose = verbose
        if not template_name:
            # Enforce the default here, so the constructor can be called with '' and will not break.
            template_name = "alpaca"
        file_name = osp.join("./templates", f"{template_name}.json")
        if not osp.exists(file_name):
            raise ValueError(f"Can't read {file_name}")
        with open(file_name) as fp:
            self.template = json.load(fp)
        if self._verbose:
            print(
                f"Using prompt template {template_name}: {self.template['description']}"
            )

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
            res = self.template["prompt_input"].format(
                instruction=instruction, input=input
            )
        else:
            res = self.template["prompt_no_input"].format(
                instruction=instruction
            )
        if label:
            res = f"{res}{label}"
        if self._verbose:
            print(res)
        return res

    def get_response(self, output: str) -> str:
        return output.split(self.template["response_split"])[1].strip()

In [6]:
device = "cuda:0"
base_model = "huggyllama/llama-65b"
lora_weights = "chansung/alpaca-lora-65b"
cache_directory = "/mnts/llm3"

prompter = Prompter("")

tokenizer = LlamaTokenizer.from_pretrained(base_model)

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


In [7]:
# to run this with load_8bit=True, you need to run the cp commands from the below two issues:
# https://github.com/tloen/alpaca-lora/issues/46
# https://github.com/tloen/alpaca-lora/issues/294

# model = LlamaForCausalLM.from_pretrained(
#         base_model,
#         load_in_8bit=True,
#         torch_dtype=torch.float16,
#         device_map="auto",
#         cache_dir=cache_directory,
#     )
# model = PeftModel.from_pretrained(model, lora_weights)

# Downgrade to accelerate==0.19.0
# https://github.com/artidoro/qlora/issues/151#issuecomment-1629989108

load_in_4bit = True
bnb_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = LlamaForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    cache_dir=cache_directory,
)
model = PeftModel.from_pretrained(
    model,
    lora_weights,
)

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

In [8]:
import sys

model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
model.config.bos_token_id = 1
model.config.eos_token_id = 2

model.half()

model.eval()

if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)

In [10]:
model.device

device(type='cuda', index=0)

In [None]:
!pip install pynvml

In [None]:
torch.cuda.empty_cache()
torch.cuda.get_device_properties("cuda:0")

In [11]:
with torch.autocast("cuda"): # fix from https://github.com/tloen/alpaca-lora/issues/203
    instruction = "What are the signs of pneumonia on a chest X-ray?"
    prompt = prompter.generate_prompt(instruction, None)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    print(input_ids)

    # eval config
    temperature = 0.1
    top_p = 0.75
    top_k = 40
    num_beams = 1
    max_new_tokens = 128

    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
    )
    print(generation_config)

    # generate_params = {
    #     "input_ids": input_ids,
    #     "generation_config": generation_config,
    #     "return_dict_in_generate": True,
    #     "output_scores": True,
    #     "max_new_tokens": max_new_tokens,
    # }

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    print(output)

tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889,
         14350,   263,  2933,   393,  7128,  2486,  1614,  2167,   278,  2009,
         29889,    13,    13,  2277, 29937,  2799,  4080, 29901,    13,  5618,
           526,   278, 18906,   310,   282, 29765,  6405,   373,   263,   521,
           342,  1060, 29899,   764, 29973,    13,    13,  2277, 29937, 13291,
         29901,    13]], device='cuda:0')
GenerationConfig {
  "temperature": 0.1,
  "top_k": 40,
  "top_p": 0.75,
  "transformers_version": "4.32.0.dev0"
}

<s> Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What are the signs of pneumonia on a chest X-ray?

### Response:
Signs of pneumonia on a chest X-ray include consolidation, which is a white or gray area in the lungs, and air bronchograms, which are air-filled bronchi that appear as dark lines on the X-ray. Other signs include pleural effusion, which is fluid in the 