In [1]:
!pip install torch transformers==4.34.0 accelerate==0.23.0 bitsandbytes==0.41.1



In [2]:
import os
import os.path as osp
import sys
import json

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from typing import Union

import torch
import transformers

import accelerate
import bitsandbytes

from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

In [3]:
# The import cell restricts CUDA_VISIBLE_DEVICES to '0', so exactly one GPU
# should be visible; fail early with a readable message otherwise.
assert torch.cuda.device_count() == 1, (
    f"Expected exactly 1 visible CUDA device, found {torch.cuda.device_count()}"
)

In [4]:
class Prompter(object):
    """Build prompts from a JSON template and pull the response out of model output.

    The template is a JSON object loaded from ``./templates/<template_name>.json``.
    This class reads the keys ``prompt_input``, ``prompt_no_input`` and
    ``response_split`` from it, plus ``description`` when ``verbose`` is enabled.
    """

    __slots__ = ("template", "_verbose")

    def __init__(self, template_name: str = "", verbose: bool = False):
        """Load the named prompt template.

        Args:
            template_name: Base name (without ``.json``) of a file inside
                ``./templates``. An empty value selects ``"alpaca"``.
            verbose: When true, print the chosen template's description here
                and every prompt produced by ``generate_prompt``.

        Raises:
            ValueError: If the template file does not exist.
        """
        self._verbose = verbose
        if not template_name:
            # Enforce the default here, so the constructor can be called with '' and will not break.
            template_name = "alpaca"
        file_name = osp.join("./templates", f"{template_name}.json")
        if not osp.exists(file_name):
            raise ValueError(f"Can't read {file_name}")
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default text encoding.
        with open(file_name, encoding="utf-8") as fp:
            self.template = json.load(fp)
        if self._verbose:
            print(
                f"Using prompt template {template_name}: {self.template['description']}"
            )

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        """Return the full prompt for an instruction and optional input.

        Args:
            instruction: Text substituted for ``{instruction}`` in the template.
            input: Optional context substituted for ``{input}``. A falsy value
                (``None`` or ``""``) selects the ``prompt_no_input`` template.
            label: Optional response/output text; when truthy it is appended
                directly after the formatted prompt.

        Returns:
            The formatted prompt string (also printed when verbose).
        """
        if input:
            res = self.template["prompt_input"].format(
                instruction=instruction, input=input
            )
        else:
            res = self.template["prompt_no_input"].format(
                instruction=instruction
            )
        if label:
            res = f"{res}{label}"
        if self._verbose:
            print(res)
        return res

    def get_response(self, output: str) -> str:
        """Return the text that follows the template's response marker.

        Only the first occurrence of the marker is used as the boundary, so a
        response that itself contains the marker text is returned in full
        instead of being cut off at the second occurrence.

        Args:
            output: Decoded model output containing the ``response_split`` marker.

        Returns:
            Everything after the first marker, stripped of surrounding whitespace.

        Raises:
            IndexError: If the marker does not occur in ``output``.
        """
        marker = self.template["response_split"]
        _, found, response = output.partition(marker)
        if not found:
            # Same exception type the previous `split(...)[1]` raised, but with
            # a message that explains what went wrong.
            raise IndexError(
                f"Response marker {marker!r} not found in model output"
            )
        return response.strip()

In [5]:
# --- Model configuration -----------------------------------------------------
base_model = "starmpcc/Asclepius-13B"
cache_directory = "/mnts/llm3"
load_in_8bit = True

# --- Prompt template and tokenizer -------------------------------------------
# An empty template name lets Prompter pick its default template.
prompter = Prompter("")
tokenizer = LlamaTokenizer.from_pretrained(base_model)

# --- Model weights -----------------------------------------------------------
model_load_kwargs = dict(
    load_in_8bit=load_in_8bit,
    torch_dtype=torch.float16,
    device_map="auto",
    cache_dir=cache_directory,
)
model = LlamaForCausalLM.from_pretrained(base_model, **model_load_kwargs)

# Only cast to half precision when the weights were not loaded in 8-bit.
if not load_in_8bit:
    model.half()

# Switch to inference mode.
model.eval()

# Compile the model on torch 2+ everywhere except Windows.
should_compile = sys.platform != "win32" and torch.__version__ >= "2"
if should_compile:
    model = torch.compile(model)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
# import sys

# model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
# model.config.bos_token_id = 1
# model.config.eos_token_id = 2

# model.half()

# model.eval()

# if torch.__version__ >= "2" and sys.platform != "win32":
#     model = torch.compile(model)

In [None]:
# !pip install pynvml

In [6]:
# Release cached, unused GPU memory before generating.
torch.cuda.empty_cache()

with torch.autocast("cuda"):
    # Build the prompt from the template (instruction only, no extra input).
    instruction = "What are the signs of pneumonia on a chest X-ray?"
    prompt = prompter.generate_prompt(instruction, None)

    # Tokenize and move both tensors to the GPU. The attention mask is passed
    # to generate() explicitly rather than being dropped, so generation does
    # not have to guess which positions are real tokens.
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to("cuda")
    attention_mask = inputs["attention_mask"].to("cuda")

    # eval config
    temperature = 0.2
    top_p = 0.75
    top_k = 40
    num_beams = 1
    max_new_tokens = 512

    generation_config = GenerationConfig(
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    # Decode the first (only) generated sequence, prompt and special tokens included.
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    print(output)

<s> Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What are the signs of pneumonia on a chest X-ray?

### Response:
The signs of pneumonia on a chest X-ray include a consolidated area in the lung, which appears as a white or cloudy patch. The edges of the patch may be sharp or rounded, and it may be surrounded by a clear or hazy border. The size of the patch can range from a few millimeters to several centimeters. In addition, the presence of a pneumothorax or collapsed lung may also be visible on a chest X-ray.</s><s>
