In [1]:
# !git clone https://github.com/tloen/alpaca-lora.git

Cloning into 'alpaca-lora'...
remote: Enumerating objects: 607, done.
remote: Total 607 (delta 0), reused 0 (delta 0), pack-reused 607
Receiving objects: 100% (607/607), 27.84 MiB | 8.89 MiB/s, done.
Resolving deltas: 100% (357/357), done.


In [None]:
import torch
from peft import PeftModel
import transformers
import textwrap
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
from transformers.generation.utils import GreedySearchDecoderOnlyOutput
 
# Use the GPU when PyTorch reports CUDA support; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEVICE

In [3]:
# Hub id of the base checkpoint. Tokenizer and model share this single constant
# so the two can never be loaded from different checkpoints by accident.
BASE_MODEL = "decapoda-research/llama-7b-hf"

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

# Load the base weights in 8-bit and let `device_map="auto"` decide placement.
model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=True,
    device_map="auto",
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 33/33 [00:06<00:00,  5.14it/s]


In [4]:
# Attach the LoRA adapter weights to the base model.
# The guard keeps this cell idempotent: without it, re-running the cell would
# wrap a model that is already a PeftModel a second time.
if not isinstance(model, PeftModel):
    model = PeftModel.from_pretrained(
        model,
        "tloen/alpaca-lora-7b",
        torch_dtype=torch.float16,
    )

In [6]:
# Special-token ids. Padding reuses id 0 (unk) and is set on both the model
# config and the tokenizer so the two agree.
model.config.pad_token_id = 0  # unk
tokenizer.pad_token_id = 0  # unk
model.config.bos_token_id = 1
model.config.eos_token_id = 2

# The notebook only runs inference, so switch the model to evaluation mode.
model = model.eval()
# model = torch.compile(model)

In [7]:
# Instruction prompt skeleton; "[INSTRUCTION]" is a literal placeholder that is
# swapped out with str.replace. This is deliberately a plain string, not an
# f-string: it has no replacement fields, and any brace must stay literal.
PROMPT_TEMPLATE = """
Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
### Instruction:
[INSTRUCTION]
 
### Response:
"""

In [8]:
def create_prompt(instruction: str) -> str:
    """Build the full model prompt for a single instruction.

    Args:
        instruction: Text substituted for the ``[INSTRUCTION]`` placeholder
            in ``PROMPT_TEMPLATE``.

    Returns:
        ``PROMPT_TEMPLATE`` with every occurrence of the placeholder replaced
        by ``instruction``.
    """
    placeholder = "[INSTRUCTION]"
    filled_prompt = PROMPT_TEMPLATE.replace(placeholder, instruction)
    return filled_prompt


# Preview the rendered prompt for a sample instruction.
print(create_prompt("What is the meaning of life?"))


Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
### Instruction:
What is the meaning of life?
 
### Response:



In [9]:
def generate_response(
    prompt: str,
    model: PeftModel,
    max_new_tokens: int = 256,
) -> GreedySearchDecoderOnlyOutput:
    """Run the model on an already-formatted prompt and return the raw output.

    Args:
        prompt: Full prompt text (for example the result of ``create_prompt``).
        model: Model whose ``generate`` method is invoked.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt. Defaults to 256, the value that used to be hard-coded.

    Returns:
        The object returned by ``model.generate`` when called with
        ``return_dict_in_generate=True`` and ``output_scores=True``.
    """
    encoding = tokenizer(prompt, return_tensors="pt")
    generate_kwargs = {"input_ids": encoding["input_ids"].to(DEVICE)}

    # Forward the tokenizer's attention mask when it provides one, so that
    # `generate` does not have to infer it from the pad token id (which this
    # notebook sets to 0, the same id as unk).
    attention_mask = encoding.get("attention_mask")
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask.to(DEVICE)

    # NOTE(review): `do_sample` is not enabled here, so per the transformers
    # docs `temperature` and `top_p` presumably have no effect (decoding stays
    # greedy, matching the return annotation) - confirm whether sampling was
    # intended before changing either value.
    generation_config = GenerationConfig(
        temperature=0.1,
        top_p=0.75,
        repetition_penalty=1.1,
    )

    # inference_mode disables autograd tracking for the forward passes.
    with torch.inference_mode():
        return model.generate(
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
            **generate_kwargs,
        )

In [10]:
def format_response(response: GreedySearchDecoderOnlyOutput) -> str:
    """Turn a raw generation output into wrapped, human-readable answer text.

    Args:
        response: Generation output; only ``response.sequences[0]`` is read.

    Returns:
        The stripped text that follows the first ``### Response:`` marker in
        the decoded sequence (up to the next marker, if any), re-wrapped with
        ``textwrap``'s default line width.
    """
    full_text = tokenizer.decode(response.sequences[0])
    # The decoded sequence contains the prompt as well; index 1 of the split
    # is the segment right after the first "### Response:" marker.
    answer = full_text.split("### Response:")[1].strip()
    return textwrap.fill(answer)

In [11]:
def ask_alpaca(prompt: str, model: PeftModel = model) -> None:
    """Ask the model a single instruction and print the formatted answer.

    Args:
        prompt: The bare instruction; it is wrapped by ``create_prompt``
            before generation.
        model: Model used for generation. The default is bound to the value
            of the global ``model`` at the time this function is defined, so
            re-run this cell after reloading the model.

    Returns:
        None. The formatted answer is printed, not returned.
    """
    full_prompt = create_prompt(prompt)
    generation = generate_response(full_prompt, model)
    print(format_response(generation))

In [12]:
# Example query; ask_alpaca prints the formatted answer rather than returning it.
ask_alpaca("What is the meaning of life?")

The meaning of life is to find joy and purpose in every moment,
regardless of external circumstances. It is to live with intention and
make the most out of each day.
