In [None]:
from peft import AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

### Inference from a Pre-trained or Fine-tuned Model

In [None]:
# Identifier of the base model; passed to `from_pretrained` when no adapter path is set.
model_path_or_id = "NousResearch/Llama-2-7b-hf"
# Path to a fine-tuned PEFT adapter checkpoint. When left as None (or any falsy value),
# the loading cell below falls back to the plain base model given above.
lora_path = None

In [None]:
# Loading options shared by both paths: fp16 tensors, 4-bit loading, and
# the low-CPU-memory loading mode.
load_options = dict(
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)

# When an adapter path is configured, load the base LLM together with its PEFT
# adapter (and the tokenizer stored at the adapter path); otherwise load the
# plain base model and its tokenizer.
checkpoint = lora_path if lora_path else model_path_or_id
model_cls = AutoPeftModelForCausalLM if lora_path else AutoModelForCausalLM

model = model_cls.from_pretrained(checkpoint, **load_options)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Prepare the input for tokenization: wrap the instruction in the prompt template.
PROMPT_TEMPLATE = """### Instruction:
Use the following Input and come up with a structured response.

### Input:
{instruction}

### Response:
"""
instruction = "Tell me all of the moon phases."

# Format the prompt once so tokenization and reporting use the exact same text.
prompt = PROMPT_TEMPLATE.format(instruction=instruction)

# Tokenize the input and move the tensors to whichever device the model is on,
# instead of assuming a CUDA device with `.cuda()`.
encoded = tokenizer(
    prompt,
    return_tensors="pt",
    truncation=True,
).to(model.device)
input_ids = encoded["input_ids"]

# Generate new tokens based on the prompt, up to max_new_tokens.
# Sample according to the top_p / temperature parameters.
with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids,
        # Pass the mask explicitly so generate() does not have to guess it.
        attention_mask=encoded["attention_mask"],
        max_new_tokens=100,
        do_sample=True,
        top_p=0.9,
        temperature=0.9,
        use_cache=True,
    )

# For a causal LM, generate() returns the prompt tokens followed by the new tokens.
# Drop the prompt by slicing token IDs rather than slicing the decoded string by
# len(prompt): decoded text is not guaranteed to reproduce the prompt
# character-for-character, so a character offset can cut the response incorrectly.
generated_ids = outputs[0, input_ids.shape[1]:]
response = tokenizer.decode(generated_ids, skip_special_tokens=True)

print(f"Prompt:\n{instruction}\n")
print(f"Generated Response:\n{response}")