In [None]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

### Using a QLoRA Finetuned Model For Inference

A QLoRA adapter can be loaded along with its base model by using the `AutoPeftModelForCausalLM` class. Once loaded this way, the model can be used like any other Hugging Face LLM.

In [None]:
# Location of the finetuned QLoRA adapter; the tokenizer is read from the same place.
lora_path = "llama-7-int4-dolly"

# The tokenizer and the model are loaded independently of each other.
tokenizer = AutoTokenizer.from_pretrained(lora_path)

# AutoPeftModelForCausalLM loads the adapter along with its base model.
# The model is requested in 4-bit with float16 as the torch dtype, and with
# reduced CPU memory usage while loading.
model = AutoPeftModelForCausalLM.from_pretrained(
    lora_path,
    load_in_4bit=True,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)


### Text Generation using the Finetuned Model

Oftentimes, a finetuned model expects a specific prompt format in order to behave the way it was trained to.

In this example, we used the instruction finetuning prompt format with `### Input:` and `### Response:` tags, where our instruction will follow the input tag, and we want the model to respond after the response tag.

In [None]:
def format_prompt(instruction):
    """Wrap ``instruction`` in the prompt template used for the finetuned model.

    The prompt is assembled from explicit lines rather than an indented
    triple-quoted string, so that the function body's indentation does not
    leak into the prompt: every ``###`` tag starts at the beginning of its
    line and the prompt ends with a single newline after ``### Response:``.

    Args:
        instruction: The text to place under the ``### Input:`` tag. It is
            interpolated with f-string formatting, so non-string values are
            converted with ``str()``.

    Returns:
        The complete prompt string, ending right after the response tag so
        the model's continuation is the response.
    """
    prompt_lines = [
        "### Instruction:",
        "Use the following Input and come up with a structured response.",
        "",
        "### Input:",
        f"{instruction}",
        "",
        "### Response:",
        "",  # trailing empty entry -> prompt ends with a newline after the tag
    ]
    return "\n".join(prompt_lines)

To feed in an instruction to our model, we will:

1. Format the instruction in the expected prompt format
2. Encode the prompt into tokens using the tokenizer
3. Call `generate` with the tokenized prompt and pass in any generation parameters we want
4. Decode the output using the tokenizer

In [None]:
instruction = "Red, Green, Blue"
prompted_instruction = format_prompt(instruction)

# Tokenize the prompt and keep the whole encoding (ids and attention mask),
# moving it to whichever device the model lives on instead of assuming CUDA.
encoded = tokenizer(
    prompted_instruction,
    return_tensors="pt",
    truncation=True,
).to(model.device)
input_ids = encoded.input_ids

with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=encoded.attention_mask,  # explicit mask instead of letting generate infer it
        max_new_tokens=100,
        do_sample=True,
        top_p=0.9,
        temperature=0.9,
    )

# The returned sequence starts with the prompt tokens. Drop them at the token
# level rather than slicing the decoded string by len(prompted_instruction):
# decoding is not guaranteed to reproduce the prompt character-for-character.
generated_ids = outputs[0, input_ids.shape[1]:]
generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

print(f"Prompt:\n{instruction}\n")
print(f"Generated instruction:\n{generated_text}")