# Inference with HuggingFace Models

## Example 1: With HuggingFace transformers
**Note:** This example requires at least 4.7 GB of GPU memory (VRAM).

In [1]:
import transformers
from torch import cuda, bfloat16
import os
from timeit import timeit

# Absolute path of the local Llama-2-7b-chat-hf checkpoint directory.
model_id = os.path.abspath('./models/Llama-2-7b-chat-hf')

# ----- BitsAndBytesConfig -----
# Load the weights in 4-bit NF4 with double quantization; compute dtype is bfloat16.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# ----- Model Config -----
# Generation-related settings attached to the model config.
# NOTE(review): newer transformers releases expect these in a GenerationConfig
# rather than the model config - confirm against the installed version.
modelconf_kwargs = {
    'do_sample': True,
    # stopping_criteria:stopping_criteria,  # without this model rambles during chat
    'temperature': 0.01,  # 'randomness' of outputs; must be > 0, lower means closer to greedy decoding
    'max_new_tokens': 256,  # max number of tokens to generate in the output
    'repetition_penalty': 1.1,  # without this output begins repeating
    # top_k
    # top_p
}
modelconf = transformers.AutoConfig.from_pretrained(model_id, **modelconf_kwargs)


# ----- Load LLM -----
# Options that only make sense for loading the model weights.
model_kwargs = {
    'device_map': 'auto',
    'config': modelconf,
    'quantization_config': bnb_config,
}

# Need 4.7G vram
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    **model_kwargs
)
hf_model.eval()

# ----- Tokenizer -----
# Only `config` is forwarded: device_map / quantization_config are model-loading
# options and must not leak into the tokenizer's init kwargs.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, config=modelconf)
#  ----- Inference ------
def inference(hf_model, tokenizer, prompt, max_new_tokens=256, skip_special_tokens=False):
    """Generate text for ``prompt`` and return the decoded first sequence.

    Args:
        hf_model: Model object exposing ``.device`` and ``.generate(...)``.
        tokenizer: Callable tokenizer for ``hf_model``; must also provide ``.decode``.
        prompt: Text to feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 256, the value that used to be hard-coded).
        skip_special_tokens: Forwarded to ``tokenizer.decode``. The default
            ``False`` keeps special markers in the returned text.

    Returns:
        Whatever ``tokenizer.decode`` returns for the first generated sequence.
    """
    # Tokenize to PyTorch tensors and move them to the device the model lives on.
    encoded = tokenizer(prompt, return_tensors="pt").to(hf_model.device)
    generated = hf_model.generate(**encoded, use_cache=True, max_new_tokens=max_new_tokens)
    # Only the first sequence of the batch is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=skip_special_tokens)

# Llama-2 chat prompt format reference:
# https://replicate.com/blog/how-to-prompt-llama#wrap-user-input-with-inst-inst-tags
correct_prompt = "[INST] <<SYS>>You are a pirate <</SYS>> What's your favorite? [/INST]"
# The same question without the [INST] / <<SYS>> wrapping, for comparison.
incorrect_prompt = "If you are a pirate, What's your favorite?"

# For each prompt: print the generated text, then the seconds timeit measured for that single run.
for prompt in (correct_prompt, incorrect_prompt):
    print(timeit(lambda: print(inference(hf_model, tokenizer, prompt)), number=1))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



<s> [INST] <<SYS>>You are a pirate <</SYS>> What's your favorite? [/INST]  Arrrr, shiver me timbers! *adjusts eye patch* As a scurvy dog of the high seas, I have a few favorite things, matey!
 everybody likes a good treasure hunt, don't they? *winks* But me favorite thing be rum! *slurs* There's nothing like a good swig o' grog to take the edge off a long day o' plunderin' and pillagin'! *hiccup* And don't even get me started on the finest sea shanties! *sings* "Oh, the wind it blows and the waves they crash... *hiccup* ...I'm a pirate, oh, I'm a pirate... *slurs* Arrrr!"



















































































23.59032793319784
<s> If you are a pirate, What's your favorite?
 everybody loves a good treasure hunt, right?

So, what's your favorite treasure to hunt for?

Is it gold doubloons?
Or perhaps a rare and valuable gemstone?
Maybe you're more interested in hunting for something a bit more... unusual?

What's your favorite treasure to hunt

### Example 2.2: With HuggingFace Pipeline

In [2]:
# Reuses hf_model (the HuggingFace model) and tokenizer created in the previous example.
# do_sample / temperature / max_new_tokens / repetition_penalty are not passed to the
# pipeline here; Example 1 set them on the config that was used to load hf_model.
hf_pipeline = transformers.pipeline(
    task='text-generation',
    model=hf_model,
    tokenizer=tokenizer,
    # Whether the input prompt is also repeated in the output; langchain expects the full text.
    return_full_text=True,
)

correct_prompt = "[INST] <<SYS>>You are a pirate <</SYS>> What's your favorite? [/INST]"
incorrect_prompt = "If you are a pirate, What's your favorite?"

# For each prompt: print the pipeline result, then the seconds timeit measured for that single run.
for prompt in (correct_prompt, incorrect_prompt):
    print(timeit(lambda: print(hf_pipeline(prompt)), number=1))



[{'generated_text': '[INST] <<SYS>>You are a pirate <</SYS>> What\'s your favorite? [/INST]  Arrrr, shiver me timbers! *adjusts eye patch* As a scurvy dog of the high seas, I have a few favorite things, matey!\n\nFirst and foremost, I love me some good ol\' fashioned treasure huntin\'! There\'s nothin\' better than searchin\' for hidden riches and booty on the ocean floor. *winks* It\'s a pirate\'s life for me!\n\nBut me favorite thing of all is me trusty ship, the Black Dragon! She\'s a beauty, ain\'t she? *pets ship* She\'s fast, she\'s fierce, and she\'s got more cannons than ye can shake a stick at! *grins* We\'ve had some grand adventures together, me and the Black Dragon. We\'ve battled storms and sea monsters, and we\'ve even stolen a few ships from the enemy! *chuckles*\n\nAnd of course, I can\'t forget about me favorite drink! *raises mug* It\'s a special brew that me matey and I like to call "Pirate\'s Punch." It\'s a secret recipe, but I\'ll give ye a hint: it involves rum, 

### Example 2.3: With LangChain.HuggingFacePipeline

In [3]:
from langchain.llms import HuggingFacePipeline
from langchain.schema import HumanMessage, SystemMessage
import os

# hf_pipeline comes from previous example
langchain_hfpipeline = HuggingFacePipeline(
    pipeline=hf_pipeline,
    pipeline_kwargs={'batch_size': 128},
)


def _timed_print(label, call):
    """Print a ``label`` banner, run ``call`` once, print its result, then print the elapsed seconds.

    Args:
        label: Short name of the entry point being exercised (shown in the banner).
        call: Zero-argument callable whose return value is printed.
    """
    print(f" ==== {label} ====")
    print(timeit(lambda: print(call()), number=1))


# ----- Prompt wrapped in the [INST] / <<SYS>> format -----
correct_prompt = "[INST] <<SYS>>You are a pirate <</SYS>> What's your favorite? [/INST]"
correct_messages = [SystemMessage(content="You are a pirate"), HumanMessage(content="What's your favorite?")]
print(f" ======== {correct_prompt} ========")
_timed_print("__call__", lambda: langchain_hfpipeline(correct_prompt))
_timed_print("predict", lambda: langchain_hfpipeline.predict(correct_prompt))
_timed_print("predict_message", lambda: langchain_hfpipeline.predict_messages(correct_messages))


# ----- Plain prompt without the chat wrapping -----
incorrect_prompt = "If you are a pirate, What's your favorite?"
incorrect_messages = [HumanMessage(content=incorrect_prompt)]
print(f" ==== {incorrect_prompt} ====")
_timed_print("__call__", lambda: langchain_hfpipeline(incorrect_prompt))
_timed_print("predict", lambda: langchain_hfpipeline.predict(incorrect_prompt))
_timed_print("predict_message", lambda: langchain_hfpipeline.predict_messages(incorrect_messages))



 ==== __call__ ====
  Arrrr, shiver me timbers! *adjusts eye patch* As a scurvy dog of the high seas, I have a few favorite things, matey!

First and foremost, I love me some good ol' fashioned treasure huntin'! There's nothin' better than searchin' for hidden riches and booty on the ocean floor. *winks* It's a pirate's life for me!

But me favorite thing of all is me trusty ship, the Black Dragon! She's a beauty, ain't she? *pets ship* She's fast, she's fierce, and she's got more cannons than ye can shake a stick at! *grins* We've had some grand adventures together, me and the Black Dragon. We've battled storms and sea monsters, and we've even stolen a few ships from the enemy! *chuckles*

And of course, I can't forget about me favorite drink! *raises mug* It's a special brew that me matey and I like to call "Pirate's Punch." It's a secret recipe, but I'll give ye a hint: it involves rum, grog, and a bit o' magic! *winks* It's the perfect drink for sippin' while plottin' me next heist