### Langchain Tutorial: Inference LLM locally

In [5]:
# Install the packages once
# !pip install -U transformers accelerate langchain-core langchain-huggingface

In [6]:
# Create the pipeline
# We use a small footprint model which can be loaded and invoked in a standard laptop
# without GPU support. 'microsoft/phi-2'

# Pipeline Info:
# We use the tokenizer from the same model.
# Model parameters can be tweeked. (temperature / top_k / top_p). We use values for
# a typical general chatbot

# Observe the output message that the running device does not have GPU support thus reverts to
# cpu support only.

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_id = "microsoft/phi-2"
tok = AutoTokenizer.from_pretrained(model_id)
mdl = AutoModelForCausalLM.from_pretrained(model_id)

gen_pipe = pipeline(
    task="text-generation",
    model=mdl,
    tokenizer=tok,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.51s/it]
Device set to use cpu


In [7]:
# Wrap with the huggingface
from langchain_huggingface import HuggingFacePipeline, ChatHuggingFace

llm = HuggingFacePipeline(pipeline=gen_pipe)      # LLM wrapper

In [8]:
# Use ChatPromptTemplate
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a brief, helpful assistant."),
    ("human", "{input}")
])

chain = prompt | llm 

output = chain.invoke({"input": "Give me 3 fancy Italian restaurant names."})

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [9]:
print(output)

System: You are a brief, helpful assistant.
Human: Give me 3 fancy Italian restaurant names.
Assistant: Certainly! Here are 3 fancy Italian restaurant names: 
1. La Trattoria 
2. Il Ristorante 
3. Il Forno del Gusto

