In [1]:
import os

from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp

In [2]:
# Prompt layout: the user's question, a blank line, then an answer lead-in
# that nudges the model to reason step by step before answering.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's work this out in a step by step way to be sure we have the right answer."
)

# `question` is the only placeholder the template expects.
prompt = PromptTemplate(
    input_variables=["question"],
    template=template,
)

In [3]:
# Stream generated tokens to stdout one at a time via a callback handler.
# Note: verbose must also be passed along with this callback manager
# (see where `callback_manager` is handed to the LLM).
callback_manager = CallbackManager(
    [
        StreamingStdOutCallbackHandler(),
    ]
)

In [4]:
# Location of the GGML model weights on disk.
# Set the LLAMA_MODEL_PATH environment variable to point at your own copy;
# when it is unset, the original author's local path is used as the fallback.
model_path = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/Users/q616967/Workspace/llama.cpp/llama-2-13b-chat.ggmlv3.q4_0.bin",
)

In [8]:
# GPU offload settings -- adjust both to your model and your GPU's VRAM pool.
# A single GPU layer is enough for a Metal installation.
n_gpu_layers = 1
# Should lie between 1 and n_ctx (defaults to 8); consider available VRAM.
n_batch = 512

# Double-check that model_path is valid on your system before running this cell.
llm = LlamaCpp(
    model_path=model_path,
    # --- generation settings ---
    temperature=0.0,
    max_tokens=-1,
    repeat_penalty=1,
    # --- hardware / performance settings ---
    n_threads=8,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    f16_kv=True,  # must stay True, otherwise problems show up after a couple of calls
    # --- token streaming through the callback manager ---
    streaming=True,
    callback_manager=callback_manager,
    verbose=True,
)

llama.cpp: loading model from /Users/q616967/Workspace/llama.cpp/llama-2-13b-chat.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 5120
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 40
llama_model_load_internal: n_head_kv  = 40
llama_model_load_internal: n_layer    = 40
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 13824
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: model size = 13B
llama_model_load_internal: ggml ctx size =    0.11 MB
llama_model_load_internal: mem required  = 7349.72 MB (+  400.00 MB per state)
llama_new_context_with_model: kv s

In [9]:
# Bind the prompt template to the local LLM so the pair can be run as one chain.
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
)

ggml_metal_free: deallocating


In [10]:
# Multiple-choice question passed to the chain. The text (including leading and
# trailing newlines and the trailing space on the first line) is kept exactly
# as originally written, because it is sent to the model verbatim.
question = (
    "\n"
    "Reading newspaper one of many ways to practice your what? \n"
    "Choices: -A: literacy, -B: knowing how to read, -C: money, -D: buying, -E: money bank\n"
)

# The prompt has a single input variable, so the value is passed positionally.
# Left as the cell's last expression so the returned answer is displayed.
llm_chain.run(question)



1. The question is asking about one of many ways to practice your...

2. The answer choices are:

A: literacy

B: knowing how to read

C: money

D: buying

E: money bank

3. Based on the context of the question, which of these options makes the most sense?

Given that the question is about reading a newspaper, the most logical answer choice is...

Answer: A: literacy.

Explanation: Reading a newspaper is an activity that involves literacy, which is the ability to read and write. Therefore, reading a newspaper is one of many ways to practice your literacy skills.


llama_print_timings:        load time = 14118.58 ms
llama_print_timings:      sample time =    32.87 ms /   151 runs   (    0.22 ms per token,  4593.30 tokens per second)
llama_print_timings: prompt eval time = 14118.50 ms /    78 tokens (  181.01 ms per token,     5.52 tokens per second)
llama_print_timings:        eval time = 22526.54 ms /   150 runs   (  150.18 ms per token,     6.66 tokens per second)
llama_print_timings:       total time = 37216.28 ms


'\n\n1. The question is asking about one of many ways to practice your...\n\n2. The answer choices are:\n\nA: literacy\n\nB: knowing how to read\n\nC: money\n\nD: buying\n\nE: money bank\n\n3. Based on the context of the question, which of these options makes the most sense?\n\nGiven that the question is about reading a newspaper, the most logical answer choice is...\n\nAnswer: A: literacy.\n\nExplanation: Reading a newspaper is an activity that involves literacy, which is the ability to read and write. Therefore, reading a newspaper is one of many ways to practice your literacy skills.'