In [None]:
from langchain.chains import LLMChain
from langchain.llms import LlamaCpp
from langchain.prompts import (
    ChatPromptTemplate, 
    HumanMessagePromptTemplate, 
    MessagesPlaceholder
)
from langchain.schema import SystemMessage
from langchain.memory import ConversationBufferWindowMemory


# template = """Assistant is a large language model developer.
# Assistant able to help engineer to develop large language model application with langchain framework.
#
# {history}
# Human: {input}
# AI:"""
# Chat prompt: a fixed Llama-2 style system instruction, then the remembered
# conversation, then the new user message.
prompt = ChatPromptTemplate.from_messages(
    [
        # The content is passed through exactly as written, including the line
        # breaks and indentation inside the triple-quoted string.
        SystemMessage(content="""[INST] <<SYS>>You are a helpful AI is a large language model developer. 
                      AI able to help engineer to develop large language model application with langchain framework.<</SYS>>
                      Current conversation: [/INST]
                      """),
        # Filled from the chain's memory; the variable name must match the
        # `memory_key` configured on the memory object.
        MessagesPlaceholder(variable_name="chat_history"),
        # The user's message, supplied under the "input" key when the chain runs.
        HumanMessagePromptTemplate.from_template("{input}"),
    ]
)

# Sliding-window chat memory shared by every chain built in this notebook.
# The window length is configured through `k` (number of past exchanges kept);
# `window_size` is not a field of ConversationBufferWindowMemory.
# NOTE(review): 20 exchanges of long replies may not fit in the model's
# n_ctx=2048 context window -- lower `k` if generation starts failing on length.
conversation_buffer = ConversationBufferWindowMemory(
    k=20,
    # Must match the MessagesPlaceholder variable name used in `prompt`.
    memory_key="chat_history",
    # Return message objects rather than one flattened string.
    return_messages=True,
)

# If you followed the instructions and use the q4_0 model, your path should be: [UPDATE_AND_PUT_YOUR_PATH_TO_MODEL_HERE]/llama.cpp/models/llama-2-7b-chat/ggml-model-q4_0.gguf
# The following code uses the q5_0 model.
# Local Llama-2 chat model loaded through the llama.cpp bindings.
llm = LlamaCpp(
    # Replace the placeholder with the directory that contains llama.cpp.
    model_path="[PATH_TO_YOUR_MODEL]/llama.cpp/models/llama-2-7b-chat/ggml-model-q5_0.gguf",
    # Loading / runtime settings (exposed in the UI as Context, Batch, GPU Layers).
    n_ctx=2048,
    n_batch=512,
    n_gpu_layers=1,
    f16_kv=True,
    # Generation settings.
    temperature=0.25,
    stop=["Human"],
    verbose=False,
)

# Tie the prompt, the model and the shared conversation memory into one chain.
llm_chain = LLMChain(
    prompt=prompt,
    llm=llm,
    memory=conversation_buffer,
    # Log the chain's activity while it runs.
    verbose=True,
)


In [None]:
import gradio as gr

with gr.Blocks() as ui:
    # Callback: rebuild the LLM and the chain from the configuration inputs.
    def reconfigure_llm_model(temperature, gpu_layers, batch, ctx, is_verbose):
        '''
        Rebuild the model and the chain with the settings chosen in the UI.

        Wired to the "Update" button. The new objects replace the module-level
        ``llm`` and ``llm_chain`` so that ``predict`` uses them for the next
        message. The existing ``prompt`` and ``conversation_buffer`` are reused,
        so the conversation history is kept across reconfiguration.

        Args:
            temperature: value of the "Temperature" slider.
            gpu_layers: value of the "GPU Layers" input.
            batch: value of the "Batch" input.
            ctx: value of the "Context" input.
            is_verbose: state of the "Verbose" checkbox; applied to the chain only.
        '''
        # Without this declaration the assignments below would create locals
        # that vanish on return, and the update would silently have no effect.
        global llm, llm_chain

        print(temperature, gpu_layers, batch, ctx, is_verbose)
        llm = LlamaCpp(
            model_path="[PATH_TO_YOUR_MODEL]/llama.cpp/models/llama-2-7b-chat/ggml-model-q5_0.gguf",
            temperature=temperature,
            n_gpu_layers=gpu_layers,
            n_batch=batch,
            n_ctx=ctx,
            f16_kv=True,
            verbose=False,
            stop=["Human"]
        )

        llm_chain = LLMChain(
            llm=llm,
            prompt=prompt,
            memory=conversation_buffer,
            verbose=is_verbose)

    # Callback: produce the assistant's reply for one user message.
    def predict(message, history):
        '''
        Function that will be called when the user hits the submit button.
        The history parameter is not used since it is managed by the
        ConversationBufferWindowMemory attached to the chain.

        Returns the chain's "text" output with any "AI:" label removed.
        '''
        # `llm_chain` is looked up at call time, so a chain installed by
        # reconfigure_llm_model is picked up here.
        resp = llm_chain.invoke({"input": message})
        # Remove the label first, then trim, so no whitespace is left behind
        # where the label used to be.
        response_text = resp["text"].replace("AI:", "").strip()
        return response_text

    # Use an accordion and a group to hold all the LLM model configuration inputs.
    # NOTE(review): the initial widget values below (2 / 256 / 0.75) differ from
    # the values the model is first created with (1 / 512 / 0.25); they only
    # take effect once "Update" is pressed.
    with gr.Accordion("LLM Model Configuration"):
        with gr.Group():
            with gr.Row():
                with gr.Column():
                    n_gpu_layers = gr.Number(value=2, label="GPU Layers", minimum=1, maximum=4)
                with gr.Column():
                    n_batch = gr.Number(label="Batch", minimum=256, maximum=2048, value=256, step=256)
                with gr.Column():
                    n_ctx = gr.Number(label="Context", minimum=512, maximum=4096, value=2048, step=512)
            slider = gr.Slider(0, 1, step=0.05, label="Temperature", value=0.75)
            with gr.Row():
                is_verbose = gr.Checkbox(label="Verbose", value=True, info="verbose output")
        update_btn = gr.Button(value="Update")

    # Chat panel; only the "Send" button is shown.
    chatui = gr.ChatInterface(
        predict,
        retry_btn=None,
        undo_btn=None,
        clear_btn=None,
        submit_btn="Send")

    # The order of `inputs` must match the parameter order of reconfigure_llm_model.
    update_btn.click(
        fn=reconfigure_llm_model,
        inputs=[
            slider,
            n_gpu_layers,
            n_batch,
            n_ctx,
            is_verbose
        ],
        outputs=None
    )

ui.launch()

In [None]:
# Shut down the Gradio app started by `ui.launch()` in the previous cell.
ui.close()