### Instruction:

1. On the menu bar, hit "Runtime", hit "Run all"
2. After it finished running, you will see the chatbot application popped up at the bottom of this notebook.
3. You can click the gradio link shown and then interact with the chatbot

In [1]:
import subprocess
import time

try:
    # Step 1: Install Ollama
    install_command = "curl -fsSL https://ollama.com/install.sh | sh"
    subprocess.run(install_command, shell=True, check=True)
    print("Ollama installed successfully.")

    # Step 2: Start the Ollama server
    serve_command = "ollama serve"
    server_process = subprocess.Popen(serve_command, shell=True)
    print("Ollama server is starting...")

    # Give the server a few seconds to start
    time.sleep(10)

    # Step 3: Pull the Llama model
    pull_command = "ollama pull llama3.2"
    subprocess.run(pull_command, shell=True, check=True)
    print("Llama model pulled successfully.")

except subprocess.CalledProcessError as e:
    print(f"Error occurred: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")


Ollama installed successfully.
Ollama server is starting...
Llama model pulled successfully.


In [2]:
!ollama list

NAME               ID              SIZE      MODIFIED               
llama3.2:latest    a80c4f17acd5    2.0 GB    Less than a second ago    


In [3]:
!pip install langchain_community
!pip install langchain
!pip install langchain_huggingface
!pip install langchain_ollama
!pip install chromadb
!pip install gradio
!pip install datasets
!pip install sentence-transformers

Collecting langchain_community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.13 (from langchain_community)
  Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.27 (from langchain_community)
  Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.7.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.23.2-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [4]:
# Imports

from langchain_community.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.memory import ConversationBufferMemory
from langchain_ollama import OllamaLLM
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
import textwrap
import gradio as gr


In [5]:
# Set up Text Splitter

def setup_text_splitter(split_separator, split_chunk_size, split_chunk_overlap_size, split_length_function):

    text_splitter = CharacterTextSplitter(
        separator = split_separator,
        chunk_size = split_chunk_size,
        chunk_overlap = split_chunk_overlap_size,
        length_function = split_length_function)

    return text_splitter

In [6]:
# Load the external database for RAG and setting up Embedding

def load_and_process_data(dataset_name, page_content_column, text_splitter):

    loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)
    data = loader.load()
    split_data = text_splitter.split_documents(data)

    return split_data

In [7]:
def setup_embedding(embedding_model_choice, embed_device_choice, embed_normalization_option):

    hf_embeddings = HuggingFaceEmbeddings(model_name=embedding_model_choice,
        model_kwargs = {'device': embed_device_choice},
        encode_kwargs = {'normalize_embeddings': embed_normalization_option}
                                         )

    return hf_embeddings

In [8]:
def setup_vectordb_retriever(split_data, hf_embeddings, persist_directory_location, retrieve_k_choice, retrieve_search_type_choice):

    vectordb = Chroma.from_documents(
    documents=split_data,
    embedding=hf_embeddings,
    persist_directory=persist_directory_location
    )

    retriever = vectordb.as_retriever(search_kwargs={"k": retrieve_k_choice}, search_type=retrieve_search_type_choice)

    return retriever


In [9]:
def setup_memory(memory_key_name, memory_input_name, memory_output_name, memory_return_message_option):

    memory = ConversationBufferMemory(
    memory_key = memory_key_name,
    input_key = memory_input_name,
    output_key = memory_output_name,
    return_messages = memory_return_message_option
    )

    return memory

In [10]:
def setup_ollama_model(ollama_model_choice, ollama_temp):

    llm_chosen = OllamaLLM(model = ollama_model_choice, temperature = ollama_temp)

    return llm_chosen

In [11]:
def setup_prompt(base_prompt_template, prompt_input_list):

    base_prompt = PromptTemplate(
            template = base_prompt_template,
            input_variables = prompt_input_list)

    return base_prompt

In [12]:
def build_rag_chain(llm_chosen, retriever, memory, chain_return_source_option, chain_return_generate_quest_option, chain_verbose_option, base_prompt):

    llm_with_rag_chain_and_memory = ConversationalRetrievalChain.from_llm(
        llm = llm_chosen,
        retriever = retriever,
        memory = memory,
        return_source_documents = chain_return_source_option,
        return_generated_question = chain_return_generate_quest_option,
        verbose = chain_verbose_option,
        combine_docs_chain_kwargs = {'prompt': base_prompt}
        )

    return llm_with_rag_chain_and_memory

In [13]:
def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text


In [14]:
def format_response_with_source_and_memory(llm_response):
    # Initialize an empty list to collect all parts of the output
    output = []

    # Add the answer
    output.append('\n\n==================== Chatbot Response:====================')
    output.append(wrap_text_preserve_newlines(llm_response['answer']))

    # Add sources
    output.append('\n\n====================Other Relevant Information and Sources:====================')
    for source in llm_response["source_documents"]:
        output.append(source.metadata['question'])
        output.append(source.page_content)

    # Add history
    output.append('\n\n====================Chat History:====================')
    for history in llm_response['chat_history']:
        output.append(history.content)

    # Combine all parts into a single string and return
    return '\n'.join(output)

In [15]:
def talk_to_chatbot(input_question):

    llm_response = llm_with_rag_chain_and_memory.invoke(input_question)
    chatbot_answer = format_response_with_source_and_memory(llm_response)

    return chatbot_answer

In [16]:
def clear_chat_history(clear_memory=True):
    if clear_memory:
        return memory.clear()

In [17]:
# Set Variables

dataset_name = "MakTek/Customer_support_faqs_dataset"
page_content_column = "answer"
split_separator = "\n"
split_chunk_size = 1000
split_chunk_overlap_size = 150
split_length_function = len

embedding_model_choice = "hkunlp/instructor-large"
embed_device_choice = "cpu"
embed_normalization_option = True

persist_directory_location = 'docs/chroma/'
retrieve_k_choice = 3
retrieve_search_type_choice = "mmr"

memory_key_name = "chat_history"
memory_input_name = "question"
memory_output_name = "answer"
memory_return_message_option = True

ollama_model_choice = "llama3.2"
ollama_temp = 0.1

base_prompt_template = """System: You are a ABC-Company customer service representative.
\n\nInstruction: Answer the customer's question based on following context and chat history if you know the answer. Otherwise, end the answer with 'I am not sure about the answer, please contact our human service for assistance. Thank You!'.
\n\nContext: {context}
\n\nChat history: {chat_history}
\n\nQuestion: {question}
\n\nOutput Answer: """
prompt_input_list = ["context", "question", "chat_history"]

chain_return_source_option = True
chain_return_generate_quest_option = True
chain_verbose_option = False

In [18]:
text_splitter = setup_text_splitter(split_separator, split_chunk_size, split_chunk_overlap_size, split_length_function)
split_data = load_and_process_data(dataset_name, page_content_column, text_splitter)
hf_embeddings = setup_embedding(embedding_model_choice, embed_device_choice, embed_normalization_option)
retriever = setup_vectordb_retriever(split_data, hf_embeddings, persist_directory_location, retrieve_k_choice, retrieve_search_type_choice)
memory = setup_memory(memory_key_name, memory_input_name, memory_output_name, memory_return_message_option)
llm_chosen = setup_ollama_model(ollama_model_choice, ollama_temp)
base_prompt = setup_prompt(base_prompt_template, prompt_input_list)
llm_with_rag_chain_and_memory = build_rag_chain(llm_chosen, retriever, memory, chain_return_source_option, chain_return_generate_quest_option, chain_verbose_option, base_prompt)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

train_expanded.json:   0%|          | 0.00/46.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/200 [00:00<?, ? examples/s]

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

  memory = ConversationBufferMemory(


In [19]:
memory.clear()

In [20]:
#memory.chat_memory.messages

In [21]:
# Make sure Ollama is serving

serve_command = "ollama serve"
server_process = subprocess.Popen(serve_command, shell=True)
print("Ollama server is starting...")

Ollama server is starting...


In [22]:
# Check to make sure llama3.2 is pulled

!ollama list

NAME               ID              SIZE      MODIFIED      
llama3.2:latest    a80c4f17acd5    2.0 GB    2 minutes ago    


# Gradio Application Build

In [26]:
set_gradio_theme = gr.themes.Soft(primary_hue="orange", secondary_hue="gray").set(
    button_primary_background_fill="orange",
    button_primary_background_fill_hover="green",
)

with gr.Blocks(theme=set_gradio_theme) as demo:

    gr.Markdown(
    """
    # Welcome visitor!
    ## I am a demo customer service chatbot. Ask me any questions related to your order and our company.
    ### I am built using Ollama-llama3.2 llm model and Langchain for RAG (Retrieval-Augmented Generation).
    ### For technical details, please see info at the bottom of the page.

    ### Please note that this is just a demo and a work-in-progress.

    Start talking to me by typing below.
    """)

    question = gr.Textbox(label="Ask me a question", placeholder="Where is my order?")
    send_btn = gr.Button("Send Question")
    answer = gr.Textbox(label="Chatbot response", lines=20)

    send_btn.click(fn=talk_to_chatbot, inputs=question, outputs=answer, api_name="customer_service_chatbot")

    gr.Markdown(
    """
    If clear chat history, the next query's chat history will be emptined and refreshed.
    """)
    clear_btn = gr.Button("Clear Chat History")
    clear_btn.click(fn=clear_chat_history)


    gr.Markdown(
    """
    ## Chatbot Technical Details:

    #### Model: Ollama-llama3.2 (3B parameters)
    #### Dataset: Hugging Face Hub "MakTek/Customer_support_faqs_dataset"
    #### Embedding: Hugging Face Hub "hkunlp/instructor-large"
    #### Vector Database: Chroma
    #### Retrieval Search Type: Maximal Marginal Relevance (MMR)
    #### Prompt:
    LLM is told that it is a customer representative from ABC-company and to use chat history and RAG context to answer questions
    If it does not know the answer, it is told to say it does not know and tell user to contact human service
    #### Memory:
    Chat memory is fed into the input so that the chatbot is aware of the context of the conversation.
    However, as the chat history gets long, it becomes confused. It is a limitation of this simple demo.
    #### Temperature: 0.1
    The chatbot is not encouraged to be creative but use factual answers provided in retrieval results.

    #### Good Testing Question Example:
    - Who is this?
        - The answer should show the role assigned in prompt is working.
    - How do I go to Mars?
        - The answer should show that when asked about things it doesn't know or irrelevant, it knows it should refer users to human service.
    - Start the inquiry with: Can I talk to someone? Followed by next query: When can I do that?
        - This question pair should show that the chatbot has memory and it can understand what it means by "that".
    - Other typical customer support questions:
        - Is it possible place an order by phone?
        - What is the refund policy?
        - Where is my order?

    """)

demo.launch(share=True, debug=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://af610ab674679b152a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [24]:
# demo.close()

In [25]:
#