In [None]:
!pip install -qU langchain langchain-community langchainhub langchain-openai chromadb bs4 sentence_transformers transformers
!pip install -qU bitsandbytes accelerate llama-cpp-python beautifulsoup4 faiss-cpu tavily-python "langserve[all]"
!pip install -qU duckduckgo-search

In [54]:
from google.colab import userdata
import os
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
#from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain.schema import (
    HumanMessage,
    SystemMessage,
    AIMessage
)
from langchain_community.chat_models.huggingface import ChatHuggingFace
from transformers import AutoTokenizer

# Enable LangChain tracing and populate the API credentials the notebook needs.
# All secrets are read through google.colab's `userdata` helper rather than
# being hard-coded in the notebook.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_API_KEY'] = userdata.get('LANGSMITH')
os.environ['HUGGINGFACEHUB_API_TOKEN'] = userdata.get('HF_TOKEN')
os.environ["TAVILY_API_KEY"] = userdata.get('TAVILY_KEY')

# Define Embeddings vs Model

In [10]:
# Single source of truth for the generation model. The tokenizer supplies the
# chat template used to render prompts further down, so it must be loaded from
# the same checkpoint that the inference endpoint serves. (The tokenizer was
# previously loaded from zephyr-7b-alpha while the LLM pointed at zephyr-7b-beta.)
MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# NOTE(review): HuggingFaceBgeEmbeddings is used here with a non-BGE
# sentence-transformers model; confirm whether the generic HuggingFace
# embeddings wrapper was intended instead.
embeddings = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
    )

# Hosted text-generation endpoint. With "return_full_text" enabled the
# responses shown in this notebook echo the prompt before the generated answer.
llm = HuggingFaceHub(
    repo_id=MODEL_ID,
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
        'include_prompt_in_result' : True,
        "return_full_text": True
    },
)

# Chat-style wrapper around the same endpoint; accepts lists of messages.
chat_model = ChatHuggingFace(llm=llm)

                    repo_id was transferred to model_kwargs.
                    Please confirm that repo_id is what you intended.
                    task was transferred to model_kwargs.
                    Please confirm that task is what you intended.
                    huggingfacehub_api_token was transferred to model_kwargs.
                    Please confirm that huggingfacehub_api_token is what you intended.


# Prepare Documents

In [4]:
# Download the listing page, keep only the elements whose CSS class is one of
# the content classes below, and cut the text into overlapping chunks.
loader = WebBaseLoader(
    web_paths=("https://flippers.pk/ads/yartici-com-pakistans-leading-online-art-marketplace/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header", "site-content")
        ),
    },
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
splits = text_splitter.split_documents(docs)

# Show the first and third chunks together with the total number of chunks.
(splits[0], splits[2], len(splits))

(Document(page_content="Yartici.com - Pakistan's Leading Online Art Marketplace   | E-Commerce Platforms\n\n \n  G13, Islamabad, Pakistan  -\n  Islamabad\n\n\n\n\n\n\n\n\nBusiness CategoryE-Commerce Platforms\nBusiness Established Year2020\nRevenue Per MonthN/A\nMonthly ProfitN/A\nNumber of employees0 to 5\nProperty valueN/A\nProperty RentN/A\nAsset include:Website, Domain, Email, AWS Hosting, Social Media Accounts \nAsset Value:N/A\nCurrent ConditionRunning\nDemand20,000,000\nAddressG13, Islamabad, Pakistan", metadata={'source': 'https://flippers.pk/ads/yartici-com-pakistans-leading-online-art-marketplace/'}),
 Document(page_content='Purpose of Selling:\r\nSince the 3 co-founders are now based abroad, they are actively seeking entrepreneurs in Pakistan to take charge of the platform.\r\n\r\nThe price listed is negotiable to reflect a fair and accurate value of the platform considering the value of assets and long-term profitability.', metadata={'source': 'https://flippers.pk/ads/yarti

# Create Vector Store

In [5]:
# Remove the collection left behind by a previous run of this cell before
# building a new one (Chroma.from_documents does not upsert, so presumably a
# re-run would otherwise index the same chunks again).
try:
  vectorstore.delete_collection()
except NameError:
  # Fresh kernel: no vector store has been created yet, nothing to delete.
  pass
except Exception as exc:
  # Best-effort cleanup: keep going, but report the problem instead of
  # silently swallowing it (a bare `except:` would also hide KeyboardInterrupt).
  print(f"Could not delete the previous collection: {exc!r}")

# import chromadb
# client = chromadb.Client()

vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    collection_metadata={"hnsw:space": "cosine"},
)  # downside: doesn't do upsert
vectorstore

<langchain_community.vectorstores.chroma.Chroma at 0x7ee06cdbd270>

In [6]:
# Wrap the vector store as a retriever that returns one document per query (k=1).
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

In [7]:
# Sanity-check retrieval with a sample query; the trailing comment shows an alternative call.
retriever.invoke('price of website') #retriever.get_relevant_documents('abroad')

[Document(page_content='Purpose of Selling:\r\nSince the 3 co-founders are now based abroad, they are actively seeking entrepreneurs in Pakistan to take charge of the platform.\r\n\r\nThe price listed is negotiable to reflect a fair and accurate value of the platform considering the value of assets and long-term profitability.', metadata={'source': 'https://flippers.pk/ads/yartici-com-pakistans-leading-online-art-marketplace/'})]

# Create Prompt

In [13]:
# The hub prompt is not compatible with zephyr:
# prompt = hub.pull("rlm/rag-prompt")

# For reference, the zephyr chat format written out by hand:
# template = """
# <|system|>
# You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>
# <|user|>
# Question: {question}
# Context: {context} </s>
# <|assistant|>
# Answer:
# """

SYSTEM_MESSAGE = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
)
USER_MESSAGE = "Question: What is Yartici? \nContext: Yartici is an online art shop. "

# Instead of hard-coding the markers, let the tokenizer's chat template render
# the role/content dicts into a single prompt string.
messages = [
    dict(role="system", content=SYSTEM_MESSAGE),
    dict(role="user", content=USER_MESSAGE),
]
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
prompt

"<|system|>\nYou are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>\n<|user|>\nQuestion: What is Yartici? \nContext: Yartici is an online art shop. </s>\n<|assistant|>\n"

In [14]:
# The same conversation, expressed with LangChain message objects instead of dicts.
messages = [SystemMessage(content=SYSTEM_MESSAGE), HumanMessage(content=USER_MESSAGE)]

# The chat model renders these messages into the prompt format shown in the
# output below, which matches the tokenizer-rendered prompt from the previous cell.
chat_model._to_chat_prompt(messages)

"<|system|>\nYou are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>\n<|user|>\nQuestion: What is Yartici? \nContext: Yartici is an online art shop. </s>\n<|assistant|>\n"

In [15]:
# Send the already-rendered prompt string directly to the raw LLM.
llm.invoke(prompt)
# llm.predict(prompt)

"<|system|>\nYou are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>\n<|user|>\nQuestion: What is Yartici? \nContext: Yartici is an online art shop. </s>\n<|assistant|>\nAnswer: Yartici is a digital marketplace for purchasing original artwork and art supplies directly from artists."

In [17]:
# Passing an already-templated string to the chat model wraps it in a further
# user turn, so the chat markers appear twice in the output below.
chat_model.invoke(prompt) # chat_model.invoke() adds <user></s><assistant>

AIMessage(content="<|user|>\n<|system|>\nYou are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>\n<|user|>\nQuestion: What is Yartici? \nContext: Yartici is an online art shop. </s>\n<|assistant|>\n</s>\n<|assistant|>\nAnswer: Yartici is a digital marketplace where artists can sell their original artwork and art supplies directly to buyers.")

In [18]:
# Passing the message list instead yields a single, correctly formatted prompt (see output).
chat_model.invoke(messages) # this method works well for chat_models

AIMessage(content="<|system|>\nYou are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>\n<|user|>\nQuestion: What is Yartici? \nContext: Yartici is an online art shop. </s>\n<|assistant|>\nAnswer: Yartici is a digital marketplace for purchasing original artwork and art supplies directly from artists.")

In [20]:
from langchain.prompts import PromptTemplate, ChatPromptTemplate

# Render the chat format once through the tokenizer, leaving {question} and
# {context} as placeholders that PromptTemplate fills in at invoke time.
messages = [
    dict(role="system", content=SYSTEM_MESSAGE),
    dict(role="user", content="Question: {question} \nContext: {context} "),
]

raw_prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(raw_prompt)

prompt = PromptTemplate.from_template(raw_prompt)
print(prompt)

# Quick check that both variables get substituted.
prompt.invoke(dict(context="test1", question="test question"))

<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>
<|user|>
Question: {question} 
Context: {context} </s>
<|assistant|>

input_variables=['context', 'question'] template="<|system|>\nYou are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>\n<|user|>\nQuestion: {question} \nContext: {context} </s>\n<|assistant|>\n"


StringPromptValue(text="<|system|>\nYou are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>\n<|user|>\nQuestion: test question \nContext: test1 </s>\n<|assistant|>\n")

In [42]:
# The same system/user prompt expressed as LangChain (role, content) tuples;
# {question} and {context} are filled in when the template is invoked.
tuple_messages = [
    ("system", SYSTEM_MESSAGE),
    ("human", "Question: {question} \nContext: {context} ")
]

# Build a chat prompt template from the tuples and display it.
chat_prompt = ChatPromptTemplate.from_messages(tuple_messages)
chat_prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='Question: {question} \nContext: {context} '))])

In [47]:
chat_prompt.invoke({'context' : "test1", 'question' : 'test question'}).to_string()
# This shows that the chat template renders roles as "System:"/"Human:" instead of <|system|>/<|user|>,

# because we have only rendered the template without giving it any information regarding the model.

"System: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nHuman: Question: test question \nContext: test1 "

# Define Chain

In [29]:
def format_docs(docs):
    """Concatenate the ``page_content`` of every document in ``docs``.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with the page contents separated by one blank line;
        an empty string when ``docs`` is empty.
    """
    # not using stuff_documents to create chain
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# Plain-LLM RAG chain: the input string goes to the retriever (whose documents
# are joined by format_docs) to form "context" and is passed through unchanged
# as "question"; both fill the prompt, which is sent to the LLM, and the result
# is parsed to a string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Print the composed chain to inspect its structure.
print(rag_chain)

rag_chain.invoke("What is Yartici?")

first={
  context: VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7ee06cdbd270>, search_kwargs={'k': 1})
           | RunnableLambda(format_docs),
  question: RunnablePassthrough()
} middle=[PromptTemplate(input_variables=['context', 'question'], template="<|system|>\nYou are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>\n<|user|>\nQuestion: {question} \nContext: {context} </s>\n<|assistant|>\n"), HuggingFaceHub(client=<InferenceClient(model='HuggingFaceH4/zephyr-7b-beta', timeout=None)>, repo_id='HuggingFaceH4/zephyr-7b-beta', task='text-generation', model_kwargs={'max_new_tokens': 512, 'top_k': 30, 'temperature': 0.1, 'repetition_penalty': 1.03, 'include_prompt_in_result': True, 'return_full_text': True, 'repo

"<|system|>\nYou are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>\n<|user|>\nQuestion: What is Yartici? \nContext: Yartici.com - Pakistan's Leading Online Art Marketplace   | E-Commerce Platforms\n\n \n  G13, Islamabad, Pakistan  -\n  Islamabad\n\n\n\n\n\n\n\n\nBusiness CategoryE-Commerce Platforms\nBusiness Established Year2020\nRevenue Per MonthN/A\nMonthly ProfitN/A\nNumber of employees0 to 5\nProperty valueN/A\nProperty RentN/A\nAsset include:Website, Domain, Email, AWS Hosting, Social Media Accounts \nAsset Value:N/A\nCurrent ConditionRunning\nDemand20,000,000\nAddressG13, Islamabad, Pakistan </s>\n<|assistant|>\nYartici is Pakistan's leading online art marketplace, established in 2020. It operates as an e-commerce platform and currently has 0-5 employees. The business generates unknown monthly 

In [49]:
# Make chain with chat

# format_docs is defined once, in the "Define Chain" cell above. The identical
# second definition that used to live in this cell silently shadowed it and has
# been removed so there is a single place to change the formatting.

# Same pipeline as rag_chain, but using the chat prompt template and chat model.
rag_chat_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | chat_prompt
    | chat_model
    | StrOutputParser()
)

print(rag_chat_chain)

rag_chat_chain.invoke("What is Yartici?")

first={
  context: VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7ee06cdbd270>, search_kwargs={'k': 1})
           | RunnableLambda(format_docs),
  question: RunnablePassthrough()
} middle=[ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='Question: {question} \nContext: {context} '))]), ChatHuggingFace(llm=HuggingFaceHub(client=<InferenceClient(model='HuggingFaceH4/zephyr-7b-beta', timeout=None)>, repo_id='HuggingFaceH4/zephyr-7b-beta', task='text-generation', 

"<|system|>\nYou are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>\n<|user|>\nQuestion: What is Yartici? \nContext: Yartici.com - Pakistan's Leading Online Art Marketplace   | E-Commerce Platforms\n\n \n  G13, Islamabad, Pakistan  -\n  Islamabad\n\n\n\n\n\n\n\n\nBusiness CategoryE-Commerce Platforms\nBusiness Established Year2020\nRevenue Per MonthN/A\nMonthly ProfitN/A\nNumber of employees0 to 5\nProperty valueN/A\nProperty RentN/A\nAsset include:Website, Domain, Email, AWS Hosting, Social Media Accounts \nAsset Value:N/A\nCurrent ConditionRunning\nDemand20,000,000\nAddressG13, Islamabad, Pakistan </s>\n<|assistant|>\nYartici is Pakistan's leading online art marketplace, established in 2020. It operates as an e-commerce platform and currently has 0-5 employees. The business generates unknown monthly 

In [51]:
# Make chain with sources

from langchain_core.runnables import RunnableParallel

# Answering sub-chain: takes a dict holding the retrieved documents under
# "context" (plus the question), replaces "context" with the joined text, and
# produces the answer string.
doc_chain = (
    RunnablePassthrough.assign(context=lambda inputs: format_docs(inputs["context"]))
    | chat_prompt
    | chat_model
    | StrOutputParser()
)

# Retrieve the documents and keep the question side by side, then attach the
# generated answer under "answer" so the sources are returned with it.
rag_chain_chat_from_docs = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
).assign(answer=doc_chain)

rag_chain_chat_from_docs.invoke("What is Yartici")

{'context': [Document(page_content="Yartici.com - Pakistan's Leading Online Art Marketplace   | E-Commerce Platforms\n\n \n  G13, Islamabad, Pakistan  -\n  Islamabad\n\n\n\n\n\n\n\n\nBusiness CategoryE-Commerce Platforms\nBusiness Established Year2020\nRevenue Per MonthN/A\nMonthly ProfitN/A\nNumber of employees0 to 5\nProperty valueN/A\nProperty RentN/A\nAsset include:Website, Domain, Email, AWS Hosting, Social Media Accounts \nAsset Value:N/A\nCurrent ConditionRunning\nDemand20,000,000\nAddressG13, Islamabad, Pakistan", metadata={'source': 'https://flippers.pk/ads/yartici-com-pakistans-leading-online-art-marketplace/'})],
 'question': 'What is Yartici',
 'answer': "<|system|>\nYou are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>\n<|user|>\nQuestion: What is Yartici \nContext: Yartici.com - Pakistan

In [52]:
# Stream the chain and print each chunk as it arrives.
for chunk in rag_chain_chat_from_docs.stream("What is Yartici?"):
    print(chunk)

# Token streaming doesn't work with chat_model either: the answer arrives as a single chunk (see output).
# Most likely, streaming has to be supported by the underlying model being invoked.

{'question': 'What is Yartici?'}
{'context': [Document(page_content="Yartici.com - Pakistan's Leading Online Art Marketplace   | E-Commerce Platforms\n\n \n  G13, Islamabad, Pakistan  -\n  Islamabad\n\n\n\n\n\n\n\n\nBusiness CategoryE-Commerce Platforms\nBusiness Established Year2020\nRevenue Per MonthN/A\nMonthly ProfitN/A\nNumber of employees0 to 5\nProperty valueN/A\nProperty RentN/A\nAsset include:Website, Domain, Email, AWS Hosting, Social Media Accounts \nAsset Value:N/A\nCurrent ConditionRunning\nDemand20,000,000\nAddressG13, Islamabad, Pakistan", metadata={'source': 'https://flippers.pk/ads/yartici-com-pakistans-leading-online-art-marketplace/'})]}
{'answer': "<|system|>\nYou are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.</s>\n<|user|>\nQuestion: What is Yartici? \nContext: Yartici.com - Pakist

# Add Chat History

In [55]:
from langchain_core.prompts import MessagesPlaceholder

# System instruction asking the model to rewrite the latest question as a
# standalone question. The trailing backslashes join the lines into one string.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"), # placeholder because the chat_history variable holds a series of human/AI messages of arbitrary length.
        ("human", "{question}"),
    ]
)

# Prompt -> chat model -> plain string.
contextualize_q_chain = contextualize_q_prompt | chat_model | StrOutputParser()

# Try it with a one-exchange history and a follow-up question that depends on it.
contextualize_q_chain.invoke(
    {
        "chat_history": [
            HumanMessage(content="What does LLM stand for?"),
            AIMessage(content="Large language model"),
        ],
        "question": "What is meant by large?",
    }
)

'<|system|>\nGiven a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is.</s>\n<|user|>\nWhat does LLM stand for?</s>\n<|assistant|>\nLarge language model</s>\n<|user|>\nWhat is meant by large?</s>\n<|assistant|>\nIn the context of large language models (LLMs), "large" refers to the size of the neural network that the model is composed of. LLMs with more parameters (i.e., weights and biases) are generally considered larger and have the ability to process and generate more complex and nuanced text. The exact size of a large LLM can vary widely, but models with billions or trillions of parameters are common in the field of natural language processing.'

Despite being instructed not to answer the question, the model still answers it. Two possible explanations:
1. The prompt template may not be perfectly compatible with the model, for the reasons demonstrated earlier in the notebook. (SOLVED: after using the appropriate templating method it is now compatible, although, as demonstrated below, this was not the cause.)
2. The model is not strong enough to follow the instruction.

In [19]:
# Rebuild the contextualization prompt by hand through the tokenizer's chat
# template, with the chat history hard-coded and only {question} left open.
messages = [
    {
        "role": "system",
        "content": """Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is.""",
    },
    {
        "role": "user",
        "content": "What does LLM stand for?"
    },
    {
        "role": "assistant",
        "content": "Large language model."
    },
    {
        "role": "user",
        "content": "{question}"
    },
]

raw_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(raw_prompt)

# NOTE(review): this rebinds `prompt`, the same name the RAG prompt template
# uses earlier in the notebook; re-running rag_chain's cell after this one
# would pick up this template instead. Consider a distinct name.
prompt = PromptTemplate.from_template(raw_prompt)
print(prompt)

# Check that the question placeholder is substituted.
prompt.invoke({'question' : "What is meant by large?"})

<|system|>
Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is.</s>
<|user|>
What does LLM stand for?</s>
<|assistant|>
Large language model.</s>
<|user|>
{question}</s>
<|assistant|>

input_variables=['question'] template='<|system|>\nGiven a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is.</s>\n<|user|>\nWhat does LLM stand for?</s>\n<|assistant|>\nLarge language model.</s>\n<|user|>\n{question}</s>\n<|assistant|>\n'


StringPromptValue(text='<|system|>\nGiven a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is.</s>\n<|user|>\nWhat does LLM stand for?</s>\n<|assistant|>\nLarge language model.</s>\n<|user|>\nWhat is meant by large?</s>\n<|assistant|>\n')

In [20]:
# Same experiment with the raw LLM and the hand-rendered prompt template
# (uses whatever `prompt` was most recently bound to).
contextualize_q_chain_custom = prompt | llm | StrOutputParser()
contextualize_q_chain_custom.invoke({'question' : "What is meant by large?"})

'In the context of large language models (LLMs), "large" refers to the size of the neural network that the model is composed of. LLMs with more parameters (i.e., weights and biases) are generally considered larger and have the ability to process and generate more complex and nuanced text. The exact definition of what constitutes a "large" LLM can vary depending on the specific application and task being performed.'

Now that we know for sure that the prompt template is fully customized to be compatible with the LLM we are using, we can see that the model still does not follow our instruction to NOT generate the answer.

In [21]:
# Follow-up question whose "it" can only be resolved from the chat history.
contextualize_q_chain_custom.invoke({'question' : "Is it still a popular field of research for computer scientists"})

'Yes, large language models (LLMs) have gained significant attention and are currently a popular field of research in computer science, particularly in the areas of natural language processing and artificial intelligence. LLMs are being developed to better understand and generate human-like language, with potential applications in various industries such as healthcare, finance, and education.'

That said, we have successfully added the chat history as context: the model correctly resolves the ambiguous reference "it".

Note:

A Hugging Face messages template is a list of dicts with role/content keys (roles: system, user, assistant), while LangChain uses either a list of SystemMessage/HumanMessage/AIMessage objects or a list of (role, content) tuples (roles: system, human, ai).

In [22]:
# cleanup: delete the Chroma collection created earlier in the notebook
vectorstore.delete_collection()