### Author : Rahul Bhoyar

In this tutorial we will explore how to query the datasets.

Installing the necessary libraries

In [None]:
# Install LangChain, its OpenAI integration, FAISS and NLTK for this notebook.
# NOTE(review): both faiss-gpu and faiss-cpu are requested; they presumably ship
# the same `faiss` module, so one of them is likely enough — confirm.
!pip install langchain langchain_openai faiss-gpu faiss-cpu nltk



### Step 1 : Loading the vector database

In [None]:
# Relative path of the saved FAISS index that is loaded in Step 1.
VECTOR_DATABASE_PATH = "vectorstore/db_faiss"

In [None]:
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

Setting up the OpenAI environment.

In [None]:
import os
from getpass import getpass

# Never hard-code the secret in the notebook: take it from the environment and
# only fall back to an interactive, non-echoing prompt when it is not set.
# NOTE: a key that has ever been saved in a notebook or committed to version
# control must be treated as compromised and rotated.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("Enter your OPENAI_API_KEY: ")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY is required to continue.")
os.environ["OPENAI_API_KEY"] = openai_api_key
# Confirm success without echoing the key into the saved cell output.
print("OPENAI API key is set successfully.")

OPENAI API key is set successfully : [REDACTED]


In [None]:
# Embedding client created with default settings.
# NOTE(review): presumably this must be the same embedding model that was used
# when the FAISS index was built — confirm against the indexing notebook.
embeddings = OpenAIEmbeddings()

In [None]:
# Load the FAISS index stored at VECTOR_DATABASE_PATH, passing `embeddings` as
# the embedding model, then display the resulting vector store object.
# NOTE(review): recent langchain-community releases refuse to load the pickled
# docstore unless `allow_dangerous_deserialization=True` is passed — confirm
# against the installed version and only enable it for index files you trust.
loaded_db = FAISS.load_local(VECTOR_DATABASE_PATH, embeddings)
print("Vector database loaded successfully.")
loaded_db

Vector database loaded successfully.


<langchain_community.vectorstores.faiss.FAISS at 0x7b1b010a0310>

### Step 2: Creation of the retriever object

In [None]:
# Wrap the loaded vector store as a retriever, using its default search settings.
retriever = loaded_db.as_retriever()

In [None]:
# Display the retriever to inspect its configuration.
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7b1b010a0310>)

### Step 3: Querying

Selecting the model

In [None]:
# Upgrade langchain-openai to the latest release (it is also listed in the
# install command at the top of the notebook).
!pip install -U langchain-openai



In [None]:
from langchain_openai import ChatOpenAI
# Instantiate a GPT-4 chat model and print it as a sanity check.
# Note: the two query functions below create their own ChatOpenAI instances
# rather than using this module-level `llm`.
llm = ChatOpenAI(model_name = "gpt-4")
print("OpenAI model initiated.")
print("-"*200)  # visual separator line
print(llm)

OpenAI model initiated.
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
client=<openai.resources.chat.completions.Completions object at 0x7b1b00ea6c80> async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x7b1b00124670> model_name='gpt-4' openai_api_key=SecretStr('**********') openai_proxy=''


### Querying Approach 1

In this approach, we query the model directly through a chain that uses the retriever.

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI


def llm_response_approach_1(query):
    """Answer ``query`` with a single retrieval-augmented chain.

    The module-level ``retriever`` fetches documents for the query, their page
    contents are joined into the prompt's ``{context}`` slot, and the query
    itself is forwarded unchanged as ``{question}``. The filled prompt is sent
    to a GPT-4 chat model and the reply is returned as a plain string.
    """
    def format_docs(docs):
        # Concatenate the page contents, separated by a blank line.
        return "\n\n".join(d.page_content for d in docs)

    prompt_text = """
    Answer the question based only on the following context:
    {context}
    Question:
    {question}
    """
    rag_prompt = ChatPromptTemplate.from_template(prompt_text)
    chat_model = ChatOpenAI(model_name = "gpt-4")

    # The dict fans the incoming query out to both prompt variables.
    chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
    rag_chain = chain_inputs | rag_prompt | chat_model | StrOutputParser()
    return rag_chain.invoke(query)

In [None]:
# Prompt text used for Approach 1.
# NOTE(review): the text contains typos ("infromation", "Igonre"); they are part
# of the runtime prompt, so correcting them would change what the model receives
# and may no longer match the recorded output below.
QUERY = """
Give me list of all datasets about architecture.

I want to know everything about the dataset.
Create a proper structure of all the infromation.

Igonre the below info :
- Sr. No.
- Has Creator Name
- Has Creator URL
- Has Current Version Number
- Has Description
- Has License Name
- Has Owner Name
- Has Owner Ref
- Has Subtitle
- Has Title
- Has Total Bytes
- Has URL
"""

In [None]:
# Run Approach 1 on QUERY and display the raw string answer.
response1 = llm_response_approach_1(QUERY)
response1

'1. Dataset Name: "wwymak/architecture-dataset"\n   - Title: Architecture dataset\n   - Tags: Art, Image\n   - Creator Name: Wendy Mak\n   - Creator URL: wwymak\n   - Current Version Number: 1\n   - Description: Not available\n   - Download Count: 2379\n   - Files: Not available\n   - Is Featured: False\n   - Is Private: False\n   - Kernel Count: 10\n   - Last Updated: 2018-12-22 19:26:00\n   - License Name: CC BY-NC-SA 4.0\n   - Owner Name: Wendy Mak\n   - Ref: wwymak/architecture-dataset\n   - Size: 1GB\n   - Topic Count: 0\n   - Total Bytes: 1536433082\n   - URL: https://www.kaggle.com/datasets/wwymak/architecture-dataset\n   - Usability Rating: 0.625\n   - View Count: 29541\n   - Vote Count: 67\n   - Keyword: architecture\n\n2. Dataset Name: "dumitrux/architectural-styles-dataset"\n   - Title: Architectural styles\n   - Tags: Arts and entertainment, Image\n   - Creator Name: dumitrux\n   - Creator URL: dumitrux\n   - Current Version Number: 3\n   - Description: Not available\n   - 

Let's see the number of tokens in the output (word-level tokens as counted by NLTK's `word_tokenize`).

In [None]:
import nltk

# `word_tokenize` needs the Punkt tokenizer data. Newer NLTK releases look for
# it under the "punkt_tab" resource and older ones under "punkt", so request
# both. `nltk.download` reports failure by returning False rather than raising,
# so the extra request is harmless when a resource is unavailable.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Split the Approach 1 answer into word-level tokens (NLTK tokens, which are
# not the same as the tokens the OpenAI model counts).
tokens = word_tokenize(response1)

# Report how many tokens the answer contains.
print("Number of Output tokens:", len(tokens))

Number of Output tokens: 399


### Querying Approach 2


In this approach, we first retrieve the relevant documents from the vector database and then pass them as context in the query.

Let's create a function that gives us the response.

In [None]:
from langchain import PromptTemplate
from langchain_openai import ChatOpenAI

def llm_response_approach_2(query):
    """Answer ``query`` by retrieving context first and prompting the model second.

    Documents are fetched from the module-level ``loaded_db`` with a similarity
    search, numbered and concatenated into one context string, substituted into
    the prompt together with ``query``, and sent to a GPT-4 chat model. The
    text content of the model's reply is returned.
    """
    hits = loaded_db.similarity_search(query)
    # Number each retrieved chunk ("1.", "2.", ...) and put one chunk per line.
    context_data = "".join(
        f"{rank}." + doc.page_content + "\n"
        for rank, doc in enumerate(hits, start=1)
    )

    template = """
      Answer the question based only on the following context:
      {context}
      Question:
      {question}
      """
    final_prompt = PromptTemplate(
        input_variables=["context", "question"], template=template
    ).format(context=context_data, question=query)

    chat_model = ChatOpenAI(model_name = "gpt-4")
    return chat_model.invoke(final_prompt).content


In [None]:
# Prompt text used for Approach 2 (same wording as the Approach 1 prompt).
# NOTE(review): the text contains typos ("infromation", "Igonre"); they are part
# of the runtime prompt, so correcting them would change what the model receives
# and may no longer match the recorded output below.
QUERY = """
Give me list of all datasets about architecture.

I want to know everything about the dataset.
Create a proper structure of all the infromation.

Igonre the below info :
- Sr. No.
- Has Creator Name
- Has Creator URL
- Has Current Version Number
- Has Description
- Has License Name
- Has Owner Name
- Has Owner Ref
- Has Subtitle
- Has Title
- Has Total Bytes
- Has URL
"""

In [None]:
# Run Approach 2 on QUERY and display the raw string answer.
response2 = llm_response_approach_2(QUERY)
response2

"Dataset 1:\n- Name: Architecture dataset\n- Creator: Wendy Mak\n- Creator URL: wwymak\n- Current Version: 1\n- Description: nan\n- Subtitle: nan\n- Download Count: 2379\n- Files: []\n- License Name: CC BY-NC-SA 4.0\n- Owner Name: Wendy Mak\n- Owner URL: wwymak\n- Last Updated: 2018-12-22 19:26:00\n- Size: 1GB\n- Topic Count: 0\n- Total Bytes: 1536433082\n- URL: https://www.kaggle.com/datasets/wwymak/architecture-dataset\n- Usability Rating: 0.625\n- View Count: 29541\n- Vote Count: 67\n- Keyword: architecture\n\nDataset 2:\n- Name: Architectural styles\n- Creator: dumitrux\n- Creator URL: dumitrux\n- Current Version: 3\n- Description: nan\n- Subtitle: 25 architectural styles\n- Download Count: 2028\n- Files: []\n- License Name: CC0: Public Domain\n- Owner Name: dumitrux\n- Owner URL: dumitrux\n- Last Updated: 2020-08-29 01:19:58\n- Size: 2GB\n- Topic Count: 0\n- Total Bytes: 1677638078\n- URL: https://www.kaggle.com/datasets/dumitrux/architectural-styles-dataset\n- Usability Rating: 0

In [None]:
Dataset 1:
- Name: Architecture dataset
- Creator: Wendy Mak
- Creator URL: wwymak
- Current Version: 1
- Description: nan
- Subtitle: nan
- Download Count: 2379
- Files: []
- License Name: CC BY-NC-SA 4.0
- Owner Name: Wendy Mak
- Owner URL: wwymak
- Last Updated: 2018-12-22 19:26:00
- Size: 1GB
- Topic Count: 0
- Total Bytes: 1536433082
- URL: https://www.kaggle.com/datasets/wwymak/architecture-dataset
- Usability Rating: 0.625
- View Count: 29541
- Vote Count: 67
- Keyword: architecture

Dataset 2:
- Name: Architectural styles
- Creator: dumitrux
- Creator URL: dumitrux
- Current Version: 3
- Description: nan
- Subtitle: 25 architectural styles
- Download Count: 2028
- Files: []
- License Name: CC0: Public Domain
- Owner Name: dumitrux
- Owner URL: dumitrux
- Last Updated: 2020-08-29 01:19:58
- Size: 2GB
- Topic Count: 0
- Total Bytes: 1677638078
- URL: https://www.kaggle.com/datasets/dumitrux/architectural-styles-dataset
- Usability Rating: 0.9375
- View Count: 15598
- Vote Count: 40
- Keyword: architecture

Dataset 3:
- Name: Machine Learning Model's Architecture Diagrams
- Creator: Suraj
- Creator URL: suraj520
- Current Version: 54
- Description: nan
- Subtitle: Machine Learning Model's Architecture Diagrams
- Download Count: 181
- Files: []
- License Name: CC0: Public Domain
- Owner Name: Suraj
- Owner URL: suraj520
- Last Updated: 2023-09-01 15:33:09
- Size: 254KB
- Topic Count: 0
- Total Bytes: 260154
- URL: https://www.kaggle.com/datasets/suraj520/machine-learning-architecture-diagrams
- Usability Rating: 1.0
- View Count: 1770
- Vote Count: 21
- Keyword: architectural design

Let's see the number of tokens in the output.

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Split the Approach 2 answer into word-level tokens (NLTK tokens, which are
# not the same as the tokens the OpenAI model counts).
tokens = word_tokenize(response2)

# Report how many tokens the answer contains.
print("Number of Output tokens:", len(tokens))

Number of Output tokens: 322


### Conclusion

Here we can see that for both approaches the number of output tokens is in the range of roughly 300 to 400 (399 for Approach 1 and 322 for Approach 2).

In [None]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import CTransformers
from langchain.chains import RetrievalQA
import chainlit as cl

# Relative path of the FAISS index loaded by qa_bot().
DB_FAISS_PATH = 'vectorstore/db_faiss'

# Prompt used by the RetrievalQA chain; {context} and {question} are the two
# variables declared in set_custom_prompt().
custom_prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

def set_custom_prompt():
    """Return the QA ``PromptTemplate`` built from ``custom_prompt_template``.

    The template declares two input variables, ``context`` and ``question``.
    """
    return PromptTemplate(
        input_variables=['context', 'question'],
        template=custom_prompt_template,
    )

#Retrieval QA Chain
def retrieval_qa_chain(llm, prompt, db):
    """Assemble a 'stuff' RetrievalQA chain over ``db``.

    The retriever is created from ``db`` with ``k=2``, ``prompt`` is passed to
    the underlying chain, and source documents are included in the result.
    """
    top_two_retriever = db.as_retriever(search_kwargs={'k': 2})
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type='stuff',
        retriever=top_two_retriever,
        return_source_documents=True,
        chain_type_kwargs={'prompt': prompt},
    )

#Loading the model
def load_llm():
    """Create the CTransformers Llama-2 chat model used by the QA chain.

    The generation settings (``max_new_tokens`` and ``temperature``) are passed
    through the wrapper's ``config`` dictionary. They were previously given as
    top-level keyword arguments, which are not declared fields of the
    CTransformers wrapper, so they did not reach the model as intended.
    NOTE(review): confirm against the installed langchain-community version.

    Returns:
        The configured ``CTransformers`` LLM instance.
    """
    generation_config = {
        'max_new_tokens': 512,
        'temperature': 0.5,
    }
    # NOTE(review): this is a model repository id rather than a local file path;
    # presumably the weights are fetched or cached on first use — confirm.
    llm = CTransformers(
        model="TheBloke/Llama-2-7B-Chat-GGML",
        model_type="llama",
        config=generation_config,
    )
    return llm

#QA Model Function
def qa_bot():
    """Build the complete question-answering chain.

    Creates CPU-based MiniLM sentence embeddings, loads the FAISS index at
    ``DB_FAISS_PATH`` with them, and wires the LLM from ``load_llm`` and the
    prompt from ``set_custom_prompt`` into a RetrievalQA chain.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
    )
    vector_db = FAISS.load_local(DB_FAISS_PATH, embedder)
    return retrieval_qa_chain(load_llm(), set_custom_prompt(), vector_db)

#output function
def final_result(query):
    """Run ``query`` through a freshly built QA chain and return its result.

    A new chain is created with ``qa_bot`` on every call and invoked with
    ``{'query': query}``.
    """
    chain = qa_bot()
    return chain({'query': query})

#chainlit code
@cl.on_chat_start
async def start():
    """Initialise a chat session.

    Builds the QA chain, sends a placeholder message, replaces its content with
    the welcome text, and stores the chain in the user session under "chain".
    """
    qa_chain = qa_bot()
    greeting = cl.Message(content="Starting the bot...")
    await greeting.send()
    # Reuse the same message object so the placeholder is replaced in place.
    greeting.content = "Hi, Welcome to Medical Bot. What is your query?"
    await greeting.update()

    cl.user_session.set("chain", qa_chain)

@cl.on_message
async def main(message: cl.Message):
    """Answer an incoming chat message with the session's QA chain.

    The chain stored under "chain" is called asynchronously with the message
    text. The reply is the chain's "result" followed by the string form of the
    source documents, or by a "No sources found" line when there are none.
    """
    qa_chain = cl.user_session.get("chain")
    handler = cl.AsyncLangchainCallbackHandler(
        stream_final_answer=True, answer_prefix_tokens=["FINAL", "ANSWER"]
    )
    handler.answer_reached = True
    result = await qa_chain.acall(message.content, callbacks=[handler])

    reply = result["result"]
    source_docs = result["source_documents"]
    suffix = "\nSources:" + str(source_docs) if source_docs else "\nNo sources found"

    await cl.Message(content=reply + suffix).send()