## LangChain: Q&A over Documents

Sources: [Here](https://learn.deeplearning.ai/langchain/lesson/5/question-and-answer),
[here](https://betterprogramming.pub/building-a-multi-document-reader-and-chatbot-with-langchain-and-chatgpt-d1864d47e339) and 
[here](https://python.langchain.com/docs/integrations/vectorstores/faiss)

## Setting Up

In [0]:
# Install (and upgrade, via -U) the libraries used throughout this notebook.
# LLM client and LangChain packages.
!pip install -qU openai
!pip install -qU langchain 
!pip install -qU langchain-openai
# Embedding model dependencies.
!pip install -qU InstructorEmbedding
!pip install -qU transformers
# Vector stores and tokenizer.
!pip install -qU chromadb
!pip install -qU faiss-cpu
!pip install -qU tiktoken
# Pinned versions (no -U): pydantic is pinned because of the error discussed in the linked question.
!pip install -q pydantic==1.10.9  #https://stackoverflow.com/questions/76934579/pydanticusererror-if-you-use-root-validator-with-pre-false-the-default-you
!pip install -q urllib3==1.26.18
!pip install -q requests==2.28.1
!pip install -qU SQLAlchemy
# Document loader dependencies (the commented lines are optional alternatives).
#!pip install -qU docarray
#!pip install -qU python-docx
!pip install -qU pypdf
!pip install -qU docx2txt
#!pip install -qU unstructured[pdf]
!pip install -qU lark

# Restart the Python process (Databricks utility) so the packages installed above are the ones imported next.
dbutils.library.restartPython()

In [0]:
import os
import sys 
import glob
from pathlib import Path
from IPython.display import display, Markdown
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import tiktoken
#from funcy import lcat, lmap, linvoke

## Langchain LLM Objects
import openai
from langchain.llms import OpenAI
from langchain_openai import AzureOpenAI
from langchain.chat_models import ChatOpenAI
from langchain_openai import AzureChatOpenAI

## Langchain Prompt Templates
from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate

## Langchain Chains 
#from langchain.chains import ConversationChain
#from langchain.chains import LLMChain
#from langchain.chains import ConversationChain
from langchain.chains import RetrievalQA
from langchain.chains import ConversationalRetrievalChain
#from langchain.chains.mapreduce import MapReduceChain
#from langchain.chains.summarize import load_summarize_chain
from langchain.chains.question_answering import load_qa_chain
#from langchain.chains import SimpleSequentialChain
#from langchain.chains import SequentialChain
#from langchain.chains.router import MultiPromptChain
#from langchain.chains.router.llm_router import LLMRouterChain, RouterOutputParser

## Langchain Output Parsers 
#from langchain.output_parsers import ResponseSchema
#from langchain.output_parsers import StructuredOutputParser

## Langchain Memory 
#from langchain.memory import ConversationBufferMemory
#from langchain.memory import ConversationBufferWindowMemory
#from langchain.memory import ConversationTokenBufferMemory
#from langchain.memory import ConversationSummaryBufferMemory

## Langchain Text Splitters
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.text_splitter import TokenTextSplitter
#from langchain.text_splitter import MarkdownHeaderTextSplitter

## Langchain Document Object and Loaders
#from docx import Document
from langchain.docstore.document import Document
from langchain.schema import Document as LangchainDocument
from langchain.document_loaders import CSVLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

## Langchain Vector Databases
#from langchain.vectorstores import DocArrayInMemorySearch
#from langchain.vectorstores.base import VectorStore
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

## Langchain  Embedding Models
#from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings
from InstructorEmbedding import INSTRUCTOR

## Langchain retrievers
from langchain.retrievers.self_query.base import SelfQueryRetriever ## Error
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever

# --- Local cache for downloaded Hugging Face models ---
# NOTE(review): this variable is set after the libraries above were imported;
# confirm that they read it lazily, otherwise it may have no effect.
os.environ["TRANSFORMERS_CACHE"] = "/Workspace/ds-academy-research/Models/"

# --- Azure OpenAI connection settings ---
azure_endpoint = "https://rg-rbi-aa-aitest-dsacademy.openai.azure.com/"
# Alternative endpoint: "https://chatgpt-summarization.openai.azure.com/"
deployment_name = "model-gpt-35-turbo"
openai_model_name = "gpt-35-turbo"

openai.api_type = "azure"
openai.api_version = "2023-07-01-preview"
# The key is read from the environment; a missing variable raises KeyError here.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Completion-style client.
client = AzureOpenAI(
    azure_endpoint=azure_endpoint,
    api_version=openai.api_version,
    api_key=openai.api_key,
)

# Chat model used by every chain in this notebook.
chat = AzureChatOpenAI(
    deployment_name=deployment_name,
    azure_endpoint=azure_endpoint,
    openai_api_type=openai.api_type,
    openai_api_version=openai.api_version,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

### Examining files in the examples folder  
It may be necessary to change the default folder for your documents.

In [0]:
# Folder holding the sample documents; change it to point at your own files.
fullpath = "/Workspace/ds-academy-research/Docs/Generic/"

# Keep only the PDF files, in alphabetical order, and print one name per line.
docs = sorted(name for name in os.listdir(fullpath) if name.endswith(".pdf"))
for pdf_name in docs:
    print(pdf_name)

### LangChain: Creating a Document Object

The simplest Q&A chain implementation we can use is the load_qa_chain.  
It loads a chain that allows you to pass in all of the documents you would like to query against using your LLM. 

![](https://miro.medium.com/v2/resize:fit:640/format:webp/1*rF3UlC7vWiVFGlXFNZ1XHw.png)  

### Querying a single PDF document

Now we will instantiate the PDF loader, load one small document, and create a list of LangChain `Document` objects (one per page).

Info about the page splitting [here](https://datascience.stackexchange.com/questions/123076/splitting-documents-with-langchain-when-a-sentence-straddles-the-a-page-break)  
You can also define your own document splitter using `pdf_loader.load_and_split()`

In [0]:
# Load one PDF (selected by its position in the sorted `docs` list) into a
# list of LangChain Document objects, one per page.
docid = 0
pdf_path = fullpath + docs[docid]
print(f'Loading Document: {pdf_path}')
pdf_loader = PyPDFLoader(pdf_path)
documents = pdf_loader.load()
print(f"We have {len(documents)} pages in the pdf file")

print(type(documents))
# `docid` selects a file, not a page: always inspect the first page here so the
# cell keeps working when a different file index is chosen. The guard avoids
# an IndexError when the loader returns no pages.
if documents:
    print(type(documents[0]))

In [0]:
# Ask one question against the single PDF loaded above: every loaded page is
# handed to the chain together with the question.
query = 'What is the document about?'
chain = load_qa_chain(llm=chat, verbose=False)
response = chain.run(question=query, input_documents=documents)
print(response)

##### This method is all good when we only have a short amount of information to send in the [context size of our model](https://platform.openai.com/docs/models/overview).  
However, most LLMs will have a limit on the amount of information that can be sent in a single request. So we will not be able to send all the information in our documents within a single request.  
To overcome this, we need a smart way to send only the information we think will be relevant to our question/prompt.  


### Interacting with documents using Embeddings

We can use embeddings and vector stores to send only relevant information to our prompt.  
The steps we will need to follow are:

+ Split all the documents into small chunks of text
+ Pass each chunk of text into an embedding transformer to turn it into an embedding
+ Store the embeddings and related pieces of text in a vector store, instead of a list of Langchain document objects

![](https://miro.medium.com/v2/resize:fit:828/format:webp/1*FWwgOvUE660a04zoQplS7A.png)  

#### Creating a list of Document Objects for all PDF files in the examples folder

In [0]:
# Load every PDF in the documents folder and collect all pages in one list.
documents = []
pdf_names = [name for name in os.listdir(fullpath) if name.endswith('.pdf')]
for filename in pdf_names:
    print(f"Ingesting document {filename}")
    documents += PyPDFLoader(fullpath + filename).load()
print(f"We have {len(documents)} pages from all the pdf files")

#### Splitting each document into small chunks

When we load documents, the splitting is done by pages. We can change that using Splitters, to avoid the limitations on the LLM contexts  

LangChain offers different text splitters:
+ RecursiveCharacterTextSplitter: Divides the text into fragments by trying a list of separators in order. If a fragment is still too large, it moves on to the next separator in the list. It offers flexibility by allowing you to define the separators and the fragment size. By default, it tries to split on “\n\n”, “\n”, “ ”, and “”.
+ CharacterTextSplitter: Similar to the RecursiveCharacterTextSplitter, but it splits on a single separator (“\n\n” by default), which you can customize for more specific division.
+ RecursiveTextSplitter: Unlike the previous ones, the RecursiveTextSplitter divides text into fragments based on words or tokens instead of characters. This provides a more semantic view and is ideal for content analysis rather than structure.
+ TokenTextSplitter: Uses a tokenizer (OpenAI's `tiktoken` by default) to split text into fragments based on tokens, allowing for precise segmentation that matches how the LLM counts its context, ideal for advanced natural language processing applications.
+ And some more specific ones


We will split the data into chunks of 1,000 characters, with an overlap of 200 characters between the chunks, which helps to give better results and contain the context of the information between chunks

In [0]:
# Split the page-level documents into chunks of at most 1000 characters with
# a 200-character overlap between consecutive chunks.
# Separators are tried in order: paragraph break, line break, sentence end,
# space, and finally the empty string (character-level split).
# The sentence separator is the plain string ". ": separators are treated as
# literal text by default, so the former "\. " (backslash included) would
# practically never match, and "\." is an invalid escape sequence in a
# regular string literal.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
)
chunked_documents = text_splitter.split_documents(documents)
print(len(chunked_documents))

#### Choosing a Model to create Embeddings of each chunk:  

We could create embeddings with many different transformers. 
We could have used **OpenAIEmbeddings**, but then we would have to pay for each token sent to the API.  
In our case, we will create our vectorDB using **InstructEmbeddings** transformer from **[Hugging Face](https://huggingface.co/hkunlp/instructor-xl)** to provide embeddings from our text chunks.  

In [0]:
# Paid alternative (OpenAI embeddings), kept for reference:
# openai_embeddings = OpenAIEmbeddings(deployment="model-text-embedding-ada-002", chunk_size = 1)

# Instructor embedding model from Hugging Face, used for both documents and queries.
instruct_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl",
    query_instruction="Represent the query for retrieval: ",
)

### Creating a Vector Database

![Vector Databases](https://miro.medium.com/v2/resize:fit:828/format:webp/1*vIkxM-u3zrkHMZuIRURc0A.png)

There are [many Vector Databases](https://thenewstack.io/top-5-vector-database-solutions-for-your-ai-project/)  products, both paid and open source, that could be used. 

#### Setting Chroma Vector DB  

Deleting previous databases from the folder we have created to store the files, and loading all PDF documents into the Vector Database   
##### We have first tried [ChromaDB](https://www.trychroma.com/), but some incompatibilities with the current versions of Python motivated us to try [FAISS](https://faiss.ai/) (from Meta)  

##### Commented Code - ChromaDB
```
files = glob.glob('/Workspace/ds-academy-research/VectorDB_Chroma/*')

for f in files:
    os.remove(f)

chromadb = Chroma.from_documents(chunked_documents,
                                 #embedding=openai_embeddings,
                                 embedding_function=instruct_embeddings,
                                 persist_directory='/Workspace/ds-academy-research/VectorDB_Chroma/'
)
chromadb.persist()
```

#### Setting FAISS Vector DB
Deleting previous databases from the folder we have created to store the files, and loading all PDF documents into the Vector Database  

In [0]:
# Remove whatever a previous run left in the FAISS folder.
for stale_file in glob.glob('/Workspace/ds-academy-research/VectorDB_FAISS/*'):
    os.remove(stale_file)

# Embed every chunk and build the FAISS index in memory.
faissdb = FAISS.from_documents(
    chunked_documents,
    embedding=instruct_embeddings,
)

# Persist the index so that later cells can reload it from disk.
faissdb.save_local('/Workspace/ds-academy-research/VectorDB_FAISS/')


### Retrieval Tasks  

Once we have loaded our content as embeddings into the vector store, we are back to a similar situation as to when we only had one PDF to interact with. As in, we are now ready to pass information into the LLM prompt.  
However, instead of passing in all the documents as a source for our context to the chain, as we did initially, we will pass in our vector store as a source/retriever, and the chain will retrieve only the relevant text based on our question and send that information only inside the LLM prompt.

![](https://miro.medium.com/v2/resize:fit:828/format:webp/1*leoW-Pn0ohWalrUBbzdidA.png)

First we will only use the RetrievalQA chain, which will use our vector store as a source for the context information.

Again, the chain will wrap our prompt with some text, instructing it to only use the information provided for answering the questions.  
So the prompt we end up sending to the LLM is something that looks like this:

    Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to
    make up an answer.

    {context} // i.e the chunks of text retrieved deemed to be most semantically
              // relevant to our question

    Question: {query} // i.e our actual query
    Helpful Answer:

#### Loading the recently created Vector Database object

In [0]:
# Reload the saved index from disk, using the same embedding model for queries.
docsearch = FAISS.load_local(
    "/Workspace/ds-academy-research/VectorDB_FAISS/",
    instruct_embeddings,
)
# Number of entries in the index -> docstore id mapping.
indexed_entries = len(docsearch.index_to_docstore_id)
print(indexed_entries)

#### Retrieving chunks of Documents using Similarity Search

In [0]:
# Plain similarity search with the default number of results.
# Tip: print(result[0].page_content) shows only the text of the best match.
query = "Natural Language Processing"
result = docsearch.similarity_search(query)
for hit in result:
    print(hit, end="\n\n")

#### Search using a score function and a maximum number of documents in return

In [0]:
# Same search, limited to two results; each item is printed together with its score.
query = "Natural Language Processing"
result = docsearch.similarity_search_with_score(query, k=2)
for scored_hit in result:
    print(scored_hit, end="\n\n")

#### Addressing Diversity: Maximum marginal relevance

`Maximum marginal relevance` strives to achieve both relevance to the query *and diversity* among the results.

In [0]:
# Maximum-marginal-relevance search: two results balancing relevance and diversity.
query = "Natural Language Processing"
result = docsearch.max_marginal_relevance_search(query, k=2)
for hit in result:
    print(hit, end="\n\n")

#### Addressing Specificity: working with metadata

To address specificity (for example, restricting the search to one particular document), many vectorstores support operations on `metadata`.

`metadata` provides context for each embedded chunk.

In [0]:
# Restrict the similarity search to chunks coming from a single PDF.
# The documents were loaded from `fullpath + filename`, and the filter is
# compared against each chunk's "source" metadata, so the filter value is built
# from the same folder. The previous hard-coded path pointed one level above
# `fullpath` and therefore could not match any indexed chunk.
# NOTE(review): replace "37pagesPDF.pdf" with a file that exists in your folder.
query = "Natural Language Processing"
source_filter = {"source": fullpath + "37pagesPDF.pdf"}
result = docsearch.similarity_search(query, k=3, filter=source_filter)
for r in result:
    print(r)
    print()

#### Addressing Specificity: working with metadata using self-query retriever

But we have an interesting challenge: we often want to infer the metadata from the query itself.

To address this, we can use `SelfQueryRetriever`, which uses an LLM to extract:
 
1. The `query` string to use for vector search
2. A metadata filter to pass in as well

Most vector databases support metadata filters, so this doesn't require any new databases or indexes.

##### Disclaimer: Not implemented for FAISS

In [0]:
# Metadata fields the LLM may use when it builds a filter from the question.
metadata_field_info = [
    AttributeInfo(
        name="source",
        type="string",
        description="The answer should come from `/Workspace/ds-academy-research/Docs/13pagesPDF.pdf` or `/Workspace/ds-academy-research/Docs/22pagesPDF.pdf`",
    ),
    AttributeInfo(
        name="page",
        type="integer",
        description="the page extracted",
    ),
]

# Short description of what the indexed documents contain.
document_content_description = "Natural Language Processing"

# NOTE: this notebook flags self-querying as not implemented for FAISS, so
# this call is expected to fail with the FAISS-backed `docsearch`.
retriever = SelfQueryRetriever.from_llm(
    llm=chat,
    vectorstore=docsearch,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

question = "What did they say about NLP?"
docs = retriever.get_relevant_documents(question)

### Q&A Retrieval from Chain  

![Methods](https://miro.medium.com/v2/resize:fit:720/format:webp/1*0vTWjqREMHkdman0WoLqzQ.png)

It is much more efficient to query the document with the Q&A chain.  
Now we create a Retrieval chain using the Vector Database object:

#### LangChain has four types of Q&A methods.  

![](https://miro.medium.com/v2/resize:fit:4800/format:webp/1*ATLDF3UAPoMy3UOvzS2g5g.png)



#### [Stuffing Chain](https://medium.com/@minh.hoque/what-are-llm-chains-671b84103ba9)
The Stuffing chain serves as a solution for scenarios where the context length of the LLM is inadequate to handle extensive documents or a substantial amount of information. In such cases, a large document can be divided into smaller segments, and semantic search techniques can be employed to retrieve relevant documents based on the query. These retrieved documents are then “stuffed” into the LLM context, allowing for the generation of a response.

Benefits:

+ Consolidation of multiple documents: The Stuffing chain allows the aggregation of several relevant documents, overcoming the context length limitation of LLMs for large documents.
+ Comprehensive information processing: By leveraging multiple documents, the chain can generate more comprehensive and relevant answers.

Disadvantages:

+ Increased complexity: Retrieving relevant documents requires a good semantic search and a vector database.
+ Potential loss of contextual coherency: Since we are retrieving N documents, the LLM might not have all relevant context to generate a cohesive answer.

Use Cases:

+ Document Retrieval Question Answering: Utilizing the Stuffing chain, document chunks retrieved from the larger document can be effectively leveraged to provide accurate answers to your questions. For example, suppose you have a lengthy legal document and need to find specific answers to legal questions. By using the Stuffing chain, you can break down the document into smaller chunks, retrieve relevant chunks based on the question, and utilize the information within those chunks to generate accurate answers.
+ Complex Question Answering: When answering complex questions that require information from diverse sources, the Stuffing chain can provide more comprehensive and accurate responses. For instance, imagine you have a research project that requires answering complex scientific queries. The Stuffing chain allows you to divide relevant scientific papers into smaller chunks, retrieve the necessary information from these chunks, and synthesize it to provide a thorough and precise response to the complex question at hand.

In [0]:
# "stuff" strategy: the retrieved chunks are placed together into one prompt.
query = "What is an extreme outlier?"
qa_chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="stuff",
    # To retrieve more chunks use: docsearch.as_retriever(search_kwargs={'k': 7})
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
)

result = qa_chain(query)
answer = result['result']
print(answer)


#### [Map-Reduce Chain](https://medium.com/@minh.hoque/what-are-llm-chains-671b84103ba9)
The Map-Reduce chain enables the iteration over a list of documents, generating individual outputs for each document, which can later be combined to produce a final result. This chain is useful for tasks that involve processing documents in parallel and then aggregating the outputs. 

Benefits:

+ Parallel processing: The Map-Reduce chain allows for parallel execution of the language model on individual documents, improving efficiency and reducing processing time.
+ Scalability: The chain can handle large collections of documents by distributing the processing load across multiple iterations.
+ Enhanced information extraction: By generating individual outputs for each document, the chain can extract specific information that contributes to a more comprehensive final result.  

Disadvantages:

+ Complexity in output aggregation: Combining the outputs of multiple iterations requires careful handling to ensure coherency and meaningful synthesis.
+ Potential redundancy: In some cases, the individual outputs of the Map-Reduce chain may contain redundant information, necessitating further post-processing steps.

Use Cases:

+ Multiple document summarization: The Map-Reduce chain can be used to generate summaries for many documents and then to combine the singular summaries to create a final comprehensive summary for the whole group of documents. For example, imagine you have a collection of research papers on a particular topic. By employing the Map-Reduce chain, you can generate summaries for each research paper, and finally merge the individual summaries to produce a comprehensive summary that captures the key information from the entire collection of papers. This approach enables efficient and accurate summarization of large volumes of documents.

In [0]:
# "map_reduce" strategy: one output per retrieved chunk, combined into a final answer.
query = "What is an extreme outlier?"
qa_chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="map_reduce",
    # To retrieve more chunks use: docsearch.as_retriever(search_kwargs={'k': 7})
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
)

result = qa_chain(query)
answer = result['result']
print(answer)

#### [Refine Chain](https://medium.com/@minh.hoque/what-are-llm-chains-671b84103ba9)  
The Refine chain focuses on iterative refinement of the output by feeding the output of one iteration into the next, aiming to enhance the accuracy and quality of the final result.

Benefits:

+ Continuous improvement: The Refine chain allows for progressive refinement of the output by iteratively updating and enhancing the information.
+ Enhanced accuracy: By refining the output in each iteration, the chain can improve the accuracy and relevance of the final result.

Disadvantages:

+ Increased computational resources: The iterative nature of the Refine chain may require additional computational resources compared to non-iterative approaches.
+ Longer processing time: Each iteration adds to the overall processing time, which may be a consideration when real-time or near-real-time responses are required.

Use Cases:

+ Long-form text generation: The Refine chain proves exceptionally valuable in the creation of extensive text compositions, such as essays, articles, or stories, where the iterative refinement process greatly enhances coherence and readability. For instance, envision interacting with a substantial research paper and progressively employing the LLM to craft an abstract, refining it with each iteration to achieve an optimal outcome.
+ Answer synthesis: The Refine chain demonstrates its prowess in synthesizing answers derived from multiple sources or generating comprehensive responses. Through iterative refinement, the chain progressively improves the accuracy and comprehensiveness of the final answer. This capability is especially advantageous when each retrieved document contributes crucial context to the answer generation process.

In [0]:
# "refine" strategy: the answer is improved iteratively, one retrieved chunk at a time.
query = "What is an extreme outlier?"
qa_chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="refine",
    # To retrieve more chunks use: docsearch.as_retriever(search_kwargs={'k': 7})
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
)

result = qa_chain(query)
answer = result['result']
print(answer)

#### [Map Rerank](https://api.python.langchain.com/en/latest/chains/langchain.chains.combine_documents.map_rerank.MapRerankDocumentsChain.html#)  

Combining documents by mapping a chain over them, then reranking results.
This algorithm calls an LLMChain on each input document. The LLMChain is expected to have an OutputParser that parses the result into both an answer (answer_key) and a score (rank_key). The answer with the highest score is then returned.

In [0]:
# "map_rerank" strategy: each chunk yields a scored answer; the best-scored one is returned.
query = "What is an extreme outlier?"
qa_chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="map_rerank",
    # To retrieve more chunks use: docsearch.as_retriever(search_kwargs={'k': 7})
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
)

result = qa_chain(query)
answer = result['result']
print(answer)

#### Adding Chat History
Now, if we want to take things one step further, we can also make it so that our chatbot will remember any previous questions.

Implementation-wise, all that happens is that on each interaction with the chatbot, all of our previous conversation history, including the questions and answers, needs to be passed into the prompt. That is because the LLM does not have a way to store information about our previous requests, so we must pass in all the information on every call to the LLM.

Fortunately, LangChain also has a set of classes that let us do this out of the box. This is called the ConversationalRetrievalChain, which allows us to pass in an extra parameter called chat_history , which contains a list of our previous conversations with the LLM.

In [0]:
# Retrieval chain that also accepts the previous conversation turns.
history_aware_retriever = docsearch.as_retriever()
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=chat,
    retriever=history_aware_retriever,
    return_source_documents=True,
)

The chain run command accepts the chat_history as a parameter. We must manually build up this list based on our conversation with the LLM.  
The chain does not do this out of the box, so for each question and answer, we will build up a list called chat_history , which we will pass back into the chain run command each time.

In [0]:
# Interactive loop: read a prompt, answer it with the retrieval chain and
# remember the exchange so that follow-up questions have context.
chat_history = []
exit_commands = ("exit", "quit", "q")
while True:
    # Blocks until the user types something.
    query = input('Prompt: ')
    if query in exit_commands:
        # Way out of the loop.
        print('Exiting')
        break
    # The chain receives the question plus every previous (question, answer)
    # pair; the relevant chunks are retrieved from the vector store.
    result = qa_chain({'question': query, 'chat_history': chat_history})
    answer = result['answer']
    print('Answer: ' + answer)
    # Record this turn before asking for the next prompt.
    chat_history.append((query, answer))

In [0]:
# Display the accumulated (question, answer) pairs as the cell output.
chat_history

### Interacting With Multiple Document types  
If you remember, the Documents were only PDF files. To increase our base of documents to interact with, we can just add more document types to this list.

Now we can simply iterate over all of the files in that folder, and convert the information in them into Documents. From then onwards, the process is the same as before. We just pass our list of documents to the text splitter, which passes the chunked information to the embeddings transformer and vector store.

So, in our case, we want to be able to handle pdfs, Microsoft Word documents, and text files. We will iterate over the docs folder, handle files based on their extensions, use the appropriate loaders for them, and add them to the documents' list, which we then pass on to the text splitter.

##### First let's create Langchain Document objects for all different files in our storage folder

In [0]:
# Load every supported file in the documents folder into one list of
# LangChain Document objects, choosing the loader from the file extension.
fullpath = "/Workspace/ds-academy-research/Docs/Generic/"

# Loader class for each supported extension.
# NOTE(review): '.doc' is routed to Docx2txtLoader as before; confirm that it
# can actually read legacy binary .doc files in your environment.
loaders_by_extension = {
    '.pdf': PyPDFLoader,
    '.docx': Docx2txtLoader,
    '.doc': Docx2txtLoader,
    '.txt': TextLoader,
}

documents = []
# Sorted so that the ingestion order does not depend on the file system.
for filename in sorted(os.listdir(fullpath)):
    loader_class = next(
        (cls for ext, cls in loaders_by_extension.items() if filename.endswith(ext)),
        None,
    )
    if loader_class is None:
        # Say so explicitly instead of announcing an ingestion that never happens.
        print(f"Skipping unsupported file {filename}")
        continue
    print(f"Ingesting document {filename}")
    loader = loader_class(fullpath + filename)
    documents.extend(loader.load())

#### Checking How many objects were created:

In [0]:
# Total number of loaded Document objects, then the metadata of the first ten.
total_documents = len(documents)
print(total_documents)
for loaded_doc in documents[:10]:
    print(loaded_doc.metadata)

#### Splitting the texts (as done before): 

In [0]:
# Alternative splitter, kept for reference:
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)

# NOTE(review): these settings (no overlap, space tried first) differ from the
# splitter used for the first index; confirm that this is intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=[" ", ",", "\n"],
)
new_chunked_documents = text_splitter.split_documents(documents)

# Number of chunks, then the metadata of the first five.
print(len(new_chunked_documents))
for chunk in new_chunked_documents[:5]:
    print(chunk.metadata)

##### The vector database does not distinguish which documents were indexed before, so we have to take care when ingesting to avoid duplicates  
##### We could either delete the old VectorDB...

In [0]:
#files = glob.glob('/Workspace/ds-academy-research/VectorDB/FAISS/*')
#for f in files:
#    os.remove(f)

##### Or check for duplicates, and only ingest new documents  

In [0]:
# Compare the chunks already indexed with the freshly created ones and keep
# only the chunks whose source file is not in the index yet.
print(len(chunked_documents))
print(len(new_chunked_documents))

print(chunked_documents[0].metadata)
print(new_chunked_documents[0].metadata)

# Build the set of known sources once; the previous version rebuilt the full
# list of sources for every new chunk.
indexed_sources = {h.metadata["source"] for h in chunked_documents}
delta = [d for d in new_chunked_documents if d.metadata["source"] not in indexed_sources]
print(len(delta))
for d in delta[0:5]:
    print(d.metadata)

##### Now we are going to add only the new documents to the previously created Vector Database index.

In [0]:
# Add only the new chunks to the existing FAISS index and persist it again.
print(f"We have {len(faissdb.docstore._dict)} documents in the collection")
if delta:
    faissdb.add_documents(delta, embedding=instruct_embeddings,)
else:
    # Nothing new was found (e.g. the folder only holds files that were
    # already indexed); skip the call instead of adding an empty batch.
    print("No new documents to add")

print(f"We have {len(faissdb.docstore._dict)} documents in the collection")
faissdb.save_local('/Workspace/ds-academy-research/VectorDB_FAISS/')

### Chat with our documents from multiple types via LLM 

In [0]:
# Reload the updated index from disk.
docsearch = FAISS.load_local(
    "/Workspace/ds-academy-research/VectorDB_FAISS/",
    instruct_embeddings,
)
# Number of entries in the index -> docstore id mapping.
indexed_entries = len(docsearch.index_to_docstore_id)
print(indexed_entries)

In [0]:
# Conversational chain over the index that now holds every document type.
pdf_qa = ConversationalRetrievalChain.from_llm(
    chat,
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
    verbose=False,
)

# Banner.
print("---------------------------------------------------------------------------------")
print('Welcome to the DocBot. You are now ready to start interacting with your documents')
print('---------------------------------------------------------------------------------')

# Chat loop: stops on an exit command, ignores empty prompts and keeps the
# (question, answer) history that is passed back to the chain on every turn.
chat_history = []
exit_commands = ("exit", "quit", "q", "f")
while True:
    query = input("Prompt: ")
    if query in exit_commands:
        print('Exiting')
        break
    if not query:
        continue
    result = pdf_qa({"question": query, "chat_history": chat_history})
    answer = result["answer"]
    print("Answer: " + answer)
    chat_history.append((query, answer))

In [0]:
# Display the accumulated (question, answer) pairs as the cell output.
chat_history

### (Bonus) Operations among Vector Databases

##### You can merge many FAISS vector indexes

In [0]:
# Five single-sentence indexes; the last sentence repeats the first one.
fruit_facts = [
    "Oranges are orange or yellow when ripe",
    "Grapes can be red, purple or green",
    "Watermelons are green outside, and red inside",
    "Lemons are green or yellow",
    "Oranges are orange or yellow when ripe",
]
db1, db2, db3, db4, db5 = [
    FAISS.from_texts([fact], embedding=instruct_embeddings)
    for fact in fruit_facts
]

In [0]:
# Show the content stored in each of the small indexes.
for small_index in (db1, db2, db3, db4, db5):
    print(small_index.docstore._dict)

In [0]:
# Merge the other indexes into db1, in order, then display the merged content.
for other_index in (db2, db3, db4, db5):
    db1.merge_from(other_index)
db1.docstore._dict

In [0]:
# Query the merged index and print each hit with its score.
results_with_scores = db1.similarity_search_with_score("red and green")
for hit, hit_score in results_with_scores:
    print(f"Content: {hit.page_content}, Score: {hit_score}")

##### Another useful thing is to add documents with metadata

In [0]:
# (text, topic) pairs turned into documents carrying a "topic" metadata field.
topic_facts = [
    ("Orange is orange", "Fruit"),
    ("Lemon is green", "Fruit"),
    ("Watermelon is green", "Fruit"),
    ("Grapes are red or green", "Fruit"),
    ("The sun is orange", "Astronomy"),
    ("Mars is red", "Astronomy"),
    ("The Earth is blue", "Astronomy"),
    ("Our planet is Earth", "Astronomy"),
]
list_of_documents = [
    LangchainDocument(page_content=text, metadata={"topic": topic})
    for text, topic in topic_facts
]
db = FAISS.from_documents(list_of_documents, embedding=instruct_embeddings)

First we make the query without filtering:

In [0]:
# Unfiltered query: hits from any topic may be returned.
results_with_scores = db.similarity_search_with_score("orange")
for hit, hit_score in results_with_scores:
    print(f"Content: {hit.page_content}, Metadata: {hit.metadata}, Score: {hit_score}")

Now we make the same query call but we filter for only topic = "Fruit"

In [0]:
# Same query, restricted to topic == "Fruit".
# The filter is handed to the vector store (as in the earlier `source` filter
# example) instead of discarding hits after the search: post-filtering the
# unfiltered top-k can return fewer matching documents than the index holds.
results_with_scores = db.similarity_search_with_score("orange", filter={"topic": "Fruit"})
for doc, score in results_with_scores:
    print(f"Content: {doc.page_content}, Metadata: {doc.metadata}, Score: {score}")