## Chat with documents using LangChain

### Install the required libraries

In [None]:
!pip install openai
!pip install python-dotenv
!pip install langchain
!pip install pypdf
!pip install chromadb
!pip install tiktoken
!pip install lark

### Setup OpenAI

In [None]:
import openai
import os
from dotenv import load_dotenv, find_dotenv

# Locate the local .env file and load its variables into the process environment.
_ = load_dotenv(
    find_dotenv()
)

# Indexing os.environ (instead of .get) raises KeyError right here
# when OPENAI_API_KEY is not defined.
openai.api_key = os.environ["OPENAI_API_KEY"]

### Instantiate the LLM

In [None]:
from langchain.chat_models import ChatOpenAI

# Chat model shared by every retriever and chain below.
# No model name is passed, so the library default applies
# (gpt-3.5-turbo per the original note — confirm for the installed langchain version).
llm = ChatOpenAI(temperature=0)

### Load the document

In [None]:
from langchain.document_loaders import PyPDFLoader

# Read docs/progit.pdf; `pages` holds whatever loader.load() returns
# (presumably one document per PDF page — confirm).
loader = PyPDFLoader("docs/progit.pdf")
pages = loader.load()

In [None]:
# Number of items returned by loader.load().
len(pages)

## Split the document into chunks

`RecursiveCharacterTextSplitter` is the recommended text splitter for generic text. It is parameterized by a list of separator characters and tries to split on them in order until the chunks are small enough. The default list is `["\n\n", "\n", " ", ""]`. This has the effect of keeping paragraphs (and then sentences, and then words) together for as long as possible, since those are generally the most strongly semantically related pieces of text.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Consecutive chunks share up to `chunk_overlap` units so text cut at a
# boundary still appears whole in one of them (sizes are measured by the
# splitter's length function — characters by default, confirm).
r_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# Split every loaded page; the result is what gets embedded later on.
chunks = r_text_splitter.split_documents(pages)

In [None]:
# Number of chunks produced by the splitter.
len(chunks)

In [None]:
# Preview chunks 31-34 (1-based labels, matching the original output).
# Slicing instead of indexing means a document that yields fewer than
# 34 chunks prints what exists rather than raising IndexError.
for chunk_number, chunk in enumerate(chunks[30:34], start=31):
    print(f"chunk_{chunk_number}: {chunk}\n")

## Create Embeddings and store in a Vector Database

In [None]:
import shutil

# Directory in which the Chroma vector store is persisted.
db_dir = "vectordb/chroma"

# Remove any store left over from a previous run (presumably so that re-running
# the notebook does not add the same chunks twice — confirm).
# shutil.rmtree replaces the original `!rm -rf`: it reuses db_dir instead of
# repeating the path literal and does not depend on a POSIX shell.
# ignore_errors=True keeps the "no error if the directory is missing" behaviour.
shutil.rmtree(db_dir, ignore_errors=True)

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding model handed to the vector store below; constructed with library defaults.
embedding = OpenAIEmbeddings()

In [None]:
from langchain.vectorstores import Chroma
# Build the Chroma store from the chunks using the embedding model,
# with db_dir as its on-disk location.
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
    persist_directory=db_dir,
)

# Write the store out to persist_directory.
vector_db.persist()

In [None]:
# Entry count of the underlying collection (expected to match len(chunks) — confirm).
# NOTE(review): `_collection` is a private attribute and may change between library versions.
vector_db._collection.count()

## Query and retrieve data

### Similarity Search

In [None]:
question = "who are the authors of this book?"
# Ask for the five most similar chunks.
docs = vector_db.similarity_search(question, k=5)

# Iterate over what was actually returned instead of indexing 0..4, so a
# result list shorter than five does not raise IndexError.
for rank, doc in enumerate(docs, start=1):
    print(f"doc[{rank}]: {doc}\n")

### Maximal Marginal Relevance (MMR)

In [None]:
question = "who are the authors of this book?"
# Same question as the similarity search, retrieved with max-marginal-relevance search.
docs = vector_db.max_marginal_relevance_search(question, k=5)

# Iterate over what was actually returned instead of indexing 0..4, so a
# result list shorter than five does not raise IndexError.
for rank, doc in enumerate(docs, start=1):
    print(f"doc[{rank}]: {doc}\n")

### Self Query Retrieval

In [None]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [None]:
# Short description of the document contents, passed to the self-query retriever.
document_content_description = "A book on Git"

# One AttributeInfo per metadata field, built from (name, description, type) rows.
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type=field_type)
    for field_name, field_description, field_type in (
        (
            "source",
            "The book the chunk is from, it should be from `docs/progit.pdf`",
            "string",
        ),
        ("page", "The page from the book", "integer"),
    )
]

#### Basic initialization

In [None]:
# Self-query retriever built with the default search type (none is passed here).
sq_retriever = SelfQueryRetriever.from_llm(
    llm, vector_db, document_content_description, metadata_field_info, verbose=True
)

In [None]:
# Show which search type the retriever uses when none was requested.
sq_retriever.search_type

In [None]:
# Rebind sq_retriever to a retriever that uses MMR search; this replaces the
# one created in the previous cell.
sq_retriever = SelfQueryRetriever.from_llm(
    llm,
    vector_db,
    document_content_description,
    metadata_field_info,
    verbose=True,
    search_type="mmr",
)

In [None]:
# Should now show the "mmr" value passed to from_llm above.
sq_retriever.search_type

### Retrieve the relevant documents

In [None]:
# The page range in the question is meant to become a metadata filter on `page`
# (assumption based on metadata_field_info — confirm in the verbose output).
question = "What is the main focus of discussion between the pages 100 to 120?"
# Asking for "the 2 main focus ..." instead does not cap the number of returned
# documents here, because this retriever was built without enable_limit.

docs = sq_retriever.get_relevant_documents(question)
# Print only the metadata of each returned document.
for doc in docs:
    print(doc.metadata)

### Retrieve a limited number (`k`) of relevant documents

To have the retriever return only the number of documents requested in the question (`k`), set the `enable_limit` parameter to `True`.

In [None]:
# Baseline: sq_retriever was built without enable_limit, so the "2" in the
# question is not expected to limit how many documents come back.
question = "What is the 2 main focus of discussion between the pages 100 to 120?" 

docs = sq_retriever.get_relevant_documents(question)
# Print only the metadata of each returned document.
for doc in docs:
    print(doc.metadata)

In [None]:
# Same configuration as the MMR self-query retriever, plus enable_limit=True.
# Kept under a separate name so sq_retriever stays available for comparison.
sq_retriever_1 = SelfQueryRetriever.from_llm(
    llm,
    vector_db,
    document_content_description,
    metadata_field_info,
    verbose=True,
    enable_limit=True,
    search_type="mmr",
)

In [None]:
# Same question as in the baseline cell, now sent to the retriever built with enable_limit=True.
question = "What is the 2 main focus of discussion between the pages 100 to 120?"

docs = sq_retriever_1.get_relevant_documents(question)
# Print only the metadata of each returned document.
for doc in docs:
    print(doc.metadata)

## Question and Answer
Pass the chunks retrieved from the vector store to an LLM to produce the final answer to the user's question.

### Using RetrievalQA chain

In [None]:
from langchain.chains import RetrievalQA

# Three alternative ways to build the QA chain. Each assignment overwrites the
# previous one, so when the whole cell runs only the LAST chain (the one using
# the self-query retriever) remains bound to qa_chain.

# Variant 1: the vector store's default retriever.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever=vector_db.as_retriever(),
    return_source_documents=True,
    verbose=True
)

# Variant 2: the vector store's retriever with search_type="mmr".
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever=vector_db.as_retriever(search_type="mmr"),
    return_source_documents=True,
    verbose=True
)

# Variant 3 (the one left in effect): the self-query retriever.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever=sq_retriever,
    return_source_documents=True,
    verbose=True
)

In [None]:
# Search type of the retriever attached to qa_chain (sq_retriever if the cell above ran in full).
qa_chain.retriever.search_type

In [None]:
question = "who are the main authors of this book?"
# The question is passed to the chain under the "query" key.
response = qa_chain({"query": question})

# The answer text is read from the "result" key of the response.
print(response["result"])

In [None]:
# Show the full response (the chain was built with return_source_documents=True,
# so the retrieved sources should be included — confirm).
response

## Chat
### Memory

In [None]:
from langchain.memory import ConversationBufferMemory

# Conversation memory shared with the conversational chain below.
# output_key names the chain output to store; presumably required because the
# chain also returns source documents alongside the answer — confirm.
memory = ConversationBufferMemory(
    output_key="answer",
    memory_key="chat_history",
    return_messages=True,
)

### Using ConversationalRetrievalChain

In [None]:
from langchain.chains import ConversationalRetrievalChain

# NOTE(review): chat_history is not referenced again in the cells shown here;
# the chain is given `memory` instead. Confirm before removing.
chat_history = []

# Conversational chain over the vector store, retrieving with MMR search.
conv_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    memory=memory,
    retriever=vector_db.as_retriever(search_type="mmr"),
    return_source_documents=True,
)

In [None]:
question = "who are the main authors of this book?"
# Only the question is supplied; the chain was constructed with `memory`,
# which is expected to provide the chat history — confirm.
response = conv_chain({"question": question})

# Print the whole response, not just the answer.
print(response)

In [None]:
# Show just the answer from the response.
response["answer"]

In [None]:
# Follow-up question: "them" only makes sense given the previous turn, so this
# presumably exercises the conversation memory — confirm.
question = "please give more details about them."
response = conv_chain({"question": question})

In [None]:
# Print the answer to the follow-up question.
print(response["answer"])

In [None]:
# Inspect what the memory has stored so far.
memory.buffer

In [None]:
# A question unrelated to the book (described as "A book on Git");
# presumably a probe of how the chain handles out-of-scope questions — confirm.
question = "what does NASA do?"
# Stored in `result`, so `response` from the earlier cells is left untouched.
result = conv_chain({"question": question})
print(result["answer"])