# Q&A across documents with LangChain

In [4]:
from langchain.document_loaders import WikipediaLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader

from langchain.vectorstores import Chroma

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_huggingface import HuggingFaceEmbeddings

# from langchain.embeddings import OpenAIEmbeddings # alternative: use this if you don't mind the API costs

## Setting up vector database and embeddings

In [5]:
# Splitter shared by every loader below: chunk_size=500, no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)

# Embedding model used to vectorise the chunks (HuggingFace "all-MiniLM-L6-v2").
# embeddings_model = OpenAIEmbeddings()  # alternative, if you don't mind the API costs
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Chroma collection "tourist_info", embedding its documents with the model above.
vector_db = Chroma(
    collection_name="tourist_info",
    embedding_function=embeddings_model,
)

In [6]:
# Source 1 - Wikipedia: load the results for the query "Paestum", split them
# with the shared splitter and add the chunks to the vector store.
# The cell's output is the list of ids returned by add_documents.
wikipedia_loader = WikipediaLoader(query="Paestum")
wikipedia_chunks = text_splitter.split_documents(documents=wikipedia_loader.load())
vector_db.add_documents(documents=wikipedia_chunks)

['b18091cd-fad4-4009-8fc4-d165e05483b2',
 'b41b4421-66e6-473b-9676-c83c3bfe0974',
 'd3c44a3f-bdbe-486b-bf7f-ae1480687984',
 'f1112e74-7cb4-4048-9cfb-9d6dbcf30c1d',
 '5e28529a-c7a5-4b03-b06f-722409450831',
 'ebb4e339-bf3c-495d-ac86-3d479615c2b8',
 '26f62faa-70d6-47f3-926c-4fb34b55280e',
 '53a9e90a-fec7-42a1-8e14-114218d17c20',
 'f105a5cf-d6af-47c1-a145-13e644c69f12',
 'fa72d829-a4d8-442c-bd06-3d0b11904082',
 '1cd847f8-0107-4091-876a-c045469f9dc0',
 '11095ab6-fc76-4bb2-8d74-c1b61fc2ec3a',
 'b87ccf7c-c952-481d-af7f-a6fd511bf3c0',
 'a9a43052-2cb3-4b03-9dae-2b52767e45b9',
 'f2fd1574-2bef-42d6-99de-83a147bb998b',
 '1bd1e79c-c065-4e52-a941-5aa8b78b5286',
 '5119fbb9-c6f0-4fbe-a5c1-e491a9a28e02',
 'ee24e8ac-ce24-4518-8652-148b55159cdb',
 '3c2c33a2-468a-4365-b4a6-9365800c6cc9',
 'e4595f8d-efb5-4cc4-a6bc-963237602b7d',
 'ccdfad4c-cce4-4120-85fd-f2242e04c36f',
 'af19c941-cb43-46cc-aeff-885b0b0ed3a2',
 '534e2dcd-ade6-4225-b1df-ec376a65de64',
 '463e7fe8-1d56-4b48-a21a-282f2e3fbb35',
 '05925f90-b617-

In [13]:
#vector_db = Chroma.from_documents(wikipedia_chunks, embeddings_model)

In [7]:
# Source 2 - Word document: load the .docx file, split it with the shared
# splitter and add the chunks to the same vector store.
# The cell's output is the list of ids returned by add_documents.
word_loader = Docx2txtLoader("Paestum/Paestum-Britannica.docx")
word_chunks = text_splitter.split_documents(documents=word_loader.load())
vector_db.add_documents(documents=word_chunks)

['f3b36cae-e61e-4be6-aac6-0882c3e0721e',
 '6249885f-accf-45ee-9cba-4131ca923747',
 'f10af273-dad4-4f5a-be99-b83eb9bfe798',
 'bbcaf38a-b61b-4cbb-9c82-30571d378669',
 '532e58d9-bf94-4b4a-9474-9c7ff03cb3cf',
 '71b5779c-ada1-4785-bc35-7ada49265c4c',
 '2e8191c0-286c-417b-b73d-28d71334faf1',
 '27ecf7d4-ff51-4702-8c73-2d4461a9dc26']

In [11]:
# Source 3 - PDF: load the PDF file, split it with the shared splitter and
# add the chunks to the same vector store.
# The cell's output is the list of ids returned by add_documents.
pdf_loader = PyPDFLoader("Paestum/PaestumRevisited.pdf")
pdf_chunks = text_splitter.split_documents(documents=pdf_loader.load())
vector_db.add_documents(documents=pdf_chunks)

['0b26ec4f-f60b-42b2-8420-a434d2f00b40',
 'ba59b7c8-95ef-4fb9-8845-5b77706e7a4e',
 '1d9d5c52-1b3e-4511-86ea-1d831b7f5749',
 'db59b46c-d8a8-40a2-a6ab-0ffcbef470d9',
 '82d7c1e6-3967-499a-bd72-f2f1f670db90',
 '0b09c6fd-c670-4f23-b6f7-c0bc430f16b7',
 '920ff455-718e-40e3-b39f-5739f7d53981',
 'ec50cfee-2968-404b-b1aa-5c6938b304c5',
 '43c6b76a-49a4-42fb-abab-40bc4ec012e1',
 '1a812d93-81ab-409f-9ea5-db1440cab6e2',
 'dda72513-b4cc-4e84-b154-0744942f2df8',
 'add2f3d1-a038-4114-8214-815222f1f683',
 'f46cdcec-067a-4d87-8df3-24c3936faf87',
 '1b969fff-46db-47db-aa1f-c7757975b965',
 '407ecc1e-9508-4e24-ab56-5c094bf05cb7',
 '00393996-eaf5-4cb2-9d25-513b48caca92',
 'b43d3ba7-9a82-4ceb-8070-ffd8715e9c98',
 'e576cea1-cefd-43d0-8604-f75cd58abd26',
 'b1ab67c8-303f-41c3-b830-1abfe4468a31',
 '313cb0b9-902b-4291-a2bd-38e823e37bad',
 '59dda10a-9d58-4700-a360-0014be28d619',
 '0c3188a1-6470-437d-a2f5-99d49678672a',
 'b40edcdc-d1d3-41b9-ac20-d2a81a6c75ec',
 '129b3a2a-c8a0-4757-abbc-3333750db953']

In [12]:
# Source 4 - plain text: load the .txt file, split it with the shared splitter
# and add the chunks to the same vector store.
# The cell's output is the list of ids returned by add_documents.
txt_loader = TextLoader("Paestum/Paestum-Encyclopedia.txt")
txt_chunks = text_splitter.split_documents(documents=txt_loader.load())
vector_db.add_documents(documents=txt_chunks)

['723e3a46-7f47-40d1-a86b-b8ed337c832b']

## Querying the vector store directly

In [13]:
# Query the vector store directly (no LLM involved) for the chunks most
# similar to the question.
query = "Where was Poseidonia and who renamed it to Paestum"
results = vector_db.similarity_search(query, k=4)  # the four closest results
print(results)

[Document(metadata={'source': 'Paestum/Paestum-Britannica.docx'}, page_content='Paestum, Greek\xa0Poseidonia, ancient city in southern\xa0Italy\xa0near the west coast, 22 miles (35 km) southeast of modern\xa0Salerno\xa0and 5 miles (8 km) south of the Sele (ancient Silarus) River. Paestum is noted for its splendidly preserved Greek temples.\n\n\n\n\n\nVisit the ruins of the ancient Greek colony of Paestum and discover its history, culture, and society\n\nSee all videos for this article'), Document(metadata={'source': 'https://en.wikipedia.org/wiki/Paestum', 'summary': 'Paestum ( PEST-əm, US also  PEE-stəm, Latin: [ˈpae̯stũː]) was a major ancient Greek city on the coast of the Tyrrhenian Sea, in Magna Graecia. The ruins of Paestum are famous for their three ancient Greek temples in the Doric order dating from about 550 to 450 BC that are in an excellent state of preservation. The city walls and amphitheatre are largely intact, and the bottom of the walls of many other structures remain, 

In [67]:
# Sanity check: how many documents the similarity search returned.
len(results)

4

## Asking a question through a RAG chain

In [10]:
from langchain.prompts import PromptTemplate

# Prompt for the RAG chain. It has two placeholders:
#   {context}  - filled with the retrieved documents
#   {question} - filled with the user's question
# The trailing spaces before some newlines are part of the template text.
rag_prompt_template = (
    "Use the following pieces of context to answer the question at the end. \n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer. \n"
    "Use three sentences maximum and keep the answer as concise as possible. \n"
    "{context}\n"
    "Question: {question}\n"
    "Helpful Answer:"
)

rag_prompt = PromptTemplate.from_template(rag_prompt_template)

In [11]:
# Wrap the vector store as a retriever so it can be plugged into the chains below.
# NOTE(review): no search settings are passed, so the library defaults apply - confirm they are what you want.
retriever = vector_db.as_retriever()

In [12]:
from langchain.schema.runnable import RunnablePassthrough
# Used as the "question" entry of the RAG chain's input mapping
# (presumably forwards the incoming question unchanged - see the RunnablePassthrough docs).
question_feeder = RunnablePassthrough()

In [13]:
import os
from getpass import getpass

from langchain.chat_models import ChatOpenAI

# SECURITY: never hardcode the API key in the notebook - it ends up in version
# control and in saved cell outputs. The key is read from the OPENAI_API_KEY
# environment variable; if that is unset or empty, it is asked for interactively
# (getpass does not echo the input and nothing is stored in the notebook).
# The key that used to be hardcoded in this cell has been exposed and must be
# revoked in the OpenAI dashboard.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Chat model shared by all chains below.
chatbot = ChatOpenAI(openai_api_key=openai_api_key, model_name="gpt-3.5-turbo")

In [14]:
# Assemble the RAG chain with the pipe syntax:
#   1. the input mapping sends the question to the retriever ("context")
#      and to the passthrough ("question"),
#   2. both values fill the placeholders of the prompt template,
#   3. the resulting prompt is sent to the chat model.
rag_chain = (
    {"context": retriever, "question": question_feeder}
    | rag_prompt
    | chatbot
)

In [17]:
# First question for the pipe-based RAG chain; it also asks the model to name its source.
question = "Where was Poseidonia and who renamed it to Paestum. Also tell me the source." 
answer = rag_chain.invoke(question)
# The chain ends in the chat model, so the printed value shows up as content='...'.
print(answer)

content='Poseidonia was located in southern Italy, near the west coast. It was renamed to Paestum by the Lucanians. The source of this information is the document titled "Paestum/Paestum-Britannica.docx".'


In [26]:
# Follow-up question through the same chain. Each invoke() call is given only this
# question string, so nothing from the previous answer is passed along by this cell.
question = "What else did the Lucanians do in Paestum? Also tell me the source." 
answer = rag_chain.invoke(question)
# The chain ends in the chat model, so the printed value shows up as content='...'.
print(answer)

content="The Lucanians were involved in the overtaking of Poseidonia in Paestum, which is believed to have resulted in a regression in culture, politics, and economy. This information is sourced from the document 'Paestum/PaestumRevisited.pdf'."


In [31]:
# setting up Q&A chain with RetrievalQA

In [39]:
from langchain.chains import RetrievalQA

# Same chat model and retriever as before, but wired together by the prebuilt
# RetrievalQA chain instead of the hand-written pipe. Note that this rebinds
# the name `rag_chain`: from here on it refers to the RetrievalQA chain.
rag_chain = RetrievalQA.from_chain_type(
    llm=chatbot,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=False,
)

In [40]:
# Display the chain's repr to inspect how it was assembled (prompt, LLM, retriever).
# NOTE(review): the repr includes the ChatOpenAI settings, openai_api_key among them -
# clear or redact this cell's output before sharing or committing the notebook.
rag_chain

RetrievalQA(combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))]), llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x00000135F25640D0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x00000135F257B850>, openai_api_key='sk-...REDACTED', openai_proxy='')), document_variable_name='context'), retriever=VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x0000

In [41]:
# Ask the RetrievalQA chain the same question as before.
question = "Where was Poseidonia and who renamed it to Paestum. Also tell me the source."

# Chain.run() is deprecated; invoke() is the supported entry point and is what
# the next cell uses too. RetrievalQA expects the question under the "query"
# key and returns a dict whose "result" entry is the answer text, so the
# printed value is the same plain string that run() produced.
answer = rag_chain.invoke({"query": question})["result"]
print(answer)

Poseidonia was an ancient city located in southern Italy, near the west coast. It was founded by Greek colonists from Sybaris, along the Gulf of Taranto. After coming under the domination of the Lucanians, an indigenous Italic people, the city's name was changed to Paestum. The source for this information is the Encyclopaedia Britannica.


In [47]:
# executing chain for detailed results:
# the question is passed under the "query" key and the whole returned dict is
# printed (the output shows the keys 'query' and 'result').
question = "Where was Poseidonia and who renamed it to Paestum. Also tell me the source." 
answer = rag_chain.invoke({"query":question})
print(answer)

{'query': 'Where was Poseidonia and who renamed it to Paestum. Also tell me the source.', 'result': 'Poseidonia was located in southern Italy, near the west coast, approximately 22 miles southeast of modern Salerno and 5 miles south of the Sele River. It was founded by Greek colonists from Sybaris along the Gulf of Taranto.\n\nThe city was later renamed to Paestum by the Lucanians, an indigenous Italic people who dominated the region. The Romans also referred to the city as Paestum.\n\nThe information provided is a combination of historical knowledge about Paestum and Poseidonia, as well as the information found in the Encyclopaedia Britannica article on Paestum. Unfortunately, I cannot provide a specific source for the exact details mentioned.'}


In [49]:
# setting up Q&A chain with load_qa_chain

In [None]:
from langchain.chains.question_answering import load_qa_chain

# load_qa_chain builds only the document-combining ("stuff") chain. It takes no
# retriever and has no return_source_documents option (those belong to
# RetrievalQA); extra keyword arguments are forwarded to the underlying chain,
# which does not accept them, so they are removed here.
# Retrieval is a separate step and the documents are passed in at call time, e.g.:
#     docs = retriever.invoke(question)
#     answer = rag_chain.invoke({"input_documents": docs, "question": question})
rag_chain = load_qa_chain(llm=chatbot, chain_type="stuff")