# Q&A across documents with LangChain and LangSmith

In [1]:
from langchain.document_loaders import WikipediaLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader

from langchain.vectorstores import Chroma

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_huggingface import HuggingFaceEmbeddings

# from langchain.embeddings import OpenAIEmbeddings # you can use this if you don't mind about costs

## Setting up vector database and embeddings

In [2]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#embeddings_model = OpenAIEmbeddings() # you can use this is you don't mind about costs
vector_db = Chroma("tourist_info", embeddings_model)

  from tqdm.autonotebook import tqdm, trange


In [3]:
wikipedia_loader = WikipediaLoader(query="Paestum")
wikipedia_chunks = text_splitter.split_documents(wikipedia_loader.load())
vector_db.add_documents(wikipedia_chunks)

['6d34de48-8896-4c79-950b-cea3c20be084',
 '6fd0b9b5-e832-4d2c-851b-f1d202975462',
 '03249ad5-9bad-4886-a62b-52eff0839077',
 '623167c5-266d-4e76-a484-4d88182d346f',
 '69d53fca-191e-4d58-806d-572efd3ee4db',
 '0df480e8-0803-4742-ab06-9b4b65215aa4',
 '5b86837c-cae3-41e8-bf89-34f322be05da',
 '2f3322cb-0288-439b-baa8-a7978b7adef7',
 '4eb3ac67-e714-40e7-b853-10c97ee74cab',
 '2767dc09-1573-4b30-a996-6907faa4812b',
 '00ce40be-f5e0-4cb1-b3f1-b2706fef88aa',
 '733a2587-88a1-4e16-8445-2d078f4dc539',
 'bc8b3f5a-8a54-4be3-9ad2-5df22a3b488b',
 'ef7c70f1-5bbc-4c2f-aa28-b4a9041f073f',
 'a26bc834-ca59-4cf0-a12c-a891f0d02fe7',
 '20c5be6c-37cf-463a-a76b-5bdca9cb24bf',
 '26ba74a8-023c-42ed-bf93-ff0b6acafd4b',
 '5afcca7d-844e-456a-847e-6abac9259907',
 '9d84e826-dbac-4480-adbf-6d8098cca822',
 'd2b8ab1d-be40-47e3-bcf7-56af546bb286',
 'a4e8e2c9-8109-4ba3-9dc0-43760adcabd0',
 '86b6c098-47c0-41e8-8325-9ce30423b6d7',
 '6411f74c-7eb9-4225-8962-9b70a3ac7de1',
 '8beeb73a-eb08-4290-88ca-f5157ad0e6a8',
 '91c543c0-cc9c-

In [4]:
#vector_db = Chroma.from_documents(wikipedia_chunks, embeddings_model)

In [7]:
word_loader = Docx2txtLoader("Paestum/Paestum-Britannica.docx")
word_chunks = text_splitter.split_documents(word_loader.load())
vector_db.add_documents(word_chunks)

['6b674cdb-0596-4496-8321-d5576308b2ea',
 '0f78d6f9-9e44-4ea8-a8e8-5901410d5387',
 'f5f0e636-8ae1-409c-ad45-b42501b16063',
 '1d589f0e-2ceb-461b-b3d0-729be98510c5',
 '50beddf1-9c35-4613-bf35-b9413dbd4c4d',
 '1d354ca3-dc72-48c0-8167-be7b2d0c6bfb',
 '898c9658-2ca2-462b-a7e9-d874b8f6351a',
 '081f4f41-1309-48c7-8876-e32cdf5292cd']

In [8]:
pdf_loader = PyPDFLoader("Paestum/PaestumRevisited.pdf")
pdf_chunks = text_splitter.split_documents(pdf_loader.load())
vector_db.add_documents(pdf_chunks)

['b22a612f-0a2f-4fb1-9d96-e48124bb9bf1',
 '7d132a4f-7340-4782-bc58-e2189575c756',
 'd5b2d5ff-03ae-4622-981e-c66ef479870e',
 '1b65173d-e40b-40fc-b0fb-8c892d8b9d60',
 'a02a33f4-1f48-489d-ad0b-0aae9aedbe60',
 '96be5ac6-51f5-46b1-898d-b3c880f6cff1',
 '9859db94-3760-4229-ab75-a0680186bc7a',
 '0af21bac-dddf-42d2-92ef-740fd9543199',
 'ee6ed0a1-5a77-4771-9b5b-19a74eb65d05',
 '9a561092-41bc-4cda-84de-a67f01033295',
 '2bf8a19f-05f3-4fda-9e1e-592aea960340',
 '60afb592-02a3-4fd3-bda2-7f0b2f72f370',
 '23d7aac9-b293-4449-9e75-bda753778ed4',
 '2f8dcfa4-995b-4911-bb53-afcd0d28f75f',
 '6d434e90-485f-4aed-9a82-46d6849972c0',
 'a3f62aef-e0f2-479c-a65b-c4f5e567801e',
 '129d0092-c183-41ca-98d0-49b2ab9d6cfc',
 '78a53a16-c79f-449d-b280-be09b0c66d0e',
 '28b96bce-4528-4ea6-b55e-ede827609c13',
 '274c43af-ace7-40e0-9782-cd5eb9bd2d93',
 '894a0df3-928c-44e6-9c1b-db575e47ef96',
 'e5318aa6-011b-4e6f-ad42-b1a5fe7fbc15',
 '3c414f1c-da5c-48d9-97ba-d81ee6ad27cb',
 '8d8beae3-99a6-4944-a8f8-bc75e3c3e662']

In [9]:
txt_loader = TextLoader("Paestum/Paestum-Encyclopedia.txt")
txt_chunks = text_splitter.split_documents(txt_loader.load())
vector_db.add_documents(txt_chunks)

['1675c3bf-9684-4d24-b43e-39a28248f7a6']

## Querying the vector store directly

In [10]:
query = "Where was Poseidonia and who renamed it to Paestum" 
results = vector_db.similarity_search(query, 4) # four clostest results
print(results)

[Document(metadata={'source': 'Paestum/Paestum-Britannica.docx'}, page_content='Paestum, Greek\xa0Poseidonia, ancient city in southern\xa0Italy\xa0near the west coast, 22 miles (35 km) southeast of modern\xa0Salerno\xa0and 5 miles (8 km) south of the Sele (ancient Silarus) River. Paestum is noted for its splendidly preserved Greek temples.\n\n\n\n\n\nVisit the ruins of the ancient Greek colony of Paestum and discover its history, culture, and society\n\nSee all videos for this article'), Document(metadata={'source': 'Paestum/Paestum-Britannica.docx'}, page_content='Paestum, Greek\xa0Poseidonia, ancient city in southern\xa0Italy\xa0near the west coast, 22 miles (35 km) southeast of modern\xa0Salerno\xa0and 5 miles (8 km) south of the Sele (ancient Silarus) River. Paestum is noted for its splendidly preserved Greek temples.\n\n\n\n\n\nVisit the ruins of the ancient Greek colony of Paestum and discover its history, culture, and society\n\nSee all videos for this article'), Document(metada

In [11]:
len(results)

4

## Asking a question through a RAG chain

In [12]:
from openai import OpenAI
import getpass

OPENAI_API_KEY = getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


In [13]:
from langchain.prompts import PromptTemplate

rag_prompt_template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""

rag_prompt = PromptTemplate.from_template(rag_prompt_template)

In [14]:
retriever = vector_db.as_retriever()

In [15]:
from langchain.schema.runnable import RunnablePassthrough
question_feeder = RunnablePassthrough()

In [16]:
from langchain_openai import ChatOpenAI

chatbot = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo")

In [17]:
# set up RAG chain

rag_chain = {"context": retriever, "question": question_feeder} | rag_prompt | chatbot

In [18]:
def execute_chain(chain, question):
    answer = chain.invoke(question)
    return answer

In [19]:
question = "Where was Poseidonia and who renamed it to Paestum. Also tell me the source." 
answer = execute_chain(rag_chain, question)
print(answer.content)

Poseidonia was located in southern Italy near the west coast, and it was renamed to Paestum by the Lucanians and later by the Romans. The source of this information is from the Britannica document on Paestum.


In [20]:
print(answer)

content='Poseidonia was located in southern Italy near the west coast, and it was renamed to Paestum by the Lucanians and later by the Romans. The source of this information is from the Britannica document on Paestum.' response_metadata={'token_usage': {'completion_tokens': 48, 'prompt_tokens': 1034, 'total_tokens': 1082}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-65b443ee-b1b3-4156-b99e-91d393d9c0b9-0' usage_metadata={'input_tokens': 1034, 'output_tokens': 48, 'total_tokens': 1082}


In [21]:
question = "What else did the Lucanians do in Paestum? Also tell me the source." 
answer = execute_chain(rag_chain, question)
print(answer.content)

The Lucanians overtook Poseidonia peacefully, which led to a supposed regression in culture, politics, and economy, according to Aristoxenos of Tarentum. This information can be found in the source "PaestumRevisited.pdf."


## Tracing with LangSmith

In [172]:
from langsmith import trace
from langsmith import Client, traceable

In [173]:
LANGSMITH_API_KEY= getpass.getpass('Enter your LANGSMITH_API_KEY')

Enter your LANGSMITH_API_KEY ········


In [175]:
langsmith_client = Client(
    api_key=LANGSMITH_API_KEY,
    api_url="https://api.smith.langchain.com",  
)

In [190]:
question = "Where was Poseidonia and who renamed it to Paestum. Also tell me the source." 
with trace("Chat Pipeline", "chain", project_name="Q&A chatbot", inputs={"input": question}, client=langsmith_client) as rt:
    answer = execute_chain(rag_chain, question)
    print(answer)
    rt.end(outputs={"output": answer})

content='Poseidonia, later renamed Paestum, was an ancient Greek city located in southern Italy near the coast of the Tyrrhenian Sea. The city was renamed to Paestum by the Romans after they took over in 273 BC. The information is from the source: https://en.wikipedia.org/wiki/Paestum.' response_metadata={'token_usage': {'completion_tokens': 68, 'prompt_tokens': 1490, 'total_tokens': 1558}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-cba3219f-8a1e-413e-a895-fe63ff8a9e2a-0' usage_metadata={'input_tokens': 1490, 'output_tokens': 68, 'total_tokens': 1558}


## Setting up Q&A chain with RetrievalQA

In [191]:
from langchain.chains import RetrievalQA
rag_chain = RetrievalQA.from_chain_type(llm=chatbot, chain_type="stuff", retriever=retriever, return_source_documents=False)

In [194]:
question = "Where was Poseidonia and who renamed it to Paestum. Also tell me the source." 
with trace("RetrievalQA", "chain", project_name="Q&A chatbot", inputs={"input": question}, client=langsmith_client) as rt:
    answer = execute_chain(rag_chain, question)
    print(answer)
    rt.end(outputs={"output": answer})

{'query': 'Where was Poseidonia and who renamed it to Paestum. Also tell me the source.', 'result': 'Poseidonia was an ancient city in southern Italy, near the west coast. It was eventually conquered by the local Lucanians who renamed it to Paistos. The Romans later gave the city its current name, Paestum. The source for this information is the article on Paestum by the Editors of Encyclopaedia Britannica.'}
