# Q&A across documents with LangChain and LangSmith

In [9]:
from langchain.document_loaders import WikipediaLoader, Docx2txtLoader, PyPDFLoader, TextLoader

from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

In [10]:
import getpass
OPENAI_API_KEY = getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


## Setting up vector database and embeddings

In [11]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
embeddings_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vector_db = Chroma("tourist_info", embeddings_model)

In [12]:
wikipedia_loader = WikipediaLoader(query="Paestum")
wikipedia_chunks = text_splitter.split_documents(wikipedia_loader.load())
vector_db.add_documents(wikipedia_chunks)

['c67f4488-3436-40c3-b33d-4202e95ad521',
 'b7a94ed5-443f-4a75-a6cd-eb51dcc7a653',
 '5ee7d575-5051-4018-9f88-b0509b4fab04',
 'aa488be7-6bb7-4105-a481-1ee763f79525',
 '57797de2-7d7d-4153-bd5c-c1e8238be76b',
 'a50a3cc2-0831-45c6-bf37-ed4f80eabc2e',
 '6334e94e-b3ec-427c-94ce-5995c497bc14',
 'b771496d-b31a-4908-94cf-96e514e8ee2b',
 '76d7345a-2448-4c42-9700-7e00d12849ae',
 '90045e56-0935-411e-b644-a2ea8c34bba5',
 '3bdd38da-229b-43ae-a1f0-1d1dfdc40582',
 'a9afb413-bb4a-4e41-b9ec-f08bbaf42c52',
 '14a24d74-e99f-40ea-b454-938ab318384b',
 'bf006188-0ab8-42be-9de4-2e871dd5c478',
 'f105e2f3-f39a-4107-8cc8-a62c9346ead1',
 '28892732-0d38-46d9-832a-12caec5c24f0',
 '797b8799-4184-4e12-a93b-e8779691584b',
 '3c23dfbe-0958-4499-a4a1-d29fdf96a7ef',
 '44bd7597-c424-4e16-b10b-0b13cddc75c0',
 'bc7bb531-e02c-4850-acd6-c0f99306dfca',
 '4ca40b63-23a4-4954-8e77-b0cdfdb906cb',
 'c0a43e11-3a68-4aaa-abbe-c672c0869223',
 '7edc5ceb-af19-4708-a789-ce0a79498aa7',
 'a3f98e2c-0b54-46a0-8153-81c95ccb3a98',
 '7e809382-d687-

In [13]:
word_loader = Docx2txtLoader("Paestum/Paestum-Britannica.docx")
word_chunks = text_splitter.split_documents(word_loader.load())
vector_db.add_documents(word_chunks)

['73dc2f2a-3b83-48b6-85a0-bf297680be48',
 'c3a5d7b0-0eae-4df3-84c2-405ccdd25ea2',
 'ad1e0761-9d06-4948-892d-503f0f66db3f',
 'c24e5530-30a5-4b1c-8a36-b667225b6a0c',
 'f898e086-e1f5-4d79-b326-bca5439ab9d3',
 '8f080b59-1567-4261-be5e-80806436ca76',
 '5aa596b6-1e48-4568-af16-3fa284cadbf3',
 'c4d66768-31f3-468f-836b-345663161c66']

In [14]:
pdf_loader = PyPDFLoader("Paestum/PaestumRevisited.pdf")
pdf_chunks = text_splitter.split_documents(pdf_loader.load())
vector_db.add_documents(pdf_chunks)

['6750da07-de3b-4b30-be98-12e22c02c9f6',
 '99908427-253c-43c8-8d6c-eb4455c2c45a',
 '7579de27-7046-451c-ad0e-feff62408136',
 '89188cc7-d9f6-4097-bed7-afc0a6ba8861',
 '8cf71b0c-7d60-472c-a0b0-b6be70eb1ee7',
 'a4f44a59-724b-40df-9ba6-133c44e02b53',
 '106d0f32-bdf8-4cc1-a285-112aca79619d',
 '8cb9810f-235c-4ac1-be95-5f59df6203ee',
 '03495e4d-4e2c-4086-90b8-0904cd440e41',
 '292d7f58-df86-469a-93d5-d52a89a1d4e5',
 'fc7e11be-d112-43a1-a13d-62e2cab65e6d',
 'd138be27-bad5-40a5-a374-0cf1776cccbb',
 '79189aae-3aae-44f2-a6d2-40199c916536',
 '7f580760-e21d-4382-b04f-df44cee89f1a',
 '7eabface-47a5-43da-a09b-a3a4c830c9bd',
 'dbfb948c-13d9-49e4-8133-73a1e4261d21',
 '390668f0-fe98-4df9-87ba-d5686dcd0c4e',
 'b532ad5c-c2d3-4e5d-a4f8-b3a1b123eebd',
 '3588191c-888a-4637-8af1-a64486320306',
 'f3f8e92e-3ad3-402c-a936-63efa42287c1',
 '766f2381-314e-4be4-b832-ed0efd94508e',
 '9b36adde-4103-47d4-ac87-fd182be250c6',
 '151c4b24-6227-49ff-a606-94ea62597217',
 'da204581-5862-4dfd-9828-065a49021757']

In [15]:
txt_loader = TextLoader("Paestum/Paestum-Encyclopedia.txt")
txt_chunks = text_splitter.split_documents(txt_loader.load())
vector_db.add_documents(txt_chunks)

['8105b154-0c41-4e8f-8465-49da779dc026']

### Removing duplication

In [16]:
def split_and_import(loader):
     chunks = text_splitter.split_documents(loader.load())
     vector_db.add_documents(chunks)
     print(f"Ingested chunks created by {loader}")

In [17]:
wikipedia_loader = WikipediaLoader(query="Paestum")
split_and_import(wikipedia_loader)

word_loader = Docx2txtLoader("Paestum/Paestum-Britannica.docx")
split_and_import(word_loader)

pdf_loader = PyPDFLoader("Paestum/PaestumRevisited.pdf")
split_and_import(pdf_loader)

txt_loader = TextLoader("Paestum/Paestum-Encyclopedia.txt")
split_and_import(txt_loader)

Ingested chunks created by <langchain_community.document_loaders.wikipedia.WikipediaLoader object at 0x00000283501B5A90>
Ingested chunks created by <langchain_community.document_loaders.word_document.Docx2txtLoader object at 0x000002834F8BB310>
Ingested chunks created by <langchain_community.document_loaders.pdf.PyPDFLoader object at 0x000002834F92B810>
Ingested chunks created by <langchain_community.document_loaders.text.TextLoader object at 0x000002833D2027D0>


## Ingesting Multiple Documents from a Folder (two techniques)

### 1) Iterating over all files in a folder

In [19]:
loader_classes = {
    'docx': Docx2txtLoader,
    'pdf': PyPDFLoader,
    'txt': TextLoader
}

In [21]:
import os

def get_loader(filename):
    _, file_extension = os.path.splitext(filename) #A Extract the file extension
    file_extension = file_extension.lstrip('.') #B Remove the leading dot from the extension
    
    loader_class = loader_classes.get(file_extension) #C Get the loader class from the dictionary
    
    if loader_class:
        return loader_class(filename) #D Instantiate and return the correct loader
    else:
        raise ValueError(f"No loader available for file extension '{file_extension}'")

#### Ingesting the files from the folder (Exercise solution)

In [33]:
folder_path = "CilentoTouristInfo" #A Path to the folder containing the documents

for filename in os.listdir(folder_path): #B iterate over the files in the path
    file_path = os.path.join(folder_path, filename) #C Construct the full path to the file
   
    if os.path.isfile(file_path): #D Check if it is a file (not a directory)
        try:
            loader = get_loader(file_path) #E Instantiate the correct loader for the file
            print(f"Loader for {filename}: {loader}")
            split_and_import(loader) #F Split and ingest
        except ValueError as e:
            print(e)

Loader for Acciaroli.pdf: <langchain_community.document_loaders.pdf.PyPDFLoader object at 0x000001EAC00E5D50>
Ingested chunks created by <langchain_community.document_loaders.pdf.PyPDFLoader object at 0x000001EAC00E5D50>
Loader for Cape Palinuro.txt: <langchain_community.document_loaders.text.TextLoader object at 0x000001EAC00E6490>
Ingested chunks created by <langchain_community.document_loaders.text.TextLoader object at 0x000001EAC00E6490>
Loader for Casalvelino.txt: <langchain_community.document_loaders.text.TextLoader object at 0x000001EAC00E4850>
Ingested chunks created by <langchain_community.document_loaders.text.TextLoader object at 0x000001EAC00E4850>
Loader for Cilentan coast.docx: <langchain_community.document_loaders.word_document.Docx2txtLoader object at 0x000001EABF733E90>
Ingested chunks created by <langchain_community.document_loaders.word_document.Docx2txtLoader object at 0x000001EABF733E90>
Loader for Cilento Coast Map and Travel Guide.docx: <langchain_community.docum

### 2) Ingesting all files with with DirectoryLoader

In [None]:
# ONLY RUN THIS IF YOU HAVE SUCCESFULLY INSTALLED unstructured or langchain-unstructured
# THE INSTALLATION IS OPERATIVE SYSTEM SPECIFIC
# follow LangChain instructions at https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/ or 
# Unstructured instructions at https://docs.unstructured.io/welcome#quickstart-unstructured-open-source-library
folder_path = "CilentoTouristInfo"
pattern = "**/*.{docx,pdf,txt}" #A Pattern to match .docx, .pdf, and .txt files

directory_loader = DirectoryLoader(folder_path, pattern) #B Initialize the DirectoryLoader with the folder path and pattern
split_and_import(directory_loader)

## Querying the vector store directly

In [10]:
query = "Where was Poseidonia and who renamed it to Paestum" 
results = vector_db.similarity_search(query, 4) # four clostest results
print(results)

[Document(metadata={'source': 'Paestum/Paestum-Britannica.docx'}, page_content='Paestum, Greek\xa0Poseidonia, ancient city in southern\xa0Italy\xa0near the west coast, 22 miles (35 km) southeast of modern\xa0Salerno\xa0and 5 miles (8 km) south of the Sele (ancient Silarus) River. Paestum is noted for its splendidly preserved Greek temples.\n\n\n\n\n\nVisit the ruins of the ancient Greek colony of Paestum and discover its history, culture, and society\n\nSee all videos for this article'), Document(metadata={'source': 'Paestum/Paestum-Britannica.docx'}, page_content='Paestum, Greek\xa0Poseidonia, ancient city in southern\xa0Italy\xa0near the west coast, 22 miles (35 km) southeast of modern\xa0Salerno\xa0and 5 miles (8 km) south of the Sele (ancient Silarus) River. Paestum is noted for its splendidly preserved Greek temples.\n\n\n\n\n\nVisit the ruins of the ancient Greek colony of Paestum and discover its history, culture, and society\n\nSee all videos for this article'), Document(metada

In [11]:
len(results)

4

## Asking a question through a RAG chain

In [2]:
from openai import OpenAI
import getpass

OPENAI_API_KEY = getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


In [18]:
from langchain.prompts import PromptTemplate

rag_prompt_template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""

rag_prompt = PromptTemplate.from_template(rag_prompt_template)

In [19]:
retriever = vector_db.as_retriever()

In [20]:
from langchain.schema.runnable import RunnablePassthrough
question_feeder = RunnablePassthrough()

In [21]:
from langchain_openai import ChatOpenAI

chatbot = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name="gpt-4o-mini")

In [22]:
# set up RAG chain

rag_chain = {"context": retriever, "question": question_feeder} | rag_prompt | chatbot

In [23]:
def execute_chain(chain, question):
    answer = chain.invoke(question)
    return answer

In [24]:
question = "Where was Poseidonia and who renamed it to Paestum. Also tell me the source." 
answer = execute_chain(rag_chain, question)
print(answer.content)

Poseidonia was located on the coast of the Tyrrhenian Sea in Magna Graecia, and it was renamed to Paestum by the Romans. The source of this information is the Wikipedia article on Paestum.


In [25]:
print(answer)

content='Poseidonia was located on the coast of the Tyrrhenian Sea in Magna Graecia, and it was renamed to Paestum by the Romans. The source of this information is the Wikipedia article on Paestum.' response_metadata={'token_usage': {'completion_tokens': 47, 'prompt_tokens': 1773, 'total_tokens': 1820}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_48196bc67a', 'finish_reason': 'stop', 'logprobs': None} id='run-81d269ca-7162-4428-8ef3-5c93c5d6d81e-0' usage_metadata={'input_tokens': 1773, 'output_tokens': 47, 'total_tokens': 1820}


In [29]:
question = "And then, what did they do?" 
answer = execute_chain(rag_chain, question)
print(answer.content)

I don't know.


## Chatbot memory of message history

In [30]:
from langchain.schema.runnable import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables import RunnableLambda

rag_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant, world-class expert in Roman and Greek history, especially in towns located in southern Italy. Provide interesting insights on local history and recommend places to visit with knowledgeable and engaging answers. Answer all questions to the best of your ability, but only use what has been provided in the context. If you don't know, just say you don't know. Use three sentences maximum and keep the answer as concise as possible."),
        ("placeholder", "{chat_history_messages}"),
        ("assistant", "{retrieved_context}"),
        ("human", "{question}"),
    ]
)

retriever = vector_db.as_retriever()
question_feeder = RunnablePassthrough()
chatbot = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name="gpt-4o-mini")
chat_history_memory = ChatMessageHistory()

def get_messages(x):
    return chat_history_memory.messages

rag_chain = {
    "retrieved_context": retriever, 
    "question": question_feeder,
    "chat_history_messages": RunnableLambda(get_messages)
} | rag_prompt | chatbot

def execute_chain_with_memory(chain, question):
    chat_history_memory.add_user_message(question)
    answer = chain.invoke(question)
    chat_history_memory.add_ai_message(answer)
    print(f'Full chat message history: {chat_history_memory.messages}\n\n')                                      
    return answer

In [31]:
question = "Where was Poseidonia and who renamed it to Paestum? Also tell me the source." 
answer = execute_chain_with_memory(rag_chain, question)
print(answer.content)

Full chat message history: [HumanMessage(content='Where was Poseidonia and who renamed it to Paestum? Also tell me the source.'), AIMessage(content='Poseidonia was an ancient Greek city located on the coast of the Tyrrhenian Sea in what is now southern Italy. It was renamed Paestum by the Romans after they took control of the city in 273 BC, following its conquest by the Lucanians. The source of this information is from Wikipedia.', response_metadata={'token_usage': {'completion_tokens': 64, 'prompt_tokens': 1835, 'total_tokens': 1899}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_48196bc67a', 'finish_reason': 'stop', 'logprobs': None}, id='run-e51e7fb1-8efc-4a42-addc-89cb6fb86ea9-0', usage_metadata={'input_tokens': 1835, 'output_tokens': 64, 'total_tokens': 1899})]


Poseidonia was an ancient Greek city located on the coast of the Tyrrhenian Sea in what is now southern Italy. It was renamed Paestum by the Romans after they took control of the city in 273 BC, follo

In [32]:
question = "And then what did they do? Also tell me the source" 
answer = execute_chain_with_memory(rag_chain, question)
print(answer.content)

Full chat message history: [HumanMessage(content='Where was Poseidonia and who renamed it to Paestum? Also tell me the source.'), AIMessage(content='Poseidonia was an ancient Greek city located on the coast of the Tyrrhenian Sea in what is now southern Italy. It was renamed Paestum by the Romans after they took control of the city in 273 BC, following its conquest by the Lucanians. The source of this information is from Wikipedia.', response_metadata={'token_usage': {'completion_tokens': 64, 'prompt_tokens': 1835, 'total_tokens': 1899}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_48196bc67a', 'finish_reason': 'stop', 'logprobs': None}, id='run-e51e7fb1-8efc-4a42-addc-89cb6fb86ea9-0', usage_metadata={'input_tokens': 1835, 'output_tokens': 64, 'total_tokens': 1899}), HumanMessage(content='And then what did they do? Also tell me the source'), AIMessage(content="After the Romans renamed Poseidonia to Paestum, they developed the city further, enhancing its infrastructu

## Tracing with LangSmith

In [172]:
from langsmith import trace
from langsmith import Client, traceable

In [173]:
LANGSMITH_API_KEY= getpass.getpass('Enter your LANGSMITH_API_KEY')

Enter your LANGSMITH_API_KEY ········


In [175]:
langsmith_client = Client(
    api_key=LANGSMITH_API_KEY,
    api_url="https://api.smith.langchain.com",  
)

In [190]:
question = "Where was Poseidonia and who renamed it to Paestum. Also tell me the source." 
with trace("Chat Pipeline", "chain", project_name="Q&A chatbot", inputs={"input": question}, client=langsmith_client) as rt:
    answer = execute_chain(rag_chain, question)
    print(answer)
    rt.end(outputs={"output": answer})

content='Poseidonia, later renamed Paestum, was an ancient Greek city located in southern Italy near the coast of the Tyrrhenian Sea. The city was renamed to Paestum by the Romans after they took over in 273 BC. The information is from the source: https://en.wikipedia.org/wiki/Paestum.' response_metadata={'token_usage': {'completion_tokens': 68, 'prompt_tokens': 1490, 'total_tokens': 1558}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-cba3219f-8a1e-413e-a895-fe63ff8a9e2a-0' usage_metadata={'input_tokens': 1490, 'output_tokens': 68, 'total_tokens': 1558}


## Setting up Q&A chain with RetrievalQA

In [191]:
from langchain.chains import RetrievalQA
rag_chain = RetrievalQA.from_chain_type(llm=chatbot, chain_type="stuff", retriever=retriever, return_source_documents=False)

In [194]:
question = "Where was Poseidonia and who renamed it to Paestum. Also tell me the source." 
with trace("RetrievalQA", "chain", project_name="Q&A chatbot", inputs={"input": question}, client=langsmith_client) as rt:
    answer = execute_chain(rag_chain, question)
    print(answer)
    rt.end(outputs={"output": answer})

{'query': 'Where was Poseidonia and who renamed it to Paestum. Also tell me the source.', 'result': 'Poseidonia was an ancient city in southern Italy, near the west coast. It was eventually conquered by the local Lucanians who renamed it to Paistos. The Romans later gave the city its current name, Paestum. The source for this information is the article on Paestum by the Editors of Encyclopaedia Britannica.'}


## Chatbot memory of message history

In [105]:
from langchain.schema.runnable import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables import RunnableLambda

rag_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant, world-class expert in Roman and Greek history, especially in towns located in southern Italy. Provide interesting insights on local history and recommend places to visit with knowledgeable and engaging answers. Answer all questions to the best of your ability, but only use what has been provided in the context. If you don't know, just say you don't know. Use three sentences maximum and keep the answer as concise as possible."),
        ("placeholder", "{chat_history_messages}"),
        ("assistant", "{retrieved_context}"),
        ("human", "{question}"),
    ]
)

retriever = vector_db.as_retriever()
question_feeder = RunnablePassthrough()
chatbot = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name="gpt-4o-mini")
chat_history_memory = ChatMessageHistory()

def get_messages(x):
    return chat_history_memory.messages

rag_chain = {
    "retrieved_context": retriever, 
    "question": question_feeder,
    "chat_history_messages": RunnableLambda(get_messages)
} | rag_prompt | chatbot

def execute_chain_with_memory(chain, question):
    chat_history_memory.add_user_message(question)
    answer = chain.invoke(question)
    chat_history_memory.add_ai_message(answer)
    print(f'Full chat message history: {chat_history_memory.messages}\n\n')                                      
    return answer

In [106]:
question = "Where was Poseidonia and who renamed it to Paestum? Also tell me the source." 
answer = execute_chain_with_memory(rag_chain, question)
print(answer.content)


Full chat message history: [HumanMessage(content='Where was Poseidonia and who renamed it to Paestum? Also tell me the source.'), AIMessage(content='Poseidonia was located on the coast of the Tyrrhenian Sea in what is now the modern town of Paestum, in the province of Salerno, Italy. It was renamed Paestum by the Romans after they took control of the city in 273 BC. The source of this information is Wikipedia.', response_metadata={'token_usage': {'completion_tokens': 65, 'prompt_tokens': 1843, 'total_tokens': 1908}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_9b0abffe81', 'finish_reason': 'stop', 'logprobs': None}, id='run-a74d990b-6f92-4128-8ebd-80085da40f28-0', usage_metadata={'input_tokens': 1843, 'output_tokens': 65, 'total_tokens': 1908})]


Poseidonia was located on the coast of the Tyrrhenian Sea in what is now the modern town of Paestum, in the province of Salerno, Italy. It was renamed Paestum by the Romans after they took control of the city in 273 BC. T

In [107]:
question = "And then what did they do? Also tell me the source" 
answer = execute_chain_with_memory(rag_chain, question)
print(answer.content)

Full chat message history: [HumanMessage(content='Where was Poseidonia and who renamed it to Paestum? Also tell me the source.'), AIMessage(content='Poseidonia was located on the coast of the Tyrrhenian Sea in what is now the modern town of Paestum, in the province of Salerno, Italy. It was renamed Paestum by the Romans after they took control of the city in 273 BC. The source of this information is Wikipedia.', response_metadata={'token_usage': {'completion_tokens': 65, 'prompt_tokens': 1843, 'total_tokens': 1908}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_9b0abffe81', 'finish_reason': 'stop', 'logprobs': None}, id='run-a74d990b-6f92-4128-8ebd-80085da40f28-0', usage_metadata={'input_tokens': 1843, 'output_tokens': 65, 'total_tokens': 1908}), HumanMessage(content='And then what did they do? Also tell me the source'), AIMessage(content='After its renaming to Paestum, the Romans developed the city further, constructing significant public buildings, including templ