# Q&A across documents with LangChain and LangSmith

In [3]:
from langchain_community.document_loaders import WikipediaLoader, Docx2txtLoader, PyPDFLoader, TextLoader

from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

In [4]:
import getpass
OPENAI_API_KEY = getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


## Setting up vector database and embeddings

In [7]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=0)
embeddings_model = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY)
vector_db = Chroma("tourist_info", embeddings_model)

In [8]:
wikipedia_loader = WikipediaLoader(query="Paestum")
wikipedia_chunks = text_splitter.split_documents(
    wikipedia_loader.load())
vector_db.add_documents(wikipedia_chunks)

['710a328f-576b-4e35-a582-dfd5fb00f98a',
 'ba98671a-d371-44fc-8f58-c703f32336b7',
 'fee55d6e-758d-419c-9cdb-436475363713',
 'b282eb46-88ec-4fc1-81f5-298044bde184',
 '59a048b9-0a14-411a-8587-966a9759cf62',
 '89b91caa-2967-4087-9c52-3d60e7309578',
 '2699dab5-3f01-440b-865e-85d19ce6868f',
 'f42b2c90-f9a6-462d-8eb9-778bb87f8243',
 'f0c41c44-2322-4ca0-a4fb-e52c58c6aed2',
 'e2aa50d9-2cbe-43d6-8278-e5a7dfc9ddd6',
 'd0f9af0b-6201-4e93-8272-98047d743fcf',
 '4054d488-a7c1-4c63-b708-4a976c3fb209',
 '5f13ee45-05ef-4113-80cb-b37f4f19c577',
 'd6244bb6-2bea-4ae2-af2c-aa4bd89c1e6e',
 '24c1c851-4ba5-4619-8d99-0ec1816e958b',
 '0af1fe17-3ee1-47a7-8486-bdf127b31773',
 '56c95803-d668-4326-8356-89a45bc6fa93',
 '78c6deb7-5789-4ab7-b40f-ee04f773b3f4',
 'c7824da8-15e3-41a1-9a10-5822c060a0af',
 '0b9c3933-4425-4be1-8c62-a5057abf58e5',
 '3e64be9b-46a1-44dc-886d-0f43a642da23',
 'b0935960-e5eb-4bba-9034-b1782eb3b618',
 '5eb8de03-a883-4fe3-83d5-5bbaa9606e01',
 '8313036a-7d93-40ef-82bc-799e07cd4bc7',
 'c933aefb-4cb6-

In [9]:
word_loader = Docx2txtLoader("Paestum/Paestum-Britannica.docx")
word_chunks = text_splitter.split_documents(
    word_loader.load())
vector_db.add_documents(word_chunks)

['427af62d-705f-454b-90f9-a681d9e188f5',
 'feb2788a-a50b-4b14-b3f2-ffe294c1b3c5',
 '0fa9ec40-68df-4609-8e1c-9083ed71254d',
 'c753ce8c-3962-4f6e-adea-db75b9472c24',
 '8408789f-5001-490d-a17a-5448a322e178',
 '4296176a-bfc2-41f8-96ef-c25fdcc4df5a',
 '5dbb297b-8083-4c90-b377-90e9617e873d',
 'efd8772f-3325-4ab6-aba3-47b422918536']

In [10]:
pdf_loader = PyPDFLoader("Paestum/PaestumRevisited.pdf")
pdf_chunks = text_splitter.split_documents(
    pdf_loader.load())
vector_db.add_documents(pdf_chunks)

['de25bddb-21b6-468f-9b83-1bfce1c77ebe',
 'd5609d3a-fc4d-4f14-834c-cdfea1a75de6',
 'd7b6d12f-7b9d-4a55-be5f-7f284839229b',
 'e891fd18-7ccd-4b29-8991-837dd6b8053e',
 'd1d4a9b1-c9d5-4895-bd6a-974691f29724',
 '764f3546-6aee-4b16-87b5-3c82dbdc7482',
 '217baea0-3ee3-4232-8610-b7650dd147e3',
 '4027d397-63db-4983-9235-7a25b68c490a',
 'e90bc2a5-14bf-4c20-bd8c-57a81108c627',
 '4fe7d6b4-500a-4ee0-b84a-71857400b0c2',
 '5cece374-0c9e-4150-87b3-94dd9cc4d9a7',
 '13e1e73e-6d4a-4a4e-bbee-0bd764d0690c',
 '2ec26d72-dd46-4a0d-a0f1-08a9ea70503f',
 'e78bb726-2ad6-4e5e-9e3d-655bd91d59ad',
 'dbf33eee-1b5e-4467-9abe-2912bfbe20d6',
 '74da3647-0122-415a-8edf-196c0775f8c2',
 '77b96739-835f-43b0-8d52-22fb29dcc44e',
 '088d2542-aa8a-4920-8f25-e639c66a2785',
 'ab345892-857e-43a3-b327-f3cfda0f0758',
 '6791b2d8-f061-4948-8409-b93903e6294c',
 'bbca2e72-481f-4a28-8079-66641b711388',
 '46f8f1e0-de54-4f44-8e17-6e8a9402efc2',
 '9a885f05-5404-4ca9-9862-8a5a76ba79ae',
 'fb45c1fe-e14e-4a01-ba8a-b0b1faea9c38',
 '68586b6f-a9eb-

In [10]:
txt_loader = TextLoader("Paestum/Paestum-Encyclopedia.txt")
txt_chunks = text_splitter.split_documents(
    txt_loader.load())
vector_db.add_documents(txt_chunks)

['39a7557d-fe0b-4fae-8a69-e7d567c3c1f1']

### Removing duplication

In [11]:
def split_and_import(loader):
     chunks = text_splitter.split_documents(loader.load())
     vector_db.add_documents(chunks)
     print(f"Ingested chunks created by {loader}")

In [12]:
wikipedia_loader = WikipediaLoader(query="Paestum")
split_and_import(wikipedia_loader)

word_loader = Docx2txtLoader("Paestum/Paestum-Britannica.docx")
split_and_import(word_loader)

pdf_loader = PyPDFLoader("Paestum/PaestumRevisited.pdf")
split_and_import(pdf_loader)

txt_loader = TextLoader("Paestum/Paestum-Encyclopedia.txt")
split_and_import(txt_loader)

Ingested chunks created by <langchain_community.document_loaders.wikipedia.WikipediaLoader object at 0x000002D63E3A8210>
Ingested chunks created by <langchain_community.document_loaders.word_document.Docx2txtLoader object at 0x000002D63EE03C50>
Ingested chunks created by <langchain_community.document_loaders.pdf.PyPDFLoader object at 0x000002D641911650>
Ingested chunks created by <langchain_community.document_loaders.text.TextLoader object at 0x000002D63DEE0C50>


## Ingesting Multiple Documents from a Folder (two techniques)

### 1) Iterating over all files in a folder

In [13]:
loader_classes = {
    'docx': Docx2txtLoader,
    'pdf': PyPDFLoader,
    'txt': TextLoader
}

In [14]:
import os

def get_loader(filename):
    _, file_extension = os.path.splitext(filename) #A Extract the file extension
    file_extension = file_extension.lstrip('.') #B Remove the leading dot from the extension
    
    loader_class = loader_classes.get(
        file_extension) #C Get the loader class from the dictionary
    
    if loader_class:
        return loader_class(filename) #D Instantiate and return the correct loader
    else:
        raise ValueError(f"No loader available for file extension '{file_extension}'")

#### Ingesting the files from the folder (Exercise solution)

In [15]:
folder_path = "CilentoTouristInfo" #A Path to the folder containing the documents

for filename in os.listdir(folder_path): #B iterate over the files in the path
    file_path = os.path.join(folder_path, filename) #C Construct the full path to the file
   
    if os.path.isfile(file_path): #D Check if it is a file (not a directory)
        try:
            loader = get_loader(file_path) #E Instantiate the correct loader for the file
            print(f"Loader for {filename}: {loader}")
            split_and_import(loader) #F Split and ingest
        except ValueError as e:
            print(e)

Loader for Acciaroli.pdf: <langchain_community.document_loaders.pdf.PyPDFLoader object at 0x000001CAD01EECD0>
Ingested chunks created by <langchain_community.document_loaders.pdf.PyPDFLoader object at 0x000001CAD01EECD0>
Loader for Cape Palinuro.txt: <langchain_community.document_loaders.text.TextLoader object at 0x000001CACEA00D10>
Ingested chunks created by <langchain_community.document_loaders.text.TextLoader object at 0x000001CACEA00D10>
Loader for Casalvelino.txt: <langchain_community.document_loaders.text.TextLoader object at 0x000001CACF8F7DD0>
Ingested chunks created by <langchain_community.document_loaders.text.TextLoader object at 0x000001CACF8F7DD0>
Loader for Cilentan coast.docx: <langchain_community.document_loaders.word_document.Docx2txtLoader object at 0x000001CAD01F6390>
Ingested chunks created by <langchain_community.document_loaders.word_document.Docx2txtLoader object at 0x000001CAD01F6390>
Loader for Cilento Coast Map and Travel Guide.docx: <langchain_community.docum

### 2) Ingesting all files with with DirectoryLoader

In [15]:
# ONLY RUN THIS IF YOU HAVE SUCCESFULLY INSTALLED unstructured or langchain-unstructured
# THE INSTALLATION IS OPERATIVE SYSTEM SPECIFIC
# follow LangChain instructions at https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/ or 
# Unstructured instructions at https://docs.unstructured.io/welcome#quickstart-unstructured-open-source-library
folder_path = "CilentoTouristInfo"
pattern = "**/*.{docx,pdf,txt}" #A Pattern to match .docx, .pdf, and .txt files

directory_loader = DirectoryLoader(folder_path, pattern) #B Initialize the DirectoryLoader with the folder path and pattern
split_and_import(directory_loader)

NameError: name 'DirectoryLoader' is not defined

## Querying the vector store directly

In [16]:
query = "Where was Poseidonia and who renamed it to Paestum?" 
results = vector_db.similarity_search(query, 4) # four clostest results
print(results)

[Document(id='dc90ac3e-50af-4b6e-9648-794aff7dd10c', metadata={'source': 'https://en.wikipedia.org/wiki/Paestum', 'summary': 'Paestum ( PEST-əm, US also  PEE-stəm, Latin: [ˈpae̯stũː]) was a major ancient Greek city on the coast of the Tyrrhenian Sea, in Magna Graecia. The ruins of Paestum are famous for their three ancient Greek temples in the Doric order dating from about 550 to 450 BCE that are in an excellent state of preservation. The city walls and amphitheatre are largely intact, and the bottom of the walls of many other structures remain, as well as paved roads. The site is open to the public, and there is a modern national museum within it, which also contains the finds from the associated Greek site of Foce del Sele.\nPaestum was established around 600 BCE by settlers from Sybaris, a Greek colony in southern Italy, under the name of Poseidonia (Ancient Greek: Ποσειδωνία). The city thrived as a Greek settlement for about two centuries, witnessing the development of democracy. I

In [17]:
len(results)

4

## Asking a question through a RAG chain

In [14]:
from openai import OpenAI
import getpass

OPENAI_API_KEY = getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


In [16]:
from langchain_core.prompts import PromptTemplate

rag_prompt_template = """Use the following pieces of context
to answer the question at the end. 
If you don't know the answer, just say that you don't know, 
don't try to make up an answer.
Use three sentences maximum and keep the 
answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""

rag_prompt = PromptTemplate.from_template(rag_prompt_template)

In [17]:
retriever = vector_db.as_retriever()

In [18]:
from langchain_core.runnables import RunnablePassthrough
question_feeder = RunnablePassthrough()

In [19]:
from langchain_openai import ChatOpenAI

chatbot = ChatOpenAI(openai_api_key=OPENAI_API_KEY, 
                     model_name="gpt-5-nano")

In [24]:
# set up RAG chain

rag_chain = {"context": retriever, 
             "question": question_feeder}|rag_prompt|chatbot

In [25]:
def execute_chain(chain, question):
    answer = chain.invoke(question)
    return answer

In [26]:
question = """Where was Poseidonia and who renamed 
it to Paestum. Also tell me the source."""
answer = execute_chain(rag_chain, question)
print(answer.content)

- Poseidonia was a Greek settlement on the Tyrrhenian coast of southern Italy, at the Gulf of Taranto (Magna Graecia).
- It was renamed Paestum by the Romans after they took control (273 BCE).
- Source: Britannica, Paestum entry (Paestum-Britannica.docx).


In [27]:
print(answer)

content='- Poseidonia was a Greek settlement on the Tyrrhenian coast of southern Italy, at the Gulf of Taranto (Magna Graecia).\n- It was renamed Paestum by the Romans after they took control (273 BCE).\n- Source: Britannica, Paestum entry (Paestum-Britannica.docx).' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 1999, 'prompt_tokens': 1889, 'total_tokens': 3888, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 1920, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-5-nano-2025-08-07', 'system_fingerprint': None, 'id': 'chatcmpl-CGCV6EBbH29VRQ0siMkomrR0brmPt', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--78fad7ef-2b09-4b87-b206-f1b5e94216e8-0' usage_metadata={'input_tokens': 1889, 'output_tokens': 1999, 'total_tokens': 3888, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_t

In [35]:
question = """And then, what they do? 
Tell me only if you know. 
Also tell me the source""" 
answer = execute_chain(rag_chain, question)
print(answer.content)

Sirens are female humanlike beings with alluring voices, who appear in the Odyssey in a scene where Odysseus saves his crew's lives. Source: Siren (mythology), Wikipedia.


## Chatbot memory of message history

In [37]:
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables import RunnableLambda

rag_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant, world-class expert in Roman and Greek history, especially in towns located in southern Italy. Provide interesting insights on local history and recommend places to visit with knowledgeable and engaging answers. Answer all questions to the best of your ability, but only use what has been provided in the context. If you don't know, just say you don't know. Use three sentences maximum and keep the answer as concise as possible."),
        ("placeholder", "{chat_history_messages}"),
        ("assistant", "{retrieved_context}"),
        ("human", "{question}"),
    ]
)

retriever = vector_db.as_retriever()
question_feeder = RunnablePassthrough()
chatbot = ChatOpenAI(openai_api_key=OPENAI_API_KEY, 
                     model_name="gpt-5-nano")
chat_history_memory = ChatMessageHistory()

def get_messages(x):
    return chat_history_memory.messages

rag_chain = {
    "retrieved_context": retriever, 
    "question": question_feeder,
    "chat_history_messages": RunnableLambda(get_messages)
} | rag_prompt | chatbot

def execute_chain_with_memory(chain, question):
    chat_history_memory.add_user_message(question)
    answer = chain.invoke(question)
    chat_history_memory.add_ai_message(answer)
    print(f'Full chat message history: {chat_history_memory.messages}\n\n')                                      
    return answer

In [38]:
question = """Where was Poseidonia and who renamed 
it to Paestum? Also tell me the source."""
answer = execute_chain_with_memory(rag_chain, question)
print(answer.content)

Full chat message history: [HumanMessage(content='Where was Poseidonia and who renamed \nit to Paestum? Also tell me the source.', additional_kwargs={}, response_metadata={}), AIMessage(content='Poseidonia was a Greek city on the Tyrrhenian coast of southern Italy (Magna Graecia), founded around 600 BCE by settlers from Sybaris. It was renamed Paestum by the Romans after they took over (273 BCE). Source: Paestum article, Wikipedia (https://en.wikipedia.org/wiki/Paestum).', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 1811, 'prompt_tokens': 1951, 'total_tokens': 3762, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 1728, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-5-nano-2025-08-07', 'system_fingerprint': None, 'id': 'chatcmpl-CGCq8gylIKzPqi8dJGz73VXr8Mkcd', 'service_tier': 'default', 'finish_reason': 'stop', 'l

In [39]:
question = """And then what did they do? 
Also tell me the source""" 
answer = execute_chain_with_memory(rag_chain, question)
print(answer.content)

Full chat message history: [HumanMessage(content='Where was Poseidonia and who renamed \nit to Paestum? Also tell me the source.', additional_kwargs={}, response_metadata={}), AIMessage(content='Poseidonia was a Greek city on the Tyrrhenian coast of southern Italy (Magna Graecia), founded around 600 BCE by settlers from Sybaris. It was renamed Paestum by the Romans after they took over (273 BCE). Source: Paestum article, Wikipedia (https://en.wikipedia.org/wiki/Paestum).', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 1811, 'prompt_tokens': 1951, 'total_tokens': 3762, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 1728, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-5-nano-2025-08-07', 'system_fingerprint': None, 'id': 'chatcmpl-CGCq8gylIKzPqi8dJGz73VXr8Mkcd', 'service_tier': 'default', 'finish_reason': 'stop', 'l

## Tracing with LangSmith

Stop the notebook and open a new operative system shell (for example Windows command shell).

Configure the relevant environment variables in the OS shell, the rerun the previous Jupyter cells:
```bash
(env_ch07) C:\...\ch07>set LANGSMITH_TRACING=true
(env_ch07) C:\...\ch07>set LANGSMITH_ENDPOINT=https://api.smith.langchain.com
(env_ch07) C:\...\ch07>set LANGSMITH_PROJECT=Q & A chatbot
(env_ch07) C:\...\ch07>set LANGSMITH_API_KEY=<YOUR_LANGSMITH_API_KEY>
```

Then Restart the Jupyter notebook:
```bash
(env_ch07) C:\...\ch07>jupyter notebook 07-QA_across_documents.ipynb
```

Finally re-execute the whole Jupyter notebook cell by cell. All the activity will have not been logged through LangSmith.

## Setting up Q&A chain with RetrievalQA

In [49]:
from langchain.chains import RetrievalQA
from langsmith import Client, trace

langsmith_client = Client(api_key=OPENAI_API_KEY)

rag_chain = RetrievalQA.from_chain_type(llm=chatbot, 
                        chain_type="stuff", 
                        retriever=retriever, 
                        return_source_documents=False)

In [51]:
question = """Where was Poseidonia and 
who renamed it to Paestum. 
Also tell me the source."""
with trace("RetrievalQA", "chain", 
           project_name="Q&A chatbot", 
           inputs={"input": question}, 
           client=langsmith_client) as rt:
    answer = execute_chain(rag_chain, question)
    print(answer)
    rt.end(outputs={"output": answer})

{'query': 'Where was Poseidonia and \nwho renamed it to Paestum. \nAlso tell me the source.', 'result': '- Where Poseidonia was: On the Gulf of Taranto in southern Italy, at the site of the modern city of Paestum in Campania (near Capaccio-Paestum, in the Province of Salerno).\n\n- Who renamed it: The Lucanians renamed the city Paistos after conquering it; the Romans later gave the city its current name, Paestum.\n\n- Source: Britannica entry on Paestum (Paestum) and related UNESCO/WHC references. For example:\n  - Britannica: https://www.britannica.com/place/Paestum-Italy\n  - UNESCO World Heritage Centre: Archaeological Site of Paestum (Paestum, Italy)'}
