# Question transformations

# Splitting and ingesting the content of various URLs (across UK destinations)

### Preparing the Chroma DB collections

In [1]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import getpass

# Ask for the API key interactively so it is never hardcoded in the notebook.
OPENAI_API_KEY = getpass.getpass(prompt='Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


In [2]:
# Chroma collection that stores the H2-level chunks, embedded with OpenAI embeddings.
uk_granular_collection = Chroma(
    collection_name="uk_granular",
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
)

# Reset the "uk_granular" collection before ingesting (see annotation #A).
uk_granular_collection.reset_collection() #A

### Splitting and ingesting HTML content with the HTMLSectionSplitter 

In [3]:
from langchain_text_splitters import HTMLSectionSplitter
from langchain_community.document_loaders import AsyncHtmlLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
# Wikivoyage page slugs; each one is appended to wikivoyage_root_url to form a page URL.
uk_destinations = [
    "Cornwall",
    "North_Cornwall",
    "South_Cornwall",
    "West_Cornwall",
    "Tintagel",
    "Bodmin",
    "Wadebridge",
    "Penzance",
    "Newquay",
    "St_Ives",
    "Port_Isaac",
    "Looe",
    "Polperro",
    "Porthleven",
    "East_Sussex",
    "Brighton",
    "Battle",
    "Hastings_(England)",
    "Rye_(England)",
    "Seaford",
    "Ashdown_Forest",
]

# Base URL of the English Wikivoyage wiki (no trailing slash).
wikivoyage_root_url = "https://en.wikivoyage.org/wiki"

In [5]:
# One page URL per destination slug: "<root>/<slug>".
uk_destination_urls = [
    "/".join((wikivoyage_root_url, destination))
    for destination in uk_destinations
]

In [6]:
# (HTML tag, metadata key) pairs: a section's heading text is stored in the
# chunk metadata under the given key (e.g. {'Header 2': 'Do'}).
headers_to_split_on = [("h1", "Header 1"),("h2", "Header 2")]
# Splitter that cuts an HTML page into sections at the header tags listed above.
html_section_splitter = HTMLSectionSplitter(
    headers_to_split_on=headers_to_split_on)

In [7]:
def split_docs_into_granular_chunks(docs):
    """Split loaded HTML documents into H2-level section chunks.

    Each document's ``page_content`` is split with the module-level
    ``html_section_splitter``; only the resulting sections whose metadata
    contains a "Header 2" key are kept.

    Args:
        docs: Iterable of loaded documents exposing ``page_content`` (HTML text).

    Returns:
        A flat list with the kept sections of all documents, in document order.
    """
    return [
        section
        for document in docs
        for section in html_section_splitter.split_text(
            document.page_content) #B #C
        if "Header 2" in section.metadata #D
    ]

In [8]:
# Load, split and ingest one destination page at a time.
for destination_url in uk_destination_urls:
    html_loader = AsyncHtmlLoader(
        destination_url) #E
    docs = html_loader.load() #F

    # Show which page(s) were fetched for this destination.
    for doc in docs:
        print(doc.metadata)

    # Split and ingest once per destination. Doing this inside the loop above
    # would re-split the whole `docs` list for every document and store each
    # chunk len(docs) times.
    granular_chunks = split_docs_into_granular_chunks(docs)

    # A page without any H2 section yields no chunks; skip it rather than
    # sending an empty batch to the vector store.
    if granular_chunks:
        uk_granular_collection.add_documents(
            documents=granular_chunks)

#A In case it exists
#B Extract the HTML text from the document
#C Each chunk is a H1 or H2 HTML section
#D Only keep content associated with H2 sections        
#E Loader for one destination
#F Documents of one destination

Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.53it/s]


{'source': 'https://en.wikivoyage.org/wiki/Cornwall', 'title': 'Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.91it/s]


{'source': 'https://en.wikivoyage.org/wiki/North_Cornwall', 'title': 'North Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 20.73it/s]


{'source': 'https://en.wikivoyage.org/wiki/South_Cornwall', 'title': 'South Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 20.43it/s]


{'source': 'https://en.wikivoyage.org/wiki/West_Cornwall', 'title': 'West Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.37it/s]


{'source': 'https://en.wikivoyage.org/wiki/Tintagel', 'title': 'Tintagel – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.32it/s]


{'source': 'https://en.wikivoyage.org/wiki/Bodmin', 'title': 'Bodmin – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 20.19it/s]


{'source': 'https://en.wikivoyage.org/wiki/Wadebridge', 'title': 'Wadebridge – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.18it/s]


{'source': 'https://en.wikivoyage.org/wiki/Penzance', 'title': 'Penzance – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 19.21it/s]


{'source': 'https://en.wikivoyage.org/wiki/Newquay', 'title': 'Newquay – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 20.29it/s]


{'source': 'https://en.wikivoyage.org/wiki/St_Ives', 'title': 'St Ives – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 22.24it/s]


{'source': 'https://en.wikivoyage.org/wiki/Port_Isaac', 'title': 'Port Isaac – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.22it/s]


{'source': 'https://en.wikivoyage.org/wiki/Looe', 'title': 'Looe – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 20.50it/s]


{'source': 'https://en.wikivoyage.org/wiki/Polperro', 'title': 'Polperro – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  1.61it/s]


{'source': 'https://en.wikivoyage.org/wiki/Porthleven', 'title': 'Porthleven – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  1.11it/s]


{'source': 'https://en.wikivoyage.org/wiki/East_Sussex', 'title': 'East Sussex – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.90it/s]


{'source': 'https://en.wikivoyage.org/wiki/Brighton', 'title': 'Brighton – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 18.60it/s]


{'source': 'https://en.wikivoyage.org/wiki/Battle', 'title': 'Battle – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 19.80it/s]


{'source': 'https://en.wikivoyage.org/wiki/Hastings_(England)', 'title': 'Hastings (England) – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 18.48it/s]


{'source': 'https://en.wikivoyage.org/wiki/Rye_(England)', 'title': 'Rye (England) – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 22.17it/s]


{'source': 'https://en.wikivoyage.org/wiki/Seaford', 'title': 'Seaford – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.36it/s]


{'source': 'https://en.wikivoyage.org/wiki/Ashdown_Forest', 'title': 'Ashdown Forest – Travel guide at Wikivoyage', 'language': 'en'}


# Rewrite-retrieve-read

## Retrieving content with original user question

In [9]:
# Baseline: search the vector store with the user's question exactly as asked.
user_question = "Tell me some fun things I can enjoy in Cornwall"
initial_results = uk_granular_collection.similarity_search(
    query=user_question,k=4)
# NOTE(review): this baseline uses k=4 and the wording "enjoy", while the
# rewritten-query search later in the notebook uses k=3 and "do" — confirm
# whether the two searches are meant to be compared like-for-like.
for doc in initial_results:
    print(doc)

page_content='Do 
 [ edit ] 
 
 Cornwall, in particular Newquay, is the UK's  surfing  capital, with equipment hire and surf schools present on many of the county's beaches, and events like the UK championships or Boardmasters festival. 
 The  South West Coast Path  runs along the coastline of Britain's south-west peninsula. The Cornish section is supposed to be the most scenic (unless you talk to someone in Devon, in which case the Devon part is most scenic). It is particularly scenic around Penwith and the Lizard. The trail takes walkers to busy towns, remote cliffs, beaches, heaths, farms and fishing villages. Walking along it is a great way to experience the region in all its variety. (Walking the entire path takes several weeks, walking on a choice part of it is easier.) 
 The  Camel Trail  is an  18-mile (29   km)  off-road cycle-track that follows the route of a former railway line along the scenic estuary of the river Camel from  Padstow  to Wenford Bridge via  Wadebridge  and 

In [10]:
# COMMENT: the retrieval from the vector store against the original question is bad

## Question rewrite

### Setting up the query rewriter chain

In [12]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [13]:
# Chat model used by the query rewriter chain and by the answer step of the RAG chain.
llm = ChatOpenAI(model="gpt-5-nano", openai_api_key=OPENAI_API_KEY)

In [14]:
# Prompt asking the model to turn a user question into a vector-store search query.
# Expects a single variable: {user_question}.
# NOTE(review): the prompt asks for quotes around the query, so the returned
# string contains literal quote characters; that string is later passed
# unchanged to similarity_search — confirm the quotes are intended.
rewriter_prompt_template = """
Generate search query for the Chroma DB vector store
from a user question, allowing for a more accurate 
response through semantic search.
Just return the revised Chroma DB query, with quotes around it. 

User question: {user_question}
Revised Chroma DB query:
"""

rewriter_prompt = ChatPromptTemplate.from_template(
    rewriter_prompt_template) 

In [15]:
# prompt -> chat model -> plain string: maps {"user_question": ...} to the rewritten query text.
rewriter_chain = rewriter_prompt | llm | StrOutputParser()

### Retrieving content with the rewritten query

In [17]:
user_question ="Tell me some fun things I can do in Cornwall"

# Ask the LLM to rewrite the question into a search-oriented query.
search_query = rewriter_chain.invoke(
    {"user_question": user_question})
print(search_query)

"things to do in Cornwall, including fun activities, beaches, outdoor adventures, family-friendly options, and local attractions"


In [18]:
# Search again, this time with the rewritten query instead of the raw question.
improved_results = uk_granular_collection.similarity_search(
    query=search_query,k=3)
for doc in improved_results:
    print(doc)

page_content='Contents 
 
 
 
 
 
 
 
 1   Towns and villages 
 
 
 
 
 
 
 2   Other destinations 
 
 
 
 
 
 
 3   Understand 
 
 
 
 
 
 
 4   Get in 
 
 
 
 
 4.1   By train 
 
 
 
 
 
 
 4.2   By car 
 
 
 
 
 
 
 4.3   By plane 
 
 
 
 
 
 
 
 
 5   Get around 
 
 
 
 
 5.1   By bus 
 
 
 
 
 
 
 5.2   By train 
 
 
 
 
 
 
 
 
 6   See 
 
 
 
 
 6.1   National Trust properties 
 
 
 
 
 
 
 
 
 7   Do 
 
 
 
 
 7.1   Festivals 
 
 
 
 
 
 
 
 
 8   Drink 
 
 
 
 
 
 
 9   Stay safe 
 
 
 
 
 
 
 10   Go next 
 
 
 
 
 
 
 
 
 
 
 
 
 
 North Cornwall  is in  Cornwall . It includes much of the Cornish coast along the Celtic Sea and some top surfing areas.' metadata={'Header 2': 'Contents'}
page_content='Do 
 [ edit ] 
 
 Cornwall, in particular Newquay, is the UK's  surfing  capital, with equipment hire and surf schools present on many of the county's beaches, and events like the UK championships or Boardmasters festival. 
 The  South West Coast Path  runs along the coastline of 

### Combining everything in a single RAG chain

In [20]:
from langchain_core.runnables import RunnablePassthrough

In [21]:
# Retriever over the ingested collection, created with as_retriever() defaults.
retriever = uk_granular_collection.as_retriever()

# Answering prompt; expects {context} (retrieved documents) and {question}.
rag_prompt_template = """
Given a question and some context, answer the question.
If you do not know the answer, just say I do not know.

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(
    rag_prompt_template) 

# The chain input (the user question) feeds two branches: one is rewritten and
# used for retrieval, the other is passed through unchanged to the prompt.
rewrite_retrieve_read_rag_chain = (
    {
        "context": {"user_question": RunnablePassthrough()} 
            | rewriter_chain | retriever,#A
        "question": RunnablePassthrough(),#B
    }
    | rag_prompt
    | llm
    | StrOutputParser()
)
#A The context is returned by the retriever after feeding it the rewritten query
#B This is the original user question

In [22]:
user_question = "Tell me some fun things I can do in Cornwall"

# Run the full rewrite-retrieve-read chain on the original question.
answer = rewrite_retrieve_read_rag_chain.invoke(user_question)
print(answer)

Here are some fun things to do in Cornwall based on the spots and events mentioned in the context:

- Surfing and beaches in Newquay: rent equipment, take a lesson, and maybe catch events like the UK championships or the Boardmasters festival.

- Walk part or all of the South West Coast Path: enjoy the Cornwall section’s scenery, especially around Penwith and the Lizard.

- Cycle the Camel Trail: ride the 18-mile off-road path from Padstow to Wenford Bridge via Wadebridge and Bodmin.

- Visit family-friendly attractions: Camel Creek Adventure Park near Wadebridge for a full day of fun rides and shows.

- Enjoy local fairs and festivals: 
  - Royal Cornwall Show (early June) – a major agricultural show and tourist draw.
  - Cornish Film Festival (around Newquay) – film lovers’ event.
  - Boardmasters festival (surf/music event) in Cornwall.

- Experience Cornish traditions: 
  - Mummer’s Day in Padstow ( Boxing Day and New Year’s Day).
  - Obby ’Oss on May Day (Padstow) with marching ba

# Multiple query generation with MultiQueryRetriever

In [25]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.prompts import ChatPromptTemplate

from typing import List
from langchain_core.output_parsers import BaseOutputParser
from pydantic import BaseModel, Field

## Implementing a custom MultiQueryRetriever

### Setting up the prompt

In [26]:
# Prompt asking for five rephrasings of the question, one per line.
# Expects a single variable: {question}.
multi_query_gen_prompt_template = """
You are an AI language model assistant. Your task 
is to generate five different versions of the given 
user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user 
question, your goal is to help the user overcome some of 
the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines.
Original question: {question}
"""

multi_query_gen_prompt = ChatPromptTemplate.from_template(
    multi_query_gen_prompt_template) 

### Setting up the multi-query parser

In [27]:
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Parse LLM output into a list of questions, one per non-blank line."""

    def parse(self, text: str) -> List[str]:
        """Split ``text`` on newlines and return the non-blank lines.

        Each line is stripped of surrounding whitespace (including a stray
        carriage return) and lines that end up empty are dropped, so
        whitespace-only lines are never returned as queries.

        Args:
            text: Raw model output with one question per line.

        Returns:
            The cleaned questions in their original order; an empty list when
            ``text`` contains no non-blank line.
        """
        stripped_lines = (line.strip() for line in text.split("\n"))
        return [line for line in stripped_lines if line]

questions_parser = LineListOutputParser()

### Setting up the chain to generate multiple queries

In [28]:
# NOTE(review): same model settings as the `llm` created earlier for the
# rewriter chain, so this rebinding looks redundant — confirm before removing.
llm = ChatOpenAI(model="gpt-5-nano", openai_api_key=OPENAI_API_KEY)

In [29]:
# prompt -> chat model -> list of lines: produces the alternative questions as a list of strings.
multi_query_gen_chain = multi_query_gen_prompt | llm | questions_parser

### Testing the Multi query gen chain

In [30]:
user_question = "Tell me some fun things I can do in Cornwall"

# The chain ends with questions_parser, so the result is a list of query strings.
multiple_queries = multi_query_gen_chain.invoke(user_question)

In [31]:
multiple_queries

['What are some fun activities to do in Cornwall?',
 'What entertaining things should I do in Cornwall, including beaches, sights, and towns?',
 "I'm planning a trip to Cornwall—what enjoyable experiences and activities would you recommend?",
 'What are the top unique or off-the-beaten-path things to do in Cornwall, including outdoor adventures and local culture?',
 'Could you suggest family-friendly or kid-friendly activities and day trips in Cornwall?']

### Setting up the MultiQueryRetriever

In [32]:
# Plain retriever that the multi-query retriever runs once per generated query.
basic_retriever = uk_granular_collection.as_retriever()

# Custom multi-query retriever. multi_query_gen_chain already ends with
# questions_parser, so it hands back the list of generated queries directly.
# The deprecated `parser_key` argument is no longer used by
# MultiQueryRetriever, so it is not passed.
multi_query_retriever = MultiQueryRetriever(
    retriever=basic_retriever, llm_chain=multi_query_gen_chain
)

### Using the multi_query retriever

In [33]:
user_question = "Tell me some fun things I can do in Cornwall"

# Retrieve documents through the custom multi-query retriever.
retrieved_docs = multi_query_retriever.invoke(user_question)

In [34]:
retrieved_docs

[Document(id='4c723a8f-45b5-480e-b6a6-cf391781054c', metadata={'Header 2': 'Do'}, page_content='Do \n [ edit ] \n \n Cornwall, in particular Newquay, is the UK\'s  surfing  capital, with equipment hire and surf schools present on many of the county\'s beaches, and events like the UK championships or Boardmasters festival. \n The  South West Coast Path  runs along the coastline of Britain\'s south-west peninsula. The Cornish section is supposed to be the most scenic (unless you talk to someone in Devon, in which case the Devon part is most scenic). It is particularly scenic around Penwith and the Lizard. The trail takes walkers to busy towns, remote cliffs, beaches, heaths, farms and fishing villages. Walking along it is a great way to experience the region in all its variety. (Walking the entire path takes several weeks, walking on a choice part of it is easier.) \n The  Camel Trail  is an  18-mile (29 \xa0 km)  off-road cycle-track that follows the route of a former railway line along

## Using a standard MultiQueryRetriever directly

In [35]:
# Standard variant: from_llm is given only the base retriever and the model,
# with no custom prompt or parser.
std_multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=basic_retriever, llm=llm
)

In [36]:
user_question = "Tell me some fun things I can do in Cornwall"

# Query the standard retriever built with from_llm. The original cell invoked
# the custom multi_query_retriever here, so the standard one was never exercised.
retrieved_docs = std_multi_query_retriever.invoke(user_question)

In [37]:
retrieved_docs

[Document(id='4c723a8f-45b5-480e-b6a6-cf391781054c', metadata={'Header 2': 'Do'}, page_content='Do \n [ edit ] \n \n Cornwall, in particular Newquay, is the UK\'s  surfing  capital, with equipment hire and surf schools present on many of the county\'s beaches, and events like the UK championships or Boardmasters festival. \n The  South West Coast Path  runs along the coastline of Britain\'s south-west peninsula. The Cornish section is supposed to be the most scenic (unless you talk to someone in Devon, in which case the Devon part is most scenic). It is particularly scenic around Penwith and the Lizard. The trail takes walkers to busy towns, remote cliffs, beaches, heaths, farms and fishing villages. Walking along it is a great way to experience the region in all its variety. (Walking the entire path takes several weeks, walking on a choice part of it is easier.) \n The  Camel Trail  is an  18-mile (29 \xa0 km)  off-road cycle-track that follows the route of a former railway line along

# Step-back question

### Setting up the chain to generate the step-back question

In [38]:
# Rebinds `llm` to the gpt-5 model for the step-back section.
llm = ChatOpenAI(model="gpt-5", openai_api_key=OPENAI_API_KEY)

In [41]:
# Prompt asking the model for a broader ("step-back") version of a detailed question.
# Expects a single variable: {detailed_question}.
step_back_prompt_template = """
Generate a less specific question (aka Step-back question) 
for the following detailed question, so that a wider context 
can be retrieved.
Detailed question: {detailed_question}
Step-back question:
"""

step_back_prompt = ChatPromptTemplate.from_template(
    step_back_prompt_template) 

In [42]:
# prompt -> chat model -> plain string: returns the step-back question as text.
step_back_question_gen_chain = step_back_prompt | llm | StrOutputParser()

### Testing the step-back-question generation chain

In [43]:
user_question = "Can you give me some tips for a trip to Brighton?"

# Generate the broader question for the detailed user question.
step_back_question = step_back_question_gen_chain.invoke(user_question)

In [44]:
step_back_question

'Step-back question: What are general tips for planning a short trip to a UK coastal city?'

### Incorporating step-back question generation chain into the RAG chain

In [45]:
# Retriever over the ingested collection, created with as_retriever() defaults.
retriever = uk_granular_collection.as_retriever()

# Answering prompt; expects {context} (retrieved documents) and {question}.
rag_prompt_template = """
Given a question and some context, answer the question.
If you do not know the answer, just say I do not know.

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template) 

# The chain input (the user question) feeds two branches: one is turned into a
# step-back question and used for retrieval, the other is passed through unchanged.
# NOTE(review): the test output of the generation chain starts with the literal
# prefix "Step-back question:", and that text is sent to the retriever as-is —
# confirm this is acceptable.
step_back_question_rag_chain = (
    {
        "context": {"detailed_question": RunnablePassthrough()} 
           | step_back_question_gen_chain | retriever,#A
        "question": RunnablePassthrough(),#B
    }
    | rag_prompt
    | llm
    | StrOutputParser()
)
#A The context is returned by the retriever after feeding it the step-back question
#B This is the original user question

In [46]:
user_question = "Can you give me some tips for a trip to Brighton?"

# Run the full step-back RAG chain on the original question.
answer = step_back_question_rag_chain.invoke(user_question)
print(answer)

From the provided info:

- Accommodation is plentiful in Brighton thanks to its seaside resort heritage, so you’ll find lots of hotels.
- If you’re staying outside the centre, it’s best to find a hotel and book in advance, as options are fewer but generally good quality.
- Major hotel chains are increasingly available.
- If Brighton is booked up, nearby towns with year-round accommodation include Eastbourne, Hastings, Lewes, Rye, and Seaford.

Beyond this, I do not know.


# Hypothetical Document Embeddings (HyDE)

### Setting up the chain to generate the hypothetical document associated with the user question

In [47]:
# Rebinds `llm` back to gpt-5-nano for the HyDE section.
llm = ChatOpenAI(model="gpt-5-nano", openai_api_key=OPENAI_API_KEY)

In [48]:
# Prompt asking the model for a one-sentence hypothetical answer to the question.
# Expects a single variable: {question}.
hyde_prompt_template = """
Write one sentence that could answer the provided question. 
Do not add anything else.
Question: {question}
Sentence:
"""

hyde_prompt = ChatPromptTemplate.from_template(hyde_prompt_template)

In [50]:
# prompt -> chat model -> plain string: returns the hypothetical answer sentence as text.
hyde_chain = hyde_prompt | llm | StrOutputParser()

### Testing the hyde generation chain

In [51]:
user_question = "What are the best beaches in Cornwall?"

# Generate the hypothetical answer sentence for the question.
# NOTE(review): "hypotetical" is a misspelling of "hypothetical"; renaming the
# variable also requires updating the cell that displays it.
hypotetical_document = hyde_chain.invoke(user_question)

In [52]:
hypotetical_document

'Some of the best beaches in Cornwall include Porthcurno, Fistral Beach in Newquay, Kynance Cove, Perranporth, Watergate Bay, Porthmeor Beach in St Ives, and Holywell Bay.'

### Incorporating hyde chain into the RAG chain

In [53]:
# Retriever over the ingested collection, created with as_retriever() defaults.
retriever = uk_granular_collection.as_retriever()

# Answering prompt; expects {context} (retrieved documents) and {question}.
# Unlike the earlier RAG prompts, it tells the model to use only the provided context.
rag_prompt_template = """
Given a question and some context, answer the question.
Only use the provided context to answer the question.
If you do not know the answer, just say I do not know. 

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template) 

# The chain input (the user question) feeds two branches: one is turned into a
# hypothetical answer and used for retrieval, the other is passed through unchanged.
hyde_rag_chain = (
    {
        "context": {"question": RunnablePassthrough()} 
           | hyde_chain | retriever,#A
        "question": RunnablePassthrough(),#B
    }
    | rag_prompt
    | llm
    | StrOutputParser()
)
#A The context is returned by the retriever after feeding it the hypothetical document
#B This is the original user question

In [54]:
user_question = "What are the best beaches in Cornwall?"

# Run the full HyDE RAG chain on the original question.
answer = hyde_rag_chain.invoke(user_question)
print(answer)

The provided context mentions several notable beaches in Cornwall. Examples include:

- North Cornwall / north coast: Bude, Polzeath, Watergate Bay, Perranporth, Porthtowan, Fistral Beach, Newquay, St Agnes, St Ives
- South Cornwall: Gyllyngvase Beach (Falmouth), Praa Sands
- Newquay area beaches (as listed): Crantock Beach, Fistral Beach, Great Western, Harbour, Holywell Bay, Lusty Glaze Beach, Porth Joke (Polly Joke), Porth, Tolcarne Beach, Towan Beach, Whipsiderry, Watergate Bay

These are the beaches named in the provided context.
