# Question transformations

# Splitting and ingesting the content of various URLs (across UK destinations)

### Preparing the Chroma DB collections

In [1]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import getpass

OPENAI_API_KEY = getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


In [2]:
uk_granular_collection = Chroma(
    collection_name="uk_granular",
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
)

uk_granular_collection.reset_collection() #A

### Splitting and ingesting HTML content with the HTMLSectionSplitter 

In [3]:
from langchain_text_splitters import HTMLSectionSplitter
from langchain_community.document_loaders import AsyncHtmlLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
uk_destinations = [
    "Cornwall", "North_Cornwall", "South_Cornwall", "West_Cornwall", 
    "Tintagel", "Bodmin", "Wadebridge", "Penzance", "Newquay",
    "St_Ives", "Port_Isaac", "Looe", "Polperro", "Porthleven",
    "East_Sussex", "Brighton", "Battle", "Hastings_(England)", 
    "Rye_(England)", "Seaford", "Ashdown_Forest"
]

wikivoyage_root_url = "https://en.wikivoyage.org/wiki"

In [5]:
uk_destination_urls = [f'{wikivoyage_root_url}/{d}' 
                       for d in uk_destinations]

In [6]:
headers_to_split_on = [("h1", "Header 1"),("h2", "Header 2")]
html_section_splitter = HTMLSectionSplitter(
    headers_to_split_on=headers_to_split_on)

In [7]:
def split_docs_into_granular_chunks(docs):
    all_chunks = []
    for doc in docs:
        html_string = doc.page_content #B
        temp_chunks = html_section_splitter.split_text(
            html_string) #C
        h2_temp_chunks = [chunk for chunk in 
                          temp_chunks if "Header 2" 
                          in chunk.metadata] #D
        all_chunks.extend(h2_temp_chunks) 

    return all_chunks

In [8]:
for destination_url in uk_destination_urls:
    html_loader = AsyncHtmlLoader(
        destination_url) #E
    docs =  html_loader.load() #F
    
    for doc in docs:
        print(doc.metadata)
        granular_chunks = split_docs_into_granular_chunks(docs)
        uk_granular_collection.add_documents(
            documents=granular_chunks)

#A In case it exists
#B Extract the HTML text from the document
#C Each chunk is a H1 or H2 HTML section
#D Only keep content associated with H2 sections        
#E Loader for one destination
#F Documents of one destination

Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.18it/s]


{'source': 'https://en.wikivoyage.org/wiki/Cornwall', 'title': 'Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.55it/s]


{'source': 'https://en.wikivoyage.org/wiki/North_Cornwall', 'title': 'North Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 19.41it/s]


{'source': 'https://en.wikivoyage.org/wiki/South_Cornwall', 'title': 'South Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.31it/s]


{'source': 'https://en.wikivoyage.org/wiki/West_Cornwall', 'title': 'West Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.21it/s]


{'source': 'https://en.wikivoyage.org/wiki/Tintagel', 'title': 'Tintagel – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 18.00it/s]


{'source': 'https://en.wikivoyage.org/wiki/Bodmin', 'title': 'Bodmin – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.48it/s]


{'source': 'https://en.wikivoyage.org/wiki/Wadebridge', 'title': 'Wadebridge – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.00it/s]


{'source': 'https://en.wikivoyage.org/wiki/Penzance', 'title': 'Penzance – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.96it/s]


{'source': 'https://en.wikivoyage.org/wiki/Newquay', 'title': 'Newquay – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 18.04it/s]


{'source': 'https://en.wikivoyage.org/wiki/St_Ives', 'title': 'St Ives – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.10it/s]


{'source': 'https://en.wikivoyage.org/wiki/Port_Isaac', 'title': 'Port Isaac – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 19.49it/s]


{'source': 'https://en.wikivoyage.org/wiki/Looe', 'title': 'Looe – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.43it/s]


{'source': 'https://en.wikivoyage.org/wiki/Polperro', 'title': 'Polperro – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  2.36it/s]


{'source': 'https://en.wikivoyage.org/wiki/Porthleven', 'title': 'Porthleven – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  1.33it/s]


{'source': 'https://en.wikivoyage.org/wiki/East_Sussex', 'title': 'East Sussex – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.91it/s]


{'source': 'https://en.wikivoyage.org/wiki/Brighton', 'title': 'Brighton – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.12it/s]


{'source': 'https://en.wikivoyage.org/wiki/Battle', 'title': 'Battle – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.57it/s]


{'source': 'https://en.wikivoyage.org/wiki/Hastings_(England)', 'title': 'Hastings (England) – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.72it/s]


{'source': 'https://en.wikivoyage.org/wiki/Rye_(England)', 'title': 'Rye (England) – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.80it/s]


{'source': 'https://en.wikivoyage.org/wiki/Seaford', 'title': 'Seaford – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.80it/s]


{'source': 'https://en.wikivoyage.org/wiki/Ashdown_Forest', 'title': 'Ashdown Forest – Travel guide at Wikivoyage', 'language': 'en'}


# Rewrite-retrieve-read

## Retrieving content with original user question

In [9]:
user_question = "Tell me some fun things I can enjoy in Cornwall"
initial_results = uk_granular_collection.similarity_search(
    query=user_question,k=4)
for doc in initial_results:
    print(doc)

page_content='Do 
 [ edit ] 
 
 Cornwall, in particular Newquay, is the UK's  surfing  capital, with equipment hire and surf schools present on many of the county's beaches, and events like the UK championships or Boardmasters festival. 
 The  South West Coast Path  runs along the coastline of Britain's south-west peninsula. The Cornish section is supposed to be the most scenic (unless you talk to someone in Devon, in which case the Devon part is most scenic). It is particularly scenic around Penwith and the Lizard. The trail takes walkers to busy towns, remote cliffs, beaches, heaths, farms and fishing villages. Walking along it is a great way to experience the region in all its variety. (Walking the entire path takes several weeks, walking on a choice part of it is easier.) 
 The  Camel Trail  is an  18-mile (29   km)  off-road cycle-track that follows the route of a former railway line along the scenic estuary of the river Camel from  Padstow  to Wenford Bridge via  Wadebridge  and 

In [10]:
# COMMENT: the retrieval from the vector store against the original question is bad

## Question rewrite

### Setting up the query rewriter chain

In [11]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [12]:
llm = ChatOpenAI(model="gpt-5-nano", openai_api_key=OPENAI_API_KEY)

In [13]:
rewriter_prompt_template = """
Generate search query for the Chroma DB vector store
from a user question, allowing for a more accurate 
response through semantic search.
Just return the revised Chroma DB query, with quotes around it. 

User question: {user_question}
Revised Chroma DB query:
"""

rewriter_prompt = ChatPromptTemplate.from_template(
    rewriter_prompt_template) 

In [14]:
rewriter_chain = rewriter_prompt | llm | StrOutputParser()

### Retrieving content with the rewritten query

In [15]:
user_question ="Tell me some fun things I can do in Cornwall"

search_query = rewriter_chain.invoke(
    {"user_question": user_question})
print(search_query)

"collection.query(query_texts=[\"Tell me some fun things I can do in Cornwall\"], n_results=5, include=[\"metadatas\",\"documents\"])"}


In [16]:
improved_results = uk_granular_collection.similarity_search(
    query=search_query,k=3)
for doc in improved_results:
    print(doc)

page_content='Go next 
 [ edit ] 
 
 
 
 
 
 
 
 
 
 
 This  region  travel guide to  West Cornwall   is an  outline   and may need more content. It has a  template , but there is not enough  information  present. If there are  Cities  and  Other destinations  listed, they may not all be at  usable  status or there may not be a  valid regional structure  and a "Get in" section describing all of the typical ways to get here. Please  plunge forward  and  help it grow ! 
 
 
 
 
 
 
 
 
 
 
  
NewPP limit report
Parsed by mw‐jobrunner.codfw.main‐67b664f5f4‐8cnqm
Cached time: 20251101154725
Cache expiry: 2592000
Reduced expiry: false
Complications: [show‐toc, use‐parsoid]
CPU time usage: 0.538 seconds
Real time usage: 0.680 seconds
Preprocessor visited node count: 1810/1000000
Revision size: 11200/2097152 bytes
Post‐expand include size: 25446/2097152 bytes
Template argument size: 688/2097152 bytes
Highest expansion depth: 9/100
Expensive parser function count: 24/500
Unstrip recursion dept

### Combining everything in a single RAG chain

In [17]:
from langchain_core.runnables import RunnablePassthrough

In [18]:
retriever = uk_granular_collection.as_retriever()

rag_prompt_template = """
Given a question and some context, answer the question.
If you do not know the answer, just say I do not know.

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(
    rag_prompt_template) 

rewrite_retrieve_read_rag_chain = (
    {
        "context": {"user_question": RunnablePassthrough()} 
            | rewriter_chain | retriever,#A
        "question": RunnablePassthrough(),#B
    }
    | rag_prompt
    | llm
    | StrOutputParser()
)
#A The context is returned by the retriver after feeding to it the rewritten query
#B This is the original user question

In [19]:
user_question = "Tell me some fun things I can do in Cornwall"

answer = rewrite_retrieve_read_rag_chain.invoke(user_question)
print(answer)

Here are some fun things to do in Cornwall based on the ideas in the text:

- Go surfing in Newquay, the UK’s surfing capital. You can rent gear and take lessons, and you might catch events like the UK championships or the Boardmasters festival.
- Walk or hike along the South West Coast Path, especially the scenic sections around Penwith and the Lizard.
- Cycle the Camel Trail (about 18 miles) along the Camel estuary from Padstow to Wenford Bridge via Wadebridge and Bodmin.
- Visit the Eden Project near St Austell to see the biomes and explore plant life and gardens.
- Take a family day out at Camel Creek Adventure Park in Wadebridge.
- Check out major events like the Royal Cornwall Show (early June) and the Cornish Film Festival (each November around Newquay).
- Experience traditional Cornwall celebrations such as Mummer’s Day in Padstow ( Boxing Day/New Year’s) and Obby ’Oss on May Day (Padstow), plus St Piran’s Day celebrations on March 5.
- Explore Cornwall’s towns and beaches, and

# Multiple query generation with MultiQueryRetriever

In [20]:
from langchain_classic.retrievers.multi_query import MultiQueryRetriever
from langchain_core.prompts import ChatPromptTemplate

from typing import List
from langchain_core.output_parsers import BaseOutputParser
from pydantic import BaseModel, Field

## Implementing a custom MultiQueryRetriver

### Setting up the prompt

In [21]:
multi_query_gen_prompt_template = """
You are an AI language model assistant. Your task 
is to generate five different versions of the given 
user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user 
question, your goal is to help the user overcome some of 
the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines.
Original question: {question}
"""

multi_query_gen_prompt = ChatPromptTemplate.from_template(
    multi_query_gen_prompt_template) 

### Setting up the multi-query parser

In [22]:
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Parse out a question from each output line."""

    def parse(self, text: str) -> List[str]:
        lines = text.strip().split("\n")
        return list(filter(None, lines))  

questions_parser = LineListOutputParser()

### Setting up the chain to generate multiple queries

In [23]:
llm = ChatOpenAI(model="gpt-5-nano", openai_api_key=OPENAI_API_KEY)

In [24]:
multi_query_gen_chain = multi_query_gen_prompt | llm | questions_parser

### Testing the Multi query gen chain

In [25]:
user_question = "Tell me some fun things I can do in Cornwall"

multiple_queries = multi_query_gen_chain.invoke(user_question)

In [26]:
multiple_queries

['What fun activities can I do in Cornwall?',
 'What are some family-friendly things to do in Cornwall?',
 'What outdoor and coastal adventures are available in Cornwall?',
 'What cultural, historical, or foodie experiences should I try in Cornwall?',
 'What budget-friendly or offbeat activities would you recommend in Cornwall?']

### Setting up the MultiQueryRetriever

In [27]:
basic_retriever = uk_granular_collection.as_retriever()

multi_query_retriever = MultiQueryRetriever(
    retriever=basic_retriever, llm_chain=multi_query_gen_chain, 
    parser_key="lines" #A
)  
#A this is the key for the parsed output

### Using the multi_query retriever

In [28]:
user_question = "Tell me some fun things I can do in Cornwall"

retrieved_docs = multi_query_retriever.invoke(user_question)

In [29]:
retrieved_docs

[Document(id='0f7ef21f-d0d2-489e-bff4-2b2c98c68007', metadata={'Header 2': 'Do'}, page_content='Do \n [ edit ] \n \n Cornwall, in particular Newquay, is the UK\'s  surfing  capital, with equipment hire and surf schools present on many of the county\'s beaches, and events like the UK championships or Boardmasters festival. \n The  South West Coast Path  runs along the coastline of Britain\'s south-west peninsula. The Cornish section is supposed to be the most scenic (unless you talk to someone in Devon, in which case the Devon part is most scenic). It is particularly scenic around Penwith and the Lizard. The trail takes walkers to busy towns, remote cliffs, beaches, heaths, farms and fishing villages. Walking along it is a great way to experience the region in all its variety. (Walking the entire path takes several weeks, walking on a choice part of it is easier.) \n The  Camel Trail  is an  18-mile (29 \xa0 km)  off-road cycle-track that follows the route of a former railway line along

## Using directly a standard MultiQueryRetriever 

In [30]:
std_multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=basic_retriever, llm=llm
)

In [31]:
user_question = "Tell me some fun things I can do in Cornwall"

retrieved_docs = std_multi_query_retriever.invoke(user_question)

In [32]:
retrieved_docs

[Document(id='ba29a058-99a3-4553-a8c0-f9fbf2926e5b', metadata={'Header 2': 'Do'}, page_content='Do \n [ edit ] \n \n The  South West Coast Path  runs along the coastline of Britain’s south-west peninsula. The Cornish section is supposed to be the most scenic (unless you talk to someone in Devon, in which case the Devon part is most scenic). It is particularly scenic around Penwith and the Lizard. The trail takes walkers to busy towns, remote cliffs, beaches, heaths, farms and fishing villages. Walking along it is a great way to experience the region in all its variety. (Walking the entire path takes several weeks, walking on a choice part of it is easier.) \n The  Camel Trail  is an  18-mile (29 \xa0 km)  off-road cycle-track following the scenic estuary of the river Camel from  Padstow  to Wenford Bridge via  Wadebridge  and  Bodmin . \n The  Cornish Film Festival  is held annually each November around  Newquay . \n Cornwall, in particular Newquay, is the UK\'s  surfing  capital, with

# Step-back question

### Setting up the chain to generate the step-back question

In [33]:
llm = ChatOpenAI(model="gpt-5", openai_api_key=OPENAI_API_KEY)

In [34]:
step_back_prompt_template = """
Generate a less specific question (aka Step-back question) 
for the following detailed question, so that a wider context 
can be retrieved.
Detailed question: {detailed_question}
Step-back question:
"""

step_back_prompt = ChatPromptTemplate.from_template(
    step_back_prompt_template) 

In [35]:
step_back_question_gen_chain = step_back_prompt | llm | StrOutputParser()

### Testing the step-back-question generation chain

In [36]:
user_question = "Can you give me some tips for a trip to Brighton?"

step_back_question = step_back_question_gen_chain.invoke(user_question)

In [37]:
step_back_question

'What general travel tips should I consider when planning a trip to a UK seaside city?'

### Incorporating step-back question generation chain into the RAG chain

In [38]:
retriever = uk_granular_collection.as_retriever()

rag_prompt_template = """
Given a question and some context, answer the question.
If you do not know the answer, just say I do not know.

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template) 

step_back_question_rag_chain = (
    {
        "context": {"detailed_question": RunnablePassthrough()} 
           | step_back_question_gen_chain | retriever,#A
        "question": RunnablePassthrough(),#B
    }
    | rag_prompt
    | llm
    | StrOutputParser()
)
#A The context is returned by the retriver after feeding to it the step-back question
#B This is the original user question

In [39]:
user_question = "Can you give me some tips for a trip to Brighton?"

answer = step_back_question_rag_chain.invoke(user_question)
print(answer)

Here are some practical tips for Brighton:

- Nights out: The city centre can get rowdy on weekends; West Street is best avoided after midnight. You’ll find a more civilised Friday/Saturday night in other parts of the city.
- Street awareness: You’ll see homeless people; most are harmless and may ask for money—if you decline, they usually move on. Drug users often gather around London Road and the Level; these areas are fine before dark.
- Areas to skip: Outskirts like Whitehawk and Moulsecoomb have a bad reputation, and most visitors don’t need to go there.
- LGBT visitors: Brighton & Hove is generally very welcoming. Same‑sex displays of affection are widely accepted, especially in Hove, The Lanes, North Laine, and Kemp Town/Kemp Town Village. Use normal caution in less central areas.
- Beach safety: Lifeguards patrol from late May to the first weekend in September—check beach signposts for covered areas. In a sea emergency, call 999 and ask for the Coastguard.
- Seagulls: They can b

# Hypotetical DocumentEmbeddings (HyDE)

### Setting up the chain to generate the hypotetical document associated to the user question

In [40]:
llm = ChatOpenAI(model="gpt-5-nano", openai_api_key=OPENAI_API_KEY)

In [41]:
hyde_prompt_template = """
Write one sentence that could answer the provided question. 
Do not add anything else.
Question: {question}
Sentence:
"""

hyde_prompt = ChatPromptTemplate.from_template(hyde_prompt_template)

In [42]:
hyde_chain = hyde_prompt | llm | StrOutputParser()

### Testing the hyde generation chain

In [43]:
user_question = "What are the best beaches in Cornwall?"

hypotetical_document = hyde_chain.invoke(user_question)

In [44]:
hypotetical_document

'Some of the best beaches in Cornwall include Porthcurno, Kynance Cove, Porthminster Beach, Godrevy Beach and St Ives’ Porthmeor Beach.'

### Incorporating hyde chain into the RAG chain

In [45]:
retriever = uk_granular_collection.as_retriever()

rag_prompt_template = """
Given a question and some context, answer the question.
Only use the provided context to answer the question.
If you do not know the answer, just say I do not know. 

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template) 

hyde_rag_chain = (
    {
        "context": {"question": RunnablePassthrough()} 
           | hyde_chain | retriever,#A
        "question": RunnablePassthrough(),#B
    }
    | rag_prompt
    | llm
    | StrOutputParser()
)
#A The context is returned by the retriver after feeding to it the hypotetical document
#B This is the original user question

In [46]:
user_question = "What are the best beaches in Cornwall?"

answer = hyde_rag_chain.invoke(user_question)
print(answer)

The beaches mentioned are: Bude, Polzeath, Watergate Bay, Perranporth, Porthtowan, Fistral Beach, Newquay, St Agnes, St Ives, Gyllyngvase Beach (Falmouth), and Praa Sands.
