# Question transformations

# Splitting and ingesting the content of various URLs (across UK destinations)

### Preparing the Chroma DB collections

In [1]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import getpass

import os

# Reuse a key already exported as OPENAI_API_KEY so the notebook can also run
# unattended (e.g. headless execution, where an interactive prompt is not
# available); otherwise fall back to a prompt that does not echo the input.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


In [2]:
# Vector store for the section-level chunks; texts are embedded with OpenAI
# embeddings using the key collected above.
uk_granular_collection = Chroma(
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
    collection_name="uk_granular",
)

# Start from an empty collection so re-running the notebook does not pile up
# duplicates from a previous run.
uk_granular_collection.reset_collection() #A

### Splitting and ingesting HTML content with the HTMLSectionSplitter 

In [3]:
from langchain_text_splitters import HTMLSectionSplitter
from langchain_community.document_loaders import AsyncHtmlLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
# Wikivoyage page names (URL path segments) to ingest, in ingestion order.
uk_destinations = [
    "Cornwall",
    "North_Cornwall",
    "South_Cornwall",
    "West_Cornwall",
    "Tintagel",
    "Bodmin",
    "Wadebridge",
    "Penzance",
    "Newquay",
    "St_Ives",
    "Port_Isaac",
    "Looe",
    "Polperro",
    "Porthleven",
    "East_Sussex",
    "Brighton",
    "Battle",
    "Hastings_(England)",
    "Rye_(England)",
    "Seaford",
    "Ashdown_Forest",
]

# Base URL that each page name is appended to.
wikivoyage_root_url = "https://en.wikivoyage.org/wiki"

In [5]:
# Full article URL for every destination, in the same order as uk_destinations.
uk_destination_urls = [
    "/".join((wikivoyage_root_url, destination))
    for destination in uk_destinations
]

In [6]:
# (HTML tag, metadata key) pairs: the page is cut into sections at <h1> and
# <h2> headings, and each chunk's metadata records its heading under that key.
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
]
html_section_splitter = HTMLSectionSplitter(
    headers_to_split_on=headers_to_split_on,
)

In [7]:
def split_docs_into_granular_chunks(docs):
    """Split loaded HTML documents into H2-level section chunks.

    Args:
        docs: Iterable of loaded documents whose ``page_content`` holds raw HTML.

    Returns:
        A flat list with, for every document in order, the chunks produced by
        ``html_section_splitter`` whose metadata contains a ``"Header 2"`` key.
    """
    granular_chunks = []
    for html_doc in docs:
        sections = html_section_splitter.split_text(html_doc.page_content) #B #C
        granular_chunks += [
            section
            for section in sections
            if "Header 2" in section.metadata #D
        ]
    return granular_chunks

In [8]:
# Load each destination page, split it into H2 sections and store the chunks.
for destination_url in uk_destination_urls:
    html_loader = AsyncHtmlLoader(destination_url) #E
    docs = html_loader.load() #F

    # Log what was fetched (source URL, title, language).
    for doc in docs:
        print(doc.metadata)

    # Split and ingest ONCE per destination. Doing this inside the loop above
    # would re-split and re-add the whole `docs` list for every document,
    # storing duplicate chunks whenever a load returns more than one document.
    granular_chunks = split_docs_into_granular_chunks(docs)
    if granular_chunks:
        # Skip pages that produced no H2 sections: there is nothing to embed
        # or store for them.
        uk_granular_collection.add_documents(documents=granular_chunks)

#A In case it exists
#B Extract the HTML text from the document
#C Each chunk is an H1 or H2 HTML section
#D Only keep content associated with H2 sections        
#E Loader for one destination
#F Documents of one destination

Fetching pages: 100%|####################################################################| 1/1 [00:01<00:00,  1.88s/it]


{'source': 'https://en.wikivoyage.org/wiki/Cornwall', 'title': 'Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  1.02it/s]


{'source': 'https://en.wikivoyage.org/wiki/North_Cornwall', 'title': 'North Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:01<00:00,  1.26s/it]


{'source': 'https://en.wikivoyage.org/wiki/South_Cornwall', 'title': 'South Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:02<00:00,  2.29s/it]


{'source': 'https://en.wikivoyage.org/wiki/West_Cornwall', 'title': 'West Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:01<00:00,  1.81s/it]


{'source': 'https://en.wikivoyage.org/wiki/Tintagel', 'title': 'Tintagel – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:01<00:00,  1.63s/it]


{'source': 'https://en.wikivoyage.org/wiki/Bodmin', 'title': 'Bodmin – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  1.00it/s]


{'source': 'https://en.wikivoyage.org/wiki/Wadebridge', 'title': 'Wadebridge – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  5.16it/s]


{'source': 'https://en.wikivoyage.org/wiki/Penzance', 'title': 'Penzance – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  1.27it/s]


{'source': 'https://en.wikivoyage.org/wiki/Newquay', 'title': 'Newquay – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  2.02it/s]


{'source': 'https://en.wikivoyage.org/wiki/St_Ives', 'title': 'St Ives – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  1.60it/s]


{'source': 'https://en.wikivoyage.org/wiki/Port_Isaac', 'title': 'Port Isaac – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  2.35it/s]


{'source': 'https://en.wikivoyage.org/wiki/Looe', 'title': 'Looe – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  4.72it/s]


{'source': 'https://en.wikivoyage.org/wiki/Polperro', 'title': 'Polperro – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  1.42it/s]


{'source': 'https://en.wikivoyage.org/wiki/Porthleven', 'title': 'Porthleven – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  1.66it/s]


{'source': 'https://en.wikivoyage.org/wiki/East_Sussex', 'title': 'East Sussex – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  3.51it/s]


{'source': 'https://en.wikivoyage.org/wiki/Brighton', 'title': 'Brighton – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 10.42it/s]


{'source': 'https://en.wikivoyage.org/wiki/Battle', 'title': 'Battle – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  1.51it/s]


{'source': 'https://en.wikivoyage.org/wiki/Hastings_(England)', 'title': 'Hastings (England) – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.35it/s]


{'source': 'https://en.wikivoyage.org/wiki/Rye_(England)', 'title': 'Rye (England) – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  2.01it/s]


{'source': 'https://en.wikivoyage.org/wiki/Seaford', 'title': 'Seaford – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  1.07it/s]


{'source': 'https://en.wikivoyage.org/wiki/Ashdown_Forest', 'title': 'Ashdown Forest – Travel guide at Wikivoyage', 'language': 'en'}


# Rewrite-retrieve-read

## Retrieving content with original user question

In [9]:
# Baseline: search the vector store with the user's own wording, unmodified.
# NOTE(review): this baseline uses k=4 and a slightly different question than
# the rewritten-query search later in the notebook (k=3, "...I can do in
# Cornwall"); align them if a like-for-like comparison is intended.
user_question = "Tell me some fun things I can enjoy in Cornwall"
initial_results = uk_granular_collection.similarity_search(
    query=user_question,
    k=4,
)
for result_doc in initial_results:
    print(result_doc)

page_content='Do 
 [ edit ] 
 
 Cornwall, in particular Newquay, is the UK's  surfing  capital, with equipment hire and surf schools present on many of the county's beaches, and events like the UK championships or Boardmasters festival. 
 The  South West Coast Path  runs along the coastline of Britain's south-west peninsula. The Cornish section is supposed to be the most scenic (unless you talk to someone in Devon, in which case the Devon part is most scenic). It is particularly scenic around Penwith and the Lizard. The trail takes walkers to busy towns, remote cliffs, beaches, heaths, farms and fishing villages. Walking along it is a great way to experience the region in all its variety. (Walking the entire path takes several weeks, walking on a choice part of it is easier.) 
 The  Camel Trail  is an  18-mile (29   km)  off-road cycle-track that follows the route of a former railway line along the scenic estuary of the river Camel from  Padstow  to Wenford Bridge via  Wadebridge  and 

In [10]:
# COMMENT: the retrieval from the vector store against the original question is bad

## Question rewrite

### Setting up the query rewriter chain

In [11]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [12]:
# Chat model used by the chains defined below (query rewriting and answering).
llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=OPENAI_API_KEY)

In [13]:
# Prompt that asks the model to turn a user question into a search query for
# the vector store. It takes a single input variable, `user_question`.
# NOTE(review): the prompt asks for quotes around the query, and the returned
# string is passed to the similarity search as-is, so the quote characters are
# embedded together with the query text — confirm this is intended.
rewriter_prompt_template = """
Generate search query for the Chroma DB vector store from a user question, allowing for a more accurate response through semantic search.
Just return the revised Chroma DB query, with quotes around it. 

User question: {user_question}
Revised Chroma DB query:
"""

rewriter_prompt = ChatPromptTemplate.from_template(rewriter_prompt_template) 

In [14]:
# Pipeline: fill the rewriter prompt -> call the chat model -> return plain text.
rewriter_chain = rewriter_prompt | llm | StrOutputParser()

### Retrieving content with the rewritten query

In [15]:
# Try the rewriter on its own and inspect the query it produces.
user_question ="Tell me some fun things I can do in Cornwall"

# The input key must match the `{user_question}` placeholder in the prompt.
search_query = rewriter_chain.invoke({"user_question": user_question})
print(search_query)

"fun activities in Cornwall, things to do in Cornwall, attractions and experiences in Cornwall"


In [18]:
# Search again, this time with the LLM-rewritten query instead of the raw question.
improved_results = uk_granular_collection.similarity_search(
    query=search_query,
    k=3,
)
for result_doc in improved_results:
    print(result_doc)

page_content='Do 
 [ edit ] 
 
 Cornwall, in particular Newquay, is the UK's  surfing  capital, with equipment hire and surf schools present on many of the county's beaches, and events like the UK championships or Boardmasters festival. 
 The  South West Coast Path  runs along the coastline of Britain's south-west peninsula. The Cornish section is supposed to be the most scenic (unless you talk to someone in Devon, in which case the Devon part is most scenic). It is particularly scenic around Penwith and the Lizard. The trail takes walkers to busy towns, remote cliffs, beaches, heaths, farms and fishing villages. Walking along it is a great way to experience the region in all its variety. (Walking the entire path takes several weeks, walking on a choice part of it is easier.) 
 The  Camel Trail  is an  18-mile (29   km)  off-road cycle-track that follows the route of a former railway line along the scenic estuary of the river Camel from  Padstow  to Wenford Bridge via  Wadebridge  and 

### Combining everything in a single RAG chain

In [19]:
from langchain_core.runnables import RunnablePassthrough

In [20]:
retriever = uk_granular_collection.as_retriever()

# Answering prompt: filled with the retrieved context and the user's question.
rag_prompt_template = """
Given a question and some context, answer the question.
If you do not know the answer, just say I do not know.

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)

# Both entries receive the raw string passed to .invoke(): one branch rewrites
# it and retrieves with the rewritten query, the other forwards it untouched.
rewrite_retrieve_read_inputs = {
    "context": {"user_question": RunnablePassthrough()} | rewriter_chain | retriever,#A
    "question": RunnablePassthrough(),#B
}

rewrite_retrieve_read_rag_chain = (
    rewrite_retrieve_read_inputs | rag_prompt | llm | StrOutputParser()
)
#A The context is returned by the retriever after feeding it the rewritten query
#B This is the original user question

In [21]:
# Run the full rewrite -> retrieve -> read pipeline on the raw question.
user_question = "Tell me some fun things I can do in Cornwall"

answer = rewrite_retrieve_read_rag_chain.invoke(user_question)
print(answer)

In Cornwall, you can enjoy a variety of fun activities including:

1. **Surfing in Newquay**: Known as the UK's surfing capital, Newquay offers equipment hire and surf schools at its beaches.

2. **Walking the South West Coast Path**: This scenic trail runs along the coast and takes you to beautiful towns, cliffs, beaches, and fishing villages. It's a great way to experience Cornwall’s natural beauty.

3. **Cycling the Camel Trail**: An 18-mile off-road cycle track that follows the scenic estuary of the River Camel from Padstow to Wenford Bridge.

4. **Visiting the Helford River**: Explore this idyllic river estuary, take a ferry ride, or visit attractions like the Gweek Seal Sanctuary and Trebah Gardens.

5. **Attending Festivals**: Cornwall hosts various festivals throughout the year, including St Piran's Day celebrations, the Cornish Film Festival in November, the Furry Dance in Helston, and Midsummer celebrations in Penzance.

6. **Exploring Camel Creek Adventure Park**: This theme

# Multiple query generation with MultiQueryRetriever

In [22]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.prompts import ChatPromptTemplate

from typing import List
from langchain_core.output_parsers import BaseOutputParser
from pydantic import BaseModel, Field

## Implementing a custom MultiQueryRetriever

### Setting up the prompt

In [23]:
# Prompt asking the model for five rewordings of the user question, one per
# line; the newline-separated format is what the line-based output parser
# defined below relies on. Single input variable: `question`.
multi_query_gen_prompt_template = """
You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines.
Original question: {question}
"""

multi_query_gen_prompt = ChatPromptTemplate.from_template(multi_query_gen_prompt_template) 

### Setting up the multi-query parser

In [24]:
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Turn a newline-separated model response into a list of queries.

    Every non-blank line of the response becomes one list item.
    """

    def parse(self, text: str) -> List[str]:
        """Split ``text`` into lines, trimming whitespace and dropping blanks.

        Args:
            text: Raw model output, expected to hold one question per line.

        Returns:
            The stripped, non-empty lines in their original order.
        """
        # Strip every line individually: the model tends to pad lines with
        # trailing spaces, and a whitespace-only line would otherwise survive
        # the emptiness filter and be sent to the retriever as a query.
        stripped_lines = (line.strip() for line in text.split("\n"))
        return [line for line in stripped_lines if line]


questions_parser = LineListOutputParser()

### Setting up the chain to generate multiple queries

In [25]:
# NOTE(review): same model and key as the `llm` created earlier in the
# notebook, so this rebinding looks redundant — kept so the section can be run
# on its own.
llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=OPENAI_API_KEY)

In [26]:
# Pipeline: fill the prompt -> call the chat model -> split the reply into a
# list of query strings.
multi_query_gen_chain = multi_query_gen_prompt | llm | questions_parser

### Testing the Multi query gen chain

In [27]:
# Try the query generator on its own before wiring it into a retriever.
user_question = "Tell me some fun things I can do in Cornwall"

multiple_queries = multi_query_gen_chain.invoke(user_question)

In [28]:
multiple_queries

['What are some entertaining activities to try out in Cornwall?  ',
 'Can you suggest enjoyable experiences or attractions in Cornwall?  ',
 'What fun places or events are happening in Cornwall that I should check out?  ',
 'What are some exciting things to do while visiting Cornwall?  ',
 'Could you list some interesting or entertaining things to do in Cornwall?']

### Setting up the MultiQueryRetriever

In [29]:
basic_retriever = uk_granular_collection.as_retriever()

# `multi_query_gen_chain` already ends with `questions_parser`, so it yields
# the list of generated queries directly. `parser_key` is intentionally not
# passed: "lines" is its default value, and recent MultiQueryRetriever releases
# document the field as deprecated and unused (TODO confirm against the
# installed langchain version).
multi_query_retriever = MultiQueryRetriever(
    retriever=basic_retriever,
    llm_chain=multi_query_gen_chain,
)

### Using the multi_query retriever

In [30]:
# Retrieve with the custom multi-query retriever.
user_question = "Tell me some fun things I can do in Cornwall"

retrieved_docs = multi_query_retriever.invoke(user_question)

In [31]:
retrieved_docs

[Document(id='e1a8ca2f-c967-4a0e-ac8c-0d2102c13997', metadata={'Header 2': 'Do'}, page_content='Do \n [ edit ] \n \n Cornwall, in particular Newquay, is the UK\'s  surfing  capital, with equipment hire and surf schools present on many of the county\'s beaches, and events like the UK championships or Boardmasters festival. \n The  South West Coast Path  runs along the coastline of Britain\'s south-west peninsula. The Cornish section is supposed to be the most scenic (unless you talk to someone in Devon, in which case the Devon part is most scenic). It is particularly scenic around Penwith and the Lizard. The trail takes walkers to busy towns, remote cliffs, beaches, heaths, farms and fishing villages. Walking along it is a great way to experience the region in all its variety. (Walking the entire path takes several weeks, walking on a choice part of it is easier.) \n The  Camel Trail  is an  18-mile (29 \xa0 km)  off-road cycle-track that follows the route of a former railway line along

## Using a standard MultiQueryRetriever directly

In [32]:
# Off-the-shelf variant: only the base retriever and the chat model are
# supplied; no custom prompt or parser is passed here.
std_multi_query_retriever = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=basic_retriever,
)

In [33]:
user_question = "Tell me some fun things I can do in Cornwall"

# Query the standard retriever built in the previous cell. Invoking
# `multi_query_retriever` here would merely repeat the custom-retriever run
# and leave `std_multi_query_retriever` unexercised.
retrieved_docs = std_multi_query_retriever.invoke(user_question)

In [34]:
retrieved_docs

[Document(id='e1a8ca2f-c967-4a0e-ac8c-0d2102c13997', metadata={'Header 2': 'Do'}, page_content='Do \n [ edit ] \n \n Cornwall, in particular Newquay, is the UK\'s  surfing  capital, with equipment hire and surf schools present on many of the county\'s beaches, and events like the UK championships or Boardmasters festival. \n The  South West Coast Path  runs along the coastline of Britain\'s south-west peninsula. The Cornish section is supposed to be the most scenic (unless you talk to someone in Devon, in which case the Devon part is most scenic). It is particularly scenic around Penwith and the Lizard. The trail takes walkers to busy towns, remote cliffs, beaches, heaths, farms and fishing villages. Walking along it is a great way to experience the region in all its variety. (Walking the entire path takes several weeks, walking on a choice part of it is easier.) \n The  Camel Trail  is an  18-mile (29 \xa0 km)  off-road cycle-track that follows the route of a former railway line along

# Step-back question

### Setting up the chain to generate the step-back question

In [58]:
# Rebinds `llm` to a different model ("o4-mini") for the step-back section;
# chains built from `llm` after this point use this model.
llm = ChatOpenAI(model="o4-mini", openai_api_key=OPENAI_API_KEY)

In [59]:
# Prompt asking the model for a broader ("step-back") version of a detailed
# question. Single input variable: `detailed_question`.
# NOTE(review): the sample output in this notebook starts with the label
# "Step-back question:", which then becomes part of the retrieval query —
# consider instructing the model to return only the question text.
step_back_prompt_template = """
Generate a less specific question (aka Step-back question) for the following detailed question, so that a wider context can be retrieved.
Detailed question: {detailed_question}
Step-back question:
"""

step_back_prompt = ChatPromptTemplate.from_template(step_back_prompt_template) 

In [60]:
# Pipeline: fill the step-back prompt -> call the chat model -> return plain text.
step_back_question_gen_chain = step_back_prompt | llm | StrOutputParser()

### Testing the step-back-question generation chain

In [61]:
# Try the step-back generator on its own before using it inside the RAG chain.
user_question = "Can you give me some tips for a trip to Brighton?"

step_back_question = step_back_question_gen_chain.invoke(user_question)

In [62]:
step_back_question

'Step-back question: What should I consider when planning a trip to any seaside destination in the UK?'

### Incorporating step-back question generation chain into the RAG chain

In [63]:
retriever = uk_granular_collection.as_retriever()

# Answering prompt: filled with the retrieved context and the user's question.
rag_prompt_template = """
Given a question and some context, answer the question.
If you do not know the answer, just say I do not know.

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)

# Both entries receive the raw string passed to .invoke(): one branch widens it
# into a step-back question and retrieves with that, the other forwards the
# original question untouched.
step_back_rag_inputs = {
    "context": {"detailed_question": RunnablePassthrough()} | step_back_question_gen_chain | retriever,#A
    "question": RunnablePassthrough(),#B
}

step_back_question_rag_chain = (
    step_back_rag_inputs | rag_prompt | llm | StrOutputParser()
)
#A The context is returned by the retriever after feeding it the step-back question
#B This is the original user question

In [64]:
# Run the full step-back RAG pipeline on the detailed question.
user_question = "Can you give me some tips for a trip to Brighton?"

answer = step_back_question_rag_chain.invoke(user_question)
print(answer)

Here are a few practical tips and safety pointers for a visit to Brighton:

• Getting out at night  
  – The city centre (especially West Street) can get very busy and rowdy on Friday/Saturday nights—West Street in particular is best avoided after midnight.  
  – If you’d rather something more low-key, head to venues popular with locals (avoid the biggest tourist clubs on those peak nights).  

• Areas to be aware of  
  – You’ll see a number of homeless people around the centre; they’re generally harmless and only ask for money.  
  – “Party” or drug-user crowds sometimes gather around London Road and the Level—safe during daylight, but use more caution after dark.  
  – Outlying districts like Whitehawk and Moulsecoomb have a rougher reputation and little for tourists to see.  

• LGBTQ+ travellers  
  – Brighton & Hove is famously welcoming, but if you want to hold hands or kiss, stick to Hove, The Lanes, North Laine or Kemp Town—any homophobic abuse there is very unlikely and would

# Hypothetical Document Embeddings (HyDE)

### Setting up the chain to generate the hypothetical document associated with the user question

In [54]:
# Rebinds `llm` to "gpt-4o-mini" for the HyDE section; chains built from `llm`
# after this point use this model.
llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=OPENAI_API_KEY)

In [55]:
# HyDE prompt: ask the model for a one-sentence hypothetical answer to the
# question. Single input variable: `question`.
hyde_prompt_template = """
Write one sentence that could answer the provided question. Do not add anything else.
Question: {question}
Sentence:
"""

hyde_prompt = ChatPromptTemplate.from_template(hyde_prompt_template)

In [56]:
# Pipeline: fill the HyDE prompt -> call the chat model -> return plain text.
hyde_chain = hyde_prompt | llm | StrOutputParser()

### Testing the hyde generation chain

In [57]:
# Try the HyDE generator on its own before using it inside the RAG chain.
user_question = "What are the best beaches in Cornwall?"

hypotetical_document = hyde_chain.invoke(user_question)

In [58]:
hypotetical_document

'Some of the best beaches in Cornwall include Fistral Beach, Porthcurno Beach, and St Ives Bay.'

### Incorporating hyde chain into the RAG chain

In [319]:
retriever = uk_granular_collection.as_retriever()

# Answering prompt: restricts the model to the retrieved context.
rag_prompt_template = """
Given a question and some context, answer the question.
Only use the provided context to answer the question.
If you do not know the answer, just say I do not know. 

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)

# Both entries receive the raw string passed to .invoke(): one branch turns it
# into a hypothetical answer and retrieves with that text, the other forwards
# the original question untouched.
hyde_rag_inputs = {
    "context": {"question": RunnablePassthrough()} | hyde_chain | retriever,#A
    "question": RunnablePassthrough(),#B
}

hyde_rag_chain = hyde_rag_inputs | rag_prompt | llm | StrOutputParser()
#A The context is returned by the retriever after feeding it the hypothetical document
#B This is the original user question

In [320]:
# Run the full HyDE RAG pipeline on the question.
user_question = "What are the best beaches in Cornwall?"

answer = hyde_rag_chain.invoke(user_question)
print(answer)

The best beaches in Cornwall mentioned in the context include Bude, Polzeath, Watergate Bay, Perranporth, Porthtowan, Fistral Beach, Newquay, St Agnes, St Ives, Gyllyngvase beach in Falmouth, and Praa Sands. Additionally, in Newquay, popular beaches are Crantock Beach, Fistral Beach, Great Western, Harbour, Holywell Bay, Lusty Glaze Beach, Porth Joke, Porth, Tolcarne Beach, Towan Beach, Whipsiderry, and Watergate Bay.
