# Question transformations

# Splitting and ingesting the content of various URLs (across UK destinations)

### Preparing the Chroma DB collections

In [1]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import getpass

OPENAI_API_KEY = getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


In [17]:
uk_granular_collection = Chroma(
    collection_name="uk_granular",
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
)

uk_granular_collection.reset_collection() #A

### Splitting and ingesting HTML content with the HTMLSectionSplitter 

In [18]:
from langchain_text_splitters import HTMLSectionSplitter
from langchain_community.document_loaders import AsyncHtmlLoader

In [19]:
uk_destinations = [
    "Cornwall", "North_Cornwall", "South_Cornwall", "West_Cornwall", 
    "Tintagel", "Bodmin", "Wadebridge", "Penzance", "Newquay",
    "St_Ives", "Port_Isaac", "Looe", "Polperro", "Porthleven"
    "East_Sussex", "Brighton", "Battle", "Hastings_(England)", 
    "Rye_(England)", "Seaford", "Ashdown_Forest"
]

wikivoyage_root_url = "https://en.wikivoyage.org/wiki"

In [20]:
uk_destination_urls = [f'{wikivoyage_root_url}/{d}' for d in uk_destinations]

In [21]:
headers_to_split_on = [("h1", "Header 1"),("h2", "Header 2")]
html_section_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on)

In [22]:
def split_docs_into_granular_chunks(docs):
    all_chunks = []
    for doc in docs:
        html_string = doc.page_content #B
        temp_chunks = html_section_splitter.split_text(html_string) #C
        h2_temp_chunks = [chunk for chunk in temp_chunks if "Header 2" in chunk.metadata] #D
        all_chunks.extend(h2_temp_chunks) 

    return all_chunks

In [23]:
for destination_url in uk_destination_urls:
    html_loader = AsyncHtmlLoader(destination_url) #E
    docs =  html_loader.load() #F
    
    for doc in docs:
        print(doc.metadata)
        granular_chunks = split_docs_into_granular_chunks(docs)
        uk_granular_collection.add_documents(documents=granular_chunks)

#A In case it exists
#B Extract the HTML text from the document
#C Each chunk is a H1 or H2 HTML section
#D Only keep content associated with H2 sections        
#E Loader for one destination
#F Documents of one destination

Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  7.64it/s]


{'source': 'https://en.wikivoyage.org/wiki/Cornwall', 'title': 'Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.90it/s]


{'source': 'https://en.wikivoyage.org/wiki/North_Cornwall', 'title': 'North Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 10.87it/s]


{'source': 'https://en.wikivoyage.org/wiki/South_Cornwall', 'title': 'South Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  8.70it/s]


{'source': 'https://en.wikivoyage.org/wiki/West_Cornwall', 'title': 'West Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.84it/s]


{'source': 'https://en.wikivoyage.org/wiki/Tintagel', 'title': 'Tintagel – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 10.66it/s]


{'source': 'https://en.wikivoyage.org/wiki/Bodmin', 'title': 'Bodmin – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 10.67it/s]


{'source': 'https://en.wikivoyage.org/wiki/Wadebridge', 'title': 'Wadebridge – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  8.91it/s]


{'source': 'https://en.wikivoyage.org/wiki/Penzance', 'title': 'Penzance – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.90it/s]


{'source': 'https://en.wikivoyage.org/wiki/Newquay', 'title': 'Newquay – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.63it/s]


{'source': 'https://en.wikivoyage.org/wiki/St_Ives', 'title': 'St Ives – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.48it/s]


{'source': 'https://en.wikivoyage.org/wiki/Port_Isaac', 'title': 'Port Isaac – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.99it/s]


{'source': 'https://en.wikivoyage.org/wiki/Looe', 'title': 'Looe – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.16it/s]


{'source': 'https://en.wikivoyage.org/wiki/Polperro', 'title': 'Polperro – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.68it/s]


{'source': 'https://en.wikivoyage.org/wiki/PorthlevenEast_Sussex', 'title': 'PorthlevenEast Sussex – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  7.41it/s]


{'source': 'https://en.wikivoyage.org/wiki/Brighton', 'title': 'Brighton – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.05it/s]


{'source': 'https://en.wikivoyage.org/wiki/Battle', 'title': 'Battle – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.51it/s]


{'source': 'https://en.wikivoyage.org/wiki/Hastings_(England)', 'title': 'Hastings (England) – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.81it/s]


{'source': 'https://en.wikivoyage.org/wiki/Rye_(England)', 'title': 'Rye (England) – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 10.71it/s]


{'source': 'https://en.wikivoyage.org/wiki/Seaford', 'title': 'Seaford – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.36it/s]


{'source': 'https://en.wikivoyage.org/wiki/Ashdown_Forest', 'title': 'Ashdown Forest – Travel guide at Wikivoyage', 'language': 'en'}


# Rewrite-retrieve-read

## Retrieving content with original user question

In [9]:
user_question = "What are the best activities to do with children and pets in Ashdown Forest while ensuring their safety?"
initial_results = uk_granular_collection.similarity_search(query=user_question,k=4)
for doc in initial_results:
    print(doc)

page_content='Stay safe 
 [ edit ] 
 
 Some paths may be muddy in the winter; in the summer there are some snakes (adders are the only poisonous ones, however rarely attack humans, dogs can be killed by Adder Venom) 
 Often there are deep pools, which can be nice to swim in (from this area's idilic industrial past), but children should  always  be accompanied,  monsters from the deep are rare . 
 Car parks are generally free of crime, however it is always important as with any car park to ensure that valuables are hidden out of site or taken with you. If you do not want to take your dog with you (paths can be muddy), give an area of shade for your animal, leave drinking water for the dog, and keep the windows well open.  Dogs die in hot cars. 
 Some paths may lead abruptly onto (often fast) roads; for your children's and pets' safety keep listening out for cars and if in any doubt keep more adventurous animals on a lead. 
 In the summer months the whole forest is at risk from wildfires

In [159]:
# COMMENT: the retrieval from the vector store against the original question is bad

## Question rewrite

### Setting up the query rewriter chain

In [152]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [153]:
llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=OPENAI_API_KEY)

In [154]:
rewriter_prompt_template = """
Generate search query for the Chroma DB vector store from a user question, allowing for a more accurate response through semantic search.
Just return the revised Chroma DB query, with now quotes around it. 

User question: {user_question}
Revised Chroma DB query:
"""

rewriter_prompt = ChatPromptTemplate.from_template(rewriter_prompt_template) 

In [155]:
rewriter_chain = rewriter_prompt | llm | StrOutputParser()

### Retrieving content with the rewritten query

In [204]:
user_question = "What are the best activities to do with children and pets in Ashdown Forest while ensuring their safety?"

search_query = rewriter_chain.invoke({"user_question": user_question})
print(search_query)

"best activities children pets Ashdown Forest safety"


In [205]:
improved_results = uk_granular_collection.similarity_search(query=search_query,k=3)
for doc in improved_results:
    print(doc)

page_content='Stay safe 
 [ edit ] 
 
 Some paths may be muddy in the winter; in the summer there are some snakes (adders are the only poisonous ones, however rarely attack humans, dogs can be killed by Adder Venom) 
 Often there are deep pools, which can be nice to swim in (from this area's idilic industrial past), but children should  always  be accompanied,  monsters from the deep are rare . 
 Car parks are generally free of crime, however it is always important as with any car park to ensure that valuables are hidden out of site or taken with you. If you do not want to take your dog with you (paths can be muddy), give an area of shade for your animal, leave drinking water for the dog, and keep the windows well open.  Dogs die in hot cars. 
 Some paths may lead abruptly onto (often fast) roads; for your children's and pets' safety keep listening out for cars and if in any doubt keep more adventurous animals on a lead. 
 In the summer months the whole forest is at risk from wildfires

In [None]:
# COMMENT: FIND A BETTER QUESTION!!

### Combining everything in a single RAG chain

In [209]:
from langchain_core.runnables import RunnablePassthrough

In [215]:
retriever = uk_granular_collection.as_retriever()

rag_prompt_template = """
Given a question and some context, answer the question.
If you do not the answer, just say I do not know.

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template) 

rewrite_retrieve_read_rag_chain = (
    {
        "context": {"user_question": RunnablePassthrough()} | rewriter_chain | retriever,#A
        "question": RunnablePassthrough(),#B
    }
    | rag_prompt
    | llm
    | StrOutputParser()
)
#A The context is returned by the retriver after feeding to it the rewritten query
#B This is the original user question

In [216]:
user_question = "What are the best activities to do with children and pets in Ashdown Forest while ensuring their safety?"

answer = rewrite_retrieve_read_rag_chain.invoke(user_question)
print(answer)

The best activities to do with children and pets in Ashdown Forest while ensuring their safety include:

1. **Walking**: Take a map and explore the beautiful landscapes. Ensure children are accompanied and pets are kept on a lead, especially when paths lead to fast roads.

2. **Playing 'pooh sticks'**: Visit Pooh Sticks Bridge, where children can enjoy this fun game, similar to the stories of Winnie the Pooh.

3. **Supervising swimming**: If swimming in deep pools, make sure children are always accompanied by an adult.

4. **Exploring nature**: Observe the forest's flora and fauna, but keep pets on a lead to avoid encounters with wildlife like snakes.

5. **Providing shade and water for pets**: If you don't want to take your dog on a muddy path, provide a shaded area and drinking water in the car with windows left open to prevent heatstroke.

6. **Cycling**: Use the cycle route through the countryside, ensuring children are safe and within sight.

Always be mindful of the environment a

In [217]:
# COMMENT: Change the example query

# Multiple query generation with MultiQueryRetriever

In [224]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.prompts import ChatPromptTemplate

from typing import List
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

## Implementing a custom MultiQueryRetriver

### Setting up the prompt

In [225]:
multi_query_gen_prompt_template = """
You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines.
Original question: {question}
"""

multi_query_gen_prompt = ChatPromptTemplate.from_template(multi_query_gen_prompt_template) 

### Setting up the multi-query parser

In [230]:
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Parse out a question from each output line."""

    def parse(self, text: str) -> List[str]:
        lines = text.strip().split("\n")
        return list(filter(None, lines))  


questions_parser = LineListOutputParser()

### Setting up the chain to generate multiple queries

In [228]:
llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=OPENAI_API_KEY)

In [229]:
multi_query_gen_chain = multi_query_gen_prompt | llm | questions_parser

### Testing the Multi query gen chain

In [236]:
user_question = "What are the best activities to do with children and pets in Ashdown Forest while ensuring their safety?"

multiple_queries = multi_query_gen_chain.invoke(user_question)

In [237]:
multiple_queries

['What are some safe and enjoyable activities for kids and pets in Ashdown Forest?  ',
 'Can you suggest family-friendly and pet-friendly things to do in Ashdown Forest that prioritize safety?  ',
 'What are the top outdoor activities in Ashdown Forest for children and pets that ensure their well-being?  ',
 'How can I find safe activities for both kids and pets to enjoy together in Ashdown Forest?  ',
 'What recommendations do you have for safe and fun experiences with children and pets in Ashdown Forest?']

### Setting up the MultiQueryRetriever

In [232]:
basic_retriever = uk_granular_collection.as_retriever()

multi_query_retriever = MultiQueryRetriever(
    retriever=basic_retriever, llm_chain=multi_query_gen_chain, 
    parser_key="lines" #A
)  
#A this is the key for the parsed output

### Using the multi_query retriever

In [238]:
user_question = "What are the best activities to do with children and pets in Ashdown Forest while ensuring their safety?"

retrieved_docs = multi_query_retriever.invoke(user_question)

In [239]:
retrieved_docs

[Document(metadata={'Header 2': 'Stay safe'}, page_content="Stay safe \n [ edit ] \n \n Some paths may be muddy in the winter; in the summer there are some snakes (adders are the only poisonous ones, however rarely attack humans, dogs can be killed by Adder Venom) \n Often there are deep pools, which can be nice to swim in (from this area's idilic industrial past), but children should  always  be accompanied,  monsters from the deep are rare . \n Car parks are generally free of crime, however it is always important as with any car park to ensure that valuables are hidden out of site or taken with you. If you do not want to take your dog with you (paths can be muddy), give an area of shade for your animal, leave drinking water for the dog, and keep the windows well open.  Dogs die in hot cars. \n Some paths may lead abruptly onto (often fast) roads; for your children's and pets' safety keep listening out for cars and if in any doubt keep more adventurous animals on a lead. \n In the sum

## Using directly a standard MultiQueryRetriever 

In [243]:
import logging

logging.basicConfig()
logging.getLogger("MultiQueryRetriever").setLevel(logging.INFO)

In [244]:
std_multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=basic_retriever, llm=llm
)

In [245]:
user_question = "What are the best activities to do with children and pets in Ashdown Forest while ensuring their safety?"

retrieved_docs = multi_query_retriever.invoke(user_question)

INFO:langchain.retrievers.multi_query:Generated queries: ['What fun and safe activities can I do with kids and pets in Ashdown Forest?  ', 'What are some family-friendly and pet-safe things to enjoy in Ashdown Forest?  ', 'Can you suggest activities for children and pets in Ashdown Forest that prioritize safety?  ', 'What are the safest ways to engage children and pets in activities at Ashdown Forest?  ', 'Which activities in Ashdown Forest are ideal for children and pets while keeping them safe?']


In [246]:
retrieved_docs

[Document(metadata={'Header 2': 'Stay safe'}, page_content="Stay safe \n [ edit ] \n \n Some paths may be muddy in the winter; in the summer there are some snakes (adders are the only poisonous ones, however rarely attack humans, dogs can be killed by Adder Venom) \n Often there are deep pools, which can be nice to swim in (from this area's idilic industrial past), but children should  always  be accompanied,  monsters from the deep are rare . \n Car parks are generally free of crime, however it is always important as with any car park to ensure that valuables are hidden out of site or taken with you. If you do not want to take your dog with you (paths can be muddy), give an area of shade for your animal, leave drinking water for the dog, and keep the windows well open.  Dogs die in hot cars. \n Some paths may lead abruptly onto (often fast) roads; for your children's and pets' safety keep listening out for cars and if in any doubt keep more adventurous animals on a lead. \n In the sum

# Step-back question

### Setting up the chain to generate the step-back question

In [263]:
llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=OPENAI_API_KEY)

In [264]:
step_back_prompt_template = """
Generate a less specific question (aka Step-back question) for the following detailed question, so that a wider context can be retrieved.
Detailed question: {detailed_question}
Step-back question:
"""

step_back_prompt = ChatPromptTemplate.from_template(step_back_prompt_template) 

In [265]:
step_back_question_gen_chain = step_back_prompt | llm | StrOutputParser()

### Testing the step-back-question generation chain

In [266]:
user_question = "Can you give me some tips for a trip to Brighton?"

step_back_question = step_back_question_gen_chain.invoke(user_question)

In [267]:
step_back_question

'What are some general tips for planning a successful trip to a coastal city?'

### Incorporating step-back question generation chain into the RAG chain

In [260]:
retriever = uk_granular_collection.as_retriever()

rag_prompt_template = """
Given a question and some context, answer the question.
If you do not the answer, just say I do not know.

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template) 

step_back_question_rag_chain = (
    {
        "context": {"detailed_question": RunnablePassthrough()} | step_back_question_gen_chain | retriever,#A
        "question": RunnablePassthrough(),#B
    }
    | rag_prompt
    | llm
    | StrOutputParser()
)
#A The context is returned by the retriver after feeding to it the step-back question
#B This is the original user question

In [261]:
user_question = "Can you give me some tips for a trip to Brighton?"

answer = step_back_question_rag_chain.invoke(user_question)
print(answer)

Here are some tips for a trip to Brighton:

1. **Stay Safe**: While Brighton is generally safe, be cautious in busy areas, especially West Street after midnight due to the nightlife crowd. 

2. **Watch for Traffic**: Be mindful of traffic, especially in busy areas.

3. **Valuables**: Take standard precautions with your valuables to avoid theft.

4. **Homelessness**: Be aware that there may be homeless individuals asking for money, but most are harmless.

5. **LGBT-Friendly Environment**: Brighton is known for its LGBT-friendly atmosphere. While same-sex displays of affection are generally accepted in many areas, exercise caution in certain places.

6. **Beaches**: Lifeguards patrol the beaches from late May to early September. Pay attention to signposts about which areas are covered.

7. **Emergency Contacts**: In case of emergencies related to the sea, call 999 and ask for the Coastguard.

8. **Explore Local Venues**: Enjoy local venues favored by residents for a civilized night out.


# Hypotetical DocumentEmbeddings (HyDE)

### Setting up the chain to generate the hypotetical document associated to the user question

In [310]:
llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=OPENAI_API_KEY)

In [311]:
hyde_prompt_template = """
Write one sentence that could answer the provided question. Do not add anything else.
Question: {question}
Sentence:
"""

hyde_prompt = ChatPromptTemplate.from_template(hyde_prompt_template)

In [312]:
hyde_chain = hyde_prompt | llm | StrOutputParser()

### Testing the hyde generation chain

In [317]:
user_question = "What are the best beaches in Cornwall?"

hypotetical_document = hyde_chain.invoke(user_question)

In [318]:
hypotetical_document

'Some of the best beaches in Cornwall include Porthcurno, Fistral Beach, and St Ives.'

### Incorporating hyde chain into the RAG chain

In [319]:
retriever = uk_granular_collection.as_retriever()

rag_prompt_template = """
Given a question and some context, answer the question.
Only use the provided context to answer the question.
If you do not the answer, just say I do not know. 

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template) 

hyde_rag_chain = (
    {
        "context": {"question": RunnablePassthrough()} | hyde_chain | retriever,#A
        "question": RunnablePassthrough(),#B
    }
    | rag_prompt
    | llm
    | StrOutputParser()
)
#A The context is returned by the retriver after feeding to it the hypotetical document
#B This is the original user question

In [320]:
user_question = "What are the best beaches in Cornwall?"

answer = hyde_rag_chain.invoke(user_question)
print(answer)

The best beaches in Cornwall mentioned in the context include Bude, Polzeath, Watergate Bay, Perranporth, Porthtowan, Fistral Beach, Newquay, St Agnes, St Ives, Gyllyngvase beach in Falmouth, and Praa Sands. Additionally, in Newquay, popular beaches are Crantock Beach, Fistral Beach, Great Western, Harbour, Holywell Bay, Lusty Glaze Beach, Porth Joke, Porth, Tolcarne Beach, Towan Beach, Whipsiderry, and Watergate Bay.
