# Query generation

# Self metadata query

## Ingestion (metadata enriched)

### Preparing the Chroma DB collection

In [1]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import getpass

# Ask for the OpenAI key interactively so it is never written into the notebook.
OPENAI_API_KEY = getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


In [2]:
# Embedding model handed to the Chroma collection.
openai_embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Chroma collection that will hold the metadata-enriched chunks.
uk_with_metadata_collection = Chroma(
    collection_name="uk_with_metadata_collection",
    embedding_function=openai_embeddings,
)

# Start from an empty collection in case it already exists.
uk_with_metadata_collection.reset_collection()

### Defining content to be ingested and splitting strategy

In [3]:
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
# Transformer used by split_docs_into_chunks to turn HTML documents into text documents.
html2text_transformer = Html2TextTransformer()

In [5]:
# Relatively fine-grained splitting: chunk_size=1000 with chunk_overlap=100
# shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter( #A
    chunk_size=1000, chunk_overlap=100
)

In [6]:
def split_docs_into_chunks(docs):
    """Convert HTML documents to text and split them into chunks.

    Args:
        docs: Documents whose page content is HTML.

    Returns:
        The chunks produced by the module-level ``text_splitter`` from the
        output of the module-level ``html2text_transformer``.
    """
    plain_text_docs = html2text_transformer.transform_documents(docs) #B
    return text_splitter.split_documents(plain_text_docs)

In [7]:
# Wikivoyage page names grouped by the UK region they belong to.
# Insertion order matters: it fixes the order of the flattened list below.
_destinations_by_region = {
    "Cornwall": [
        "Cornwall", "North_Cornwall", "South_Cornwall", "West_Cornwall",
        "Tintagel", "Bodmin", "Wadebridge", "Penzance", "Newquay",
        "St_Ives", "Port_Isaac", "Looe", "Polperro", "Porthleven",
    ],
    "East_Sussex": [
        "East_Sussex", "Brighton", "Battle", "Hastings_(England)",
        "Rye_(England)", "Seaford", "Ashdown_Forest",
    ],
}

# Flat list of (destination, region) pairs, Cornwall first, then East Sussex.
uk_destinations = [
    (destination, region)
    for region, destinations in _destinations_by_region.items()
    for destination in destinations
]

# Base URL that every destination page name is appended to.
wikivoyage_root_url = "https://en.wikivoyage.org/wiki"

In [8]:
# One (url, destination, region) triple per entry of uk_destinations.
uk_destination_url_with_metadata = [ #C
    (f'{wikivoyage_root_url}/{page_name}', page_name, page_region)
    for page_name, page_region in uk_destinations
]

#A Instantiate a relatively fine-chunk splitting strategy
#B Transform HTML docs into clean text docs
#C Prepare metadata to be imported: Url, UK Destination and UK Region

### Enriching a document with metadata: updating metadata

In [9]:
# Select the Tintagel entry by destination name instead of by position, so the
# lookup keeps working if the destination list is reordered or extended.
tintagel_url, tintagel_destination, tintagel_region = next(
    entry for entry in uk_destination_url_with_metadata
    if entry[1] == "Tintagel")

In [10]:
# Fetch the Tintagel page; the loader returns LangChain documents.
tintagel_html_loader =AsyncHtmlLoader(tintagel_url)
tintagel_docs = tintagel_html_loader.load()

Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.89it/s]


In [11]:
# tintagel_docs # COMMENT: LangChain loaders create docs which contain metadata

In [12]:
# Add our own fields to the metadata the loader already attached (in place).
for doc in tintagel_docs:
    doc.metadata.update(
        destination=tintagel_destination,
        region=tintagel_region,
    )
    print(doc.metadata)

{'source': 'https://en.wikivoyage.org/wiki/Tintagel', 'title': 'Tintagel – Travel guide at Wikivoyage', 'language': 'en', 'destination': 'Tintagel', 'region': 'Cornwall'}


### Enriching a document with metadata: creating metadata 

In [13]:
# Alternative to updating metadata in place: build new Documents that reuse the
# page content but carry only the three metadata fields listed here.
tintagel_docs_with_metadata = [
    Document(page_content=d.page_content,
             metadata = {
                 'source': tintagel_url,
                 'destination': tintagel_destination,
                 'region': tintagel_region
             })
    for d in tintagel_docs
]

In [14]:
# tintagel_docs_with_metadata # examine the Document

### Enriching the UK destination documents with metadata: creating metadata 

In [15]:
# Load, enrich, chunk and ingest every destination page, one page at a time.
for (url, destination, region) in uk_destination_url_with_metadata:
    html_loader = AsyncHtmlLoader(url) #A
    docs =  html_loader.load() #B
    
    # Rebuild each document so it carries only source, destination and region.
    docs_with_metadata = [
        Document(page_content=d.page_content,
        metadata = {
            'source': url,
            'destination': destination,
            'region': region})
        for d in docs]
             
    # Convert the HTML to text and split it; the chunks are what gets stored.
    chunks = split_docs_into_chunks(docs_with_metadata)

    print(f'Importing: {destination}')
    uk_with_metadata_collection.add_documents(documents=chunks)
#A Loader for one destination
#B Documents (whole pages, not yet chunked) loaded for one destination

Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.85it/s]


Importing: Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.17it/s]


Importing: North_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.98it/s]


Importing: South_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.90it/s]


Importing: West_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.52it/s]


Importing: Tintagel


Fetching pages: 100%|####################################################################| 1/1 [00:02<00:00,  2.16s/it]


Importing: Bodmin


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.79it/s]


Importing: Wadebridge


Fetching pages: 100%|####################################################################| 1/1 [00:02<00:00,  2.57s/it]


Importing: Penzance


Fetching pages: 100%|####################################################################| 1/1 [00:01<00:00,  1.94s/it]


Importing: Newquay


Fetching pages: 100%|####################################################################| 1/1 [00:01<00:00,  1.65s/it]


Importing: St_Ives


Fetching pages: 100%|####################################################################| 1/1 [00:01<00:00,  1.24s/it]


Importing: Port_Isaac


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  1.10it/s]


Importing: Looe


Fetching pages: 100%|####################################################################| 1/1 [00:01<00:00,  1.41s/it]


Importing: Polperro


Fetching pages: 100%|####################################################################| 1/1 [00:01<00:00,  1.14s/it]


Importing: Porthleven


Fetching pages: 100%|####################################################################| 1/1 [00:02<00:00,  2.23s/it]


Importing: East_Sussex


Fetching pages: 100%|####################################################################| 1/1 [00:07<00:00,  7.29s/it]


Importing: Brighton


Fetching pages: 100%|####################################################################| 1/1 [00:01<00:00,  1.27s/it]


Importing: Battle


Fetching pages: 100%|####################################################################| 1/1 [00:02<00:00,  2.47s/it]


Importing: Hastings_(England)


Fetching pages: 100%|####################################################################| 1/1 [00:02<00:00,  2.15s/it]


Importing: Rye_(England)


Fetching pages: 100%|####################################################################| 1/1 [00:01<00:00,  1.34s/it]


Importing: Seaford


Fetching pages: 100%|####################################################################| 1/1 [00:01<00:00,  1.99s/it]


Importing: Ashdown_Forest


## Q & A on a collection enriched with metadata

### Searching the collection with a metadata filter explicitly

In [16]:
question =  "Events or festivals"
# Retriever limited to 2 results whose 'destination' metadata equals 'Newquay'.
metadata_retriever = uk_with_metadata_collection.as_retriever(
    search_kwargs={'k':2, 'filter':{'destination': 'Newquay'}})

result_docs = metadata_retriever.invoke(question)

In [17]:
result_docs

[Document(id='c12122b2-8914-41cc-bb48-394383cd4c98', metadata={'source': 'https://en.wikivoyage.org/wiki/Newquay', 'destination': 'Newquay', 'region': 'Cornwall'}, page_content="## Do\n\n[edit]\n\n  * Cornish Film Festival. Held annually for two weeks each November around Newquay. (updated Jan 2024)\n  * 50.415741-5.0914781 Newquay Golf Club, Tower Road, TR7 1LT, ☏ +44 1637 872091, info@newquaygolfclub.co.uk. 9AM-4PM. A semi-private golf club established in 1890. Total yardage Championship: 6141, Men: 5708, and Women: 5364. £31 for non-members. (updated Apr 2019)\n\n### Beaches\n\n[edit]\n\nFistral Beach\n\nNewquay is well known as a surfer's paradise. Therefore it offers plenty of\nbeaches:"),
 Document(id='17dd5f99-277e-4d74-af89-ad5882dffd51', metadata={'region': 'Cornwall', 'destination': 'Newquay', 'source': 'https://en.wikivoyage.org/wiki/Newquay'}, page_content="Jump to content\n\nMain menu\n\nMain menu\n\nmove to sidebar hide\n\nNavigation\n\n  * Main page\n  * Travel destinati

In [18]:
# COMMENT: As you can see, only chunks associated with 'destination': 'Newquay' have been selected

### Generating the self metadata query with the SelfQueryRetriever

In [20]:
from langchain_classic.chains.query_constructor.base import AttributeInfo
from langchain_classic.retrievers.self_query.base import SelfQueryRetriever #A
from langchain_openai import ChatOpenAI
#A this requires pip install lark

In [26]:
# Descriptions of the metadata fields the self-query retriever may filter on.
# The names match the metadata keys written during ingestion.
metadata_field_info = [
    AttributeInfo(
        name="destination",
        description="The specific UK destination to be searched",
        type="string",
    ),
    AttributeInfo(
        name="region",
        description="The name of the UK region to be searched",
        type="string",
    )
]

In [27]:
question = "Tell me about events or festivals in the UK town of Newquay"

In [28]:
llm = ChatOpenAI(model="gpt-5-nano", openai_api_key=OPENAI_API_KEY)

# The third argument of from_llm (document_contents) describes what the stored
# documents contain; it is not the user question. The question is supplied
# later, when the retriever is invoked.
document_content_description = "Tourist information about UK destinations"

self_query_retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=uk_with_metadata_collection,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

In [29]:
result_docs = self_query_retriever.invoke(question)

In [30]:
result_docs

[Document(id='c12122b2-8914-41cc-bb48-394383cd4c98', metadata={'destination': 'Newquay', 'region': 'Cornwall', 'source': 'https://en.wikivoyage.org/wiki/Newquay'}, page_content="## Do\n\n[edit]\n\n  * Cornish Film Festival. Held annually for two weeks each November around Newquay. (updated Jan 2024)\n  * 50.415741-5.0914781 Newquay Golf Club, Tower Road, TR7 1LT, ☏ +44 1637 872091, info@newquaygolfclub.co.uk. 9AM-4PM. A semi-private golf club established in 1890. Total yardage Championship: 6141, Men: 5708, and Women: 5364. £31 for non-members. (updated Apr 2019)\n\n### Beaches\n\n[edit]\n\nFistral Beach\n\nNewquay is well known as a surfer's paradise. Therefore it offers plenty of\nbeaches:"),
 Document(id='d09a5f0f-2e78-4360-93e9-4a6b807c40cc', metadata={'region': 'Cornwall', 'source': 'https://en.wikivoyage.org/wiki/Newquay', 'destination': 'Newquay'}, page_content='# Newquay\n\n## Contents\n\n  * 1 Understand\n    * 1.1 Visitor information\n  * 2 Get in\n    * 2.1 By road\n    * 2.

### Generating the self metadata query with a LLM function call

#### Query schema

In [34]:
import datetime
from typing import Literal, Optional, Tuple, List

from pydantic import BaseModel, Field
from langchain_classic.chains.query_constructor.ir import (
    Comparator,
    Comparison,
    Operation,
    Operator,
    StructuredQuery,
)
from langchain_classic.retrievers.self_query.chroma import ChromaTranslator

In [35]:
class DestinationSearch(BaseModel):
    """Search over a vector database of tourist destinations."""

    # Free-text part of the query, used for the similarity search itself.
    content_search: str = Field(
        "",
        description="""Similarity search query applied 
        to tourist destinations.""",
    )
    # Metadata value to filter on; required field (no default).
    destination: str = Field(
        ...,
        description="The specific UK destination to be searched.",
    )
    # Metadata value to filter on; required field (no default).
    region: str = Field(
        ...,
        description="The name of the UK region to be searched.",
    )

    def pretty_print(self) -> None:
        """Print every field whose value is not None and differs from its default."""
        # Read the field definitions from the class via `model_fields`:
        # `__fields__` is deprecated in Pydantic v2.
        for name, field_info in type(self).model_fields.items():
            value = getattr(self, name)
            if value is not None and value != field_info.default:
                print(f"{name}: {value}")

In [36]:
def build_filter(destination_search: DestinationSearch):
    """Translate a structured query into a Chroma metadata filter.

    An equality comparison is created for each of ``destination`` and
    ``region`` that holds a non-empty value.

    Args:
        destination_search: Structured query produced by the LLM.

    Returns:
        ``None`` when neither field is set (no filtering), a single
        ``{field: {'$eq': value}}`` clause when exactly one is set, or an
        ``{'$and': [...]}`` filter when both are set.
    """
    candidate_values = {
        "destination": destination_search.destination,
        "region": destination_search.region,
    }

    # Keep only the fields that actually carry a value (skips None and '').
    comparisons = [
        Comparison(
            comparator=Comparator.EQ,
            attribute=attribute,
            value=value,
        )
        for attribute, value in candidate_values.items()
        if value
    ]

    if not comparisons:
        return None

    translator = ChromaTranslator()

    # Chroma rejects '$and' with fewer than two clauses, so a lone comparison
    # is translated on its own instead of being wrapped in an AND operation.
    if len(comparisons) == 1:
        return translator.visit_comparison(comparisons[0])

    combined_filter = Operation(operator=Operator.AND,
                                arguments=comparisons)
    return translator.visit_operation(combined_filter)

#### Conversion of user question to structured query including metadata filter

In [37]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# System instructions for turning a user question into a vector database query.
system_message = """You are an expert at converting user 
questions into vector database queries. 
You have access to a database of tourist destinations.
Given a question, return a database query optimized 
to retrieve the most relevant results.

If there are acronyms or words you are not familiar with, 
do not try to rephrase them."""
# The user question is injected through the {question} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_message),
        ("human", "{question}"),
    ]
)
llm = ChatOpenAI(model="gpt-5-nano", openai_api_key=OPENAI_API_KEY)
# Ask the model to answer with a DestinationSearch object via function calling.
structured_llm = llm.with_structured_output(
    DestinationSearch, method="function_calling")
# Chain: question -> prompt -> LLM -> DestinationSearch
query_generator = prompt | structured_llm

In [38]:
question = "Tell me about events or festivals in the UK town of Newquay"

# Convert the natural-language question into a DestinationSearch object.
structured_query =query_generator.invoke(question)

In [39]:
structured_query

DestinationSearch(content_search='events and festivals', destination='Newquay', region='Cornwall')

In [40]:
search_filter = build_filter(structured_query)

In [41]:
search_filter

{'$and': [{'destination': {'$eq': 'Newquay'}},
  {'region': {'$eq': 'Cornwall'}}]}

In [42]:
search_query = structured_query.content_search

In [43]:
search_query

'events and festivals'

In [44]:
# Rebinds metadata_retriever: same collection, now top 3 results restricted by
# the filter built from the structured query.
metadata_retriever = uk_with_metadata_collection.as_retriever(
    search_kwargs={'k':3, 'filter': search_filter})

In [45]:
answer = metadata_retriever.invoke(search_query)

In [46]:
print(answer)

[Document(id='c12122b2-8914-41cc-bb48-394383cd4c98', metadata={'region': 'Cornwall', 'destination': 'Newquay', 'source': 'https://en.wikivoyage.org/wiki/Newquay'}, page_content="## Do\n\n[edit]\n\n  * Cornish Film Festival. Held annually for two weeks each November around Newquay. (updated Jan 2024)\n  * 50.415741-5.0914781 Newquay Golf Club, Tower Road, TR7 1LT, ☏ +44 1637 872091, info@newquaygolfclub.co.uk. 9AM-4PM. A semi-private golf club established in 1890. Total yardage Championship: 6141, Men: 5708, and Women: 5364. £31 for non-members. (updated Apr 2019)\n\n### Beaches\n\n[edit]\n\nFistral Beach\n\nNewquay is well known as a surfer's paradise. Therefore it offers plenty of\nbeaches:"), Document(id='748fc0e8-fea5-464f-87ed-e75127db0227', metadata={'region': 'Cornwall', 'destination': 'Newquay', 'source': 'https://en.wikivoyage.org/wiki/Newquay'}, page_content="## Eat\n\n[edit]\n\n### Budget\n\n[edit]\n\nThere are lots of cheap eats in the town centre.\n\n  * 50.415513-5.0868851

In [47]:
## COMMENT: this is only the retrieval step; you still need to wrap it in a RAG chain

# Generating a structured SQL query

## Connecting to the UkBooking database

In [49]:
from langchain_community.utilities import SQLDatabase
from langchain_community.tools import QuerySQLDataBaseTool
from langchain_classic.chains import create_sql_query_chain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import getpass
import os

In [50]:
# Open the SQLite file UkBooking.db (path relative to the working directory)
# and list its usable tables.
db = SQLDatabase.from_uri("sqlite:///UkBooking.db")
print(db.get_usable_table_names())

['Accommodation', 'AccommodationType', 'Booking', 'Customer', 'Destination', 'Offer']


In [51]:
db.run("SELECT * FROM Offer;")

"[(1, 1, 'Summer Special', 0.15, '2024-06-01', '2024-08-31'), (2, 2, 'Weekend Getaway', 0.1, '2024-09-01', '2024-12-31'), (3, 3, 'Early Bird Discount', 0.2, '2024-05-01', '2024-06-30'), (4, 4, 'Stay 3 Nights, Get 1 Free', 0.25, '2024-01-01', '2024-03-31'), (5, 5, 'Historic Stay Offer', 0.1, '2024-04-01', '2024-06-30'), (6, 6, 'Autumn Discount', 0.15, '2024-09-01', '2024-11-30'), (7, 7, 'Cottage Retreat Offer', 0.12, '2024-07-01', '2024-09-30'), (8, 8, 'City Break Deal', 0.08, '2024-10-01', '2024-12-31'), (9, 9, 'Luxury Villa Offer', 0.18, '2024-05-01', '2024-08-31'), (10, 10, 'Spa & Wellness Package', 0.2, '2024-04-01', '2024-07-31')]"

## Generate SQL queries from natural language

In [52]:
# NOTE(review): the key is already requested in the first cell of the notebook;
# this second prompt is only needed when this section is run on its own.
OPENAI_API_KEY = getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


### Generating the SQL query

In [105]:
# Rebinds the notebook-wide `llm` to gpt-4.1 for the SQL generation steps.
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-4.1")
# Chain that turns a natural-language question into SQL text for `db`.
sql_query_gen_chain = create_sql_query_chain(llm, db)
response = sql_query_gen_chain.invoke(
    {"question": 
     "Give me some offers for Cardiff, including the hotel name"})

In [106]:
response

'Question: Give me some offers for Cardiff, including the hotel name  \nSQLQuery:  \nSELECT "Offer"."OfferDescription", "Accommodation"."Name"\nFROM "Offer"\nJOIN "Accommodation" ON "Offer"."AccommodationId" = "Accommodation"."AccommodationId"\nJOIN "Destination" ON "Accommodation"."DestinationId" = "Destination"."DestinationId"\nWHERE "Destination"."Name" = \'Cardiff\'\nLIMIT 5;'

In [107]:
#db.run(response) # returns error

### Executing the SQL query [NOTE: THIS WILL THROW AN ERROR]

In [108]:
# Demonstration of the failure mode: the generator output is piped straight
# into the SQL execution tool. The generated text still starts with
# "Question: ... SQLQuery:", so the tool returns a SQLite syntax error string
# rather than rows.
sql_query_exec_chain = QuerySQLDataBaseTool(db=db)
sql_query_gen_chain = create_sql_query_chain(llm, db)
chain = sql_query_gen_chain | sql_query_exec_chain
chain.invoke({"question": "Give me some offers for Cardiff, including the hotel name"})

'Error: (sqlite3.OperationalError) near "Question": syntax error\n[SQL: Question: Give me some offers for Cardiff, including the hotel name  \nSQLQuery:  \nSELECT "Accommodation"."Name", "Offer"."OfferDescription", "Offer"."DiscountRate", "Offer"."StartDate", "Offer"."EndDate"  \nFROM "Offer"  \nJOIN "Accommodation" ON "Offer"."AccommodationId" = "Accommodation"."AccommodationId"  \nJOIN "Destination" ON "Accommodation"."DestinationId" = "Destination"."DestinationId"  \nWHERE "Destination"."Name" = \'Cardiff\'  \nLIMIT 5;]\n(Background on this error at: https://sqlalche.me/e/20/e3q8)'

### Fixing the SQL format

In [122]:
# Prompt asking the LLM to strip prefixes/suffixes from a generated statement
# and return only an executable query; the raw text goes in {unclean_sql}.
clean_sql_prompt_template = """You are an expert in SQL Lite. 
You are asked to fix badly formed SQL Lite queries, 
which might contain unneded prefixes or suffixes. 
Given the following unclean SQL statement, 
transform it to a clean, 
executable SQL statement for SQL lite.
Always prefix column names with the table name.
Only return an executable SQL statement which terminates 
with a semicolon. Do not return anything else.
Do not include the language name or symbols like ```.

Unclean SQL: {unclean_sql}"""

In [123]:
# Wrap the template string in a prompt object with a single {unclean_sql} input.
clean_sql_prompt = ChatPromptTemplate.from_template(
    clean_sql_prompt_template)

In [124]:
# Uses whichever model `llm` is bound to when this cell runs.
clean_sql_chain = clean_sql_prompt | llm

In [125]:
# question -> generated SQL text -> cleaning prompt + LLM -> plain string
full_sql_gen_chain = sql_query_gen_chain | \
   clean_sql_chain | StrOutputParser()

In [126]:
question = """Give me some offers for Cardiff, 
including the accomodation name"""

In [127]:
# Generate and clean the SQL only; nothing is executed against the database here.
response = full_sql_gen_chain.invoke({"question": question})

In [128]:
response

"SELECT Offer.OfferDescription, Offer.DiscountRate, Offer.StartDate, Offer.EndDate, Accommodation.Name\nFROM Offer\nJOIN Accommodation ON Offer.AccommodationId = Accommodation.AccommodationId\nJOIN Destination ON Accommodation.DestinationId = Destination.DestinationId\nWHERE Destination.Name = 'Cardiff'\nLIMIT 5;"

In [129]:
### Comment: now SQL is fixed

### Executing the SQL query

In [130]:
# Tool that runs a SQL string against `db`.
sql_query_exec_chain = QuerySQLDataBaseTool(db=db)

In [131]:
# question -> cleaned SQL -> executed against the database -> result as a string
sql_query_gen_and_exec_chain = full_sql_gen_chain \
    | sql_query_exec_chain | StrOutputParser()

In [132]:
# End-to-end run: generate the SQL for the question and execute it.
response = sql_query_gen_and_exec_chain.invoke(
    {"question":question})

In [133]:
response

"[('Early Bird Discount', 0.2, 'Cardiff Camping')]"

In [69]:
## COMMENT: this is only the retrieval step; you still need to wrap it in a RAG chain

# Query router

In [72]:
from typing import Literal
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_core.runnables import RunnableLambda

## Setting up the data retrievers

### Setting up the vector store retriever

In [73]:
# Accepts a {'question': ...} dict, extracts the question text and retrieves
# the 2 most similar chunks from the vector store (no metadata filter).
tourist_info_retriever_chain = RunnableLambda(
    lambda x: x['question']) \
       | uk_with_metadata_collection.as_retriever(
           search_kwargs={'k':2}) 

### Setting up the relational database retriever (Same as sql_query_gen_and_exec_chain above)

In [74]:
# Built from the same three steps as sql_query_gen_and_exec_chain; it also
# takes a {'question': ...} dict as input.
uk_accommodation_retriever_chain =  full_sql_gen_chain \
    | sql_query_exec_chain | StrOutputParser()

## Setting up the query router

In [75]:
class RouteQuery(BaseModel):
    """Route a user question to the most relevant datasource."""

    # Only these two literal values are allowed; they are the same strings
    # used as keys of the retriever_chains dict.
    datasource: Literal["tourist_info_store", 
        "uk_booking_db"] = Field(
        ...,
        description="""Given a user question, 
        route it either to a tourist info vector store 
        or a UK accomodation booking relational database.""",
    )

llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-5-nano")

# Request function calling explicitly, matching both the structured query
# generator earlier in the notebook and the annotation below, instead of
# relying on the library's default structured-output method.
structured_llm_router = llm.with_structured_output(
    RouteQuery, method="function_calling") #A
#A Structured router which uses LLM function calls

### Setting up the question router chain

In [76]:
# Routing instructions: general tourist information goes to the vector store,
# accommodation availability or booking goes to the UK Booking database.
system = """You are an expert at routing a user question 
to a tourist info vector store 
or to an UK accommodation booking relational database.
The vector store contains tourist information about UK destinations.
Use the vectorstore for general tourist information questions 
on UK destinations. 
For questions about accommodation availability or booking, 
use the UK Booking database."""
route_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

# {'question': ...} -> prompt -> structured LLM -> RouteQuery
question_router = route_prompt | structured_llm_router

### Testing the router chain

In [77]:
# Smoke test: a question about offers.
selected_data_source = question_router.invoke(
    {"question": "Have you got any offers in Brighton?"}
)

In [78]:
print(selected_data_source)

datasource='uk_booking_db'


In [79]:
# Smoke test: a general tourist information question.
selected_data_source = question_router.invoke(
    {"question": "Where are the best beaches in Cornwall?"}
)

In [80]:
print(selected_data_source)

datasource='tourist_info_store'


### Setting up the retriever chooser

In [81]:
# Maps each RouteQuery.datasource value to the chain that serves it.
retriever_chains = {
    'tourist_info_store': tourist_info_retriever_chain,
    'uk_booking_db': uk_accommodation_retriever_chain
}

def retriever_chooser(question):
    """Return the retriever chain for the datasource the router selects.

    Args:
        question: The user question as plain text.

    Returns:
        The entry of ``retriever_chains`` keyed by the routed datasource.
    """
    routing_decision = question_router.invoke({"question": question})
    return retriever_chains[routing_decision.datasource]

In [82]:
# Check which retriever chain the router selects for a tourist-information question.
chosen = retriever_chooser("""Tell me about events 
or festivals in the UK town of Newquay""") 

In [83]:
print(chosen)

first=RunnableLambda(...) middle=[] last=VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000002EDE7826A10>, search_kwargs={'k': 2})


## Setting up the full RAG chain

In [84]:
from langchain_core.runnables import RunnablePassthrough

In [85]:
# Answering prompt shared by both retrievers. It tells the model how to read
# tuple-shaped context, which is what the SQL retriever returns.
rag_prompt_template = """
Given a question and some context, answer the question.
If you get a structured context, like a tuple, try to 
infer the meaning of the components: 
typically they refer to accommodation offers, 
and the number is a percentage (0.2 means 20%).
If you do not know the answer, just say I do not know.

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template) 

def execute_rag_chain(question, chosen_retriever):
    """Answer ``question`` using context fetched by ``chosen_retriever``.

    The module-level ``rag_prompt`` and ``llm`` are looked up when the function
    is called, so it uses whatever those names are bound to at that moment.

    Args:
        question: The original user question as plain text.
        chosen_retriever: A chain accepting ``{"question": ...}`` and returning
            the context for the prompt.

    Returns:
        The answer as a string.
    """
    # Wrap the raw question in the dict shape the retriever chains expect.
    context_chain = {"question": RunnablePassthrough()} | chosen_retriever

    prompt_inputs = {
        "context": context_chain,
        # The original user question is forwarded untouched.
        "question": RunnablePassthrough(),
    }

    answer_chain = prompt_inputs | rag_prompt | llm | StrOutputParser()
    return answer_chain.invoke(question)

## Executing the full RAG chain 

### Question on accommodation offers

In [86]:
question = """Give me some offers for Cardiff, 
including the accommodation name"""

# Route first, then run the RAG chain with the retriever that was picked.
chosen_retriever = retriever_chooser(question)

answer = execute_rag_chain(question, chosen_retriever)

In [87]:
print(answer)

- Cardiff Camping — Early Bird Discount: 20% off, valid 2024-05-01 to 2024-06-30.


### Question on tourist information

In [88]:
question_2 = """Tell me about events or festivals 
in the UK town of Newquay"""

# Same two steps as the previous example, for a tourist-information question.
chosen_retriever_2 = retriever_chooser(question_2)

answer2 = execute_rag_chain(question_2, chosen_retriever_2)

In [89]:
print(answer2)

- Cornish Film Festival: Held annually in November around Newquay.
- Surfing events: Newquay is described as the UK’s surfing capital, with events such as the Boardmasters festival and national surfing championships taking place in the area.


# Retrieval post processing

## RAG Fusion

### Multiple query Generation (same as for MultiQueryRetriever)

In [90]:
from langchain_core.prompts import ChatPromptTemplate

from typing import List
from langchain_core.output_parsers import BaseOutputParser
from pydantic import BaseModel, Field

In [91]:
# Prompt asking for five rephrasings of the question, one per line; the
# newline-separated format is what LineListOutputParser splits on.
multi_query_gen_prompt_template = """
You are an AI language model assistant. Your task is 
to generate five different versions of the given user 
question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the 
user question, your goal is to help
the user overcome some of the limitations of the 
distance-based similarity search. 
Provide these alternative questions separated by newlines.
Original question: {question}
"""

multi_query_gen_prompt = ChatPromptTemplate.from_template(
    multi_query_gen_prompt_template) 

In [92]:
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Parse out a question from each output line."""

    def parse(self, text: str) -> List[str]:
        """Split the LLM output into one entry per non-blank line.

        Each line is stripped of surrounding whitespace, and lines that are
        empty or whitespace-only are dropped, so stray spaces or Windows line
        endings never yield blank queries.

        Args:
            text: Raw text returned by the LLM.

        Returns:
            The non-blank lines, in their original order.
        """
        stripped_lines = (line.strip() for line in text.splitlines())
        return [line for line in stripped_lines if line]


questions_parser = LineListOutputParser()

In [93]:
# Rebinds the notebook-wide `llm` again, this time to gpt-5.
llm = ChatOpenAI(model="gpt-5", openai_api_key=OPENAI_API_KEY)

In [94]:
# {'question': ...} -> prompt -> LLM -> list of question strings
multi_query_gen_chain = multi_query_gen_prompt | llm | questions_parser

### Reciprocal Rank Fusion algorithm

In [95]:
# Based on: https://github.com/Raudaschl/rag-fusion/blob/master/main.py

def reciprocal_rank_fusion(results_groups: list[list], k=60):
    """Fuse several ranked result lists with Reciprocal Rank Fusion (RRF).

    Every appearance of a document at zero-based position ``rank`` within a
    group contributes ``1 / (rank + k)`` to that document's score. The
    contributions of the same document are summed across all groups, so a
    document retrieved by several queries is promoted and returned only once.

    Args:
        results_groups: One ranked list of documents per query.
        k: Smoothing constant of the RRF formula.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by descending
        score. Documents with equal scores keep their first-seen order.
    """
    first_seen_docs = {}  # identity key -> first document object seen for it
    fused_scores = {}     # identity key -> accumulated RRF score

    for results_group in results_groups:
        for local_rank, doc in enumerate(results_group):
            doc_key = _rrf_document_key(doc)
            first_seen_docs.setdefault(doc_key, doc)
            fused_scores[doc_key] = (
                fused_scores.get(doc_key, 0) + 1 / (local_rank + k))

    # sorted() is stable, so ties preserve insertion (first-seen) order.
    ranked_keys = sorted(fused_scores, key=fused_scores.get, reverse=True)

    return [(first_seen_docs[key], fused_scores[key]) for key in ranked_keys]


def _rrf_document_key(doc):
    """Return a hashable key identifying the same document across groups."""
    doc_id = getattr(doc, "id", None)
    if doc_id is not None:
        return ("id", doc_id)

    page_content = getattr(doc, "page_content", None)
    if page_content is not None:
        return ("page_content", page_content)

    # Plain values (e.g. strings) identify themselves when they are hashable.
    try:
        hash(doc)
    except TypeError:
        return ("repr", repr(doc))
    return ("value", doc)

In [96]:
# Plain similarity retriever (top 3 per query, no metadata filter).
retriever = uk_with_metadata_collection.as_retriever(
    search_kwargs={'k':3})
top_three_results = RunnableLambda(
    lambda x: x[0:3]) #A

# retriever.map() runs the retriever once per generated question; the resulting
# groups are fused and truncated to the three best (document, score) pairs.
rag_fusion_retrieval_chain =multi_query_gen_chain \
    | retriever.map() | reciprocal_rank_fusion \
    | top_three_results #B
        
# NOTE(review): `question` is whatever an earlier cell last assigned (the
# Cardiff offers question if the cells ran in order) - confirm this is the
# intended test input for a tourist-information collection.
docs = rag_fusion_retrieval_chain.invoke(
    {"question": question}) #C
len(docs)
#A select the top three results
#B Full RAG fusion retrieval chain
#C testing the retrieval_chain_rag_fusion chain

3

### Incorporating RAG Fusion into the RAG Chain

In [97]:
# NOTE: this rebinds rag_prompt_template and rag_prompt, which were defined
# earlier for the router section. execute_rag_chain reads rag_prompt when it is
# called, so calls made after this cell will use this simpler prompt.
rag_prompt_template = """
Given a question and some context, answer the question.
If you do not know the answer, just say I do not know.

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template) 

rag_chain = (
    {
        "context": {"question": RunnablePassthrough()} | rag_fusion_retrieval_chain,#A
        "question": RunnablePassthrough(),#B
    }
    | rag_prompt
    | llm
    | StrOutputParser()
)
#A The context is the fused top results returned by the RAG fusion retrieval chain for the user question
#B This is the original user question

In [98]:
user_question = "Can you give me some tips for a trip to Brighton?"

# The chain takes the plain question string and wraps it in a dict itself.
answer = rag_chain.invoke(user_question)
print(answer)

Here are a few tips based on the provided info:

- When to go: The city really comes to life in spring. May brings two major events—Brighton Festival and the Festival Fringe.
- Summer vibe: Brighton flourishes in summer with lazy days and beautiful sunsets along its 5+ mile (8 km) shingle beach facing the English Channel.
- Trip length: A day trip or a long weekend works well year-round.
- Getting there: Trains are a convenient way in (see “Rail travel in Great Britain” for guidance).
- Work options: If you have a working visa, Brighton is good for seasonal and temporary jobs.
- Local info: Check the Brighton & Hove City Council website for updates and event details.
