# Advanced document indexing

# Splitting and ingesting HTML content

## Splitting and ingesting the content of a single URL (on Cornwall)

### Preparing the Chroma DB collections

In [1]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import getpass

OPENAI_API_KEY = getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


In [2]:
corwnall_granular_collection = Chroma( #A
    collection_name="cornwall_granular",
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
)

corwnall_granular_collection.reset_collection() #B
# A Create a Chorma DB collection
# B Reset the collection in case it already exists 

In [3]:
corwnall_coarse_collection = Chroma( #A 
    collection_name="cornwall_coarse",
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
)

corwnall_coarse_collection.reset_collection() #B
# A Create a Chorma DB collection
# B Reset the collection in case it already exists 

### Loading the HTML content with the AsyncHtmlLoader 

In [4]:
from langchain_community.document_loaders import AsyncHtmlLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [5]:
destination_url = "https://en.wikivoyage.org/wiki/Cornwall"

In [6]:
html_loader = AsyncHtmlLoader(destination_url)

In [7]:
docs = html_loader.load()

Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  8.92it/s]


In [8]:
len(docs)

1

### Splitting into granular chunks with the HTMLSectionSplitter 

In [9]:
from langchain_text_splitters import HTMLSectionSplitter

In [10]:
headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]
html_section_splitter = HTMLSectionSplitter(
    headers_to_split_on=headers_to_split_on)

In [11]:
def split_docs_into_granular_chunks(docs):
    all_chunks = []
    for doc in docs:
        html_string = doc.page_content #A
        temp_chunks = html_section_splitter.split_text(
            html_string) #B
        all_chunks.extend(temp_chunks) 

    return all_chunks

#A Extract the HTML text from the document
#B Each chunk is a H1 or H2 HTML section

In [12]:
granular_chunks = split_docs_into_granular_chunks(docs)

#### Ingesting granular chunks

In [13]:
corwnall_granular_collection.add_documents(documents=granular_chunks)

['78ffd1fa-6845-451f-8d8c-5954ed23c47f',
 '218c3426-d2ef-424c-8cc1-e723e09fd300',
 'dba57221-bf97-418e-9f8f-50e7289031d4',
 '607a81cd-8abe-40e3-8ba4-27503cbdc098',
 'd7bc097f-7c29-4b3a-b210-825afb574dac',
 'bba38404-6697-4312-a2ad-84fce0235162',
 'bd9f23ab-c34f-4d17-9d8d-4341212495b9',
 '68a0bbfb-1d42-4f17-b3ab-793e1c87fa4b',
 '852f09f5-4f13-41a6-8cd9-c934fd6bdf5a',
 'ccf3c984-2d25-4340-a983-400d63432ad9',
 '7a4aefc6-0992-430c-ae4f-d3030db3e7e4',
 'dd7e2ae3-a254-4915-bd98-29b072730719',
 '501a4c01-bdaf-4ed7-9b47-35cde19a1710',
 '5312c3f8-dcba-4801-b43d-b141bcd7a09e',
 'b9fd5fde-5e61-45b5-9359-177e8b894b4b',
 '65b790e8-f77b-45bc-9277-e18c3af65711',
 'd340953e-335d-4c1a-8d29-3d3bc0fc4313',
 'a3322085-31c4-474f-8db1-b0c016d58b16',
 '23195549-b5ac-433e-8bf7-df52f9a8ae11']

#### Searching granular chunks

In [14]:
results = corwnall_granular_collection.similarity_search(
    query="Events or festivals in Cornwall",k=3)
for doc in results:
    print(doc)

page_content='Cornwall' metadata={'Header 1': 'Cornwall'}
page_content='Festivals 
 [ edit ] 
 
 These festivals tend to not be public holidays and not all are celebrated fully across the county. 
   
 AberFest 
 .   A Celtic cultural festival celebrating “All things” Cornish and Breton that takes place biennially (every two years) in Cornwall at Easter. The AberFest Festival alternates with the Breizh – Kernow Festival that is held in Brandivy and Bignan (in Breizh/Bretagne – France) on the alternate years.       ( updated Jun 2023 ) 
 Golowan , sometimes also  Goluan  or  Gol-Jowan  is the Cornish word for the Midsummer celebrations, most popular in the Penwith area and in particular  Penzance  and  Newlyn . The celebrations are conducted from the 23rd of June (St John's Eve) to the 28th of June (St Peter's Eve) each year, St Peter's Eve being the more popular in Cornish fishing communities. The celebrations are centred around the lighting of bonfires and fireworks and the performanc

### Splitting into coarse chunks with the RecursiveCharacterTextSplitter 

In [15]:
from langchain_community.document_transformers import Html2TextTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [16]:
html2text_transformer = Html2TextTransformer()

In [17]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000, chunk_overlap=300
)

In [18]:
def split_docs_into_coarse_chunks(docs):
    text_docs = html2text_transformer.transform_documents(
        docs) #A 
    coarse_chunks = text_splitter.split_documents(
        text_docs)

    return coarse_chunks
#A transform HTML docs into clean text docs

In [19]:
coarse_chunks = split_docs_into_coarse_chunks(docs)

#### Ingesting coarse chunks

In [20]:
corwnall_coarse_collection.add_documents(documents=coarse_chunks)

['ecead088-14e3-4846-9a4e-371f31513cc3',
 '8416f862-34b5-4f76-bdfa-6a5c38810674',
 'a4339481-ac18-4ca4-90f7-862e01e166e8',
 'bde0f20f-b12c-4cbe-8202-65c7ee240cbe',
 'b25da7f6-fc24-486d-ae70-6bf81eeb53cf',
 '5fed9278-50cd-4330-8ef8-326c95c7c9ac',
 'e96ac49e-a409-41e7-b0b6-7cd0b4e8f3da',
 '542d54a9-1785-4157-9846-2885362d11ab',
 '4df93b89-09c8-46d2-88fd-e7542ec18849',
 'd77f79c9-d6a5-4914-af4e-a625bc39beb0',
 '3b6f9f0b-7188-4da3-ac36-2f525f64844d',
 '1476b8e9-05ab-4f29-9e5e-b553d5ce8c7e',
 '66d57a3e-3d52-43bd-aefd-b678ccbc2dde',
 '764a0c39-9320-4835-b80b-751965207447',
 '06591698-ee1e-4a45-ae0b-34839b96821a']

#### Searching coarse chunks

In [21]:
results = corwnall_coarse_collection.similarity_search(
    query="Events or festivals in Cornwall",k=3)
for doc in results:
    print(doc)

page_content='### Spirits

[edit]

    _See also:Liquor_

Gin and rum are also produced in Cornwall. A popular brand of Cornish rum is
Dead Man's Fingers which has multiple flavours and is bottled in St. Ives.

## Festivals

[edit]

These festivals tend to not be public holidays and not all are celebrated
fully across the county.

AberFest. A Celtic cultural festival celebrating “All things” Cornish and
Breton that takes place biennially (every two years) in Cornwall at Easter.
The AberFest Festival alternates with the Breizh – Kernow Festival that is
held in Brandivy and Bignan (in Breizh/Bretagne – France) on the alternate
years. (updated Jun 2023)

**Golowan** , sometimes also _Goluan_ or _Gol-Jowan_ is the Cornish word for
the Midsummer celebrations, most popular in the Penwith area and in particular
Penzance and Newlyn. The celebrations are conducted from the 23rd of June (St
John's Eve) to the 28th of June (St Peter's Eve) each year, St Peter's Eve
being the more popular in Corni

## Splitting and ingesting the content of various URLs (across UK destinations)

### Preparing the Chroma DB collections

In [22]:
uk_granular_collection = Chroma( #A
    collection_name="uk_granular",
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
)

uk_granular_collection.reset_collection() #B

In [23]:
uk_coarse_collection = Chroma( #A
    collection_name="uk_coarse",
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
)

uk_coarse_collection.reset_collection() #B

### Splitting and ingesting HTML content with the HTMLSectionSplitter 

In [24]:
# Reduce this list if you want to save on processing fees
uk_destinations = [
    "Cornwall", "North_Cornwall", "South_Cornwall", "West_Cornwall", 
    "Tintagel", "Bodmin", "Wadebridge", "Penzance", "Newquay",
    "St_Ives", "Port_Isaac", "Looe", "Polperro", "Porthleven"
    "East_Sussex", "Brighton", "Battle", "Hastings_(England)", 
    "Rye_(England)", "Seaford", "Ashdown_Forest"
] 

wikivoyage_root_url = "https://en.wikivoyage.org/wiki"

In [25]:
uk_destination_urls = [f'{wikivoyage_root_url}/{d}' for d in uk_destinations]

In [26]:
for destination_url in uk_destination_urls:
    html_loader = AsyncHtmlLoader(destination_url) #C
    docs =  html_loader.load() #D
    
    for doc in docs:
        print(doc.metadata)
        granular_chunks = split_docs_into_granular_chunks(docs)
        uk_granular_collection.add_documents(documents=granular_chunks)

        coarse_chunks = split_docs_into_coarse_chunks(docs)
        uk_coarse_collection.add_documents(documents=coarse_chunks)
#A Create a Chroma DB collection
#B Reset the collection in case it already exists 
#C Loader for one destination
#D Documents of one destination 

Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.36it/s]


{'source': 'https://en.wikivoyage.org/wiki/Cornwall', 'title': 'Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.12it/s]


{'source': 'https://en.wikivoyage.org/wiki/North_Cornwall', 'title': 'North Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.74it/s]


{'source': 'https://en.wikivoyage.org/wiki/South_Cornwall', 'title': 'South Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.98it/s]


{'source': 'https://en.wikivoyage.org/wiki/West_Cornwall', 'title': 'West Cornwall – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 18.45it/s]


{'source': 'https://en.wikivoyage.org/wiki/Tintagel', 'title': 'Tintagel – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.72it/s]


{'source': 'https://en.wikivoyage.org/wiki/Bodmin', 'title': 'Bodmin – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.36it/s]


{'source': 'https://en.wikivoyage.org/wiki/Wadebridge', 'title': 'Wadebridge – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.49it/s]


{'source': 'https://en.wikivoyage.org/wiki/Penzance', 'title': 'Penzance – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.70it/s]


{'source': 'https://en.wikivoyage.org/wiki/Newquay', 'title': 'Newquay – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.14it/s]


{'source': 'https://en.wikivoyage.org/wiki/St_Ives', 'title': 'St Ives – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.44it/s]


{'source': 'https://en.wikivoyage.org/wiki/Port_Isaac', 'title': 'Port Isaac – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.95it/s]


{'source': 'https://en.wikivoyage.org/wiki/Looe', 'title': 'Looe – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.80it/s]


{'source': 'https://en.wikivoyage.org/wiki/Polperro', 'title': 'Polperro – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 22.55it/s]


{'source': 'https://en.wikivoyage.org/wiki/PorthlevenEast_Sussex', 'title': 'PorthlevenEast Sussex – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.32it/s]


{'source': 'https://en.wikivoyage.org/wiki/Brighton', 'title': 'Brighton – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.09it/s]


{'source': 'https://en.wikivoyage.org/wiki/Battle', 'title': 'Battle – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.70it/s]


{'source': 'https://en.wikivoyage.org/wiki/Hastings_(England)', 'title': 'Hastings (England) – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.45it/s]


{'source': 'https://en.wikivoyage.org/wiki/Rye_(England)', 'title': 'Rye (England) – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.33it/s]


{'source': 'https://en.wikivoyage.org/wiki/Seaford', 'title': 'Seaford – Travel guide at Wikivoyage', 'language': 'en'}


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.88it/s]


{'source': 'https://en.wikivoyage.org/wiki/Ashdown_Forest', 'title': 'Ashdown Forest – Travel guide at Wikivoyage', 'language': 'en'}


#### Searching 

In [27]:
granular_results = uk_granular_collection.similarity_search(
    query="Events or festivals in East Sussex",k=4)
for doc in granular_results:
    print(doc)

page_content='Brighton' metadata={'Header 1': 'Brighton'}
page_content='South Cornwall' metadata={'Header 1': 'South Cornwall'}
page_content='Seaford' metadata={'Header 1': 'Seaford'}
page_content='Penzance' metadata={'Header 1': 'Penzance'}


In [28]:
coarse_results = uk_coarse_collection.similarity_search(
    query="Events or festivals in East Sussex",k=4)
for doc in coarse_results:
    print(doc)

page_content='### Events

[edit]

A market during the Brighton Festival. The Theatre Royal is the red building.
A colourful parade down Queens Road during Pride in 2016.

  * **Brighton Racecourse** has flat-racing April-Oct. It's on Freshfield Rd a mile east of town centre.
  * **Plumpton Racecourse** is National Hunt (jumps races) Nov-March, but it's 10 mi (16 km) north in Lewes.
  * Brighton Festival Fringe: early May – early June, ☏ +44 1273 764900, info@brightonfringe.org. The Fringe runs at the same time as the main festival, and features over 600 events, including comedy, theatre, music, and "open houses" (local artists exhibiting in their own homes) and tours (haunted pubs, Regency Brighton, churches, cemeteries, sewers, etc.)_(date needs fixing)_
  * Brighton Festival: May, ☏ +44 1273 709709 (tickets), tickets@brightonfestival.org. The Brighton Festival, in May each year, is the second biggest arts festival in Great Britain (coming closely behind Edinburgh). Music of all sorts

In [29]:
granular_results = uk_granular_collection.similarity_search(
    query="Beaches in Conrwall",k=4)
for doc in granular_results:
    print(doc)

page_content='Cornwall' metadata={'Header 1': 'Cornwall'}
page_content='North Cornwall' metadata={'Header 1': 'North Cornwall'}
page_content='West Cornwall' metadata={'Header 1': 'West Cornwall'}
page_content='South Cornwall' metadata={'Header 1': 'South Cornwall'}


In [30]:
coarse_results = uk_coarse_collection.similarity_search(
    query="Beaches in Cornwall",k=4)
for doc in coarse_results:
    print(doc)

page_content='**South Cornwall** is in Cornwall. It includes much of the stunning Cornish
coast along the English Channel of the Atlantic Ocean.

## Towns and villages

[edit]

Map of South Cornwall

  * 50.26-5.0511 Truro — Cornwall's main centre hosts the Royal Cornwall Museum
  * 50.3311-4.20212 Cawsand — overlooks Plymouth Sound; Cawsand is within Mount Edgcumbe Country Park
  * 50.15-5.073 Falmouth — famous for its beaches, it is home to the world's third largest natural harbour
  * 50.334-4.6334 Fowey — the Fowey Regatta in mid-August attracts many yachts and sailing boats
  * 50.354-4.4545 Looe — a summer resort place with a monkey sanctuary, and an active fishing village
  * 50.408-4.2126 Saltash — "Gateway to Cornwall", a small town on the Cornwall side of the Tamar crossings
  * 50.338-4.7957 St Austell — largest town in the county and home to the Eden Project, the world's largest greenhouse
  * 50.3314-4.75788 Charlestown — seaside town used as filming location for the TV sh

# Embedding strategy

## Embedding child chunks with ParentDocumentRetriever

In [31]:
from langchain_classic.retrievers import ParentDocumentRetriever
from langchain_classic.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

### Setting up the Parent Document retriever

In [32]:
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=3000) #A
child_splitter = RecursiveCharacterTextSplitter(chunk_size=500) #B

child_chunks_collection = Chroma( #C
    collection_name="uk_child_chunks",
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
)

child_chunks_collection.reset_collection() #D

doc_store = InMemoryStore() #E

parent_doc_retriever = ParentDocumentRetriever( #F
    vectorstore=child_chunks_collection,
    docstore=doc_store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)
#A Splitter to generate parent coarse chunks from original documents (parsed from web pages)
#B Splitter to generate child granular chunks from parent coarse chunks
#C Vector store collection to host child granular chunks
#D Make sure the collection is empty
#E Document store to host parent coarse chunks
#F Retriever to link parent coarse chunks to child granular chunks

### Ingesting the content into doc and vector store

In [33]:
for destination_url in uk_destination_urls:
    html_loader = AsyncHtmlLoader(destination_url) #A
    html_docs =  html_loader.load() #B
    text_docs = html2text_transformer.transform_documents(html_docs) #C

    print(f'Ingesting {destination_url}')
    parent_doc_retriever.add_documents(text_docs, ids=None) #D

#A Loader for destination web page
#B HTML documents of one destination 
#C Transform HTML docs into clean text deocs
#D Ingest coarse chunks into document store and granular chunks into vector store

Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.05it/s]


Ingesting https://en.wikivoyage.org/wiki/Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.39it/s]


Ingesting https://en.wikivoyage.org/wiki/North_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 18.45it/s]


Ingesting https://en.wikivoyage.org/wiki/South_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.29it/s]


Ingesting https://en.wikivoyage.org/wiki/West_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.33it/s]


Ingesting https://en.wikivoyage.org/wiki/Tintagel


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.78it/s]


Ingesting https://en.wikivoyage.org/wiki/Bodmin


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.73it/s]


Ingesting https://en.wikivoyage.org/wiki/Wadebridge


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.82it/s]


Ingesting https://en.wikivoyage.org/wiki/Penzance


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.94it/s]


Ingesting https://en.wikivoyage.org/wiki/Newquay


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.43it/s]


Ingesting https://en.wikivoyage.org/wiki/St_Ives


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.06it/s]


Ingesting https://en.wikivoyage.org/wiki/Port_Isaac


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.08it/s]


Ingesting https://en.wikivoyage.org/wiki/Looe


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.35it/s]


Ingesting https://en.wikivoyage.org/wiki/Polperro


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 20.84it/s]


Ingesting https://en.wikivoyage.org/wiki/PorthlevenEast_Sussex


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.59it/s]


Ingesting https://en.wikivoyage.org/wiki/Brighton


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.78it/s]


Ingesting https://en.wikivoyage.org/wiki/Battle


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.34it/s]


Ingesting https://en.wikivoyage.org/wiki/Hastings_(England)


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.43it/s]


Ingesting https://en.wikivoyage.org/wiki/Rye_(England)


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.61it/s]


Ingesting https://en.wikivoyage.org/wiki/Seaford


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.08it/s]


Ingesting https://en.wikivoyage.org/wiki/Ashdown_Forest


In [34]:
list(doc_store.yield_keys())
#A Show the keys of the added coarse chunks

['4000bc4c-11ad-41eb-9751-7bcd6f72654d',
 '1b4727c7-ba4c-4fc0-a395-1bb52edf23fd',
 'b23594c5-66ae-4f24-b307-383635b09858',
 '5ebe455b-cb09-41fa-9892-732a36dec28a',
 '508c5f9c-0537-4fd3-9068-327f91e6213f',
 '98f789bc-901a-4565-b07e-f56d78993423',
 'a0e75fb7-bda2-4b2a-967f-fdc73257dcc3',
 '248523cf-a3bf-47b6-80ba-234747eaf19c',
 '32d8b2d8-e59d-4470-8c86-5e60cf145520',
 '73ff6050-7428-4d47-aa50-32f1afd7f1a0',
 'f4903ff5-d516-4e27-b827-42126ad48c77',
 'cfbc6fb8-eb67-49f1-8d4a-5c6fa6021757',
 '29b94a31-6e3f-4fa1-a692-7f6e9c53f031',
 'a79e6645-1589-4f03-a13a-fa5b66cd51ff',
 '95bc1eec-0037-40ff-8faa-f69f53ea0a5b',
 '00e38278-88bf-419c-97cb-fd1e5f81297a',
 '8274d332-7265-4d32-aaf2-971f4bcea977',
 'abab2c45-e487-488a-aa23-00e6c385c64b',
 'a5c173c9-52cc-4f6e-a64a-90f50ebd2c36',
 'd989afef-0acd-471b-9b82-46f0dc390f8f',
 '7d2eb85a-6d79-4d55-b94b-795d433ad09c',
 '13d462c2-446c-4bdb-9bb0-adde90a3acce',
 'cfe67351-55df-4b97-8e14-f77f096868d8',
 'ecf78ce8-8a98-4ec0-a4da-1b5e862477b9',
 '91e78cac-c8c7-

### Performing a search on granular information 

In [35]:
retrieved_docs = parent_doc_retriever.invoke("Cornwall Ranger")

In [36]:
len(retrieved_docs)

4

In [37]:
retrieved_docs[0]

Document(metadata={'source': 'https://en.wikivoyage.org/wiki/South_Cornwall', 'title': 'South Cornwall – Travel guide at Wikivoyage', 'language': 'en'}, page_content="## Get around\n\n[edit]\n\n### By bus\n\n[edit]\n\nThanks to Transport for Cornwall, all bus tickets are interchangeable across\nthe different companies. The **Cornwall All Day ticket** allows unlimited\ntravel for a calendar day. As of 2023, fares are £5 for adults and £4 for\nunder-19s. Payment is by cash or contactless. The two main bus companies are:\n\n  * **Go Cornwall Bus** covers all parts of Cornwall and connects with Plymouth (in Devon).\n  * **Kernow** (part of First Bus) covers western and central Cornwall.\n\nBuses only serve designated stops when in towns; otherwise, you can flag them\ndown anywhere that's safe for them to stop.\n\n### By train\n\n[edit]\n\n**CrossCountry Trains** and **Great Western Railway** operate regular train\nservices between the main centres of population, the latter company also\nse

### Comparing with direct semantic search on child chunks

In [38]:
child_docs_only =  child_chunks_collection.similarity_search("Cornwall Ranger")

In [39]:
len(child_docs_only)

4

In [40]:
child_docs_only[0]

Document(id='e6356a11-7ca1-44cf-ba42-eeb9553ab838', metadata={'title': 'South Cornwall – Travel guide at Wikivoyage', 'source': 'https://en.wikivoyage.org/wiki/South_Cornwall', 'language': 'en', 'doc_id': 'cfe67351-55df-4b97-8e14-f77f096868d8'}, page_content='The **Cornwall Ranger** ticket allows unlimited train travel in Cornwall and\nPlymouth for a calendar day. As of 2023, this costs £14 for adults and £7 for\nunder-16s.\n\n## See\n\n[edit]\n\nThe **Eden Project** , near St Austell, a fabulous collection of flora from\nall over the planet housed in two space age transparent domes, and a massive\nzip line.')

In [41]:
# IMPORTANT: as you can see a granular search would have identified the chunk, but it would have lost the usefulcontext about travelling in Cornwall

## Embedding child chunks with MultiVectorRetriever

In [42]:
from langchain_classic.retrievers.multi_vector import MultiVectorRetriever
from langchain_classic.storage import InMemoryByteStore
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import uuid

### Setting up the Multi vector retriever

In [43]:
parent_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000) #A
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500) #B

child_chunks_collection = Chroma( #C
    collection_name="uk_child_chunks",
    embedding_function=OpenAIEmbeddings(
        openai_api_key=OPENAI_API_KEY),
)

child_chunks_collection.reset_collection() #D

doc_byte_store = InMemoryByteStore() #E
doc_key = "doc_id"

multi_vector_retriever = MultiVectorRetriever( #F
    vectorstore=child_chunks_collection,
    byte_store=doc_byte_store
)
#A Splitter to generate parent coarse chunks from original documents (parsed from web pages)
#B Splitter to generate child granular chunks from parent coarse chunks
#C Vector store collection to host child granular chunks
#D Make sure the collection is empty
#E Document store to host parent coarse chunks
#F Retriever to link parent coarse chunks to child granular chunks

### Ingesting the content into doc and vector store

In [44]:
for destination_url in uk_destination_urls:
    html_loader = AsyncHtmlLoader(destination_url) #A
    html_docs =  html_loader.load() #B
    text_docs = html2text_transformer.transform_documents(
        html_docs) #C

    coarse_chunks = parent_splitter.split_documents(
        text_docs) #D

    coarse_chunks_ids = [str(uuid.uuid4()) for _ in coarse_chunks]
    all_granular_chunks = []
    for i, coarse_chunk in enumerate(
        coarse_chunks): #E
        
        coarse_chunk_id = coarse_chunks_ids[i]
            
        granular_chunks = child_splitter.split_documents(
            [coarse_chunk]) #F

        for granular_chunk in granular_chunks:
            granular_chunk.metadata[doc_key] = coarse_chunk_id #G

        all_granular_chunks.extend(granular_chunks)

    print(f'Ingesting {destination_url}')
    multi_vector_retriever.vectorstore.add_documents(
        all_granular_chunks) #H
    multi_vector_retriever.docstore.mset(
        list(zip(coarse_chunks_ids, coarse_chunks))) #I

#A Loader for one destination
#B Documents of one destination 
#C transform HTML docs into clean text docs
#D Split the destination content into parent coarse chunks
#E Iterate over the parent coarse chunks
#F Create child granular chunks form each parent coarse chunk
#G Link each child granular chunk to its parent coarse chunk
#H Ingest the child granular chunks into the vector store
#I Ingest the parent coarse chunks into the document store

Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.74it/s]


Ingesting https://en.wikivoyage.org/wiki/Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.46it/s]


Ingesting https://en.wikivoyage.org/wiki/North_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.68it/s]


Ingesting https://en.wikivoyage.org/wiki/South_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.26it/s]


Ingesting https://en.wikivoyage.org/wiki/West_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.70it/s]


Ingesting https://en.wikivoyage.org/wiki/Tintagel


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.14it/s]


Ingesting https://en.wikivoyage.org/wiki/Bodmin


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.41it/s]


Ingesting https://en.wikivoyage.org/wiki/Wadebridge


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 10.26it/s]


Ingesting https://en.wikivoyage.org/wiki/Penzance


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.92it/s]


Ingesting https://en.wikivoyage.org/wiki/Newquay


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.63it/s]


Ingesting https://en.wikivoyage.org/wiki/St_Ives


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.58it/s]


Ingesting https://en.wikivoyage.org/wiki/Port_Isaac


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.72it/s]


Ingesting https://en.wikivoyage.org/wiki/Looe


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.42it/s]


Ingesting https://en.wikivoyage.org/wiki/Polperro


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 21.06it/s]


Ingesting https://en.wikivoyage.org/wiki/PorthlevenEast_Sussex


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.92it/s]


Ingesting https://en.wikivoyage.org/wiki/Brighton


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.83it/s]


Ingesting https://en.wikivoyage.org/wiki/Battle


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.33it/s]


Ingesting https://en.wikivoyage.org/wiki/Hastings_(England)


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.17it/s]


Ingesting https://en.wikivoyage.org/wiki/Rye_(England)


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.20it/s]


Ingesting https://en.wikivoyage.org/wiki/Seaford


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.45it/s]


Ingesting https://en.wikivoyage.org/wiki/Ashdown_Forest


### Performing a search on granular information

In [58]:
retrieved_docs = multi_vector_retriever.invoke(
    "Cornwall Ranger")

In [59]:
len(retrieved_docs)

4

In [60]:
retrieved_docs[0]

Document(metadata={'source': 'https://en.wikivoyage.org/wiki/Cornwall', 'title': 'Cornwall – Travel guide at Wikivoyage', 'language': 'en'}, page_content="Cornwall boasts many attractions for the traveller, many lying outside of\ncities and towns amidst the Cornish landscape:\n\n  * Within the 208 m² of the Bodmin Moor, is **King Arthur's Hall** , a megalithic monument and **Brown Willy** , the highest point in Cornwall at 417 m (1,368 ft). **Dozmary Pool** is a small beautiful lake where, according to legend, King Arthur was entrusted with the sword Excalibur. There is also a reputed **Beast of the Moor** , a large wild-cat that haunts and stalks at night, but is similar in fantasy to the Loch Ness Monster, in that no one can prove it exists, though sightings, theories and track-marks abound.\n  * The **Eden Project** , near St Austell, a fabulous collection of flora from all over the planet housed in two 'space age' transparent domes.\n  * The **Lost Gardens of Heligan** \\- near Mev

In [61]:
##IMPORTANT: same as Parent Document retriever, but more control and flexibility on how to link child to parent chunks

### Comparing with direct semantic search on child chunks

In [62]:
child_docs_only =  child_chunks_collection.similarity_search(
    "Cornwall Ranger")

In [63]:
len(child_docs_only)

4

In [64]:
child_docs_only[0]

Document(id='a18b9a25-f88f-433b-8819-2d80d8a39fcd', metadata={'doc_id': 'f69a9d0a-6153-4c74-9f19-83942aeb3876', 'source': 'https://en.wikivoyage.org/wiki/South_Cornwall', 'language': 'en', 'title': 'South Cornwall – Travel guide at Wikivoyage'}, page_content='The **Cornwall Ranger** ticket allows unlimited train travel in Cornwall and\nPlymouth for a calendar day. As of 2023, this costs £14 for adults and £7 for\nunder-16s.\n\n## See\n\n[edit]\n\nThe **Eden Project** , near St Austell, a fabulous collection of flora from\nall over the planet housed in two space age transparent domes, and a massive\nzip line.')

In [65]:
## IMPORTANT: Same as before

## Embedding summaries with MultiVectorRetriever

In [66]:
from langchain_classic.retrievers.multi_vector import MultiVectorRetriever
from langchain_classic.storage import InMemoryByteStore
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import uuid

### Setting up the Multi vector retriever (similar to when embedding child chunks)

In [67]:
parent_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000) #A

summaries_collection = Chroma( #B
    collection_name="uk_summaries",
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
)

summaries_collection.reset_collection() #C

doc_byte_store = InMemoryByteStore() #D
doc_key = "doc_id"

multi_vector_retriever = MultiVectorRetriever( #E
    vectorstore=summaries_collection,
    byte_store=doc_byte_store
)
#A Splitter to generate parent coarse chunks from original documents (parsed from web pages)
#B Vector store collection to host child granular chunks
#C Make sure the collection is empty
#D Document store to host parent coarse chunks
#E Retriever to link parent coarse chunks to child granular chunks

### Setting up the summarization chain

In [68]:
llm = ChatOpenAI(model="gpt-5-nano", openai_api_key=OPENAI_API_KEY)

In [69]:
summarization_chain = (
    {"document": lambda x: x.page_content} #A
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{document}") #B
    | llm
    | StrOutputParser())

#A Grab the text content from the document
#B Instantiate a prompt asking to generate summary of the provided text
#C Send the LLM the instantiated prompt 
#D Extract the summary text from the response

### Ingesting the coarse chunks and related summaries into doc and vector store

In [70]:
for destination_url in uk_destination_urls:
    html_loader = AsyncHtmlLoader(destination_url) #A
    html_docs =  html_loader.load() #B
    text_docs = html2text_transformer.transform_documents(
        html_docs) #C

    coarse_chunks = parent_splitter.split_documents(
        text_docs) #D

    coarse_chunks_ids = [str(uuid.uuid4()) for _ in coarse_chunks]
    all_summaries = []
    for i, coarse_chunk in enumerate(
        coarse_chunks): #E
        
        coarse_chunk_id = coarse_chunks_ids[i]
            
        summary_text =  summarization_chain.invoke(
            coarse_chunk) #F
        summary_doc = Document(page_content=summary_text, 
                               metadata={doc_key: coarse_chunk_id})

        all_summaries.append(summary_doc) #G

    print(f'Ingesting {destination_url}')
    multi_vector_retriever.vectorstore.add_documents(
        all_summaries) #H
    multi_vector_retriever.docstore.mset(
        list(zip(coarse_chunks_ids, coarse_chunks))) #I

#A Loader for one destination
#B Documents of one destination 
#C transform HTML docs into clean text docs
#D Split the destination content into coarse chunks
#E Iterate over the coarse chunks
#F Generate a summary for the coarse chunk thorugh the summarization chain
#G Link each summary to its related coarse chunk
#H Ingest the summaries into the vector store
#I Ingest the coarse chunks into the document store

Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.12it/s]


Ingesting https://en.wikivoyage.org/wiki/Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 18.13it/s]


Ingesting https://en.wikivoyage.org/wiki/North_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 18.90it/s]


Ingesting https://en.wikivoyage.org/wiki/South_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.06it/s]


Ingesting https://en.wikivoyage.org/wiki/West_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 21.76it/s]


Ingesting https://en.wikivoyage.org/wiki/Tintagel


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.46it/s]


Ingesting https://en.wikivoyage.org/wiki/Bodmin


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 20.87it/s]


Ingesting https://en.wikivoyage.org/wiki/Wadebridge


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 16.10it/s]


Ingesting https://en.wikivoyage.org/wiki/Penzance


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.44it/s]


Ingesting https://en.wikivoyage.org/wiki/Newquay


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.51it/s]


Ingesting https://en.wikivoyage.org/wiki/St_Ives


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.12it/s]


Ingesting https://en.wikivoyage.org/wiki/Port_Isaac


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.18it/s]


Ingesting https://en.wikivoyage.org/wiki/Looe


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.36it/s]


Ingesting https://en.wikivoyage.org/wiki/Polperro


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 18.76it/s]


Ingesting https://en.wikivoyage.org/wiki/PorthlevenEast_Sussex


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.58it/s]


Ingesting https://en.wikivoyage.org/wiki/Brighton


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 10.92it/s]


Ingesting https://en.wikivoyage.org/wiki/Battle


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.61it/s]


Ingesting https://en.wikivoyage.org/wiki/Hastings_(England)


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.64it/s]


Ingesting https://en.wikivoyage.org/wiki/Rye_(England)


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.62it/s]


Ingesting https://en.wikivoyage.org/wiki/Seaford


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.98it/s]


Ingesting https://en.wikivoyage.org/wiki/Ashdown_Forest


In [71]:
# COMMENT: the code above is similar to when ingesting child chunks, but it is slower because of the summarization step
# which invokes the LLM.
# The processing can be speeded up by parallelizing the outer for loop on the destination urls.

### Performing a search on granular information

In [72]:
retrieved_docs = multi_vector_retriever.invoke("Cornwall travel")

In [73]:
len(retrieved_docs)

4

In [74]:
retrieved_docs

[Document(metadata={'source': 'https://en.wikivoyage.org/wiki/North_Cornwall', 'title': 'North Cornwall – Travel guide at Wikivoyage', 'language': 'en'}, page_content="### By car\n\n[edit]\n\nCornwall can be accessed by road via the A30 which runs from the end of the M5\nat Exeter, all the way through the heart of Devon and Cornwall down to Land's\nEnd. It is a grade-separated expressway as far as Carland Cross near Truro\n(the expressway is expected to be open as far as Camborne (between Redruth and\nHayle) by March 2024). You can also get to Cornwall via the A38, crossing the\nRiver Tamar at Plymouth via the Tamar Bridge, which levies a toll on eastbound\nvehicles. On summer Saturdays and during bank holiday weekends roads to\nCornwall are usually busy.\n\n### By plane\n\n[edit]\n\n50.440833-4.9952781 Cornwall Airport (**NQY** IATA) in Newquay is the main\nairport for the county, with year-round flights only from Aberdeen, Alicante,\nDublin, London Gatwick, and Manchester. During the

### Comparing with direct semantic search on summaries

In [75]:
summary_docs_only =  summaries_collection.similarity_search(
    "Cornwall Travel")

In [76]:
len(summary_docs_only)

4

In [77]:
summary_docs_only

[Document(id='9553aa2d-456e-4a38-8f95-28f072f13992', metadata={'doc_id': '14554480-abf6-4b35-ae7d-8580a27ecc9a'}, page_content="Cornwall offers a diverse array of attractions spanning natural beauty, legends, gardens, historic sites, arts, and heritage, including both independent sites and National Trust properties.\n\n- Natural and legendary sights: King Arthur's Hall and Brown Willy on Bodmin Moor; Dozmary Pool and tales of the Beast of the Moor.\n- Gardens and nature: The Eden Project’s two glass domes; the Lost Gardens of Heligan near Mevagissey.\n- Castles, archaeology, and coastal culture: Tintagel Castle (Arthurian legends and early medieval finds); Minack Theatre (clifftop outdoor theatre and museum); St Michael's Mount.\n- Arts and museums: Tate St Ives (modern art); National Maritime Museum, Falmouth (small-boat collection and other exhibits).\n- Mining and industrial heritage: Historic tin/copper mine sites such as Geevor Tin Mine, Poldark Mine, King Edward Mine, Crown Hill 

In [78]:
# COMMENT: a direct search on summaries retrieves denser information, but it is missing out on useful details. 
# However, you might consider using the summaries directly if after testing they prove adequate.

## Embedding hypothetical questions with MultiVectorRetriever

In [79]:
from langchain_classic.retrievers.multi_vector import MultiVectorRetriever
from langchain_classic.storage import InMemoryByteStore
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import uuid
from typing import List
from pydantic import BaseModel, Field

### Setting up the Multi vector retriever (same as when embedding summaries)

In [80]:
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=3000) #A

hypothetical_questions_collection = Chroma( #B
    collection_name="uk_hypothetical_questions",
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
)

hypothetical_questions_collection.reset_collection() #C

doc_byte_store = InMemoryByteStore() #D
doc_key = "doc_id"

multi_vector_retriever = MultiVectorRetriever( #E
    vectorstore=hypothetical_questions_collection,
    byte_store=doc_byte_store
)
#A Splitter to generate parent coarse chunks from original documents (parsed from web pages)
#B Vector store collection to host child granular chunks
#C Make sure the collection is empty
#D Document store to host parent coarse chunks
#E Retriever to link parent coarse chunks to child granular chunks

### Setting up the chain to generate hypothetical questions

In [81]:
class HypotheticalQuestions(BaseModel):
    """A list of hypotetical questions for given text."""

    questions: List[str] = Field(..., description="List of hypothetical questions for given text")

In [82]:
llm_with_structured_output = ChatOpenAI(
    model="gpt-5-nano", 
    openai_api_key=OPENAI_API_KEY).with_structured_output(
        HypotheticalQuestions
)

In [83]:
hypothetical_questions_chain = (
    {"document_text": lambda x: x.page_content} #A
    | ChatPromptTemplate.from_template( #B
        "Generate a list of exactly 4 hypothetical questions that the below text could be used to answer:\n\n{document_text}"
    )
    | llm_with_structured_output #C
    | (lambda x: x.questions) #D
)

#A Grab the text content from the document
#B Instantiate a prompt asking to generate 4 hypothetical questions on the provided text
#C Invoke the LLM configured to return an object containing the questions as a typed list of strings
#D Grab the list of questions from the response

### Ingesting the coarse chunks and related hypothetical questions into doc and vector store

In [84]:
for destination_url in uk_destination_urls:
    html_loader = AsyncHtmlLoader(destination_url) #A
    html_docs =  html_loader.load() #B
    text_docs = html2text_transformer.transform_documents(
        html_docs) #C

    coarse_chunks = parent_splitter.split_documents(
        text_docs) #D

    coarse_chunks_ids = [str(uuid.uuid4()) for _ in coarse_chunks]
    all_hypothetical_questions = []
    for i, coarse_chunk in enumerate(
        coarse_chunks): #E
        
        coarse_chunk_id = coarse_chunks_ids[i]
            
        hypothetical_questions = hypothetical_questions_chain.invoke(
            coarse_chunk) #F
        hypothetical_questions_docs = [Document(
            page_content=question, metadata={doc_key: coarse_chunk_id})
                    for question 
                    in hypothetical_questions] #G

        all_hypothetical_questions.extend(hypothetical_questions_docs)

    print(f'Ingesting {destination_url}')
    multi_vector_retriever.vectorstore.add_documents(
        all_hypothetical_questions) #H
    multi_vector_retriever.docstore.mset(
        list(zip(coarse_chunks_ids, coarse_chunks))) #I

#A Loader for one destination
#B Documents of one destination 
#C transform HTML docs into clean text docs
#D Split the destination content into coarse chunks
#E Iterate over the coarse chunks
#F Generate a list of hypothetical questions for the coarse chunk thorugh the question generation chain
#G Link each hypothetical question to its related coarse chunk
#H Ingest the hypothetical questions into the vector store
#I Ingest the coarse chunks into the document store

Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.29it/s]


Ingesting https://en.wikivoyage.org/wiki/Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.01it/s]


Ingesting https://en.wikivoyage.org/wiki/North_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  2.23it/s]


Ingesting https://en.wikivoyage.org/wiki/South_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.53it/s]


Ingesting https://en.wikivoyage.org/wiki/West_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 10.01it/s]


Ingesting https://en.wikivoyage.org/wiki/Tintagel


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.19it/s]


Ingesting https://en.wikivoyage.org/wiki/Bodmin


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 17.73it/s]


Ingesting https://en.wikivoyage.org/wiki/Wadebridge


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.17it/s]


Ingesting https://en.wikivoyage.org/wiki/Penzance


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.49it/s]


Ingesting https://en.wikivoyage.org/wiki/Newquay


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.17it/s]


Ingesting https://en.wikivoyage.org/wiki/St_Ives


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.00it/s]


Ingesting https://en.wikivoyage.org/wiki/Port_Isaac


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.76it/s]


Ingesting https://en.wikivoyage.org/wiki/Looe


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.56it/s]


Ingesting https://en.wikivoyage.org/wiki/Polperro


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 21.86it/s]


Ingesting https://en.wikivoyage.org/wiki/PorthlevenEast_Sussex


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.26it/s]


Ingesting https://en.wikivoyage.org/wiki/Brighton


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.80it/s]


Ingesting https://en.wikivoyage.org/wiki/Battle


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.05it/s]


Ingesting https://en.wikivoyage.org/wiki/Hastings_(England)


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.98it/s]


Ingesting https://en.wikivoyage.org/wiki/Rye_(England)


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.02it/s]


Ingesting https://en.wikivoyage.org/wiki/Seaford


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.97it/s]


Ingesting https://en.wikivoyage.org/wiki/Ashdown_Forest


### Performing a search on granular information

In [85]:
retrieved_docs = multi_vector_retriever.invoke(
    "How can you go to Brighton from London?")

In [86]:
len(retrieved_docs)

4

In [87]:
retrieved_docs

[Document(metadata={'source': 'https://en.wikivoyage.org/wiki/Brighton', 'title': 'Brighton – Travel guide at Wikivoyage', 'language': 'en'}, page_content='Brighton  \n---  \nClimate chart (explanation)  \n| J| F| M| A| M| J| J| A| S| O| N| D  \n---|---|---|---|---|---|---|---|---|---|---|---  \n88 8 3 |  60 8 3 |  51 9 4 |  58 12 6 |  56 16 9 |  50 18 12 |  54 20 14 |  62 21 14 |  67 18 12 |  105 15 9 |  103 11 6 |  97 9 4  \nAverage max. and min. temperatures in °C  \nPrecipitation+Snow totals in mm  \nSource: Wikipedia. Visit the Met Office for a five day forecast.  \n| Imperial conversion  \n---  \nJ| F| M| A| M| J| J| A| S| O| N| D  \n3.5 46 37 |  2.4 46 37 |  2 48 39 |  2.3 54 43 |  2.2 61 48 |  2 64 54 |  2.1 68 57 |  2.4 70 57 |  2.6 64 54 |  4.1 59 48 |  4.1 52 43 |  3.8 48 39  \nAverage max. and min. temperatures in °F  \nPrecipitation+Snow totals in inches  \n  \nThe city is close to London, and is increasingly popular with media and music\ntypes who don\'t want to live in t

### Inspecting possible questions matching our question through semantic search

In [88]:
hypothetical_question_docs_only = hypothetical_questions_collection.similarity_search(
    "How can you go to Brighton from London?")

In [89]:
len(hypothetical_question_docs_only)

4

In [90]:
hypothetical_question_docs_only

[Document(id='399bb54e-88e8-4eb1-b572-d363e8f23006', metadata={'doc_id': 'a1f51d6c-2d27-4a6c-b1a0-4f9918dc1545'}, page_content='How can you travel to Brighton by train from London, and what are the two main railway stations in the city?'),
 Document(id='65d2f229-e7e5-4516-ac09-ea93fd3feb1d', metadata={'doc_id': '53e1031c-3d92-4e7e-9398-1770b215dad1'}, page_content='What transportation options are described for getting to Brighton and for getting around the city?'),
 Document(id='fb27db61-b09a-4280-9473-cfe82759e708', metadata={'doc_id': 'f4b555cf-afe0-43e2-b10a-f59fdab3e2e2'}, page_content='What is the fastest way to travel from Gatwick to Brighton, and how long does it take by train according to the text?'),
 Document(id='320ca9aa-f328-4c06-9d18-00ef76577884', metadata={'doc_id': '79b9f77f-21ab-4ab4-bbf4-4a04d5576cbc'}, page_content='If I want to travel around Brighton all day on buses with one fare, what ticket would I buy and how much would it cost?')]

# Granular chunk expansion with MultiVectorRetriever

In [91]:
from langchain_classic.retrievers.multi_vector import MultiVectorRetriever
from langchain_classic.storage import InMemoryByteStore
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import uuid

### Setting up the Multi vector retriever

In [92]:
granular_chunk_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500) #A

granular_chunks_collection = Chroma( #B
    collection_name="uk_granular_chunks",
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
)

granular_chunks_collection.reset_collection() #C

expanded_chunk_store = InMemoryByteStore() #D
doc_key = "doc_id"

multi_vector_retriever = MultiVectorRetriever( #E
    vectorstore=granular_chunks_collection,
    byte_store=expanded_chunk_store
)
#A Splitter to generate granular chunks from original documents (parsed from web pages)
#B Vector store collection to host child granular chunks
#C Make sure the collection is empty
#D Document store to host expanded chunks
#E Retriever to link parent coarse chunks to child granular chunks

### Ingesting granular and expanded chunks into doc and vector store

In [93]:
for destination_url in uk_destination_urls:
    html_loader = AsyncHtmlLoader(destination_url) #A
    html_docs =  html_loader.load() #B
    text_docs = html2text_transformer.transform_documents(
        html_docs) #C

    granular_chunks = granular_chunk_splitter.split_documents(
        text_docs) #D

    expanded_chunk_store_items = []
    for i, granular_chunk in enumerate(
        granular_chunks): #E

        this_chunk_num = i #F
        previous_chunk_num = i-1 #F
        next_chunk_num = i+1 #F
        
        if i==0: #F
            previous_chunk_num = None
        elif i==(len(granular_chunks)-1): #F
            next_chunk_num = None

        expanded_chunk_text = "" #G
        if previous_chunk_num: #G
            expanded_chunk_text += granular_chunks[
                previous_chunk_num].page_content
            expanded_chunk_text += "\n"

        expanded_chunk_text += granular_chunks[
            this_chunk_num].page_content #G
        expanded_chunk_text += "\n"

        if next_chunk_num: #G
            expanded_chunk_text += granular_chunks[
                next_chunk_num].page_content
            expanded_chunk_text += "\n"

        expanded_chunk_id = str(uuid.uuid4()) #H
        expanded_chunk_doc = Document(
            page_content=expanded_chunk_text) #I

        expanded_chunk_store_item = (expanded_chunk_id, 
                                     expanded_chunk_doc)
        expanded_chunk_store_items.append(
            expanded_chunk_store_item)

        granular_chunk.metadata[
            doc_key] = expanded_chunk_id #J
            
    print(f'Ingesting {destination_url}')
    multi_vector_retriever.vectorstore.add_documents(
        granular_chunks) #K
    multi_vector_retriever.docstore.mset(
        expanded_chunk_store_items) #L

#A Loader for one destination
#B Documents of one destination 
#C transform HTML docs into clean text docs
#D Split the destination content into granular chunks
#E Iterate over the granular chunks
#F determine the index of the current chunk and its previous and next chunks
#G Assemble the text of the expanded chunk by including the previous and next chunk
#H Generate the ID of the expanded chunk
#I Create the expanded chunk document
#J Link each granular chunk to its related expanded chunk
#K Ingest the granular chunks into the vector store
#L Ingest the expanded chunks into the document store

Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.39it/s]


Ingesting https://en.wikivoyage.org/wiki/Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.60it/s]


Ingesting https://en.wikivoyage.org/wiki/North_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.08it/s]


Ingesting https://en.wikivoyage.org/wiki/South_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 10.61it/s]


Ingesting https://en.wikivoyage.org/wiki/West_Cornwall


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 14.14it/s]


Ingesting https://en.wikivoyage.org/wiki/Tintagel


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.76it/s]


Ingesting https://en.wikivoyage.org/wiki/Bodmin


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 10.19it/s]


Ingesting https://en.wikivoyage.org/wiki/Wadebridge


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.62it/s]


Ingesting https://en.wikivoyage.org/wiki/Penzance


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 15.33it/s]


Ingesting https://en.wikivoyage.org/wiki/Newquay


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.03it/s]


Ingesting https://en.wikivoyage.org/wiki/St_Ives


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.97it/s]


Ingesting https://en.wikivoyage.org/wiki/Port_Isaac


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.84it/s]


Ingesting https://en.wikivoyage.org/wiki/Looe


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.71it/s]


Ingesting https://en.wikivoyage.org/wiki/Polperro


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.29it/s]


Ingesting https://en.wikivoyage.org/wiki/PorthlevenEast_Sussex


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.78it/s]


Ingesting https://en.wikivoyage.org/wiki/Brighton


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 12.18it/s]


Ingesting https://en.wikivoyage.org/wiki/Battle


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00,  9.89it/s]


Ingesting https://en.wikivoyage.org/wiki/Hastings_(England)


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 11.03it/s]


Ingesting https://en.wikivoyage.org/wiki/Rye_(England)


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.77it/s]


Ingesting https://en.wikivoyage.org/wiki/Seaford


Fetching pages: 100%|####################################################################| 1/1 [00:00<00:00, 13.99it/s]


Ingesting https://en.wikivoyage.org/wiki/Ashdown_Forest


### Performing a search on granular information

In [94]:
retrieved_docs = multi_vector_retriever.invoke("Cornwall Ranger")

In [95]:
len(retrieved_docs)

4

In [96]:
retrieved_docs[0]

Document(metadata={}, page_content="Buses only serve designated stops when in towns; otherwise, you can flag them\ndown anywhere that's safe for them to stop.\n\n### By train\n\n[edit]\n\n**CrossCountry Trains** and **Great Western Railway** operate regular train\nservices between the main centres of population, the latter company also\nserving a number of other towns on branch lines. For train times and fares\nvisit National Rail Enquiries.\nThe **Cornwall Ranger** ticket allows unlimited train travel in Cornwall and\nPlymouth for a calendar day. As of 2023, this costs £14 for adults and £7 for\nunder-16s.\n\n## See\n\n[edit]\n\nThe **Eden Project** , near St Austell, a fabulous collection of flora from\nall over the planet housed in two space age transparent domes, and a massive\nzip line.\n## See\n\n[edit]\n\nThe **Eden Project** , near St Austell, a fabulous collection of flora from\nall over the planet housed in two space age transparent domes, and a massive\nzip line.\n\nThe **Lo

### Comparing with direct semantic search on granular chunks

In [97]:
child_docs_only =  child_chunks_collection.similarity_search("Cornwall Ranger")

In [98]:
len(child_docs_only)

4

In [99]:
child_docs_only[0]

Document(id='a18b9a25-f88f-433b-8819-2d80d8a39fcd', metadata={'language': 'en', 'doc_id': 'f69a9d0a-6153-4c74-9f19-83942aeb3876', 'title': 'South Cornwall – Travel guide at Wikivoyage', 'source': 'https://en.wikivoyage.org/wiki/South_Cornwall'}, page_content='The **Cornwall Ranger** ticket allows unlimited train travel in Cornwall and\nPlymouth for a calendar day. As of 2023, this costs £14 for adults and £7 for\nunder-16s.\n\n## See\n\n[edit]\n\nThe **Eden Project** , near St Austell, a fabulous collection of flora from\nall over the planet housed in two space age transparent domes, and a massive\nzip line.')

In [100]:
# COMMENT: the expanded chunk has more useful context