In [113]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import HTMLNodeParser
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
from typing import List

embedding = GeminiEmbedding()

# Root folder of the cleaned HTML corpus. Kept in one named constant so the
# machine-specific location only has to be changed in a single place.
HTML_DATA_DIR = "/home/dai/35/rag/data/clean_html"

# Read every file under the corpus folder, descending into sub-folders.
reader = SimpleDirectoryReader(input_dir=HTML_DATA_DIR, recursive=True)
documents = reader.load_data(show_progress=True)

# Turn each document into nodes, one per matching HTML tag.
node_parser = HTMLNodeParser(tags=["p", "li", "b", "i", "u", "section", "text"])
html_nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)

nodes = []
for node in html_nodes:
    # Remove nodes with no content. Whitespace-only nodes count as empty too:
    # a plain length check would let a node such as "\n\t" through, and the
    # normalisation below would then turn it into a useless " " node.
    if not node.get_content().strip():
        continue
    # Flatten newlines and tabs to spaces so each node is a single line of text.
    node.text = node.text.replace("\n", " ").replace("\t", " ")
    nodes.append(node)

Loading files:   0%|          | 0/594 [00:00<?, ?file/s]

Loading files: 100%|██████████| 594/594 [00:00<00:00, 4641.27file/s]


Parsing nodes:   0%|          | 0/594 [00:00<?, ?it/s]

In [114]:
# Number of HTML nodes left after the empty-content filter in the previous cell.
len(nodes)

3386

In [115]:
# Sanity check: list the metadata of every HTML node whose text is longer than
# 10,000 characters, then report how many such nodes there are.
oversized_nodes = [candidate for candidate in nodes if len(candidate.text) > 10000]
for node in oversized_nodes:
    print(node.metadata)
count = len(oversized_nodes)
print(count)

{'tag': 'p', 'file_path': '/home/dai/35/rag/data/clean_html/Aayurveda_Samhitaa/01_Charaka_Samhitaa/DF_Introduction_Charaka_Samhita.html', 'file_name': 'DF_Introduction_Charaka_Samhita.html', 'file_type': 'text/html', 'file_size': 13642, 'creation_date': '2024-02-14', 'last_modified_date': '2024-02-14', 'last_accessed_date': '2024-02-14'}
{'tag': 'p', 'file_path': '/home/dai/35/rag/data/clean_html/Articles/Chikitsaa/Basics_of_Chikitsaa.htm', 'file_name': 'Basics_of_Chikitsaa.htm', 'file_type': 'text/html', 'file_size': 24441, 'creation_date': '2024-02-14', 'last_modified_date': '2024-02-14', 'last_accessed_date': '2024-02-14'}
{'tag': 'p', 'file_path': '/home/dai/35/rag/data/clean_html/Articles/Chikitsaa/Kushtha.htm', 'file_name': 'Kushtha.htm', 'file_type': 'text/html', 'file_size': 44579, 'creation_date': '2024-02-14', 'last_modified_date': '2024-02-14', 'last_accessed_date': '2024-02-14'}
{'tag': 'p', 'file_path': '/home/dai/35/rag/data/clean_html/Articles/Chikitsaa/Latest_Trends_in_

In [116]:
from llama_index.core.node_parser import SentenceSplitter

# Break the HTML nodes into smaller chunks (chunk_size=512) with no overlap
# between consecutive chunks.
splitter = SentenceSplitter(chunk_size=512, chunk_overlap=0)
sentence_nodes = splitter.get_nodes_from_documents(
    nodes,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/3386 [00:00<?, ?it/s]

In [117]:
# Number of chunks produced by the SentenceSplitter from the HTML nodes.
len(sentence_nodes)

4419

In [99]:
# Preview the text of chunks 20-29. A slice is used instead of indexing with
# range(20, 30) so that a corpus with fewer than 30 chunks prints whatever is
# available rather than raising IndexError.
for chunk in sentence_nodes[20:30]:
    print(chunk.text)

Charaka's quadruple. Fully realizing the importance and contribution of the physician, the attendant (nurse), the patient and the drug if endowed with proper qualities, leads to timely alleviation of disorders. Physician: Excellence in theoretical knowledge, experience, dexterity and cleanliness  these are the qualities of a physician. Attendant (nurse):  Knowledge of attendance, dexterity, loyalty and cleanliness -  these are the four qualities of an attendant. Patient:  Memory, obedience. Fearlessness and providing all information about time disorder - these are the qualities of a patient Drug:  Abundance, effectiveness, various pharmaceutical forms and normal composition - these are the four qualities of drugs. Employment of all these excellent four - physician, attendant, patient and drug in case of disorders with the object of (re-establishing) their equilibrium, is said to be therapeutics. It also explains the logic and philosophy on which this system of medicine is based. The co

In [101]:
# Verify the split: print the metadata of any chunk still longer than 10,000
# characters and report how many there are.
oversized_chunks = [chunk for chunk in sentence_nodes if len(chunk.text) > 10000]
for node in oversized_chunks:
    print(node.metadata)
count = len(oversized_chunks)
print(count)

0


In [76]:
# import nltk
# tokenizer = nltk.tokenize.PunktSentenceTokenizer()

# # get the spans and then return the sentences
# # using the start index of each span
# # instead of using end, use the start of the next span if available
# def split(text: str) -> List[str]:
#     spans = list(tokenizer.span_tokenize(text))
#     sentences = []
#     for i, span in enumerate(spans):
#         start = span[0]
#         if i < len(spans) - 1:
#             end = spans[i + 1][0]
#         else:
#             end = len(text)
#         sentences.append(text[start:end])

#     return sentences

In [118]:
def split(text: str) -> List[str]:
    """Pass-through "sentence splitter": wrap ``text`` in a one-item list.

    The input is not tokenised or divided in any way; the caller receives a
    list whose only element is the original string, unchanged.

    Args:
        text: The text to "split".

    Returns:
        A list containing exactly one element, ``text`` itself.
    """
    return [text]

In [119]:
from llama_index.core.node_parser import SentenceWindowNodeParser

# Build window nodes on top of the chunks, using `split` as the sentence
# splitter. `split` returns its input as a single item, so every chunk is
# handed to the parser as one "sentence".
# NOTE(review): if the parser builds windows only from sentences of the same
# input node, window_size=3 has no neighbours to draw on here -- confirm.
window_parser_options = {
    "sentence_splitter": split,
    # number of sentences captured on either side of the current one
    "window_size": 3,
    # metadata key under which the surrounding window is stored
    "window_metadata_key": "window",
    # metadata key under which the original sentence is stored
    "original_text_metadata_key": "original_sentence",
}
window_parser = SentenceWindowNodeParser(**window_parser_options)
window_nodes = window_parser.get_nodes_from_documents(
    sentence_nodes,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/4419 [00:00<?, ?it/s]

In [120]:
# Number of nodes produced by the window parser from the sentence chunks.
len(window_nodes)

4419

In [111]:
# Dump the "window" metadata of every window node, one blank line after each.
for window_node in window_nodes:
    print(window_node.metadata["window"], end="\n\n")

The project is funded by Ministry of Communications & Information Technology. Following Institutions and Ayurveda Experts have contributed to AyuSoft in their capacities as advisors, authors, owners of photographs and clips, testers etc.

Institutes,"Ayurved  Research Centre K.E.M. Hospital, Mumbai." Institutes,"B.S.D.T.  College of Ayurved, Wagholi Pune." Institutes,"College of  Ayurved Bharati Vidyapeeth Deemed University (B.V.D.U), Pune." Institutes,"Centre for  Rheumatic Diseases Pune (CRD), Pune." Institutes,Dept. of  Ayurved Sassoon General Hospital Pune. Institutes,"Ayurved  Dept. Dinanath Mangeshkar Hospital,Pune." Institutes,"Foundation  for Revitalisation of Local Health Traditions (FRLHT), Bangalore." Institutes,"Sanjeevan  Hospital, Pune." Institutes,"Sumatibhai  Shaha Ayurved College, Hadapsar, Pune." Institutes,"Swami Prakashanand Ayurveda Research Centre  (SPARC), Mumbai." Institutes,"Seth  Tarachand Ramnath Hospital, Pune." Institutes,"T.N.Medical  College and Nair Hosp

In [104]:
# Number of window nodes currently held in `window_nodes`.
len(window_nodes)

4419

In [105]:
# Print the metadata of every window node whose "window" text is longer than
# 10,000 characters, followed by the number of such nodes.
oversized_windows = [
    candidate for candidate in window_nodes
    if len(candidate.metadata["window"]) > 10000
]
for node in oversized_windows:
    print(node.metadata)
count = len(oversized_windows)
print(count)

0


In [None]:
# Print the "window" metadata of each window node, one per line.
for window_node in window_nodes:
    window_text = window_node.metadata["window"]
    print(window_text)

In [48]:
# Count the window nodes whose "window" text is longer than 10,000 characters.
count = sum(
    1 for window_node in window_nodes
    if len(window_node.metadata["window"]) > 10000
)
print(count)

4


## Further implementation guide
```python
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

documents = SimpleDirectoryReader("./data").load_data()

pipeline = IngestionPipeline(transformations=[TokenTextSplitter(), ...])

nodes = pipeline.run(documents=documents)
```

```python
text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=10)

# global
from llama_index.core import Settings

Settings.text_splitter = text_splitter

# per-index
index = VectorStoreIndex.from_documents(
    documents, transformations=[text_splitter]
)
```

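To use the window of sentences instead of just one sentence to generate embeddings while building your index, you can use the MetadataReplacementPostProcessor during the index building process. This postprocessor replaces the sentence in each node with its surrounding context (the "window" of sentences) before passing the node to the large language model (LLM). NOTE(review): in LlamaIndex, node postprocessors are applied to retrieved nodes at query time, so this replacement affects the text sent to the LLM, not the text that is embedded when the index is built; verify against the current documentation before relying on this answer.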

Here is an example of how to do this:

from llama_index import VectorStoreIndex
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# Create the postprocessor
postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")

# Build the index with the postprocessor
sentence_index = VectorStoreIndex(nodes, service_context=ctx, node_postprocessors=[postprocessor])
In this example, the MetadataReplacementPostProcessor is added to the VectorStoreIndex during its creation. This means that when the index is built, each node's sentence will be replaced with its surrounding context before the embeddings are generated. This allows the LLM to consider the context of the sentence when generating embeddings.

Was this response helpful?

Yes
No
Verified Sources:

1. Metadata Replacement Demo - Node Po...
2. Indexing - Indexing
3. Indexing - Indexing
4. Embeddings - Models
how to use window from SentenceWindowRetrival and pass it to embedding model instead of just one sentence to build the index

To use the window of sentences from SentenceWindowNodeParser and pass it to the embedding model instead of just one sentence to build the index, you can use the MetadataReplacementPostProcessor during the index building process. This postprocessor replaces the sentence in each node with its surrounding context (the "window" of sentences) before passing the node to the large language model (LLM).

Here is an example of how to do this:

First, you need to extract the nodes using the SentenceWindowNodeParser:

# Assuming you have already imported the necessary libraries and loaded your documents

# Create the SentenceWindowNodeParser
node_parser = SentenceWindowNodeParser.from_defaults(
    # how many sentences on either side to capture
    window_size=3,
    # the metadata key that holds the window of surrounding sentences
    window_metadata_key="window",
    # the metadata key that holds the original sentence
    original_text_metadata_key="original_sentence",
)

# Extract the nodes
nodes = node_parser.get_nodes_from_documents(documents)
Then, you can build the index with the MetadataReplacementPostProcessor:

from llama_index import VectorStoreIndex
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# Create the postprocessor
postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")

# Build the index with the postprocessor
sentence_index = VectorStoreIndex(nodes, service_context=ctx, node_postprocessors=[postprocessor])
In this example, the MetadataReplacementPostProcessor is added to the VectorStoreIndex during its creation. This means that when the index is built, each node's sentence will be replaced with its surrounding context before the embeddings are generated. This allows the LLM to consider the context of the sentence when generating embeddings.



In [None]:
#TODO: Apply sentencesplitter on html nodes with 0 overlap then apply windownodeparser \
#with custom sentence splitter which should in theory return multiple sentences instead of individual sentences