In [326]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import HTMLNodeParser
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
from typing import List

embedding = GeminiEmbedding()

reader = SimpleDirectoryReader(input_dir="/home/dai/35/rag/data/clean_html",
                                  recursive=True)

documents = reader.load_data(show_progress=True)
node_parser = HTMLNodeParser(tags = ["p","li", "b", "i", "u", "section", "text"])
nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)
# remove nodes with no content
nodes = [node for node in nodes if len(node.get_content()) > 0]
for node in nodes:
    #remove all the \n and \t
    node.text = node.text.replace("\n", " ").replace("\t", " ")

Loading files:  69%|██████▉   | 412/594 [00:00<00:00, 4114.30file/s]

Loading files: 100%|██████████| 594/594 [00:00<00:00, 4451.59file/s]


Parsing nodes:   0%|          | 0/594 [00:00<?, ?it/s]

In [327]:
len(nodes)

3386

In [301]:
count = 0
for node in nodes:
    if len(node.text) > 10000:
        count += 1
        print(node.metadata)
print(count)

{'tag': 'p', 'file_path': '/home/dai/35/rag/data/clean_html/Aayurveda_Samhitaa/01_Charaka_Samhitaa/DF_Introduction_Charaka_Samhita.html', 'file_name': 'DF_Introduction_Charaka_Samhita.html', 'file_type': 'text/html', 'file_size': 13642, 'creation_date': '2024-02-14', 'last_modified_date': '2024-02-14', 'last_accessed_date': '2024-02-14'}
{'tag': 'p', 'file_path': '/home/dai/35/rag/data/clean_html/Articles/Chikitsaa/Basics_of_Chikitsaa.htm', 'file_name': 'Basics_of_Chikitsaa.htm', 'file_type': 'text/html', 'file_size': 24441, 'creation_date': '2024-02-14', 'last_modified_date': '2024-02-14', 'last_accessed_date': '2024-02-14'}
{'tag': 'p', 'file_path': '/home/dai/35/rag/data/clean_html/Articles/Chikitsaa/Kushtha.htm', 'file_name': 'Kushtha.htm', 'file_type': 'text/html', 'file_size': 44579, 'creation_date': '2024-02-14', 'last_modified_date': '2024-02-14', 'last_accessed_date': '2024-02-14'}
{'tag': 'p', 'file_path': '/home/dai/35/rag/data/clean_html/Articles/Chikitsaa/Latest_Trends_in_

In [302]:
# from llama_index.core.node_parser import SentenceSplitter

# splitter = SentenceSplitter(
#     chunk_size=2048,
#     chunk_overlap=400,
# )
# sentence_nodes = splitter.get_nodes_from_documents(nodes, show_progress=True)

In [303]:
# count = 0
# for node in sentence_nodes:
#     if len(node.text) > 10000:
#         count += 1
#         print(node.metadata)
# print(count)

In [304]:
from llama_index.core.node_parser import SentenceSplitter

_splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=0,
)
_sentence_nodes = _splitter.get_nodes_from_documents(nodes, show_progress=True)

Parsing nodes:   0%|          | 0/3386 [00:00<?, ?it/s]

In [305]:
len(_sentence_nodes)

3712

In [306]:
for i in range(len(_sentence_nodes)-2):
    _sentence_nodes[i].text = _sentence_nodes[i+1].text+" "+_sentence_nodes[i+2].text
_sentence_nodes = _sentence_nodes[:-2]

In [307]:
count = 0
for node in _sentence_nodes:
    if len(node.get_content()) > 10000:
        count += 1
        print(node.metadata)
print(count)

0


In [321]:
for i in range(100,120):
    print(_sentence_nodes[i].text)

Pathya Aahaara: Diet should be administered having following properties. Gun`a: Guru, Snigdha Sheeta, Manda, Saandra, S`heeta, Sthira, Pichchhla Rasa: Madhura Rasa Karma: Kaphakara, Medura, Pusht`eekara Dravya: Paayasa, Vasaa, Majjaa, Though indigestion is caused by this dietary regime, in this stage, diet should be given continuously to avoid depletion of Dhaatu and to avoid complication and death. Diet like fish, which are having Slakshn`a sthira gun`a should be used. Other diet items Dugdha and Ghrita added with Madhuchchhisht`a should be used for thirst. Pathya Vihaara: Divaasvaapa should be advised as Pathyaa Vihaara. Precautions: The patient should not be kept without food for long. The Virechana should be administered cautiously according to the condition of the patient All measures should be taken to protect the Agni and to maintain in its normal functions. Search Anveshaka: Search Kalpa, Dravya, Ahaara, Chikitsaa Karma, Vihaara Effect on Agni - Atyagni Naas`ha, Agni Naas`ha, I

In [308]:
from llama_index.core import VectorStoreIndex
embedding_model = GeminiEmbedding()
index = VectorStoreIndex(nodes = _sentence_nodes, embed_model = embedding_model, show_progress=True)

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1662 [00:00<?, ?it/s]

In [318]:
index.storage_context.persist(persist_dir="indexes")

In [311]:
from llama_index.llms.gemini import Gemini
from llama_index.core import Settings
Settings.llm = Gemini(model_name='models/gemini-pro',
                      temperature=0.7)

query_engine = index.as_query_engine(llm=Settings.llm)

In [322]:
response = query_engine.query("What are the reasons of sweating?")
print(response)

The provided context does not contain any information about the reasons of sweating. Therefore, I am unable to answer this query.


In [325]:
for node in response.source_nodes:
    print(node.id_)
    print("-"*100)

5c1e8869-929f-4187-bac2-68e6d42ac523
----------------------------------------------------------------------------------------------------
6201b78b-b911-4d39-96c6-fd7d09560fc4
----------------------------------------------------------------------------------------------------


In [309]:
# import nltk
# tokenizer = nltk.tokenize.PunktSentenceTokenizer()

# # get the spans and then return the sentences
# # using the start index of each span
# # instead of using end, use the start of the next span if available
# def split(text: str) -> List[str]:
#     spans = list(tokenizer.span_tokenize(text))
#     sentences = []
#     for i, span in enumerate(spans):
#         start = span[0]
#         if i < len(spans) - 1:
#             end = spans[i + 1][0]
#         else:
#             end = len(text)
#         sentences.append(text[start:end])

#     return sentences

## Futher implementation guide
```python
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

documents = SimpleDirectoryReader("./data").load_data()

pipeline = IngestionPipeline(transformations=[TokenTextSplitter(), ...])

nodes = pipeline.run(documents=documents)
```

```python
text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=10)

# global
from llama_index.core import Settings

Settings.text_splitter = text_splitter

# per-index
index = VectorStoreIndex.from_documents(
    documents, transformations=[text_splitter]
)
```

To use the window of sentences instead of just one sentence to generate embeddings while building your index, you can use the MetadataReplacementPostProcessor during the index building process. This postprocessor replaces the sentence in each node with its surrounding context (the "window" of sentences) before passing the node to the Language Learning Model (LLM).

Here is an example of how to do this:

from llama_index import VectorStoreIndex
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# Create the postprocessor
postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")

# Build the index with the postprocessor
sentence_index = VectorStoreIndex(nodes, service_context=ctx, node_postprocessors=[postprocessor])
In this example, the MetadataReplacementPostProcessor is added to the VectorStoreIndex during its creation. This means that when the index is built, each node's sentence will be replaced with its surrounding context before the embeddings are generated. This allows the LLM to consider the context of the sentence when generating embeddings.

Was this response helpful?

Yes
No
Verified Sources:

1. Metadata Replacement Demo - Node Po...
2. Indexing - Indexing
3. Indexing - Indexing
4. Embeddings - Models
how to use window from SentenceWindowRetrival and pass it to embedding model instead of just one sentence to build the index

To use the window of sentences from SentenceWindowNodeParser and pass it to the embedding model instead of just one sentence to build the index, you can use the MetadataReplacementPostProcessor during the index building process. This postprocessor replaces the sentence in each node with its surrounding context (the "window" of sentences) before passing the node to the Language Learning Model (LLM).

Here is an example of how to do this:

First, you need to extract the nodes using the SentenceWindowNodeParser:

# Assuming you have already imported the necessary libraries and loaded your documents

# Create the SentenceWindowNodeParser
node_parser = SentenceWindowNodeParser.from_defaults(
    # how many sentences on either side to capture
    window_size=3,
    # the metadata key that holds the window of surrounding sentences
    window_metadata_key="window",
    # the metadata key that holds the original sentence
    original_text_metadata_key="original_sentence",
)

# Extract the nodes
nodes = node_parser.get_nodes_from_documents(documents)
Then, you can build the index with the MetadataReplacementPostProcessor:

from llama_index import VectorStoreIndex
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# Create the postprocessor
postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")

# Build the index with the postprocessor
sentence_index = VectorStoreIndex(nodes, service_context=ctx, node_postprocessors=[postprocessor])
In this example, the MetadataReplacementPostProcessor is added to the VectorStoreIndex during its creation. This means that when the index is built, each node's sentence will be replaced with its surrounding context before the embeddings are generated. This allows the LLM to consider the context of the sentence when generating embeddings.



In [310]:
#TODO: Apply sentencesplitter on html nodes with 0 overlap then apply windownodeparser \
#with custom sentence splitter which should in theory return multiple sentences instead of individual sentences