In [1]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import HTMLNodeParser
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
from typing import List

# Embedding model instance; it is not referenced again in the cells shown here.
embedding = GeminiEmbedding()

# Read every file under the cleaned-HTML directory, descending into sub-folders.
reader = SimpleDirectoryReader(input_dir="../data/clean_html", recursive=True)
documents = reader.load_data(show_progress=True)

# Parse the loaded documents into nodes using the listed HTML tags.
node_parser = HTMLNodeParser(tags=["p", "li", "b", "i", "u", "section", "text"])
nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)

# Keep only the nodes whose content is a non-empty string.
nodes = list(filter(lambda node: len(node.get_content()) > 0, nodes))

# Turn every newline and tab into a single space, in place on each node.
_newline_tab_to_space = str.maketrans({"\n": " ", "\t": " "})
for node in nodes:
    node.text = node.text.translate(_newline_tab_to_space)

Loading files: 100%|██████████| 594/594 [00:00<00:00, 7233.62file/s]


Parsing nodes:   0%|          | 0/594 [00:00<?, ?it/s]

In [2]:
# Number of HTML nodes left after the empty ones were dropped.
len(nodes)

3386

In [3]:
# Print the metadata of every HTML node whose text is longer than
# 10000 characters, followed by how many such nodes there are.
count = 0
for node in nodes:
    if len(node.text) <= 10000:
        continue
    print(node.metadata)
    count += 1
print(count)

{'tag': 'p', 'file_path': '../data/clean_html/Aayurveda_Samhitaa/01_Charaka_Samhitaa/DF_Introduction_Charaka_Samhita.html', 'file_name': 'DF_Introduction_Charaka_Samhita.html', 'file_type': 'text/html', 'file_size': 13642, 'creation_date': '2024-02-14', 'last_modified_date': '2024-02-14', 'last_accessed_date': '2024-02-14'}
{'tag': 'p', 'file_path': '../data/clean_html/Articles/Chikitsaa/Basics_of_Chikitsaa.htm', 'file_name': 'Basics_of_Chikitsaa.htm', 'file_type': 'text/html', 'file_size': 24441, 'creation_date': '2024-02-14', 'last_modified_date': '2024-02-14', 'last_accessed_date': '2024-02-14'}
{'tag': 'p', 'file_path': '../data/clean_html/Articles/Chikitsaa/Kushtha.htm', 'file_name': 'Kushtha.htm', 'file_type': 'text/html', 'file_size': 44579, 'creation_date': '2024-02-14', 'last_modified_date': '2024-02-14', 'last_accessed_date': '2024-02-14'}
{'tag': 'p', 'file_path': '../data/clean_html/Articles/Chikitsaa/Latest_Trends_in_Basti.htm', 'file_name': 'Latest_Trends_in_Basti.htm', '

In [4]:
# from llama_index.core.node_parser import SentenceSplitter

# splitter = SentenceSplitter(
#     chunk_size=2048,
#     chunk_overlap=400,
# )
# sentence_nodes = splitter.get_nodes_from_documents(nodes, show_progress=True)

In [5]:
# count = 0
# for node in sentence_nodes:
#     if len(node.text) > 10000:
#         count += 1
#         print(node.metadata)
# print(count)

In [6]:
from llama_index.core.node_parser import SentenceSplitter

# Re-split the HTML nodes with chunk_size=1024 and no overlap between chunks.
_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=0)

# The splitter is applied to the already-parsed `nodes`, not to the raw documents.
_sentence_nodes = _splitter.get_nodes_from_documents(nodes, show_progress=True)

Parsing nodes:   0%|          | 0/3386 [00:00<?, ?it/s]

In [7]:
# Number of nodes produced by the sentence splitter.
len(_sentence_nodes)

3712

In [8]:
from nltk.tokenize import sent_tokenize

def get_last_context(node, tokenizer):
    '''
    Retrieve the last sentence of a node's text for use as context.

    Args:
        node: object exposing its text as a ``text`` string attribute.
        tokenizer: callable that takes a string and returns a list of
            sentences.

    Returns:
        The final sentence returned by ``tokenizer(node.text)``, or an empty
        string when the tokenizer returns no sentences (e.g. empty text).
    '''
    sents = tokenizer(node.text)
    # Indexing an empty sentence list would raise IndexError, so fall back to ''.
    return sents[-1] if sents else ''


def get_first_context(node, tokenizer):
    '''
    Retrieve the first sentence of a node's text for use as context.

    Args:
        node: object exposing its text as a ``text`` string attribute.
        tokenizer: callable that takes a string and returns a list of
            sentences.

    Returns:
        The first sentence returned by ``tokenizer(node.text)``, or an empty
        string when the tokenizer returns no sentences (e.g. empty text).
    '''
    sents = tokenizer(node.text)
    # Indexing an empty sentence list would raise IndexError, so fall back to ''.
    return sents[0] if sents else ''


def add_context_to_nodes(_sentence_nodes):
    '''
    Store neighbouring-sentence context in each node's metadata.

    For every node, ``metadata['context']`` is set to the last sentence of the
    previous node followed by the first sentence of the next node, joined by a
    single space. The first node only has a following neighbour and the last
    node only a preceding one. A list containing a single node gets an empty
    context string, and an empty list is left untouched.

    Args:
        _sentence_nodes: list of nodes, each exposing ``text`` and a mutable
            ``metadata`` dict. The nodes are modified in place.

    Returns:
        None.
    '''
    # NOTE(review): neighbours are chosen purely by list position; nothing here
    # checks that adjacent nodes come from the same source file.
    total = len(_sentence_nodes)
    for i, node in enumerate(_sentence_nodes):
        parts = []
        # Use a neighbour only if it exists, so a one-element list cannot
        # index past either end.
        if i > 0:
            parts.append(get_last_context(_sentence_nodes[i - 1], sent_tokenize))
        if i < total - 1:
            parts.append(get_first_context(_sentence_nodes[i + 1], sent_tokenize))
        # Skip empty pieces so the context never starts or ends with a stray space.
        node.metadata['context'] = ' '.join(part for part in parts if part)
    

# Adds a 'context' entry to the metadata of each node in _sentence_nodes, in place.
add_context_to_nodes(_sentence_nodes)

In [9]:
# Text of the last sentence-split node.
_sentence_nodes[-1].text

'Placing a Varti (tablet / pessary) inside the vagina is termed as Yoni Varti. Yoni S`huddheekara ( It cleans the female genital organs )'

In [10]:
# Metadata of the first node, including the 'context' key added by add_context_to_nodes.
_sentence_nodes[0].metadata

{'tag': 'p',
 'file_path': '../data/clean_html/AayurVidnyaana_Introduction/Acknowledgements.html',
 'file_name': 'Acknowledgements.html',
 'file_type': 'text/html',
 'file_size': 9752,
 'creation_date': '2024-02-14',
 'last_modified_date': '2024-02-14',
 'last_accessed_date': '2024-02-14',
 'context': 'Institutes,"Ayurved  Research Centre K.E.M.'}

In [11]:
# Text of the first node, to compare against its stored context.
_sentence_nodes[0].text

'The project is funded by Ministry of Communications & Information Technology. Following Institutions and Ayurveda Experts have contributed to AyuSoft in their capacities as advisors, authors, owners of photographs and clips, testers etc.'

In [12]:
# Show the content of the first node only; the loop over all nodes is disabled.
# for node in _sentence_nodes:
#     print('Node content: ', node.get_content())
#     print('Context: ', node)
_sentence_nodes[0].get_content()

'The project is funded by Ministry of Communications & Information Technology. Following Institutions and Ayurveda Experts have contributed to AyuSoft in their capacities as advisors, authors, owners of photographs and clips, testers etc.'

In [13]:
# Content of the second node; its first sentence is what node 0 stores as context.
_sentence_nodes[1].get_content()

'Institutes,"Ayurved  Research Centre K.E.M. Hospital, Mumbai." Institutes,"B.S.D.T.  College of Ayurved, Wagholi Pune." Institutes,"College of  Ayurved Bharati Vidyapeeth Deemed University (B.V.D.U), Pune." Institutes,"Centre for  Rheumatic Diseases Pune (CRD), Pune." Institutes,Dept. of  Ayurved Sassoon General Hospital Pune. Institutes,"Ayurved  Dept. Dinanath Mangeshkar Hospital,Pune." Institutes,"Foundation  for Revitalisation of Local Health Traditions (FRLHT), Bangalore." Institutes,"Sanjeevan  Hospital, Pune." Institutes,"Sumatibhai  Shaha Ayurved College, Hadapsar, Pune." Institutes,"Swami Prakashanand Ayurveda Research Centre  (SPARC), Mumbai." Institutes,"Seth  Tarachand Ramnath Hospital, Pune." Institutes,"T.N.Medical  College and Nair Hospital, Mumbai." Institutes,"Tilak  Ayurveda Mahavidyalaya, Pune." Physicians/Researchers,Dr. Baghel M.S.,"Director, International Centre for Ayurvedic studies, Jamnagar." Physicians/Researchers,Dr. Badave Jyoti,"Ayurved Physician, Pune." P

In [14]:
# Context of the second node: last sentence of node 0 plus first sentence of node 2.
_sentence_nodes[1].metadata['context']

'Following Institutions and Ayurveda Experts have contributed to AyuSoft in their capacities as advisors, authors, owners of photographs and clips, testers etc. Physicians/Researchers,Dr.'

In [15]:
# Print the metadata of every split node whose 'context' string is longer
# than 10000 characters, followed by how many such nodes there are.
count = 0
for node in _sentence_nodes:
    if len(node.metadata['context']) <= 10000:
        continue
    print(node.metadata)
    count += 1
print(count)

0


In [16]:
# Print the metadata of every split node whose content is longer than
# 10000 characters, followed by how many such nodes there are.
count = 0
for node in _sentence_nodes:
    if len(node.get_content()) <= 10000:
        continue
    print(node.metadata)
    count += 1
print(count)

0


In [17]:
# Character length of the first node's context string.
len(_sentence_nodes[0].metadata['context'])

43

In [18]:
from llama_index.core import VectorStoreIndex
# NOTE(review): the recorded output of this cell is a DefaultCredentialsError,
# so Google credentials have to be configured before it can run.
embedding_model = GeminiEmbedding()

# Build the vector index over the sentence-split nodes with the Gemini embedder.
index = VectorStoreIndex(
    nodes=_sentence_nodes,
    embed_model=embedding_model,
    show_progress=True,
)

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.

In [None]:
# import nltk
# tokenizer = nltk.tokenize.PunktSentenceTokenizer()

# # get the spans and then return the sentences
# # using the start index of each span
# # instead of using end, use the start of the next span if available
# def split(text: str) -> List[str]:
#     spans = list(tokenizer.span_tokenize(text))
#     sentences = []
#     for i, span in enumerate(spans):
#         start = span[0]
#         if i < len(spans) - 1:
#             end = spans[i + 1][0]
#         else:
#             end = len(text)
#         sentences.append(text[start:end])

#     return sentences

## Further implementation guide
```python
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

documents = SimpleDirectoryReader("./data").load_data()

pipeline = IngestionPipeline(transformations=[TokenTextSplitter(), ...])

nodes = pipeline.run(documents=documents)
```

```python
text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=10)

# global
from llama_index.core import Settings

Settings.text_splitter = text_splitter

# per-index
index = VectorStoreIndex.from_documents(
    documents, transformations=[text_splitter]
)
```

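To use the window of sentences instead of just one sentence to generate embeddings while building your index, you can use the MetadataReplacementPostProcessor during the index building process. This postprocessor replaces the sentence in each node with its surrounding context (the "window" of sentences) before passing the node to the Large Language Model (LLM). **Note (to verify):** in the LlamaIndex documentation this postprocessor is applied at query time, via `node_postprocessors` on the query engine, rather than when the index is constructed.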

Here is an example of how to do this:

```python
from llama_index import VectorStoreIndex
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# Create the postprocessor
postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")

# Build the index with the postprocessor
sentence_index = VectorStoreIndex(nodes, service_context=ctx, node_postprocessors=[postprocessor])
```
In this example, the MetadataReplacementPostProcessor is added to the VectorStoreIndex during its creation. This means that when the index is built, each node's sentence will be replaced with its surrounding context before the embeddings are generated. This allows the LLM to consider the context of the sentence when generating embeddings.

Was this response helpful?

Yes
No
Verified Sources:

1. Metadata Replacement Demo - Node Po...
2. Indexing - Indexing
3. Indexing - Indexing
4. Embeddings - Models
**Question:** How can the window from SentenceWindowRetrieval be passed to the embedding model, instead of just one sentence, to build the index?

To use the window of sentences from SentenceWindowNodeParser and pass it to the embedding model instead of just one sentence to build the index, you can use the MetadataReplacementPostProcessor during the index building process. This postprocessor replaces the sentence in each node with its surrounding context (the "window" of sentences) before passing the node to the Large Language Model (LLM).

Here is an example of how to do this:

First, you need to extract the nodes using the SentenceWindowNodeParser:

```python
# Assuming you have already imported the necessary libraries and loaded your documents

# Create the SentenceWindowNodeParser
node_parser = SentenceWindowNodeParser.from_defaults(
    # how many sentences on either side to capture
    window_size=3,
    # the metadata key that holds the window of surrounding sentences
    window_metadata_key="window",
    # the metadata key that holds the original sentence
    original_text_metadata_key="original_sentence",
)

# Extract the nodes
nodes = node_parser.get_nodes_from_documents(documents)
```
Then, you can build the index with the MetadataReplacementPostProcessor:

```python
from llama_index import VectorStoreIndex
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# Create the postprocessor
postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")

# Build the index with the postprocessor
sentence_index = VectorStoreIndex(nodes, service_context=ctx, node_postprocessors=[postprocessor])
```
In this example, the MetadataReplacementPostProcessor is added to the VectorStoreIndex during its creation. This means that when the index is built, each node's sentence will be replaced with its surrounding context before the embeddings are generated. This allows the LLM to consider the context of the sentence when generating embeddings.



In [None]:
# TODO: Apply SentenceSplitter to the HTML nodes with 0 overlap, then apply SentenceWindowNodeParser
# with a custom sentence splitter, which should in theory return multi-sentence chunks instead of individual sentences.