In [1]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, get_response_synthesizer
from llama_index.core.node_parser import HTMLNodeParser
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.core import Settings
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

Settings.embed_model = GeminiEmbedding(model_name='models/embedding-001')
Settings.llm = Gemini(model_name='models/gemini-pro', temperature=0.7)

In [2]:
reader = SimpleDirectoryReader(input_dir="/home/dai/33/project/rag/data/clean_html",
                                  recursive=True)

documents = reader.load_data(show_progress=True)
node_parser = HTMLNodeParser(tags = ["p","li", "b", "i", "u", "section", "text"])
nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)
nodes = [node for node in nodes if len(node.get_content()) > 0]
for node in nodes:
    #remove all the \n and \t
    node.text = node.text.replace("\n", " ").replace("\t", " ")

Loading files: 100%|██████████| 594/594 [00:00<00:00, 5187.80file/s]


Parsing nodes:   0%|          | 0/594 [00:00<?, ?it/s]

In [3]:
splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=0,
)
nodes = splitter.get_nodes_from_documents(nodes)
len(nodes)

3713

In [4]:
greater_size_docs = []
for i in range(len(nodes)):
    size = len(nodes[i].text)
    if size > 10000:
        greater_size_docs.append(nodes[i].metadata['file_name'])

len(greater_size_docs)

0

In [5]:
# Create Index and store the index
index = VectorStoreIndex(nodes=nodes, embed_model=Settings.embed_model, show_progress=True)
index.storage_context.persist(persist_dir='indexes')

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1665 [00:00<?, ?it/s]

In [6]:
# # rebuild storage context
# storage_context = StorageContext.from_defaults(persist_dir="/home/dai/33/project/rag/indexes")

# # load index
# index = load_index_from_storage(storage_context)

In [7]:
# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=10,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer()

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.3)]
)

In [6]:
query_engine = index.as_query_engine(llm=Settings.llm)

In [7]:
response = query_engine.query("Lifepan of individual is divided in how many stages?")
print(response)

The provided context does not contain any information about the lifespan of an individual or the stages it is divided into, so I cannot answer this question from the provided context.


In [8]:
for node in response.source_nodes:
    print(node.score)
    print(node.text)
    print(node.metadata)
    print("-"*100)

0.7069733386678532
According to Charaka:
{'tag': 'u', 'file_path': '/home/dai/33/project/rag/data/clean_html/Articles/S`halya_S`haalaakya/Gandamaalaa_Nidaana.htm', 'file_name': 'Gandamaalaa_Nidaana.htm', 'file_type': 'text/html', 'file_size': 2505, 'creation_date': '2024-02-14', 'last_modified_date': '2024-02-14', 'last_accessed_date': '2024-02-14'}
----------------------------------------------------------------------------------------------------
0.7060845148798267
S.Ni.1/52-58
{'tag': 'p', 'file_path': '/home/dai/33/project/rag/data/clean_html/Articles/Nidaana/Vaatavyaadhi.htm', 'file_name': 'Vaatavyaadhi.htm', 'file_type': 'text/html', 'file_size': 40453, 'creation_date': '2024-02-14', 'last_modified_date': '2024-02-14', 'last_accessed_date': '2024-02-14'}
----------------------------------------------------------------------------------------------------
