In [2]:
import os

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini

# Directory holding the cleaned HTML files to index. Set the RAG_DATA_DIR
# environment variable to point the notebook at another location; when it is
# unset, the original path is used so existing runs behave as before.
DATA_DIR = os.environ.get("RAG_DATA_DIR", "/home/dai/33/project/rag/data/clean_html")

# recursive=True is passed so files in nested folders are picked up as well.
reader = SimpleDirectoryReader(input_dir=DATA_DIR, recursive=True)

# Load every file found into `documents`, showing a progress bar while reading.
documents = reader.load_data(show_progress=True)

Loading files: 100%|██████████| 594/594 [00:00<00:00, 6166.51file/s]


In [3]:
# Gemini embedding model with its default settings; passed to the splitter below.
embed_model = GeminiEmbedding()

# Semantic splitter configured with the embedding model created above.
splitter = SemanticSplitterNodeParser(
    embed_model=embed_model,
    buffer_size=1,
    breakpoint_percentile_threshold=95,
)

SemanticSplitterNodeParser(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7f372b3fa5d0>, id_func=<function default_id_func at 0x7f372fa33100>, sentence_splitter=<function split_by_sentence_tokenizer.<locals>.split at 0x7f372aac6840>, embed_model=GeminiEmbedding(model_name='models/embedding-001', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7f372b3fa990>, title=None, task_type='retrieval_document'), buffer_size=1, breakpoint_percentile_threshold=95)

In [20]:
from llama_index.core.node_parser import HierarchicalNodeParser

# Hierarchical parser configured with a single chunk size of 2048.
node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[2048])

# Parse all loaded documents, then keep only the nodes that have content.
parsed_nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)
nodes = [parsed for parsed in parsed_nodes if parsed.get_content()]

# Cell output: number of non-empty nodes.
len(nodes)

Parsing documents into nodes:   0%|          | 0/594 [00:00<?, ?it/s]

924

In [21]:
# File names of the nodes whose text is longer than 10000 characters
# (one entry per oversized node, so a file may appear more than once).
greater_size_docs = [
    node.metadata['file_name']
    for node in nodes
    if len(node.text) > 10000
]

# Cell output: how many oversized nodes were found.
len(greater_size_docs)

0

In [17]:
# Show the text of node 0 followed by its character count.
node_text = nodes[0].text
print(node_text, len(node_text), sep="\n")

<html>
<head>
<title>Acknowledgement</title>
</head>
<body>
<div>
</div>
<div>
<p> </p>
<p>The project is funded by Ministry of Communications &amp; Information
Technology.</p>
<p>Following Institutions and Ayurveda Experts have contributed to AyuSoft
in their </p>
<p>capacities as advisors, authors, owners of photographs and clips,
testers etc.</p>
<text>Institutes,"Ayurved  Research Centre K.E.M. Hospital, Mumbai."
Institutes,"B.S.D.T.  College of Ayurved, Wagholi Pune."
Institutes,"College of  Ayurved Bharati Vidyapeeth Deemed University (B.V.D.U), Pune."
Institutes,"Centre for  Rheumatic Diseases Pune (CRD), Pune."
Institutes,Dept. of  Ayurved Sassoon General Hospital Pune.
Institutes,"Ayurved  Dept. Dinanath Mangeshkar Hospital,Pune."
Institutes,"Foundation  for Revitalisation of Local Health Traditions (FRLHT), Bangalore."
Institutes,"Sanjeevan  Hospital, Pune."
Institutes,"Sumatibhai  Shaha Ayurved College, Hadapsar, Pune."
Institutes,"Swami Prakashanand Ayurveda Research Centre

In [18]:
# Show the text of node 1 followed by its character count.
node_text = nodes[1].text
print(node_text, len(node_text), sep="\n")

Scientist AyuSoft.
Physicians/Researchers,Dr. Nesari Manoj,Sr. Scientist AyuSoft.
Physicians/Researchers,Dr. Nesari Tanuja,"Assistant Professor, Tilak Ayurved Mahavidyalaya, Pune."
Physicians/Researchers,Dr. Nevpurkar Santosh,"Ayurveda Physician, Aurangabad."
Physicians/Researchers,DR Nimbalkar Ranjeet,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Nishtheshvar K.,Asst. Professor - Vijayawada.
Physicians/Researchers,Dr. Paranjape Durga,Ex.Professor – Tilak Ayurved Mahavidyalaya.
Physicians/Researchers,Smt. Page Ushahprabha,"Nature Photography Expert, Pune."
Physicians/Researchers,Mr. Page Navendu,"Field Botanist, Deharadun."
Physicians/Researchers,Dr. Patankar Ashutosh,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Patil Jayashree,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Pendse Narendra,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Prabhune Yogesh,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Prasad V.V.,"Director Rashtiya Ayurveda Vidyap

In [19]:
# Show the text of node 2 followed by its character count.
node_text = nodes[2].text
print(node_text, len(node_text), sep="\n")

<html xmlns:st1="urn:schemas-microsoft-com:office:smarttags">
<head>
<title>AayurVidnyaana</title>
<o:smarttagtype name="City" namespaceuri="urn:schemas-microsoft-com:office:smarttags"></o:smarttagtype><o:smarttagtype name="place" namespaceuri="urn:schemas-microsoft-com:office:smarttags"></o:smarttagtype>
</head>
<body>
<div>
<h4>AayurVidnyaana Kosha </h4>
<p>AayurVidnyaana Kosha is encyclopedia of Ayurveda.</p>
<p>The AayurVidnyaana
Kosha window is divided into two frames. The left frame has three tabs,
Contents, Search and New Window. Right frame is for viewing the document.</p>
<p>About AayurVidnyaana Kosha</p>
<p>AyuSoft encyclopedia elaborates scientific basis,
theory and strength of Ayurveda in a dynamic form. </p>
<p>Key topic of different subjects highlighted with
their clinical applications. </p>
<p>This includes information regarding Ayurvedic
concepts, current status of Ayurveda and related research articles. </p>
<p>The Encyclopedia of AyuSoft includes articles from
eminent

In [14]:
# Show the text of node 3 followed by its character count.
node_text = nodes[3].text
print(node_text, len(node_text), sep="\n")

Barve Neelmani,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Bavdekar S.D.,"Head Ayurved Dept, Jnanaprabhodhini, Pune."
Physicians/Researchers,Dr. Bhalerao Supriya,"Research Associate, Nair Hospital, Department of Clinical  Pharmacology, Mumbai."
Physicians/Researchers,Dr. Chatopadhyay  Abheechal,"Lecturer, Ayurveda College, Kolkata."
Physicians/Researchers,Dr Chavan Santosh,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Chitre Rajashree,"Lecturer - Ayurved College Wagholi, Pune."
Physicians/Researchers,Dr. Chopra Arvind,"Director CRD, Pune."
Physicians/Researchers,Dr. Chunekar K.C.,"Ex Head Dept. of Dravyaguna Vidnyan,  Banaras Hindu University, Varanasi."
Physicians/Researchers,Dr. Darshanashankar,Director FRLHT Bangalore.
Physicians/Researchers,Dr. Deopujari Jayant,"Physician and Researcher, Nagpur."
Physicians/Researchers,Dr. Deshpande Manasi,"Department of Dravyagunavidyana, B.V.D.U. College of Ayurved, Pune."
Physicians/Researchers,Mrs.Deshpande Suvarna,"Sanskrit

In [15]:
# Show the text of node 4 followed by its character count.
node_text = nodes[4].text
print(node_text, len(node_text), sep="\n")

Erande Mukund,"Principal and H.O.D. Department of Sharir Rachana, Sumatibhai Shaha  Ayurved College, Hadapsar, Pune."
Physicians/Researchers,Dr. Gadgil Dilip,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Gadgil Swati,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Galgali Shreerang,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Gangadharan G. G.,"Joint Director, FRLHT Bangalore."
Physicians/Researchers,Dr. Gangurde Jayant,"Ayurveda Physician, Nashik."
Physicians/Researchers,Dr. Ghatnekar Vishvas,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Gaud  Banwarilal,"Director, National Institute of Ayurveda, Jaipur."
Physicians/Researchers,Dr. Ghodke Amit,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Gogte R. B.,"Ex - Professor and H.O.D. Shalya, Tilak Ayurved Mahavidyalaya and  Hospital, Pune."
Physicians/Researchers,Dr. Gokakar Gurunath,"Ayurveda Physician, Kolhapur."
Physicians/Researchers,Dr. Gokhale Maya,"Lecturer - Ayurved College, Akurdi (Pune), 