In [62]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import HTMLNodeParser
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini

reader = SimpleDirectoryReader(input_dir="/home/dai/33/project/rag/data/clean_html",
                                  recursive=True)

documents = reader.load_data(show_progress=True)
node_parser = HTMLNodeParser(tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "b", "i", "u", "section", "text", "title"])
nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)
nodes = [node for node in nodes if len(node.get_content()) > 0]
for node in nodes:
    #remove all the \n and \t
    node.text = node.text.replace("\n", " ").replace("\t", " ")

Loading files:   0%|          | 0/594 [00:00<?, ?file/s]

Loading files: 100%|██████████| 594/594 [00:00<00:00, 6487.64file/s]


Parsing nodes:   0%|          | 0/594 [00:00<?, ?it/s]

In [63]:
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=0,
)
nodes = splitter.get_nodes_from_documents(documents)
len(nodes)

1480

In [70]:
nodes[0].text

'<html>\n<head>\n<title>Acknowledgement</title>\n</head>\n<body>\n<div>\n</div>\n<div>\n<p> </p>\n<p>The project is funded by Ministry of Communications &amp; Information\nTechnology.</p>\n<p>Following Institutions and Ayurveda Experts have contributed to AyuSoft\nin their </p>\n<p>capacities as advisors, authors, owners of photographs and clips,\ntesters etc.</p>\n<text>Institutes,"Ayurved  Research Centre K.E.M. Hospital, Mumbai."\nInstitutes,"B.S.D.T.  College of Ayurved, Wagholi Pune."\nInstitutes,"College of  Ayurved Bharati Vidyapeeth Deemed University (B.V.D.U), Pune."\nInstitutes,"Centre for  Rheumatic Diseases Pune (CRD), Pune."\nInstitutes,Dept. of  Ayurved Sassoon General Hospital Pune.\nInstitutes,"Ayurved  Dept. Dinanath Mangeshkar Hospital,Pune."\nInstitutes,"Foundation  for Revitalisation of Local Health Traditions (FRLHT), Bangalore."\nInstitutes,"Sanjeevan  Hospital, Pune."\nInstitutes,"Sumatibhai  Shaha Ayurved College, Hadapsar, Pune."\nInstitutes,"Swami Prakashanand

In [71]:
len(nodes[0].text.split())

303

In [67]:
print(nodes[0].text)
print(len(nodes[0].text))
print(len(nodes[0].text.split(" ")))

<html>
<head>
<title>Acknowledgement</title>
</head>
<body>
<div>
</div>
<div>
<p> </p>
<p>The project is funded by Ministry of Communications &amp; Information
Technology.</p>
<p>Following Institutions and Ayurveda Experts have contributed to AyuSoft
in their </p>
<p>capacities as advisors, authors, owners of photographs and clips,
testers etc.</p>
<text>Institutes,"Ayurved  Research Centre K.E.M. Hospital, Mumbai."
Institutes,"B.S.D.T.  College of Ayurved, Wagholi Pune."
Institutes,"College of  Ayurved Bharati Vidyapeeth Deemed University (B.V.D.U), Pune."
Institutes,"Centre for  Rheumatic Diseases Pune (CRD), Pune."
Institutes,Dept. of  Ayurved Sassoon General Hospital Pune.
Institutes,"Ayurved  Dept. Dinanath Mangeshkar Hospital,Pune."
Institutes,"Foundation  for Revitalisation of Local Health Traditions (FRLHT), Bangalore."
Institutes,"Sanjeevan  Hospital, Pune."
Institutes,"Sumatibhai  Shaha Ayurved College, Hadapsar, Pune."
Institutes,"Swami Prakashanand Ayurveda Research Centre

In [65]:
print(nodes[1].text)

Physicians/Researchers,Dr. Gadgil Swati,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Galgali Shreerang,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Gangadharan G. G.,"Joint Director, FRLHT Bangalore."
Physicians/Researchers,Dr. Gangurde Jayant,"Ayurveda Physician, Nashik."
Physicians/Researchers,Dr. Ghatnekar Vishvas,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Gaud  Banwarilal,"Director, National Institute of Ayurveda, Jaipur."
Physicians/Researchers,Dr. Ghodke Amit,"Ayurveda Physician, Pune."
Physicians/Researchers,Dr. Gogte R. B.,"Ex - Professor and H.O.D. Shalya, Tilak Ayurved Mahavidyalaya and  Hospital, Pune."
Physicians/Researchers,Dr. Gokakar Gurunath,"Ayurveda Physician, Kolhapur."
Physicians/Researchers,Dr. Gokhale Maya,"Lecturer - Ayurved College, Akurdi (Pune), Ayurved Consultant, Pune."
Physicians/Researchers,Dr. Gokhale Vivek,"Lecturer - Bharati Vidyapeth College of Ayurved, Pune."
Physicians/Researchers,Dr. Hirwani Raj,"Director URDIP, Pune.

In [37]:
greater_size_docs = []
for i in range(len(nodes)):
    size = len(nodes[i].text)
    if size > 10000:
        greater_size_docs.append(nodes[i].metadata['file_name'])

len(greater_size_docs)

0

In [32]:
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.core import Settings

Settings.embed_model = GeminiEmbedding(model_name='models/embedding-001')

index = VectorStoreIndex(nodes=nodes, embed_model=Settings.embed_model, show_progress=True)

Generating embeddings:   0%|          | 0/1860 [00:00<?, ?it/s]