In [21]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import HTMLNodeParser
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini

reader = SimpleDirectoryReader(input_dir="/home/dai/35/rag/data/clean_html",
                                  recursive=True)

documents = reader.load_data(show_progress=True)
node_parser = HTMLNodeParser(tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "b", "i", "u", "section", "text", "title"])
nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)
nodes = [node for node in nodes if len(node.get_content()) > 0]
for node in nodes:
    #remove all the \n and \t
    node.text = node.text.replace("\n", " ").replace("\t", " ")

Loading files:   0%|          | 0/594 [00:00<?, ?file/s]

Loading files: 100%|██████████| 594/594 [00:00<00:00, 4154.66file/s]


Parsing nodes:   0%|          | 0/594 [00:00<?, ?it/s]

In [30]:
for i in range(30):
    print(nodes[i].text)

Acknowledgement
The project is funded by Ministry of Communications & Information Technology. Following Institutions and Ayurveda Experts have contributed to AyuSoft in their capacities as advisors, authors, owners of photographs and clips, testers etc.
Institutes   Ayurved   Research Centre K.E.M. Hospital, Mumbai. B.S.D.T.   College of Ayurved, Wagholi Pune. College of   Ayurved Bharati Vidyapeeth Deemed University (B.V.D.U), Pune. Centre for   Rheumatic Diseases Pune (CRD), Pune. Dept. of   Ayurved Sassoon General Hospital Pune. Ayurved   Dept. Dinanath Mangeshkar Hospital,Pune. Foundation   for Revitalisation of Local Health Traditions (FRLHT), Bangalore. Sanjeevan   Hospital, Pune. Sumatibhai   Shaha Ayurved College, Hadapsar, Pune. Swami Prakashanand Ayurveda Research Centre   (SPARC), Mumbai. Seth   Tarachand Ramnath Hospital, Pune. T.N.Medical   College and Nair Hospital, Mumbai. Tilak   Ayurveda Mahavidyalaya, Pune.
Institutes Ayurved   Research Centre K.E.M. Hospital, Mumbai.

In [36]:
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=40)
nodes = splitter.get_nodes_from_documents(nodes, show_progress=True)
nodes = [node for node in nodes if len(node.get_content()) > 0]

Parsing nodes:   0%|          | 0/45914 [00:00<?, ?it/s]

In [None]:
#TODO: implement ingestion pipeline and combine two node parser together

In [37]:
nodes[2].get_content()

'capacities as advisors, authors, owners of photographs and clips,\ntesters etc.'

In [None]:
from llama_index.embeddings.gemini import GeminiEmbedding
embedding = GeminiEmbedding()
for i, node in enumerate(nodes):
    print(str(i)+"\r", end="", flush=True)
    try:
        embedding.get_text_embedding(node.get_content())
    except Exception as e:
        print(len(node.get_content()))
        print(node.metadata)

## Futher implementation guide
```python
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

documents = SimpleDirectoryReader("./data").load_data()

pipeline = IngestionPipeline(transformations=[TokenTextSplitter(), ...])

nodes = pipeline.run(documents=documents)
```

```python
text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=10)

# global
from llama_index.core import Settings

Settings.text_splitter = text_splitter

# per-index
index = VectorStoreIndex.from_documents(
    documents, transformations=[text_splitter]
)
```

In [None]:
from llama_index.core import VectorStoreIndex

vector_index = VectorStoreIndex.from_documents(documents, )