# 1. Importing Libraries

In [1]:
import chromadb

from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, PromptTemplate, get_response_synthesizer
from llama_index.core.ingestion import IngestionPipeline
from llama_index.llms.openai import OpenAI
from llama_index.core.retrievers import QueryFusionRetriever

from ebooklib import epub
import uuid
import os
from pathlib import Path
from dotenv import load_dotenv
import nest_asyncio
from enum import Enum

nest_asyncio.apply()

# 2. Loading Data

In [2]:
def extract_epub_metadata(book_path: str) -> dict:
    book_path = Path(book_path)
    if not book_path.exists():
        raise FileNotFoundError(f"EPUB file not found at path: {book_path}")
    book = epub.read_epub(str(book_path))

    return {
        "id": f"epub-{uuid.uuid4().hex}",
        "title": book.get_metadata("DC", "title")[0][0].rstrip(".epub") if book.get_metadata("DC", "title") else "N/A",
        "author": book.get_metadata("DC", "creator")[0][0] if book.get_metadata("DC", "creator") else "",
        "language": book.get_metadata("DC", "language")[0][0] if book.get_metadata("DC", "language") else "",
        "description": book.get_metadata("DC", "description")[0][0] if book.get_metadata("DC", "description") else "",
        "type": "epub",
        "embeddings": "openaiembeddings"
    }

In [3]:
documents = SimpleDirectoryReader(input_dir="./data", file_metadata=extract_epub_metadata).load_data()

  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):


In [4]:
print(f"Total Documents: {len(documents)}")
print(documents[0].metadata)

Total Documents: 1
{'id': 'epub-a8427909087b4859b8a4c747a0c0fda9', 'title': 'Islamic Laws', 'author': 'Sayyid Ali Hussaini Sistani', 'language': 'en', 'description': '', 'type': 'epub', 'embeddings': 'openaiembeddings'}


# 3. Using different splitters

## 3.1 `SentenceSplitter`

In [5]:
from llama_index.core.node_parser import SentenceSplitter

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(),
    ],
)
sentence_chunks = pipeline.run(documents=documents)
print(f"Total Sentence Chunks: {len(sentence_chunks)}")

for i, chunk in enumerate(sentence_chunks):
    print(f"Chunk {i}:\n{chunk.text}\n-------------------------------------------------")

Total Sentence Chunks: 365
Chunk 0:
# Islamic Laws

**Sub Title:**

According to the Fatawa of Ayatullah al Uzma Sayyid Ali al-Husaini Seestani

Laws on cleanliness, prayers, fasting, hajj, transactions, marriage, and other
topics. According to the Risalah of Ayatullah Ali al-Husayni Al-Seestani.

Get PDF [3] Get EPUB [4] Get MOBI [5]

# Important Note

The * sign after a number denotes that there is a total or partial variation
from the fatwa of Marhum Ayatullah Al Uzama Syed Abul Qasim Al Khu’i. These
laws are also available online at Al-Islam.org.

# Taqlid: Following a Mujtahid

**Issue 1:** * It is necessary for a Muslim to believe in the fundamentals of
faith with his own insight and understanding, and he cannot follow anyone in
this respect i.e. he cannot accept the word of another who knows, simply
because he has said it. However, one who has faith in the true tenets of
Islam, and manifests it by his deeds, is a Muslim and Mo'min, even if he is
not very profound, and the laws r

In [6]:
from llama_index.core.node_parser.file import MarkdownNodeParser

transformations = [MarkdownNodeParser()]

pipeline = IngestionPipeline(
    transformations=transformations
)
sentence_chunks = pipeline.run(documents=documents)
print(f"Total Sentence Chunks: {len(sentence_chunks)}\n")

for i, chunk in enumerate(sentence_chunks):
    print(f"CHUNK {i}:\n{chunk.text}\n-------------------------------------------------\n-------------------------------------------------\n\n")

Total Sentence Chunks: 302

CHUNK 0:
Islamic Laws

**Sub Title:**

According to the Fatawa of Ayatullah al Uzma Sayyid Ali al-Husaini Seestani

Laws on cleanliness, prayers, fasting, hajj, transactions, marriage, and other
topics. According to the Risalah of Ayatullah Ali al-Husayni Al-Seestani.

Get PDF [3] Get EPUB [4] Get MOBI [5]
-------------------------------------------------
-------------------------------------------------


CHUNK 1:
Important Note

The * sign after a number denotes that there is a total or partial variation
from the fatwa of Marhum Ayatullah Al Uzama Syed Abul Qasim Al Khu’i. These
laws are also available online at Al-Islam.org.
-------------------------------------------------
-------------------------------------------------


CHUNK 2:
Taqlid: Following a Mujtahid

**Issue 1:** * It is necessary for a Muslim to believe in the fundamentals of
faith with his own insight and understanding, and he cannot follow anyone in
this respect i.e. he cannot accept the w

In [7]:
from llama_index.core.node_parser.relational.markdown_element import MarkdownElementNodeParser

transformations = [MarkdownElementNodeParser()]

pipeline = IngestionPipeline(
    transformations=transformations
)
sentence_chunks = pipeline.run(documents=documents)
print(f"Total Sentence Chunks: {len(sentence_chunks)}\n")

for i, chunk in enumerate(sentence_chunks):
    print(f"CHUNK {i}:\n{chunk.text}\n-------------------------------------------------\n-------------------------------------------------\n\n")

0it [00:00, ?it/s]


Total Sentence Chunks: 339

CHUNK 0:
Islamic Laws

**Sub Title:**

According to the Fatawa of Ayatullah al Uzma Sayyid Ali al-Husaini Seestani

Laws on cleanliness, prayers, fasting, hajj, transactions, marriage, and other
topics. According to the Risalah of Ayatullah Ali al-Husayni Al-Seestani.

Get PDF [3] Get EPUB [4] Get MOBI [5]

 Important Note

The * sign after a number denotes that there is a total or partial variation
from the fatwa of Marhum Ayatullah Al Uzama Syed Abul Qasim Al Khu’i. These
laws are also available online at Al-Islam.org.

 Taqlid: Following a Mujtahid

**Issue 1:** * It is necessary for a Muslim to believe in the fundamentals of
faith with his own insight and understanding, and he cannot follow anyone in
this respect i.e. he cannot accept the word of another who knows, simply
because he has said it. However, one who has faith in the true tenets of
Islam, and manifests it by his deeds, is a Muslim and Mo'min, even if he is
not very profound, and the laws rela

In [8]:
from llama_index.core.node_parser import Markdown

transformations = [MarkdownNodeParser()]

pipeline = IngestionPipeline(
    transformations=transformations
)
sentence_chunks = pipeline.run(documents=documents)
print(f"Total Sentence Chunks: {len(sentence_chunks)}\n")

for i, chunk in enumerate(sentence_chunks):
    print(f"CHUNK {i}:\n{chunk.text}\n-------------------------------------------------\n-------------------------------------------------\n\n")

ImportError: cannot import name 'Markdown' from 'llama_index.core.node_parser' (e:\Capelin\llama-index-test\.venv\Lib\site-packages\llama_index\core\node_parser\__init__.py)

In [25]:
!pip install langchain-text-splitters




[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
msg = ""
for document in documents:
    msg += document.text
    print(msg)

# Islamic Laws

**Sub Title:**

According to the Fatawa of Ayatullah al Uzma Sayyid Ali al-Husaini Seestani

Laws on cleanliness, prayers, fasting, hajj, transactions, marriage, and other
topics. According to the Risalah of Ayatullah Ali al-Husayni Al-Seestani.

Get PDF [3] Get EPUB [4] Get MOBI [5]

# Important Note

The * sign after a number denotes that there is a total or partial variation
from the fatwa of Marhum Ayatullah Al Uzama Syed Abul Qasim Al Khu’i. These
laws are also available online at Al-Islam.org.

# Taqlid: Following a Mujtahid

**Issue 1:** * It is necessary for a Muslim to believe in the fundamentals of
faith with his own insight and understanding, and he cannot follow anyone in
this respect i.e. he cannot accept the word of another who knows, simply
because he has said it. However, one who has faith in the true tenets of
Islam, and manifests it by his deeds, is a Muslim and Mo'min, even if he is
not very profound, and the laws related to a Muslim will hold good fo

In [38]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [
    # ("#", "Heading"),
    # ("##", "Subheading"),
    ("**", "Issue")
]

splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
chunks = splitter.split_text(msg)

In [26]:
print(chunks)

[Document(metadata={}, page_content="# Islamic Laws  \n**Sub Title:**  \nAccording to the Fatawa of Ayatullah al Uzma Sayyid Ali al-Husaini Seestani  \nLaws on cleanliness, prayers, fasting, hajj, transactions, marriage, and other\ntopics. According to the Risalah of Ayatullah Ali al-Husayni Al-Seestani.  \nGet PDF [3] Get EPUB [4] Get MOBI [5]  \n# Important Note  \nThe * sign after a number denotes that there is a total or partial variation\nfrom the fatwa of Marhum Ayatullah Al Uzama Syed Abul Qasim Al Khu’i. These\nlaws are also available online at Al-Islam.org.  \n# Taqlid: Following a Mujtahid  \n**Issue 1:** * It is necessary for a Muslim to believe in the fundamentals of\nfaith with his own insight and understanding, and he cannot follow anyone in\nthis respect i.e. he cannot accept the word of another who knows, simply\nbecause he has said it. However, one who has faith in the true tenets of\nIslam, and manifests it by his deeds, is a Muslim and Mo'min, even if he is\nnot very

In [34]:
import re

def split_markdown_by_bold(markdown_text):
    # Use regex to find bold text and split into groups, keeping associated content together
    pattern = r'\*\*(.*?):\*\*([\s\S]*?)(?=\*\*|$)'  # Match bold text with details
    matches = re.findall(pattern, markdown_text)
    
    chunks = []
    for match in matches:
        bold_text = f"**{match[0]}:**".strip()  # Keep the bold text with formatting
        details = match[1].strip()  # Trim details
        chunks.append(f"{bold_text}\n{details}")  # Combine bold text and details

    return chunks


In [35]:
chunks = split_markdown_by_bold(msg)

In [39]:
print(type(chunks[0]))

<class 'langchain_core.documents.base.Document'>


In [36]:
for i, chunk in enumerate(chunks):
    print(f"Chunk {i}:\n{chunk}\n-------------------------------------------------")

Chunk 0:
**Sub Title:**
According to the Fatawa of Ayatullah al Uzma Sayyid Ali al-Husaini Seestani

Laws on cleanliness, prayers, fasting, hajj, transactions, marriage, and other
topics. According to the Risalah of Ayatullah Ali al-Husayni Al-Seestani.

Get PDF [3] Get EPUB [4] Get MOBI [5]

# Important Note

The * sign after a number denotes that there is a total or partial variation
from the fatwa of Marhum Ayatullah Al Uzama Syed Abul Qasim Al Khu’i. These
laws are also available online at Al-Islam.org.

# Taqlid: Following a Mujtahid
-------------------------------------------------
Chunk 1:
**Issue 1:**
* It is necessary for a Muslim to believe in the fundamentals of
faith with his own insight and understanding, and he cannot follow anyone in
this respect i.e. he cannot accept the word of another who knows, simply
because he has said it. However, one who has faith in the true tenets of
Islam, and manifests it by his deeds, is a Muslim and Mo'min, even if he is
not very profound, 

In [32]:
for i, chunk in enumerate(chunks):
    if chunk.metadata.get("Heading"):
        print(f"CHUNK {i}:\nHeading: {chunk.metadata['Heading']}\n{chunk.page_content}\n-------------------------------------------------\n-------------------------------------------------\n\n")
    elif chunk.metadata.get("Subheading"):
        print(f"CHUNK {i}:\nSubheading: {chunk.metadata['Subheading']}\n{chunk.page_content}\n-------------------------------------------------\n-------------------------------------------------\n\n")
    elif chunk.metadata.get("Issue"):
        print(f"CHUNK {i}:\nIssue: {chunk.metadata['Issue']}\n{chunk.page_content}\n-------------------------------------------------\n-------------------------------------------------\n\n")

AttributeError: 'str' object has no attribute 'metadata'