# 1. Importing Libraries

In [1]:
import chromadb

from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, PromptTemplate, get_response_synthesizer
from llama_index.core.ingestion import IngestionPipeline
from llama_index.llms.openai import OpenAI
from llama_index.core.retrievers import QueryFusionRetriever

from ebooklib import epub
import uuid
import os
import re
from pathlib import Path
from dotenv import load_dotenv
import nest_asyncio
from enum import Enum

# Patch asyncio so that event loops can be nested. Presumably needed because the
# notebook kernel already runs a loop -- TODO confirm which later call relies on it.
nest_asyncio.apply()

# 2. Loading Data

In [2]:
def extract_epub_metadata(book_path: str) -> dict:
    """Build a metadata dict for an EPUB file from its Dublin Core fields.

    Args:
        book_path: Path to the EPUB file on disk.

    Returns:
        A dict with the keys ``id`` (a fresh ``epub-<uuid4 hex>`` value on every
        call), ``title``, ``author``, ``language``, ``description``, ``type``
        (always ``"epub"``) and ``embeddings`` (always ``"openaiembeddings"``).
        A missing title becomes ``"N/A"``; other missing fields become ``""``.

    Raises:
        FileNotFoundError: If ``book_path`` does not exist.
    """
    book_path = Path(book_path)
    if not book_path.exists():
        raise FileNotFoundError(f"EPUB file not found at path: {book_path}")
    book = epub.read_epub(str(book_path))

    def first_dc_value(field: str, default: str = "") -> str:
        # get_metadata returns a list of (value, attributes) pairs; use the
        # first value when the field is present, otherwise the default.
        entries = book.get_metadata("DC", field)
        return entries[0][0] if entries else default

    title = first_dc_value("title", "N/A")
    # Drop a trailing ".epub" file extension from the title. str.rstrip(".epub")
    # must not be used here: it strips any trailing run of the characters
    # '.', 'e', 'p', 'u', 'b', which would turn "Europe" into "Euro".
    extension = ".epub"
    if title.lower().endswith(extension):
        title = title[: -len(extension)]

    return {
        "id": f"epub-{uuid.uuid4().hex}",
        "title": title,
        "author": first_dc_value("creator"),
        "language": first_dc_value("language"),
        "description": first_dc_value("description"),
        "type": "epub",
        "embeddings": "openaiembeddings"
    }

In [29]:
# Load the contents of ./data, passing extract_epub_metadata as the reader's
# per-file metadata callback.
documents = SimpleDirectoryReader(
    input_dir="./data",
    file_metadata=extract_epub_metadata,
).load_data()

  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):


In [30]:
# Sanity-check the load: how many documents were produced, and the metadata attached to the first one.
print(f"Total Documents: {len(documents)}")
print(documents[0].metadata)

Total Documents: 1
{'id': 'epub-28bb0ed1628b4b9185967542333e78cc', 'title': 'Islamic Laws', 'author': 'Sayyid Ali Hussaini Sistani', 'language': 'en', 'description': '', 'type': 'epub', 'embeddings': 'openaiembeddings'}


In [34]:
# Print the first document's full text to inspect the markdown structure the splitters below match on.
print(documents[0].text)

# Islamic Laws

**Sub Title:**

According to the Fatawa of Ayatullah al Uzma Sayyid Ali al-Husaini Seestani

Laws on cleanliness, prayers, fasting, hajj, transactions, marriage, and other
topics. According to the Risalah of Ayatullah Ali al-Husayni Al-Seestani.

Get PDF [3] Get EPUB [4] Get MOBI [5]

# Important Note

The * sign after a number denotes that there is a total or partial variation
from the fatwa of Marhum Ayatullah Al Uzama Syed Abul Qasim Al Khu’i. These
laws are also available online at Al-Islam.org.

# Taqlid: Following a Mujtahid

**Issue 1:** * It is necessary for a Muslim to believe in the fundamentals of
faith with his own insight and understanding, and he cannot follow anyone in
this respect i.e. he cannot accept the word of another who knows, simply
because he has said it. However, one who has faith in the true tenets of
Islam, and manifests it by his deeds, is a Muslim and Mo'min, even if he is
not very profound, and the laws related to a Muslim will hold good fo

In [8]:
def split_markdown_by_issues(markdown_text):
    """Split Markdown text into one chunk per bold ``**Label:**`` section.

    A section starts at a bold label that ends with a colon (for example
    ``**Issue 1:**``) and runs until the next such label or the end of the
    text. Anything before the first label is not part of any chunk. Headings
    that appear between two labels stay in the preceding label's chunk.

    Args:
        markdown_text (str): The Markdown content to split.

    Returns:
        List[str]: One string per section, formatted as the bold label, a
        newline, then the section's whitespace-stripped content.
    """
    # The lookahead stops only at the next "**Label:**" marker (or end of text).
    # Stopping at any "**" would truncate a section at the first inline bold
    # word and silently drop the rest of its content.
    pattern = r'\*\*([^*\n]+?):\*\*([\s\S]*?)(?=\*\*[^*\n]+?:\*\*|$)'

    chunks = []
    for label, body in re.findall(pattern, markdown_text):
        bold_text = f"**{label}:**"  # Keep the bold label with its formatting
        details = body.strip()  # Trim surrounding whitespace from the content
        chunks.append(f"{bold_text}\n{details}")

    return chunks

In [49]:
import re
from llama_index.core import Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.schema import TransformComponent
from llama_index.core.schema import TextNode


class TextCleaner(TransformComponent):
    """Ingestion transform that splits each node into one node per bold ``**Label:**`` section."""

    def __call__(self, nodes, **kwargs):
        """Split every input node's text into per-section ``TextNode`` objects.

        A section starts at a bold label ending with a colon (for example
        ``**Issue 1:**``) and runs until the next such label or the end of the
        text. Input nodes whose text contains no such label produce no output.

        Args:
            nodes: Nodes to split; each must expose ``text`` and ``metadata``.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A list of ``TextNode`` objects. Each node's text is the bold label,
            a newline, then the stripped section content. Its metadata is the
            source node's metadata plus an ``"issue"`` key holding the label.
        """
        # The lookahead stops only at the next "**Label:**" marker (or end of
        # text), so inline bold words inside a section do not truncate it.
        section_pattern = re.compile(r'\*\*([^*\n]+?):\*\*([\s\S]*?)(?=\*\*[^*\n]+?:\*\*|$)')

        chunks = []
        for node in nodes:
            for label, body in section_pattern.findall(node.text):
                bold_text = f"**{label}:**"  # Keep the bold label with its formatting
                details = body.strip()  # Trim surrounding whitespace from the content
                chunks.append(
                    TextNode(
                        text=f"{bold_text}\n{details}",
                        # Carry the source node's metadata (title, author, ...) onto
                        # every chunk; "issue" is added last so it takes precedence.
                        metadata={**node.metadata, "issue": bold_text},
                    )
                )

        return chunks

In [51]:
# Check which class the reader produced for the first loaded document.
print(type(documents[0]))

<class 'llama_index.core.schema.Document'>


In [50]:
# Run the loaded documents through a pipeline whose only step is TextCleaner.
transformations = [TextCleaner()]
pipeline = IngestionPipeline(transformations=transformations)
nodes = pipeline.run(documents=documents)

# Show each resulting node's text followed by a 50-dash separator line.
for node in nodes:
    print(node.text, "-" * 50, sep="\n")

**Sub Title:**
According to the Fatawa of Ayatullah al Uzma Sayyid Ali al-Husaini Seestani

Laws on cleanliness, prayers, fasting, hajj, transactions, marriage, and other
topics. According to the Risalah of Ayatullah Ali al-Husayni Al-Seestani.

Get PDF [3] Get EPUB [4] Get MOBI [5]

# Important Note

The * sign after a number denotes that there is a total or partial variation
from the fatwa of Marhum Ayatullah Al Uzama Syed Abul Qasim Al Khu’i. These
laws are also available online at Al-Islam.org.

# Taqlid: Following a Mujtahid
--------------------------------------------------
**Issue 1:**
* It is necessary for a Muslim to believe in the fundamentals of
faith with his own insight and understanding, and he cannot follow anyone in
this respect i.e. he cannot accept the word of another who knows, simply
because he has said it. However, one who has faith in the true tenets of
Islam, and manifests it by his deeds, is a Muslim and Mo'min, even if he is
not very profound, and the laws rela

In [13]:
# Split the first document's text into one chunk per bold "**Label:**" section.
chunks_by_issues = split_markdown_by_issues(documents[0].text)

In [27]:
def split_markdown_by_headings(markdown_text: str) -> list:
    """
    Splits a Markdown string into chunks based on top-level headings (#).

    Lower-level headings (##, ###, ...) do not start a new chunk; they stay
    inside the chunk of the enclosing top-level heading. Text that precedes
    the first top-level heading is not included in any chunk.

    Args:
        markdown_text (str): The Markdown content to split.

    Returns:
        List[str]: One string per top-level heading, made of "Section: "
        plus the full heading line, a newline, and the whitespace-stripped
        content that follows the heading.
    """
    # Match only top-level headings ("# Heading"). The separator after "#" is
    # limited to non-newline whitespace: with a plain \s+ a bare "#" line would
    # consume the line break and swallow the following line as its heading.
    heading_pattern = re.compile(r"^#[^\S\n]+(.+)$", re.MULTILINE)

    # Find all top-level headings with their positions
    matches = list(heading_pattern.finditer(markdown_text))

    chunks = []
    for i, match in enumerate(matches):
        heading = match.group(0)  # Full heading text (e.g., "# Heading")
        start = match.end()       # Content begins right after the heading line
        # Content runs up to the next top-level heading, or to the end of the text
        next_start = matches[i + 1].start() if i + 1 < len(matches) else len(markdown_text)
        content = markdown_text[start:next_start].strip()
        chunks.append(f"Section: {heading}\n{content}")

    return chunks


In [30]:
# Split the first document's text into one chunk per top-level "# " heading.
chunks_by_sections = split_markdown_by_headings(documents[0].text)

In [37]:
# Concatenate both chunk lists: issue-level chunks first, then section-level chunks.
chunks = [*chunks_by_issues, *chunks_by_sections]

# Print every chunk under a 1-based banner, followed by a 50-dash separator line.
for i, chunk in enumerate(chunks):
    print(f"********\nCHUNK {i + 1}\n********\n\n{chunk}\n\n", "-" * 50, sep="\n")

********
CHUNK 1
********

**Sub Title:**
According to the Fatawa of Ayatullah al Uzma Sayyid Ali al-Husaini Seestani

Laws on cleanliness, prayers, fasting, hajj, transactions, marriage, and other
topics. According to the Risalah of Ayatullah Ali al-Husayni Al-Seestani.

Get PDF [3] Get EPUB [4] Get MOBI [5]

# Important Note

The * sign after a number denotes that there is a total or partial variation
from the fatwa of Marhum Ayatullah Al Uzama Syed Abul Qasim Al Khu’i. These
laws are also available online at Al-Islam.org.

# Taqlid: Following a Mujtahid


--------------------------------------------------
********
CHUNK 2
********

**Issue 1:**
* It is necessary for a Muslim to believe in the fundamentals of
faith with his own insight and understanding, and he cannot follow anyone in
this respect i.e. he cannot accept the word of another who knows, simply
because he has said it. However, one who has faith in the true tenets of
Islam, and manifests it by his deeds, is a Muslim and Mo