# Importing Libraries

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.core.settings import Settings
from llama_index.core.constants import DEFAULT_TEMPERATURE
from ebooklib import epub
import uuid
import os
from pathlib import Path
from dotenv import load_dotenv
import nest_asyncio

nest_asyncio.apply()

In [14]:
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

Settings.llm = OpenAI(api_key=OPENAI_API_KEY, model_name="gpt-4o-mini", temperature=DEFAULT_TEMPERATURE)
Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-ada-002", api_key=OPENAI_API_KEY)

# Loading Data

In [2]:
def extract_epub_metadata(book_path: str) -> dict:
    book_path = Path(book_path)
    if not book_path.exists():
        raise FileNotFoundError(f"EPUB file not found at path: {book_path}")
    book = epub.read_epub(str(book_path))

    return {
        "id": f"epub-{uuid.uuid4().hex}",
        "title": book.get_metadata("DC", "title")[0][0].rstrip(".epub") if book.get_metadata("DC", "title") else "N/A",
        "author": book.get_metadata("DC", "creator")[0][0] if book.get_metadata("DC", "creator") else "",
        "language": book.get_metadata("DC", "language")[0][0] if book.get_metadata("DC", "language") else "",
        "description": book.get_metadata("DC", "description")[0][0] if book.get_metadata("DC", "description") else "",
        "type": "epub",
        "embeddings": "openaiembeddings"
    }

In [3]:
documents = SimpleDirectoryReader(input_dir="../data", file_metadata=extract_epub_metadata).load_data()

  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):


In [4]:
print(f"Total Documents: {len(documents)}")
print(documents[0].metadata)

Total Documents: 1
{'id': 'epub-46d8649981b9488bb0fef4a897d3b2ff', 'title': 'Islamic Stories ', 'author': 'Kids Collection - XKP', 'language': 'en', 'description': "Collection of Short Stories for kids. Including A CALL TO PRAYERS ON A COLD WINTRY MORNING, BAHLOOL PROVES THE THREE FACTS, AHLUL BAIT AT MUBAHALA, GOD SEES YOU EVERYWHERE, TIT FOR TAT, CO-OPERATION THE KEY TO SUCCESS, SHE HAD TRUE FAITH, AL-QUR'AN - THE GREAT AND HOLY BOOK, MUST GOD BE JUST?, FROM THE SHADOW OF A TREE TO THE SHADOW OF ISLAM, ON THE PATH OF RELIGION, THE ANT PRAYS FOR RAIN, CARE FOR ANIMALS, PROPHET SULAIMAN (SOLOMON) THE KING PROPHET, HERE COMES AL-AMIN - THE TRUSTWORTHY, HERCULES AND ISLAM, JA'FAR AL-TAYYAR - THE REFUGEE WHO BROUGHT ISLAM TO AFRICA, PROPHET IBRAHIM AND THE IDOL WORSHIP, THE PROPHET MUHAMMAD (s) ENTERS MADINA, NAZR: A VOW WITH ALLAH TO BE FULFILLED, FOR THE SAKE OF ALLAH ONLY, IMAMAT IS GUIDANCE WE NEED, THE LEARNED MAN AND THE HUNGRY DOG, THE FIVE LOAVES\n-\nISLAMICMOBILITY.COM", 'type': 

# Splitting along Headings, Subheadings, and so on

In [5]:
msg = ""
for document in documents:
    msg += document.text

In [7]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [
    ("#", "Heading"),
    ("##", "Subheading"),
    ("###", "Subsubheading")
]

splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
chunks = splitter.split_text(msg)
len(chunks)

26

In [None]:
for i, chunk in enumerate(chunks):
    # Print chunk number and metadata
    print(f"📄 Chunk #{i+1}")
    print(f"🔍 Metadata: {chunk.metadata}")
    
    # Print preview of content (first 1000 characters)
    content_preview = chunk.page_content[:1000] + "..." if len(chunk.page_content) > 1000 else chunk.page_content
    print(f"\n📝 Content Preview:\n{content_preview}")
    
    # Divider for better visual separation
    print("\n" + "=" * 70 + "\n")

In [None]:
# Get only those chunks that have only "Heading" and "Subheading" metadata as they are chapter chunks
chapter_chunks = [chunk for chunk in chunks if "Heading" in chunk.metadata and "Subheading" in chunk.metadata and "Subsubheading" not in chunk.metadata]
len(chapter_chunks)

25

In [17]:
from llama_index.core import Document, DocumentSummaryIndex

def summarize_documents(docs):
    # Create Document objects from the text list
    documents = [Document(text=t.page_content, metadata=t.metadata) for t in docs]

    # Initialize the Document Summary Index
    index = DocumentSummaryIndex.from_documents(documents)

    # Retrieve summaries for each document
    summaries = []
    for doc in documents:
        summary = index.get_document_summary(doc.get_doc_id())
        summaries.append(summary)

    return summaries

In [18]:
summaries = summarize_documents(chapter_chunks)

current doc id: 3b5e5b9f-c0f2-45f9-911b-40dfff6b08f9
current doc id: b3fde31d-2e62-4d7a-9dbc-35a0bc804a4c
current doc id: 5ee63c51-2aec-4e3d-8838-9094f48b53df
current doc id: b9af1f3a-50c3-4c56-a93c-1cece9a2ce01
current doc id: 370e07c1-2bef-4ce5-9918-7c14bd946f77
current doc id: a4d379f4-cc99-4213-913d-21fd16061286
current doc id: f957505a-f173-4386-99f9-df29aeb7d428
current doc id: 84452988-fa89-4f51-9103-a811dd2631a2
current doc id: 9db55c9f-c943-4b9d-9040-5e4a2be003c6
current doc id: 539de77e-8ba7-408a-af59-337d85efc0c6
current doc id: dea66e22-9825-4a20-9ded-5fa0060dddc8
current doc id: 7b80cd96-90b2-414c-bb85-e1b32d8d5935
current doc id: efe822a3-d27a-4c57-8054-661b45826ff7
current doc id: a52e3692-a3f6-40e9-a7bb-674b45755134
current doc id: 82072b2f-0fe1-45a3-8090-507017def53a
current doc id: 81451c5e-711c-48e9-b962-149f6c1bfd1c
current doc id: 93b93581-585e-4ca5-8dfd-1321ea6272fe
current doc id: 76ef2296-565b-444c-aa0b-db71a8d08176
current doc id: 57088b5d-65ea-486e-82f8-23d9fa

  summary = index.get_document_summary(doc.get_doc_id())


In [20]:
for i, summary in enumerate(summaries, 1):
    print(f"Summary of Document {i}: {summary}\n-----------------------------\n")

Summary of Document 1: The provided text is about a collection of Islamic stories for kids, specifically focusing on the character Behlool. These stories may cover various themes such as trust, faith, comedy, and interactions with animals. The text can potentially answer questions related to Islamic teachings for children, the significance of certain characters like Behlool, the importance of faith and trust in Islam, and how moral lessons are conveyed through storytelling in Islamic literature for young readers.
-----------------------------

Summary of Document 2: The provided text is about a story involving the renowned Persian thinker and philosopher 'Ibn Sina' (Avicenna) and his student 'Bahman Yar'. The story highlights a moment when 'Bahman Yar' suggests to 'Ibn Sina' that he should declare himself a prophet due to his exceptional knowledge and status. 'Ibn Sina' responds by pointing out the importance of obedience and humility in the face of the example set by Prophet Muhammad,

We now have documents as well as their summaries. We can now index them to our vector db.