In [2]:
import os
import json
import logging
import re
import shutil
from typing import List, Dict, Any, Optional, Tuple

# --- LangChain & Pydantic Imports ---
from langchain_core.documents import Document
from pydantic import BaseModel, Field # Direct Pydantic import
from langchain_community.document_loaders import PyPDFLoader, UnstructuredEPubLoader
from langchain_ollama import OllamaEmbeddings # Assuming Ollama for embeddings
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Setup Logging ---
# Ensure logging is configured (might be done once per notebook session)
# Re-running the cell must not stack handlers on the root logger, so the root
# configuration is applied only when no handler is installed yet.
if not logging.getLogger().hasHandlers(): # Avoid adding multiple handlers
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) # Ensure logger level is set

# --- Configuration ---
# Book to ingest; the file extension (.epub / .pdf) selects the loader used by
# process_book_for_hierarchical_toc_and_chunks().
BOOK_PATH = "/home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub"
# BOOK_PATH = "/path/to/your/book.pdf" # Example for PDF

# Output paths for this focused step
# JSON file receiving the heuristic ToC; Chroma directory and collection
# receiving the embedded chunks.  The directory is deleted and rebuilt by the
# main execution block of this cell.
OUTPUT_HIERARCHICAL_TOC_JSON = "./book_hierarchical_toc_parsed.json"
CHROMA_PERSIST_DIR_HIERARCHICAL = "./chroma_db_book_hierarchical_chunks_v1"
CHROMA_COLLECTION_NAME_HIERARCHICAL = "book_hierarchical_chunks_v1"

# Embedding and Chunking
# CHUNK_SIZE / CHUNK_OVERLAP are passed to RecursiveCharacterTextSplitter with
# length_function=len, i.e. they are measured in characters.
EMBEDDING_MODEL_OLLAMA = "nomic-embed-text"
CHUNK_SIZE = 800
CHUNK_OVERLAP = 100

# --- Helper: Clean metadata values for ChromaDB ---
def clean_metadata_for_chroma(value: Any) -> Any:
    """Flatten a metadata value into a scalar before it is stored as chunk metadata.

    Args:
        value: Any metadata value.

    Returns:
        * lists    -> their items joined with ``", "`` (each item via ``str``),
        * dicts    -> a JSON string,
        * ``str`` / ``int`` / ``float`` / ``bool`` / ``None`` -> unchanged,
        * anything else -> ``str(value)``.
    """
    if isinstance(value, list):
        return ", ".join(str(item) for item in value)
    if isinstance(value, dict):
        return json.dumps(value)
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    return str(value)

# --- Core Function: Load Book, Extract Hierarchical ToC, Create Chunks with Hierarchy ---
def process_book_for_hierarchical_toc_and_chunks(book_path: str) -> Tuple[List[Document], List[Dict[str, Any]]]:
    """Load a book, derive a heuristic hierarchical ToC, and split the text into chunks.

    EPUB files are loaded element by element.  An element whose category is
    ``title`` (and that is shorter than 200 characters and does not start with
    one of the ignored call-out prefixes) is a candidate heading: a match of
    the chapter pattern opens level 1; inside a chapter each further heading
    opens the next level that is not open yet (2, then 3, then 4).  Once
    levels 2-4 are all open, later titles of that chapter are kept as ordinary
    content.  Every non-empty element becomes a ``Document`` tagged with the
    titles and dotted numbers of the levels open at that point.

    PDF files are loaded page by page; the ToC is a single level-1 entry with
    one level-2 entry per page.

    Args:
        book_path: Path to an ``.epub`` or ``.pdf`` file.

    Returns:
        ``(chunks, toc)`` where ``chunks`` are the documents after splitting
        with ``CHUNK_SIZE`` / ``CHUNK_OVERLAP`` and ``toc`` is a list of nested
        dicts (``title``, ``level``, ``number_hierarchical`` and, below
        level 4, ``sections``).  ``([], [])`` when no document was produced.

    Raises:
        ValueError: If the extension is unsupported or the loader returns nothing.
        Exception: Any loader error is logged and re-raised.
    """
    logger.info(f"Processing book: '{book_path}' for hierarchical ToC and chunk metadata.")
    _, file_extension = os.path.splitext(book_path.lower())

    all_book_documents_with_hierarchy: List[Document] = []
    hierarchical_toc: List[Dict[str, Any]] = []

    if file_extension == ".epub":
        epub_loader = UnstructuredEPubLoader(book_path, mode="elements", strategy="fast")
        try:
            raw_elements = epub_loader.load()
            if not raw_elements:
                raise ValueError("No elements loaded from EPUB.")
            logger.info(f"Loaded {len(raw_elements)} elements from EPUB.")
        except Exception as e:
            logger.error(f"Error loading EPUB: {e}", exc_info=True)
            raise

        # Per-level state of the heading currently open (levels 1..4).
        current_paths: Dict[int, Optional[str]] = {level: None for level in range(1, 5)}
        current_numbers: Dict[int, int] = {level: 0 for level in range(1, 5)}
        # ToC dict of the heading open at each level; a new heading at level N
        # is attached to the entry open at level N-1.
        open_toc_entries: Dict[int, Optional[Dict[str, Any]]] = {level: None for level in range(1, 5)}

        chapter_regex = r"(?i)^(chapter\s+(\d+|[IVXLCDM]+)\b|part\s+[A-Z0-9]+|appendix\s+[A-Z])"
        chapter_pattern = re.compile(chapter_regex)
        ignore_title_prefixes = ("Note:", "Tip:", "Figure:", "Table:", "Listing:", "Caution:")
        # Lower-cased once; str.startswith accepts a tuple of candidates.
        ignore_prefixes_lower = tuple(prefix.lower() for prefix in ignore_title_prefixes)

        def hierarchical_number(up_to_level: int) -> str:
            """Dotted number built from the counters of the open levels 1..up_to_level."""
            return ".".join(
                str(current_numbers[lvl])
                for lvl in range(1, up_to_level + 1)
                if current_paths[lvl]
            )

        for element_doc in raw_elements:
            element_text = element_doc.page_content.strip() if element_doc.page_content else ""
            # Tolerate a missing or None category instead of failing on .lower().
            element_category = (element_doc.metadata.get("category") or "").lower()

            if not element_text:
                continue

            is_potential_heading = element_category == "title" and len(element_text) < 200
            is_ignorable = element_text.lower().startswith(ignore_prefixes_lower)
            level_detected = 0

            if is_potential_heading and not is_ignorable:
                if chapter_pattern.match(element_text):
                    level_detected = 1
                elif current_paths[1]:  # Inside a chapter
                    # Basic heuristic: the heading opens the first level that
                    # is not open yet.  It can misclassify titles when the
                    # book is not a simple H1 -> H2 -> H3 -> H4 sequence.
                    if not current_paths[2]:
                        level_detected = 2
                    elif not current_paths[3]:
                        level_detected = 3
                    elif not current_paths[4]:
                        level_detected = 4

            if level_detected > 0:
                # A new heading closes every deeper level.
                for deeper_level in range(level_detected + 1, 5):
                    current_paths[deeper_level] = None
                    current_numbers[deeper_level] = 0
                    open_toc_entries[deeper_level] = None

                current_numbers[level_detected] += 1
                current_paths[level_detected] = element_text

                new_toc_item: Dict[str, Any] = {
                    "title": element_text,
                    "level": level_detected,
                    "number_hierarchical": hierarchical_number(level_detected),
                }
                if level_detected < 4:
                    # Levels 1-3 may have children; a generic "sections" key is
                    # used for the children of every level.
                    new_toc_item["sections"] = []

                if level_detected == 1:
                    hierarchical_toc.append(new_toc_item)
                else:
                    # Attach to the entry open one level up, so a level-4
                    # heading lands under its level-3 parent rather than under
                    # the level-2 section.
                    parent_entry = open_toc_entries[level_detected - 1]
                    if parent_entry is not None:
                        parent_entry["sections"].append(new_toc_item)
                open_toc_entries[level_detected] = new_toc_item

                logger.debug(f"TOC L{level_detected}: {element_text}")

            # Every non-empty element (headings included) becomes a document
            # carrying the hierarchy that is open at this point.
            doc_metadata = {
                "source": os.path.basename(book_path),
                "original_category": element_category,
                "raw_text_snippet": element_text[:100]
            }
            for meta_level in range(1, 5):
                if current_paths[meta_level]:
                    doc_metadata[f"level_{meta_level}_title"] = current_paths[meta_level]
                    doc_metadata[f"level_{meta_level}_number_hierarchical"] = hierarchical_number(meta_level)

            cleaned_doc_metadata = {k: clean_metadata_for_chroma(v) for k, v in doc_metadata.items() if v is not None}
            all_book_documents_with_hierarchy.append(Document(page_content=element_text, metadata=cleaned_doc_metadata))

    # --- For PDF: Basic page-level processing ---
    elif file_extension == ".pdf":
        logger.warning("PDF processing: Using pages as primary elements. Hierarchical ToC will be page-based.")
        pdf_loader = PyPDFLoader(book_path)
        try:
            pages = pdf_loader.load()
            if not pages:
                raise ValueError("No pages loaded from PDF.")
            logger.info(f"Loaded {len(pages)} pages from PDF.")
        except Exception as e:
            logger.error(f"Error loading PDF: {e}", exc_info=True)
            raise

        pdf_main_toc_entry = {"title": "PDF Content (Page by Page)", "level": 1, "number_hierarchical": "PDF", "sections": []}
        hierarchical_toc.append(pdf_main_toc_entry)

        for i, page_doc in enumerate(pages):
            # NOTE(review): the fallback is 1-based (i + 1) while the loader's
            # own 'page' value may be 0-based -- confirm which numbering is wanted.
            page_num = page_doc.metadata.get('page', i + 1)
            page_title = f"Page {page_num}"

            pdf_main_toc_entry["sections"].append({"title": page_title, "level": 2, "number_hierarchical": f"PDF.{page_num}", "sections": []})

            doc_metadata = {
                "source": os.path.basename(book_path),
                "page_number": page_num,
                "level_1_title": "PDF Content",
                "level_1_number_hierarchical": "PDF",
                "level_2_title": page_title,
                "level_2_number_hierarchical": f"PDF.{page_num}"
            }
            cleaned_doc_metadata = {k: clean_metadata_for_chroma(v) for k, v in doc_metadata.items() if v is not None}
            all_book_documents_with_hierarchy.append(Document(page_content=page_doc.page_content, metadata=cleaned_doc_metadata))

    else:
        raise ValueError(f"Unsupported book file format: {file_extension}")

    if not all_book_documents_with_hierarchy:
        logger.error("No documents processed from the book.")
        return [], []

    logger.info(f"Total documents processed with hierarchical metadata: {len(all_book_documents_with_hierarchy)}")

    # Split into final chunks; each chunk inherits the metadata of its source document.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len
    )
    final_chunks = text_splitter.split_documents(all_book_documents_with_hierarchy)
    logger.info(f"Split into {len(final_chunks)} final chunks for vector store.")

    return final_chunks, hierarchical_toc

   

# --- Main execution block for Jupyter ---

# End-to-end run of this cell: wipe the previous vector store, process the
# book, save the ToC as JSON, then embed the chunks into a fresh Chroma
# collection.  Steps 3 and 4 log their own errors instead of raising, so a
# failure in one does not stop the cell.

# 1. Delete old DB if it exists (for clean runs during development)
# NOTE: this removes the whole persist directory on every run of the cell.
if os.path.exists(CHROMA_PERSIST_DIR_HIERARCHICAL):
    logger.info(f"Deleting existing ChromaDB directory: {CHROMA_PERSIST_DIR_HIERARCHICAL}")
    shutil.rmtree(CHROMA_PERSIST_DIR_HIERARCHICAL)

# 2. Process the book: Get chunks with hierarchical metadata and the hierarchical ToC
# Loader errors propagate from here and abort the cell.
logger.info(f"Using book: {BOOK_PATH}")
final_chunks_for_db, book_hierarchical_toc = process_book_for_hierarchical_toc_and_chunks(BOOK_PATH)

# 3. Save the Hierarchical Table of Contents
if book_hierarchical_toc:
    try:
        with open(OUTPUT_HIERARCHICAL_TOC_JSON, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII titles readable in the file.
            json.dump(book_hierarchical_toc, f, indent=2, ensure_ascii=False)
        logger.info(f"Hierarchical Table of Contents saved to: {OUTPUT_HIERARCHICAL_TOC_JSON}")
        # You can print a snippet for quick review in Jupyter
        # print("\n--- Sample of Hierarchical ToC ---")
        # print(json.dumps(book_hierarchical_toc[:2], indent=2)) # Print first 2 chapters
        # print("...")
    except Exception as e:
        logger.error(f"Error saving hierarchical ToC: {e}", exc_info=True)
else:
    logger.warning("No hierarchical ToC was generated to save.")

# 4. Create and Persist the Vector Database with hierarchical chunks
if final_chunks_for_db:
    logger.info("Initializing Ollama embedding model...")
    try:
        embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
        logger.info(f"Creating/Rebuilding vector database at: {CHROMA_PERSIST_DIR_HIERARCHICAL}")
        vector_db = Chroma.from_documents(
            documents=final_chunks_for_db, # These chunks have the hierarchical metadata
            embedding=embedding_model,
            persist_directory=CHROMA_PERSIST_DIR_HIERARCHICAL,
            collection_name=CHROMA_COLLECTION_NAME_HIERARCHICAL
        )
        logger.info(f"Vector database created/updated with {len(final_chunks_for_db)} chunks successfully.")
        # NOTE(review): `_collection` is a private attribute of the Chroma
        # wrapper; it is read here only to report the stored item count.
        logger.info(f"ChromaDB collection count: {vector_db._collection.count()}")

        # Test a query (optional)
        # if vector_db._collection.count() > 0:
        #     test_query = "digital forensics history" # Example query
        #     logger.info(f"Testing DB with query: '{test_query}'")
        #     results = vector_db.similarity_search(test_query, k=2)
        #     if results:
        #         logger.info(f"Found {len(results)} results for test query.")
        #         for doc in results:
        #             logger.info(f"  - Metadata: {doc.metadata}")
        #             logger.info(f"    Content snippet: {doc.page_content[:100]}...")
        #     else:
        #         logger.info("Test query returned no results.")

    except Exception as e:
        # Covers both embedding-model initialisation and vector-store creation.
        logger.error(f"Error initializing embedding model or creating vector DB: {e}", exc_info=True)
else:
    logger.warning("No chunks were generated, so no vector database will be created.")

logger.info("Focused script for ToC extraction and DB creation finished.")

2025-06-15 21:45:55,596 - INFO - Using book: /home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub
2025-06-15 21:45:55,597 - INFO - Processing book: '/home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub' for hierarchical ToC and chunk metadata.
  data file translations/en.yaml not found
  data file translations/en.yaml not found


2025-06-15 21:46:03,650 - INFO - Loaded 11815 elements from EPUB.
2025-06-15 21:46:03,765 - INFO - Total documents processed with hierarchical metadata: 11483
2025-06-15 21:46:04,151 - INFO - Split into 11774 final chunks for vector store.
2025-06-15 21:46:04,164 - INFO - Hierarchical Table of Contents saved to: ./book_hierarchical_toc_pars

# Extract TOC from epub

In [8]:
import re
import json
from ebooklib import epub, ITEM_NAVIGATION
from bs4 import BeautifulSoup

# No need to ignore the XML warning, as we'll be parsing XML correctly.

# Title prefixes (compared case-insensitively) of ToC entries to drop so the
# extracted ToC stays clean.
IGNORE_KEYWORDS = [
    'note', 'tip', 'caution', 'list of illustrations', 'list of tables',
    'copyright statement', 'hands-on project'
]


def _is_ignorable_title(title):
    """Return True for an empty title, a bare number such as ``12.``, or an ignored prefix."""
    if not title or re.fullmatch(r'\d+\.?', title):
        return True
    # str.startswith accepts a tuple, so all keywords are tested in one call.
    return title.lower().startswith(tuple(IGNORE_KEYWORDS))


def parse_navpoint(navpoint, level=0):
    """
    Recursively parses an NCX <navPoint> element and its children.
    This is for the EPUB 2 format.

    Args:
        navpoint: A parsed <navPoint> tag.
        level: Depth assigned to this entry (0 for top-level entries).

    Returns:
        A dict with "level", "title" and "children" keys, or None when the
        entry has no label of its own or its title is filtered out.  A
        filtered entry drops its whole subtree.
    """
    # The title is in the <navLabel>/<text> tags.  Only a direct child label
    # is used, so an unlabeled navPoint is skipped instead of crashing or
    # borrowing the label of one of its descendants.
    nav_label = navpoint.find('navLabel', recursive=False)
    if nav_label is None:
        return None
    title = nav_label.get_text().strip()

    # --- Filtering Logic ---
    if _is_ignorable_title(title):
        return None

    node = {
        "level": level,
        "title": title,
        "children": []
    }

    # Recursively process child <navPoint> elements
    for child_navpoint in navpoint.find_all('navPoint', recursive=False):
        child_node = parse_navpoint(child_navpoint, level + 1)
        if child_node:
            node["children"].append(child_node)

    return node


def parse_li(li_element, level=0):
    """
    Recursively parses an XHTML <li> element and its children.
    This is for the EPUB 3 format.

    Args:
        li_element: A parsed <li> tag from a navigation list.
        level: Depth assigned to this entry (0 for top-level entries).

    Returns:
        A dict with "level", "title" and "children" keys, or None when the
        item has no label of its own or its title is filtered out.
    """
    # The label is the item's own <a> (linked) or <span> (unlinked heading).
    label = li_element.find(['a', 'span'], recursive=False)
    if label is None:
        # Tolerate wrapped links (e.g. <li><p><a>...), but never take a link
        # that belongs to a nested list: those entries are parsed as children
        # and must not lend their title to the parent.
        nested_links = {id(a) for ol in li_element.find_all('ol') for a in ol.find_all('a')}
        label = next((a for a in li_element.find_all('a') if id(a) not in nested_links), None)
    if label is None:
        return None

    title = label.get_text(strip=True)
    if _is_ignorable_title(title):
        return None

    node = {
        "level": level,
        "title": title,
        "children": []
    }

    # Prefer the item's direct child list; fall back to the first nested one.
    nested_ol = li_element.find('ol', recursive=False)
    if nested_ol is None:
        nested_ol = li_element.find('ol')
    if nested_ol is not None:
        for sub_li in nested_ol.find_all('li', recursive=False):
            child_node = parse_li(sub_li, level + 1)
            if child_node:
                node["children"].append(child_node)

    return node

def extract_toc_as_json(epub_path, output_json_path):
    """
    Extracts a clean, hierarchical ToC from an EPUB, supporting both
    EPUB 2 (.ncx) and EPUB 3 (.xhtml) formats, and saves it as a JSON file.

    Args:
        epub_path: Path of the EPUB file to read.
        output_json_path: Path of the JSON file to write.  The file is
            written even when no ToC entry was extracted (as an empty list).

    Returns:
        None.  Progress and errors are reported with print(); exceptions are
        caught and reported rather than raised.
    """
    toc_data = []
    try:
        book = epub.read_epub(epub_path)
        print(f"Processing ToC for: {epub_path}")

        # Candidate navigation documents: items typed as navigation first,
        # then EPUB 3 nav documents.
        # NOTE(review): ebooklib appears to type the EPUB 3 nav document as a
        # regular document item, so it is collected explicitly here -- confirm
        # against the installed ebooklib version.
        nav_items = list(book.get_items_of_type(ITEM_NAVIGATION))
        nav_items += [
            item for item in book.get_items()
            if isinstance(item, epub.EpubNav) and item not in nav_items
        ]

        for nav_item in nav_items:
            soup = BeautifulSoup(nav_item.get_content(), 'xml')

            # --- CHECK 1: Is it an EPUB 2 (.ncx) file? ---
            # Compared case-insensitively so "TOC.NCX" is recognised too.
            if nav_item.get_name().lower().endswith('.ncx'):
                print("INFO: Found EPUB 2 (NCX) Table of Contents.")
                # The main container in an NCX file is the <navMap>
                navmap = soup.find('navMap')
                if navmap:
                    # We process the top-level <navPoint> tags
                    for navpoint in navmap.find_all('navPoint', recursive=False):
                        node = parse_navpoint(navpoint)
                        if node:
                            toc_data.append(node)

            # --- CHECK 2: Is it an EPUB 3 (.xhtml) file? ---
            else:
                print("INFO: Found EPUB 3 (XHTML) Table of Contents.")
                # The main container is <nav epub:type="toc">
                toc_nav = soup.select_one('nav[epub|type="toc"]')
                if toc_nav is None:
                    # Fallback on the literal attribute name in case the
                    # namespaced selector does not match.
                    toc_nav = soup.find('nav', attrs={'epub:type': 'toc'})
                if toc_nav:
                    top_ol = toc_nav.find('ol')
                    if top_ol:
                        for li in top_ol.find_all('li', recursive=False):
                            node = parse_li(li)
                            if node:
                                toc_data.append(node)

            # If we successfully extracted a ToC, we can stop.
            if toc_data:
                print("INFO: Table of Contents extracted successfully.")
                break

        if not toc_data:
            print("\nWARNING: Finished processing, but no valid ToC data was extracted.")

        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(toc_data, f, indent=2, ensure_ascii=False)

        print(f"\n✅ Successfully wrote hierarchical ToC to: {output_json_path}")

    except FileNotFoundError as e:
        # The missing path may be the EPUB or the output location; report the
        # one the exception names, falling back to the EPUB path.
        print(f"ERROR: The file was not found at {e.filename or epub_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

# --- Usage Example ---
# Runs the EPUB ToC extraction for one book and writes the result as JSON.
# Both paths are machine-specific absolute paths; adjust them before running.
epub_file = "/home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub"
json_output_file = "/home/sebas_dev_linux/projects/course_generator/Parse_data/Parse_TOC_books/epub_table_of_contents.json"

extract_toc_as_json(epub_file, json_output_file)

Processing ToC for: /home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub
INFO: Found EPUB 2 (NCX) Table of Contents.
INFO: Table of Contents extracted successfully.

✅ Successfully wrote hierarchical ToC to: /home/sebas_dev_linux/projects/course_generator/Parse_data/Parse_TOC_books/epub_table_of_contents.json


# Extract TOC from PDF

In [12]:
import fitz  # The PyMuPDF library
import json

def build_hierarchy(toc_list):
    """
    Converts a flat list from PyMuPDF's get_toc() into a nested hierarchy.

    Args:
        toc_list: Iterable of ``[level, title, page, ...]`` rows with 1-based
            levels.  Items after the third one are ignored.

    Returns:
        A list of top-level nodes; each node is a dict with "level", "title"
        (stripped), "page" and "children" (nested nodes in input order).

    A row whose level skips ahead (for example level 3 directly after
    level 1, or a first row deeper than level 1) is attached to the nearest
    shallower entry that is still open instead of raising KeyError.
    """
    root = []
    # Maps a level to the node currently open at that level; level 0 is a
    # virtual parent holding the top-level list.
    parent_stack = {0: {"children": root}}

    for level, title, page, *_ in toc_list:
        node = {
            "level": level,
            "title": title.strip(),
            "page": page,
            "children": []
        }

        # Levels below 1 are treated as top-level for placement purposes.
        depth = max(level, 1)

        # Entries at this depth or deeper are closed by the new node; removing
        # them prevents a later, deeper row from attaching to a stale parent
        # that belongs to an earlier branch.
        for stale_level in [lvl for lvl in parent_stack if lvl >= depth]:
            del parent_stack[stale_level]

        # The deepest remaining entry is the nearest open ancestor.
        parent_stack[max(parent_stack)]["children"].append(node)

        # The current node becomes the new parent for any subsequent, deeper nodes
        parent_stack[depth] = node

    return root

def extract_pdf_bookmarks_to_json(pdf_path, output_json_path):
    """
    Extracts the bookmarks (outline) from a PDF file, builds a hierarchy,
    and saves it as a JSON file.

    Args:
        pdf_path: Path of the PDF file to read.
        output_json_path: Path of the JSON file to write.  When the PDF has
            no bookmarks, an empty JSON list is written.

    Returns:
        None.  Progress and errors are reported with print(); exceptions are
        caught and reported rather than raised.
    """
    print(f"Processing PDF: {pdf_path}")
    try:
        # The document is only needed to read the outline; the context
        # manager closes it even if get_toc() fails.
        with fitz.open(pdf_path) as doc:
            # get_toc() returns a flat list of bookmarks like: [[lvl, title, page], ...]
            # For example: [[1, "Chapter 1", 10], [2, "Section 1.1", 12]]
            toc = doc.get_toc()

        if not toc:
            print("WARNING: This PDF has no embedded bookmarks (Table of Contents). Cannot extract structure.")
            # Create an empty JSON file
            with open(output_json_path, 'w', encoding='utf-8') as f:
                json.dump([], f)
            return

        print(f"INFO: Found {len(toc)} bookmark entries.")

        # Convert the flat list into a nested, hierarchical structure
        hierarchical_toc = build_hierarchy(toc)

        # Write the hierarchical data to the JSON file
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(hierarchical_toc, f, indent=2, ensure_ascii=False)

        print(f"\n✅ Successfully wrote PDF bookmarks to: {output_json_path}")

    except FileNotFoundError:
        print(f"ERROR: The file was not found at {pdf_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

# --- Usage Example ---
# Runs the PDF bookmark extraction for one book and writes the result as JSON.
# Replace this with the path to your PDF file
pdf_file_path = "/home/sebas_dev_linux/projects/course_generator/data/books/(Chapman & Hall_CRC Cryptography and Network Security Series) Jonathan Katz, Yehuda Lindell - Introduction to Modern Cryptography-CRC Press (2020).pdf" 
# NOTE: rebinds `json_output_file`, the same name the EPUB usage example uses.
json_output_file = "/home/sebas_dev_linux/projects/course_generator/Parse_data/Parse_TOC_books/pdf_table_of_contents.json"

extract_pdf_bookmarks_to_json(pdf_file_path, json_output_file)

Processing PDF: /home/sebas_dev_linux/projects/course_generator/data/books/(Chapman & Hall_CRC Cryptography and Network Security Series) Jonathan Katz, Yehuda Lindell - Introduction to Modern Cryptography-CRC Press (2020).pdf
INFO: Found 290 bookmark entries.

✅ Successfully wrote PDF bookmarks to: /home/sebas_dev_linux/projects/course_generator/Parse_data/Parse_TOC_books/pdf_table_of_contents.json


In [1]:
import os
import json
import logging
import re
import shutil
from typing import List, Dict, Any, Optional, Tuple

# --- LangChain & Pydantic Imports ---
from langchain_core.documents import Document
from pydantic import BaseModel, Field # Pydantic not directly used in this script but often in surrounding code
from langchain_community.document_loaders import PyPDFLoader, UnstructuredEPubLoader # For loading full text
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Setup Logging ---
# Configure the root logger only when no handler is installed yet, so that
# re-running the cell does not add duplicate handlers.
if not logging.getLogger().hasHandlers():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# --- Configuration ---
# Determine book type and corresponding ToC path
# IS_EPUB only selects which pair of paths below is used.
IS_EPUB = True # Set to False if processing a PDF

if IS_EPUB:
    BOOK_PATH = "/home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub"
    # Assume you've run your ebooklib script and saved its output here:
    # NOTE(review): this is a relative path, while the extraction cell writes
    # to an absolute path -- confirm both resolve to the same file.
    PRE_EXTRACTED_TOC_JSON_PATH = "./epub_table_of_contents.json"
else: # Example for PDF
    BOOK_PATH = "/home/sebas_dev_linux/projects/course_generator/data/books/(Chapman & Hall_CRC Cryptography and Network Security Series) Jonathan Katz, Yehuda Lindell - Introduction to Modern Cryptography-CRC Press (2020).pdf"
    # Assume you've run your PyMuPDF script and saved its output here:
    PRE_EXTRACTED_TOC_JSON_PATH = "./pdf_table_of_contents.json"

# Define output path for the re-saved ToC
OUTPUT_HIERARCHICAL_TOC_JSON = "./processed_book_table_of_contents.json"

# Chroma persist directory and collection name for the ToC-guided chunks.
CHROMA_PERSIST_DIR_HIERARCHICAL = "./chroma_db_book_toc_guided_chunks_v1"
CHROMA_COLLECTION_NAME_HIERARCHICAL = "book_toc_guided_chunks_v1"

EMBEDDING_MODEL_OLLAMA = "nomic-embed-text"
CHUNK_SIZE = 800  # Size of final chunks for ChromaDB
CHUNK_OVERLAP = 100

# --- Helper: Clean metadata values for ChromaDB ---
def clean_metadata_for_chroma(value: Any) -> Any:
    """Flatten a metadata value into a scalar before it is stored as chunk metadata.

    Args:
        value: Any metadata value.

    Returns:
        * lists    -> their items joined with ``", "`` (each item via ``str``),
        * dicts    -> a JSON string,
        * ``str`` / ``int`` / ``float`` / ``bool`` / ``None`` -> unchanged,
        * anything else -> ``str(value)``.
    """
    if isinstance(value, list):
        return ", ".join(str(item) for item in value)
    if isinstance(value, dict):
        return json.dumps(value)
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    return str(value)

# --- Core Function ---
def _flatten_toc(toc_nodes: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Flatten a nested ToC into depth-first records carrying their title path.

    Each record holds ``path_titles`` (titles from the top-level ancestor down
    to the entry itself), ``level``, ``title`` and ``page_start`` (the node's
    ``"page"`` value, which may be ``None``). Child nodes are read from either
    the ``"children"`` or the ``"sections"`` key.
    """
    flat_entries: List[Dict[str, Any]] = []

    def walk(nodes: List[Dict[str, Any]], ancestor_titles: List[str]) -> None:
        for node in nodes:
            # A missing or null title becomes "" instead of failing on .strip().
            title = (node.get("title") or "").strip()
            path_titles = ancestor_titles + [title]
            flat_entries.append({
                "path_titles": path_titles,
                # Fall back to the nesting depth when no explicit level is given.
                "level": node.get("level", len(ancestor_titles) + 1),
                "title": title,
                "page_start": node.get("page"),
            })
            children = node.get("children") or node.get("sections")
            if children:
                walk(children, path_titles)

    walk(toc_nodes, [])
    return flat_entries


def _assign_pdf_metadata(
    pages: List[Document],
    sorted_flat_toc: List[Dict[str, Any]],
    source_name: str,
) -> List[Document]:
    """Tag each PDF page with the title path of the ToC entry covering it.

    ``sorted_flat_toc`` must already be ordered by ``page_start``. Entries
    without a page number are ignored. A page belongs to the last entry whose
    start page is <= the page number, so when several entries start on the
    same page the last one in ToC order is used. Pages that precede every
    entry are labelled "Uncategorized PDF Content".
    """
    import bisect  # stdlib; imported locally because the file does not import it

    paged_entries = [e for e in sorted_flat_toc if e.get("page_start") is not None]
    page_starts = [e["page_start"] for e in paged_entries]

    tagged_pages: List[Document] = []
    for page_doc in pages:
        # The loader's "page" value is treated as 0-indexed and shifted by one
        # to compare against the ToC page numbers.
        page_number = page_doc.metadata.get("page", 0) + 1

        # Positional lookup: no list.index() call, so duplicate ToC records
        # cannot resolve to the wrong position and the search is O(log n).
        entry_idx = bisect.bisect_right(page_starts, page_number) - 1

        metadata: Dict[str, Any] = {"source": source_name, "page_number": page_number}
        if entry_idx >= 0:
            path_titles = paged_entries[entry_idx]["path_titles"]
            for depth, title in enumerate(path_titles, start=1):
                metadata[f"level_{depth}_title"] = title
        else:
            metadata["level_1_title"] = "Uncategorized PDF Content"

        cleaned = {k: clean_metadata_for_chroma(v) for k, v in metadata.items() if v is not None}
        tagged_pages.append(Document(page_content=page_doc.page_content, metadata=cleaned))
    return tagged_pages


def _assign_epub_metadata(elements: List[Document], source_name: str) -> List[Document]:
    """Tag EPUB elements with a heading hierarchy inferred from the text itself.

    Headings are detected with regexes (PART/BOOK/CHAPTER, ``1.1``, ``1.1.1``)
    or, for elements whose ``category`` metadata is "title", by filling the
    first still-empty level. Up to five levels are tracked. Every non-empty
    element is emitted with the hierarchy active at that point; elements seen
    before any heading are labelled "Book Introduction or Preamble".
    """
    max_levels = 5
    current_titles: Dict[int, Optional[str]] = {lvl: None for lvl in range(1, max_levels + 1)}
    current_numbers: Dict[int, int] = {lvl: 0 for lvl in range(1, max_levels + 1)}
    heading_patterns = [
        (1, re.compile(r"^(?:PART|BOOK)\s+[IVXLCDM\d]+", re.IGNORECASE)),
        (1, re.compile(r"^CHAPTER\s+\d+", re.IGNORECASE)),
        (2, re.compile(r"^\d+\.\d+\s+[^0-9.]", re.IGNORECASE)),  # e.g. 1.1 Section Title
        (3, re.compile(r"^\d+\.\d+\.\d+\s+[^0-9.]", re.IGNORECASE)),  # e.g. 1.1.1 Subsection
    ]
    # Lower-cased once so the per-element check is a single startswith() call.
    ignored_prefixes = tuple(
        prefix.lower()
        for prefix in ("Note:", "Tip:", "Figure", "Table", "Listing", "Caution:", "Example:", "Source:", "Credit:")
    )

    tagged_elements: List[Document] = []
    for element_doc in elements:
        text = element_doc.page_content.strip() if element_doc.page_content else ""
        category = element_doc.metadata.get("category", "").lower()
        if not text:
            continue

        # Short text, or anything the loader categorised as a title, may be a heading.
        looks_like_heading = (category == "title" or len(text.split()) < 15) and len(text) < 250
        is_ignorable = text.lower().startswith(ignored_prefixes)

        level_detected = 0
        if looks_like_heading and not is_ignorable:
            level_detected = next((lvl for lvl, rgx in heading_patterns if rgx.match(text)), 0)
            if not level_detected and category == "title":
                # No numbering pattern matched: fill the first empty level,
                # capping at the deepest supported level.
                level_detected = next(
                    (lvl for lvl in range(1, max_levels) if not current_titles[lvl]),
                    max_levels,
                )

        if level_detected:
            # A new heading invalidates every deeper level below it.
            for deeper in range(level_detected + 1, max_levels + 1):
                current_titles[deeper], current_numbers[deeper] = None, 0
            current_numbers[level_detected] += 1
            current_titles[level_detected] = text

        metadata: Dict[str, Any] = {"source": source_name, "original_category": category}
        has_hierarchy_info = False
        for lvl in range(1, max_levels + 1):
            if not current_titles[lvl]:
                continue
            metadata[f"level_{lvl}_title"] = current_titles[lvl]
            # Hierarchical numbering such as 1.2.1, built from the populated ancestor levels.
            number_parts = [str(current_numbers[up]) for up in range(1, lvl + 1) if current_titles[up]]
            metadata[f"level_{lvl}_number_hierarchical"] = ".".join(number_parts)
            has_hierarchy_info = True

        if not has_hierarchy_info:
            # Applies to every element before the first heading, not only the
            # very first one, so no preamble chunk is left without a level title.
            metadata["level_1_title"] = "Book Introduction or Preamble"

        cleaned = {k: clean_metadata_for_chroma(v) for k, v in metadata.items() if v is not None}
        tagged_elements.append(Document(page_content=text, metadata=cleaned))
    return tagged_elements


def process_book_with_extracted_toc(
    book_path: str,
    extracted_toc_path: str
) -> Tuple[List[Document], List[Dict[str, Any]]]:
    """Load a book, attach hierarchical section metadata, and split it into chunks.

    Args:
        book_path: Path to an ``.epub`` or ``.pdf`` file (extension match is
            case-insensitive).
        extracted_toc_path: Path to a JSON file holding the pre-extracted ToC as
            a list of nested entries.

    Returns:
        ``(chunks, toc)``. ``chunks`` are the documents produced by
        ``RecursiveCharacterTextSplitter`` using ``CHUNK_SIZE`` and
        ``CHUNK_OVERLAP``. ``toc`` is the loaded ToC. Failures are logged rather
        than raised: ``([], [])`` when the ToC cannot be loaded or is
        empty/not a list, and ``([], toc)`` when the book cannot be loaded, has
        an unsupported extension, or yields no content.
    """
    logger.info(f"Processing book '{book_path}' using pre-extracted ToC from '{extracted_toc_path}'.")

    # 1. Load the pre-extracted hierarchical ToC
    try:
        with open(extracted_toc_path, 'r', encoding='utf-8') as f:
            hierarchical_toc = json.load(f)
    except Exception as e:
        logger.error(f"Error loading pre-extracted ToC JSON from '{extracted_toc_path}': {e}", exc_info=True)
        return [], []
    if not hierarchical_toc:
        logger.error(f"Pre-extracted ToC at '{extracted_toc_path}' is empty.")
        return [], []
    if not isinstance(hierarchical_toc, list):
        # The flattening step iterates a list of dict nodes; fail cleanly here
        # instead of crashing there.
        logger.error(
            f"Pre-extracted ToC at '{extracted_toc_path}' must be a JSON list of entries, "
            f"got {type(hierarchical_toc).__name__}."
        )
        return [], []
    logger.info(f"Successfully loaded pre-extracted hierarchical ToC with {len(hierarchical_toc)} top-level entries.")

    # 2. Load full book content
    _, file_extension = os.path.splitext(book_path.lower())
    if file_extension not in (".epub", ".pdf"):
        logger.error(f"Unsupported book file format: {file_extension}")
        return [], hierarchical_toc

    is_epub = file_extension == ".epub"
    format_label = "EPUB" if is_epub else "PDF"
    try:
        # Loader construction sits inside the try so that constructor errors are
        # logged and reported the same way as load() errors.
        if is_epub:
            loader = UnstructuredEPubLoader(book_path, mode="elements", strategy="fast")
        else:
            loader = PyPDFLoader(book_path)
        all_text_elements: List[Document] = loader.load()
    except Exception as e:
        logger.error(f"Error loading {format_label} content for text: {e}", exc_info=True)
        return [], hierarchical_toc

    loaded_unit = "text elements from EPUB" if is_epub else "pages from PDF"
    logger.info(f"Loaded {len(all_text_elements)} {loaded_unit}.")

    if not all_text_elements:
        logger.error("No text elements loaded from the book.")
        return [], hierarchical_toc

    # 3. Flatten the ToC and order it by start page. Entries without a page
    #    number sort first (treated as -1); the sort is stable, so entries that
    #    share a page keep their ToC order.
    flat_toc_with_paths = _flatten_toc(hierarchical_toc)
    if flat_toc_with_paths:
        flat_toc_with_paths.sort(
            key=lambda entry: -1 if entry["page_start"] is None else entry["page_start"]
        )
        logger.info(f"Flattened and sorted ToC with {len(flat_toc_with_paths)} entries.")

    # 4. Assign hierarchical metadata to each loaded document
    source_name = os.path.basename(book_path)
    if is_epub:
        logger.warning("For EPUB, using element-based heuristic for assigning hierarchical metadata to chunks. The external ToC is primarily for output and less for precise text segmentation in this version.")
        final_documents_with_metadata = _assign_epub_metadata(all_text_elements, source_name)
    else:
        logger.info("Processing PDF: Assigning metadata based on ToC page ranges.")
        final_documents_with_metadata = _assign_pdf_metadata(
            all_text_elements, flat_toc_with_paths, source_name
        )

    if not final_documents_with_metadata:
        logger.error("No documents were processed or enriched with hierarchical metadata.")
        return [], hierarchical_toc

    logger.info(f"Total documents prepared for chunking: {len(final_documents_with_metadata)}")

    # 5. Split into final chunks; each chunk inherits its parent document's metadata.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        add_start_index=False
    )
    final_chunks = text_splitter.split_documents(final_documents_with_metadata)
    logger.info(f"Split into {len(final_chunks)} final chunks for vector store, inheriting hierarchical metadata.")

    # Log a sample of metadata from the first few chunks
    for i, chunk in enumerate(final_chunks[:3]):
        logger.debug(f"Sample Chunk {i} Metadata: {chunk.metadata}")

    return final_chunks, hierarchical_toc


# --- Main execution block ---
# Runs the whole pipeline: load the ToC and book, re-save the ToC, rebuild the
# Chroma vector store from the chunks, then run a small verification query.

# 1. Check if pre-extracted ToC JSON exists
if not os.path.exists(PRE_EXTRACTED_TOC_JSON_PATH):
    logger.error(f"CRITICAL: Pre-extracted ToC file not found at '{PRE_EXTRACTED_TOC_JSON_PATH}'.")
    logger.error("Please run your EPUB or PDF ToC extraction script first and ensure the output is at this path.")
    book_hierarchical_toc_loaded = []
    final_chunks_for_db = []
else:
    # 2. Process the book using the pre-extracted ToC
    logger.info(f"Using book: {BOOK_PATH} and pre-extracted ToC: {PRE_EXTRACTED_TOC_JSON_PATH}")
    final_chunks_for_db, book_hierarchical_toc_loaded = process_book_with_extracted_toc(
        BOOK_PATH,
        PRE_EXTRACTED_TOC_JSON_PATH
    )

# 3. Save the (loaded) Hierarchical Table of Contents
if book_hierarchical_toc_loaded:
    try:
        with open(OUTPUT_HIERARCHICAL_TOC_JSON, 'w', encoding='utf-8') as f:
            json.dump(book_hierarchical_toc_loaded, f, indent=2, ensure_ascii=False)
        logger.info(f"Pre-extracted Hierarchical Table of Contents was loaded and re-saved to: {OUTPUT_HIERARCHICAL_TOC_JSON}")
    except Exception as e:
        logger.error(f"Error re-saving hierarchical ToC: {e}", exc_info=True)
else:
    logger.warning("No hierarchical ToC was loaded (or loading failed), so nothing to re-save.")


# 4. Create and Persist the Vector Database
# The store is rebuilt from scratch, so a previous persist directory is removed
# first. If that removal fails, the rebuild is skipped rather than adding the
# new chunks on top of whatever is left in the old directory.
chroma_dir_cleared = True
if final_chunks_for_db and os.path.exists(CHROMA_PERSIST_DIR_HIERARCHICAL):
    logger.info(f"Deleting existing ChromaDB directory: {CHROMA_PERSIST_DIR_HIERARCHICAL}")
    try:
        shutil.rmtree(CHROMA_PERSIST_DIR_HIERARCHICAL)
    except OSError as e:
        logger.error(f"Error deleting ChromaDB directory {CHROMA_PERSIST_DIR_HIERARCHICAL}: {e}", exc_info=True)
        chroma_dir_cleared = False

if not final_chunks_for_db:
    logger.warning("No chunks were generated, so no vector database will be created.")
elif not chroma_dir_cleared:
    logger.error(
        "Skipping vector database creation because the existing ChromaDB directory "
        f"'{CHROMA_PERSIST_DIR_HIERARCHICAL}' could not be removed."
    )
else:
    logger.info("Initializing Ollama embedding model...")
    try:
        embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
        logger.info(f"Creating/Rebuilding vector database at: {CHROMA_PERSIST_DIR_HIERARCHICAL}")

        # Ensure metadata is clean one last time
        for chunk in final_chunks_for_db:
            chunk.metadata = {k: clean_metadata_for_chroma(v) for k, v in chunk.metadata.items()}

        # persist_directory is passed here; no separate persist() call is made.
        vector_db = Chroma.from_documents(
            documents=final_chunks_for_db,
            embedding=embedding_model,
            persist_directory=CHROMA_PERSIST_DIR_HIERARCHICAL,
            collection_name=CHROMA_COLLECTION_NAME_HIERARCHICAL
        )
        logger.info(f"Vector database created/updated with {len(final_chunks_for_db)} chunks and persisted to {CHROMA_PERSIST_DIR_HIERARCHICAL}.")

        # Verify the result through a fresh instance opened on the persisted
        # directory, using the same embedding model for querying.
        db_reloaded = Chroma(
            persist_directory=CHROMA_PERSIST_DIR_HIERARCHICAL,
            embedding_function=embedding_model,
            collection_name=CHROMA_COLLECTION_NAME_HIERARCHICAL
        )
        # NOTE(review): _collection is a private attribute of the Chroma wrapper
        # and may change between library versions.
        collection_count = db_reloaded._collection.count()
        logger.info(f"ChromaDB collection count after reload: {collection_count}")

        if collection_count > 0:
            test_query = "digital evidence" if IS_EPUB else "cryptography basics"
            logger.info(f"Attempting test query: '{test_query}'")
            results = db_reloaded.similarity_search(test_query, k=2)
            if results:
                logger.info(f"Found {len(results)} results for test query '{test_query}':")
                for doc_idx, doc in enumerate(results):
                    logger.info(f"  Result {doc_idx+1} Metadata: {doc.metadata}")
            else:
                logger.info(f"Test query '{test_query}' returned no results.")
        else:
            logger.warning("No documents in collection for test query.")

    except Exception as e:
        logger.error(f"Error initializing embedding model or creating/querying vector DB: {e}", exc_info=True)

logger.info("Focused script for ToC-guided DB creation finished.")

2025-06-16 00:06:40,750 - INFO - Using book: /home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub and pre-extracted ToC: ./epub_table_of_contents.json
2025-06-16 00:06:40,751 - INFO - Processing book '/home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub' using pre-extracted ToC from './epub_table_of_contents.json'.
2025-06-16 00:06:40,752 - INFO - Successfully loaded pre-extracted hierarchical ToC with 28 top-level entries.
2025-06-16 00:06:42,598 - INFO - Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.
2025-06-16 00:06:42,598 - INFO - NumExpr defaulting to 16 threads.
  data file translations/en.yaml not found
  dat