# Set up Paths 

In [1]:
# Cell 1: Setup and Configuration
import os
import re
import logging
import warnings
from docx import Document
import pdfplumber
import ollama
from tenacity import retry, stop_after_attempt, wait_exponential, RetryError
import json

# Setup Logger for this cell
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 1. CORE SETTINGS ---
# Set this to True for EPUB, False for PDF. This controls the entire notebook's flow.
PROCESS_EPUB = True # for EPUB
# PROCESS_EPUB = False # for PDF

# --- 2. INPUT FILE NAMES ---
# The name of the Unit Outline file (e.g., DOCX, PDF)
UNIT_OUTLINE_FILENAME = "ICT312 Digital Forensic_Final.docx" # epub
# UNIT_OUTLINE_FILENAME = "ICT311 Applied Cryptography.docx" # pdf


# The names of the book files
EPUB_BOOK_FILENAME = "Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub"
PDF_BOOK_FILENAME = "(Chapman & Hall_CRC Cryptography and Network Security Series) Jonathan Katz, Yehuda Lindell - Introduction to Modern Cryptography-CRC Press (2020).pdf"

# --- 3. DIRECTORY STRUCTURE ---
# Define the base path to your project to avoid hardcoding long paths everywhere
PROJECT_BASE_DIR = "/home/sebas_dev_linux/projects/course_generator"

# Define subdirectories relative to the base path
DATA_DIR = os.path.join(PROJECT_BASE_DIR, "data")
PARSE_DATA_DIR = os.path.join(PROJECT_BASE_DIR, "Parse_data")

# Construct full paths for clarity
INPUT_UO_DIR = os.path.join(DATA_DIR, "UO")
INPUT_BOOKS_DIR = os.path.join(DATA_DIR, "books")
OUTPUT_PARSED_UO_DIR = os.path.join(PARSE_DATA_DIR, "Parse_UO")
OUTPUT_PARSED_TOC_DIR = os.path.join(PARSE_DATA_DIR, "Parse_TOC_books")
OUTPUT_DB_DIR = os.path.join(DATA_DIR, "DataBase_Chroma")

# --- 4. LLM & EMBEDDING CONFIGURATION ---
LLM_PROVIDER = "ollama"  # Can be "ollama", "openai", "gemini"
OLLAMA_HOST = "http://localhost:11434"
OLLAMA_MODEL = "qwen3:8b" # "qwen3:8b", #"mistral:latest"
EMBEDDING_MODEL_OLLAMA = "nomic-embed-text"
CHUNK_SIZE = 800
CHUNK_OVERLAP = 100

# --- 5. DYNAMICALLY GENERATED PATHS & IDs (DO NOT EDIT THIS SECTION) ---
# This section uses the settings above to create all the necessary variables for later cells.

# Extract Unit ID from the filename
def extract_uo_id_from_filename(filename: str) -> str:
    match = re.match(r'^[A-Z]+\d+', os.path.basename(filename))
    if match:
        return match.group(0)
    raise ValueError(f"Could not extract a valid Unit ID from filename: '{filename}'")

try:
    UNIT_ID = extract_uo_id_from_filename(UNIT_OUTLINE_FILENAME)
except ValueError as e:
    print(f"Error: {e}")
    UNIT_ID = "UNKNOWN_ID"

# Full path to the unit outline file
FULL_PATH_UNIT_OUTLINE = os.path.join(INPUT_UO_DIR, UNIT_OUTLINE_FILENAME)

# Determine which book and output paths to use based on the PROCESS_EPUB flag
if PROCESS_EPUB:
    BOOK_PATH = os.path.join(INPUT_BOOKS_DIR, EPUB_BOOK_FILENAME)
    PRE_EXTRACTED_TOC_JSON_PATH = os.path.join(OUTPUT_PARSED_TOC_DIR, f"{UNIT_ID}_epub_table_of_contents.json")
else:
    BOOK_PATH = os.path.join(INPUT_BOOKS_DIR, PDF_BOOK_FILENAME)
    PRE_EXTRACTED_TOC_JSON_PATH = os.path.join(OUTPUT_PARSED_TOC_DIR, f"{UNIT_ID}_pdf_table_of_contents.json")

# Define paths for the vector database
file_type_suffix = 'epub' if PROCESS_EPUB else 'pdf'
CHROMA_PERSIST_DIR = os.path.join(OUTPUT_DB_DIR, f"chroma_db_toc_guided_chunks_{file_type_suffix}_v2")
CHROMA_COLLECTION_NAME = f"book_toc_guided_chunks_{file_type_suffix}_v2"

# Define path for the parsed unit outline
PARSED_UO_JSON_PATH = os.path.join(OUTPUT_PARSED_UO_DIR, f"{os.path.splitext(UNIT_OUTLINE_FILENAME)[0]}_parsed.json")

# --- Sanity Check Printout ---
print("--- CONFIGURATION SUMMARY ---")
print(f"Processing Mode: {'EPUB' if PROCESS_EPUB else 'PDF'}")
print(f"Unit ID: {UNIT_ID}")
print(f"Unit Outline Path: {FULL_PATH_UNIT_OUTLINE}")
print(f"Book Path: {BOOK_PATH}")
print(f"Parsed UO Output Path: {PARSED_UO_JSON_PATH}")
print(f"Parsed ToC Output Path: {PRE_EXTRACTED_TOC_JSON_PATH}")
print(f"Vector DB Path: {CHROMA_PERSIST_DIR}")
print(f"Vector DB Collection: {CHROMA_COLLECTION_NAME}")
print("--- SETUP COMPLETE ---")

--- CONFIGURATION SUMMARY ---
Processing Mode: EPUB
Unit ID: ICT312
Unit Outline Path: /home/sebas_dev_linux/projects/course_generator/data/UO/ICT312 Digital Forensic_Final.docx
Book Path: /home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub
Parsed UO Output Path: /home/sebas_dev_linux/projects/course_generator/Parse_data/Parse_UO/ICT312 Digital Forensic_Final_parsed.json
Parsed ToC Output Path: /home/sebas_dev_linux/projects/course_generator/Parse_data/Parse_TOC_books/ICT312_epub_table_of_contents.json
Vector DB Path: /home/sebas_dev_linux/projects/course_generator/data/DataBase_Chroma/chroma_db_toc_guided_chunks_epub_v2
Vector DB Collection: book_toc_guided_chunks_epub_v2
--- SETUP COMPLETE ---


# System Prompt

In [2]:
UNIT_OUTLINE_SYSTEM_PROMPT_TEMPLATE = """
You are an expert academic assistant tasked with parsing a university unit outline document and extracting key information into a structured JSON format.

The input will be the raw text content of a unit outline. Your goal is to identify and extract the following details and structure them precisely as specified in the JSON schema below. Note: do not change any key name

**JSON Output Schema:**

```json
{{
  "unitInformation": {{
    "unitCode": "string | null",
    "unitName": "string | null",
    "creditPoints": "integer | null",
    "unitRationale": "string | null",
    "prerequisites": "string | null"
  }},
  "learningOutcomes": [
    "string"
  ],
  "assessments": [
    {{
      "taskName": "string",
      "description": "string",
      "dueWeek": "string | null",
      "weightingPercent": "integer | null",
      "learningOutcomesAssessed": "string | null"
    }}
  ],
  "weeklySchedule": [
    {{
      "week": "string",
      "contentTopic": "string",
      "requiredReading": "string | null"
    }}
  ],
  "requiredReadings": [
    "string"
  ],
  "recommendedReadings": [
    "string"
  ]
}}

Instructions for Extraction:
Unit Information: Locate Unit Code, Unit Name, Credit Points. Capture 'Unit Overview / Rationale' as unitRationale. Identify prerequisites.
Learning Outcomes: Extract each learning outcome statement.
Assessments: Each task as an object. Capture full task name, description, Due Week, Weighting % (number), and Learning Outcomes Assessed.
weeklySchedule: Each week as an object. Capture Week, contentTopic, and requiredReading.
Required and Recommended Readings: List full text for each.
**Important Considerations for the LLM**:
Pay close attention to headings and table structures.
If information is missing, use null for string/integer fields, or an empty list [] for array fields.
Do no change keys in the template given
Ensure the output is ONLY the JSON object, starting with {{{{ and ending with }}}}. No explanations or conversational text before or after the JSON. 
Now, parse the following unit outline text:
--- UNIT_OUTLINE_TEXT_START ---
{outline_text}
--- UNIT_OUTLINE_TEXT_END ---
"""

In [3]:
# Place this in a new cell after your imports, or within Cell 3 before the functions.
# This code is based on the schema from your screenshot on page 4.

from pydantic import BaseModel, Field, ValidationError
from typing import List, Optional
import time

# Define Pydantic models that match your JSON schema
class UnitInformation(BaseModel):
    unitCode: Optional[str] = None
    unitName: Optional[str] = None
    creditPoints: Optional[int] = None
    unitRationale: Optional[str] = None
    prerequisites: Optional[str] = None

class Assessment(BaseModel):
    taskName: str
    description: str
    dueWeek: Optional[str] = None
    weightingPercent: Optional[int] = None
    learningOutcomesAssessed: Optional[str] = None

class WeeklyScheduleItem(BaseModel):
    week: str
    contentTopic: str
    requiredReading: Optional[str] = None

class ParsedUnitOutline(BaseModel):
    unitInformation: UnitInformation
    learningOutcomes: List[str]
    assessments: List[Assessment]
    weeklySchedule: List[WeeklyScheduleItem] 
    requiredReadings: List[str]
    recommendedReadings: List[str]

# Extrac Unit outline details to process following steps - output raw json with UO details 

In [4]:
# Cell 3: Parse Unit Outline


# --- Helper Functions for Parsing ---
def extract_text_from_file(filepath: str) -> str:
    _, ext = os.path.splitext(filepath.lower())
    if ext == '.docx':
        doc = Document(filepath)
        full_text = [p.text for p in doc.paragraphs]
        for table in doc.tables:
            for row in table.rows:
                full_text.append(" | ".join(cell.text for cell in row.cells))
        return '\n'.join(full_text)
    elif ext == '.pdf':
        with pdfplumber.open(filepath) as pdf:
            return "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
    else:
        raise TypeError(f"Unsupported file type: {ext}")

def parse_llm_json_output(content: str) -> dict:
    try:
        match = re.search(r'\{.*\}', content, re.DOTALL)
        if not match: return None
        return json.loads(match.group(0))
    except (json.JSONDecodeError, TypeError):
        return None

@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=10))
def call_ollama_with_retry(client, prompt):
    logger.info(f"Calling Ollama model '{OLLAMA_MODEL}'...")
    response = client.chat(
        model=OLLAMA_MODEL,
        messages=[{"role": "user", "content": prompt}],
        format="json",
        options={"temperature": 0.0}
    )
    if not response or 'message' not in response or not response['message'].get('content'):
        raise ValueError("Ollama returned an empty or invalid response.")
    return response['message']['content']

# --- Main Orchestration Function for this Cell ---
def parse_and_save_outline_robust(
    input_filepath: str, 
    output_filepath: str, 
    prompt_template: str,
    max_retries: int = 3
):
    logger.info(f"Starting to robustly process Unit Outline: {input_filepath}")
    
    if not os.path.exists(input_filepath):
        logger.error(f"Input file not found: {input_filepath}")
        return

    try:
        outline_text = extract_text_from_file(input_filepath)
        if not outline_text.strip():
            logger.error("Extracted text is empty. Aborting.")
            return
    except Exception as e:
        logger.error(f"Failed to extract text from file: {e}", exc_info=True)
        return

    client = ollama.Client(host=OLLAMA_HOST)
    current_prompt = prompt_template.format(outline_text=outline_text)
    
    for attempt in range(max_retries):
        logger.info(f"Attempt {attempt + 1}/{max_retries} to parse outline.")
        
        try:
            # Call the LLM
            llm_output_str = call_ollama_with_retry(client, current_prompt)
            
            # Find the JSON blob in the response
            json_blob = parse_llm_json_output(llm_output_str) # Your existing helper
            if not json_blob:
                raise ValueError("LLM did not return a parsable JSON object.")

            # *** THE KEY VALIDATION STEP ***
            # Try to parse the dictionary into your Pydantic model.
            # This will raise a `ValidationError` if keys are wrong, types are wrong, or fields are missing.
            parsed_data = ParsedUnitOutline.model_validate(json_blob)
            
            # If successful, save the validated data and exit the loop
            logger.info("Successfully validated JSON structure against Pydantic model.")
            os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
            with open(output_filepath, 'w', encoding='utf-8') as f:
                # Use .model_dump_json() for clean, validated output
                f.write(parsed_data.model_dump_json(indent=2)) 

            logger.info(f"Successfully parsed and saved Unit Outline to: {output_filepath}")
            return # Exit function on success

        except ValidationError as e:
            logger.warning(f"Validation failed on attempt {attempt + 1}. Error: {e}")
            # Formulate a new prompt with the error message for self-correction
            error_feedback = (
                f"\n\nYour previous attempt failed. You MUST correct the following errors:\n"
                f"{e}\n\n"
                f"Please regenerate the entire JSON object, ensuring it strictly adheres to the schema "
                f"and corrects these specific errors. Do not change any key names."
            )
            current_prompt = current_prompt + error_feedback # Append the error to the prompt
            
        except Exception as e:
            # Catch other errors like network issues from call_ollama_with_retry
            logger.error(f"An unexpected error occurred on attempt {attempt + 1}: {e}", exc_info=True)
            # You might want to wait before retrying for non-validation errors
            time.sleep(5)

    logger.error(f"Failed to get valid structured data from the LLM after {max_retries} attempts.")


# --- In your execution block, call the new function ---
# parse_and_save_outline(...) becomes:
parse_and_save_outline_robust(
    input_filepath=FULL_PATH_UNIT_OUTLINE,
    output_filepath=PARSED_UO_JSON_PATH,
    prompt_template=UNIT_OUTLINE_SYSTEM_PROMPT_TEMPLATE
)

KeyboardInterrupt: 

# Extract TOC from epub or epub 

In [None]:
# # Cell 4: Extract Book Table of Contents (ToC)
# # This cell extracts the ToC from the specified book (EPUB or PDF)
# # and saves it to the path defined in Cell 1.

# from ebooklib import epub, ITEM_NAVIGATION
# from bs4 import BeautifulSoup
# import fitz  # PyMuPDF
# import json

# # --- EPUB Extraction Logic ---
# def parse_navpoint(navpoint, level=0):
#     # (Your existing parse_navpoint function)
#     title = navpoint.navLabel.text.strip()
#     content_tag = navpoint.content
#     href = content_tag['src'] if content_tag else None

#     # Add filtering logic here if needed
#     node = {"level": level, "title": title, "href": href, "children": []}
#     for child_navpoint in navpoint.find_all('navPoint', recursive=False):
#         child_node = parse_navpoint(child_navpoint, level + 1)
#         if child_node: node["children"].append(child_node)
#     return node

# def parse_li(li_element, level=0):
#     # (Your existing parse_li function)
#     a_tag = li_element.find('a')
#     if a_tag:
#         title = a_tag.get_text(strip=True)
#         href = a_tag.get('href', None)
#         # Add filtering logic here if needed
#         node = {"level": level, "title": title, "href": href, "children": []}
#         nested_ol = li_element.find('ol')
#         if nested_ol:
#             for sub_li in nested_ol.find_all('li', recursive=False):
#                 child_node = parse_li(sub_li, level + 1)
#                 if child_node: node["children"].append(child_node)
#         return node
#     return None

# def extract_epub_toc(epub_path, output_json_path):
#     print(f"Processing EPUB ToC for: {epub_path}")
#     toc_data = []
#     book = epub.read_epub(epub_path)
#     for nav_item in book.get_items_of_type(ITEM_NAVIGATION):
#         soup = BeautifulSoup(nav_item.get_content(), 'xml')
#         if nav_item.get_name().endswith('.ncx'):
#             print("INFO: Found EPUB 2 (NCX) Table of Contents.")
#             navmap = soup.find('navMap')
#             if navmap:
#                 for navpoint in navmap.find_all('navPoint', recursive=False):
#                     node = parse_navpoint(navpoint, level=0)
#                     if node: toc_data.append(node)
#         else:
#             print("INFO: Found EPUB 3 (XHTML) Table of Contents.")
#             toc_nav = soup.select_one('nav[epub|type="toc"]')
#             if toc_nav:
#                 top_ol = toc_nav.find('ol')
#                 if top_ol:
#                     for li in top_ol.find_all('li', recursive=False):
#                         node = parse_li(li, level=0)
#                         if node: toc_data.append(node)
#         if toc_data: break
    
#     if toc_data:
#         os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
#         with open(output_json_path, 'w', encoding='utf-8') as f:
#             json.dump(toc_data, f, indent=2, ensure_ascii=False)
#         print(f"✅ Successfully wrote EPUB ToC to: {output_json_path}")
#     else:
#         print("❌ WARNING: No ToC data extracted from EPUB.")

# # --- PDF Extraction Logic ---
# def build_pdf_hierarchy(toc_list):
#     """
#     Builds a hierarchical structure from a flat ToC list from PyMuPDF.
#     MODIFIED: Normalizes levels to start at 0 for consistency with EPUB.
#     """
#     root = []
#     # The parent_stack keys are now level-based, starting from -1 for the root's parent.
#     parent_stack = {-1: {"children": root}}

#     for level, title, page in toc_list:
#         # --- FIX: NORMALIZE LEVEL TO START AT 0 ---
#         # fitz/PyMuPDF ToC levels start at 1, so we subtract 1.
#         normalized_level = level - 1

#         node = {
#             "level": normalized_level,
#             "title": title.strip(),
#             "page": page,
#             "children": []
#         }

#         # Find the correct parent in the stack. The parent's level is one less than the current node's.
#         # This logic correctly places the node under its parent in the hierarchy.
#         parent_node = parent_stack[normalized_level - 1]
#         parent_node["children"].append(node)

#         # Add the current node to the stack so it can be a parent for subsequent nodes.
#         parent_stack[normalized_level] = node

#     return root

# def extract_pdf_toc(pdf_path, output_json_path):
#     print(f"Processing PDF ToC for: {pdf_path}")
#     try:
#         doc = fitz.open(pdf_path)
#         toc = doc.get_toc()
#         if not toc:
#             print("❌ WARNING: This PDF has no embedded bookmarks (ToC).")
#             hierarchical_toc = []
#         else:
#             print(f"INFO: Found {len(toc)} bookmark entries.")
#             hierarchical_toc = build_pdf_hierarchy(toc)
        
#         os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
#         with open(output_json_path, 'w', encoding='utf-8') as f:
#             json.dump(hierarchical_toc, f, indent=2, ensure_ascii=False)
#         print(f"✅ Successfully wrote PDF ToC to: {output_json_path}")
            
#     except Exception as e:
#         print(f"An error occurred during PDF ToC extraction: {e}")

# # --- Execute ToC Extraction ---
# if PROCESS_EPUB:
#     extract_epub_toc(BOOK_PATH, PRE_EXTRACTED_TOC_JSON_PATH)
# else:
#     extract_pdf_toc(BOOK_PATH, PRE_EXTRACTED_TOC_JSON_PATH)

this work

In [4]:
# # Cell 4: Extract Book Table of Contents (ToC)
# # This cell extracts the ToC from the specified book (EPUB or PDF)
# # and saves it to the path defined in Cell 1.

# from ebooklib import epub, ITEM_NAVIGATION
# from bs4 import BeautifulSoup
# import fitz  # PyMuPDF
# import json

# # --- EPUB Extraction Logic ---
# def parse_navpoint(navpoint, level=0):
#     # (Your existing parse_navpoint function)
#     title = navpoint.navLabel.text.strip()
#     # Add filtering logic here if needed
#     node = {"level": level, "title": title, "children": []}
#     for child_navpoint in navpoint.find_all('navPoint', recursive=False):
#         child_node = parse_navpoint(child_navpoint, level + 1)
#         if child_node: node["children"].append(child_node)
#     return node

# def parse_li(li_element, level=0):
#     # (Your existing parse_li function)
#     a_tag = li_element.find('a')
#     if a_tag:
#         title = a_tag.get_text(strip=True)
#         # Add filtering logic here if needed
#         node = {"level": level, "title": title, "children": []}
#         nested_ol = li_element.find('ol')
#         if nested_ol:
#             for sub_li in nested_ol.find_all('li', recursive=False):
#                 child_node = parse_li(sub_li, level + 1)
#                 if child_node: node["children"].append(child_node)
#         return node
#     return None

# def extract_epub_toc(epub_path, output_json_path):
#     print(f"Processing EPUB ToC for: {epub_path}")
#     toc_data = []
#     book = epub.read_epub(epub_path)
#     for nav_item in book.get_items_of_type(ITEM_NAVIGATION):
#         soup = BeautifulSoup(nav_item.get_content(), 'xml')
#         if nav_item.get_name().endswith('.ncx'):
#             print("INFO: Found EPUB 2 (NCX) Table of Contents.")
#             navmap = soup.find('navMap')
#             if navmap:
#                 for navpoint in navmap.find_all('navPoint', recursive=False):
#                     node = parse_navpoint(navpoint, level=0)
#                     if node: toc_data.append(node)
#         else:
#             print("INFO: Found EPUB 3 (XHTML) Table of Contents.")
#             toc_nav = soup.select_one('nav[epub|type="toc"]')
#             if toc_nav:
#                 top_ol = toc_nav.find('ol')
#                 if top_ol:
#                     for li in top_ol.find_all('li', recursive=False):
#                         node = parse_li(li, level=0)
#                         if node: toc_data.append(node)
#         if toc_data: break
    
#     if toc_data:
#         os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
#         with open(output_json_path, 'w', encoding='utf-8') as f:
#             json.dump(toc_data, f, indent=2, ensure_ascii=False)
#         print(f"✅ Successfully wrote EPUB ToC to: {output_json_path}")
#     else:
#         print("❌ WARNING: No ToC data extracted from EPUB.")

# # --- PDF Extraction Logic ---
# def build_pdf_hierarchy(toc_list):
#     """
#     Builds a hierarchical structure from a flat ToC list from PyMuPDF.
#     MODIFIED: Normalizes levels to start at 0 for consistency with EPUB.
#     """
#     root = []
#     # The parent_stack keys are now level-based, starting from -1 for the root's parent.
#     parent_stack = {-1: {"children": root}}

#     for level, title, page in toc_list:
#         # --- FIX: NORMALIZE LEVEL TO START AT 0 ---
#         # fitz/PyMuPDF ToC levels start at 1, so we subtract 1.
#         normalized_level = level - 1

#         node = {
#             "level": normalized_level,
#             "title": title.strip(),
#             "page": page,
#             "children": []
#         }

#         # Find the correct parent in the stack. The parent's level is one less than the current node's.
#         # This logic correctly places the node under its parent in the hierarchy.
#         parent_node = parent_stack[normalized_level - 1]
#         parent_node["children"].append(node)

#         # Add the current node to the stack so it can be a parent for subsequent nodes.
#         parent_stack[normalized_level] = node

#     return root

# def extract_pdf_toc(pdf_path, output_json_path):
#     print(f"Processing PDF ToC for: {pdf_path}")
#     try:
#         doc = fitz.open(pdf_path)
#         toc = doc.get_toc()
#         if not toc:
#             print("❌ WARNING: This PDF has no embedded bookmarks (ToC).")
#             hierarchical_toc = []
#         else:
#             print(f"INFO: Found {len(toc)} bookmark entries.")
#             hierarchical_toc = build_pdf_hierarchy(toc)
        
#         os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
#         with open(output_json_path, 'w', encoding='utf-8') as f:
#             json.dump(hierarchical_toc, f, indent=2, ensure_ascii=False)
#         print(f"✅ Successfully wrote PDF ToC to: {output_json_path}")
            
#     except Exception as e:
#         print(f"An error occurred during PDF ToC extraction: {e}")

# # --- Execute ToC Extraction ---
# if PROCESS_EPUB:
#     extract_epub_toc(BOOK_PATH, PRE_EXTRACTED_TOC_JSON_PATH)
# else:
#     extract_pdf_toc(BOOK_PATH, PRE_EXTRACTED_TOC_JSON_PATH)

Processing EPUB ToC for: /home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub
INFO: Found EPUB 2 (NCX) Table of Contents.
✅ Successfully wrote EPUB ToC to: /home/sebas_dev_linux/projects/course_generator/Parse_data/Parse_TOC_books/ICT312_epub_table_of_contents.json


# Hirachical DB base on TOC

## Process Book

In [5]:
# Cell 5: Create Hierarchical Vector Database (Definitive Final Version)

import os
import json
import shutil
import logging
import re
from typing import List, Dict, Any, Tuple
from langchain_core.documents import Document
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- HELPER FUNCTIONS ---
def normalize_text(text: str) -> str:
    """Converts text to a canonical form for matching: lowercase, no punctuation, single spaces."""
    if not text: return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def clean_metadata_for_chroma(value: Any) -> Any:
    """Sanitizes metadata values to be compatible with ChromaDB."""
    if isinstance(value, list): return ", ".join(map(str, value))
    if isinstance(value, dict): return json.dumps(value)
    if isinstance(value, (str, int, float, bool)) or value is None: return value
    return str(value)

# --- CORE PROCESSING FUNCTION ---
# In Cell 5, replace the entire function with this one.

from rapidfuzz import process, fuzz

def process_book_with_extracted_toc(book_path: str, extracted_toc_json_path: str,
                                    chunk_size: int, chunk_overlap: int) -> Tuple[List[Document], List[Dict[str, Any]]]:
    logger.info(f"Processing book '{os.path.basename(book_path)}'...")
    try:
        with open(extracted_toc_json_path, 'r', encoding='utf-8') as f:
            hierarchical_toc = json.load(f)
    except Exception as e:
        logger.error(f"FATAL: Error loading ToC JSON: {e}"); return ([], [])

    # --- [PATCH A - PART 1] ---
    # Build the href -> nav_path mapping from the full ToC
    href_to_navpath = {}
    def _walk(nodes, trail):
        for n in nodes:
            # Add the current node's title to the trail for its children
            current_trail = trail + [n.get("title", "")]
            href = n.get("href")
            if href:
                # Store the full trail using the href as the key
                href_to_navpath[href.split("#")[0]] = current_trail
            
            if n.get("children"):
                _walk(n["children"], current_trail)
    
    _walk(hierarchical_toc, [])
    # --- [END PATCH A - PART 1] ---

    loader = UnstructuredEPubLoader(book_path, mode="elements", strategy="fast")
    all_raw_book_docs = loader.load()
    
    logger.info("Enriching documents with robust hierarchical metadata...")
    # (The following section combines the original heading-based enrichment with the new full_path enrichment)
    flat_toc_entries = []
    def _flatten_toc_recursive(nodes: List[Dict[str, Any]], path: List[str]):
        for node in nodes:
            if title := node.get("title", "").strip():
                new_path = path + [title]
                flat_toc_entries.append({"full_title_for_matching": title, "titles_path": new_path})
                if node.get("children"):
                    _flatten_toc_recursive(node["children"], new_path)
    _flatten_toc_recursive(hierarchical_toc, [])
    
    toc_title_set = {normalize_text(entry["full_title_for_matching"]) for entry in flat_toc_entries}
    normalized_title_to_path_map = {normalize_text(entry["full_title_for_matching"]): entry["titles_path"] for entry in flat_toc_entries}

    final_documents_with_metadata: List[Document] = []
    current_hierarchy = {}
    for doc in all_raw_book_docs:
        normalized_text = normalize_text(doc.page_content)
        if not normalized_text: continue

        match, score, _ = process.extractOne(normalized_text, toc_title_set, scorer=fuzz.token_set_ratio)
        if score > 95:
            current_hierarchy = {}
            path = normalized_title_to_path_map[match]
            for i, title in enumerate(path):
                current_hierarchy[f"level_{i}_title"] = title
        
        if not current_hierarchy: continue
            
        new_metadata = doc.metadata.copy()
        new_metadata.update(current_hierarchy)
        final_documents_with_metadata.append(Document(page_content=doc.page_content, metadata=new_metadata))
        
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    final_chunks = text_splitter.split_documents(final_documents_with_metadata)
    
    logger.info(f"Split documents into {len(final_chunks)} final chunks.")
    logger.info("Assigning sequence IDs and canonical paths...")
    
    for i, chunk in enumerate(final_chunks):
        chunk.metadata['global_chunk_sequence_id'] = i
        path_parts = [chunk.metadata.get(f"level_{j}_title", "") for j in range(6)]
        raw_path = " > ".join(filter(None, path_parts))
        chunk.metadata['toc_path'] = raw_path
        chunk.metadata['toc_path_norm'] = normalize_text(raw_path)
        
        # --- [PATCH A - PART 2] ---
        # Attach the canonical navMap path to every chunk.
        source_file = chunk.metadata.get("source", "").split("#")[0]
        nav_path = href_to_navpath.get(source_file, [])
        full_nav = " > ".join(filter(None, nav_path))
        
        chunk.metadata["toc_path_full"] = full_nav
        chunk.metadata["toc_path_full_norm"] = normalize_text(full_nav)
        # --- [END PATCH A - PART 2] ---

    return final_chunks, hierarchical_toc

# --- Main Execution Block ---
if not os.path.exists(PRE_EXTRACTED_TOC_JSON_PATH):
    logger.error("CRITICAL: Pre-extracted ToC file not found. Run Cell 4 first.")
else:
    final_chunks_for_db, toc_reloaded = process_book_with_extracted_toc(
        book_path=BOOK_PATH, extracted_toc_json_path=PRE_EXTRACTED_TOC_JSON_PATH, 
        chunk_size=500, chunk_overlap=50
    )
    if final_chunks_for_db:
        logger.info("Sanitizing all chunk metadata for ChromaDB compatibility...")
        for chunk in final_chunks_for_db:
            chunk.metadata = {k: clean_metadata_for_chroma(v) for k, v in chunk.metadata.items()}
        
        if os.path.exists(CHROMA_PERSIST_DIR):
            logger.warning(f"Deleting existing ChromaDB directory: {CHROMA_PERSIST_DIR}")
            shutil.rmtree(CHROMA_PERSIST_DIR)

        logger.info(f"Initializing embedding model '{EMBEDDING_MODEL_OLLAMA}' and creating new vector database...")
        embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
        vector_db = Chroma.from_documents(
            documents=final_chunks_for_db, embedding=embedding_model, 
            persist_directory=CHROMA_PERSIST_DIR, collection_name=CHROMA_COLLECTION_NAME
        )
        count = vector_db._collection.count()
        print("-" * 50); logger.info(f"✅ Vector DB created successfully. Collection contains {count} documents."); print("-" * 50)
    else:
        logger.error("❌ Failed to generate chunks. Vector DB not created.")

2025-06-26 05:10:12,833 - INFO - Processing book 'Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub'...
2025-06-26 05:10:14,768 - INFO - Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.
2025-06-26 05:10:14,768 - INFO - NumExpr defaulting to 16 threads.
  data file translations/en.yaml not found
  data file translations/en.yaml not found


2025-06-26 05:10:24,902 - INFO - Enriching documents with robust hierarchical metadata...
2025-06-26 05:10:29,337 - INFO - Split documents into 12498 final chunks.
2025-06-26 05:10:29,337 - INFO - Assigning sequence IDs and canonical paths...
2025-06-26 05:10:29,415 - INFO - Sanitizing all chunk metadata for ChromaDB compatibility...
2025-06-26 05:10:29,447 - INFO - Initializing embedding model 'nomic-embed-text' and creating new vector database...
2025-06-26 05:10:29,497 - INFO - Anonymized teleme

--------------------------------------------------
--------------------------------------------------


### Smoke test

In [6]:
# # Cell 5.1: Smoke Test & Sanity Check

# print("Smoke Test & Sanity Check")

# if 'final_chunks_for_db' in locals() and final_chunks_for_db:
#     logger.info("✅ `final_chunks_for_db` object exists. Running sanity checks...")

#     # 1. Peek at one sample chunk's metadata
#     print("\n🔬 PEEKING at metadata of the first chunk:")
#     sample_metadata = final_chunks_for_db[0].metadata
#     print(json.dumps(sample_metadata, indent=2))
#     assert "toc_path" in sample_metadata, "FATAL: 'toc_path' field is missing!"
#     assert "toc_path_norm" in sample_metadata, "FATAL: 'toc_path_norm' field is missing!"
#     logger.info("✅ Sample chunk contains the required 'toc_path' and 'toc_path_norm' fields.")

#     # 2. Confirm every chunk has the new field
#     all_chunks_have_norm_path = all("toc_path_norm" in c.metadata for c in final_chunks_for_db)
#     assert all_chunks_have_norm_path, "FATAL: Not all chunks have the 'toc_path_norm' metadata field!"
#     logger.info(f"✅ Verified that all {len(final_chunks_for_db)} chunks have the 'toc_path_norm' field.")

#     # 3. Sanity-count distinct paths
#     unique_paths = {c.metadata["toc_path_norm"] for c in final_chunks_for_db if "toc_path_norm" in c.metadata}
#     logger.info(f"✅ Found {len(unique_paths)} unique normalized paths in the dataset.")
    
#     print("\n" + "*"*80)
#     print("SMOKE TEST PASSED. The data is correctly structured. You can now proceed to build the database.".center(80))
#     print("*"*80)

# else:
#     logger.error("❌ `final_chunks_for_db` not found or is empty. Run the main part of Cell 5 first.")

## Test Data Base for content development

### Verification Test Strategy
The script automatically validates the vector database by performing four dynamic tests that increase in complexity, moving from a general health check to specific application-level requirements.

Basic Retrieval Test:
- Goal: Confirm the database is live and its content is broadly relevant to the course subject.
- Method: It performs a simple search using the course's unitName (e.g., "Digital Forensic") extracted from the unit outline.
- Success means: The database is online, and the ingested content is thematically correct.

Deep Hierarchy Test:
- Goal: Verify the structural integrity of the metadata, ensuring text is correctly tagged with its full, multi-level context (e.g., Part -> Chapter -> Section).
- Method: It randomly picks a deeply nested sub-section from the Table of Contents and performs a search that is filtered to match that exact hierarchical path.
- Success means: The data ingestion process is correctly assigning detailed, nested parentage to all text chunks.

Advanced Unit Outline Alignment Test:
- Goal: Ensure the system can correctly map a weekly syllabus topic to the right chapter(s) in the book, adapting to different ToC structures (e.g., flat chapters vs. chapters inside "Parts").
- Method: It randomly selects a week, finds all required chapter numbers from the reading list, and dynamically determines the correct metadata level to check. It then verifies that a search for the weekly topic retrieves chunks belonging to the correct chapters.
- Success means: The database is directly useful for its primary purpose: linking the course structure to the source textbook reliably.

Content Sequence Test (PDF-only):
- Goal: Check if retrieved content can be re-ordered chronologically to form a coherent narrative.
- Method: It retrieves multiple chunks for a random topic, sorts them using the page_number metadata, and verifies the page numbers are in ascending order.
- Success means: The database contains the necessary metadata to reconstruct the original flow of the book's content, which is crucial for generating logical summaries or lecture material.

In [15]:
# # Cell 6: Verify Vector Database (Definitive Final Suite with Diagnostics)

# import os
# import json
# import re
# import random
# import logging
# from typing import List, Dict, Any, Tuple, Optional
# import pandas as pd

# try:
#     from langchain_chroma import Chroma
#     from langchain_core.documents import Document
#     from langchain_ollama.embeddings import OllamaEmbeddings
#     langchain_available = True
# except ImportError:
#     langchain_available = False

# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# logger = logging.getLogger(__name__)

# # --- HELPER FUNCTIONS ---
# def normalize_text(text: str) -> str:
#     """Converts text to a canonical form for matching: lowercase, no punctuation, single spaces."""
#     if not text: return ""
#     text = text.lower()
#     text = re.sub(r'[^\w\s]', '', text)
#     text = re.sub(r'\s+', ' ', text).strip()
#     return text

# def print_header(text: str, char: str = "="):
#     """Prints a centered header to the console."""
#     print("\n" + char * 80)
#     print(text.center(80))
#     print(char * 80)

# def print_results(operation_name: str, results: list, where_filter: Optional[Dict] = None):
#     """Prints the results of a vector store operation in a clear, readable format."""
#     print("\n" + "-"*50)
#     print(f"🔬 Operation: '{operation_name}'")
#     if where_filter:
#         print(f"📄 Filter: {json.dumps(where_filter, indent=2)}")
    
#     if not results:
#         print("\n❌ RESULTS: No documents found for this operation.")
#         print("-" * 50)
#         return
        
#     print(f"\n✅ RESULTS: Found {len(results)} documents. Displaying details for top 3:")
#     for i, doc in enumerate(results[:3]):
#         print(f"\n--- Result {i+1} ---")
#         print(f"Content: '{doc.page_content.replace('', ' ').strip()[:150]}...'")
#         print(f"Metadata: {json.dumps(doc.metadata, indent=2)}")
#     print("-" * 50)

# def find_leaf_section(nodes: List[Dict]) -> Optional[List[str]]:
#     """Finds a random, deep, leaf-node section from the ToC.json."""
#     leaf_paths = []
#     def _traverse(sub_nodes, current_path):
#         for node in sub_nodes:
#             new_path = current_path + [node.get('title', 'Untitled')]
#             if not node.get('children'):
#                 if len(new_path) > 2: # Ensure it's at least level 2 deep for a meaningful test
#                     leaf_paths.append(new_path)
#             else:
#                 _traverse(node['children'], new_path)
#     _traverse(nodes, [])
#     return random.choice(leaf_paths) if leaf_paths else None

# # --- TEST CASE FUNCTIONS ---
# def run_test(name: str, goal: str, func, *args):
#     """A wrapper to run each test and print its final status."""
#     print_header(name, char="-")
#     logger.info(f"🎯 GOAL: {goal}")
#     status = "❌ FAILED"
#     try:
#         if func(*args):
#             status = "✅ PASSED"
#             return True
#         return False
#     except Exception as e:
#         logger.error(f"ERROR: {e}", exc_info=False)
#         return False
#     finally:
#         print(f"\n--> {name} Status: {status}")

# def _health_and_hierarchy_report(db):
#     """Provides a high-level diagnostic overview of the database's structure."""
#     print_header("Database Health & Hierarchy Report", char="*")
#     total_docs = db._collection.count()
#     logger.info(f"Retrieving metadata for all {total_docs} chunks...")
#     retrieved_data = db.get(limit=total_docs, include=["metadatas"])
#     all_metadatas = retrieved_data['metadatas']
    
#     level_0_counts = {}
#     for meta in all_metadatas:
#         level_0_title = meta.get("level_0_title")
#         if level_0_title:
#             level_0_counts[level_0_title] = level_0_counts.get(level_0_title, 0) + 1
            
#     assert level_0_counts, "CRITICAL: No 'level_0_title' metadata found in any chunks!"
    
#     print("\n✅ Found the following top-level sections and their chunk counts:")
#     df = pd.DataFrame(list(level_0_counts.items()), columns=['Top-Level Section (level_0_title)', 'Chunk Count'])
#     df = df.sort_values(by='Chunk Count', ascending=False).reset_index(drop=True)
#     print(df.to_string())
#     return True

# def _deep_hierarchy_test(db, toc):
#     """Verifies a deep leaf section can be retrieved via a similarity search with a strict filter."""
#     path = find_leaf_section(toc)
#     assert path, "Could not find a leaf section to test."
#     section_title = path[-1]
    
#     # Use the normalized path for the filter, as created in Cell 5
#     full_path_norm = normalize_text(' > '.join(path))
#     w_filter = {"toc_path_norm": {"$eq": full_path_norm}}
    
#     # Use similarity search on the raw title for a realistic test
#     results = db.similarity_search(section_title, k=1, filter=w_filter)
#     print_results(f"Deep hierarchy check for: '{section_title}'", results, w_filter)
#     assert len(results) > 0, "Deeply filtered similarity search returned no results."
#     return True

# def _narrative_flow_test(db, toc):
#     """Verifies chunks for a specific leaf section are sequentially ordered."""
#     path = find_leaf_section(toc)
#     assert path, "Could not find a leaf section in ToC to test."
    
#     full_path_norm = normalize_text(' > '.join(path))
#     w_filter = {"toc_path_norm": {"$eq": full_path_norm}}
    
#     operation_name = f"Narrative flow check for: {' > '.join(path)}"
#     print_results(operation_name, [], w_filter) # Announce the operation
    
#     retrieved_data = db.get(where=w_filter, limit=200)
    
#     docs = []
#     if retrieved_data and retrieved_data['ids']:
#         docs = [Document(page_content=retrieved_data['documents'][i], metadata=retrieved_data['metadatas'][i]) for i in range(len(retrieved_data['ids']))]
    
#     print_results(f"Results for '{path[-1]}'", docs) # Show what was found
#     assert len(docs) > 1, "Not enough chunks retrieved to test sequence."
    
#     docs.sort(key=lambda x: x.metadata.get('global_chunk_sequence_id', -1))
#     sequence_numbers = [doc.metadata['global_chunk_sequence_id'] for doc in docs]
    
#     print(f"\nANALYSIS: Retrieved and sorted global sequence numbers: {sequence_numbers}")
#     assert all(sequence_numbers[i] < sequence_numbers[i+1] for i in range(len(sequence_numbers)-1)), "Global sequence is not strictly increasing."
#     return True

# # --- MAIN VERIFICATION EXECUTION ---
# def run_verification():
#     print_header("Database Verification Process")
#     if not langchain_available: logger.error("LangChain libraries not found."); return
#     required_paths = [CHROMA_PERSIST_DIR, PRE_EXTRACTED_TOC_JSON_PATH, PARSED_UO_JSON_PATH]
#     if not all(os.path.exists(p) for p in required_paths): logger.error("Missing file/dir. Run Cells 4 & 5."); return
#     with open(PRE_EXTRACTED_TOC_JSON_PATH, 'r', encoding='utf-8') as f: toc_data = json.load(f)
    
#     logger.info(f"Initializing embedding model '{EMBEDDING_MODEL_OLLAMA}' to connect to DB...")
#     embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
#     vector_store = Chroma(persist_directory=CHROMA_PERSIST_DIR, embedding_function=embeddings, collection_name=CHROMA_COLLECTION_NAME)
    
#     tests = [
#         ("Diagnostic: Health & Hierarchy Report", "Provides a high-level overview of the DB's structure.", _health_and_hierarchy_report, (vector_store,)),
#         ("Test 1: Deep Hierarchy & Filter", "Checks if a deep section can be retrieved with a strict filter.", _deep_hierarchy_test, (vector_store, toc_data)),
#         ("Test 2: Narrative Flow", "Checks if chunks within a specific leaf sub-section are correctly ordered.", _narrative_flow_test, (vector_store, toc_data))
#     ]
#     results_summary = [run_test(name, goal, func, *args) for name, goal, func, args in tests]
    
#     passed_count = sum(filter(None, results_summary))
#     failed_count = len(results_summary) - passed_count
#     print_header("Final Verification Summary")
#     print(f"Total Tests Run: {len(results_summary)} | ✅ Passed: {passed_count} | ❌ Failed: {failed_count}")
#     print_header("Verification Complete", char="=")

# # --- Execute Verification ---
# run_verification()

In [16]:
# Cell 6: Verify Vector Database (Definitive Final Version)

import os
import json
import re
import random
import logging
from typing import List, Dict, Any, Tuple, Optional
import pandas as pd

try:
    from langchain_chroma import Chroma
    from langchain_core.documents import Document
    from langchain_ollama.embeddings import OllamaEmbeddings
    langchain_available = True
except ImportError:
    langchain_available = False

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- HELPER FUNCTIONS ---
def normalize_text(text: str) -> str:
    """Converts text to a canonical form for matching: lowercase, no punctuation, single spaces."""
    if not text: return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def print_header(text: str, char: str = "="):
    """Prints a centered header to the console."""
    print("\n" + char * 80)
    print(text.center(80))
    print(char * 80)

def print_results(operation_name: str, results: list, where_filter: Optional[Dict] = None):
    """Prints the results of a vector store operation in a clear, readable format."""
    print("\n" + "-"*50)
    print(f"🔬 Operation: '{operation_name}'")
    if where_filter:
        print(f"📄 Filter: {json.dumps(where_filter, indent=2)}")
    
    if not results:
        print("\n❌ RESULTS: No documents found for this operation.")
        print("-" * 50)
        return
        
    print(f"\n✅ RESULTS: Found {len(results)} documents. Displaying details for top 3:")
    for i, doc in enumerate(results[:3]):
        print(f"\n--- Result {i+1} ---")
        print(f"Content: '{doc.page_content.replace('', ' ').strip()[:150]}...'")
        print(f"Metadata: {json.dumps(doc.metadata, indent=2)}")
    print("-" * 50)

def find_leaf_section(nodes: List[Dict]) -> Optional[List[str]]:
    """Finds a random, deep, leaf-node section from the ToC.json."""
    leaf_paths = []
    def _traverse(sub_nodes, current_path):
        for node in sub_nodes:
            new_path = current_path + [node.get('title', 'Untitled')]
            if not node.get('children'):
                if len(new_path) > 2:
                    leaf_paths.append(new_path)
            else:
                _traverse(node['children'], new_path)
    _traverse(nodes, [])
    return random.choice(leaf_paths) if leaf_paths else None

# --- TEST CASE FUNCTIONS ---
def run_test(name: str, goal: str, func, *args):
    """A wrapper to run each test and print its final status."""
    print_header(name, char="-")
    logger.info(f"🎯 GOAL: {goal}")
    status = "❌ FAILED"
    try:
        if func(*args):
            status = "✅ PASSED"
            return True
        return False
    except Exception as e:
        logger.error(f"ERROR: {e}", exc_info=False)
        return False
    finally:
        print(f"\n--> {name} Status: {status}")

def _health_and_hierarchy_report(db):
    """Provides a high-level diagnostic overview of the database's structure."""
    total_docs = db._collection.count()
    logger.info(f"Retrieving metadata for all {total_docs} chunks...")
    all_metadatas = db.get(limit=total_docs, include=["metadatas"])['metadatas']
    
    level_0_counts = {}
    for meta in all_metadatas:
        level_0_title = meta.get("level_0_title")
        if level_0_title:
            level_0_counts[level_0_title] = level_0_counts.get(level_0_title, 0) + 1
            
    assert level_0_counts, "CRITICAL: No 'level_0_title' metadata found!"
    
    print("\n✅ Found the following top-level sections and their chunk counts:")
    df = pd.DataFrame(list(level_0_counts.items()), columns=['Top-Level Section', 'Chunk Count'])
    print(df.sort_values(by='Chunk Count', ascending=False).reset_index(drop=True).to_string())
    return True

def _deep_hierarchy_and_flow_test(db, toc):
    """The definitive test. It verifies both hierarchy and sequence."""
    path = find_leaf_section(toc)
    assert path, "Could not find a suitable leaf section in ToC to test."

    # --- [PATCH B - FINAL VERSION] ---
    # Use a tolerant '$contains' filter on the new authoritative metadata field.
    assert len(path) >= 2, f"Found path '{path}' is not deep enough for this test."
    leaf_norm   = normalize_text(path[-1])
    parent_norm = normalize_text(path[-2])

    w_filter = {
        "$and": [
            {"toc_path_full_norm": {"$contains": parent_norm}},
            {"toc_path_full_norm": {"$contains": leaf_norm}}
        ]
    }
    # --- [END OF PATCH B] ---

    operation_name = f"Deep Hierarchy & Narrative Flow check for: {' > '.join(path)}"
    # Announce the operation and show the filter *before* executing the query.
    print_results(operation_name, [], where_filter=w_filter) 

    # --- [FIX] ---
    # Use similarity_search, which supports complex filters, instead of .get().
    # We provide a simple query string because our goal is to test the filter, not semantic similarity.
    docs = db.similarity_search(
        query=path[-1], # Use the leaf title as the query
        k=200,          # Retrieve up to 200 documents
        filter=w_filter
    )
    # --- [END FIX] ---

    print_results(f"Results for '{path[-1]}'", docs)
    assert len(docs) > 0, "Deep hierarchy search with '$contains' returned no results."
    
    if len(docs) > 1:
        # Sort by the sequence ID to verify chronological order
        docs.sort(key=lambda x: x.metadata.get('global_chunk_sequence_id', -1))
        sequence_numbers = [doc.metadata.get('global_chunk_sequence_id') for doc in docs]

        print(f"\nANALYSIS: Retrieved and sorted global sequence numbers:\n{sequence_numbers}")
        # Ensure the sequence is strictly increasing
        assert all(sequence_numbers[i] < sequence_numbers[i+1] for i in
                   range(len(sequence_numbers)-1)), "Global sequence is not strictly increasing."
    else:
        logger.warning("Only one chunk was retrieved; cannot test narrative flow sequence.")
        
    return True

# --- MAIN VERIFICATION EXECUTION ---
def run_verification():
    print_header("Database Verification Process")
    if not langchain_available: logger.error("LangChain libraries not found."); return
    required_paths = [CHROMA_PERSIST_DIR, PRE_EXTRACTED_TOC_JSON_PATH]
    if not all(os.path.exists(p) for p in required_paths): logger.error("Missing file/dir. Run Cells 4 & 5."); return
    with open(PRE_EXTRACTED_TOC_JSON_PATH, 'r', encoding='utf-8') as f: toc_data = json.load(f)
    
    logger.info(f"Initializing embedding model '{EMBEDDING_MODEL_OLLAMA}' to connect to DB...")
    embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
    vector_store = Chroma(persist_directory=CHROMA_PERSIST_DIR, embedding_function=embeddings, collection_name=CHROMA_COLLECTION_NAME)
    
    tests = [
        ("Diagnostic: Health & Hierarchy Report", "Provides a high-level overview of the DB's structure.", _health_and_hierarchy_report, (vector_store,)),
        ("Test: Deep Hierarchy & Narrative Flow", "Checks if chunks within a leaf section are found and correctly ordered.", _deep_hierarchy_and_flow_test, (vector_store, toc_data))
    ]
    results_summary = [run_test(name, goal, func, *args) for name, goal, func, args in tests]
    
    passed_count = sum(filter(None, results_summary))
    failed_count = len(results_summary) - passed_count
    print_header("Final Verification Summary")
    print(f"Total Tests Run: {len(results_summary)} | ✅ Passed: {passed_count} | ❌ Failed: {failed_count}")
    print_header("Verification Complete", char="=")

# --- Execute Verification ---
run_verification()

2025-06-26 05:18:23,578 - INFO - Initializing embedding model 'nomic-embed-text' to connect to DB...


2025-06-26 05:18:23,588 - INFO - 🎯 GOAL: Provides a high-level overview of the DB's structure.
2025-06-26 05:18:23,591 - INFO - Retrieving metadata for all 12498 chunks...



                         Database Verification Process                          

--------------------------------------------------------------------------------
                     Diagnostic: Health & Hierarchy Report                      
--------------------------------------------------------------------------------


2025-06-26 05:18:24,404 - INFO - 🎯 GOAL: Checks if chunks within a leaf section are found and correctly ordered.
2025-06-26 05:18:24,486 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-06-26 05:18:24,487 - ERROR - ERROR: Expected where operator to be one of $gt, $gte, $lt, $lte, $ne, $eq, $in, $nin, got $contains in query.



✅ Found the following top-level sections and their chunk counts:
                                                                  Top-Level Section  Chunk Count
0                     Lab Manual for Guide to Computer Forensics and Investigations         5444
1                                        Chapter 6. Current Digital Forensics Tools         1440
2                                   Chapter 5. Working with Windows and CLI Systems         1310
3   Chapter 10. Virtual Machine Forensics, Live Acquisitions, and Network Forensics          699
4                                         Chapter 16. Ethics for the Expert Witness          593
5                                                       Chapter 13. Cloud Forensics          575
6      Chapter 1. Understanding the Digital Forensics Profession and Investigations          365
7                               Chapter 2. The Investigator’s Office and Laboratory          300
8                            Chapter 15. Expert Testimony in 

# Full Database Health & Hierarchy Diagnostic Report  

In [9]:
# Cell 6: Full Database Health & Hierarchy Diagnostic Report

import os
import json
import logging
from typing import List, Dict, Any, Optional
import pandas as pd

try:
    from langchain_chroma import Chroma
    from langchain_ollama.embeddings import OllamaEmbeddings
    langchain_available = True
except ImportError:
    langchain_available = False

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- HELPER FUNCTIONS ---
def print_header(text: str, char: str = "="):
    """Prints a centered header to the console."""
    print("\n" + char * 80)
    print(text.center(80))
    print(char * 80)

def count_total_chunks(node: Dict) -> int:
    """Recursively counts all chunks in a node and its children."""
    total = node.get('_chunks', 0)
    for child_node in node.get('_children', {}).values():
        total += count_total_chunks(child_node)
    return total

def print_hierarchy_report(node: Dict, indent_level: int = 0):
    """Recursively prints the reconstructed hierarchy with chunk counts."""
    # Sort children by their total chunk count for a more organized report
    sorted_children = sorted(
        node.get('_children', {}).items(),
        key=lambda item: count_total_chunks(item[1]),
        reverse=True
    )
    
    for title, child_node in sorted_children:
        prefix = "    " * indent_level + "|__ "
        
        # Total chunks includes the node itself and all descendants
        total_chunks_in_branch = count_total_chunks(child_node)
        
        # Chunks directly assigned to this node (should be low if it has children)
        direct_chunks = child_node.get('_chunks', 0)
        
        print(f"{prefix}{title}  (Total Chunks: {total_chunks_in_branch}, Direct: {direct_chunks})")
        
        # Recursive call for the next level
        print_hierarchy_report(child_node, indent_level + 1)

# --- MAIN DIAGNOSTIC FUNCTION ---
def run_full_diagnostics():
    print_header("Full Database Health & Hierarchy Diagnostic Report")
    
    # 1. Connect to the Database
    logger.info("Connecting to the vector database...")
    if not os.path.exists(CHROMA_PERSIST_DIR):
        logger.error(f"FATAL: Chroma DB directory not found at {CHROMA_PERSIST_DIR}. Please run Cell 5 first."); return
    
    embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
    vector_store = Chroma(persist_directory=CHROMA_PERSIST_DIR, embedding_function=embeddings, collection_name=CHROMA_COLLECTION_NAME)
    
    # 2. Retrieve ALL Metadata
    total_docs = vector_store._collection.count()
    logger.info(f"Retrieving metadata for all {total_docs} chunks...")
    retrieved_data = vector_store.get(limit=total_docs, include=["metadatas"])
    all_metadatas = retrieved_data['metadatas']
    logger.info("Successfully retrieved all metadata.")
        
    # 3. Reconstruct the Hierarchy Tree from Metadata
    logger.info("Reconstructing hierarchy from chunk metadata...")
    hierarchy_tree = {'_children': {}} # A root node to hold everything
    chunks_without_path = 0
    
    for meta in all_metadatas:
        path = meta.get("toc_path")
        if not path:
            chunks_without_path += 1
            continue
            
        path_parts = path.split(" > ")
        current_node = hierarchy_tree
        
        for part in path_parts:
            # Navigate or create the path in our tree
            current_node = current_node['_children'].setdefault(part, {'_chunks': 0, '_children': {}})
        
        # Increment the count for the leaf node
        current_node['_chunks'] += 1
        
    logger.info("Hierarchy reconstruction complete.")
    
    # 4. Print the Report
    print_header("Reconstructed Hierarchy Report", char="-")
    print("This report shows the full hierarchical structure discovered from the chunk metadata.")
    print("'(Total Chunks: X, Direct: Y)' means X chunks in the whole branch, Y directly tagged to this heading.\n")
    
    print_hierarchy_report(hierarchy_tree)
    
    print_header("Diagnostic Summary", char="-")
    top_level_sections = len(hierarchy_tree.get('_children', {}))
    print(f"✅ Found {top_level_sections} distinct top-level sections.")
    if chunks_without_path > 0:
        logger.warning(f"⚠️ Found {chunks_without_path} chunks MISSING the 'toc_path' metadata. This indicates an error in the enrichment pipeline.")
    else:
        print("✅ All chunks contain 'toc_path' metadata.")
        
    print_header("Diagnostic Complete")

# --- Execute Diagnostics ---
run_full_diagnostics()

2025-06-26 05:14:02,922 - INFO - Connecting to the vector database...
2025-06-26 05:14:02,937 - INFO - Retrieving metadata for all 12498 chunks...



               Full Database Health & Hierarchy Diagnostic Report               


2025-06-26 05:14:03,841 - INFO - Successfully retrieved all metadata.
2025-06-26 05:14:03,842 - INFO - Reconstructing hierarchy from chunk metadata...
2025-06-26 05:14:03,851 - INFO - Hierarchy reconstruction complete.



--------------------------------------------------------------------------------
                         Reconstructed Hierarchy Report                         
--------------------------------------------------------------------------------
This report shows the full hierarchical structure discovered from the chunk metadata.
'(Total Chunks: X, Direct: Y)' means X chunks in the whole branch, Y directly tagged to this heading.

|__ Lab Manual for Guide to Computer Forensics and Investigations  (Total Chunks: 5444, Direct: 3)
    |__ Chapter 16. Ethics for the Expert Witness  (Total Chunks: 1698, Direct: 118)
        |__ Lab 16.1. Rebuilding an MFT Record from a Corrupt Image  (Total Chunks: 1568, Direct: 5)
            |__ Review Questions  (Total Chunks: 788, Direct: 788)
            |__ Objectives  (Total Chunks: 425, Direct: 298)
                |__ Materials Required  (Total Chunks: 127, Direct: 127)
            |__ Activity  (Total Chunks: 350, Direct: 337)
                |__ Co

In [10]:
# Cell 6: Database Health & Hierarchy Diagnostic Report

import os
import json
import logging
from typing import List, Dict
import pandas as pd # You might need to run: pip install pandas
from langchain_chroma import Chroma
from langchain_ollama.embeddings import OllamaEmbeddings

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def print_header(text: str, char: str = "="):
    """Prints a centered header to the console."""
    print("\n" + char * 80)
    print(text.center(80))
    print(char * 80)

# --- Main Diagnostic Function ---
def run_diagnostics():
    print_header("Database Health & Hierarchy Diagnostic Report")
    
    # --- 1. Connect to the Database ---
    logger.info("Connecting to the vector database...")
    if not os.path.exists(CHROMA_PERSIST_DIR):
        logger.error(f"FATAL: Chroma DB directory not found at {CHROMA_PERSIST_DIR}. Please run Cell 5 first."); return
    
    # This assumes your global variables (CHROMA_PERSIST_DIR, etc.) are available
    embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
    vector_store = Chroma(persist_directory=CHROMA_PERSIST_DIR, embedding_function=embeddings, collection_name=CHROMA_COLLECTION_NAME)
    logger.info("Successfully connected to the database.")

    # --- 2. Retrieve ALL Metadata ---
    logger.info("Retrieving all metadata from the database. This may take a moment...")
    try:
        total_docs = vector_store._collection.count()
        if total_docs == 0:
            logger.error("Database is empty. Cannot run diagnostics."); return
        retrieved_data = vector_store.get(limit=total_docs, include=["metadatas"])
        all_metadatas = retrieved_data['metadatas']
        logger.info(f"Successfully retrieved metadata for all {len(all_metadatas)} chunks.")
    except Exception as e:
        logger.error(f"Failed to retrieve data from ChromaDB: {e}"); return
        
    # --- 3. Analyze Hierarchy Distribution ---
    print_header("Hierarchy Distribution Analysis", char="-")
    
    level_0_counts = {}
    chunks_without_level_0 = 0
    for meta in all_metadatas:
        level_0_title = meta.get("level_0_title")
        if level_0_title:
            level_0_counts[level_0_title] = level_0_counts.get(level_0_title, 0) + 1
        else:
            chunks_without_level_0 += 1
            
    if level_0_counts:
        print("\n✅ Found the following top-level sections (level_0_title) and their chunk counts:")
        # Use pandas to create a nicely formatted table
        df = pd.DataFrame(list(level_0_counts.items()), columns=['Top-Level Section (level_0_title)', 'Chunk Count'])
        df = df.sort_values(by='Chunk Count', ascending=False).reset_index(drop=True)
        print(df.to_string())
    else:
        logger.error("❌ CRITICAL ERROR: No chunks with 'level_0_title' metadata were found!")

    if chunks_without_level_0 > 0:
        logger.warning(f"\n⚠️ Found {chunks_without_level_0} chunks that are MISSING the 'level_0_title' metadata entirely. This is a major sign of a data processing error.")
        
    print_header("Diagnostic Complete")

# --- Execute Diagnostics ---
run_diagnostics()

2025-06-26 03:58:57,716 - INFO - Connecting to the vector database...
2025-06-26 03:58:57,731 - INFO - Successfully connected to the database.
2025-06-26 03:58:57,733 - INFO - Retrieving all metadata from the database. This may take a moment...



                 Database Health & Hierarchy Diagnostic Report                  


2025-06-26 03:58:58,380 - INFO - Successfully retrieved metadata for all 12498 chunks.



--------------------------------------------------------------------------------
                        Hierarchy Distribution Analysis                         
--------------------------------------------------------------------------------

✅ Found the following top-level sections (level_0_title) and their chunk counts:
                                                  Top-Level Section (level_0_title)  Chunk Count
0                     Lab Manual for Guide to Computer Forensics and Investigations         4178
1                                         Chapter 16. Ethics for the Expert Witness         2550
2      Chapter 1. Understanding the Digital Forensics Profession and Investigations          594
3                                   Chapter 5. Working with Windows and CLI Systems          483
4                                        Chapter 6. Current Digital Forensics Tools          465
5                                   Chapter 4. Processing Crime and Incident Scenes         