# Set up Paths 

In [11]:
# Cell 1: Setup and Configuration
import os
import re
import logging
import warnings
from docx import Document
import pdfplumber
import ollama
from tenacity import retry, stop_after_attempt, wait_exponential, RetryError
import json

# Setup Logger for this cell
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 1. CORE SETTINGS ---
# Set this to True for EPUB, False for PDF. This controls the entire notebook's flow.
PROCESS_EPUB = True # for EPUB
# PROCESS_EPUB = False # for PDF

# --- 2. INPUT FILE NAMES ---
# The name of the Unit Outline file (e.g., DOCX, PDF)
UNIT_OUTLINE_FILENAME = "ICT312 Digital Forensic_Final.docx" # epub
# UNIT_OUTLINE_FILENAME = "ICT311 Applied Cryptography.docx" # pdf

EXTRACT_UO = False

# The names of the book files
EPUB_BOOK_FILENAME = "Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub"
PDF_BOOK_FILENAME = "(Chapman & Hall_CRC Cryptography and Network Security Series) Jonathan Katz, Yehuda Lindell - Introduction to Modern Cryptography-CRC Press (2020).pdf"

# --- 3. DIRECTORY STRUCTURE ---
# Define the base path to your project to avoid hardcoding long paths everywhere
PROJECT_BASE_DIR = "/home/sebas_dev_linux/projects/course_generator"

# Define subdirectories relative to the base path
DATA_DIR = os.path.join(PROJECT_BASE_DIR, "data")
PARSE_DATA_DIR = os.path.join(PROJECT_BASE_DIR, "Parse_data")

# Construct full paths for clarity
INPUT_UO_DIR = os.path.join(DATA_DIR, "UO")
INPUT_BOOKS_DIR = os.path.join(DATA_DIR, "books")
OUTPUT_PARSED_UO_DIR = os.path.join(PARSE_DATA_DIR, "Parse_UO")
OUTPUT_PARSED_TOC_DIR = os.path.join(PARSE_DATA_DIR, "Parse_TOC_books")
OUTPUT_DB_DIR = os.path.join(DATA_DIR, "DataBase_Chroma")

# --- 4. LLM & EMBEDDING CONFIGURATION ---
LLM_PROVIDER = "ollama"  # Can be "ollama", "openai", "gemini"
OLLAMA_HOST = "http://localhost:11434"
OLLAMA_MODEL = "qwen3:8b" # "qwen3:8b", #"mistral:latest"
EMBEDDING_MODEL_OLLAMA = "nomic-embed-text"
CHUNK_SIZE = 800
CHUNK_OVERLAP = 100

# --- 5. DYNAMICALLY GENERATED PATHS & IDs (DO NOT EDIT THIS SECTION) ---
# This section uses the settings above to create all the necessary variables for later cells.

# Extract Unit ID from the filename
def print_header(text: str, char: str = "="):
    """Prints a centered header to the console."""
    print("\n" + char * 80)
    print(text.center(80))
    print(char * 80)

def extract_uo_id_from_filename(filename: str) -> str:
    match = re.match(r'^[A-Z]+\d+', os.path.basename(filename))
    if match:
        return match.group(0)
    raise ValueError(f"Could not extract a valid Unit ID from filename: '{filename}'")

try:
    UNIT_ID = extract_uo_id_from_filename(UNIT_OUTLINE_FILENAME)
except ValueError as e:
    print(f"Error: {e}")
    UNIT_ID = "UNKNOWN_ID"

# Full path to the unit outline file
FULL_PATH_UNIT_OUTLINE = os.path.join(INPUT_UO_DIR, UNIT_OUTLINE_FILENAME)

# Determine which book and output paths to use based on the PROCESS_EPUB flag
if PROCESS_EPUB:
    BOOK_PATH = os.path.join(INPUT_BOOKS_DIR, EPUB_BOOK_FILENAME)
    PRE_EXTRACTED_TOC_JSON_PATH = os.path.join(OUTPUT_PARSED_TOC_DIR, f"{UNIT_ID}_epub_table_of_contents.json")
else:
    BOOK_PATH = os.path.join(INPUT_BOOKS_DIR, PDF_BOOK_FILENAME)
    PRE_EXTRACTED_TOC_JSON_PATH = os.path.join(OUTPUT_PARSED_TOC_DIR, f"{UNIT_ID}_pdf_table_of_contents.json")

# Define paths for the vector database
file_type_suffix = 'epub' if PROCESS_EPUB else 'pdf'
CHROMA_PERSIST_DIR = os.path.join(OUTPUT_DB_DIR, f"chroma_db_toc_guided_chunks_{file_type_suffix}")
CHROMA_COLLECTION_NAME = f"book_toc_guided_chunks_{file_type_suffix}_v2"

# Define path for the parsed unit outline
PARSED_UO_JSON_PATH = os.path.join(OUTPUT_PARSED_UO_DIR, f"{os.path.splitext(UNIT_OUTLINE_FILENAME)[0]}_parsed.json")

# --- Sanity Check Printout ---
print("--- CONFIGURATION SUMMARY ---")
print(f"Processing Mode: {'EPUB' if PROCESS_EPUB else 'PDF'}")
print(f"Unit ID: {UNIT_ID}")
print(f"Unit Outline Path: {FULL_PATH_UNIT_OUTLINE}")
print(f"Book Path: {BOOK_PATH}")
print(f"Parsed UO Output Path: {PARSED_UO_JSON_PATH}")
print(f"Parsed ToC Output Path: {PRE_EXTRACTED_TOC_JSON_PATH}")
print(f"Vector DB Path: {CHROMA_PERSIST_DIR}")
print(f"Vector DB Collection: {CHROMA_COLLECTION_NAME}")
print("--- SETUP COMPLETE ---")

--- CONFIGURATION SUMMARY ---
Processing Mode: EPUB
Unit ID: ICT312
Unit Outline Path: /home/sebas_dev_linux/projects/course_generator/data/UO/ICT312 Digital Forensic_Final.docx
Book Path: /home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub
Parsed UO Output Path: /home/sebas_dev_linux/projects/course_generator/Parse_data/Parse_UO/ICT312 Digital Forensic_Final_parsed.json
Parsed ToC Output Path: /home/sebas_dev_linux/projects/course_generator/Parse_data/Parse_TOC_books/ICT312_epub_table_of_contents.json
Vector DB Path: /home/sebas_dev_linux/projects/course_generator/data/DataBase_Chroma/chroma_db_toc_guided_chunks_epub
Vector DB Collection: book_toc_guided_chunks_epub_v2
--- SETUP COMPLETE ---


# System Prompt

In [12]:
UNIT_OUTLINE_SYSTEM_PROMPT_TEMPLATE = """
You are an expert academic assistant tasked with parsing a university unit outline document and extracting key information into a structured JSON format.

The input will be the raw text content of a unit outline. Your goal is to identify and extract the following details and structure them precisely as specified in the JSON schema below. Note: do not change any key name

**JSON Output Schema:**

```json
{{
  "unitInformation": {{
    "unitCode": "string | null",
    "unitName": "string | null",
    "creditPoints": "integer | null",
    "unitRationale": "string | null",
    "prerequisites": "string | null"
  }},
  "learningOutcomes": [
    "string"
  ],
  "assessments": [
    {{
      "taskName": "string",
      "description": "string",
      "dueWeek": "string | null",
      "weightingPercent": "integer | null",
      "learningOutcomesAssessed": "string | null"
    }}
  ],
  "weeklySchedule": [
    {{
      "week": "string",
      "contentTopic": "string",
      "requiredReading": "string | null"
    }}
  ],
  "requiredReadings": [
    "string"
  ],
  "recommendedReadings": [
    "string"
  ]
}}

Instructions for Extraction:
Unit Information: Locate Unit Code, Unit Name, Credit Points. Capture 'Unit Overview / Rationale' as unitRationale. Identify prerequisites.
Learning Outcomes: Extract each learning outcome statement.
Assessments: Each task as an object. Capture full task name, description, Due Week, Weighting % (number), and Learning Outcomes Assessed.
weeklySchedule: Each week as an object. Capture Week, contentTopic, and requiredReading.
Required and Recommended Readings: List full text for each.
**Important Considerations for the LLM**:
Pay close attention to headings and table structures.
If information is missing, use null for string/integer fields, or an empty list [] for array fields.
Do no change keys in the template given
Ensure the output is ONLY the JSON object, starting with {{{{ and ending with }}}}. No explanations or conversational text before or after the JSON. 
Now, parse the following unit outline text:
--- UNIT_OUTLINE_TEXT_START ---
{outline_text}
--- UNIT_OUTLINE_TEXT_END ---
"""

In [13]:
# Place this in a new cell after your imports, or within Cell 3 before the functions.
# This code is based on the schema from your screenshot on page 4.

from pydantic import BaseModel, Field, ValidationError
from typing import List, Optional
import time

# Define Pydantic models that match your JSON schema
class UnitInformation(BaseModel):
    unitCode: Optional[str] = None
    unitName: Optional[str] = None
    creditPoints: Optional[int] = None
    unitRationale: Optional[str] = None
    prerequisites: Optional[str] = None

class Assessment(BaseModel):
    taskName: str
    description: str
    dueWeek: Optional[str] = None
    weightingPercent: Optional[int] = None
    learningOutcomesAssessed: Optional[str] = None

class WeeklyScheduleItem(BaseModel):
    week: str
    contentTopic: str
    requiredReading: Optional[str] = None

class ParsedUnitOutline(BaseModel):
    unitInformation: UnitInformation
    learningOutcomes: List[str]
    assessments: List[Assessment]
    weeklySchedule: List[WeeklyScheduleItem] 
    requiredReadings: List[str]
    recommendedReadings: List[str]

# Extrac Unit outline details to process following steps - output raw json with UO details 

In [4]:
# Cell 3: Parse Unit Outline


# --- Helper Functions for Parsing ---
def extract_text_from_file(filepath: str) -> str:
    _, ext = os.path.splitext(filepath.lower())
    if ext == '.docx':
        doc = Document(filepath)
        full_text = [p.text for p in doc.paragraphs]
        for table in doc.tables:
            for row in table.rows:
                full_text.append(" | ".join(cell.text for cell in row.cells))
        return '\n'.join(full_text)
    elif ext == '.pdf':
        with pdfplumber.open(filepath) as pdf:
            return "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
    else:
        raise TypeError(f"Unsupported file type: {ext}")

def parse_llm_json_output(content: str) -> dict:
    try:
        match = re.search(r'\{.*\}', content, re.DOTALL)
        if not match: return None
        return json.loads(match.group(0))
    except (json.JSONDecodeError, TypeError):
        return None

@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=10))
def call_ollama_with_retry(client, prompt):
    logger.info(f"Calling Ollama model '{OLLAMA_MODEL}'...")
    response = client.chat(
        model=OLLAMA_MODEL,
        messages=[{"role": "user", "content": prompt}],
        format="json",
        options={"temperature": 0.0}
    )
    if not response or 'message' not in response or not response['message'].get('content'):
        raise ValueError("Ollama returned an empty or invalid response.")
    return response['message']['content']

# --- Main Orchestration Function for this Cell ---
def parse_and_save_outline_robust(
    input_filepath: str, 
    output_filepath: str, 
    prompt_template: str,
    max_retries: int = 3
):
    logger.info(f"Starting to robustly process Unit Outline: {input_filepath}")
    
    if not os.path.exists(input_filepath):
        logger.error(f"Input file not found: {input_filepath}")
        return

    try:
        outline_text = extract_text_from_file(input_filepath)
        if not outline_text.strip():
            logger.error("Extracted text is empty. Aborting.")
            return
    except Exception as e:
        logger.error(f"Failed to extract text from file: {e}", exc_info=True)
        return

    client = ollama.Client(host=OLLAMA_HOST)
    current_prompt = prompt_template.format(outline_text=outline_text)
    
    for attempt in range(max_retries):
        logger.info(f"Attempt {attempt + 1}/{max_retries} to parse outline.")
        
        try:
            # Call the LLM
            llm_output_str = call_ollama_with_retry(client, current_prompt)
            
            # Find the JSON blob in the response
            json_blob = parse_llm_json_output(llm_output_str) # Your existing helper
            if not json_blob:
                raise ValueError("LLM did not return a parsable JSON object.")

            # *** THE KEY VALIDATION STEP ***
            # Try to parse the dictionary into your Pydantic model.
            # This will raise a `ValidationError` if keys are wrong, types are wrong, or fields are missing.
            parsed_data = ParsedUnitOutline.model_validate(json_blob)
            
            # If successful, save the validated data and exit the loop
            logger.info("Successfully validated JSON structure against Pydantic model.")
            os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
            with open(output_filepath, 'w', encoding='utf-8') as f:
                # Use .model_dump_json() for clean, validated output
                f.write(parsed_data.model_dump_json(indent=2)) 

            logger.info(f"Successfully parsed and saved Unit Outline to: {output_filepath}")
            return # Exit function on success

        except ValidationError as e:
            logger.warning(f"Validation failed on attempt {attempt + 1}. Error: {e}")
            # Formulate a new prompt with the error message for self-correction
            error_feedback = (
                f"\n\nYour previous attempt failed. You MUST correct the following errors:\n"
                f"{e}\n\n"
                f"Please regenerate the entire JSON object, ensuring it strictly adheres to the schema "
                f"and corrects these specific errors. Do not change any key names."
            )
            current_prompt = current_prompt + error_feedback # Append the error to the prompt
            
        except Exception as e:
            # Catch other errors like network issues from call_ollama_with_retry
            logger.error(f"An unexpected error occurred on attempt {attempt + 1}: {e}", exc_info=True)
            # You might want to wait before retrying for non-validation errors
            time.sleep(5)

    logger.error(f"Failed to get valid structured data from the LLM after {max_retries} attempts.")


# --- In your execution block, call the new function ---
# parse_and_save_outline(...) becomes:

if EXTRACT_UO:
    parse_and_save_outline_robust(
        input_filepath=FULL_PATH_UNIT_OUTLINE,
        output_filepath=PARSED_UO_JSON_PATH,
        prompt_template=UNIT_OUTLINE_SYSTEM_PROMPT_TEMPLATE
    )

# Extract TOC from epub or epub 

In [14]:
# Cell 4: Extract Book Table of Contents (ToC)
# This cell extracts the ToC from the specified book (EPUB or PDF)
# and saves it to the path defined in Cell 1.

from ebooklib import epub, ITEM_NAVIGATION
from bs4 import BeautifulSoup
import fitz  # PyMuPDF
import json

# --- EPUB Extraction Logic ---
def parse_navpoint(navpoint, level=0):
    # (Your existing parse_navpoint function)
    title = navpoint.navLabel.text.strip()
    # Add filtering logic here if needed
    node = {"level": level, "title": title, "children": []}
    for child_navpoint in navpoint.find_all('navPoint', recursive=False):
        child_node = parse_navpoint(child_navpoint, level + 1)
        if child_node: node["children"].append(child_node)
    return node

def parse_li(li_element, level=0):
    # (Your existing parse_li function)
    a_tag = li_element.find('a')
    if a_tag:
        title = a_tag.get_text(strip=True)
        # Add filtering logic here if needed
        node = {"level": level, "title": title, "children": []}
        nested_ol = li_element.find('ol')
        if nested_ol:
            for sub_li in nested_ol.find_all('li', recursive=False):
                child_node = parse_li(sub_li, level + 1)
                if child_node: node["children"].append(child_node)
        return node
    return None

def extract_epub_toc(epub_path, output_json_path):
    print(f"Processing EPUB ToC for: {epub_path}")
    toc_data = []
    book = epub.read_epub(epub_path)
    for nav_item in book.get_items_of_type(ITEM_NAVIGATION):
        soup = BeautifulSoup(nav_item.get_content(), 'xml')
        if nav_item.get_name().endswith('.ncx'):
            print("INFO: Found EPUB 2 (NCX) Table of Contents.")
            navmap = soup.find('navMap')
            if navmap:
                for navpoint in navmap.find_all('navPoint', recursive=False):
                    node = parse_navpoint(navpoint, level=0)
                    if node: toc_data.append(node)
        else:
            print("INFO: Found EPUB 3 (XHTML) Table of Contents.")
            toc_nav = soup.select_one('nav[epub|type="toc"]')
            if toc_nav:
                top_ol = toc_nav.find('ol')
                if top_ol:
                    for li in top_ol.find_all('li', recursive=False):
                        node = parse_li(li, level=0)
                        if node: toc_data.append(node)
        if toc_data: break
    
    if toc_data:
        os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(toc_data, f, indent=2, ensure_ascii=False)
        print(f"✅ Successfully wrote EPUB ToC to: {output_json_path}")
    else:
        print("❌ WARNING: No ToC data extracted from EPUB.")

# --- PDF Extraction Logic ---
def build_pdf_hierarchy(toc_list):
    """
    Builds a hierarchical structure from a flat ToC list from PyMuPDF.
    MODIFIED: Normalizes levels to start at 0 for consistency with EPUB.
    """
    root = []
    # The parent_stack keys are now level-based, starting from -1 for the root's parent.
    parent_stack = {-1: {"children": root}}

    for level, title, page in toc_list:
        # --- FIX: NORMALIZE LEVEL TO START AT 0 ---
        # fitz/PyMuPDF ToC levels start at 1, so we subtract 1.
        normalized_level = level - 1

        node = {
            "level": normalized_level,
            "title": title.strip(),
            "page": page,
            "children": []
        }

        # Find the correct parent in the stack. The parent's level is one less than the current node's.
        # This logic correctly places the node under its parent in the hierarchy.
        parent_node = parent_stack[normalized_level - 1]
        parent_node["children"].append(node)

        # Add the current node to the stack so it can be a parent for subsequent nodes.
        parent_stack[normalized_level] = node

    return root

def extract_pdf_toc(pdf_path, output_json_path):
    print(f"Processing PDF ToC for: {pdf_path}")
    try:
        doc = fitz.open(pdf_path)
        toc = doc.get_toc()
        if not toc:
            print("❌ WARNING: This PDF has no embedded bookmarks (ToC).")
            hierarchical_toc = []
        else:
            print(f"INFO: Found {len(toc)} bookmark entries.")
            hierarchical_toc = build_pdf_hierarchy(toc)
        
        os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(hierarchical_toc, f, indent=2, ensure_ascii=False)
        print(f"✅ Successfully wrote PDF ToC to: {output_json_path}")
            
    except Exception as e:
        print(f"An error occurred during PDF ToC extraction: {e}")

# --- Execute ToC Extraction ---
if PROCESS_EPUB:
    extract_epub_toc(BOOK_PATH, PRE_EXTRACTED_TOC_JSON_PATH)
else:
    extract_pdf_toc(BOOK_PATH, PRE_EXTRACTED_TOC_JSON_PATH)

Processing EPUB ToC for: /home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub
INFO: Found EPUB 2 (NCX) Table of Contents.
✅ Successfully wrote EPUB ToC to: /home/sebas_dev_linux/projects/course_generator/Parse_data/Parse_TOC_books/ICT312_epub_table_of_contents.json


# Hirachical DB base on TOC

## Process Book

In [15]:
# Cell 5: Create Hierarchical Vector Database (with Sequential ToC ID and Chunk ID)
# This cell processes the book, enriches it with hierarchical and sequential metadata,
# chunks it, and creates the final vector database.

import os
import json
import shutil
import logging
from typing import List, Dict, Any, Tuple
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, UnstructuredEPubLoader
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Setup Logger for this cell
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Helper: Clean metadata values for ChromaDB ---
def clean_metadata_for_chroma(value: Any) -> Any:
    """Sanitizes metadata values to be compatible with ChromaDB."""
    if isinstance(value, list): return ", ".join(map(str, value))
    if isinstance(value, dict): return json.dumps(value)
    if isinstance(value, (str, int, float, bool)) or value is None: return value
    return str(value)

# --- Core Function to Process Book with Pre-extracted ToC ---
def process_book_with_extracted_toc(
    book_path: str,
    extracted_toc_json_path: str,
    chunk_size: int,
    chunk_overlap: int
) -> Tuple[List[Document], List[Dict[str, Any]]]:
    
    logger.info(f"Processing book '{os.path.basename(book_path)}' using ToC from '{os.path.basename(extracted_toc_json_path)}'.")

    # 1. Load the pre-extracted hierarchical ToC
    try:
        with open(extracted_toc_json_path, 'r', encoding='utf-8') as f:
            hierarchical_toc = json.load(f)
        if not hierarchical_toc:
            logger.error(f"Pre-extracted ToC at '{extracted_toc_json_path}' is empty or invalid.")
            return [], []
        logger.info(f"Successfully loaded pre-extracted ToC with {len(hierarchical_toc)} top-level entries.")
    except Exception as e:
        logger.error(f"Error loading pre-extracted ToC JSON: {e}", exc_info=True)
        return [], []

    # 2. Load all text elements/pages from the book
    all_raw_book_docs: List[Document] = []
    _, file_extension = os.path.splitext(book_path.lower())

    if file_extension == ".epub":
        loader = UnstructuredEPubLoader(book_path, mode="elements", strategy="fast")
        try:
            all_raw_book_docs = loader.load()
            logger.info(f"Loaded {len(all_raw_book_docs)} text elements from EPUB.")
        except Exception as e:
            logger.error(f"Error loading EPUB content: {e}", exc_info=True)
            return [], hierarchical_toc
    elif file_extension == ".pdf":
        loader = PyPDFLoader(book_path)
        try:
            all_raw_book_docs = loader.load()
            logger.info(f"Loaded {len(all_raw_book_docs)} pages from PDF.")
        except Exception as e:
            logger.error(f"Error loading PDF content: {e}", exc_info=True)
            return [], hierarchical_toc
    else:
        logger.error(f"Unsupported book file format: {file_extension}")
        return [], hierarchical_toc

    if not all_raw_book_docs:
        logger.error("No text elements/pages loaded from the book.")
        return [], hierarchical_toc

    # 3. Create enriched LangChain Documents by matching ToC to content
    final_documents_with_metadata: List[Document] = []
    
    # Flatten the ToC, AND add a unique sequential ID for sorting and validation.
    flat_toc_entries: List[Dict[str, Any]] = []
    
    def _add_ids_and_flatten_recursive(nodes: List[Dict[str, Any]], current_titles_path: List[str], counter: List[int]):
        """
        Recursively traverses ToC nodes to flatten them and assign a unique, sequential toc_id.
        """
        for node in nodes:
            toc_id = counter[0]
            counter[0] += 1
            title = node.get("title", "").strip()
            if not title: continue
            new_titles_path = current_titles_path + [title]
            entry = {
                "titles_path": new_titles_path,
                "level": node.get("level"),
                "full_title_for_matching": title,
                "toc_id": toc_id
            }
            if "page" in node: entry["page"] = node["page"]
            flat_toc_entries.append(entry)
            if node.get("children"):
                _add_ids_and_flatten_recursive(node.get("children", []), new_titles_path, counter)

    toc_id_counter = [0]
    _add_ids_and_flatten_recursive(hierarchical_toc, [], toc_id_counter)
    logger.info(f"Flattened ToC and assigned sequential IDs to {len(flat_toc_entries)} entries.")

    # Logic for PDF metadata assignment
    if file_extension == ".pdf" and any("page" in entry for entry in flat_toc_entries):
        logger.info("Assigning metadata to PDF pages based on ToC page numbers...")
        flat_toc_entries.sort(key=lambda x: x.get("page", -1) if x.get("page") is not None else -1)
        for page_doc in all_raw_book_docs:
            page_num_0_indexed = page_doc.metadata.get("page", -1)
            page_num_1_indexed = page_num_0_indexed + 1
            assigned_metadata = {"source": os.path.basename(book_path), "page_number": page_num_1_indexed}
            best_match_toc_entry = None
            for toc_entry in flat_toc_entries:
                toc_page = toc_entry.get("page")
                if toc_page is not None and toc_page <= page_num_1_indexed:
                    if best_match_toc_entry is None or toc_page > best_match_toc_entry.get("page", -1):
                        best_match_toc_entry = toc_entry
                elif toc_page is not None and toc_page > page_num_1_indexed:
                    break
            if best_match_toc_entry:
                for i, title_in_path in enumerate(best_match_toc_entry["titles_path"]):
                    assigned_metadata[f"level_{i+1}_title"] = title_in_path
                assigned_metadata['toc_id'] = best_match_toc_entry.get('toc_id')
            else:
                assigned_metadata["level_1_title"] = "Uncategorized PDF Page"
            cleaned_meta = {k: clean_metadata_for_chroma(v) for k, v in assigned_metadata.items()}
            final_documents_with_metadata.append(Document(page_content=page_doc.page_content, metadata=cleaned_meta))

    # Logic for EPUB metadata assignment
    elif file_extension == ".epub":
        logger.info("Assigning metadata to EPUB elements by matching ToC titles in text...")
        toc_titles_for_search = [entry for entry in flat_toc_entries if entry.get("full_title_for_matching")]
        current_hierarchy_metadata = {}
        for element_doc in all_raw_book_docs:
            element_text = element_doc.page_content.strip() if element_doc.page_content else ""
            if not element_text: continue
            for toc_entry in toc_titles_for_search:
                if element_text == toc_entry["full_title_for_matching"]:
                    current_hierarchy_metadata = {"source": os.path.basename(book_path)}
                    for i, title_in_path in enumerate(toc_entry["titles_path"]):
                        current_hierarchy_metadata[f"level_{i+1}_title"] = title_in_path
                    current_hierarchy_metadata['toc_id'] = toc_entry.get('toc_id')
                    if "page" in toc_entry: current_hierarchy_metadata["epub_toc_page"] = toc_entry["page"]
                    break
            if not current_hierarchy_metadata:
                doc_metadata_to_assign = {"source": os.path.basename(book_path), "level_1_title": "EPUB Preamble", "toc_id": -1}
            else:
                doc_metadata_to_assign = current_hierarchy_metadata.copy()
            cleaned_meta = {k: clean_metadata_for_chroma(v) for k, v in doc_metadata_to_assign.items()}
            final_documents_with_metadata.append(Document(page_content=element_text, metadata=cleaned_meta))
    
    else: # Fallback
        final_documents_with_metadata = all_raw_book_docs

    if not final_documents_with_metadata:
        logger.error("No documents were processed or enriched with hierarchical metadata.")
        return [], hierarchical_toc

    logger.info(f"Total documents prepared for chunking: {len(final_documents_with_metadata)}")
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    final_chunks = text_splitter.split_documents(final_documents_with_metadata)
    logger.info(f"Split into {len(final_chunks)} final chunks, inheriting hierarchical metadata.")
    
    # --- MODIFICATION START: Add a unique, sequential chunk_id to each chunk ---
    logger.info("Assigning sequential chunk_id to all final chunks...")
    for i, chunk in enumerate(final_chunks):
        chunk.metadata['chunk_id'] = i
    logger.info(f"Assigned chunk_ids from 0 to {len(final_chunks) - 1}.")
    # --- MODIFICATION END ---

    return final_chunks, hierarchical_toc

# --- Main Execution Block for this Cell ---

if not os.path.exists(PRE_EXTRACTED_TOC_JSON_PATH):
    logger.error(f"CRITICAL: Pre-extracted ToC file not found at '{PRE_EXTRACTED_TOC_JSON_PATH}'.")
    logger.error("Please run the 'Extract Book Table of Contents (ToC)' cell (Cell 4) first.")
else:
    final_chunks_for_db, toc_reloaded = process_book_with_extracted_toc(
        book_path=BOOK_PATH,
        extracted_toc_json_path=PRE_EXTRACTED_TOC_JSON_PATH,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    if final_chunks_for_db:
        if os.path.exists(CHROMA_PERSIST_DIR):
            logger.warning(f"Deleting existing ChromaDB directory: {CHROMA_PERSIST_DIR}")
            shutil.rmtree(CHROMA_PERSIST_DIR)

        logger.info(f"Initializing embedding model '{EMBEDDING_MODEL_OLLAMA}' and creating new vector database...")
        embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
        
        vector_db = Chroma.from_documents(
            documents=final_chunks_for_db,
            embedding=embedding_model,
            persist_directory=CHROMA_PERSIST_DIR,
            collection_name=CHROMA_COLLECTION_NAME
        )
        
        reloaded_db = Chroma(persist_directory=CHROMA_PERSIST_DIR, embedding_function=embedding_model, collection_name=CHROMA_COLLECTION_NAME)
        count = reloaded_db._collection.count()
        
        print("-" * 50)
        logger.info(f"✅ Vector DB created successfully at: {CHROMA_PERSIST_DIR}")
        logger.info(f"✅ Collection '{CHROMA_COLLECTION_NAME}' contains {count} documents.")
        print("-" * 50)
    else:
        logger.error("❌ Failed to generate chunks. Vector DB not created.")

2025-07-01 20:57:57,274 - INFO - Processing book 'Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub' using ToC from 'ICT312_epub_table_of_contents.json'.
2025-07-01 20:57:57,275 - INFO - Successfully loaded pre-extracted ToC with 28 top-level entries.
2025-07-01 20:57:59,343 - INFO - Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.
2025-07-01 20:57:59,344 - INFO - NumExpr defaulting to 16 threads.
  data file translations/en.yaml not found
  data file translations/en.yaml not found


2025-07-01 20:58:10,045 - INFO - Loaded 11815 text elements from EPUB.
2025-07-01 20:58:10,046 - INFO - Flattened ToC and assigned sequential IDs to 877 entries.
2025-07-01 20:58:10,047 - INFO - Assigning metadata to EPUB elements by matching ToC titles in text...
2025-07-01 20:58:10,366 - INFO - Total documents prepared for chunking: 11483
2025-07-01 2

--------------------------------------------------
--------------------------------------------------


### Full Database Health & Hierarchy Diagnostic Report  

In [23]:
# Cell 5.1: Full Database Health & Hierarchy Diagnostic Report (V5 - with Content Preview)

import os
import json
import logging
import random
from typing import List, Dict, Any

# You might need to install pandas if you haven't already
try:
    import pandas as pd
    pandas_available = True
except ImportError:
    pandas_available = False

try:
    from langchain_chroma import Chroma
    from langchain_ollama.embeddings import OllamaEmbeddings
    from langchain_core.documents import Document
    langchain_available = True
except ImportError:
    langchain_available = False

# Setup Logger
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- HELPER FUNCTIONS ---
def print_header(text: str, char: str = "="):
    """Prints a centered header to the console."""
    print("\n" + char * 80)
    print(text.center(80))
    print(char * 80)

def count_total_chunks(node: Dict) -> int:
    """Recursively counts all chunks in a node and its children."""
    total = node.get('_chunks', 0)
    for child_node in node.get('_children', {}).values():
        total += count_total_chunks(child_node)
    return total

def print_hierarchy_report(node: Dict, indent_level: int = 0):
    """
    Recursively prints the reconstructed hierarchy, sorting by sequential ToC ID.
    """
    sorted_children = sorted(
        node.get('_children', {}).items(),
        key=lambda item: item[1].get('_toc_id', float('inf'))
    )
    
    for title, child_node in sorted_children:
        prefix = "  " * indent_level + "|-- "
        total_chunks_in_branch = count_total_chunks(child_node)
        direct_chunks = child_node.get('_chunks', 0)
        toc_id = child_node.get('_toc_id', 'N/A')
        print(f"{prefix}{title} [ID: {toc_id}] (Total Chuck in branch: {total_chunks_in_branch}, Direct Chunk: {direct_chunks})")
        print_hierarchy_report(child_node, indent_level + 1)

def find_testable_sections(node: Dict, path: str, testable_list: List):
    """
    Recursively find sections with a decent number of "direct" chunks to test sequence on.
    """
    if node.get('_chunks', 0) > 10 and not node.get('_children'):
        testable_list.append({
            "path": path,
            "toc_id": node.get('_toc_id'),
            "chunk_count": node.get('_chunks')
        })

    for title, child_node in node.get('_children', {}).items():
        new_path = f"{path} -> {title}" if path else title
        find_testable_sections(child_node, new_path, testable_list)


# --- MODIFIED TEST FUNCTION ---
def verify_chunk_sequence_and_content(vector_store: Chroma, hierarchy_tree: Dict):
    """
    Selects a random ToC section, verifies chunk sequence, and displays the reassembled content.
    """
    print_header("Chunk Sequence & Content Integrity Test", char="-")
    logger.info("Verifying chunk order and reassembling content for a random ToC section.")
    
    # 1. Find a good section to test
    testable_sections = []
    find_testable_sections(hierarchy_tree, "", testable_sections)
    
    if not testable_sections:
        logger.warning("Could not find a suitable section with enough chunks to test. Skipping content test.")
        return

    random_section = random.choice(testable_sections)
    test_toc_id = random_section['toc_id']
    section_title = random_section['path'].split(' -> ')[-1]
    
    logger.info(f"Selected random section for testing: '{random_section['path']}' (toc_id: {test_toc_id})")

    # 2. Retrieve all documents (content + metadata) for that toc_id
    try:
        # Use .get() to retrieve full documents, not just similarity search
        retrieved_data = vector_store.get(
            where={"toc_id": test_toc_id},
            include=["metadatas", "documents"]
        )
        
        # Combine metadatas and documents into LangChain Document objects
        docs = [Document(page_content=doc, metadata=meta) for doc, meta in zip(retrieved_data['documents'], retrieved_data['metadatas'])]

        logger.info(f"Retrieved {len(docs)} document chunks for toc_id {test_toc_id}.")

        if len(docs) < 1:
            logger.warning("No chunks found in the selected section. Skipping.")
            return

        # 3. Sort the documents by chunk_id
        # Handle cases where chunk_id might be missing for robustness
        docs.sort(key=lambda d: d.metadata.get('chunk_id', -1))
        
        chunk_ids = [d.metadata.get('chunk_id') for d in docs]
        if None in chunk_ids:
            logger.error("TEST FAILED: Some retrieved chunks are missing a 'chunk_id'.")
            return

        # 4. Verify sequence
        is_sequential = all(chunk_ids[i] == chunk_ids[i-1] + 1 for i in range(1, len(chunk_ids)))
        
        # 5. Reassemble and print content
        full_content = "\n".join([d.page_content for d in docs])

        print("\n" + "-"*25 + " CONTENT PREVIEW " + "-"*25)
        print(f"Title: {section_title} [toc_id: {test_toc_id}]")
        print(f"Chunk IDs: {chunk_ids}")
        print("-" * 70)
        print(full_content)
        print("-" * 23 + " END CONTENT PREVIEW " + "-"*23 + "\n")
        
        if is_sequential:
            logger.info("✅ TEST PASSED: Chunk IDs for the section are sequential and content is reassembled.")
        else:
            logger.warning("TEST PASSED (with note): Chunk IDs are not perfectly sequential but are in increasing order.")
            logger.warning("This is acceptable. Sorting by chunk_id successfully restored narrative order.")
            
    except Exception as e:
        logger.error(f"TEST FAILED: An error occurred during chunk sequence verification: {e}", exc_info=True)


# --- MAIN DIAGNOSTIC FUNCTION ---
def run_full_diagnostics():
    if not langchain_available:
        logger.error("LangChain components not installed. Skipping diagnostics.")
        return
    if not pandas_available:
        logger.warning("Pandas not installed. Some reports may not be available.")

    print_header("Full Database Health & Hierarchy Diagnostic Report")

    # 1. Connect to the Database
    logger.info("Connecting to the vector database...")
    if not os.path.exists(CHROMA_PERSIST_DIR):
        logger.error(f"FATAL: Chroma DB directory not found at {CHROMA_PERSIST_DIR}.")
        return

    vector_store = Chroma(
        persist_directory=CHROMA_PERSIST_DIR,
        embedding_function=OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA),
        collection_name=CHROMA_COLLECTION_NAME
    )
    logger.info("Successfully connected to the database.")

    # 2. Retrieve ALL Metadata
    total_docs = vector_store._collection.count()
    if total_docs == 0:
        logger.warning("Database is empty. No diagnostics to run.")
        return
    
    logger.info(f"Retrieving metadata for all {total_docs} chunks...")
    metadatas = vector_store.get(limit=total_docs, include=["metadatas"])['metadatas']
    logger.info("Successfully retrieved all metadata.")
    
    # 3. Reconstruct the Hierarchy Tree
    logger.info("Reconstructing hierarchy from chunk metadata...")
    hierarchy_tree = {'_children': {}}
    chunks_without_id = 0

    for meta in metadatas:
        toc_id = meta.get('toc_id')
        if toc_id is None or toc_id == -1:
            chunks_without_id += 1
            node_title = meta.get('level_1_title', 'Orphaned Chunks')
            if node_title not in hierarchy_tree['_children']:
                 hierarchy_tree['_children'][node_title] = {'_children': {}, '_chunks': 0, '_toc_id': float('inf')}
            hierarchy_tree['_children'][node_title]['_chunks'] += 1
            continue
        
        current_node = hierarchy_tree
        for level in range(1, 7):
            level_key = f'level_{level}_title'
            title = meta.get(level_key)
            if not title: break
            if title not in current_node['_children']:
                current_node['_children'][title] = {'_children': {}, '_chunks': 0, '_toc_id': float('inf')}
            current_node = current_node['_children'][title]

        current_node['_chunks'] += 1
        current_node['_toc_id'] = min(current_node['_toc_id'], toc_id)
        
    logger.info("Hierarchy reconstruction complete.")

    # 4. Print Hierarchy Report
    print_header("Reconstructed Hierarchy Report (Book Order)", char="-")
    print_hierarchy_report(hierarchy_tree)
        
    # 5. Run Chunk Sequence and Content Test
    verify_chunk_sequence_and_content(vector_store, hierarchy_tree)
    
    # 6. Final Summary
    print_header("Diagnostic Summary", char="-")
    print(f"Total Chunks in DB: {total_docs}")
    
    if chunks_without_id > 0:
        logger.warning(f"Found {chunks_without_id} chunks MISSING a valid 'toc_id'. Check 'Orphaned' sections.")
    else:
        logger.info("All chunks contain valid 'toc_id' metadata. Sequential integrity is maintained.")

    print_header("Diagnostic Complete")

# --- Execute Diagnostics ---
if 'CHROMA_PERSIST_DIR' in locals() and langchain_available:
    run_full_diagnostics()
else:
    logger.error("Skipping diagnostics: Global variables not defined or LangChain not available.")

2025-07-01 22:02:40,404 - INFO - Connecting to the vector database...
2025-07-01 22:02:40,421 - INFO - Successfully connected to the database.
2025-07-01 22:02:40,458 - INFO - Retrieving metadata for all 11774 chunks...



               Full Database Health & Hierarchy Diagnostic Report               


2025-07-01 22:02:41,229 - INFO - Successfully retrieved all metadata.
2025-07-01 22:02:41,229 - INFO - Reconstructing hierarchy from chunk metadata...
2025-07-01 22:02:41,239 - INFO - Hierarchy reconstruction complete.
2025-07-01 22:02:41,241 - INFO - Verifying chunk order and reassembling content for a random ToC section.
2025-07-01 22:02:41,241 - INFO - Selected random section for testing: 'Chapter 4. Processing Crime and Incident Scenes -> Collecting Evidence in Private-Sector Incident Scenes' (toc_id: 147)
2025-07-01 22:02:41,249 - INFO - Retrieved 24 document chunks for toc_id 147.
2025-07-01 22:02:41,249 - INFO - ✅ TEST PASSED: Chunk IDs for the section are sequential and content is reassembled.



--------------------------------------------------------------------------------
                  Reconstructed Hierarchy Report (Book Order)                   
--------------------------------------------------------------------------------
|-- Preface [ID: 3] (Total Chuck in branch: 10, Direct Chunk: 10)
|-- Introduction [ID: 4] (Total Chuck in branch: 73, Direct Chunk: 73)
|-- About the Authors [ID: 5] (Total Chuck in branch: 5, Direct Chunk: 5)
|-- Acknowledgments [ID: 6] (Total Chuck in branch: 20, Direct Chunk: 20)
|-- Chapter 1. Understanding the Digital Forensics Profession and Investigations [ID: 7] (Total Chuck in branch: 4566, Direct Chunk: 23)
  |-- An Overview of Digital Forensics [ID: 9] (Total Chuck in branch: 60, Direct Chunk: 18)
    |-- Digital Forensics and Other Related Disciplines [ID: 10] (Total Chuck in branch: 18, Direct Chunk: 18)
    |-- A Brief History of Digital Forensics [ID: 11] (Total Chuck in branch: 13, Direct Chunk: 13)
    |-- Understanding Case Law

## Test Data Base for content development

Require Description

In [19]:
# Cell 6: Verify Vector Database (Final Version with Rich Diagnostic Output)

import os
import json
import re
import random
import logging
from typing import List, Dict, Any, Tuple, Optional

# Third-party imports
try:
    from langchain_chroma import Chroma
    from langchain_ollama.embeddings import OllamaEmbeddings
    from langchain_core.documents import Document
    langchain_available = True
except ImportError:
    langchain_available = False

# Setup Logger for this cell
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- HELPER FUNCTIONS ---


def print_results(query_text: str, results: list, where_filter: Optional[Dict] = None):
    """
    Richly prints query results, showing the query, filter, and retrieved documents.
    """
    print("\n" + "-"*10 + " DIAGNOSTIC: RETRIEVAL RESULTS " + "-"*10)
    print(f"QUERY: '{query_text}'")
    if where_filter:
        print(f"FILTER: {json.dumps(where_filter, indent=2)}")
    
    if not results:
        print("--> No documents were retrieved for this query and filter.")
        print("-" * 55)
        return
        
    print(f"--> Found {len(results)} results. Displaying top {min(len(results), 3)}:")
    for i, doc in enumerate(results[:3]):
        print(f"\n[ RESULT {i+1} ]")
        content_preview = doc.page_content.replace('\n', ' ').strip()
        print(f"  Content : '{content_preview[:200]}...'")
        print(f"  Metadata: {json.dumps(doc.metadata, indent=2)}")
    print("-" * 55)


# --- HELPER FUNCTIONS FOR FINDING DATA (UNCHANGED) ---
def find_deep_entry(nodes: List[Dict], current_path: List[str] = []) -> Optional[Tuple[Dict, List[str]]]:
    shuffled_nodes = random.sample(nodes, len(nodes))
    for node in shuffled_nodes:
        if node.get('level', 0) >= 2 and node.get('children'): return node, current_path + [node['title']]
        if node.get('children'):
            path = current_path + [node['title']]
            deep_entry = find_deep_entry(node['children'], path)
            if deep_entry: return deep_entry
    return None

def find_chapter_title_by_number(toc_data: List[Dict], chap_num: int) -> Optional[List[str]]:
    def search_nodes(nodes, num, current_path):
        for node in nodes:
            path = current_path + [node['title']]
            if re.match(rf"(Chapter\s)?{num}[.:\s]", node.get('title', ''), re.IGNORECASE): return path
            if node.get('children'):
                found_path = search_nodes(node['children'], num, path)
                if found_path: return found_path
        return None
    return search_nodes(toc_data, chap_num, [])


# --- ENHANCED TEST CASES with DIAGNOSTIC OUTPUT ---

def basic_retrieval_test(db, outline):
    print_header("Test 1: Basic Retrieval", char="-")
    try:
        logger.info("Goal: Confirm the database is live and contains thematically relevant content.")
        logger.info("Strategy: Perform a simple similarity search using the course's 'unitName'.")
        query_text = outline.get("unitInformation", {}).get("unitName", "introduction")
        
        logger.info(f"Action: Searching for query: '{query_text}'...")
        results = db.similarity_search(query_text, k=1)
        
        print_results(query_text, results) # <--- SHOW THE EVIDENCE
        
        logger.info("Verification: Check if at least one document was returned.")
        assert len(results) > 0, "Basic retrieval query returned no results."
        
        logger.info("✅ Result: TEST 1 PASSED. The database is online and responsive.")
        return True
    except Exception as e:
        logger.error(f"❌ Result: TEST 1 FAILED. Reason: {e}")
        return False

def deep_hierarchy_test(db, toc):
    print_header("Test 2: Deep Hierarchy Retrieval", char="-")
    try:
        logger.info("Goal: Verify that the multi-level hierarchical metadata was ingested correctly.")
        logger.info("Strategy: Find a random, deeply nested sub-section and use a precise filter to retrieve it.")
        deep_entry_result = find_deep_entry(toc)
        assert deep_entry_result, "Could not find a suitable deep entry (level >= 2) to test."
        node, path = deep_entry_result
        query = node['title']
        
        logger.info(f"  - Selected random deep section: {' -> '.join(path)}")
        conditions = [{f"level_{i+1}_title": {"$eq": title}} for i, title in enumerate(path)]
        w_filter = {"$and": conditions}
        
        logger.info("Action: Performing a similarity search with a highly specific '$and' filter.")
        results = db.similarity_search(query, k=1, filter=w_filter)
        
        print_results(query, results, w_filter) # <--- SHOW THE EVIDENCE
        
        logger.info("Verification: Check if the precisely filtered query returned any documents.")
        assert len(results) > 0, "Deeply filtered query returned no results."

        logger.info("✅ Result: TEST 2 PASSED. Hierarchical metadata is structured correctly.")
        return True
    except Exception as e:
        logger.error(f"❌ Result: TEST 2 FAILED. Reason: {e}")
        return False

def advanced_alignment_test(db, outline, toc):
    print_header("Test 3: Advanced Unit Outline Alignment", char="-")
    try:
        logger.info("Goal: Ensure a weekly topic from the syllabus can be mapped to the correct textbook chapter(s).")
        logger.info("Strategy: Pick a random week, find its chapter, and query for the topic filtered by that chapter.")
        week_to_test = random.choice(outline['weeklySchedule'])
        logger.info(f"  - Selected random week: Week {week_to_test['week']} - '{week_to_test['contentTopic']}'")

        reading = week_to_test.get('requiredReading', '')
        chap_nums_str = re.findall(r'\d+', reading)
        assert chap_nums_str, f"Could not find chapter numbers in required reading: '{reading}'"
        logger.info(f"  - Extracted required chapter number(s): {chap_nums_str}")

        chapter_paths = [find_chapter_title_by_number(toc, int(n)) for n in chap_nums_str]
        chapter_paths = [path for path in chapter_paths if path is not None]
        assert chapter_paths, f"Could not map chapter numbers {chap_nums_str} to a valid ToC path."
        
        level_1_titles = list(set([path[0] for path in chapter_paths]))
        logger.info(f"  - Mapped to top-level ToC entries: {level_1_titles}")

        or_filter = [{"level_1_title": {"$eq": title}} for title in level_1_titles]
        w_filter = {"$or": or_filter} if len(or_filter) > 1 else or_filter[0]
        query = week_to_test['contentTopic']
        
        logger.info("Action: Searching for the weekly topic, filtered by the mapped chapter(s).")
        results = db.similarity_search(query, k=5, filter=w_filter)
        
        print_results(query, results, w_filter) # <--- SHOW THE EVIDENCE
        
        logger.info("Verification: Check if at least one returned document is from the correct chapter.")
        assert len(results) > 0, "Alignment query returned no results for the correct section/chapter."
        
        logger.info("✅ Result: TEST 3 PASSED. The syllabus can be reliably aligned with the textbook content.")
        return True
    except Exception as e:
        logger.error(f"❌ Result: TEST 3 FAILED. Reason: {e}")
        return False

def content_sequence_test(db, outline):
    print_header("Test 4: Content Sequence Verification", char="-")
    try:
        logger.info("Goal: Confirm that chunks for a topic can be re-ordered to form a coherent narrative.")
        logger.info("Strategy: Retrieve several chunks for a random topic and verify their 'chunk_id' is sequential.")
        topic_query = random.choice(outline['weeklySchedule'])['contentTopic']
        
        logger.info(f"Action: Performing similarity search for topic: '{topic_query}' to get a set of chunks.")
        results = db.similarity_search(topic_query, k=10)
        
        print_results(topic_query, results) # <--- SHOW THE EVIDENCE
        
        docs_with_id = [doc for doc in results if 'chunk_id' in doc.metadata]
        assert len(docs_with_id) > 3, "Fewer than 4 retrieved chunks have a 'chunk_id' to test."
        
        chunk_ids = [doc.metadata['chunk_id'] for doc in docs_with_id]
        sorted_ids = sorted(chunk_ids)
        
        logger.info(f"  - Retrieved and sorted chunk IDs: {sorted_ids}")
        logger.info("Verification: Check if the sorted list of chunk_ids is strictly increasing.")
        is_ordered = all(sorted_ids[i] >= sorted_ids[i-1] for i in range(1, len(sorted_ids)))
        assert is_ordered, "The retrieved chunks' chunk_ids are not in ascending order when sorted."

        logger.info("✅ Result: TEST 4 PASSED. Narrative order can be reconstructed using 'chunk_id'.")
        return True
    except Exception as e:
        logger.error(f"❌ Result: TEST 4 FAILED. Reason: {e}")
        return False

# --- MAIN VERIFICATION EXECUTION ---
def run_verification():
    print_header("Database Verification Process")
    
    if not langchain_available:
        logger.error("LangChain libraries not found. Aborting tests.")
        return

    required_files = {
        "Chroma DB": CHROMA_PERSIST_DIR,
        "ToC JSON": PRE_EXTRACTED_TOC_JSON_PATH,
        "Parsed Outline": PARSED_UO_JSON_PATH
    }
    for name, path in required_files.items():
        if not os.path.exists(path):
            logger.error(f"Required '{name}' not found at '{path}'. Please run previous cells.")
            return

    with open(PRE_EXTRACTED_TOC_JSON_PATH, 'r', encoding='utf-8') as f:
        toc_data = json.load(f)
    with open(PARSED_UO_JSON_PATH, 'r', encoding='utf-8') as f:
        unit_outline_data = json.load(f)

    logger.info("Connecting to DB and initializing components...")
    embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
    vector_store = Chroma(
        persist_directory=CHROMA_PERSIST_DIR,
        embedding_function=embeddings,
        collection_name=CHROMA_COLLECTION_NAME
    )
    
    results_summary = [
        basic_retrieval_test(vector_store, unit_outline_data),
        deep_hierarchy_test(vector_store, toc_data),
        advanced_alignment_test(vector_store, unit_outline_data, toc_data),
        content_sequence_test(vector_store, unit_outline_data)
    ]

    passed_count = sum(filter(None, results_summary))
    failed_count = len(results_summary) - passed_count
    
    print_header("Verification Summary")
    print(f"Total Tests Run: {len(results_summary)}")
    print(f"✅ Passed: {passed_count}")
    print(f"❌ Failed: {failed_count}")
    print_header("Verification Complete", char="=")

# --- Execute Verification ---
# Assumes global variables from Cell 1 are available in the notebook's scope
run_verification()

2025-07-01 21:02:48,736 - INFO - Connecting to DB and initializing components...
2025-07-01 21:02:48,746 - INFO - Goal: Confirm the database is live and contains thematically relevant content.
2025-07-01 21:02:48,746 - INFO - Strategy: Perform a simple similarity search using the course's 'unitName'.
2025-07-01 21:02:48,747 - INFO - Action: Searching for query: 'Digital Forensic'...
2025-07-01 21:02:48,814 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-07-01 21:02:48,818 - INFO - Verification: Check if at least one document was returned.
2025-07-01 21:02:48,818 - INFO - ✅ Result: TEST 1 PASSED. The database is online and responsive.
2025-07-01 21:02:48,819 - INFO - Goal: Verify that the multi-level hierarchical metadata was ingested correctly.
2025-07-01 21:02:48,819 - INFO - Strategy: Find a random, deeply nested sub-section and use a precise filter to retrieve it.
2025-07-01 21:02:48,820 - INFO -   - Selected random deep section: Chapter 6. Curren


                         Database Verification Process                          

--------------------------------------------------------------------------------
                            Test 1: Basic Retrieval                             
--------------------------------------------------------------------------------

---------- DIAGNOSTIC: RETRIEVAL RESULTS ----------
QUERY: 'Digital Forensic'
--> Found 1 results. Displaying top 1:

[ RESULT 1 ]
  Content : 'An Overview of Digital Forensics...'
  Metadata: {
  "chunk_id": 156,
  "source": "Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub",
  "level_1_title": "Chapter 1. Understanding the Digital Forensics Profession and Investigations",
  "toc_id": 9,
  "level_2_title": "An Overview of Digital Forensics"
}
-------------------------------------------------------

------------------------------------------------------------

2025-07-01 21:02:48,953 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-07-01 21:02:48,965 - INFO - Verification: Check if the precisely filtered query returned any documents.
2025-07-01 21:02:48,965 - INFO - ✅ Result: TEST 2 PASSED. Hierarchical metadata is structured correctly.
2025-07-01 21:02:48,966 - INFO - Goal: Ensure a weekly topic from the syllabus can be mapped to the correct textbook chapter(s).
2025-07-01 21:02:48,966 - INFO - Strategy: Pick a random week, find its chapter, and query for the topic filtered by that chapter.
2025-07-01 21:02:48,967 - INFO -   - Selected random week: Week Week 9 - 'Email and Social Media.'
2025-07-01 21:02:48,967 - INFO -   - Extracted required chapter number(s): ['2019', '978', '1', '337', '56894', '4', '11']
2025-07-01 21:02:48,970 - INFO -   - Mapped to top-level ToC entries: ['Chapter 11. E-mail and Social Media Investigations', 'Chapter 4. Processing Crime and Incident Scenes', 'Chapter 1. Understanding


---------- DIAGNOSTIC: RETRIEVAL RESULTS ----------
QUERY: 'Forensic Workstations'
FILTER: {
  "$and": [
    {
      "level_1_title": {
        "$eq": "Chapter 6. Current Digital Forensics Tools"
      }
    },
    {
      "level_2_title": {
        "$eq": "Digital Forensics Hardware Tools"
      }
    },
    {
      "level_3_title": {
        "$eq": "Forensic Workstations"
      }
    }
  ]
}
--> Found 1 results. Displaying top 1:

[ RESULT 1 ]
  Content : 'Forensic Workstations...'
  Metadata: {
  "level_3_title": "Forensic Workstations",
  "level_2_title": "Digital Forensics Hardware Tools",
  "toc_id": 255,
  "level_1_title": "Chapter 6. Current Digital Forensics Tools",
  "source": "Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub",
  "chunk_id": 3311
}
-------------------------------------------------------

-----------------------------------------------------------------

2025-07-01 21:02:49,217 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-07-01 21:02:49,220 - INFO -   - Retrieved and sorted chunk IDs: [49, 3138, 3141, 3160, 3164, 3166, 3267, 3271, 3308, 9541]
2025-07-01 21:02:49,221 - INFO - Verification: Check if the sorted list of chunk_ids is strictly increasing.
2025-07-01 21:02:49,221 - INFO - ✅ Result: TEST 4 PASSED. Narrative order can be reconstructed using 'chunk_id'.



---------- DIAGNOSTIC: RETRIEVAL RESULTS ----------
QUERY: 'Current Computer Forensics Tools.'
--> Found 10 results. Displaying top 3:

[ RESULT 1 ]
  Content : 'Chapter 6. Current Digital Forensics Tools...'
  Metadata: {
  "toc_id": 231,
  "level_1_title": "Chapter 6. Current Digital Forensics Tools",
  "source": "Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub",
  "chunk_id": 3138
}

[ RESULT 2 ]
  Content : 'Chapter 6. Current Digital Forensics Tools...'
  Metadata: {
  "level_1_title": "Chapter 6. Current Digital Forensics Tools",
  "chunk_id": 9541,
  "source": "Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub",
  "toc_id": 231
}

[ RESULT 3 ]
  Content : 'Software Forensics Tools...'
  Metadata: {
  "level_3_title": "Types of Digital Forensics Tools",
  "level_4_ti

#  Content Generation

## Planning Agent 

In [None]:
# Cell 7: The Data-Driven Planning Agent (Final Hierarchical Version✅)

import os
import json
import re
import math
import logging
from typing import List, Dict, Any, Optional

# Setup Logger and LangChain components
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
try:
    from langchain_chroma import Chroma
    from langchain_ollama.embeddings import OllamaEmbeddings
    langchain_available = True
except ImportError:
    langchain_available = False

def print_header(text: str, char: str = "="):
    """Prints a centered header to the console."""
    print("\n" + char * 80)
    print(text.center(80))
    print(char * 80)

class PlanningAgent:
    """
    An agent that creates a hierarchical content plan, adaptively partitions content
    into distinct lecture decks, and allocates presentation time.
    """
    def __init__(self, master_config: Dict, vector_store: Optional[Any] = None):
        self.config = master_config['processed_settings']
        self.unit_outline = master_config['unit_outline']
        self.book_toc = master_config['book_toc']
        self.flat_toc_with_ids = self._create_flat_toc_with_ids()
        self.vector_store = vector_store
        logger.info("Data-Driven PlanningAgent initialized successfully.")

    def _create_flat_toc_with_ids(self) -> List[Dict]:
        """Creates a flattened list of the ToC for easy metadata lookup."""
        flat_list = []
        def flatten_recursive(nodes, counter):
            for node in nodes:
                node_id = counter[0]; counter[0] += 1
                flat_list.append({'toc_id': node_id, 'title': node.get('title', ''), 'node': node})
                if node.get('children'):
                    flatten_recursive(node.get('children'), counter)
        flatten_recursive(self.book_toc, [0])
        return flat_list

    def _identify_relevant_chapters(self, weekly_schedule_item: Dict) -> List[int]:
        """Extracts chapter numbers precisely from the 'requiredReading' string."""
        reading_str = weekly_schedule_item.get('requiredReading', '')
        match = re.search(r'Chapter(s)?', reading_str, re.IGNORECASE)
        if not match: return []
        search_area = reading_str[match.start():]
        chap_nums_str = re.findall(r'\d+', search_area)
        if chap_nums_str:
            return sorted(list(set(int(n) for n in chap_nums_str)))
        return []

    def _find_chapter_node(self, chapter_number: int) -> Optional[Dict]:
        """Finds the ToC node for a specific chapter number."""
        for item in self.flat_toc_with_ids:
            if re.match(rf"Chapter\s{chapter_number}(?:\D|$)", item['title']):
                return item['node']
        return None

    def _build_topic_plan_tree(self, toc_node: Dict) -> Dict:
        """
        Recursively builds a hierarchical plan tree from any ToC node,
        annotating it with direct and total branch chunk counts.
        """
        node_metadata = next((item for item in self.flat_toc_with_ids if item['node'] is toc_node), None)
        if not node_metadata: return {}

        retrieved_docs = self.vector_store.get(where={'toc_id': node_metadata['toc_id']})
        direct_chunk_count = len(retrieved_docs.get('ids', []))

        plan_node = {
            "title": node_metadata['title'],
            "toc_id": node_metadata['toc_id'],
            "chunk_count": direct_chunk_count,
            "total_chunks_in_branch": 0,
            "slides_allocated": 0,
            "children": []
        }

        child_branch_total = 0
        for child_node in toc_node.get('children', []):
            if any(ex in child_node.get('title', '').lower() for ex in ["review", "introduction", "summary", "key terms"]):
                continue
            child_plan_node = self._build_topic_plan_tree(child_node)
            if child_plan_node:
                plan_node['children'].append(child_plan_node)
                child_branch_total += child_plan_node.get('total_chunks_in_branch', 0)
        
        plan_node['total_chunks_in_branch'] = direct_chunk_count + child_branch_total
        return plan_node
    
    def _allocate_slides_to_tree(self, plan_tree: Dict, content_slides_budget: int):
        """Performs a two-pass safety-net allocation on a hierarchical plan tree."""
        leaf_nodes = []
        def find_leaves(node):
            if not node.get('children'):
                leaf_nodes.append(node)
            for child in node.get('children', []):
                find_leaves(child)
        find_leaves(plan_tree)

        if not leaf_nodes or content_slides_budget <= 0: return plan_tree

        # Pass 1: Safety Net
        slides_per_topic = 1 if content_slides_budget >= len(leaf_nodes) else 0
        for node in leaf_nodes:
            node['slides_allocated'] = slides_per_topic
        
        remaining_budget = content_slides_budget - (len(leaf_nodes) * slides_per_topic)

        # Pass 2: Proportional Distribution
        if remaining_budget > 0:
            total_leaf_chunks = sum(node['chunk_count'] for node in leaf_nodes)
            if total_leaf_chunks > 0:
                # Distribute remaining slides based on chunk weight
                for node in leaf_nodes:
                    node['slides_allocated'] += round((node['chunk_count'] / total_leaf_chunks) * remaining_budget)
        
        # Pass 3: Sum totals upwards
        def sum_slides_upwards(node):
            if not node.get('children'):
                return node['slides_allocated']
            node['slides_allocated'] = sum(sum_slides_upwards(child) for child in node['children'])
            return node['slides_allocated']
        sum_slides_upwards(plan_tree)
        return plan_tree

    def create_content_plan_for_week(self, week_number: int) -> Optional[Dict]:
        """Orchestrates the adaptive planning and partitioning process."""
        print_header(f"Planning Week {week_number}", char="*")
        
        weekly_schedule_item = self.unit_outline['weeklySchedule'][week_number - 1]
        chapter_numbers = self._identify_relevant_chapters(weekly_schedule_item)
        if not chapter_numbers: return None

        num_decks = self.config['week_session_setup'].get('sessions_per_week', 1)
        
        # 1. Build a full plan tree for each chapter to get its weight.
        chapter_plan_trees = [self._build_topic_plan_tree(self._find_chapter_node(cn)) for cn in chapter_numbers if self._find_chapter_node(cn)]
        total_weekly_chunks = sum(tree.get('total_chunks_in_branch', 0) for tree in chapter_plan_trees)

        # 2. NEW: Adaptive Partitioning Strategy
        partitionable_units = []
        num_chapters = len(chapter_plan_trees)
        
        if num_chapters >= num_decks:
            logger.info(f"Partitioning strategy: Distributing {num_chapters} whole chapters across {num_decks} decks.")
            partitionable_units = chapter_plan_trees
        else:
            logger.info(f"Partitioning strategy: Splitting sub-topics from {num_chapters} chapter(s) across {num_decks} decks.")
            for chapter_tree in chapter_plan_trees:
                partitionable_units.extend(chapter_tree.get('children', []))
        
        # 3. Partition the chosen units into decks using a bin-packing algorithm
        decks = [[] for _ in range(num_decks)]
        deck_weights = [0] * num_decks
        sorted_units = sorted(partitionable_units, key=lambda x: x.get('total_chunks_in_branch', 0), reverse=True)
        
        for unit in sorted_units:
            lightest_deck_index = deck_weights.index(min(deck_weights))
            decks[lightest_deck_index].append(unit)
            deck_weights[lightest_deck_index] += unit.get('total_chunks_in_branch', 0)

        # 4. Plan each deck
        content_slides_per_week = self.config['slide_count_strategy'].get('target', 25)
        final_deck_plans = []
        for i, deck_content_trees in enumerate(decks):
            deck_number = i + 1
            deck_chunk_weight = sum(tree.get('total_chunks_in_branch', 0) for tree in deck_content_trees)
            deck_slide_budget = round((deck_chunk_weight / total_weekly_chunks) * content_slides_per_week) if total_weekly_chunks > 0 else 0

            logger.info(f"--- Planning Deck {deck_number}/{num_decks} | Topics: {[t['title'] for t in deck_content_trees]} | Weight: {deck_chunk_weight} chunks | Slide Budget: {deck_slide_budget} ---")
            
            # The allocation function is recursive and works on any tree or sub-tree
            planned_content = [self._allocate_slides_to_tree(tree, round(deck_slide_budget * (tree.get('total_chunks_in_branch', 0) / deck_chunk_weight))) if deck_chunk_weight > 0 else tree for tree in deck_content_trees]
            
            final_deck_plans.append({
                "deck_number": deck_number,
                "deck_title": f"{self.config.get('unit_name', 'Course')} - Week {week_number}, Lecture {deck_number}",
                "session_content": planned_content
            })

        return {
            "week": week_number,
            "overall_topic": weekly_schedule_item.get('contentTopic'),
            "deck_plans": final_deck_plans
        }

# --- EXECUTION BLOCK (No Changes Needed Here) ---
logger.info("--- Initializing Data-Driven Planning Agent Test ---")

if langchain_available:
    logger.info("Connecting to ChromaDB for the Planning Agent...")
    try:
        # NOTE: Ensure these global variables are defined in your notebook from Cell 1
        # CHROMA_PERSIST_DIR, EMBEDDING_MODEL_OLLAMA, CHROMA_COLLECTION_NAME
        # CONFIG_DIR, PRE_EXTRACTED_TOC_JSON_PATH, PARSED_UO_JSON_PATH, PROJECT_BASE_DIR
        
        vector_store = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA),
            collection_name=CHROMA_COLLECTION_NAME
        )
        logger.info("Database connection successful.")

        logger.info("Loading configuration files...")
        with open(os.path.join(CONFIG_DIR, "processed_settings.json"), 'r') as f:
            processed_settings = json.load(f)
        with open(PRE_EXTRACTED_TOC_JSON_PATH, 'r') as f:
            book_toc = json.load(f)
        with open(PARSED_UO_JSON_PATH, 'r') as f:
            unit_outline = json.load(f)
        logger.info("Configuration files loaded.")

        master_config_from_file = {
            "processed_settings": processed_settings,
            "unit_outline": unit_outline,
            "book_toc": book_toc
        }

        planning_agent = PlanningAgent(master_config_from_file, vector_store=vector_store)

        WEEK_TO_TEST = 7 # Changed to Week 6 to match your example
        logger.info(f"--> Explicitly testing planning logic for Week {WEEK_TO_TEST}")
        content_plan = planning_agent.create_content_plan_for_week(WEEK_TO_TEST)

        if content_plan:
            print("\n--- Generated Content Plan (Hierarchical & Partitioned) ---")
            print(json.dumps(content_plan, indent=2))

            PLAN_OUTPUT_DIR = os.path.join(PROJECT_BASE_DIR, "generated_plans")
            os.makedirs(PLAN_OUTPUT_DIR, exist_ok=True)
            plan_filename = f"{processed_settings.get('course_id', 'COURSE')}_Week{WEEK_TO_TEST}_plan.json"
            plan_filepath = os.path.join(PLAN_OUTPUT_DIR, plan_filename)
            with open(plan_filepath, 'w') as f:
                json.dump(content_plan, f, indent=2)
            logger.info(f"\nSuccessfully saved content plan for Week {WEEK_TO_TEST} to: {plan_filepath}")
        else:
            logger.error(f"Failed to generate content plan for Week {WEEK_TO_TEST}.")

    except Exception as e:
        logger.error(f"An error occurred during the planning process: {e}", exc_info=True)

else:
    logger.error("LangChain/Chroma libraries not found. Cannot run the Planning Agent.")

2025-07-02 11:07:25,436 - INFO - --- Initializing Data-Driven Planning Agent Test ---
2025-07-02 11:07:25,438 - INFO - Connecting to ChromaDB for the Planning Agent...
2025-07-02 11:07:25,448 - INFO - Database connection successful.
2025-07-02 11:07:25,449 - INFO - Loading configuration files...
2025-07-02 11:07:25,451 - INFO - Configuration files loaded.
2025-07-02 11:07:25,452 - INFO - Data-Driven PlanningAgent initialized successfully.
2025-07-02 11:07:25,453 - INFO - --> Explicitly testing planning logic for Week 7
2025-07-02 11:07:25,508 - INFO - Partitioning strategy: Splitting sub-topics from 2 chapter(s) across 4 decks.
2025-07-02 11:07:25,508 - INFO - --- Planning Deck 1/4 | Topics: ['Examining Linux File Structures'] | Weight: 131 chunks | Slide Budget: 6 ---
2025-07-02 11:07:25,508 - INFO - --- Planning Deck 2/4 | Topics: ['Understanding Data Compression', 'Understanding Copyright Issues with Graphics'] | Weight: 120 chunks | Slide Budget: 6 ---
2025-07-02 11:07:25,509 - INF


********************************************************************************
                                Planning Week 7                                 
********************************************************************************

--- Generated Content Plan (Hierarchical & Partitioned) ---
{
  "week": 7,
  "overall_topic": "Linux Boot Processes and File Systems. Recovering Graphics Files.",
  "deck_plans": [
    {
      "deck_number": 1,
      "deck_title": "Digital Forensic - Week 7, Lecture 1",
      "session_content": [
        {
          "title": "Examining Linux File Structures",
          "toc_id": 272,
          "chunk_count": 77,
          "total_chunks_in_branch": 131,
          "slides_allocated": 6,
          "children": [
            {
              "title": "File Structures in Ext4",
              "toc_id": 273,
              "chunk_count": 8,
              "total_chunks_in_branch": 54,
              "slides_allocated": 6,
              "children": [
       

In [None]:
# Cell 8: The Data-Driven Planning Agent (Old version🔴)

import os
import json
import re
import math
import logging
from typing import List, Dict, Any, Optional

# Setup Logger and LangChain components
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
try:
    from langchain_chroma import Chroma
    from langchain_ollama.embeddings import OllamaEmbeddings
    langchain_available = True
except ImportError:
    langchain_available = False

class PlanningAgent:
    """
    An agent that creates a content plan driven by the syllabus and uses
    data-driven metrics (chunk counts) to allocate presentation time.
    """
    def __init__(self, master_config: Dict, vector_store: Optional[Any] = None):
        self.config = master_config['processed_settings']
        self.unit_outline = master_config['unit_outline']
        self.book_toc = master_config['book_toc']
        self.flat_toc_with_ids = self._create_flat_toc_with_ids()
        # The agent now requires access to the vector store to get chunk counts
        self.vector_store = vector_store
        logger.info("Data-Driven PlanningAgent initialized successfully.")

    def _create_flat_toc_with_ids(self) -> List[Dict]:
        """Creates a flattened list of the ToC with a unique sequential ID for each node."""
        flat_list = []
        def flatten_recursive(nodes, counter):
            for node in nodes:
                node_id = counter[0]; counter[0] += 1
                flat_list.append({'toc_id': node_id, 'title': node['title'], 'node': node})
                if node.get('children'):
                    flatten_recursive(node.get('children'), counter)
        flatten_recursive(self.book_toc, [0])
        return flat_list

    def _identify_relevant_chapters(self, weekly_schedule_item: Dict) -> List[int]:
        """Extracts chapter numbers precisely from the 'requiredReading' string."""
        reading_str = weekly_schedule_item.get('requiredReading', '')
        match = re.search(r'Chapter(s)?', reading_str, re.IGNORECASE)
        if not match: return []
        search_area = reading_str[match.start():]
        chap_nums_str = re.findall(r'\d+', search_area)
        if chap_nums_str:
            return sorted(list(set(int(n) for n in chap_nums_str)))
        return []

    def _find_chapter_node(self, chapter_number: int) -> Optional[Dict]:
        """Accurately finds the ToC node for a specific chapter number."""
        for item in self.flat_toc_with_ids:
            title = item['title']
            if re.match(rf"Chapter\s{chapter_number}(?:\D|$)", title):
                return item['node']
        return None

    def _get_chapter_sub_topics(self, chapter_node: Dict) -> List[Dict]:
        """
        Recursively traverses a chapter node to find all teachable sub-topics (nodes that have no children).
        This ensures we plan based on the most granular topics available.
        """
        sub_topics = []
        
        def find_leaf_nodes_recursive(node: Dict):
            # A "leaf" or "teachable topic" is a node that has no further sub-topics (children).
            is_leaf_node = not node.get('children')

            # Exclude high-level container topics that are not granular enough to teach directly.
            # You can customize this list.
            excluded_titles = ["review", "introduction", "summary", "key terms"]
            title_lower = node.get('title', '').lower()
            if any(excluded in title_lower for excluded in excluded_titles):
                return

            if is_leaf_node:
                # Find the full metadata for this node from our flattened ToC list
                matched_item = next((item for item in self.flat_toc_with_ids if item['node'] is node), None)
                if matched_item:
                    sub_topics.append({
                        "title": matched_item['title'], 
                        "toc_id": matched_item['toc_id']
                    })
            else:
                # If it's not a leaf, continue diving into its children.
                for child_node in node.get('children', []):
                    find_leaf_nodes_recursive(child_node)

        # Start the recursive search from the main chapter node.
        find_leaf_nodes_recursive(chapter_node)
        
        return sub_topics

    def _build_topic_plan_tree(self, chapter_node: Dict) -> Dict:
        """
        Recursively builds a hierarchical plan tree from a ToC chapter node,
        annotating each node with its direct and total branch chunk counts.
        """
        # Find the full metadata for this node from our flattened ToC list
        node_metadata = next((item for item in self.flat_toc_with_ids if item['node'] is chapter_node), None)
        if not node_metadata:
            return {}

        # Get the chunk count for THIS specific level (Direct Chunks)
        retrieved_docs = self.vector_store.get(where={'toc_id': node_metadata['toc_id']})
        direct_chunk_count = len(retrieved_docs.get('ids', []))

        plan_node = {
            "title": node_metadata['title'],
            "toc_id": node_metadata['toc_id'],
            "chunk_count": direct_chunk_count,
            "total_chunks_in_branch": 0, # Initialize the new field
            "slides_allocated": 0,
            "children": []
        }

        # --- RECURSION and SUMMATION LOGIC ---
        
        # 1. Recursively build all children for the current node
        child_branch_total = 0
        for child_node in chapter_node.get('children', []):
            # Exclude non-teachable summary sections
            title_lower = child_node.get('title', '').lower()
            if any(excluded in title_lower for excluded in ["review", "introduction", "summary", "key terms"]):
                continue
                
            child_plan_node = self._build_topic_plan_tree(child_node)
            if child_plan_node:
                plan_node['children'].append(child_plan_node)
                # 2. Sum the total branch count from the returned children
                child_branch_total += child_plan_node.get('total_chunks_in_branch', 0)
        
        # 3. The total for this node is its direct chunks + the sum of its children's branches.
        plan_node['total_chunks_in_branch'] = direct_chunk_count + child_branch_total
        
        return plan_node
    
    def _allocate_slides_to_tree(self, plan_tree: Dict, content_slides_budget: int):
        """
        Performs a two-pass "safety net" allocation on a hierarchical plan tree.
        """
        leaf_nodes = []
        def find_leaves(node):
            if not node.get('children'):
                leaf_nodes.append(node)
            for child in node.get('children', []):
                find_leaves(child)
        
        find_leaves(plan_tree)

        if not leaf_nodes:
            return plan_tree # No teachable topics found

        logger.info(f"Allocating a budget of {content_slides_budget} slides across {len(leaf_nodes)} granular topics.")
        
        # --- Pass 1: Safety Net Allocation ---
        # Give every leaf topic 1 slide to ensure it's covered.
        for node in leaf_nodes:
            node['slides_allocated'] = 1
        
        remaining_budget = content_slides_budget - len(leaf_nodes)

        # --- Pass 2: Proportional Distribution of Remainder ---
        if remaining_budget > 0:
            # Calculate total chunks ONLY among the leaf nodes that will receive more slides
            total_leaf_chunks = sum(node['chunk_count'] for node in leaf_nodes)

            if total_leaf_chunks > 0:
                for node in leaf_nodes:
                    proportion = node['chunk_count'] / total_leaf_chunks
                    additional_slides = round(proportion * remaining_budget)
                    node['slides_allocated'] += additional_slides
        
        # --- Pass 3: Sum totals up the tree ---
        def sum_slides_upwards(node):
            if not node.get('children'):
                return node['slides_allocated']
            
            child_slide_total = sum(sum_slides_upwards(child) for child in node['children'])
            node['slides_allocated'] = child_slide_total
            return child_slide_total

        sum_slides_upwards(plan_tree)
        return plan_tree
    
    
    
    # --- NEW DATA-DRIVEN ALLOCATION METHOD ---
    def _allocate_slides_by_chunk_count(self, topics: List[Dict], 
                                    content_slides_budget: int) -> List[Dict]:
        """
        Allocates slides proportionally based on the number of chunks 
        associated with each topic.
        """
        if not self.vector_store:
            logger.error("Vector store not available. Cannot perform chunk count allocation. Falling back to even distribution.")
            # Fallback logic: Distribute slides evenly if DB is not available
            num_topics = len(topics)
            if num_topics == 0: return []
            slides_per_topic = content_slides_budget // num_topics
            for topic in topics:
                topic['chunk_count'] = 0
                topic['slides_allocated'] = slides_per_topic
            return topics

        logger.info(f"Allocating a budget of {content_slides_budget} content slides across {len(topics)} topics...")
    
        # Step 1: Get chunk counts (your existing logic here is now correct)
        total_chunks_in_session = 0
        for topic in topics:
            retrieved_docs = self.vector_store.get(where={'toc_id': topic['toc_id']})
            count = len(retrieved_docs.get('ids', []))
            topic['chunk_count'] = count
            total_chunks_in_session += count
            
        logger.info(f"Found a total of {total_chunks_in_session} chunks for this session's topics.")

        if total_chunks_in_session == 0:
            logger.warning("No chunks found for any topics. Distributing slides evenly.")
            slides_per_topic = content_slides_budget // len(topics) if topics else 0
            for topic in topics:
                topic['slides_allocated'] = slides_per_topic
            return topics

        # Step 2: Allocate slides proportionally using the content-only budget
        slides_allocated_so_far = 0
        for topic in topics:
            proportion = topic['chunk_count'] / total_chunks_in_session
            # Use round() for a more balanced initial allocation instead of floor()
            topic['slides_allocated'] = round(proportion * content_slides_budget)
            slides_allocated_so_far += topic['slides_allocated']

        # Step 3: Distribute remainder/deficit to topics with the most chunks to match the budget
        # This handles cases where rounding causes a mismatch with the budget.
        difference = content_slides_budget - slides_allocated_so_far
        
        sorted_topics = sorted(topics, key=lambda x: x.get('chunk_count', 0), reverse=True)
        
        # Adjust slide counts up or down to meet the exact budget
        for i in range(abs(difference)):
            # If we allocated too many, subtract from the smallest topics first by reversing the sort
            if difference < 0:
                topic_to_adjust = sorted_topics[-(i + 1)]
                # Don't let a slide count go below 1 if it has chunks
                if topic_to_adjust['slides_allocated'] > 1:
                    topic_to_adjust['slides_allocated'] -= 1
            else: # If we allocated too few, add to the largest topics
                topic_to_adjust = sorted_topics[i % len(sorted_topics)]
                topic_to_adjust['slides_allocated'] += 1
                
        return topics

    def create_content_plan_for_week(self, week_number: int) -> Optional[Dict]:
        """Orchestrates the new hierarchical planning process for a single week."""
        print_header(f"Planning Week {week_number}", char="*")
        
        # Get weekly data and chapter numbers (no change here)
        weekly_schedule_item = self.unit_outline['weeklySchedule'][week_number - 1]
        chapter_numbers = self._identify_relevant_chapters(weekly_schedule_item)
        if not chapter_numbers:
            logger.error("No valid chapter numbers found. Aborting.")
            return None

        content_slides_per_week = self.config['slide_count_strategy'].get('target', 25)
        slides_per_session = content_slides_per_week // len(chapter_numbers) if chapter_numbers else content_slides_per_week

        final_sessions_plan = []
        for i, chap_num in enumerate(chapter_numbers):
            logger.info(f"--- Planning Session {i+1} (Chapter {chap_num}) ---")
            chapter_node = self._find_chapter_node(chap_num)
            if not chapter_node: continue

            # 1. Build the hierarchical plan tree for the chapter
            plan_tree = self._build_topic_plan_tree(chapter_node)
            
            # 2. Allocate slides to the hierarchical tree
            plan_with_slides = self._allocate_slides_to_tree(plan_tree, slides_per_session)

            final_sessions_plan.append({
                "session_number": i + 1,
                # The 'session_topic' is the root of our plan
                "session_topic": plan_with_slides['title'],
                # 'topics_to_cover' is now the entire hierarchical tree
                "topics_to_cover": plan_with_slides
            })

        if not final_sessions_plan: return None
        
        week_plan = {
            "week": week_number,
            "overall_topic": weekly_schedule_item.get('contentTopic'),
            "sessions": final_sessions_plan
        }

        print_header(f"Plan for Week {week_number} Generated Successfully", char="*")
        return week_plan

# --- EXECUTION BLOCK ---
logger.info("--- Initializing Data-Driven Planning Agent Test ---")

# Check if LangChain components are available before proceeding
if langchain_available:

    # 1. --- Connect to the Database FIRST ---
    # The agent requires a live vector store connection to count chunks.
    logger.info("Connecting to ChromaDB for the Planning Agent...")
    vector_store = Chroma(
        persist_directory=CHROMA_PERSIST_DIR,
        embedding_function=OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA),
        collection_name=CHROMA_COLLECTION_NAME
    )
    logger.info("Database connection successful.")

    # 2. --- Load all configuration files ---
    # This part of your code is correct.
    with open(os.path.join(CONFIG_DIR, "processed_settings.json"), 'r') as f:
        processed_settings = json.load(f)
    with open(PRE_EXTRACTED_TOC_JSON_PATH, 'r') as f:
        book_toc = json.load(f)
    with open(PARSED_UO_JSON_PATH, 'r') as f:
        unit_outline = json.load(f)

    master_config_from_file = {
        "processed_settings": processed_settings,
        "unit_outline": unit_outline,
        "book_toc": book_toc
    }

    # 3. --- Initialize the agent, PASSING the vector store connection ---
    # This is the crucial step.
    planning_agent = PlanningAgent(master_config_from_file, vector_store=vector_store)

    # 4. --- Run the test ---
    WEEK_TO_TEST = 6
    logger.info(f"--> Explicitly testing planning logic for Week {WEEK_TO_TEST}")
    content_plan = planning_agent.create_content_plan_for_week(WEEK_TO_TEST)

    # 5. --- Save and print the output ---
    if content_plan:
        print("\n--- Generated Content Plan (Data-Driven Allocation) ---")
        print(json.dumps(content_plan, indent=2))

        # Save the plan to a file
        PLAN_OUTPUT_DIR = os.path.join(PROJECT_BASE_DIR, "generated_plans")
        os.makedirs(PLAN_OUTPUT_DIR, exist_ok=True)
        plan_filename = f"{processed_settings['course_id']}_Week{WEEK_TO_TEST}_plan.json"
        plan_filepath = os.path.join(PLAN_OUTPUT_DIR, plan_filename)
        with open(plan_filepath, 'w') as f:
            json.dump(content_plan, f, indent=2)
        logger.info(f"\nSuccessfully saved content plan for Week {WEEK_TO_TEST} to: {plan_filepath}")
    else:
        logger.error(f"Failed to generate content plan for Week {WEEK_TO_TEST}.")

2025-07-02 10:25:21,794 - INFO - --- Initializing Data-Driven Planning Agent Test ---
2025-07-02 10:25:21,795 - INFO - Connecting to ChromaDB for the Planning Agent...
2025-07-02 10:25:21,806 - INFO - Database connection successful.
2025-07-02 10:25:21,808 - INFO - Data-Driven PlanningAgent initialized successfully.
2025-07-02 10:25:21,809 - INFO - --> Explicitly testing planning logic for Week 6
2025-07-02 10:25:21,810 - INFO - --- Planning Session 1 (Chapter 6) ---
2025-07-02 10:25:21,851 - INFO - Allocating a budget of 25 slides across 22 granular topics.
2025-07-02 10:25:21,852 - INFO - 
Successfully saved content plan for Week 6 to: /home/sebas_dev_linux/projects/course_generator/generated_plans/ICT312_Week6_plan.json



********************************************************************************
                                Planning Week 6                                 
********************************************************************************

********************************************************************************
                     Plan for Week 6 Generated Successfully                     
********************************************************************************

--- Generated Content Plan (Data-Driven Allocation) ---
{
  "week": 6,
  "overall_topic": "Current Computer Forensics Tools.",
  "sessions": [
    {
      "session_number": 1,
      "session_topic": "Chapter 6. Current Digital Forensics Tools",
      "topics_to_cover": {
        "title": "Chapter 6. Current Digital Forensics Tools",
        "toc_id": 231,
        "chunk_count": 22,
        "total_chunks_in_branch": 315,
        "slides_allocated": 23,
        "children": [
          {
            "title": 

Next steps in the plan
- Add the sorted chunks for each slide to process the summaries or content geneneration later 
- Add title, agenda, summary and end as part of this planning to start having 
- Add label to reference title, agenda, content, summary and end 
- Process the images from the book and store them with relation to the chunk so we can potentially use the image in the slides 
- Process unit outlines and store them with good labels for phase 1

In [None]:
# Cell 8.1: Diagnostic Comparison Dashboard for Planning Agent (Final Version)

import os
import json
import re
import logging
from typing import List, Dict, Any, Optional

# Ensure the PlanningAgent class from Cell 8 is available in memory
try:
    from langchain_chroma import Chroma
    from langchain_ollama.embeddings import OllamaEmbeddings
    langchain_available = True
except ImportError:
    langchain_available = False

def print_diag_header(text: str):
    """Prints a formatted header for a diagnostic section."""
    print("\n" + "="*80)
    print(f"DIAGNOSTIC DASHBOARD: {text}")
    print("="*80)

# This helper is not strictly needed for the diagnostic but kept for completeness
def find_node_by_title(nodes: List[Dict], title: str) -> Optional[Dict]:
    """Recursively finds a node in the ToC by its exact title."""
    for node in nodes:
        if node.get('title') == title:
            return node
        if node.get('children'):
            found = find_node_by_title(node.get('children', []), title)
            if found:
                return found
    return None

# --- Main Diagnostic Function ---
def run_comparison_diagnostic(week_to_test: int):
    logger.info(f"--- Starting Diagnostic Comparison Dashboard for Week {week_to_test} ---")
    
    # 1. --- Load All necessary data ---
    try:
        with open(PRE_EXTRACTED_TOC_JSON_PATH, 'r') as f:
            book_toc_data = json.load(f)
        with open(PARSED_UO_JSON_PATH, 'r') as f:
            unit_outline_data = json.load(f)
        with open(os.path.join(CONFIG_DIR, "processed_settings.json"), 'r') as f:
            processed_settings = json.load(f)
        
        master_config = {
            "processed_settings": processed_settings,
            "unit_outline": unit_outline_data,
            "book_toc": book_toc_data
        }
        logger.info("Successfully loaded all raw data files.")
    except Exception as e:
        logger.error(f"Failed to load initial files. Error: {e}")
        return
        
    # --- MODIFICATION: Show the initial input from the Unit Outline ---
    print_diag_header("Ground Truth: Weekly Schedule from Unit Outline")
    try:
        weekly_item = unit_outline_data['weeklySchedule'][week_to_test - 1]
        print(f"--- Input for Week {week_to_test} ---")
        print(json.dumps(weekly_item, indent=2))
    except IndexError:
        logger.error(f"Week {week_to_test} not found in Unit Outline. Aborting.")
        return

    # 2. --- Generate the Plan using the Agent ---
    print_diag_header("Generated Plan")
    agent = PlanningAgent(master_config)
    generated_plan = agent.create_content_plan_for_week(week_to_test)
    
    if not generated_plan:
        logger.error("Planning Agent failed to generate a plan. Aborting diagnostic.")
        return
        
    print(json.dumps(generated_plan, indent=2))

    # 3. --- Extract the relevant slice from the ground-truth ToC ---
    print_diag_header("Ground Truth: Relevant Section(s) from book_toc.json")
    chapter_numbers = agent._identify_relevant_chapters(weekly_item)
    
    if not chapter_numbers:
        logger.error("Could not identify chapters to create ToC slice.")
    else:
        for chap_num in chapter_numbers:
            chapter_node = agent._find_chapter_node(chap_num)
            if chapter_node:
                print(f"\n--- ToC for '{chapter_node['title']}' ---")
                print(json.dumps(chapter_node, indent=2))
            else:
                print(f"\nCould not find Chapter {chap_num} in ToC.")


    # 4. --- Verify Plan against Vector Database ---
    print_diag_header("Vector Database Verification")
    if not langchain_available:
        logger.warning("LangChain/Chroma libraries not available. Skipping Vector DB verification.")
        return
        
    try:
        logger.info("Connecting to ChromaDB to verify chunk counts...")
        embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
        vector_store = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=embeddings,
            collection_name=CHROMA_COLLECTION_NAME
        )
        logger.info("Successfully connected to ChromaDB.")
        
        # This handles both single and multi-session plans
        for session in generated_plan.get('sessions', []):
            topics_to_verify = session.get('topics_to_cover', [])
            print(f"\nVerifying chunk counts for Session {session['session_number']}: '{session['session_topic']}'")
            print("-" * 70)
            print(f"{'Topic Title':<55} | {'ToC ID'} | {'Chunk Count'}")
            print("-" * 70)

            for topic in topics_to_verify:
                toc_id = topic['toc_id']
                title = topic['title']
                retrieved = vector_store._collection.get(where={"toc_id": toc_id})
                count = len(retrieved['ids'])
                
                title_short = (title[:50] + '...') if len(title) > 53 else title
                print(f"{title_short:<55} | {toc_id:<6} | {count}")
            
            print("-" * 70)

    except Exception as e:
        logger.error(f"An error occurred during Vector DB verification: {e}", exc_info=True)


# --- Execute the Diagnostic Dashboard ---
WEEK_TO_TEST = 7 # Set the week you want to diagnose here
run_comparison_diagnostic(WEEK_TO_TEST)
print("\n" + "="*80)
print("DIAGNOSTIC DASHBOARD COMPLETE")
print("="*80)

2025-07-01 21:38:19,164 - INFO - --- Starting Diagnostic Comparison Dashboard for Week 7 ---
2025-07-01 21:38:19,165 - INFO - Successfully loaded all raw data files.
2025-07-01 21:38:19,166 - INFO - Data-Driven PlanningAgent initialized successfully.
2025-07-01 21:38:19,166 - INFO - --- Planning Session 1 (Chapter 7) ---
2025-07-01 21:38:19,167 - ERROR - Vector store not available. Cannot perform chunk count allocation.
2025-07-01 21:38:19,167 - INFO - --- Planning Session 2 (Chapter 8) ---
2025-07-01 21:38:19,167 - ERROR - Vector store not available. Cannot perform chunk count allocation.
2025-07-01 21:38:19,168 - INFO - Connecting to ChromaDB to verify chunk counts...
2025-07-01 21:38:19,177 - INFO - Successfully connected to ChromaDB.



DIAGNOSTIC DASHBOARD: Ground Truth: Weekly Schedule from Unit Outline
--- Input for Week 7 ---
{
  "week": "Week 7",
  "contentTopic": "Linux Boot Processes and File Systems. Recovering Graphics Files.",
  "requiredReading": "Nelson, Phillips, Steuart,\u00a0Guide to Computer Forensics and Investigations, Sixth Edition, Cengage Learning 2019, ISBN:978-1-337-56894-4 Chapters 7 &8"
}

DIAGNOSTIC DASHBOARD: Generated Plan

********************************************************************************
                                Planning Week 7                                 
********************************************************************************

********************************************************************************
                     Plan for Week 7 Generated Successfully                     
********************************************************************************
{
  "week": 7,
  "overall_topic": "Linux Boot Processes and File Systems. Recovering Grap

Next steps 

Chunnk relation wwith the weights of the number of the slides per subtopic, haave in mind that 1 hour of delivery is like 20-25 slides 

to ensure to move to the case to handle i wourl like to ensure the concepts are clear when we discussde about sessions and week, sessions in this context is number of classes that we have for week, if we say week , 3 sessions in one week or sessions_per_week = 3 is 3 classes per week that require 3 different set of 

https://youtu.be/6xcCwlDx6f8?si=7QxFyzuNVppHBQ-c

## Configuration and Scope

**Description:**  

**Parameters and concideration**
- 1 hour in the setting session_time_duration_in_hour - is 18-20 slides at the time so it is require to calculate this according to the given value but this also means per session so sessions_per_week is a multiplicator factor that   
- if apply_topic_interactive is available will add an extra slide and add extra 5 min time but to determine this is required to plan all the content first and then calculate then provide a extra time 

settings_deck.json

{
  "course_id": "",
  "unit_name": "",
  "teaching_flow_id": "apply_topic_interactive",
  "slide_count_strategy": {
    "method": "per_week",
    "target": 0
  },
  "week_session_setup": {
    "sessions_per_week": 1,
    "distribution_strategy": "even",
    "session_time_duration_in_hour": 1.5
  },
  "generation_scope": {
    "weeks": "all"
  }
}

In [52]:
# Cell 7: Configuration and Scoping for Content Generation

import os
import json
import logging

# Setup Logger for this cell
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 1. DEFINE FILE PATHS AND GLOBAL TEST SETTINGS ---
# Assumes these variables are loaded from a previous setup cell (like Cell 1)
# For demonstration, we define them here. Replace with your actual global vars.
# PROJECT_BASE_DIR = "/path/to/your/project"
# PARSED_UO_JSON_PATH = os.path.join(PROJECT_BASE_DIR, "Parse_data/Parse_UO/ICT312_Digital_Forensic_Final_parsed.json")
# PRE_EXTRACTED_TOC_JSON_PATH = os.path.join(PROJECT_BASE_DIR, "Parse_data/Parse_TOC_books/ICT312_epub_table_of_contents.json")

# New configuration file paths
CONFIG_DIR = os.path.join(PROJECT_BASE_DIR, "configs")
SETTINGS_DECK_PATH = os.path.join(CONFIG_DIR, "settings_deck.json")
TEACHING_FLOWS_PATH = os.path.join(CONFIG_DIR, "teaching_flows.json")

# New output path for the processed settings
PROCESSED_SETTINGS_PATH = os.path.join(CONFIG_DIR, "processed_settings.json")

# --- Global Test Overrides (for easy testing) ---
# To test a specific week, change this from "all" to a list, e.g., [7]
# To test a different flow, change the teaching_flow_id.

TEST_OVERRIDE_WEEKS = [7] # e.g., [7] or [1, 2, 3] or "all"
TEST_OVERRIDE_FLOW_ID = "apply_topic_interactive" # or "standard_lecture"
TEST_OVERRIDE_SESSIONS_PER_WEEK = 4 # e.g., 1 or 2
TEST_OVERRIDE_DISTRIBUTION_STRATEGY = "even" # 'even', 'front_load', 'end_load'

def print_header(text: str, char: str = "="):
    """Prints a centered header to the console."""
    print("\n" + char * 80)
    print(text.center(80))
    print(char * 80)


def process_and_load_configurations():
    """
    Loads all configuration files, processes them to create a definitive plan,
    and returns a master configuration object.
    """
    print_header("Configuration and Scoping Process", char="-")
    
    # --- 2. LOAD ALL INPUT FILES ---
    logger.info("Loading all necessary configuration and data files...")
    try:
        with open(PARSED_UO_JSON_PATH, 'r', encoding='utf-8') as f:
            unit_outline = json.load(f)
        with open(PRE_EXTRACTED_TOC_JSON_PATH, 'r', encoding='utf-8') as f:
            book_toc = json.load(f)
        with open(SETTINGS_DECK_PATH, 'r', encoding='utf-8') as f:
            settings_deck = json.load(f)
        with open(TEACHING_FLOWS_PATH, 'r', encoding='utf-8') as f:
            teaching_flows = json.load(f)
        logger.info("All files loaded successfully.")
    except FileNotFoundError as e:
        logger.error(f"FATAL: A required configuration file was not found: {e}")
        return None

    # --- 3. PRE-PROCESS AND REFINE SETTINGS ---
    logger.info("Pre-processing settings_deck for definitive plan...")
    
    # Create a deep copy to avoid modifying the original object
    processed_settings = json.loads(json.dumps(settings_deck))

    # a. Smartly set Course ID and Unit Name from the Unit Outline
    unit_info = unit_outline.get("unitInformation", {})
    course_id = unit_info.get("unitCode", "UNKNOWN_COURSE")
    unit_name = unit_info.get("unitName", "Unknown Unit Name")
    
    processed_settings['course_id'] = course_id
    processed_settings['unit_name'] = unit_name # <-- NEW: Add unit_name
    
    logger.info(f"  - Set 'course_id' from Unit Outline: {course_id}")
    logger.info(f"  - Set 'unit_name' from Unit Outline: {unit_name}")

    # b. Apply test overrides for easier development
    processed_settings['teaching_flow_id'] = TEST_OVERRIDE_FLOW_ID
    logger.info(f"  - Using 'teaching_flow_id' from test override: '{TEST_OVERRIDE_FLOW_ID}'")
    
    processed_settings['week_session_setup']['sessions_per_week'] = TEST_OVERRIDE_SESSIONS_PER_WEEK
    logger.info(f"  - Using 'sessions_per_week' from test override: {TEST_OVERRIDE_SESSIONS_PER_WEEK}")
    
    processed_settings['week_session_setup']['distribution_strategy'] = TEST_OVERRIDE_DISTRIBUTION_STRATEGY
    logger.info(f"  - Using 'distribution_strategy' from test override: '{TEST_OVERRIDE_DISTRIBUTION_STRATEGY}'")
    
    # c. Resolve the generation scope (which weeks to generate)
    scope = TEST_OVERRIDE_WEEKS
    if scope == "all":
        num_weeks = len(unit_outline.get('weeklySchedule', []))
        if num_weeks == 0:
            logger.error("Unit Outline 'weeklySchedule' is empty. Cannot determine number of weeks.")
            return None
        final_scope = list(range(1, num_weeks + 1))
        logger.info(f"  - 'generation_scope' is 'all'. Resolved to {num_weeks} weeks: {final_scope}")
    elif isinstance(scope, list) and all(isinstance(i, int) for i in scope):
        final_scope = scope
        logger.info(f"  - 'generation_scope' is a specific list from test override: {final_scope}")
    else:
        logger.error(f"Invalid 'generation_scope' setting: {scope}. Must be 'all' or a list of integers.")
        return None
        
    processed_settings['generation_scope']['weeks'] = final_scope
    
    # --- 4. ASSEMBLE & SAVE FINAL CONFIGURATION ---
    master_config = {
        "processed_settings": processed_settings,
        "unit_outline": unit_outline,
        "book_toc": book_toc,
        "teaching_flows": teaching_flows
    }
    
    logger.info(f"Saving the final processed configuration to: {PROCESSED_SETTINGS_PATH}")
    os.makedirs(os.path.dirname(PROCESSED_SETTINGS_PATH), exist_ok=True)
    with open(PROCESSED_SETTINGS_PATH, 'w', encoding='utf-8') as f:
        json.dump(processed_settings, f, indent=2)
    
    print_header("Configuration Complete", char="-")
    logger.info("Master configuration object is ready for the generation pipeline.")
    
    return master_config

# --- EXECUTE THE CONFIGURATION PROCESS ---
master_config = process_and_load_configurations()

# Optional: Print a preview to verify the output
if master_config:
    print("\n--- Preview of Processed Settings ---")
    print(json.dumps(master_config['processed_settings'], indent=2))
    print(f"\nNumber of weeks to generate: {len(master_config['processed_settings']['generation_scope']['weeks'])}")
    print("-----------------------------------")


2025-07-02 11:05:53,200 - INFO - Loading all necessary configuration and data files...
2025-07-02 11:05:53,201 - INFO - All files loaded successfully.
2025-07-02 11:05:53,202 - INFO - Pre-processing settings_deck for definitive plan...
2025-07-02 11:05:53,202 - INFO -   - Set 'course_id' from Unit Outline: ICT312
2025-07-02 11:05:53,202 - INFO -   - Set 'unit_name' from Unit Outline: Digital Forensic
2025-07-02 11:05:53,202 - INFO -   - Using 'teaching_flow_id' from test override: 'apply_topic_interactive'
2025-07-02 11:05:53,203 - INFO -   - Using 'sessions_per_week' from test override: 4
2025-07-02 11:05:53,203 - INFO -   - Using 'distribution_strategy' from test override: 'even'
2025-07-02 11:05:53,204 - INFO -   - 'generation_scope' is a specific list from test override: [7]
2025-07-02 11:05:53,204 - INFO - Saving the final processed configuration to: /home/sebas_dev_linux/projects/course_generator/configs/processed_settings.json
2025-07-02 11:05:53,205 - INFO - Master configuratio


--------------------------------------------------------------------------------
                       Configuration and Scoping Process                        
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
                             Configuration Complete                             
--------------------------------------------------------------------------------

--- Preview of Processed Settings ---
{
  "course_id": "ICT312",
  "unit_name": "Digital Forensic",
  "teaching_flow_id": "apply_topic_interactive",
  "slide_count_strategy": {
    "method": "per_week",
    "target": 25
  },
  "week_session_setup": {
    "sessions_per_week": 4,
    "distribution_strategy": "even"
  },
  "generation_scope": {
    "weeks": [
      7
    ]
  }
}

Number of weeks to generate: 1
-----------------------------------
