In [None]:
import os
import json
import logging
import re
import shutil
from typing import List, Dict, Any, Optional

# --- LangChain Core Imports ---
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser # Keep StrOutputParser for debugging
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from pydantic import BaseModel, Field
from langchain.storage import InMemoryStore

# --- LangChain Community/Vendor Imports ---
from langchain_community.document_loaders import PyPDFLoader, UnstructuredEPubLoader
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_chroma import Chroma

# --- LangChain Text Splitter & Retriever ---
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever

# --- Setup Logging ---
# Configure the root logger once; every function in this cell logs through `logger`.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Configuration ---
# Absolute, machine-specific input paths: the source book (.epub or .pdf are the
# two extensions load_book_into_parent_documents accepts) and the parsed unit outline JSON.
BOOK_PATH = "/home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub"
UNIT_OUTLINE_JSON_PATH = "/home/sebas_dev_linux/projects/course_generator/results/Parse_UO/ICT312 Digital Forensic_Final_parsed.json"

# On-disk Chroma store holding the child chunks of every chapter. main() deletes
# this directory at the start of each run and rebuilds it.
CHROMA_PERSIST_DIR_CHILD_GLOBAL = "./chroma_db_global_child_chunks_strat3_v3" # For PDR to find parent chapters
CHROMA_COLLECTION_NAME_CHILD_GLOBAL = "global_book_child_chunks_strat3_v3"

# Destination folder for the generated weekly JSON. It is created as a side effect
# of running this cell so that the final write in main() does not hit a missing directory.
OUTPUT_STRUCTURED_JSON_DIR = "./structured_weekly_content_toc_iterated_v3"
os.makedirs(OUTPUT_STRUCTURED_JSON_DIR, exist_ok=True)

# Ollama model names: one for embeddings (OllamaEmbeddings) and one for text generation (ChatOllama).
EMBEDDING_MODEL_OLLAMA = "nomic-embed-text"
GENERATION_MODEL_OLLAMA = "mistral:latest"

# Character-based chunking used by the ParentDocumentRetriever's child splitter.
CHILD_CHUNK_SIZE_GLOBAL = 700 # For PDR's initial search for parent chapters
CHILD_CHUNK_OVERLAP_GLOBAL = 100

# For splitting a retrieved parent chapter for subtopic search
SUBTOPIC_CONTEXT_CHUNK_SIZE = 500
SUBTOPIC_CONTEXT_CHUNK_OVERLAP = 50
TOP_K_SUBTOPIC_CONTEXT = 3 # How many child chunks to get for a specific subtopic

# --- Pydantic Models (Same) ---
# One learning module of the weekly output: a subtopic heading plus the text
# generated for it. Documented with comments rather than a class docstring because
# pydantic copies a model docstring into the generated JSON schema.
class LearningModule(BaseModel):
    # Heading of the subtopic; main() fills this with an entry of the chapter ToC.
    subtopicTitle: str = Field(description="A clear and concise title for this sub-topic or learning objective, often derived from chapter sections.")
    # Generated explanation, one string per paragraph / bullet point.
    elaboratedContent: List[str] = Field(description="A list of detailed paragraphs or comprehensive bullet points explaining the subtopic.")

# Top-level structure serialised to JSON by main(): the week's topic and its modules.
# Comments are used instead of a docstring so the model's JSON schema stays unchanged.
class WeeklyContent(BaseModel):
    # Display name of the week; main() builds it as "Week <id>: <topic>".
    mainWeeklyTopic: str = Field(description="The main overarching topic for the week.")
    # One LearningModule per processed ToC entry, in ToC order.
    learningModules: List[LearningModule] = Field(description="A list of learning modules covering distinct sub-topics with detailed content.")

# --- Helper Functions: load_unit_outline, clean_metadata_value, load_book_into_parent_documents ---
def load_unit_outline(json_path: str) -> Dict[str, Any]:
    """Read the parsed unit outline from a UTF-8 JSON file.

    Args:
        json_path: Location of the unit outline JSON file.

    Returns:
        The decoded JSON content.

    Raises:
        Exception: Any error raised while opening or decoding the file is
            logged with its traceback and then re-raised unchanged.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as outline_file:
            outline = json.load(outline_file)
        logger.info(f"Successfully loaded unit outline from: {json_path}")
        return outline
    except Exception as e:
        logger.error(f"Error loading unit outline {json_path}: {e}", exc_info=True)
        raise

def clean_metadata_value(value: Any) -> Any:
    """Flatten a metadata value into a scalar suitable for vector-store metadata.

    Conversion rules:
        * ``str``, ``int``, ``float``, ``bool`` and ``None`` pass through untouched.
        * ``dict`` becomes its JSON text.
        * ``list`` becomes its items joined with ``", "`` (each item via ``str``).
        * anything else becomes ``str(value)``.

    Returns:
        A ``str``, ``int``, ``float``, ``bool`` or ``None``.
    """
    # Scalars are already in their final form.
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, dict):
        return json.dumps(value)
    if isinstance(value, list):
        return ", ".join(str(item) for item in value)
    # Last resort for tuples, sets, custom objects, ...
    return str(value)

def _build_chapter_parent_document(
    book_path: str,
    chapter_number: int,
    chapter_title: str,
    chapter_toc: List[str],
    parent_id: str,
    chapter_text: str,
) -> Document:
    """Wrap one chapter's text in a Document whose metadata has been flattened."""
    raw_metadata = {
        "source": os.path.basename(book_path),
        "chapter_number": chapter_number,
        "chapter_title": chapter_title,
        "chapter_toc": list(chapter_toc),
        "document_type": "parent",
        "parent_id": parent_id,
    }
    flattened = {key: clean_metadata_value(val) for key, val in raw_metadata.items() if val is not None}
    return Document(page_content=chapter_text, metadata=flattened)


def load_book_into_parent_documents(book_path: str) -> List[Document]:
    """Load a book and return one "parent" Document per chapter (EPUB) or page (PDF).

    EPUB: the book is loaded element by element. A ``Title`` element shorter than
    150 characters whose text starts with "Chapter <n>", "Part <x>" or
    "Appendix <x>" opens a new chapter; any other ``Title`` element seen after the
    first chapter is recorded as a ToC entry of the current chapter. All element
    texts of a chapter are joined with newlines to form the parent document.
    Text before the first chapter heading is emitted as chapter 0,
    "Preface or Introduction".

    PDF: every page becomes its own parent document with an empty ToC.

    Every metadata value is passed through ``clean_metadata_value`` so it is a
    scalar; ``chapter_toc`` therefore ends up as a comma-separated string.

    Args:
        book_path: Path to a ``.pdf`` or ``.epub`` file.

    Returns:
        The parent documents in reading order (possibly empty; an error is logged then).

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.epub``, or the
            EPUB yields no elements.
        Exception: Loader errors are logged and re-raised.
    """
    logger.info(f"Loading book from '{book_path}' to create parent documents (chapters with ToC).")
    _, file_extension = os.path.splitext(book_path.lower())
    parent_documents: List[Document] = []

    if file_extension == ".pdf":
        logger.warning("PDF processing: Treating each page as a parent document. Chapter/ToC segmentation is not robustly implemented here.")
        pdf_loader = PyPDFLoader(book_path)
        try:
            pages_as_parents = pdf_loader.load()
            for i, page_doc in enumerate(pages_as_parents):
                page_num = page_doc.metadata.get('page', i + 1)
                page_metadata = {
                    "source": os.path.basename(page_doc.metadata.get("source", book_path)),
                    "page_number": page_num, "chapter_number": page_num,
                    "chapter_title": f"PDF Page {page_num}", "chapter_toc": [],
                    "document_type": "parent", "parent_id": f"pdf_page_{page_num}"
                }
                # Fix: flatten the metadata exactly like the EPUB branch does. The raw
                # empty list in "chapter_toc" would otherwise be inherited by the child
                # chunks that are written to the vector store.
                cleaned_metadata = {k: clean_metadata_value(v) for k, v in page_metadata.items() if v is not None}
                parent_documents.append(Document(page_content=page_doc.page_content, metadata=cleaned_metadata))
            logger.info(f"Loaded {len(parent_documents)} pages as parent documents from PDF.")
        except Exception as e:
            logger.error(f"Error loading PDF: {e}", exc_info=True)
            raise

    elif file_extension == ".epub":
        logger.info("Using UnstructuredEPubLoader for EPUB to identify chapters and their ToCs.")
        epub_loader = UnstructuredEPubLoader(book_path, mode="elements", strategy="fast")
        try:
            raw_lc_documents = epub_loader.load()
            if not raw_lc_documents:
                raise ValueError(f"No elements loaded from EPUB {book_path}")
            logger.info(f"Loaded {len(raw_lc_documents)} raw elements as LangChain Documents from EPUB.")
        except Exception as e:
            logger.error(f"Error loading EPUB: {e}", exc_info=True)
            raise

        # Compiled once instead of on every element.
        chapter_heading_re = re.compile(r"(?i)^(chapter\s+(\d+|[IVXLCDM]+)\b|part\s+[A-Z0-9]+|appendix\s+[A-Z])")

        current_chapter_number = 0
        current_chapter_title = "Preface or Introduction"
        current_chapter_toc: List[str] = []
        current_chapter_content_accumulator: List[str] = []
        parent_id_counter = 0

        for element_doc in raw_lc_documents:
            element_text = element_doc.page_content.strip() if element_doc.page_content else ""
            is_title_element = element_doc.metadata.get("category") == "Title" and bool(element_text)
            new_chapter_detected = (
                is_title_element
                and len(element_text) < 150
                and chapter_heading_re.match(element_text) is not None
            )

            # A new heading closes the chapter collected so far.
            if new_chapter_detected and current_chapter_content_accumulator:
                parent_id_counter += 1
                parent_doc_content = "\n".join(current_chapter_content_accumulator).strip()
                if parent_doc_content:
                    parent_documents.append(_build_chapter_parent_document(
                        book_path, current_chapter_number, current_chapter_title,
                        current_chapter_toc, f"epub_ch_{parent_id_counter}", parent_doc_content))
                    logger.info(f"Created parent doc for Ch {current_chapter_number}: '{current_chapter_title}' with {len(current_chapter_toc)} ToC items.")
                current_chapter_content_accumulator = []
                current_chapter_toc = []

            if new_chapter_detected:
                current_chapter_number += 1
                current_chapter_title = element_text
                # The chapter heading itself is the first ToC entry.
                if element_text not in current_chapter_toc:
                    current_chapter_toc.append(element_text)
            elif is_title_element and current_chapter_number > 0:  # Subheading
                if element_text not in current_chapter_toc:
                    current_chapter_toc.append(element_text)

            if element_text:
                current_chapter_content_accumulator.append(element_text)

        # Whatever is still buffered after the loop is the last chapter.
        if current_chapter_content_accumulator:
            parent_id_counter += 1
            parent_doc_content = "\n".join(current_chapter_content_accumulator).strip()
            if parent_doc_content:
                parent_documents.append(_build_chapter_parent_document(
                    book_path, current_chapter_number, current_chapter_title,
                    current_chapter_toc, f"epub_ch_{parent_id_counter}", parent_doc_content))
                logger.info(f"Created parent doc for last Ch {current_chapter_number}: '{current_chapter_title}' with {len(current_chapter_toc)} ToC items.")
    else:
        raise ValueError(f"Unsupported book file format: {file_extension}")

    if not parent_documents:
        logger.error(f"No parent documents created from {book_path}.")
    else:
        logger.info(f"Created {len(parent_documents)} parent documents with ToC metadata (if EPUB).")
    return parent_documents




# --- NEW FUNCTION for subtopic-specific context ---
def get_subtopic_specific_context_from_parent(
    subtopic_query: str,
    parent_chapter_document: Document,
    embedding_model: OllamaEmbeddings,
    k: int = TOP_K_SUBTOPIC_CONTEXT
) -> str:
    """Return the chapter passages most similar to a subtopic, formatted as text.

    The chapter is split into small chunks, the chunks are embedded into a
    throw-away Chroma collection, and the ``k`` chunks closest to
    ``subtopic_query`` are formatted with ``format_docs``.

    Args:
        subtopic_query: Subtopic heading used as the similarity query.
        parent_chapter_document: The chapter to search inside.
        embedding_model: Embedding model used for the temporary collection.
        k: Number of chunks to retrieve.

    Returns:
        The formatted chunks, or a fixed explanatory sentence when the chapter
        yields no chunks, or a string starting with
        ``"Error retrieving specific context for subtopic:"`` when anything
        fails. This function never raises.
    """
    import uuid  # local import: only needed to name the temporary collection

    logger.info(f"Creating temporary vectorstore for parent chapter: '{parent_chapter_document.metadata.get('chapter_title', 'Unknown Chapter')}' to find context for subtopic: '{subtopic_query}'")

    sub_splitter = RecursiveCharacterTextSplitter(
        chunk_size=SUBTOPIC_CONTEXT_CHUNK_SIZE,
        chunk_overlap=SUBTOPIC_CONTEXT_CHUNK_OVERLAP
    )
    parent_content_chunks = sub_splitter.split_text(parent_chapter_document.page_content)

    if not parent_content_chunks:
        logger.warning("Parent chapter content yielded no chunks for temporary vectorstore.")
        return "No specific textual context found within the chapter for this subtopic after chunking."

    # Plain documents are enough here; the chunks only live inside the temporary store.
    temp_docs_for_subtopic_search = [Document(page_content=text) for text in parent_content_chunks]

    temp_vectorstore = None
    try:
        # Fix: give every call its own collection. Without an explicit name each
        # call writes into the same default in-memory collection, so the chunks
        # of earlier calls pile up and the top-k fills with duplicates.
        temp_vectorstore = Chroma.from_documents(
            documents=temp_docs_for_subtopic_search,
            embedding=embedding_model,
            collection_name=f"tmp_subtopic_{uuid.uuid4().hex}",
        )
        retrieved_sub_chunks = temp_vectorstore.similarity_search(subtopic_query, k=k)
        logger.info(f"Retrieved {len(retrieved_sub_chunks)} child chunks from parent chapter for subtopic '{subtopic_query}'.")
        # NOTE(review): format_docs is not defined in the visible part of this file.
        # If it is missing at runtime the NameError is caught below and reported
        # as a retrieval error - confirm the helper is defined before this runs.
        return format_docs(retrieved_sub_chunks)
    except Exception as e:
        logger.error(f"Error creating/querying temporary vectorstore for subtopic '{subtopic_query}': {e}", exc_info=True)
        return f"Error retrieving specific context for subtopic: {str(e)}"
    finally:
        # Drop the temporary collection so repeated calls do not accumulate data.
        if temp_vectorstore is not None:
            try:
                temp_vectorstore.delete_collection()
            except Exception as cleanup_error:
                logger.warning(f"Could not delete temporary vectorstore collection: {cleanup_error}")

# --- NEW FUNCTION for generating content for a single subtopic ---
def _coerce_points_to_strings(raw_points: Any) -> List[str]:
    """Turn whatever the LLM put under "points" into a list of non-blank strings."""
    if isinstance(raw_points, str):
        raw_points = [raw_points]
    if not isinstance(raw_points, list):
        return []
    points: List[str] = []
    for item in raw_points:
        if item is None:
            continue
        if isinstance(item, str):
            text = item
        elif isinstance(item, (dict, list)):
            text = json.dumps(item, ensure_ascii=False)
        else:
            text = str(item)
        if text.strip():
            points.append(text)
    return points


def generate_elaborated_content_for_subtopic(
    subtopic_title: str,
    subtopic_context: str,
    generation_llm: ChatOllama
) -> List[str]:
    """Ask the LLM to explain one subtopic using only the supplied excerpts.

    Args:
        subtopic_title: Heading of the subtopic to explain.
        subtopic_context: Textbook excerpts (or a placeholder) given to the LLM.
        generation_llm: Chat model expected to answer with a JSON object that
            has a "points" list.

    Returns:
        A non-empty list of strings: the generated paragraphs, a one-element
        placeholder when the LLM returned nothing, or a one-element error
        message when the call or the parsing failed. This function never raises.
    """
    logger.info(f"Generating elaborated content for subtopic: '{subtopic_title}'")
    # Using Pydantic model for structured list output
    class SubtopicElaboration(BaseModel):
        points: List[str] = Field(description="A list of detailed paragraphs or comprehensive bullet points explaining the subtopic.")

    parser = JsonOutputParser(pydantic_object=SubtopicElaboration)

    template = """
    You are an academic writer. Given a specific subtopic title and highly relevant text excerpts
    from a textbook chapter, provide a detailed explanation of ONLY that subtopic.

    Subtopic to Explain: "{subtopic_title}"

    Relevant Textbook Excerpts for this Subtopic:
    ---CONTEXT START---
    {subtopic_context}
    ---CONTEXT END---

    Based ONLY on the provided Textbook Excerpts for this subtopic:
    1. Generate a detailed explanation of the subtopic.
    2. The explanation should include key definitions, principles, and examples if found in the context.
    3. Present your output as a JSON object with a single key "points", where the value is a list of strings. Each string in the list should be a full paragraph or a comprehensive bullet point.
    4. Aim for substantial detail for the subtopic. Ensure your explanation is focused and directly supported by the provided context.

    {format_instructions}
    """
    prompt = ChatPromptTemplate.from_template(
        template,
        partial_variables={"format_instructions": parser.get_format_instructions()}
    )

    # The model call and the parsing are separate steps so that, when parsing
    # fails, the exact output that failed can be logged without a second call.
    raw_llm_message = None
    try:
        raw_llm_message = (prompt | generation_llm).invoke({
            "subtopic_title": subtopic_title,
            "subtopic_context": subtopic_context
        })
        parsed_response = parser.invoke(raw_llm_message)
        # Tolerate a bare JSON array as well as the requested {"points": [...]} object.
        raw_points = parsed_response.get("points", []) if isinstance(parsed_response, dict) else parsed_response
        content_list = _coerce_points_to_strings(raw_points)

        # Real context is anything non-blank that is not the retrieval-error sentinel.
        # Matching the sentinel prefix avoids misclassifying excerpts that merely
        # contain the word "Error".
        has_real_context = bool(subtopic_context and subtopic_context.strip()) and not subtopic_context.startswith("Error retrieving specific context")
        if not content_list and has_real_context:
            logger.warning(f"LLM generated no points for subtopic '{subtopic_title}' despite context. Using placeholder.")
            content_list = [f"Content for '{subtopic_title}' should be elaborated based on chapter knowledge."]
        elif not content_list:
            content_list = [f"No specific context available to elaborate on '{subtopic_title}'."]

        logger.info(f"Successfully generated {len(content_list)} content points for subtopic '{subtopic_title}'.")
        return content_list
    except Exception as e:
        logger.error(f"Error generating content for subtopic '{subtopic_title}': {e}", exc_info=True)
        if raw_llm_message is not None:
            raw_output = getattr(raw_llm_message, "content", raw_llm_message)
            logger.error(f"LLM Raw Output for subtopic '{subtopic_title}':\n{raw_output}")
        return [f"Error generating content for {subtopic_title}: {str(e)}"]


# --- Main Execution ---
def _parse_chapter_toc(chapter_toc_raw: Any) -> List[str]:
    """Normalise the ``chapter_toc`` metadata value into a list of non-blank headings."""
    if isinstance(chapter_toc_raw, list):
        entries = chapter_toc_raw
    elif isinstance(chapter_toc_raw, str):
        entries = None
        if chapter_toc_raw.startswith("["):
            # The value may be a JSON-encoded list; fall back to comma splitting otherwise.
            try:
                decoded = json.loads(chapter_toc_raw)
            except ValueError:
                decoded = None
            if isinstance(decoded, list):
                entries = decoded
        if entries is None:
            entries = chapter_toc_raw.split(',')
    else:
        entries = []
    return [str(entry).strip() for entry in entries if entry is not None and str(entry).strip()]


def main():
    """Generate structured content for one week of the unit outline and save it as JSON.

    Steps:
        1. Load the unit outline and pick the week at ``WEEK_TO_PROCESS_INDEX``.
        2. Load the book as parent documents and index their child chunks in a
           freshly rebuilt Chroma store behind a ParentDocumentRetriever.
        3. Retrieve the chapter(s) matching the week's topic and keep the first.
        4. For every ToC entry of that chapter, retrieve chapter passages for the
           entry and let the LLM write an explanation.
        5. Write the resulting ``WeeklyContent`` to ``OUTPUT_STRUCTURED_JSON_DIR``.

    Expected failures (missing files, empty schedule, no chapter found, model
    initialisation errors) are logged and end the run early without raising.
    """
    logger.info("Starting ENHANCED Content Structuring RAG pipeline (Iterate ToC Subtopics)...")
    if not os.path.exists(BOOK_PATH):
        logger.error(f"Book file not found: {BOOK_PATH}")
        return
    if not os.path.exists(UNIT_OUTLINE_JSON_PATH):
        logger.error(f"Unit outline JSON file not found: {UNIT_OUTLINE_JSON_PATH}")
        return

    unit_outline_data = load_unit_outline(UNIT_OUTLINE_JSON_PATH)
    weekly_schedule = unit_outline_data.get("weeklySchedule", [])
    if not weekly_schedule:
        logger.error("No weekly schedule in unit outline.")
        return

    WEEK_TO_PROCESS_INDEX = 0
    if WEEK_TO_PROCESS_INDEX >= len(weekly_schedule):
        logger.error(f"Week index {WEEK_TO_PROCESS_INDEX} out of bounds.")
        return

    selected_week_info = weekly_schedule[WEEK_TO_PROCESS_INDEX]
    main_weekly_topic_title_from_outline = selected_week_info.get("topic") or selected_week_info.get("contentTopic")
    # Validate the title before it is used to build the display name.
    if not main_weekly_topic_title_from_outline:
        logger.error(f"Selected week (Idx {WEEK_TO_PROCESS_INDEX}) has no valid topic title.")
        return
    week_identifier = selected_week_info.get("week", f"Idx_{WEEK_TO_PROCESS_INDEX}")
    full_main_topic_name = f"Week {week_identifier}: {main_weekly_topic_title_from_outline}"
    logger.info(f"Processing selected week: {full_main_topic_name}")

    try:
        embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
        # Generation LLM for subtopic content needs to be good at focused explanation and JSON list output
        generation_llm_for_subtopics = ChatOllama(model=GENERATION_MODEL_OLLAMA, temperature=0.2, format="json")
    except Exception as e:
        logger.error(f"Failed to initialize Ollama models: {e}. Ensure Ollama is running.", exc_info=True)
        return

    parent_documents = load_book_into_parent_documents(BOOK_PATH)
    if not parent_documents:
        logger.error("Failed to load book into parent documents.")
        return

    # Setup ParentDocumentRetriever (for finding the main chapter(s) for the week)
    docstore_for_parents = InMemoryStore()
    parent_doc_ids_for_store = [doc.metadata.get("parent_id", f"fallback_pid_{i}") for i, doc in enumerate(parent_documents)]
    docstore_for_parents.mset(list(zip(parent_doc_ids_for_store, parent_documents)))
    logger.info(f"Parent documents ({len(parent_documents)}) added to in-memory docstore.")

    # The child-chunk index is rebuilt from scratch on every run.
    if os.path.exists(CHROMA_PERSIST_DIR_CHILD_GLOBAL):
        logger.info(f"Deleting existing GLOBAL child chunk DB: {CHROMA_PERSIST_DIR_CHILD_GLOBAL}")
        shutil.rmtree(CHROMA_PERSIST_DIR_CHILD_GLOBAL)

    global_child_vectorstore = Chroma(
        collection_name=CHROMA_COLLECTION_NAME_CHILD_GLOBAL,
        embedding_function=embedding_model,
        persist_directory=CHROMA_PERSIST_DIR_CHILD_GLOBAL
    )
    global_child_splitter = RecursiveCharacterTextSplitter(chunk_size=CHILD_CHUNK_SIZE_GLOBAL, chunk_overlap=CHILD_CHUNK_OVERLAP_GLOBAL)

    logger.info("Initializing ParentDocumentRetriever for chapter lookup...")
    parent_retriever_for_chapter_lookup = ParentDocumentRetriever(
        vectorstore=global_child_vectorstore, docstore=docstore_for_parents, child_splitter=global_child_splitter
    )
    # add_to_docstore=False: the parents were already stored above under the same ids.
    parent_retriever_for_chapter_lookup.add_documents(parent_documents, ids=parent_doc_ids_for_store, add_to_docstore=False)
    logger.info("Global ParentDocumentRetriever initialized, child chunks added to its vector store.")

    # --- 1. Retrieve the PRIMARY Parent Document(s) (Chapter) for the Week's Main Topic ---
    logger.info(f"Retrieving primary parent chapter(s) for: '{main_weekly_topic_title_from_outline}'")
    try:
        retrieved_primary_chapters = parent_retriever_for_chapter_lookup.invoke(main_weekly_topic_title_from_outline)
        if not retrieved_primary_chapters:
            logger.error(f"No parent chapter(s) found for main topic: '{main_weekly_topic_title_from_outline}'. Cannot proceed.")
            return
        logger.info(f"Retrieved {len(retrieved_primary_chapters)} primary parent chapter(s). Focusing on the first.")
        # For simplicity only the ToC of the *first* retrieved chapter is used.
        primary_chapter_doc = retrieved_primary_chapters[0]
        chapter_title = primary_chapter_doc.metadata.get("chapter_title", "Unknown Chapter")
        chapter_number_meta = primary_chapter_doc.metadata.get("chapter_number", "NA")

        chapter_toc_list = _parse_chapter_toc(primary_chapter_doc.metadata.get("chapter_toc", []))

        if not chapter_toc_list:
            logger.warning(f"No ToC found or ToC is empty in retrieved chapter '{chapter_title}'. Will attempt to generate content for the chapter as a whole or use a generic subtopic.")
            # Fallback: treat the whole chapter as a single subtopic.
            chapter_toc_list = [f"Overview of {chapter_title}"] if chapter_title != "Unknown Chapter" else ["Main concepts of the week"]
    except Exception as e:
        logger.error(f"Error retrieving primary chapter or its ToC: {e}", exc_info=True)
        return

    # --- 2. Iterate through Chapter ToC Subtopics ---
    learning_modules_for_week: List[LearningModule] = []
    logger.info(f"Processing {len(chapter_toc_list)} subtopics from ToC of chapter '{chapter_title}':")

    for subtopic_from_toc in chapter_toc_list:
        logger.info(f"-- Processing Subtopic from ToC: '{subtopic_from_toc}' --")

        # 3a. Targeted RAG: Get specific context for THIS subtopic from WITHIN the primary_chapter_doc
        subtopic_specific_context_str = get_subtopic_specific_context_from_parent(
            subtopic_query=subtopic_from_toc,
            parent_chapter_document=primary_chapter_doc,
            embedding_model=embedding_model
        )

        if not subtopic_specific_context_str.strip() or "Error retrieving specific context" in subtopic_specific_context_str:
            logger.warning(f"No specific context found for subtopic '{subtopic_from_toc}'. Using placeholder or skipping detailed generation.")
            # Fix: replace the context in BOTH cases. Previously a retrieval error
            # message was kept and handed to the LLM as if it were textbook text.
            subtopic_specific_context_str = f"General information pertaining to {subtopic_from_toc} within the broader context of {chapter_title}."

        # 3b. LLM Content Generation for this specific subtopic
        elaborated_content_for_subtopic = generate_elaborated_content_for_subtopic(
            subtopic_title=subtopic_from_toc,
            subtopic_context=subtopic_specific_context_str,
            generation_llm=generation_llm_for_subtopics
        )

        learning_modules_for_week.append(
            LearningModule(subtopicTitle=subtopic_from_toc, elaboratedContent=elaborated_content_for_subtopic)
        )

    # 4. Structure the final output
    final_weekly_output = WeeklyContent(
        mainWeeklyTopic=full_main_topic_name,
        learningModules=learning_modules_for_week
    )

    # 5. Save JSON
    # Path separators inside the identifiers would otherwise point into a non-existent sub-directory.
    safe_week_identifier = str(week_identifier).replace("/", "_").replace("\\", "_")
    safe_chapter_number = str(chapter_number_meta).replace("/", "_").replace("\\", "_")
    output_filename = os.path.join(OUTPUT_STRUCTURED_JSON_DIR, f"week_{safe_week_identifier}_content_iterated_toc_ch{safe_chapter_number}.json")
    try:
        # model_dump() is the pydantic v2 name; .dict() is kept as a fallback for v1.
        if hasattr(final_weekly_output, "model_dump"):
            output_payload = final_weekly_output.model_dump()
        else:
            output_payload = final_weekly_output.dict()
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(output_payload, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved detailed subtopic content to {output_filename}")
    except Exception as e:
        logger.error(f"Error saving structured JSON: {e}", exc_info=True)

    logger.info("ENHANCED Content Structuring RAG pipeline (Iterate ToC Subtopics) finished.")


# Script entry point: refuse to run while BOOK_PATH still contains the template placeholder.
if __name__ == "__main__":
    if "YOUR_BOOK.pdf" not in BOOK_PATH:
        main()
    else:
        logger.error("!!! PLEASE UPDATE 'BOOK_PATH' with the actual path to your book. !!!")

In [1]:
import os
import json
import logging
import re
import shutil
from typing import List, Dict, Any, Optional, Tuple # Added Tuple

# --- LangChain Core Imports ---
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from pydantic import BaseModel, Field # Using direct pydantic import
from langchain.storage import InMemoryStore

# --- LangChain Community/Vendor Imports ---
from langchain_community.document_loaders import PyPDFLoader, UnstructuredEPubLoader
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_chroma import Chroma

# --- LangChain Text Splitter & Retriever ---
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever

# --- Setup Logging ---
# Configure the root logger once; the helpers in this cell log through `logger`.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Configuration ---
# Absolute, machine-specific input paths: the source book and the parsed unit outline JSON.
BOOK_PATH = "/home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub"
UNIT_OUTLINE_JSON_PATH = "/home/sebas_dev_linux/projects/course_generator/results/Parse_UO/ICT312 Digital Forensic_Final_parsed.json"

# Chroma persistence directory and collection name for the book-wide child chunks.
# NOTE(review): the code consuming these is not visible in this part of the file.
CHROMA_PERSIST_DIR_CHILD_GLOBAL = "./chroma_db_global_child_chunks_strat3_v4_fulltoc"
CHROMA_COLLECTION_NAME_CHILD_GLOBAL = "global_book_child_chunks_strat3_v4_fulltoc"

# Output locations. Only the weekly-content directory is created here, as a side
# effect of running this cell.
OUTPUT_STRUCTURED_JSON_DIR = "./structured_weekly_content_toc_iterated_v4_fulltoc"
OUTPUT_BOOK_TOC_JSON_PATH = "./book_table_of_contents.json" # New output for the book's ToC
os.makedirs(OUTPUT_STRUCTURED_JSON_DIR, exist_ok=True)

# Ollama model names for embeddings, content generation and ToC generation.
# NOTE(review): presumably TOC_GENERATION_MODEL_OLLAMA backs the `toc_llm` /
# `toc_generation_llm` arguments used below - confirm where the models are constructed.
EMBEDDING_MODEL_OLLAMA = "nomic-embed-text"
CONTENT_GENERATION_MODEL_OLLAMA = "mistral:latest"
TOC_GENERATION_MODEL_OLLAMA = "mistral:7b-instruct-q4_K_M"

# Chunk sizes / overlaps and the top-k for subtopic retrieval.
CHILD_CHUNK_SIZE_GLOBAL = 700
CHILD_CHUNK_OVERLAP_GLOBAL = 100
SUBTOPIC_CONTEXT_CHUNK_SIZE = 500
SUBTOPIC_CONTEXT_CHUNK_OVERLAP = 50
TOP_K_SUBTOPIC_CONTEXT = 3

# --- Pydantic Models (Same) ---
# One learning module: a subtopic heading plus the explanatory text for it.
# Comments are used instead of a class docstring because pydantic copies a model
# docstring into the generated JSON schema.
class LearningModule(BaseModel):
    # Heading of the subtopic.
    subtopicTitle: str = Field(description="A clear and concise title for this sub-topic/objective.")
    # Explanation text, one string per paragraph / bullet point.
    elaboratedContent: List[str] = Field(description="Detailed paragraphs/bullets explaining the subtopic.")

# Container for one week's output: the week's topic and its learning modules.
# Comments are used instead of a docstring so the model's JSON schema stays unchanged.
class WeeklyContent(BaseModel):
    # Name of the week's overarching topic.
    mainWeeklyTopic: str = Field(description="The main weekly topic.")
    # The modules generated for the week.
    learningModules: List[LearningModule] = Field(description="List of learning modules.")

# Schema of the JSON object the ToC LLM must return. generate_toc_for_chapter_text
# passes this model to JsonOutputParser and reads the "table_of_contents" key from
# the parsed reply. Kept docstring-free: a docstring would become part of the JSON
# schema and therefore of the format instructions sent to the LLM.
class ChapterToCOutput(BaseModel): # For LLM ToC generation for individual chapters
    # Section headings of one chapter.
    table_of_contents: List[str] = Field(description="List of concise section headings for the chapter.")

# --- Helper Functions: load_unit_outline, clean_metadata_value, generate_toc_for_chapter_text ---
def load_unit_outline(json_path: str) -> Dict[str, Any]:
    """Read the parsed unit outline from a UTF-8 JSON file.

    Args:
        json_path: Location of the unit outline JSON file.

    Returns:
        The decoded JSON content.

    Raises:
        Exception: Any error raised while opening or decoding the file is
            logged with its traceback and then re-raised unchanged.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as outline_file:
            outline = json.load(outline_file)
        logger.info(f"Successfully loaded unit outline: {json_path}")
        return outline
    except Exception as e:
        logger.error(f"Error loading outline {json_path}: {e}", exc_info=True)
        raise

def clean_metadata_value(value: Any) -> Any:
    """Reduce a metadata value to a scalar that can be stored as vector-store metadata.

    * ``None`` and ``str`` / ``int`` / ``float`` / ``bool`` values are returned as they are.
    * A ``list`` is rendered as its items joined by ``", "``.
    * A ``dict`` is rendered as JSON text.
    * Every other type is rendered with ``str()``.
    """
    passthrough_types = (str, int, float, bool)
    if value is None or isinstance(value, passthrough_types):
        return value

    if isinstance(value, list):
        rendered_items = [str(item) for item in value]
        return ", ".join(rendered_items)

    if isinstance(value, dict):
        return json.dumps(value)

    # Fallback for tuples, sets and arbitrary objects.
    return str(value)

def generate_toc_for_chapter_text(chapter_text: str, chapter_title: str, toc_llm: ChatOllama) -> List[str]:
    """Ask the LLM for the section headings of one chapter.

    Args:
        chapter_text: Full text of the chapter. Only the first 28000 characters
            are sent to the LLM; longer text is cut and marked as truncated.
        chapter_title: Title of the chapter, used in the prompt and in messages.
        toc_llm: Chat model expected to answer with a JSON object containing a
            "table_of_contents" list.

    Returns:
        The headings as non-blank strings. When the chapter is empty, the LLM
        returns nothing, or the call fails, a one-element list with an
        explanatory message is returned instead. This function never raises.
    """
    logger.info(f"Generating ToC with LLM for chapter: '{chapter_title}' (text length: {len(chapter_text)})")
    parser = JsonOutputParser(pydantic_object=ChapterToCOutput)
    MAX_CHARS_FOR_TOC_LLM = 28000
    if len(chapter_text) > MAX_CHARS_FOR_TOC_LLM:
        chapter_text_for_llm = chapter_text[:MAX_CHARS_FOR_TOC_LLM] + "\n... [TRUNCATED]"
    else:
        chapter_text_for_llm = chapter_text
    if not chapter_text_for_llm.strip():
        return [f"Content for chapter '{chapter_title}' was empty."]

    # Fix: the template must contain {chapter_title} and {chapter_content}.
    # Without these placeholders the values passed to invoke() are ignored and
    # the LLM never sees the chapter it is supposed to summarise.
    prompt_template = """You are an expert technical writer. Read the textbook chapter below and list its main section headings in the order in which they appear.

Chapter Title: "{chapter_title}"

---CHAPTER CONTENT START---
{chapter_content}
---CHAPTER CONTENT END---

Return ONLY a JSON object with a single key "table_of_contents" whose value is a list of concise section heading strings.

{format_instructions}"""
    prompt = ChatPromptTemplate.from_template(prompt_template, partial_variables={"format_instructions": parser.get_format_instructions()})
    chain = prompt | toc_llm | parser
    try:
        response = chain.invoke({"chapter_title": chapter_title, "chapter_content": chapter_text_for_llm})
        # Accept the requested object as well as a bare JSON array of headings.
        raw_toc = response.get("table_of_contents", []) if isinstance(response, dict) else response
        if isinstance(raw_toc, str):
            raw_toc = [raw_toc]
        if not isinstance(raw_toc, list):
            raw_toc = []
        toc_list = [str(heading).strip() for heading in raw_toc if heading is not None and str(heading).strip()]
        logger.info(f"LLM generated ToC for '{chapter_title}': Count {len(toc_list)}")
        return toc_list if toc_list else [f"No ToC generated by LLM for {chapter_title}"]
    except Exception as e:
        logger.error(f"LLM ToC generation error for '{chapter_title}': {e}", exc_info=True)
        return [f"Error generating ToC for {chapter_title}"]


# Title elements that open a new top-level unit of an EPUB, e.g. "Chapter 3",
# "Chapter IV", "Part 2", "Appendix A".
# NOTE(review): because of (?i) and the missing trailing \b, the "part" and
# "appendix" alternatives also match ordinary headings such as "Part of ..." or
# "Appendix and ...". The pattern is unchanged from the original; tighten it if
# such headings cause spurious chapter breaks.
_CHAPTER_HEADING_RE = re.compile(r"(?i)^(chapter\s+(\d+|[IVXLCDM]+)\b|part\s+[A-Z0-9]+|appendix\s+[A-Z])")
# Title elements this long or longer are never treated as headings.
_MAX_HEADING_CHARS = 150
# Lower-cased prefixes of Title elements that must not become ToC sections.
_NON_SECTION_TITLE_PREFIXES = ("note:", "tip:", "figure:", "table:", "listing:")


def _make_parent_document(page_content: str, metadata: Dict[str, Any]) -> Document:
    """Build a parent Document, dropping None metadata values and cleaning the rest."""
    cleaned = {key: clean_metadata_value(value) for key, value in metadata.items() if value is not None}
    return Document(page_content=page_content, metadata=cleaned)


def _load_pdf_parents(book_path: str) -> Tuple[List[Document], List[Dict[str, Any]]]:
    """Turn every PDF page into a parent document listed under one synthetic ToC chapter."""
    logger.warning("PDF processing: Pages as parents. Full hierarchical ToC extraction is basic for PDF.")
    parent_documents: List[Document] = []
    hierarchical_toc: List[Dict[str, Any]] = []
    pdf_loader = PyPDFLoader(book_path)
    try:
        pages = pdf_loader.load()
        page_sections: List[Dict[str, Any]] = []
        for index, page_doc in enumerate(pages):
            # PyPDFLoader reports a 0-based "page" index while the fallback was
            # 1-based; normalise to 1-based so the first page is "PDF Page 1".
            raw_page = page_doc.metadata.get("page")
            page_num = raw_page + 1 if isinstance(raw_page, int) else index + 1
            page_title = f"PDF Page {page_num}"
            page_sections.append({"title": page_title, "level": 2, "subsections": []})
            parent_documents.append(_make_parent_document(page_doc.page_content, {
                "source": os.path.basename(page_doc.metadata.get("source", book_path)),
                "page_number": page_num, "chapter_number": page_num,
                "chapter_title": page_title, "chapter_toc": [],
                "document_type": "parent", "parent_id": f"pdf_page_{page_num}",
            }))
        if page_sections:
            # A single pseudo-chapter holds all pages in the book-level ToC.
            hierarchical_toc.append({
                "title": "PDF Content (Page by Page)", "level": 1,
                "number": "PDF", "sections": page_sections,
            })
        logger.info(f"Loaded {len(parent_documents)} PDF pages as parent documents.")
    except Exception as e:
        logger.error(f"Error loading PDF: {e}", exc_info=True)
        raise
    return parent_documents, hierarchical_toc


def _load_epub_parents(book_path: str, toc_generation_llm: ChatOllama) -> Tuple[List[Document], List[Dict[str, Any]]]:
    """Group EPUB elements into chapters and build one parent document per chapter."""
    logger.info("Processing EPUB for chapters and generating detailed ToCs using LLM.")
    epub_loader = UnstructuredEPubLoader(book_path, mode="elements", strategy="fast")
    try:
        raw_elements = epub_loader.load()
        if not raw_elements:
            raise ValueError(f"No elements from EPUB {book_path}")
    except Exception as e:
        logger.error(f"Error loading EPUB: {e}", exc_info=True)
        raise

    parent_documents: List[Document] = []
    hierarchical_toc: List[Dict[str, Any]] = []
    source_name = os.path.basename(book_path)

    def emit_chapter(number: int, title: str, lines: List[str], sections: List[Dict[str, Any]],
                     parent_index: int, is_last: bool) -> None:
        """Generate the chapter's LLM ToC and record its ToC entry and parent document."""
        content = "\n".join(lines).strip()
        if not content:
            return
        llm_toc = generate_toc_for_chapter_text(content, title, toc_generation_llm)
        # Content that precedes the first chapter heading (number 0) is listed in
        # the book ToC only when it is also the final block of the book.
        if is_last or number > 0:
            hierarchical_toc.append({
                "title": title, "level": 1,
                "number": str(number),
                "sections": sections,
            })
        parent_documents.append(_make_parent_document(content, {
            "source": source_name, "chapter_number": number,
            "chapter_title": title, "chapter_toc": llm_toc,
            "document_type": "parent", "parent_id": f"epub_ch_{parent_index}",
        }))
        label = "last Ch" if is_last else "Ch"
        logger.info(f"Created parent doc {label} {number}: '{title}' (LLM ToC items: {len(llm_toc)})")

    chapter_number = 0
    chapter_title = "Preface or Introduction"  # Default for content before the first chapter heading
    chapter_lines: List[str] = []
    chapter_sections: List[Dict[str, Any]] = []
    parent_id_counter = 0

    for element in raw_elements:
        text = element.page_content.strip() if element.page_content else ""
        is_heading = (
            element.metadata.get("category") == "Title"
            and bool(text)
            and len(text) < _MAX_HEADING_CHARS
        )
        if is_heading and _CHAPTER_HEADING_RE.match(text):
            if chapter_lines:
                # Close the chapter collected so far before starting the next one.
                parent_id_counter += 1
                emit_chapter(chapter_number, chapter_title, chapter_lines, chapter_sections,
                             parent_id_counter, is_last=False)
                # Rebind instead of clearing: the emitted ToC entry keeps the old list.
                chapter_lines = []
                chapter_sections = []
            chapter_number += 1
            chapter_title = text
        elif is_heading and not text.lower().startswith(_NON_SECTION_TITLE_PREFIXES):
            chapter_sections.append({"title": text, "level": 2, "subsections": []})

        # Every non-empty element, headings included, is part of the chapter text.
        if text:
            chapter_lines.append(text)

    if chapter_lines:
        parent_id_counter += 1
        emit_chapter(chapter_number, chapter_title, chapter_lines, chapter_sections,
                     parent_id_counter, is_last=True)
    return parent_documents, hierarchical_toc


def load_book_into_parent_documents(book_path: str, toc_generation_llm: ChatOllama) -> Tuple[List[Document], List[Dict[str, Any]]]:
    """Load a PDF or EPUB book as parent documents plus a book-level ToC.

    PDF: each page becomes one parent document and all pages are listed under a
    single synthetic chapter. EPUB: elements are grouped into chapters at Title
    elements that look like chapter/part/appendix headings; each chapter gets an
    LLM-generated ToC, and its other Title elements become level-2 sections.

    Metadata values are passed through ``clean_metadata_value``, so the
    ``chapter_toc`` list is stored as one comma-separated string.

    Args:
        book_path: Path to the book; the extension selects the loader.
        toc_generation_llm: Chat model used for the per-chapter ToCs (EPUB only).

    Returns:
        ``(parent_documents, hierarchical_toc)``. Either list may be empty, in
        which case an error or warning is logged.

    Raises:
        ValueError: For an unsupported extension or an EPUB yielding no elements.
        Exception: Loader errors are logged and re-raised.
    """
    logger.info(f"Loading book '{book_path}' for parent docs & full ToC (LLM-assisted for chapters).")
    _, file_extension = os.path.splitext(book_path.lower())

    if file_extension == ".pdf":
        parent_documents, hierarchical_toc = _load_pdf_parents(book_path)
    elif file_extension == ".epub":
        parent_documents, hierarchical_toc = _load_epub_parents(book_path, toc_generation_llm)
    else:
        raise ValueError(f"Unsupported book file format: {file_extension}")

    if not parent_documents:
        logger.error("No parent documents created.")
    else:
        logger.info(f"Created {len(parent_documents)} parent documents.")

    if not hierarchical_toc:
        logger.warning("Hierarchical Table of Contents for the book is empty.")
    else:
        logger.info(f"Generated hierarchical book ToC with {len(hierarchical_toc)} top-level entries.")

    return parent_documents, hierarchical_toc
# --- Helpers: document formatting, subtopic context retrieval, content generation ---
# (format_docs_with_toc, format_docs_simple, get_subtopic_specific_context_from_parent,
#  generate_elaborated_content_for_subtopic)
def format_docs_with_toc(docs: List[Document]) -> str:
    """Render documents as delimited text blocks, each preceded by its chapter ToC.

    ``chapter_toc`` metadata is accepted as a list or as the comma-separated
    string that ``clean_metadata_value`` produces from a list. The ToC section
    is omitted when the value is empty or is the ToC-generation error placeholder.

    Args:
        docs: Documents exposing ``metadata`` (a dict) and ``page_content``.

    Returns:
        One block per document, separated by a blank line; "" for no documents.
    """
    rendered: List[str] = []
    for position, doc in enumerate(docs, start=1):
        title = doc.metadata.get("chapter_title", f"Retrieved Doc {position}")
        raw_toc = doc.metadata.get("chapter_toc", [])

        if isinstance(raw_toc, str):
            # Cleaned metadata stores the ToC as ", "-joined text.
            # NOTE(review): splitting is lossy for entries that contain ", ".
            if raw_toc.startswith("Error generating ToC for "):
                toc_items = []
            else:
                toc_items = [part.strip() for part in raw_toc.split(", ") if part.strip()]
        elif isinstance(raw_toc, list):
            toc_items = [str(item) for item in raw_toc]
            if len(toc_items) == 1 and "Error" in toc_items[0]:
                toc_items = []
        else:
            toc_items = []

        lines = [f"--- Start of Content for: {title} ---"]
        if toc_items:
            lines.append("Chapter Table of Contents (LLM Generated):")
            lines.extend(f"- {item}" for item in toc_items)
            lines.append("---")
        if doc.page_content:
            lines.append(doc.page_content)
        lines.append(f"--- End of Content for: {title} ---")
        rendered.append("\n".join(lines))
    return "\n\n".join(rendered)

def format_docs_simple(docs: List[Document]) -> str:
    """Join the page contents of ``docs`` with a ``---`` divider, skipping ``None`` contents."""
    divider = "\n\n---\n\n"
    contents = (doc.page_content for doc in docs if doc.page_content is not None)
    return divider.join(contents)

def get_subtopic_specific_context_from_parent(subtopic_query: str, parent_chapter_document: Document, embedding_model: OllamaEmbeddings, k: int = TOP_K_SUBTOPIC_CONTEXT) -> str:
    """Return the chunks of one parent chapter that best match a subtopic.

    The chapter text is split into chunks, indexed in a throwaway Chroma
    collection, searched with ``subtopic_query``, and the collection is deleted
    again before returning.

    Args:
        subtopic_query: Subtopic title used as the similarity-search query.
        parent_chapter_document: Chapter whose ``page_content`` is searched.
        embedding_model: Embedding model used to index and query the chunks.
        k: Maximum number of chunks to return.

    Returns:
        The matching chunks joined by ``format_docs_simple``; a short message
        when the chapter yields no chunks; ``"Error: ..."`` when indexing or
        searching fails (the error is logged).
    """
    import uuid  # local import: only needed to name the throwaway collection

    logger.info(f"Temp VS for parent: '{parent_chapter_document.metadata.get('chapter_title', 'Unk')}' for subtopic: '{subtopic_query}'")
    sub_splitter = RecursiveCharacterTextSplitter(chunk_size=SUBTOPIC_CONTEXT_CHUNK_SIZE, chunk_overlap=SUBTOPIC_CONTEXT_CHUNK_OVERLAP)
    parent_content_chunks_text = sub_splitter.split_text(parent_chapter_document.page_content)
    if not parent_content_chunks_text:
        return "Parent chapter yielded no text chunks."
    temp_docs = [Document(page_content=text) for text in parent_content_chunks_text]

    temp_vectorstore = None
    try:
        # A unique collection per call: without an explicit name every call
        # reuses Chroma's default in-process collection, so chunks indexed for
        # earlier subtopics pile up and come back as duplicate hits.
        temp_vectorstore = Chroma.from_documents(
            documents=temp_docs,
            embedding=embedding_model,
            collection_name=f"subtopic_ctx_{uuid.uuid4().hex}",
        )
        retrieved_sub_chunks = temp_vectorstore.similarity_search(subtopic_query, k=k)
        logger.info(f"Retrieved {len(retrieved_sub_chunks)} child chunks from parent chapter for subtopic '{subtopic_query}'.")
        return format_docs_simple(retrieved_sub_chunks)
    except Exception as e:
        logger.error(f"Error in temp VS for subtopic '{subtopic_query}': {e}", exc_info=True)
        return f"Error: {str(e)}"
    finally:
        # Best-effort cleanup so temporary collections do not accumulate.
        if temp_vectorstore is not None:
            try:
                temp_vectorstore.delete_collection()
            except Exception as cleanup_error:
                logger.warning(f"Could not delete temporary collection for subtopic '{subtopic_query}': {cleanup_error}")

def generate_elaborated_content_for_subtopic(subtopic_title: str, subtopic_context: str, generation_llm: ChatOllama) -> List[str]:
    """Have the LLM elaborate one subtopic from its retrieved context.

    Args:
        subtopic_title: Subtopic to write about.
        subtopic_context: Source text the elaboration should be based on.
        generation_llm: Chat model that writes the content.

    Returns:
        The generated paragraphs/bullets as non-empty strings. The list is never
        empty: an empty answer yields a "No specific elaboration" placeholder
        and a failed call yields a single ``"Error: ..."`` entry.
    """
    logger.info(f"Generating elaborated content for subtopic: '{subtopic_title}'")

    class SubtopicElaboration(BaseModel):
        points: List[str] = Field(description="List of detailed paragraphs/bullets.")

    parser = JsonOutputParser(pydantic_object=SubtopicElaboration)
    # The original instruction text is kept as-is; the two input variables are
    # appended because the template previously never referenced them, so the
    # subtopic title and its context were silently dropped from the prompt.
    template = (
        """You are an academic writer...{format_instructions}"""
        "\n\nSubtopic: {subtopic_title}\n\nSource context:\n---\n{subtopic_context}\n---"
    )
    prompt = ChatPromptTemplate.from_template(
        template,
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    chain = prompt | generation_llm | parser
    try:
        response = chain.invoke({"subtopic_title": subtopic_title, "subtopic_context": subtopic_context})
        if not isinstance(response, dict):
            raise TypeError(f"expected a JSON object, got {type(response).__name__}")
        raw_points = response.get("points") or []
        if isinstance(raw_points, str):
            # A lone string instead of a list: treat it as a single point.
            raw_points = [raw_points]
        # The declared return type is List[str]; coerce items and drop blanks.
        content_list = [text for text in (str(point).strip() for point in raw_points if point is not None) if text]
        if not content_list:
            content_list = [f"No specific elaboration for '{subtopic_title}' based on context."]
        logger.info(f"Generated {len(content_list)} content points for subtopic '{subtopic_title}'.")
        return content_list
    except Exception as e:
        logger.error(f"LLM error for subtopic '{subtopic_title}': {e}", exc_info=True)
        return [f"Error: {str(e)}"]


# --- Main Execution ---
def main():
    """Generate structured learning content for one week of the unit outline.

    Steps: validate the inputs and select the week, build parent documents and
    the book-level ToC, save that ToC, index child chunks behind a
    ParentDocumentRetriever, retrieve the chapter that best matches the week's
    topic, elaborate every entry of that chapter's LLM-generated ToC, and write
    the result to a JSON file. Most failures are logged and end the run early;
    book-loading errors propagate from ``load_book_into_parent_documents``.
    """
    logger.info("RAG Pipeline (ParentDocRetriever + LLM ToC for Parents + Iterate LLM-ToC Subtopics)...")
    if not os.path.exists(BOOK_PATH):
        logger.error(f"Book not found: {BOOK_PATH}")
        return
    if not os.path.exists(UNIT_OUTLINE_JSON_PATH):
        logger.error(f"Outline not found: {UNIT_OUTLINE_JSON_PATH}")
        return
    unit_outline_data = load_unit_outline(UNIT_OUTLINE_JSON_PATH)
    weekly_schedule = unit_outline_data.get("weeklySchedule", [])
    if not weekly_schedule:
        logger.error("No weekly schedule.")
        return
    WEEK_TO_PROCESS_INDEX = 0
    if WEEK_TO_PROCESS_INDEX >= len(weekly_schedule):
        logger.error("Week index out of bounds.")
        return
    selected_week_info = weekly_schedule[WEEK_TO_PROCESS_INDEX]
    main_weekly_topic_title = selected_week_info.get("topic") or selected_week_info.get("contentTopic")
    # Validate the title before it is formatted into the week name.
    if not main_weekly_topic_title:
        logger.error("Selected week has no topic title.")
        return
    week_identifier = selected_week_info.get("week", f"Idx_{WEEK_TO_PROCESS_INDEX}")
    full_main_topic_name = f"Week {week_identifier}: {main_weekly_topic_title}"
    logger.info(f"Processing: {full_main_topic_name}")

    try:
        embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
        content_gen_llm = ChatOllama(model=CONTENT_GENERATION_MODEL_OLLAMA, temperature=0.2, format="json")
        toc_gen_llm = ChatOllama(model=TOC_GENERATION_MODEL_OLLAMA, temperature=0.1, format="json")  # LLM for ToC
    except Exception as e:
        logger.error(f"LLM init error: {e}", exc_info=True)
        return

    # --- Load Parent Documents and Generate Full Book Hierarchical ToC ---
    parent_documents, book_hierarchical_toc = load_book_into_parent_documents(BOOK_PATH, toc_generation_llm=toc_gen_llm)
    if not parent_documents:
        logger.error("No parent documents loaded/created.")
        return

    # --- Save the Full Book Hierarchical ToC (best effort: a failure is logged, the run continues) ---
    try:
        with open(OUTPUT_BOOK_TOC_JSON_PATH, 'w', encoding='utf-8') as f:
            json.dump(book_hierarchical_toc, f, indent=2, ensure_ascii=False)
        logger.info(f"Full book hierarchical Table of Contents saved to: {OUTPUT_BOOK_TOC_JSON_PATH}")
    except Exception as e:
        logger.error(f"Error saving book hierarchical ToC: {e}", exc_info=True)

    # --- Setup ParentDocumentRetriever ---
    docstore_for_parents = InMemoryStore()
    parent_doc_ids = [doc.metadata.get("parent_id", f"pid_{i}") for i, doc in enumerate(parent_documents)]
    for doc, parent_id in zip(parent_documents, parent_doc_ids):
        doc.metadata["parent_id"] = parent_id
    docstore_for_parents.mset(list(zip(parent_doc_ids, parent_documents)))
    # Start from an empty child-chunk index on every run.
    if os.path.exists(CHROMA_PERSIST_DIR_CHILD_GLOBAL):
        shutil.rmtree(CHROMA_PERSIST_DIR_CHILD_GLOBAL)
    global_child_vectorstore = Chroma(collection_name=CHROMA_COLLECTION_NAME_CHILD_GLOBAL, embedding_function=embedding_model, persist_directory=CHROMA_PERSIST_DIR_CHILD_GLOBAL)
    global_child_splitter = RecursiveCharacterTextSplitter(chunk_size=CHILD_CHUNK_SIZE_GLOBAL, chunk_overlap=CHILD_CHUNK_OVERLAP_GLOBAL)
    parent_retriever_for_chapters = ParentDocumentRetriever(vectorstore=global_child_vectorstore, docstore=docstore_for_parents, child_splitter=global_child_splitter)
    # Parents were stored in the docstore above, so only the child chunks are indexed here.
    parent_retriever_for_chapters.add_documents(parent_documents, ids=parent_doc_ids, add_to_docstore=False)
    logger.info("Global ParentDocumentRetriever initialized for chapter lookup.")

    # --- Retrieve Primary Chapter & Iterate its LLM-Generated ToC for Content Gen ---
    logger.info(f"Retrieving primary parent chapter(s) for: '{main_weekly_topic_title}'")
    try:
        retrieved_primary_chapters = parent_retriever_for_chapters.invoke(main_weekly_topic_title)
        if not retrieved_primary_chapters:
            logger.error(f"No parent chapter found for: {main_weekly_topic_title}")
            return
        primary_chapter_doc = retrieved_primary_chapters[0]
        chapter_title = primary_chapter_doc.metadata.get("chapter_title", "Unknown Chapter")
        chapter_number_meta = str(primary_chapter_doc.metadata.get("chapter_number", "NA"))

        # The parent metadata went through clean_metadata_value, which stores a
        # list as one ", "-joined string. Accept both forms; requiring a list
        # here would discard every generated ToC and always use the fallback.
        # NOTE(review): splitting is lossy for ToC entries that contain ", ".
        raw_chapter_toc = primary_chapter_doc.metadata.get("chapter_toc", [])
        if isinstance(raw_chapter_toc, str):
            if raw_chapter_toc.startswith("Error generating ToC for "):
                llm_generated_chapter_toc_list = []
            else:
                llm_generated_chapter_toc_list = [item.strip() for item in raw_chapter_toc.split(", ") if item.strip()]
        elif isinstance(raw_chapter_toc, list):
            llm_generated_chapter_toc_list = [str(item) for item in raw_chapter_toc]
            if len(llm_generated_chapter_toc_list) == 1 and "Error" in llm_generated_chapter_toc_list[0]:
                llm_generated_chapter_toc_list = []
        else:
            llm_generated_chapter_toc_list = []

        if not llm_generated_chapter_toc_list:
            logger.warning(f"Invalid/empty LLM-generated ToC for '{chapter_title}'. Using fallback.")
            llm_generated_chapter_toc_list = [f"Overview of {chapter_title}"] if chapter_title != "Unknown Chapter" else ["Main concepts"]
    except Exception as e:
        logger.error(f"Error retrieving primary chapter/LLM-ToC: {e}", exc_info=True)
        return

    learning_modules_for_week: List[LearningModule] = []
    logger.info(f"Processing {len(llm_generated_chapter_toc_list)} subtopics from LLM-generated ToC of chapter '{chapter_title}':")
    for subtopic_from_llm_toc in llm_generated_chapter_toc_list:
        if not subtopic_from_llm_toc.strip():
            continue
        logger.info(f"-- Processing LLM-ToC Subtopic: '{subtopic_from_llm_toc}' --")
        subtopic_specific_context_str = get_subtopic_specific_context_from_parent(
            subtopic_query=subtopic_from_llm_toc, parent_chapter_document=primary_chapter_doc, embedding_model=embedding_model)

        elaborated_content = generate_elaborated_content_for_subtopic(
            subtopic_title=subtopic_from_llm_toc, subtopic_context=subtopic_specific_context_str, generation_llm=content_gen_llm)
        learning_modules_for_week.append(LearningModule(subtopicTitle=subtopic_from_llm_toc, elaboratedContent=elaborated_content))

    final_weekly_output = WeeklyContent(mainWeeklyTopic=full_main_topic_name, learningModules=learning_modules_for_week)
    output_filename = os.path.join(OUTPUT_STRUCTURED_JSON_DIR, f"week_{week_identifier}_content_llm_toc_ch{chapter_number_meta}.json")
    try:
        # Prefer Pydantic v2's model_dump(); fall back to dict() on older models.
        if hasattr(final_weekly_output, "model_dump"):
            output_payload = final_weekly_output.model_dump()
        else:
            output_payload = final_weekly_output.dict()
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(output_payload, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved detailed subtopic content to {output_filename}")
    except Exception as e:
        logger.error(f"Error saving JSON: {e}", exc_info=True)
    logger.info("Pipeline finished.")

if __name__ == "__main__":
    # Run the pipeline only once BOOK_PATH no longer contains the placeholder file name.
    if "YOUR_BOOK.pdf" not in BOOK_PATH:
        main()
    else:
        logger.error("!!! PLEASE UPDATE 'BOOK_PATH' !!!")

2025-06-15 21:31:05,279 - INFO - RAG Pipeline (ParentDocRetriever + LLM ToC for Parents + Iterate LLM-ToC Subtopics)...
2025-06-15 21:31:05,279 - INFO - Successfully loaded unit outline: /home/sebas_dev_linux/projects/course_generator/results/Parse_UO/ICT312 Digital Forensic_Final_parsed.json
2025-06-15 21:31:05,280 - INFO - Processing: Week 1: Understanding the Digital Forensics Profession and Investigations.
2025-06-15 21:31:05,310 - INFO - Loading book '/home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub' for parent docs & full ToC (LLM-assisted for chapters).
2025-06-15 21:31:05,311 - INFO - Processing EPUB for chapters and generating detailed ToCs using LLM.
2025-06-15 21:31:07,497 - INFO - Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.
2025-06-15 21:31:07,498 - INFO - 

In [2]:
import os
import json
import logging
from typing import List, Dict, Optional

# --- LangChain & Pydantic Imports ---
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from pydantic import BaseModel, Field

# --- Logging setup (may also live in a shared setup cell) ---
if not logging.getLogger().hasHandlers(): # Configure the root logger only if nothing (e.g. an earlier cell) has done so yet
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger_slides = logging.getLogger(__name__ + "_slides") # Dedicated logger for the slide-generation code
logger_slides.setLevel(logging.INFO) # INFO and above only, so logger_slides.debug(...) calls are not emitted

# --- Configuration for this cell ---
CHROMA_PERSIST_DIR = "./chroma_db_book_toc_guided_chunks_v1" # Existing Chroma store to read; presumably built by the earlier indexing script - confirm
CHROMA_COLLECTION_NAME = "book_toc_guided_chunks_v1"   # Collection to open inside that store
EMBEDDING_MODEL_OLLAMA = "nomic-embed-text"           # Must match the embedding model used when the store was created
CONTENT_GENERATION_MODEL_OLLAMA = "mistral:latest" # Ollama chat model that writes the slide content
OUTPUT_JSON_SLIDES_PATH = "./linux_validation_slides_content.json" # Destination of the generated presentation JSON

SEARCH_QUERY = "Linux Validation Methods" # Used both as the retrieval query and as the presentation topic
NUM_DOCS_TO_RETRIEVE = 5 # Number of documents fetched from the vector store for context
MAX_CONTEXT_LENGTH = 7000 # Character budget for the retrieved context placed in the prompt

# --- Pydantic Models for Structured Output ---
class Slide(BaseModel):
    # Schema for one slide of the generated presentation.
    slide_number: int = Field(
        description="Sequential number of the slide",
    )
    title: str = Field(
        description="The main title for this slide.",
    )
    key_points: List[str] = Field(
        description="A list of bullet points or key takeaways for the slide.",
    )
    # Optional; defaults to None when no notes are supplied.
    speaker_notes: Optional[str] = Field(
        default=None,
        description="Additional notes or details for the speaker for this slide.",
    )

class Presentation(BaseModel):
    # Schema for the whole presentation: a topic plus its slides.
    overall_topic: str = Field(
        description="The main topic of the presentation.",
    )
    slides: List[Slide] = Field(
        description="A list of slide objects.",
    )

def _build_slides_context(docs_with_scores, max_length: int) -> str:
    """Render retrieved ``(document, score)`` pairs as the context block for the prompt.

    Entries are added in order until the next one would push the total past
    ``max_length`` characters. If even the first entry is too long it is cut to
    ``max_length`` instead of being dropped, so the context is never empty when
    documents were retrieved.
    """
    context_parts: List[str] = []
    used_chars = 0
    for position, (doc, score) in enumerate(docs_with_scores, start=1):
        titles = []
        for level in range(1, 6):  # Assuming up to 5 levels of titles
            title = doc.metadata.get(f"level_{level}_title")
            if title:
                titles.append(str(title))
        title_path = " > ".join(titles) if titles else "N/A"
        # Chroma's similarity_search_with_score returns a distance, so the value
        # is labelled as such (lower means more similar), not as a relevance score.
        doc_info = (
            f"Source Document {position} (Distance: {score:.4f}, lower = more similar):\n"
            f"  Section Titles: {title_path}"
            f"\n  Page (if PDF): {doc.metadata.get('page_number', 'N/A')}\n"
            f"  Content Snippet:\n{doc.page_content}\n---\n"
        )
        if used_chars + len(doc_info) > max_length:
            logger_slides.warning(f"Context length limit ({max_length} chars) reached. Truncating context.")
            if not context_parts:
                context_parts.append(doc_info[:max_length])
            break
        context_parts.append(doc_info)
        used_chars += len(doc_info)
    return "\n".join(context_parts)


def generate_slide_content():
    """Generate slide content for ``SEARCH_QUERY`` and save it as JSON.

    Loads the existing Chroma collection, retrieves the documents closest to the
    query, asks the LLM for a presentation matching the ``Presentation`` schema,
    writes the parsed JSON to ``OUTPUT_JSON_SLIDES_PATH`` and prints a short
    summary. Every failure is logged and ends the run; nothing is returned.
    """
    logger_slides.info(f"Starting slide content generation for topic: '{SEARCH_QUERY}'")

    # 1. Initialize Embedding Model and Load ChromaDB
    try:
        embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
        logger_slides.info(f"Loading ChromaDB from: {CHROMA_PERSIST_DIR} with collection: {CHROMA_COLLECTION_NAME}")
        vector_db = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=embedding_model,
            collection_name=CHROMA_COLLECTION_NAME
        )
        collection_size = vector_db._collection.count()
        if collection_size == 0:
            logger_slides.error("ChromaDB collection is empty. Cannot generate content.")
            return
        logger_slides.info(f"Successfully loaded ChromaDB. Collection count: {collection_size}")
    except Exception as e:
        logger_slides.error(f"Error loading ChromaDB: {e}", exc_info=True)
        return

    # 2. Retrieve Relevant Documents
    try:
        logger_slides.info(f"Searching for relevant documents with query: '{SEARCH_QUERY}'")
        retrieved_docs_with_score = vector_db.similarity_search_with_score(SEARCH_QUERY, k=NUM_DOCS_TO_RETRIEVE)

        if not retrieved_docs_with_score:
            logger_slides.warning("No relevant documents found in the database for the query.")
            context_text = "No specific information found in the provided documents. Please generate general content based on the topic."
        else:
            logger_slides.info(f"Retrieved {len(retrieved_docs_with_score)} documents.")
            context_text = _build_slides_context(retrieved_docs_with_score, MAX_CONTEXT_LENGTH)
            logger_slides.debug(f"Context Text for LLM:\n{context_text[:500]}...") # Log a snippet
    except Exception as e:
        logger_slides.error(f"Error retrieving documents: {e}", exc_info=True)
        return

    # 3. Initialize Content Generation LLM
    try:
        llm = ChatOllama(
            model=CONTENT_GENERATION_MODEL_OLLAMA,
            temperature=0.3, # Lower temperature for more factual, less creative slides
            # format="json" # If the model and Ollama version support direct JSON output reliably
        )
        logger_slides.info(f"Initialized LLM: {CONTENT_GENERATION_MODEL_OLLAMA}")
    except Exception as e:
        logger_slides.error(f"Error initializing LLM: {e}", exc_info=True)
        return

    # 4. Define Prompt and Output Parser
    json_parser = JsonOutputParser(pydantic_object=Presentation)

    prompt_template_str = """
    You are an expert instructional designer tasked with creating presentation slides.
    Your goal is to generate content for a presentation on the topic: "{query}"

    Use the following context from relevant documents to inform the slide content.
    Focus on extracting and synthesizing key information related to the query.
    If the context is sparse or unhelpful, rely on your general knowledge about the topic.

    Context:
    ---
    {context}
    ---

    Presentation Topic: "{query}"

    Please generate content for approximately 3-5 slides.
    For each slide, provide:
    1.  A clear `title`.
    2.  A list of `key_points` (bullet points, 3-5 per slide).
    3.  Optional `speaker_notes` for further explanation.
    4.  A sequential `slide_number` starting from 1.

    The output MUST be a single, valid JSON object that conforms to the following Pydantic schema:
    ```json
    {json_schema}
    ```

    Ensure your entire response is ONLY the JSON object. Do not include any text before or after the JSON.
    """

    prompt = ChatPromptTemplate.from_template(
        template=prompt_template_str,
        partial_variables={"json_schema": json_parser.get_format_instructions()}
    )

    # 5. Call the LLM once, keep its raw text, then parse it.
    # Parsing separately means a parse failure can log the exact text that
    # failed, instead of calling the LLM a second time and logging a different
    # answer.
    chain_inputs = {"query": SEARCH_QUERY, "context": context_text}
    logger_slides.info("Generating slide content using LLM...")
    try:
        raw_output = (prompt | llm | StrOutputParser()).invoke(chain_inputs)
    except Exception as e:
        logger_slides.error(f"Error during LLM content generation or JSON parsing: {e}", exc_info=True)
        return

    try:
        generated_presentation_data = json_parser.parse(raw_output)
    except Exception as e:
        logger_slides.error(f"Error during LLM content generation or JSON parsing: {e}", exc_info=True)
        logger_slides.error(f"LLM Raw Output:\n{raw_output}")
        return
    logger_slides.info("Successfully generated slide content.")

    # 6. Save Output to JSON (the parser returns plain JSON data, not a model instance)
    try:
        with open(OUTPUT_JSON_SLIDES_PATH, 'w', encoding='utf-8') as f:
            json.dump(generated_presentation_data, f, indent=4, ensure_ascii=False)
    except Exception as e:
        logger_slides.error(f"Error saving JSON output: {e}", exc_info=True)
        return
    logger_slides.info(f"Slide content saved to: {OUTPUT_JSON_SLIDES_PATH}")

    # Print a summary of the generated content. The output is not validated
    # against Presentation, so tolerate shapes that do not match the schema.
    if not isinstance(generated_presentation_data, dict):
        logger_slides.warning("Generated output is not a JSON object; skipping summary.")
        return
    print("\n--- Generated Presentation Summary ---")
    print(f"Topic: {generated_presentation_data.get('overall_topic')}")
    slides = generated_presentation_data.get('slides')
    if isinstance(slides, list):
        print(f"Number of Slides: {len(slides)}")
        for i, slide_data in enumerate(slides):
            if isinstance(slide_data, dict):
                print(f"  Slide {slide_data.get('slide_number', i+1)}: {slide_data.get('title')}")
    print("------------------------------------")

# --- Execute the generation ---
# Runs when this code is executed directly (as a script or as a notebook cell), not when it is imported.
if __name__ == "__main__":
    generate_slide_content()

2025-06-16 00:24:02,544 - INFO - Starting slide content generation for topic: 'Linux Validation Methods'
2025-06-16 00:24:02,570 - INFO - Loading ChromaDB from: ./chroma_db_book_toc_guided_chunks_v1 with collection: book_toc_guided_chunks_v1
2025-06-16 00:24:02,577 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-06-16 00:24:02,758 - INFO - Successfully loaded ChromaDB. Collection count: 11774
2025-06-16 00:24:02,758 - INFO - Searching for relevant documents with query: 'Linux Validation Methods'
2025-06-16 00:24:03,957 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-06-16 00:24:03,966 - INFO - Retrieved 5 documents.
2025-06-16 00:24:03,979 - INFO - Initialized LLM: mistral:latest
2025-06-16 00:24:03,986 - INFO - Generating slide content using LLM...
2025-06-16 00:24:09,049 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-06-16 00:24:23,778


--- Generated Presentation Summary ---
Topic: Linux Validation Methods
Number of Slides: 5
  Slide 1: Introduction to Linux Validation Methods
  Slide 2: Validation and Verification in Linux Forensics
  Slide 3: Using Validation Protocols in Linux Forensics
  Slide 4: Evaluating Digital Forensics Tools for Linux Validation
  Slide 5: Conclusion: Best Practices for Linux Validation Methods
------------------------------------
