In [3]:
import os
import json
import logging
from typing import List, Dict, Any

# --- LangChain Core Imports ---
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# --- LangChain Community Imports ---
from langchain_community.document_loaders import PyPDFLoader # For PDFs
# from langchain_community.document_loaders import Docx2txtLoader # For DOCX
from langchain_community.document_loaders import UnstructuredEPubLoader # For EPUB
# from langchain_community.vectorstores import Chroma
from langchain_chroma import Chroma # New
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores.utils import filter_complex_metadata

# --- LangChain Text Splitter ---
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- PowerPoint ---
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.text import PP_ALIGN

# --- Setup Logging ---
# Root logger emits INFO and above as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Configuration ---
# Paths
# Textbook to index. load_and_split_book() picks a loader from the file
# extension and only handles ".pdf" and ".epub". UPDATE THIS for your machine.
BOOK_PATH = "/home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub"
# Parsed unit outline (JSON). main() reads "unitInformation.unitName" and
# "weeklySchedule" from it.
UNIT_OUTLINE_JSON_PATH = "/home/sebas_dev_linux/projects/course_generator/results/Parse_UO/ICT312 Digital Forensic_Final_parsed.json"
# Directory and collection name handed to Chroma for the persisted vector store.
CHROMA_PERSIST_DIR = "./chroma_db_book"
CHROMA_COLLECTION_NAME = "book_content"
# File name passed to Presentation.save() for the generated slide deck.
OUTPUT_PPTX_FILENAME = "generated_course_slides.pptx"

# Ollama Models
# Both models must already be pulled in the local Ollama instance.
# Embedding model (alternatives: mxbai-embed-large, etc.).
EMBEDDING_MODEL_OLLAMA = "nomic-embed-text"
# Chat model used to write slide text (alternatives: llama3, etc.).
GENERATION_MODEL_OLLAMA = "mistral:latest"

# RAG Parameters
# Splitter settings; the splitter is configured with length_function=len, so
# both values are measured in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150
# Number of chunks to retrieve as context for each topic.
TOP_K_RETRIEVER = 5

# --- Helper Functions ---

def load_unit_outline(json_path: str) -> Dict[str, Any]:
    """Read the unit outline JSON file and return its parsed content.

    Args:
        json_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.

    Raises:
        FileNotFoundError: The file does not exist.
        json.JSONDecodeError: The file is not valid JSON.
        Exception: Any other failure; it is logged and re-raised unchanged.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as outline_file:
            outline = json.load(outline_file)
    except FileNotFoundError:
        logger.error(f"Unit outline JSON file not found: {json_path}")
        raise
    except json.JSONDecodeError:
        logger.error(f"Error decoding JSON from: {json_path}")
        raise
    except Exception as exc:
        logger.error(f"An unexpected error occurred while loading unit outline: {exc}")
        raise
    else:
        # Only reached when reading and parsing both succeeded.
        logger.info(f"Successfully loaded unit outline from: {json_path}")
        return outline

def load_and_split_book(book_path: str) -> List[Any]: # Returns List of LangChain Documents
    """Load a book file and split it into overlapping text chunks.

    The loader is chosen from the (case-insensitive) file extension: ``.pdf``
    uses ``PyPDFLoader`` and ``.epub`` uses ``UnstructuredEPubLoader``. The
    loaded documents are passed through ``filter_complex_metadata`` before
    splitting so the chunks carry metadata the vector store can accept.

    Args:
        book_path: Path to the book file.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` using
        ``CHUNK_SIZE`` and ``CHUNK_OVERLAP``. May be empty, in which case a
        warning is logged.

    Raises:
        ValueError: The extension is not supported, or the loader returned
            no documents.
        Exception: Any error raised by the loader is logged and re-raised.
    """
    logger.info(f"Loading book from: {book_path}")
    _, file_extension = os.path.splitext(book_path.lower())

    if file_extension == ".pdf":
        loader = PyPDFLoader(book_path)
    elif file_extension == ".epub":
        logger.info("Using UnstructuredEPubLoader for EPUB file.")
        # mode="elements" is good for structured data. strategy="fast" is usually sufficient.
        loader = UnstructuredEPubLoader(book_path, mode="elements", strategy="fast")
    else:
        # Only the two formats handled above are advertised; there is no DOCX loader here.
        logger.error(f"Unsupported book file format: {file_extension}. Please use PDF or EPUB.")
        raise ValueError(f"Unsupported book file format: {file_extension}")

    # Keep the try body to the loader call so that only genuine loader
    # failures are reported as such.
    try:
        documents = loader.load()
    except Exception as e:
        logger.error(f"Error loading book content with {loader.__class__.__name__}: {e}", exc_info=True)
        raise

    if not documents:
        logger.error(f"No documents loaded from {book_path}. The loader might have failed or the file is empty/corrupt.")
        raise ValueError(f"No content loaded from {book_path}")
    logger.info(f"Successfully loaded {len(documents)} elements/sections from the book.")

    # Filter complex metadata before splitting, because the splitter copies
    # each document's metadata onto the chunks it produces.
    logger.info("Filtering complex metadata from loaded documents...")
    filtered_documents = filter_complex_metadata(documents)
    logger.info(f"Original doc count: {len(documents)}, Filtered doc count: {len(filtered_documents)}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        add_start_index=True
    )
    chunks = text_splitter.split_documents(filtered_documents)
    if not chunks:
        logger.warning(f"Text splitting resulted in 0 chunks for {book_path}. Check document content and splitter settings.")
    else:
        logger.info(f"Split book into {len(chunks)} chunks.")
    return chunks

def _build_vector_db(chunks: List[Any], embedding_model: OllamaEmbeddings, persist_directory: str, collection_name: str) -> Chroma:
    """Embed ``chunks`` into a new persisted Chroma collection and return it."""
    return Chroma.from_documents(
        documents=chunks,  # Expected to carry metadata already filtered by the caller
        embedding=embedding_model,
        persist_directory=persist_directory,
        collection_name=collection_name
    )


def create_or_load_vector_db(chunks: List[Any], embedding_model: OllamaEmbeddings, persist_directory: str, collection_name: str) -> Chroma:
    """Return a Chroma vector store, reusing the persisted one when possible.

    Behaviour:
      * ``persist_directory`` missing or empty: build a new store from
        ``chunks``.
      * Directory present and the collection opens with at least one entry:
        return it as-is (``chunks`` are not added to it).
      * Directory present but the collection is empty and ``chunks`` is
        non-empty: rebuild from ``chunks``.
      * Opening the existing store raises: log a warning and rebuild.

    Args:
        chunks: Documents to embed when a (re)build is needed.
        embedding_model: Embedding function used both to open and to build.
        persist_directory: Directory where Chroma persists its data.
        collection_name: Name of the Chroma collection.

    Returns:
        The loaded or newly built ``Chroma`` instance.

    Raises:
        Exception: Errors raised while building the store are not caught.
    """
    # isdir() guards listdir() against a path that exists but is a regular file.
    has_persisted_data = os.path.isdir(persist_directory) and bool(os.listdir(persist_directory))
    if not has_persisted_data:
        logger.info(f"Creating new vector database and persisting to: {persist_directory}")
        vector_db = _build_vector_db(chunks, embedding_model, persist_directory, collection_name)
        logger.info("Vector database created successfully.")
        return vector_db

    logger.info(f"Loading existing vector database from: {persist_directory}")
    # Only opening/inspecting the store is inside the try, so a failure while
    # rebuilding is never misreported as a load failure and retried.
    try:
        vector_db = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_model,
            collection_name=collection_name
        )
        # NOTE: _collection is a private attribute of the Chroma wrapper.
        collection_is_empty = vector_db._collection.count() == 0
    except Exception as e:
        logger.warning(f"Failed to load existing vector database ({e}). Rebuilding...")
        vector_db = _build_vector_db(chunks, embedding_model, persist_directory, collection_name)
        logger.info(f"Vector database created and persisted to: {persist_directory}")
        return vector_db

    if collection_is_empty and chunks:
        logger.warning("Found existing Chroma directory, but collection seems empty or couldn't load. Rebuilding...")
        vector_db = _build_vector_db(chunks, embedding_model, persist_directory, collection_name)
        logger.info(f"Vector database rebuilt and persisted to: {persist_directory}")
    else:
        logger.info("Successfully loaded vector database.")
    return vector_db

def format_docs(docs: List[Any]) -> str:
    """Concatenate the ``page_content`` of each document into one string.

    Consecutive documents are separated by a ``---`` rule surrounded by blank
    lines; an empty input yields an empty string.
    """
    separator = "\n\n---\n\n"
    return separator.join(document.page_content for document in docs)

def generate_slide_content_for_topic(topic: str, retriever: Any, generation_llm: ChatOllama) -> str:
    """Generate slide body text for ``topic`` with a retrieval-augmented chain.

    The retriever is invoked with the topic, the retrieved documents are
    joined by ``format_docs`` and inserted into the prompt together with the
    topic, and the LLM output is parsed to a plain string.

    Args:
        topic: Topic text; used both as the retrieval query and in the prompt.
        retriever: Runnable that returns documents for a query string.
        generation_llm: Chat model that writes the slide content.

    Returns:
        The generated text, or a human-readable failure message when the
        chain raised, produced no answer, or produced only whitespace. This
        function does not raise for chain failures.
    """
    logger.info(f"Generating content for topic: '{topic}'")

    template = """
    You are an assistant helping create content for university lecture slides.
    The main topic for the current slide is: "{topic}"

    Here is some relevant context retrieved from the course textbook:
    ---CONTEXT START---
    {context}
    ---CONTEXT END---

    Based on the topic and the provided context, generate concise and informative content suitable for a presentation slide.
    Aim for 3-5 key bullet points or a short, clear paragraph. Focus on the most important aspects for students.
    Do not include a title for the slide, just the content itself.
    Do not say "Here is the slide content" or similar phrases.
    """
    prompt = ChatPromptTemplate.from_template(template)

    # Takes {"context": [docs], "topic": str} and produces the answer string.
    rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | prompt
        | generation_llm
        | StrOutputParser()
    )

    # Runs retrieval and topic pass-through in parallel, then adds "answer".
    rag_chain_with_source = RunnableParallel(
        {"context": retriever, "topic": RunnablePassthrough()}
    ).assign(answer=rag_chain_from_docs)

    try:
        response = rag_chain_with_source.invoke(topic)
        answer = response.get("answer")

        # A missing answer is detected with a None sentinel rather than by
        # searching the text for "Error:", so genuine slide content that
        # happens to contain that substring is not discarded.
        if answer is None:
            failure_detail = "Error: Could not generate content."
        elif not answer.strip():
            failure_detail = "Empty"
        else:
            logger.info(f"Successfully generated content for topic: '{topic}'")
            return answer

        logger.warning(f"LLM returned empty or error content for topic '{topic}'. Response: {failure_detail}")
        return f"Could not generate content for this topic. LLM response: {failure_detail}"
    except Exception as e:
        # Best effort: one failing topic must not abort the whole deck.
        logger.error(f"Error generating content for topic '{topic}': {e}", exc_info=True)
        return f"Error generating content for this topic: {e}"


def _strip_list_marker(line: str) -> str:
    """Remove one leading list marker ("- ", "* ", "1. ", "2) ", "a) ") from ``line``.

    Lines without a recognised marker are returned unchanged.
    """
    import re  # local import keeps this block self-contained

    # A marker must be followed by whitespace, so text such as "**bold**" or
    # a word starting with a lowercase letter is left intact.
    return re.sub(r"^(?:[-*\u2022]+|\d+[.)]|[A-Za-z]\))\s+", "", line, count=1)


def create_presentation(slides_data: List[Dict[str, str]], output_filename: str, unit_name: str):
    """Create a PowerPoint file with a title slide and one slide per item.

    Args:
        slides_data: One dict per content slide. ``"topic"`` becomes the slide
            title and ``"content"`` the body; missing keys fall back to
            placeholder text.
        output_filename: Path the presentation is saved to.
        unit_name: Title-slide text; "Course Presentation" is used when falsy.

    Raises:
        Exception: Any error raised while saving is logged and re-raised.
    """
    prs = Presentation()

    # Title Slide
    title_slide_layout = prs.slide_layouts[0]
    slide = prs.slides.add_slide(title_slide_layout)
    title = slide.shapes.title
    subtitle = slide.placeholders[1]
    title.text = unit_name if unit_name else "Course Presentation"
    subtitle.text = "Generated Course Material"

    # Content Slides
    bullet_slide_layout = prs.slide_layouts[1] # Title and Content
    for item in slides_data:
        slide_topic = item.get("topic", "Unnamed Topic")
        slide_content = item.get("content", "No content generated.")

        slide = prs.slides.add_slide(bullet_slide_layout)
        slide.shapes.title.text = slide_topic

        body_shape = slide.placeholders[1] # Assumes placeholder 1 is the main content body
        tf = body_shape.text_frame
        tf.clear() # Clear existing text
        tf.word_wrap = True

        # Treat the content as a list when it has several non-blank lines and
        # at least one of them starts with a list marker; otherwise keep the
        # text as a single paragraph.
        content_lines = [line.strip() for line in slide_content.split('\n') if line.strip()]
        cleaned_lines = [_strip_list_marker(line) for line in content_lines]
        looks_like_list = len(content_lines) > 1 and cleaned_lines != content_lines
        paragraph_texts = cleaned_lines if looks_like_list else [slide_content]

        for index, text in enumerate(paragraph_texts):
            # clear() leaves one empty paragraph in the frame; reuse it for
            # the first line so the body does not start with a blank bullet.
            p = tf.paragraphs[0] if index == 0 else tf.add_paragraph()
            p.text = text
            p.font.size = Pt(18)
            p.level = 0 # Adjust level for sub-bullets if needed

    try:
        prs.save(output_filename)
        logger.info(f"Presentation saved as {output_filename}")
    except Exception as e:
        logger.error(f"Error saving presentation: {e}")
        raise

# --- Main Execution ---
def main():
    """Run the book-to-slides pipeline end to end.

    Steps: verify that the input files exist, load the unit outline,
    initialise the Ollama embedding and chat models, load and chunk the book,
    create or load the Chroma vector store, generate content for every
    ``weeklySchedule`` entry that has a ``contentTopic``, and write the
    PowerPoint file.

    Missing inputs, an empty schedule, model initialisation failures and an
    empty chunk list are logged and end the run early by returning. Errors
    raised by the helper functions propagate to the caller.
    """
    logger.info("Starting Book-to-Slides RAG pipeline...")

    # --- 0. Check prerequisites ---
    if not os.path.exists(BOOK_PATH):
        logger.error(f"Book file not found at: {BOOK_PATH}. Please update BOOK_PATH.")
        return
    if not os.path.exists(UNIT_OUTLINE_JSON_PATH):
        logger.error(f"Unit outline JSON file not found at: {UNIT_OUTLINE_JSON_PATH}. Please update UNIT_OUTLINE_JSON_PATH.")
        return

    # --- 1. Load Unit Outline ---
    unit_outline_data = load_unit_outline(UNIT_OUTLINE_JSON_PATH)
    unit_name = unit_outline_data.get("unitInformation", {}).get("unitName", "Course Slides")
    weekly_schedule = unit_outline_data.get("weeklySchedule", [])
    if not weekly_schedule:
        logger.error("No weekly schedule found in the unit outline. Cannot generate slides.")
        return

    # --- 2. Initialize Ollama Models ---
    logger.info(f"Initializing Ollama embedding model: {EMBEDDING_MODEL_OLLAMA}")
    try:
        embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL_OLLAMA)
    except Exception as e:
        logger.error(f"Failed to initialize Ollama embedding model '{EMBEDDING_MODEL_OLLAMA}': {e}")
        logger.error("Please ensure Ollama is running and the model is pulled (e.g., `ollama pull nomic-embed-text`).")
        return

    logger.info(f"Initializing Ollama generation model: {GENERATION_MODEL_OLLAMA}")
    try:
        generation_llm = ChatOllama(model=GENERATION_MODEL_OLLAMA, temperature=0.3)
    except Exception as e:
        logger.error(f"Failed to initialize Ollama generation model '{GENERATION_MODEL_OLLAMA}': {e}")
        logger.error("Please ensure Ollama is running and the model is pulled (e.g., `ollama pull mistral`).")
        return

    # --- 3. Load and Process Book ---
    book_chunks = load_and_split_book(BOOK_PATH)
    if not book_chunks:
        logger.error("Failed to load or split the book. Exiting.")
        return

    # --- 4. Create or Load Vector DB ---
    vector_db = create_or_load_vector_db(book_chunks, embedding_model, CHROMA_PERSIST_DIR, CHROMA_COLLECTION_NAME)
    retriever = vector_db.as_retriever(search_kwargs={"k": TOP_K_RETRIEVER})

    # --- 5. Generate Content for Each Topic ---
    all_slides_data = []
    logger.info(f"Found {len(weekly_schedule)} weeks/topics in the schedule.")

    for week_info in weekly_schedule:
        topic_title = week_info.get("contentTopic")
        week_identifier = week_info.get("week", "N/A")
        if not topic_title:
            logger.warning(f"Skipping week {week_identifier} due to missing topic title.")
            continue

        slide_content = generate_slide_content_for_topic(topic_title, retriever, generation_llm)
        slide_title = f"Week {week_identifier}: {topic_title}"
        all_slides_data.append({"topic": slide_title, "content": slide_content})
        # Log only the slide just produced; dumping the whole accumulated list
        # on every iteration makes the log grow quadratically.
        logger.info(f"Generated content for '{slide_title}': {slide_content}")

    # --- 6. Create PowerPoint Presentation ---
    if all_slides_data:
        create_presentation(all_slides_data, OUTPUT_PPTX_FILENAME, unit_name)
    else:
        logger.warning("No slide data was generated. PowerPoint presentation will not be created.")

    logger.info("Book-to-Slides RAG pipeline finished.")

if __name__ == "__main__":
    # Run the pipeline unless BOOK_PATH still holds the template placeholder
    # file name, in which case a prominent reminder is logged instead.
    if "YOUR_BOOK.pdf" not in BOOK_PATH:
        main()
    else:
        logger.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        logger.error("!!! PLEASE UPDATE 'BOOK_PATH' in the script with the actual path to your book file. !!!")
        logger.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

2025-06-14 12:48:34,755 - INFO - Starting Book-to-Slides RAG pipeline...
2025-06-14 12:48:34,756 - INFO - Successfully loaded unit outline from: /home/sebas_dev_linux/projects/course_generator/results/Parse_UO/ICT312 Digital Forensic_Final_parsed.json
2025-06-14 12:48:34,757 - INFO - Initializing Ollama embedding model: nomic-embed-text
2025-06-14 12:48:34,757 - INFO - Initializing Ollama generation model: mistral:latest
2025-06-14 12:48:34,758 - INFO - Loading book from: /home/sebas_dev_linux/projects/course_generator/data/books/Bill Nelson, Amelia Phillips, Christopher Steuart - Guide to Computer Forensics and Investigations_ Processing Digital Evidence-Cengage Learning (2018).epub
2025-06-14 12:48:34,758 - INFO - Using UnstructuredEPubLoader for EPUB file.
  data file translations/en.yaml not found
  data file translations/en.yaml not found


2025-06-14 12:48:42,792 - INFO - Successfully loaded 11815 elements/sections from the EPUB book.
2025-06-14 12:48:42,793 - INFO - Filtering co