<a href="https://colab.research.google.com/github/sahanpereramerry/evergreen-notes-public/blob/main/Evergreen_Notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import json
import PyPDF2
import openai
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Set your OpenAI API key (in production, use environment variables)
# openai.api_key = os.environ.get("OPENAI_API_KEY")

class EvergreenNotesGenerator:
    def __init__(self, api_key=None, model="gpt-4o", notes_per_chapter=5):
        """
        Set up the OpenAI client and the generation settings.

        Args:
            api_key (str): OpenAI API key. When falsy, the client is created
                without an explicit key.
            model (str): OpenAI model used for every request.
            notes_per_chapter (int): Number of notes to generate per chapter.
        """
        # Only forward the key when one was actually supplied.
        client_kwargs = {"api_key": api_key} if api_key else {}
        self.client = openai.OpenAI(**client_kwargs)

        self.notes_per_chapter = notes_per_chapter
        self.model = model

        # Character budget for the chapter text sent in a single request.
        # GPT-4o can handle up to 128k tokens; ~90k tokens (~360,000 chars)
        # is used as a conservative limit.
        self.max_chunk_chars = 360000

    def extract_text_from_pdf(self, pdf_path):
        """Extract the full text of a PDF file.

        Args:
            pdf_path (str): Path of the PDF file to read.

        Returns:
            str: The extracted text of every page in order, each page
            followed by a newline. A page that yields no text contributes
            only its newline.
        """
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` guards against a page yielding None (presumably pages
            # without a text layer), which would otherwise make the
            # concatenation raise. The join is evaluated while the file is
            # still open because pages are read lazily from it.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )

    def extract_chapters(self, full_text):
        """
        Split the book text into chapters using heading heuristics.

        Strategy, in order:
          1. Match heading patterns (CHAPTER/PART/SECTION, numbered lines,
             roman-numeral lines), plus any entries found in a table of
             contents. Text between two consecutive headings is a chapter.
          2. If that yields nothing, split on bare CHAPTER/PART/SECTION
             markers and name the pieces "Chapter N".
          3. If that yields nothing either, cut the text into equal-length
             pieces named "Section N".

        Candidate chapters of 1000 characters or less are discarded as
        likely false positives.

        Args:
            full_text (str): Full text of the book.

        Returns:
            dict: Maps chapter heading to chapter text, in order of
            appearance. When the same heading occurs more than once, later
            occurrences get a " (2)", " (3)", ... suffix so that they do not
            overwrite earlier chapters.
        """
        # Method 1: Extract based on Table of Contents if available
        toc_pattern = r'(?:TABLE\s+OF\s+CONTENTS|CONTENTS).*?\n(.*?)(?:\n\s*\n|\n(?:CHAPTER|INTRODUCTION))'
        toc_match = re.search(toc_pattern, full_text, re.DOTALL | re.IGNORECASE)

        chapter_patterns = [
            # Common chapter heading patterns
            r'(?:^|\n)(?:CHAPTER|PART|SECTION)\s+[\dIVXLC]+(?:[:\.\s]+[^\n]+)?(?=\n)',
            r'(?:^|\n)(?:\d+\.?\s+)(?:[A-Z][^\n]+)(?=\n)',
            r'(?:^|\n)(?:[IVX]+\.?\s+)(?:[A-Z][^\n]+)(?=\n)'
        ]

        # If TOC found, extract chapter titles from it
        if toc_match:
            toc_text = toc_match.group(1)
            toc_entries = re.findall(r'((?:chapter|part|section)\s+[\dIVXLC]+[:\.\s]+[^\n]+)',
                                    toc_text, re.IGNORECASE)

            # Add TOC entries to patterns
            for entry in toc_entries:
                clean_entry = entry.strip()
                chapter_patterns.insert(0, f'(?:^|\n)(?:{re.escape(clean_entry)})(?=\n)')

        # Find all potential chapter headings as (start, end, heading).
        # The match end is recorded because the match usually starts on the
        # newline *before* the heading; deriving the content start from
        # start + len(stripped heading) would leave the heading's last
        # character at the front of the chapter text.
        all_headings = []
        for pattern in chapter_patterns:
            for match in re.finditer(pattern, full_text, re.MULTILINE | re.IGNORECASE):
                all_headings.append((match.start(), match.end(), match.group().strip()))

        # Sort headings by position in text (longest match last on ties)
        all_headings.sort()

        # Extract chapter content
        chapters = {}
        last_index = len(all_headings) - 1
        for i, (_, heading_end, heading) in enumerate(all_headings):
            # A chapter ends where the next heading starts, or at end of text
            end_pos = all_headings[i + 1][0] if i < last_index else len(full_text)

            chapter_content = full_text[heading_end:end_pos].strip()

            # Skip extremely short "chapters" (likely false positives)
            if len(chapter_content) < 1000:
                continue

            # Keep repeated headings apart instead of overwriting
            key = heading
            occurrence = 2
            while key in chapters:
                key = f"{heading} ({occurrence})"
                occurrence += 1

            chapters[key] = chapter_content

        # Fallback method: If no chapters found, try simple numeric chapter detection
        if not chapters:
            simple_chapters = re.split(r'\n(?:CHAPTER|PART|SECTION)\s+[\dIVXLC]+', full_text)
            if len(simple_chapters) > 1:
                for i, chapter in enumerate(simple_chapters[1:], 1):
                    if len(chapter.strip()) > 1000:  # Skip short sections
                        chapters[f"Chapter {i}"] = chapter.strip()

        # Last resort: split by length into even sections
        if not chapters:
            # Split into roughly equal sections, aiming for 10-15 chapters
            text_length = len(full_text)
            section_length = max(10000, text_length // 12)  # Min 10,000 chars per section

            for i in range(0, text_length, section_length):
                section_text = full_text[i:i + section_length].strip()
                if len(section_text) > 1000:  # Skip short sections
                    chapters[f"Section {i//section_length + 1}"] = section_text

        return chapters

    def generate_titles(self, chapter_text, chapter_title):
        """Generate potential evergreen note titles for a chapter"""
        # Prepare system prompt with detailed guidance
        system_prompt = (
            "You are an expert at creating insightful evergreen notes from academic and non-fiction texts. "
            "Your task is to generate potential titles for evergreen notes based on the chapter provided. "
            "Evergreen note titles should: "
            "1. Be complete sentences that express a clear, specific claim or insight. "
            "2. Convey enough detail to understand the core idea without additional context. "
            "3. Use clear, precise language and active verbs. "
            "4. Focus on the most important and insightful ideas from the text. "
            "5. Be framed positively and constructively. "
            "6. Avoid vague, general statements - be specific and concrete. "
            "Generate titles that capture the most significant concepts, arguments, frameworks, and insights from the chapter."
        )

        # Prepare user prompt
        user_prompt = (
            f"Generate {self.notes_per_chapter * 2} potential evergreen note titles based on this chapter: "
            f"CHAPTER TITLE: {chapter_title} "
            f"CHAPTER CONTENT: {chapter_text} "
            f"Return ONLY the titles, one per line, numbered."
        )

        # Make API call
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )

        # Extract titles from response
        content = response.choices[0].message.content
        titles = []

        # Parse titles, handling both numbered and unnumbered formats
        for line in content.split('\n'):
            line = line.strip()
            if not line:
                continue

            # Remove numbering if present
            cleaned_line = re.sub(r'^\d+[\.\)]\s*', '', line)
            if cleaned_line:
                titles.append(cleaned_line)

        return titles

    def select_best_titles(self, chapter_text, titles, chapter_title):
        """Select the best titles based on relevance and quality"""
        system_prompt = (
            "You are an expert at creating insightful evergreen notes from academic and non-fiction texts. "
            "Your task is to select the most valuable and insightful evergreen note titles from a list of candidates. "
            "The best evergreen note titles should: "
            "1. Capture significant insights or claims from the text. "
            "2. Be specific, clear, and well-formulated. "
            "3. Focus on the most important concepts in the chapter. "
            "4. Cover diverse aspects of the chapter content (avoid redundancy). "
            "5. Prioritize non-obvious, thought-provoking ideas."
        )

        user_prompt = (
            f"From the following list of potential evergreen note titles for the chapter '{chapter_title}', "
            f"select exactly {self.notes_per_chapter} titles that best meet the criteria in my instructions. "
            f"Candidate titles: {' '.join([f'{i+1}. {title}' for i, title in enumerate(titles)])} "
            f"Return ONLY the numbers of the selected titles in the format: 1, 5, 8, 10, 12 "
            f"Do not include any explanations or additional text."
        )

        # Make API call
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )

        # Parse selected title numbers
        content = response.choices[0].message.content.strip()

        # Extract numbers, handling various formats
        selected_indices = []
        for num in re.findall(r'\d+', content):
            idx = int(num) - 1  # Convert to 0-based index
            if 0 <= idx < len(titles):
                selected_indices.append(idx)

        # Ensure we have the correct number of titles
        selected_indices = selected_indices[:self.notes_per_chapter]

        # Get the selected titles
        selected_titles = [titles[idx] for idx in selected_indices]

        # If we don't have enough titles, add more from the original list
        if len(selected_titles) < self.notes_per_chapter:
            for title in titles:
                if title not in selected_titles:
                    selected_titles.append(title)
                    if len(selected_titles) >= self.notes_per_chapter:
                        break

        return selected_titles

    def generate_note_content(self, chapter_text, title, chapter_title):
        """Generate content for a single evergreen note"""
        system_prompt = (
            "You are an expert at creating insightful evergreen notes from academic and non-fiction texts. "
            "Your task is to write the content for an evergreen note based on the provided title and chapter. "
            "Your evergreen note should: "
            "1. Thoroughly explore the specific idea in the title (200-300 words). "
            "2. Include interpretations, critical thinking, and broader implications. "
            "3. Be structured clearly with a logical flow of ideas. "
            "4. Use precise language and concrete examples. "
            "5. Go beyond summarizing - add insight and connections. "
            "6. Maintain academic rigor while being accessible. "
            "7. Look for opportunities to reference related concepts (for interlinking). "
            "The note should function as a standalone 'API' for this knowledge that can be understood without the original text."
        )

        user_prompt = (
            f"Write content for an evergreen note with the following title, based on this chapter: "
            f"CHAPTER: {chapter_title} "
            f"NOTE TITLE: {title} "
            f"CHAPTER CONTENT: {chapter_text} "
            f"Generate a 200-300 word evergreen note that thoroughly explores this idea."
        )

        # Make API call
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )

        # Extract content
        content = response.choices[0].message.content
        return content

    def process_chapter(self, chapter_title, chapter_text):
        """Generate the evergreen notes for a single chapter.

        Steps: generate candidate titles, select the best ones, then write
        the content of one note per selected title.

        Chapters longer than ``max_chunk_chars`` are reduced before being
        sent to the model: titles are generated from an excerpt made of the
        start and the end of the chapter, and each note is written from the
        sections returned by ``extract_relevant_sections``.

        Args:
            chapter_title (str): Chapter heading.
            chapter_text (str): Full chapter text.

        Returns:
            list: One dict per note with the keys "title" and "content".
        """
        print(f"Processing: {chapter_title}")

        oversized = len(chapter_text) > self.max_chunk_chars

        # Handle large chapters by chunking if necessary
        if oversized:
            # For title generation, use the first 1/3 and last 1/3 of the
            # chapter, but never more than fits the chunk budget: two thirds
            # of a very long chapter would still exceed max_chunk_chars.
            separator = "\n...\n"
            budget_per_side = max(self.max_chunk_chars - len(separator), 0) // 2
            side = min(len(chapter_text) // 3, budget_per_side)
            # Explicit start index: a plain [-side:] would return the whole
            # text when side is 0.
            tail = chapter_text[len(chapter_text) - side:]
            title_text = chapter_text[:side] + separator + tail
        else:
            title_text = chapter_text

        # Generate potential titles
        titles = self.generate_titles(title_text, chapter_title)

        # Select best titles
        selected_titles = self.select_best_titles(title_text, titles, chapter_title)

        # Generate content for each selected title
        notes = []

        for title in tqdm(selected_titles, desc="Generating notes", leave=False):
            # For content generation, extract relevant parts of the chapter if it's too large
            if oversized:
                relevant_text = self.extract_relevant_sections(chapter_text, title)
            else:
                relevant_text = chapter_text

            content = self.generate_note_content(relevant_text, title, chapter_title)

            notes.append({
                "title": title,
                "content": content
            })

        return notes

    def extract_relevant_sections(self, chapter_text, title):
        """Extract the most relevant parts of a long chapter for a specific title"""
        # Split chapter into paragraphs
        paragraphs = [p for p in chapter_text.split('\n\n') if p.strip()]

        # Conservative max size (GPT-4o can handle ~100k tokens, but we'll use less)
        max_chars = self.max_chunk_chars // 2

        if len(chapter_text) <= max_chars:
            return chapter_text

        # Extract important keywords from title
        title_words = set(re.findall(r'\b\w{4,}\b', title.lower()))
        # Remove common stop words
        stop_words = {'this', 'that', 'with', 'from', 'have', 'they', 'what', 'when', 'where', 'which', 'their'}
        title_words = {w for w in title_words if w not in stop_words}

        # Score paragraphs based on relevance to title
        scored_paragraphs = []

        for i, para in enumerate(paragraphs):
            score = 0
            para_lower = para.lower()

            # Score based on keyword matches
            for word in title_words:
                # Full word matches
                score += para_lower.count(f' {word} ') * 3
                # Partial matches
                score += para_lower.count(word) * 2

            # Boost score for position in chapter
            # First few paragraphs get a boost for context
            if i < 3:
                score += 5
            # Introduction paragraphs get a boost
            elif i < len(paragraphs) * 0.1:
                score += 3
            # Conclusion paragraphs get a boost
            elif i > len(paragraphs) * 0.9:
                score += 2

            # Length factor (prefer substantial paragraphs)
            if 100 <= len(para) <= 1000:
                score += 1

            scored_paragraphs.append((score, i, para))

        # Sort by score descending
        scored_paragraphs.sort(reverse=True)

        # Get introduction (first few paragraphs)
        intro_size = min(3, len(paragraphs))
        intro = '\n\n'.join(paragraphs[:intro_size])

        # Get top-scoring paragraphs
        selected = []
        current_size = len(intro)

        for score, i, para in scored_paragraphs:
            # Skip paragraphs already in intro
            if i < intro_size:
                continue

            # Stop if we've reached the size limit
            if current_size + len(para) > max_chars:
                break

            # Add paragraph
            selected.append((i, para))
            current_size += len(para)

        # Sort selected paragraphs by original position
        selected.sort()

        # Combine intro with selected paragraphs
        result = intro + '\n\n' + '\n\n'.join(para for _, para in selected)

        # If we have space, add a few more paragraphs for context
        remaining = max_chars - len(result)
        if remaining > 5000:
            additional = []
            for i, para in enumerate(paragraphs):
                # Skip paragraphs we already have
                if i < intro_size or any(idx == i for idx, _ in selected):
                    continue

                if len(para) < remaining:
                    additional.append((i, para))
                    remaining -= len(para)

                    # Stop if we've added enough
                    if remaining < 5000:
                        break

            # Add additional paragraphs in order
            additional.sort()
            for _, para in additional:
                result += '\n\n' + para

        return result

    def save_notes_as_text(self, notes, chapter_title, output_dir):
        """Write a chapter's notes to a plain-text file.

        The file is named "<sanitized chapter title>_notes.txt", where the
        sanitized title has every character other than word characters,
        whitespace and hyphens removed and spaces replaced by underscores.
        ``output_dir`` is created when it does not exist.

        Args:
            notes (list): Dicts with the keys "title" and "content".
            chapter_title (str): Chapter heading.
            output_dir (str): Directory that receives the file.

        Returns:
            str: Path of the written file.
        """
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        file_stem = re.sub(r'[^\w\s-]', '', chapter_title).strip().replace(' ', '_')
        filename = os.path.join(output_dir, f"{file_stem}_notes.txt")

        heavy_rule = "=" * 80 + "\n\n"
        light_rule = "-" * 80 + "\n"

        with open(filename, 'w', encoding='utf-8') as out:
            # File header
            out.write(f"Evergreen Notes for: {chapter_title}\n" + heavy_rule)

            # One titled, ruled-off entry per note
            for number, note in enumerate(notes, 1):
                out.write("".join((
                    f"Note {number}: {note['title']}\n",
                    light_rule,
                    note['content'] + "\n\n",
                    heavy_rule,
                )))

        return filename

    def save_notes_as_json(self, notes, chapter_title, output_dir):
        """Write a chapter's notes to a JSON file.

        The file is named "<sanitized chapter title>_notes.json" and holds
        an object with the keys "chapter" (the original heading) and
        "notes". Each note gets an "id" of the form
        "<sanitized chapter title>_note_<n>", numbered from 1.
        ``output_dir`` is created when it does not exist.

        Args:
            notes (list): Dicts with the keys "title" and "content".
            chapter_title (str): Chapter heading.
            output_dir (str): Directory that receives the file.

        Returns:
            str: Path of the written file.
        """
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        file_stem = re.sub(r'[^\w\s-]', '', chapter_title).strip().replace(' ', '_')
        filename = os.path.join(output_dir, f"{file_stem}_notes.json")

        # Attach a stable, chapter-scoped ID to every note for reference
        payload = {
            "chapter": chapter_title,
            "notes": [
                {
                    "id": f"{file_stem}_note_{number}",
                    "title": note['title'],
                    "content": note['content']
                }
                for number, note in enumerate(notes, 1)
            ]
        }

        with open(filename, 'w', encoding='utf-8') as out:
            json.dump(payload, out, indent=2)

        return filename

    def process_book(self, pdf_path, output_dir, parallel=True):
        """Process an entire book and generate evergreen notes for it.

        Extracts the PDF text, splits it into chapters, generates the notes
        of every chapter and saves them as text and JSON files, then writes
        a combined "all_notes.json" into ``output_dir``. A chapter whose
        processing or saving raises is reported and left out; the remaining
        chapters are still processed.

        Args:
            pdf_path (str): Path of the PDF file.
            output_dir (str): Directory that receives all output files.
            parallel (bool): Process chapters in a thread pool (at most 5
                workers) when there is more than one chapter.

        Returns:
            dict | None: The combined structure written to "all_notes.json"
            (keys "book" and "chapters"), or None when no chapters were
            identified.
        """
        print(f"Extracting text from PDF: {pdf_path}")
        full_text = self.extract_text_from_pdf(pdf_path)

        print("Identifying chapters...")
        chapters = self.extract_chapters(full_text)
        print(f"Found {len(chapters)} chapters.")

        if not chapters:
            print("No chapters identified. Please check the PDF format.")
            return None

        # Create the output directory up front: if every chapter fails, no
        # per-chapter save runs, and writing the combined JSON below would
        # otherwise fail because the directory does not exist.
        os.makedirs(output_dir, exist_ok=True)

        all_notes = {}

        def store_chapter(chapter_title, chapter_notes):
            """Save one chapter's notes to disk and record them."""
            txt_file = self.save_notes_as_text(chapter_notes, chapter_title, output_dir)
            json_file = self.save_notes_as_json(chapter_notes, chapter_title, output_dir)

            print(f"✓ Saved notes for '{chapter_title}' to {txt_file} and {json_file}")
            all_notes[chapter_title] = chapter_notes

        if parallel and len(chapters) > 1:
            # Process chapters in parallel
            with ThreadPoolExecutor(max_workers=min(5, len(chapters))) as executor:
                future_to_chapter = {
                    executor.submit(self.process_chapter, title, text): title
                    for title, text in chapters.items()
                }

                # Results are collected in submission order, so the saved
                # chapters keep the order they have in the book.
                for future in tqdm(future_to_chapter, desc="Processing chapters"):
                    chapter_title = future_to_chapter[future]
                    try:
                        store_chapter(chapter_title, future.result())
                    except Exception as e:
                        # Best effort: one failing chapter must not abort the book
                        print(f"Error processing chapter '{chapter_title}': {e}")
        else:
            # Process chapters sequentially
            for chapter_title, chapter_text in tqdm(chapters.items(), desc="Processing chapters"):
                try:
                    store_chapter(chapter_title, self.process_chapter(chapter_title, chapter_text))
                except Exception as e:
                    # Best effort: one failing chapter must not abort the book
                    print(f"Error processing chapter '{chapter_title}': {e}")

        # Save combined JSON with all notes
        combined_json = os.path.join(output_dir, "all_notes.json")

        # Create a structured representation of all notes
        book_notes = {
            "book": os.path.basename(pdf_path),
            "chapters": []
        }

        for chapter_title, chapter_notes in all_notes.items():
            # Same sanitization as the per-chapter files, so IDs match
            safe_chapter_title = re.sub(r'[^\w\s-]', '', chapter_title).strip().replace(' ', '_')
            book_notes["chapters"].append({
                "title": chapter_title,
                "id": safe_chapter_title,
                "notes": [
                    {
                        "id": f"{safe_chapter_title}_note_{i}",
                        "title": note['title'],
                        "content": note['content']
                    }
                    for i, note in enumerate(chapter_notes, 1)
                ]
            })

        with open(combined_json, 'w', encoding='utf-8') as f:
            json.dump(book_notes, f, indent=2)

        print(f"✓ Combined notes saved to {combined_json}")

        return book_notes


# Example usage in Google Colab
def run_in_colab(pdf_file_path, api_key, notes_per_chapter=5):
    """
    Run the evergreen notes generator in Google Colab

    Generates the notes into "./evergreen_notes_output" and packs that
    directory into "evergreen_notes.zip" in the current working directory.

    Args:
        pdf_file_path (str): Path to the PDF file
        api_key (str): OpenAI API key
        notes_per_chapter (int): Number of notes to generate per chapter
    """
    # Local import: only this function needs it.
    import shutil

    output_dir = "./evergreen_notes_output"

    # Initialize the generator
    generator = EvergreenNotesGenerator(
        api_key=api_key,
        model="gpt-4o",
        notes_per_chapter=notes_per_chapter
    )

    # Process the book
    generator.process_book(pdf_file_path, output_dir)

    # Nothing is written when no chapters were identified; do not report
    # success for an archive that was never created.
    if not os.path.isdir(output_dir):
        print("No notes were generated; nothing to archive.")
        return

    # Zip the output directory for easy download. shutil is used instead of
    # shelling out to `zip`, so no external tool is required and a failure
    # raises instead of being silently ignored. The archive keeps the
    # directory name as the top-level folder.
    shutil.make_archive(
        "evergreen_notes",
        "zip",
        root_dir=".",
        base_dir=os.path.normpath(output_dir)
    )

    print("Processing complete! You can now download the evergreen_notes.zip file.")

    # For Google Colab, you would use:
    # from google.colab import files
    # files.download("evergreen_notes.zip")

# Colab entry point: upload a PDF, ask for the API key, run the generator.
from getpass import getpass

from google.colab import files

# Upload PDF file
print("Please upload your PDF book:")
uploaded = files.upload()

if not uploaded:
    # Presumably the upload dialog was cancelled; there is nothing to process.
    print("No file was uploaded; nothing to process.")
else:
    # Use the first uploaded file
    pdf_filename = next(iter(uploaded))

    # Get OpenAI API key; getpass keeps the secret out of the cell output
    api_key = getpass("Enter your OpenAI API key: ")

    # Run the generator
    run_in_colab(pdf_filename, api_key, notes_per_chapter=5)

ModuleNotFoundError: No module named 'PyPDF2'

In [None]:
import os
import re
import json
import time
import PyPDF2
import openai
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import random

class EvergreenNotesGenerator:
    def __init__(self, api_key=None, model_config=None, notes_per_chapter=5):
        """
        Set up the OpenAI client, the per-task models and the size limits.

        Args:
            api_key (str): OpenAI API key. When falsy, the client is created
                without an explicit key.
            model_config (dict): Optional overrides for the per-task models;
                its entries replace the defaults with the same key.
            notes_per_chapter (int): Number of notes to generate per chapter
        """
        self.client = openai.OpenAI(api_key=api_key) if api_key else openai.OpenAI()

        self.notes_per_chapter = notes_per_chapter

        # Default model per task: the mini model for title generation and
        # selection, the full model for writing note content.
        self.model_config = {
            'title_generation': 'gpt-4o-mini',
            'title_selection': 'gpt-4o-mini',
            'content_generation': 'gpt-4o'
        }

        # Caller-supplied entries take precedence over the defaults
        if model_config:
            self.model_config.update(model_config)

        # Context size limits, in characters, per model
        self.max_chunk_chars = 300000  # For GPT-4o (~75k tokens)
        self.mini_max_chunk_chars = 200000  # For GPT-4o-mini (smaller to be safe)

    def rate_limited_api_call(self, messages, model, max_retries=8, initial_wait=2):
        """Make an API call with automatic retry on rate limit errors"""
        retry_count = 0
        wait_time = initial_wait

        while retry_count < max_retries:
            try:
                return self.client.chat.completions.create(
                    model=model,
                    messages=messages
                )
            except Exception as e:
                error_str = str(e)
                if "rate_limit_exceeded" in error_str:
                    # Extract wait time from error message if possible
                    wait_match = re.search(r'try again in (\d+\.\d+)s', error_str)
                    if wait_match:
                        wait_time = float(wait_match.group(1)) * 1.5  # Add a 50% buffer

                    # Add some jitter to avoid thundering herd problem
                    jitter = random.uniform(0.5, 1.5)
                    adjusted_wait = wait_time * jitter

                    print(f"Rate limit hit for {model}. Waiting {adjusted_wait:.2f}s before retry {retry_count+1}/{max_retries}")
                    time.sleep(adjusted_wait)
                    wait_time *= 1.5  # Increase wait time for next attempt
                    retry_count += 1
                else:
                    # Non-rate-limit error, re-raise
                    raise

        # If we get here, we've exhausted our retries
        raise Exception(f"Failed after {max_retries} attempts due to rate limiting")

    def extract_text_from_pdf(self, pdf_path):
        """Extract the full text of a PDF file, page by page.

        Pages that yield no text are skipped. A page whose extraction raises
        is reported with a warning and skipped as well.

        Args:
            pdf_path (str): Path of the PDF file to read.

        Returns:
            str: The text of every extracted page, each followed by a
            newline.
        """
        page_chunks = []
        with open(pdf_path, 'rb') as pdf_handle:
            for page in PyPDF2.PdfReader(pdf_handle).pages:
                try:
                    page_text = page.extract_text()
                    if page_text:
                        page_chunks.append(page_text + "\n")
                except Exception as e:
                    # Best effort: one unreadable page must not abort the book
                    print(f"Warning: Could not extract text from a page: {e}")
        return "".join(page_chunks)

    def extract_chapters(self, full_text):
        """
        Advanced chapter extraction from text content
        Uses multiple methods to identify chapter boundaries
        """
        # Keep original text for better chapter detection
        original_text = full_text

        # Clean up text version for pattern matching
        clean_text = re.sub(r'\s+', ' ', full_text)
        clean_text = re.sub(r'(\w)- (\w)', r'\1\2', clean_text)  # Fix hyphenated words

        # Try to find a table of contents
        print("Looking for table of contents...")
        toc_patterns = [
            r'(?:TABLE\s+OF\s+CONTENTS|CONTENTS).*?\n(.*?)(?:\n\s*\n|\n(?:CHAPTER|INTRODUCTION))',
            r'(?:CONTENTS|INDEX).*?\n(.*?)(?:\n\s*\n)'
        ]

        toc_entries = []
        for pattern in toc_patterns:
            try:
                toc_match = re.search(pattern, original_text, re.DOTALL | re.IGNORECASE)
                if toc_match:
                    toc_text = toc_match.group(1)
                    # Extract chapter titles and page numbers from TOC
                    toc_entries = re.findall(r'((?:chapter|part|section)\s+[\dIVXLC]+[:\.\s]+[^0-9\n]+)',
                                           toc_text, re.IGNORECASE)
                    if toc_entries:
                        print(f"Found {len(toc_entries)} entries in table of contents")
                        break
            except Exception as e:
                print(f"Error processing TOC pattern: {e}")

        # For your book (religious text), look for specific patterns
        print("Searching for chapter patterns...")
        chapter_patterns = [
            # Standard chapter headings
            r'(?:^|\n)(?:CHAPTER|CH\.|PART|SECTION)\s+[\dIVXLC]+(?:[:\.\s]+[^\n]+)?(?=\n)',
            # Numbered sections (like "1. Introduction")
            r'(?:^|\n)(?:\d+\.?\s+)(?:[A-Z][^\n]{5,})(?=\n)',
            # Roman numeral sections
            r'(?:^|\n)(?:[IVX]+\.?\s+)(?:[A-Z][^\n]{5,})(?=\n)',
            # Try patterns specific to religious/spiritual books
            r'(?:^|\n)(?:Prayer|Meditation|Reflection)(?:\s+[\dIVXLC]+)?(?:[:\.\s]+[^\n]+)?(?=\n)',
            # Try bold/emphasized headings (hard to detect without formatting info)
            r'(?:^|\n)([A-Z][A-Z\s]{3,}[A-Z])(?=\n)',
            # Try detecting based on line breaks and capitalization
            r'(?:\n\n)([A-Z][^a-z\n]{0,10}[A-Za-z][^\n]{3,60})(?:\n\n)'
        ]

        # Add patterns from TOC entries
        if toc_entries:
            for entry in toc_entries:
                entry_clean = re.sub(r'\s+', ' ', entry.strip())
                # Convert to regex pattern, escaping special characters
                entry_pattern = re.escape(entry_clean)
                # Make it more flexible with whitespace
                entry_pattern = re.sub(r'\\\s+', r'\\s+', entry_pattern)
                chapter_patterns.insert(0, f'(?:^|\n)(?:{entry_pattern})(?=\n)')

        # For your specific book, add custom pattern
        # This seems to be working from your previous run
        chapter_patterns.insert(0, r'(?:^|\n)(\d+\.\s+[A-Z][^\n]{5,})(?=\n)')

        # Find all potential chapter headings
        all_headings = []
        for pattern in chapter_patterns:
            try:
                headings = re.finditer(pattern, original_text, re.MULTILINE | re.IGNORECASE)
                matches = [(match.start(), match.group().strip()) for match in headings]
                if matches:
                    print(f"Pattern '{pattern}' found {len(matches)} matches")
                all_headings.extend(matches)
            except Exception as e:
                print(f"Warning: Error with pattern {pattern}: {e}")

        # Sort headings by position in text and remove duplicates
        all_headings.sort()
        unique_headings = []
        for pos, heading in all_headings:
            if not unique_headings or pos > unique_headings[-1][0] + 50:  # Allow closer headings
                unique_headings.append((pos, heading))

        print(f"Found {len(unique_headings)} potential chapter headings")

        # Extract chapter content
        chapters = {}

        # REMOVE the check for too many chapters - some books have many small chapters

        for i, (pos, heading) in enumerate(unique_headings):
            start_pos = pos + len(heading)

            # If not the last chapter, end at the next chapter
            if i < len(unique_headings) - 1:
                end_pos = unique_headings[i + 1][0]
            else:
                end_pos = len(original_text)

            chapter_content = original_text[start_pos:end_pos].strip()

            # Reduced minimum size to catch smaller chapters
            if len(chapter_content) < 500:
                continue

            # Clean up chapter title
            clean_heading = re.sub(r'\s+', ' ', heading).strip()
            # Limit chapter title length
            if len(clean_heading) > 100:
                clean_heading = clean_heading[:97] + "..."

            chapters[clean_heading] = chapter_content

        # Fallback 1: Try detecting chapters based on page numbers or markers
        if not chapters:
            print("Trying to detect chapters based on page breaks...")
            page_break_patterns = [
                r'\n\s*\d+\s*\n',  # Page numbers
                r'\n\s*\*\s*\*\s*\*\s*\n',  # Asterisk dividers
                r'\n\s*-{3,}\s*\n'  # Dash dividers
            ]

            for pattern in page_break_patterns:
                sections = re.split(pattern, original_text)
                if len(sections) > 5:  # If we found reasonable number of sections
                    print(f"Found {len(sections)} sections using page break pattern")
                    for i, section in enumerate(sections, 1):
                        if len(section.strip()) > 500:
                            chapters[f"Section {i}"] = section.strip()
                    break

        # Fallback 2: If still no chapters, try simpler patterns
        if not chapters:
            print("Using simplified chapter detection method...")
            simple_chapters = re.split(r'\n(?:CHAPTER|PART|SECTION)\s+[\dIVXLC]+\s*[:\.]?', original_text)
            if len(simple_chapters) > 1:
                for i, chapter in enumerate(simple_chapters[1:], 1):
                    if len(chapter.strip()) > 500:  # Lower minimum size
                        chapters[f"Chapter {i}"] = chapter.strip()

        # Fallback 3: Split by double line breaks to find potential sections
        if not chapters:
            print("Trying to split by paragraph breaks...")
            sections = re.split(r'\n\s*\n\s*\n', original_text)  # Try to find major paragraph breaks
            if len(sections) >= 5:  # If we have at least 5 sections
                # Group sections into chapters (approximately 5-7 sections per chapter)
                section_per_chapter = max(3, len(sections) // 10)
                for i in range(0, len(sections), section_per_chapter):
                    end_idx = min(i + section_per_chapter, len(sections))
                    chapter_text = "\n\n".join(sections[i:end_idx])
                    if len(chapter_text.strip()) > 500:
                        chapters[f"Part {i//section_per_chapter + 1}"] = chapter_text

        # Last resort: split by length into even sections
        if not chapters:
            print("No chapters detected. Creating equal-length sections...")
            # Split into roughly equal sections
            text_length = len(original_text)
            # Adjusted to create more sections for a spiritual/religious book
            section_length = max(5000, text_length // 20)

            for i in range(0, text_length, section_length):
                section_text = original_text[i:i + section_length].strip()
                if len(section_text) > 500:  # Lower minimum size
                    chapters[f"Section {i//section_length + 1}"] = section_text

        print(f"Successfully extracted {len(chapters)} chapters/sections")
        return chapters

    def generate_titles(self, chapter_text, chapter_title):
        """Ask the title-generation model for candidate evergreen note titles.

        Args:
            chapter_text (str): Chapter body. When longer than
                ``self.mini_max_chunk_chars`` only its head and tail are sent,
                joined by a truncation marker.
            chapter_title (str): Chapter heading, included in the prompt.

        Returns:
            list[str]: The non-empty lines of the model reply, each with any
            leading ``N.`` / ``N)`` numbering removed.
        """
        # Model configured for the title-generation step
        model = self.model_config['title_generation']

        # Instructions describing what a good evergreen note title looks like
        system_prompt = "".join((
            "You are an expert at creating insightful evergreen notes from academic and non-fiction texts. ",
            "Your task is to generate potential titles for evergreen notes based on the chapter provided. ",
            "Evergreen note titles should: ",
            "1. Be complete sentences that express a clear, specific claim or insight. ",
            "2. Convey enough detail to understand the core idea without additional context. ",
            "3. Use clear, precise language and active verbs. ",
            "4. Focus on the most important and insightful ideas from the text. ",
            "5. Be framed positively and constructively. ",
            "6. Avoid vague, general statements - be specific and concrete. ",
            "Generate titles that capture the most significant concepts, arguments, frameworks, and insights from the chapter.",
        ))

        # Oversized chapters are reduced to their opening and closing portions
        limit = self.mini_max_chunk_chars
        if len(chapter_text) > limit:
            half = limit // 2 - 50
            head, tail = chapter_text[:half], chapter_text[-half:]
            chapter_text = f"{head}\n[...TEXT TRUNCATED...]\n{tail}"

        # Twice as many candidates as notes are requested
        requested = self.notes_per_chapter * 2
        user_prompt = " ".join((
            f"Generate {requested} potential evergreen note titles based on this chapter:",
            f"CHAPTER TITLE: {chapter_title}",
            f"CHAPTER CONTENT: {chapter_text}",
            "Return ONLY the titles, one per line, numbered.",
        ))

        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        response = self.rate_limited_api_call(messages=conversation, model=model)
        reply_text = response.choices[0].message.content

        # One title per non-blank line; drop "1." / "1)" style prefixes
        numbering = re.compile(r'^\d+[\.\)]\s*')
        stripped_lines = (raw.strip() for raw in reply_text.split('\n'))
        unnumbered = (numbering.sub('', line) for line in stripped_lines if line)
        return [title for title in unnumbered if title]

    def select_best_titles(self, chapter_text, titles, chapter_title):
        """Ask the title-selection model to pick the best candidate titles.

        The model is asked to answer with the 1-based numbers of the chosen
        candidates. Every integer found in the reply is mapped back to
        ``titles``; out-of-range numbers are ignored and a title is never
        selected twice, even if the model repeats a number. If fewer than
        ``self.notes_per_chapter`` titles result, the remaining slots are
        filled with not-yet-selected candidates in their original order.

        Args:
            chapter_text (str): Accepted for interface compatibility; this
                method does not use it.
            titles (list[str]): Candidate titles to choose from.
            chapter_title (str): Chapter heading, included in the prompt.

        Returns:
            list[str]: At most ``self.notes_per_chapter`` distinct titles
            (fewer only when ``titles`` has fewer distinct entries).
        """
        # Model configured for the title-selection step
        model = self.model_config['title_selection']

        system_prompt = (
            "You are an expert at creating insightful evergreen notes from academic and non-fiction texts. "
            "Your task is to select the most valuable and insightful evergreen note titles from a list of candidates. "
            "The best evergreen note titles should: "
            "1. Capture significant insights or claims from the text. "
            "2. Be specific, clear, and well-formulated. "
            "3. Focus on the most important concepts in the chapter. "
            "4. Cover diverse aspects of the chapter content (avoid redundancy). "
            "5. Prioritize non-obvious, thought-provoking ideas."
        )

        # Present the candidates as a 1-based numbered list
        formatted_titles = "\n".join(f"{i}. {title}" for i, title in enumerate(titles, 1))

        user_prompt = (
            f"From the following list of potential evergreen note titles for the chapter '{chapter_title}', "
            f"select exactly {self.notes_per_chapter} titles that best meet the criteria in my instructions. "
            f"Candidate titles:\n{formatted_titles}\n\n"
            f"Return ONLY the numbers of the selected titles in the format: 1, 5, 8, 10, 12 "
            f"Do not include any explanations or additional text."
        )

        # Make API call with rate limiting
        response = self.rate_limited_api_call(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            model=model
        )

        # A reply without text content is treated as "nothing selected" so the
        # fallback below still produces a result instead of raising.
        content = (response.choices[0].message.content or "").strip()

        # Map every number in the reply back to a title, skipping invalid
        # indices and titles that were already picked (the model may repeat
        # itself, which previously produced duplicate notes).
        selected_titles = []
        for num in re.findall(r'\d+', content):
            idx = int(num) - 1  # Convert to 0-based index
            if not 0 <= idx < len(titles):
                continue
            if titles[idx] not in selected_titles:
                selected_titles.append(titles[idx])

        # Never return more than requested
        selected_titles = selected_titles[:self.notes_per_chapter]

        # If we don't have enough titles, add more from the original list
        if len(selected_titles) < self.notes_per_chapter:
            for title in titles:
                if title not in selected_titles:
                    selected_titles.append(title)
                    if len(selected_titles) >= self.notes_per_chapter:
                        break

        return selected_titles

    def generate_note_content(self, chapter_text, title, chapter_title):
        """Write the body of one evergreen note with the content-generation model.

        Args:
            chapter_text (str): Source text for the note. When it is longer than
                ``self.max_chunk_chars`` it is first reduced with
                ``self.extract_relevant_sections``.
            title (str): Title of the note to write.
            chapter_title (str): Chapter heading, included in the prompt.

        Returns:
            The text content of the first choice in the model response.
        """
        # Model configured for the content-generation step
        model = self.model_config['content_generation']

        system_prompt = "".join((
            "You are an expert at creating insightful evergreen notes from academic and non-fiction texts. ",
            "Your task is to write the content for an evergreen note based on the provided title and chapter. ",
            "Your evergreen note should: ",
            "1. Thoroughly explore the specific idea in the title (200-300 words). ",
            "2. Include interpretations, critical thinking, and broader implications. ",
            "3. Be structured clearly with a logical flow of ideas. ",
            "4. Use precise language and concrete examples. ",
            "5. Go beyond summarizing - add insight and connections. ",
            "6. Maintain academic rigor while being accessible. ",
            "7. Look for opportunities to reference related concepts (for interlinking). ",
            "The note should function as a standalone 'API' for this knowledge that can be understood without the original text.",
        ))

        # Only oversized chapters are narrowed down to title-relevant passages
        relevant_text = (
            self.extract_relevant_sections(chapter_text, title)
            if len(chapter_text) > self.max_chunk_chars
            else chapter_text
        )

        user_prompt = " ".join((
            "Write content for an evergreen note with the following title, based on this chapter:",
            f"CHAPTER: {chapter_title}",
            f"NOTE TITLE: {title}",
            f"CHAPTER CONTENT: {relevant_text}",
            "Generate a 200-300 word evergreen note that thoroughly explores this idea.",
        ))

        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        response = self.rate_limited_api_call(messages=conversation, model=model)

        return response.choices[0].message.content

    def process_chapter(self, chapter_title, chapter_text):
        """Turn one chapter into a list of evergreen notes.

        Steps: generate candidate titles, select the best ones, then write one
        note per selected title, pausing between API-backed steps.

        Args:
            chapter_title (str): Chapter heading.
            chapter_text (str): Full chapter text.

        Returns:
            list[dict]: One ``{"title": ..., "content": ...}`` dict per note.

        Raises:
            Exception: Any error from the steps above is printed and re-raised.
        """
        try:
            print(f"Processing: {chapter_title}")

            # Title work on very long chapters uses only the first and last thirds
            title_text = chapter_text
            if len(chapter_text) > self.mini_max_chunk_chars:
                cut = len(chapter_text) // 3
                title_text = "\n...\n".join((chapter_text[:cut], chapter_text[-cut:]))

            candidates = self.generate_titles(title_text, chapter_title)
            time.sleep(1)  # pause after title generation to avoid rate limits

            chosen = self.select_best_titles(title_text, candidates, chapter_title)
            time.sleep(1)  # pause after title selection to avoid rate limits

            notes = []
            for title in tqdm(chosen, desc="Generating notes", leave=False):
                # Oversized chapters are narrowed to the passages relevant to this title
                if len(chapter_text) > self.max_chunk_chars:
                    source_text = self.extract_relevant_sections(chapter_text, title)
                else:
                    source_text = chapter_text

                body = self.generate_note_content(source_text, title, chapter_title)
                notes.append({"title": title, "content": body})

                time.sleep(2)  # pause between notes to avoid rate limits

            return notes

        except Exception as e:
            print(f"Error in process_chapter for '{chapter_title}': {str(e)}")
            raise

    def extract_relevant_sections(self, chapter_text, title):
        """Condense a long chapter to the paragraphs most relevant to a title.

        The character budget is half of ``self.max_chunk_chars``; text within
        the budget is returned unchanged. Otherwise the first three paragraphs
        are always kept, and further paragraphs are added in descending
        relevance order until one would exceed the budget. Kept paragraphs are
        emitted in their original order.

        Args:
            chapter_text (str): Chapter text; paragraphs are separated by blank lines.
            title (str): Note title whose 4+ letter words act as keywords.

        Returns:
            str: ``chapter_text`` itself, or the intro followed by the chosen
            paragraphs, joined with blank lines.
        """
        budget = self.max_chunk_chars // 2
        if len(chapter_text) <= budget:
            return chapter_text

        # Non-blank paragraphs, in document order
        paragraphs = [block for block in chapter_text.split('\n\n') if block.strip()]
        total = len(paragraphs)

        # Keywords: words of four or more characters from the title, minus stop words
        ignored = {'this', 'that', 'with', 'from', 'have', 'they', 'what', 'when', 'where', 'which', 'their'}
        keywords = set(re.findall(r'\b\w{4,}\b', title.lower())) - ignored

        ranked = []
        for index, paragraph in enumerate(paragraphs):
            lowered = paragraph.lower()

            # Space-delimited hits weigh 3, any substring hit weighs 2
            relevance = sum(
                lowered.count(f' {kw} ') * 3 + lowered.count(kw) * 2
                for kw in keywords
            )

            # Positional bonus: opening paragraphs, then first 10%, then last 10%
            if index < 3:
                relevance += 5
            elif index < total * 0.1:
                relevance += 3
            elif index > total * 0.9:
                relevance += 2

            # Small bonus for mid-sized paragraphs
            if 100 <= len(paragraph) <= 1000:
                relevance += 1

            ranked.append((relevance, index, paragraph))

        # Highest score first
        ranked.sort(reverse=True)

        # The opening paragraphs always lead the result
        lead_count = min(3, total)
        intro = '\n\n'.join(paragraphs[:lead_count])

        picked = []
        used = len(intro)
        for _, index, paragraph in ranked:
            if index < lead_count:
                continue  # already part of the intro
            if used + len(paragraph) > budget:
                break  # budget reached; stop adding
            picked.append((index, paragraph))
            used += len(paragraph)

        # Restore document order before joining
        picked.sort()
        return intro + '\n\n' + '\n\n'.join(paragraph for _, paragraph in picked)

    def save_notes_as_text(self, notes, chapter_title, output_dir):
        """Write a chapter's notes to a plain-text file.

        Args:
            notes (list[dict]): Notes with ``'title'`` and ``'content'`` entries.
            chapter_title (str): Chapter heading; also the basis of the file name
                (punctuation removed, spaces replaced by underscores).
            output_dir (str): Target directory, created when it does not exist.

        Returns:
            str: Path of the written ``<chapter>_notes.txt`` file.
        """
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        stem = re.sub(r'[^\w\s-]', '', chapter_title).strip().replace(' ', '_')
        filename = os.path.join(output_dir, f"{stem}_notes.txt")

        heavy_rule = "=" * 80 + "\n\n"
        light_rule = "-" * 80 + "\n"

        with open(filename, 'w', encoding='utf-8') as out:
            # File header
            out.write(f"Evergreen Notes for: {chapter_title}\n" + heavy_rule)

            # One numbered entry per note: title line, rule, body, closing rule
            for number, note in enumerate(notes, 1):
                out.write(f"Note {number}: {note['title']}\n" + light_rule)
                out.write(note['content'] + "\n\n" + heavy_rule)

        return filename

    def save_notes_as_json(self, notes, chapter_title, output_dir):
        """Write a chapter's notes to a JSON file.

        Args:
            notes (list[dict]): Notes with ``'title'`` and ``'content'`` entries.
            chapter_title (str): Chapter heading; also the basis of the file name
                and of the note ids.
            output_dir (str): Target directory, created when it does not exist.

        Returns:
            str: Path of the written ``<chapter>_notes.json`` file.
        """
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        stem = re.sub(r'[^\w\s-]', '', chapter_title).strip().replace(' ', '_')
        filename = os.path.join(output_dir, f"{stem}_notes.json")

        # Each note gets a 1-based id of the form "<chapter>_note_<n>"
        payload = {
            "chapter": chapter_title,
            "notes": [
                {
                    "id": f"{stem}_note_{number}",
                    "title": note['title'],
                    "content": note['content'],
                }
                for number, note in enumerate(notes, 1)
            ],
        }

        with open(filename, 'w', encoding='utf-8') as out:
            json.dump(payload, out, indent=2)

        return filename

    def process_book(self, pdf_path, output_dir, parallel=False):
        """Generate evergreen notes for every chapter of a PDF and save them.

        Each chapter is processed with ``process_chapter`` and saved as a text
        and a JSON file in ``output_dir``. Chapters that raise are retried once
        after a 60 second wait. Finally a combined ``all_notes.json`` covering
        every successfully processed chapter is written to ``output_dir``.

        Args:
            pdf_path (str): Path to the PDF book.
            output_dir (str): Directory for the per-chapter files and the
                combined JSON; created if missing.
            parallel (bool): When true and there is more than one chapter,
                chapters are processed by a pool of at most two worker threads.

        Returns:
            dict | None: ``{"book": <file name>, "chapters": [...]}`` with the
            chapters in book order, or ``None`` when no chapters were found.
        """
        print(f"Extracting text from PDF: {pdf_path}")
        full_text = self.extract_text_from_pdf(pdf_path)

        print("Identifying chapters...")
        chapters = self.extract_chapters(full_text)
        print(f"Found {len(chapters)} chapters.")

        if not chapters:
            print("No chapters identified. Please check the PDF format.")
            return None

        # Process each chapter
        all_notes = {}
        failed_chapters = []

        def save_chapter(chapter_title, chapter_notes):
            # Persist one chapter in both formats, then record it as done.
            txt_file = self.save_notes_as_text(chapter_notes, chapter_title, output_dir)
            json_file = self.save_notes_as_json(chapter_notes, chapter_title, output_dir)
            all_notes[chapter_title] = chapter_notes
            return txt_file, json_file

        if parallel and len(chapters) > 1:
            # Process chapters in parallel with fewer workers to avoid rate limits
            with ThreadPoolExecutor(max_workers=min(2, len(chapters))) as executor:
                future_to_chapter = {
                    executor.submit(self.process_chapter, title, text): title
                    for title, text in chapters.items()
                }

                # Results are collected in submission (book) order
                for future in tqdm(future_to_chapter, desc="Processing chapters"):
                    chapter_title = future_to_chapter[future]
                    try:
                        chapter_notes = future.result()
                        txt_file, json_file = save_chapter(chapter_title, chapter_notes)
                        print(f"✓ Saved notes for '{chapter_title}' to {txt_file} and {json_file}")

                        # Add delay between chapters
                        time.sleep(3)

                    except Exception as e:
                        print(f"Error processing chapter '{chapter_title}': {e}")
                        failed_chapters.append((chapter_title, chapters[chapter_title]))
        else:
            # Process chapters sequentially
            for chapter_title, chapter_text in tqdm(chapters.items(), desc="Processing chapters"):
                try:
                    chapter_notes = self.process_chapter(chapter_title, chapter_text)
                    txt_file, json_file = save_chapter(chapter_title, chapter_notes)
                    print(f"✓ Saved notes for '{chapter_title}' to {txt_file} and {json_file}")

                    # Add delay between chapters to avoid rate limits
                    time.sleep(3)

                except Exception as e:
                    print(f"Error processing chapter '{chapter_title}': {e}")
                    failed_chapters.append((chapter_title, chapter_text))

        # Retry failed chapters after a longer delay
        if failed_chapters:
            print(f"\nRetrying {len(failed_chapters)} failed chapters after a delay...")
            time.sleep(60)  # Wait a full minute before retries

            for chapter_title, chapter_text in tqdm(failed_chapters, desc="Retrying failed chapters"):
                try:
                    print(f"Retrying: {chapter_title}")
                    chapter_notes = self.process_chapter(chapter_title, chapter_text)
                    save_chapter(chapter_title, chapter_notes)
                    print(f"✓ Successfully saved notes for '{chapter_title}' on retry")

                    # Longer delay between retries
                    time.sleep(10)

                except Exception as e:
                    print(f"Still failed to process chapter '{chapter_title}': {e}")

        # The directory is otherwise only created by the per-chapter save
        # methods, so it may not exist yet if every chapter failed.
        os.makedirs(output_dir, exist_ok=True)
        combined_json = os.path.join(output_dir, "all_notes.json")

        # Create a structured representation of all notes
        book_notes = {
            "book": os.path.basename(pdf_path),
            "chapters": []
        }

        # Walk the chapters in book order so chapters that only succeeded on
        # retry are not pushed to the end of the combined file.
        for chapter_title in chapters:
            if chapter_title not in all_notes:
                continue
            chapter_notes = all_notes[chapter_title]

            safe_chapter_title = re.sub(r'[^\w\s-]', '', chapter_title).strip().replace(' ', '_')
            chapter_data = {
                "title": chapter_title,
                "id": safe_chapter_title,
                "notes": [
                    {
                        "id": f"{safe_chapter_title}_note_{i}",
                        "title": note['title'],
                        "content": note['content']
                    }
                    for i, note in enumerate(chapter_notes, 1)
                ]
            }
            book_notes["chapters"].append(chapter_data)

        with open(combined_json, 'w', encoding='utf-8') as f:
            json.dump(book_notes, f, indent=2)

        print(f"✓ Combined notes saved to {combined_json}")

        return book_notes


# Example usage in Google Colab
def run_in_colab(pdf_file_path, api_key, notes_per_chapter=5, use_mixed_models=True):
    """
    Generate evergreen notes for a PDF and zip the results (Google Colab helper)

    Args:
        pdf_file_path (str): Path to the PDF file
        api_key (str): OpenAI API key
        notes_per_chapter (int): Number of notes to generate per chapter
        use_mixed_models (bool): When true, title generation and selection use
            gpt-4o-mini; content generation always uses gpt-4o
    """
    output_dir = "./evergreen_notes_output"

    # Titles may use the cheaper model; note content always uses the full model
    title_model = 'gpt-4o-mini' if use_mixed_models else 'gpt-4o'
    model_config = {
        'title_generation': title_model,
        'title_selection': title_model,
        'content_generation': 'gpt-4o',
    }

    # Build the generator and run it over the whole book, one chapter at a time
    generator = EvergreenNotesGenerator(
        api_key=api_key,
        model_config=model_config,
        notes_per_chapter=notes_per_chapter,
    )
    generator.process_book(pdf_file_path, output_dir, parallel=False)

    # Bundle the output directory into a single archive for download
    os.system(f"zip -r evergreen_notes.zip {output_dir}")

In [None]:
# Imports come first: files.upload() below needs `files`, the pause before the
# download needs `time`, and getpass keeps the API key out of the cell output.
import time
from getpass import getpass

from google.colab import files

# Upload PDF file
print("Please upload your PDF book:")
uploaded = files.upload()
pdf_filename = list(uploaded.keys())[0]

# Get OpenAI API key (not echoed, so it is not stored with the notebook output)
api_key = getpass("Enter your OpenAI API key: ").strip()

# Set number of notes per chapter (blank input falls back to 3)
notes_per_chapter = int(input("How many notes per chapter? (default: 3): ").strip() or "3")

# Ask if user wants to use mixed models (anything other than "n" means yes)
use_mixed = input("Use GPT-4o-mini for title generation to save costs? (y/n, default: y): ").strip().lower() != 'n'

# Run the generator
print(f"Starting to process {pdf_filename} with {notes_per_chapter} notes per chapter...")
run_in_colab(pdf_filename, api_key, notes_per_chapter=notes_per_chapter, use_mixed_models=use_mixed)

# Create and download the zip file
print("\nProcessing complete! Preparing download...")
time.sleep(3)
files.download("evergreen_notes.zip")

# Add a viewer for browsing notes in the notebook
def view_notes():
    """Render the combined notes file as a simple HTML page in the notebook.

    Reads ``./evergreen_notes_output/all_notes.json`` and displays one bordered
    box per chapter containing that chapter's notes. When the file does not
    exist a message is printed and nothing is displayed.

    Book, chapter and note text is HTML-escaped before being embedded, so
    characters such as ``<`` or ``&`` in the notes are shown literally instead
    of being interpreted as markup.
    """
    import html
    import json

    notes_path = "./evergreen_notes_output/all_notes.json"
    if not os.path.exists(notes_path):
        print("Notes haven't been generated yet or couldn't be found.")
        return

    # Only needed once there is something to display
    from IPython.display import display, HTML

    # Read as UTF-8 explicitly rather than relying on the platform default
    with open(notes_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    def esc(value):
        # Escape any value for safe embedding in HTML text
        return html.escape(str(value))

    # Create a simple HTML viewer
    parts = [
        f"<h1>Notes for {esc(data['book'])}</h1>",
        '<div style="max-width:800px">',
    ]

    for chapter in data['chapters']:
        parts.append('<div style="margin-top:20px; border:1px solid #ccc; padding:10px; border-radius:5px">')
        parts.append(f"<h2>{esc(chapter['title'])}</h2>")

        for note in chapter['notes']:
            parts.append('<div style="margin:10px 0; background:#f8f8f8; padding:10px; border-radius:3px">')
            parts.append(f"<h3>{esc(note['title'])}</h3>")
            # pre-wrap keeps the note's own line breaks
            parts.append(f'<div style="white-space:pre-wrap">{esc(note["content"])}</div>')
            parts.append("</div>")

        parts.append("</div>")

    parts.append("</div>")
    display(HTML("\n".join(parts)))

# Offer to render the generated notes inline; only an exact "y"/"Y" accepts
print("\nWould you like to view the generated notes in the notebook?")
show_notes = input("View notes now? (y/n): ").lower() == 'y'
if show_notes:
    view_notes()

Please upload your PDF book:


Saving Jacques Philippe_ Helena Scott - Time for God_ A Guide to Mental Prayer (2008).pdf to Jacques Philippe_ Helena Scott - Time for God_ A Guide to Mental Prayer (2008) (2).pdf
Enter your OpenAI API key: [REDACTED - secret key removed from saved output; revoke and rotate this key]
How many notes per chapter? (default: 3): 3
Use GPT-4o-mini for title generation to save costs? (y/n, default: y): y
Starting to process Jacques Philippe_ Helena Scott - Time for God_ A Guide to Mental Prayer (2008) (2).pdf with 3 notes per chapter...
Extracting text from PDF: Jacques Philippe_ Helena Scott - Time for God_ A Guide to Mental Prayer (2008) (2).pdf
Identifying chapters...
Looking for table of contents...
Searching for chapter patterns...
Pattern '(?:^|\n)(\d+\.\s+[A-Z][^\n]{5,})(?=\n)' found 73 matches
Pattern '(?:^|\n)(?:CHAPTER|CH\.|PART|SECTION)\s+[\dIVXLC]+(?:[:\.\s]+[^\n]+)?(?=\n)' found 4 matches
Patt

Processing chapters:   0%|          | 0/95 [00:00<?, ?it/s]

Processing: Scripture texts from the New and Old Testaments are taken from The Holy Bible Revised Standard



Generating notes:   0%|          | 0/3 [00:00<?, ?it/s][A
Processing chapters:   0%|          | 0/95 [00:07<?, ?it/s]


KeyboardInterrupt: 

In [None]:
import os
import re
import json
import time
import random
import PyPDF2
import openai
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

class EvergreenNotesGenerator:
    def __init__(self, api_key=None, model_config=None, notes_per_chapter=5):
        """
        Set up the OpenAI client, the per-task model choices and the size limits.

        Args:
            api_key (str): OpenAI API key; when omitted the client is created
                without an explicit key
            model_config (dict): Optional overrides for the per-task model names
                ('title_generation', 'title_selection', 'content_generation')
            notes_per_chapter (int): Number of notes to generate per chapter
        """
        self.client = openai.OpenAI(api_key=api_key) if api_key else openai.OpenAI()

        # Defaults: the mini model handles titles, the full model writes content.
        task_models = {
            'title_generation': 'gpt-4o-mini',
            'title_selection': 'gpt-4o-mini',
            'content_generation': 'gpt-4o',
        }

        # Caller-supplied entries replace the matching defaults.
        if model_config:
            task_models.update(model_config)
        self.model_config = task_models

        self.notes_per_chapter = notes_per_chapter

        # Character budgets for the text sent to each model
        self.mini_max_chunk_chars = 200000  # For GPT-4o-mini (smaller to be safe)
        self.max_chunk_chars = 300000  # For GPT-4o (~75k tokens)

    def rate_limited_api_call(self, messages, model, max_retries=8, initial_wait=2):
        """Make an API call with automatic retry on rate limit errors"""
        retry_count = 0
        wait_time = initial_wait

        while retry_count < max_retries:
            try:
                return self.client.chat.completions.create(
                    model=model,
                    messages=messages
                )
            except Exception as e:
                error_str = str(e)
                if "rate_limit_exceeded" in error_str:
                    # Extract wait time from error message if possible
                    wait_match = re.search(r'try again in (\d+\.\d+)s', error_str)
                    if wait_match:
                        wait_time = float(wait_match.group(1)) * 1.5  # Add a 50% buffer

                    # Add some jitter to avoid thundering herd problem
                    jitter = random.uniform(0.5, 1.5)
                    adjusted_wait = wait_time * jitter

                    print(f"Rate limit hit for {model}. Waiting {adjusted_wait:.2f}s before retry {retry_count+1}/{max_retries}")
                    time.sleep(adjusted_wait)
                    wait_time *= 1.5  # Increase wait time for next attempt
                    retry_count += 1
                else:
                    # Non-rate-limit error, re-raise
                    raise

        # If we get here, we've exhausted our retries
        raise Exception(f"Failed after {max_retries} attempts due to rate limiting")

    def extract_text_from_pdf(self, pdf_path):
        """Return the text of every page in the PDF, each followed by a newline.

        A page whose extraction raises is reported with a warning and skipped;
        a page that yields no text contributes nothing.
        """
        page_texts = []
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                try:
                    page_text = page.extract_text()
                    if page_text:
                        page_texts.append(page_text + "\n")
                except Exception as e:
                    print(f"Warning: Could not extract text from a page: {e}")
        return "".join(page_texts)

    def extract_chapters(self, full_text):
        """Main chapter extraction method - uses TOC method first, then fallback if needed"""
        try:
            # Try the TOC-based method first
            chapters = self.extract_chapters_by_toc(full_text)

            # If we got a reasonable number of chapters, return them
            if chapters and len(chapters) >= 3 and len(chapters) <= 40:
                return chapters

            # Otherwise, use the fallback method
            return self.extract_chapters_fallback(full_text)

        except Exception as e:
            print(f"Error in chapter extraction: {e}")
            return self.extract_chapters_fallback(full_text)

    def extract_chapters_by_toc(self, full_text):
        """
        Extract chapters by asking an LLM for the chapter titles, then locating them.

        The opening portion of the book is sent to the model to obtain a list
        of chapter titles. Each title is then located in the full text, first
        with heading-shaped regular expressions and, when fewer than half of
        the titles were found that way, by fuzzy-matching the title against
        the first line of each paragraph. The text between consecutive located
        titles becomes a chapter.

        Args:
            full_text (str): Complete text of the book

        Returns:
            dict: Chapter title -> chapter text. When too few titles are
                identified or located, the result of
                ``extract_chapters_fallback`` is returned instead.
        """
        # Import here to avoid installation issues
        try:
            from fuzzywuzzy import fuzz
        except ImportError:
            import subprocess
            import sys

            print("fuzzywuzzy not installed. Installing...")
            # Go through the running interpreter's pip so the package is
            # installed into the environment that performs the import below.
            subprocess.run(
                [sys.executable, "-m", "pip", "install", "fuzzywuzzy", "python-Levenshtein"],
                check=False
            )
            from fuzzywuzzy import fuzz

        # A cheaper model is sufficient for this task; an optional
        # 'toc_extraction' entry in model_config overrides the default.
        toc_model = self.model_config.get('toc_extraction', 'gpt-3.5-turbo')

        # Step 1: Extract the initial portion of the text (first sixth, at most 30k chars)
        sample_size = min(len(full_text) // 6, 30000)
        text_sample = full_text[:sample_size]

        # Step 2: Use LLM to identify the TOC and extract chapter titles
        system_prompt = (
            "You are a specialized AI trained to identify and extract Tables of Contents from books. "
            "Your only task is to find chapter titles from the text provided."
        )

        def parse_titles(listing):
            """Return the distinct titles found on numbered lines of the model's reply."""
            found = []
            for line in listing.splitlines():
                # Only lines that start with a number are treated as titles
                match = re.match(r'^\d+\.?\s*(.*)', line.strip())
                if not match:
                    continue
                candidate = match.group(1).strip()
                # Avoid empty or very short titles, and repeated ones
                if len(candidate) > 3 and candidate not in found:
                    found.append(candidate)
            return found

        def request_titles(prompt):
            """Send one title-extraction prompt and parse the reply."""
            response = self.rate_limited_api_call(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt}
                ],
                model=toc_model
            )
            return parse_titles(response.choices[0].message.content.strip())

        user_prompt = (
            "Below is the beginning portion of a book. Your task is to:\n"
            "1. Identify if there is a Table of Contents\n"
            "2. Extract ONLY the main chapter titles as they appear (not subheadings or sections)\n"
            "3. Return them as a numbered list\n\n"
            "If you don't find a clear Table of Contents, look for chapter headings in the text itself.\n"
            "Return ONLY the numbered list of chapter titles, nothing else.\n\n"
            f"TEXT:\n{text_sample}"
        )

        print("Identifying table of contents...")
        chapter_titles = request_titles(user_prompt)
        print(f"Found {len(chapter_titles)} potential chapter titles")

        if len(chapter_titles) < 3:
            print("Too few chapter titles found, trying alternative approach...")
            # If we didn't find enough chapter titles, try a different prompt
            user_prompt_alt = (
                "You are examining the beginning of a book. Please identify the main chapter titles or major sections.\n"
                "Look carefully for any patterns that indicate chapters, such as 'Chapter 1', numbered sections, or distinct headings.\n"
                "Return ONLY a numbered list of the major divisions you find, with no additional text.\n\n"
                f"TEXT:\n{text_sample}"
            )

            chapter_titles = request_titles(user_prompt_alt)
            print(f"Second attempt found {len(chapter_titles)} potential chapter titles")

        # If we still don't have enough chapters, fall back to the original method
        if len(chapter_titles) < 3:
            print("Failed to identify sufficient chapter titles, using fallback method")
            return self.extract_chapters_fallback(full_text)

        # Step 3: Find chapter positions in the full text
        print("Locating chapters in the full text...")

        chapter_positions = []

        # 1. First try exact matches with context
        for title in chapter_titles:
            escaped = re.escape(title)
            # Consider various formatting possibilities
            patterns = [
                # Exact match with newlines before/after
                rf'\n\s*{escaped}\s*\n',
                # Chapter X: Title format
                rf'\n\s*(?:Chapter|CHAPTER|Part|PART|Section|SECTION)(?:\s+[\dIVXLC]+)?(?:[\:\.]?\s+){escaped}\s*\n',
                # X. Title format
                rf'\n\s*(?:\d+\.|\d+|[IVX]+\.|\[.+?\])\s+{escaped}\s*\n'
            ]

            for pattern in patterns:
                # NOTE(review): the first occurrence wins; if the table of
                # contents lists bare titles on their own lines this may be the
                # TOC entry rather than the chapter heading - confirm on real books.
                match = re.search(pattern, full_text, re.MULTILINE | re.IGNORECASE)
                if match:
                    print(f"Found '{title}' with pattern: {pattern}")
                    chapter_positions.append((match.start(), title))
                    break

        # 2. If we don't have enough matches, try fuzzy matching
        if len(chapter_positions) < len(chapter_titles) // 2:
            print("Using fuzzy matching for remaining chapters...")

            # For titles we haven't found yet
            located = {title for _, title in chapter_positions}
            unfound_titles = [t for t in chapter_titles if t not in located]

            # The first line of every paragraph is the same for each title, so
            # collect them once together with the paragraph's true offset.
            candidates = []
            cursor = 0
            boundaries = [m.span() for m in re.finditer(r'\n\s*\n', full_text)]
            boundaries.append((len(full_text), len(full_text)))
            for gap_start, gap_end in boundaries:
                first_line = full_text[cursor:gap_start].split('\n')[0]
                if len(first_line) > 3:
                    candidates.append((cursor, first_line, first_line.lower()))
                cursor = gap_end

            for title in unfound_titles:
                title_lower = title.lower()
                best_match = None
                best_score = 0
                best_pos = -1

                for offset, first_line, first_line_lower in candidates:
                    score = fuzz.ratio(title_lower, first_line_lower)

                    # Keep the score only if it is good enough and beats earlier ones
                    if score > 75 and score > best_score:
                        best_score = score
                        best_match = first_line
                        best_pos = offset

                if best_pos >= 0:
                    print(f"Fuzzy matched '{title}' to '{best_match}' (score: {best_score})")
                    chapter_positions.append((best_pos, title))

        # If we still don't have enough matches, fall back
        if len(chapter_positions) < 3:
            print(f"Only found {len(chapter_positions)} chapters via matching, using fallback")
            return self.extract_chapters_fallback(full_text)

        # Sort chapters by position
        chapter_positions.sort()

        # Extract chapter content
        chapters = {}
        for i, (pos, title) in enumerate(chapter_positions):
            # Each chapter ends where the next one starts (or at the end of the text)
            if i < len(chapter_positions) - 1:
                end_pos = chapter_positions[i + 1][0]
            else:
                end_pos = len(full_text)

            content = full_text[pos:end_pos].strip()

            # Skip the title line in the content
            first_newline = content.find('\n')
            if first_newline > 0:
                content = content[first_newline:].strip()

            # Only add chapters with sufficient content
            if len(content) > 500:
                chapters[title] = content

        print(f"Successfully extracted {len(chapters)} chapters via TOC method")

        return chapters

    def extract_chapters_fallback(self, full_text):
        """Fallback method to extract chapters when TOC method fails"""
        print("Using fallback chapter extraction method...")

        # Create 10-15 roughly equal sections
        chapters = {}
        target_sections = min(15, max(8, len(full_text) // 15000))
        section_length = len(full_text) // target_sections

        for i in range(0, len(full_text), section_length):
            section_text = full_text[i:i + section_length].strip()
            if len(section_text) > 500:  # Only include substantive sections
                chapters[f"Section {i//section_length + 1}"] = section_text

        print(f"Created {len(chapters)} equal-length sections")
        return chapters

    def generate_titles(self, chapter_text, chapter_title):
        """Generate potential evergreen note titles for a chapter"""
        # Use the mini model for title generation
        model = self.model_config['title_generation']

        # Prepare system prompt with detailed guidance
        system_prompt = (
            "You are an expert at creating insightful evergreen notes from academic and non-fiction texts. "
            "Your task is to generate potential titles for evergreen notes based on the chapter provided. "
            "Evergreen note titles should: "
            "1. Be complete sentences that express a clear, specific claim or insight. "
            "2. Convey enough detail to understand the core idea without additional context. "
            "3. Use clear, precise language and active verbs. "
            "4. Focus on the most important and insightful ideas from the text. "
            "5. Be framed positively and constructively. "
            "6. Avoid vague, general statements - be specific and concrete. "
            "Generate titles that capture the most significant concepts, arguments, frameworks, and insights from the chapter."
        )

        # Limit text size based on mini model
        max_size = self.mini_max_chunk_chars
        if len(chapter_text) > max_size:
            # Use first and last portions with an indication of truncation
            portion_size = max_size // 2 - 50
            chapter_text = chapter_text[:portion_size] + "\n[...TEXT TRUNCATED...]\n" + chapter_text[-portion_size:]

        # Prepare user prompt
        user_prompt = (
            f"Generate {self.notes_per_chapter * 2} potential evergreen note titles based on this chapter: "
            f"CHAPTER TITLE: {chapter_title} "
            f"CHAPTER CONTENT: {chapter_text} "
            f"Return ONLY the titles, one per line, numbered."
        )

        # Make API call with rate limiting
        response = self.rate_limited_api_call(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            model=model
        )

        # Extract titles from response
        content = response.choices[0].message.content
        titles = []

        # Parse titles, handling both numbered and unnumbered formats
        for line in content.split('\n'):
            line = line.strip()
            if not line:
                continue

            # Remove numbering if present
            cleaned_line = re.sub(r'^\d+[\.\)]\s*', '', line)
            if cleaned_line:
                titles.append(cleaned_line)

        return titles

    def select_best_titles(self, chapter_text, titles, chapter_title):
        """Select the best titles based on relevance and quality"""
        # Use the mini model for title selection
        model = self.model_config['title_selection']

        system_prompt = (
            "You are an expert at creating insightful evergreen notes from academic and non-fiction texts. "
            "Your task is to select the most valuable and insightful evergreen note titles from a list of candidates. "
            "The best evergreen note titles should: "
            "1. Capture significant insights or claims from the text. "
            "2. Be specific, clear, and well-formulated. "
            "3. Focus on the most important concepts in the chapter. "
            "4. Cover diverse aspects of the chapter content (avoid redundancy). "
            "5. Prioritize non-obvious, thought-provoking ideas."
        )

        # Format the titles with numbers
        formatted_titles = "\n".join([f"{i+1}. {title}" for i, title in enumerate(titles)])

        user_prompt = (
            f"From the following list of potential evergreen note titles for the chapter '{chapter_title}', "
            f"select exactly {self.notes_per_chapter} titles that best meet the criteria in my instructions. "
            f"Candidate titles:\n{formatted_titles}\n\n"
            f"Return ONLY the numbers of the selected titles in the format: 1, 5, 8, 10, 12 "
            f"Do not include any explanations or additional text."
        )

        # Make API call with rate limiting
        response = self.rate_limited_api_call(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            model=model
        )

        # Parse selected title numbers
        content = response.choices[0].message.content.strip()

        # Extract numbers, handling various formats
        selected_indices = []
        for num in re.findall(r'\d+', content):
            idx = int(num) - 1  # Convert to 0-based index
            if 0 <= idx < len(titles):
                selected_indices.append(idx)

        # Ensure we have the correct number of titles
        selected_indices = selected_indices[:self.notes_per_chapter]

        # Get the selected titles
        selected_titles = [titles[idx] for idx in selected_indices]

        # If we don't have enough titles, add more from the original list
        if len(selected_titles) < self.notes_per_chapter:
            for title in titles:
                if title not in selected_titles:
                    selected_titles.append(title)
                    if len(selected_titles) >= self.notes_per_chapter:
                        break

        return selected_titles

    def generate_note_content(self, chapter_text, title, chapter_title):
        """Generate content for a single evergreen note"""
        # Use the full model for content generation
        model = self.model_config['content_generation']

        system_prompt = (
            "You are an expert at creating insightful evergreen notes from academic and non-fiction texts. "
            "Your task is to write the content for an evergreen note based on the provided title and chapter. "
            "Your evergreen note should: "
            "1. Thoroughly explore the specific idea in the title (200-300 words). "
            "2. Include interpretations, critical thinking, and broader implications. "
            "3. Be structured clearly with a logical flow of ideas. "
            "4. Use precise language and concrete examples. "
            "5. Go beyond summarizing - add insight and connections. "
            "6. Maintain academic rigor while being accessible. "
            "7. Look for opportunities to reference related concepts (for interlinking). "
            "The note should function as a standalone 'API' for this knowledge that can be understood without the original text."
        )

        # Extract relevant parts if chapter is too large
        if len(chapter_text) > self.max_chunk_chars:
            relevant_text = self.extract_relevant_sections(chapter_text, title)
        else:
            relevant_text = chapter_text

        user_prompt = (
            f"Write content for an evergreen note with the following title, based on this chapter: "
            f"CHAPTER: {chapter_title} "
            f"NOTE TITLE: {title} "
            f"CHAPTER CONTENT: {relevant_text} "
            f"Generate a 200-300 word evergreen note that thoroughly explores this idea."
        )

        # Make API call with rate limiting
        response = self.rate_limited_api_call(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            model=model
        )

        # Extract content
        content = response.choices[0].message.content
        return content

    def process_chapter(self, chapter_title, chapter_text):
        """Generate the evergreen notes for one chapter.

        Candidate titles are generated and narrowed down, then one note is
        written per selected title, with pauses between API calls. Any error
        is reported and re-raised.

        Returns:
            list: One ``{"title": ..., "content": ...}`` dict per selected title.
        """
        try:
            print(f"Processing: {chapter_title}")

            # Title generation sees the whole chapter unless it is too large for
            # the mini model, in which case only the first and last thirds are used
            title_text = chapter_text
            if len(chapter_text) > self.mini_max_chunk_chars:
                third = len(chapter_text) // 3
                title_text = "\n...\n".join((chapter_text[:third], chapter_text[-third:]))

            candidate_titles = self.generate_titles(title_text, chapter_title)
            time.sleep(1)  # pause after title generation to avoid rate limits

            selected_titles = self.select_best_titles(title_text, candidate_titles, chapter_title)
            time.sleep(1)  # pause after title selection to avoid rate limits

            notes = []
            for title in tqdm(selected_titles, desc="Generating notes", leave=False):
                # Oversized chapters are narrowed to the parts relevant to this title
                source_text = (
                    self.extract_relevant_sections(chapter_text, title)
                    if len(chapter_text) > self.max_chunk_chars
                    else chapter_text
                )

                note_body = self.generate_note_content(source_text, title, chapter_title)
                notes.append({"title": title, "content": note_body})

                time.sleep(2)  # pause between notes to avoid rate limits

            return notes

        except Exception as e:
            print(f"Error in process_chapter for '{chapter_title}': {str(e)}")
            raise

    def extract_relevant_sections(self, chapter_text, title):
        """Extract the most relevant parts of a long chapter for a specific title"""
        # Split chapter into paragraphs
        paragraphs = [p for p in chapter_text.split('\n\n') if p.strip()]

        # Conservative max size
        max_chars = self.max_chunk_chars // 2

        if len(chapter_text) <= max_chars:
            return chapter_text

        # Extract important keywords from title
        title_words = set(re.findall(r'\b\w{4,}\b', title.lower()))
        # Remove common stop words
        stop_words = {'this', 'that', 'with', 'from', 'have', 'they', 'what', 'when', 'where', 'which', 'their'}
        title_words = {w for w in title_words if w not in stop_words}

        # Score paragraphs based on relevance to title
        scored_paragraphs = []

        for i, para in enumerate(paragraphs):
            score = 0
            para_lower = para.lower()

            # Score based on keyword matches
            for word in title_words:
                # Full word matches
                score += para_lower.count(f' {word} ') * 3
                # Partial matches
                score += para_lower.count(word) * 2

            # Boost score for position in chapter
            # First few paragraphs get a boost for context
            if i < 3:
                score += 5
            # Introduction paragraphs get a boost
            elif i < len(paragraphs) * 0.1:
                score += 3
            # Conclusion paragraphs get a boost
            elif i > len(paragraphs) * 0.9:
                score += 2

            # Length factor (prefer substantial paragraphs)
            if 100 <= len(para) <= 1000:
                score += 1

            scored_paragraphs.append((score, i, para))

        # Sort by score descending
        scored_paragraphs.sort(reverse=True)

        # Get introduction (first few paragraphs)
        intro_size = min(3, len(paragraphs))
        intro = '\n\n'.join(paragraphs[:intro_size])

        # Get top-scoring paragraphs
        selected = []
        current_size = len(intro)

        for score, i, para in scored_paragraphs:
            # Skip paragraphs already in intro
            if i < intro_size:
                continue

            # Stop if we've reached the size limit
            if current_size + len(para) > max_chars:
                break

            # Add paragraph
            selected.append((i, para))
            current_size += len(para)

        # Sort selected paragraphs by original position
        selected.sort()

        # Combine intro with selected paragraphs
        result = intro + '\n\n' + '\n\n'.join(para for _, para in selected)

        return result

    def save_notes_as_text(self, notes, chapter_title, output_dir):
        """Save notes in plain text format"""
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        safe_chapter_title = re.sub(r'[^\w\s-]', '', chapter_title).strip().replace(' ', '_')
        filename = os.path.join(output_dir, f"{safe_chapter_title}_notes.txt")

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"Evergreen Notes for: {chapter_title}\n")
            f.write("=" * 80 + "\n\n")

            for i, note in enumerate(notes, 1):
                f.write(f"Note {i}: {note['title']}\n")
                f.write("-" * 80 + "\n")
                f.write(note['content'] + "\n\n")
                f.write("=" * 80 + "\n\n")

        return filename

    def save_notes_as_json(self, notes, chapter_title, output_dir):
        """Save notes in JSON format"""
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        safe_chapter_title = re.sub(r'[^\w\s-]', '', chapter_title).strip().replace(' ', '_')
        filename = os.path.join(output_dir, f"{safe_chapter_title}_notes.json")

        # Add IDs to notes for reference
        numbered_notes = []
        for i, note in enumerate(notes, 1):
            note_with_id = {
                "id": f"{safe_chapter_title}_note_{i}",
                "title": note['title'],
                "content": note['content']
            }
            numbered_notes.append(note_with_id)

        data = {
            "chapter": chapter_title,
            "notes": numbered_notes
        }

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

        return filename

    def process_book(self, pdf_path, output_dir, parallel=False):
        """
        Process the entire book to generate evergreen notes.

        Extracts the PDF text, splits it into chapters, generates and saves
        the notes of each chapter, retries failed chapters once after a pause
        and finally writes ``all_notes.json`` into ``output_dir``. The retry
        and the combined file apply to both the sequential and the parallel
        mode.

        Args:
            pdf_path (str): Path of the PDF to process
            output_dir (str): Directory that receives the per-chapter files
                and the combined JSON
            parallel (bool): Process chapters with a small thread pool instead
                of one at a time

        Returns:
            dict: The structure written to ``all_notes.json`` (book name plus
                the chapters that succeeded, in book order), or None when no
                chapters were identified.
        """
        print(f"Extracting text from PDF: {pdf_path}")
        full_text = self.extract_text_from_pdf(pdf_path)

        print("Identifying chapters...")
        chapters = self.extract_chapters(full_text)
        print(f"Found {len(chapters)} chapters.")

        if len(chapters) == 0:
            print("No chapters identified. Please check the PDF format.")
            return None

        # Process each chapter
        all_notes = {}
        failed_chapters = []

        def save_chapter(chapter_title, chapter_notes):
            """Write one chapter's notes to disk and record them in all_notes."""
            txt_file = self.save_notes_as_text(chapter_notes, chapter_title, output_dir)
            json_file = self.save_notes_as_json(chapter_notes, chapter_title, output_dir)
            all_notes[chapter_title] = chapter_notes
            return txt_file, json_file

        if parallel and len(chapters) > 1:
            # Process chapters in parallel with fewer workers to avoid rate limits
            with ThreadPoolExecutor(max_workers=min(2, len(chapters))) as executor:
                future_to_chapter = {
                    executor.submit(self.process_chapter, title, text): title
                    for title, text in chapters.items()
                }

                for future in tqdm(future_to_chapter, desc="Processing chapters"):
                    chapter_title = future_to_chapter[future]
                    try:
                        txt_file, json_file = save_chapter(chapter_title, future.result())
                        print(f"✓ Saved notes for '{chapter_title}' to {txt_file} and {json_file}")

                        # Add delay between chapters
                        time.sleep(3)

                    except Exception as e:
                        print(f"Error processing chapter '{chapter_title}': {e}")
                        failed_chapters.append((chapter_title, chapters[chapter_title]))
        else:
            # Process chapters sequentially
            for chapter_title, chapter_text in tqdm(chapters.items(), desc="Processing chapters"):
                try:
                    chapter_notes = self.process_chapter(chapter_title, chapter_text)
                    txt_file, json_file = save_chapter(chapter_title, chapter_notes)
                    print(f"✓ Saved notes for '{chapter_title}' to {txt_file} and {json_file}")

                    # Add delay between chapters to avoid rate limits
                    time.sleep(3)

                except Exception as e:
                    print(f"Error processing chapter '{chapter_title}': {e}")
                    failed_chapters.append((chapter_title, chapter_text))

        # Retry failed chapters after a longer delay. This runs for both modes.
        if failed_chapters:
            print(f"\nRetrying {len(failed_chapters)} failed chapters after a delay...")
            time.sleep(60)  # Wait a full minute before retries

            for chapter_title, chapter_text in tqdm(failed_chapters, desc="Retrying failed chapters"):
                try:
                    print(f"Retrying: {chapter_title}")
                    chapter_notes = self.process_chapter(chapter_title, chapter_text)
                    save_chapter(chapter_title, chapter_notes)
                    print(f"✓ Successfully saved notes for '{chapter_title}' on retry")

                    # Longer delay between retries
                    time.sleep(10)

                except Exception as e:
                    print(f"Still failed to process chapter '{chapter_title}': {e}")

        # Save combined JSON with all notes. The directory may not exist yet
        # when every chapter failed before anything was written.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        combined_json = os.path.join(output_dir, "all_notes.json")

        # Create a structured representation of all notes
        book_notes = {
            "book": os.path.basename(pdf_path),
            "chapters": []
        }

        # Walk the chapters in book order so retried chapters are not moved to the end
        for chapter_title in chapters:
            if chapter_title not in all_notes:
                continue

            safe_chapter_title = re.sub(r'[^\w\s-]', '', chapter_title).strip().replace(' ', '_')
            chapter_data = {
                "title": chapter_title,
                "id": safe_chapter_title,
                "notes": [
                    {
                        "id": f"{safe_chapter_title}_note_{i}",
                        "title": note['title'],
                        "content": note['content']
                    }
                    for i, note in enumerate(all_notes[chapter_title], 1)
                ]
            }
            book_notes["chapters"].append(chapter_data)

        with open(combined_json, 'w', encoding='utf-8') as f:
            json.dump(book_notes, f, indent=2)

        print(f"✓ Combined notes saved to {combined_json}")

        return book_notes

# Function to run in Google Colab
def run_in_colab(pdf_file_path, api_key, notes_per_chapter=5, use_mixed_models=True):
    """
    Run the evergreen notes generator in Google Colab

    Builds the per-task model configuration, processes the book, zips the
    output directory, starts a browser download of the archive and finally
    offers to render the generated notes inline in the notebook.

    Args:
        pdf_file_path (str): Path to the PDF file
        api_key (str): OpenAI API key
        notes_per_chapter (int): Number of notes to generate per chapter
        use_mixed_models (bool): Whether to use mixed models (gpt-4o-mini for simpler tasks)
    """
    output_dir = "./evergreen_notes_output"

    # Model configuration: content always uses the full model; titles may use
    # the cheaper mini model when mixed models are enabled.
    if use_mixed_models:
        model_config = {
            'title_generation': 'gpt-4o-mini',  # Titles can use the mini model
            'title_selection': 'gpt-4o-mini',   # Selection can use the mini model
            'content_generation': 'gpt-4o'      # Content needs the full model
        }
    else:
        model_config = {
            'title_generation': 'gpt-4o',
            'title_selection': 'gpt-4o',
            'content_generation': 'gpt-4o'
        }

    # Initialize the generator
    generator = EvergreenNotesGenerator(
        api_key=api_key,
        model_config=model_config,
        notes_per_chapter=notes_per_chapter
    )

    # Process the book
    generator.process_book(pdf_file_path, output_dir, parallel=False)

    # Zip the output directory for easy download
    os.system(f"zip -r evergreen_notes.zip {output_dir}")

    print("Processing complete! You can now download the evergreen_notes.zip file.")

    # For Google Colab, download the zip file
    from google.colab import files
    files.download("evergreen_notes.zip")

    # Add viewer functionality
    def view_notes():
        """Render all_notes.json from the output directory as simple HTML."""
        import json
        from html import escape
        from IPython.display import display, HTML

        notes_path = f"{output_dir}/all_notes.json"
        if not os.path.exists(notes_path):
            print("Notes haven't been generated yet or couldn't be found.")
            return

        with open(notes_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Everything interpolated below is free text (the PDF file name and
        # model-generated titles/content), so it is HTML-escaped first:
        # a stray "<" or "&" would otherwise corrupt or inject markup.
        markup = f"""
        <h1>Notes for {escape(str(data['book']))}</h1>
        <div style="max-width:800px">
        """

        for chapter in data['chapters']:
            markup += f"""
            <div style="margin-top:20px; border:1px solid #ccc; padding:10px; border-radius:5px">
                <h2>{escape(str(chapter['title']))}</h2>
            """

            for note in chapter['notes']:
                markup += f"""
                <div style="margin:10px 0; background:#f8f8f8; padding:10px; border-radius:3px">
                    <h3>{escape(str(note['title']))}</h3>
                    <div style="white-space:pre-wrap">{escape(str(note['content']))}</div>
                </div>
                """

            markup += "</div>"

        markup += "</div>"
        display(HTML(markup))

    # Ask if user wants to view the notes
    print("\nWould you like to view the generated notes in the notebook?")
    if input("View notes now? (y/n): ").lower() == 'y':
        view_notes()

In [None]:


# Copy the entire EvergreenNotesGenerator class and run_in_colab function here

# Now add this code at the end to execute:
from google.colab import files
import getpass
import os
import time

# Upload PDF file
print("Please upload your PDF book:")
uploaded = files.upload()
if not uploaded:
    # An empty upload would otherwise surface as a bare IndexError below.
    raise SystemExit("No PDF file was uploaded; nothing to process.")
pdf_filename = list(uploaded.keys())[0]

# Get OpenAI API key. getpass hides the typed value; input() would echo the
# secret into the cell output, where it is saved with the notebook.
api_key = getpass.getpass("Enter your OpenAI API key: ")

# Set number of notes per chapter
notes_per_chapter = int(input("How many notes per chapter? (default: 3): ") or "3")

# Ask if user wants to use mixed models (anything other than "n" means yes)
use_mixed = input("Use GPT-4o-mini for title generation to save costs? (y/n, default: y): ").lower() != 'n'

# Run the generator
print(f"Starting to process {pdf_filename} with {notes_per_chapter} notes per chapter...")
run_in_colab(pdf_filename, api_key, notes_per_chapter=notes_per_chapter, use_mixed_models=use_mixed)

Please upload your PDF book:


Saving Jacques Philippe_ Helena Scott - Time for God_ A Guide to Mental Prayer (2008).pdf to Jacques Philippe_ Helena Scott - Time for God_ A Guide to Mental Prayer (2008).pdf
Enter your OpenAI API key: [REDACTED - a live API key was captured in this saved output; revoke it and clear notebook outputs before sharing]
How many notes per chapter? (default: 3): 3
Use GPT-4o-mini for title generation to save costs? (y/n, default: y): y
Starting to process Jacques Philippe_ Helena Scott - Time for God_ A Guide to Mental Prayer (2008).pdf with 3 notes per chapter...
Extracting text from PDF: Jacques Philippe_ Helena Scott - Time for God_ A Guide to Mental Prayer (2008).pdf
Identifying chapters...
Identifying table of contents...
Found 5 potential chapter titles
Locating chapters in the full text...
Found 'Mental Prayer is Not a Technique But a Grace' with pattern: \n\s*(?:\d+\.|\d+|[IVX]+\.|\[.+?\])\s+Mental\ Prayer\ is\ Not\ a\ Technique\ But

Processing chapters:   0%|          | 0/9 [00:00<?, ?it/s]

Processing: Section 1



Generating notes:   0%|          | 0/3 [00:00<?, ?it/s][A
Generating notes:  33%|███▎      | 1/3 [00:07<00:14,  7.25s/it][A
Generating notes:  67%|██████▋   | 2/3 [00:14<00:07,  7.44s/it][A
Generating notes: 100%|██████████| 3/3 [00:22<00:00,  7.40s/it][A
                                                               [A

✓ Saved notes for 'Section 1' to ./evergreen_notes_output/Section_1_notes.txt and ./evergreen_notes_output/Section_1_notes.json


Processing chapters:  11%|█         | 1/9 [00:30<04:02, 30.34s/it]

Processing: Section 2



Generating notes:   0%|          | 0/3 [00:00<?, ?it/s][A
Generating notes:  33%|███▎      | 1/3 [00:07<00:14,  7.20s/it][A
Generating notes:  67%|██████▋   | 2/3 [00:14<00:07,  7.50s/it][A
Generating notes: 100%|██████████| 3/3 [00:22<00:00,  7.56s/it][A
                                                               [A

✓ Saved notes for 'Section 2' to ./evergreen_notes_output/Section_2_notes.txt and ./evergreen_notes_output/Section_2_notes.json


Processing chapters:  22%|██▏       | 2/9 [01:00<03:30, 30.13s/it]

Processing: Section 3



Generating notes:   0%|          | 0/3 [00:00<?, ?it/s][A
Generating notes:  33%|███▎      | 1/3 [00:09<00:18,  9.42s/it][A
Generating notes:  67%|██████▋   | 2/3 [00:17<00:08,  8.34s/it][A
Generating notes: 100%|██████████| 3/3 [00:25<00:00,  8.37s/it][A
                                                               [A

✓ Saved notes for 'Section 3' to ./evergreen_notes_output/Section_3_notes.txt and ./evergreen_notes_output/Section_3_notes.json


Processing chapters:  33%|███▎      | 3/9 [01:33<03:09, 31.58s/it]

Processing: Section 4



Generating notes:   0%|          | 0/3 [00:00<?, ?it/s][A
Generating notes:  33%|███▎      | 1/3 [00:08<00:16,  8.03s/it][A
Generating notes:  67%|██████▋   | 2/3 [00:14<00:07,  7.38s/it][A
Generating notes: 100%|██████████| 3/3 [00:22<00:00,  7.39s/it][A
                                                               [A

✓ Saved notes for 'Section 4' to ./evergreen_notes_output/Section_4_notes.txt and ./evergreen_notes_output/Section_4_notes.json


Processing chapters:  44%|████▍     | 4/9 [02:04<02:36, 31.37s/it]

Processing: Section 5



Generating notes:   0%|          | 0/3 [00:00<?, ?it/s][A
Generating notes:  33%|███▎      | 1/3 [00:08<00:17,  8.54s/it][A
Generating notes:  67%|██████▋   | 2/3 [00:16<00:08,  8.36s/it][A
Generating notes: 100%|██████████| 3/3 [00:23<00:00,  7.59s/it][A
                                                               [A

✓ Saved notes for 'Section 5' to ./evergreen_notes_output/Section_5_notes.txt and ./evergreen_notes_output/Section_5_notes.json


Processing chapters:  56%|█████▌    | 5/9 [02:35<02:05, 31.25s/it]

Processing: Section 6



Generating notes:   0%|          | 0/3 [00:00<?, ?it/s][A
Generating notes:  33%|███▎      | 1/3 [00:06<00:13,  6.57s/it][A
Generating notes:  67%|██████▋   | 2/3 [00:13<00:06,  6.85s/it][A
Generating notes: 100%|██████████| 3/3 [00:21<00:00,  7.55s/it][A
                                                               [A

✓ Saved notes for 'Section 6' to ./evergreen_notes_output/Section_6_notes.txt and ./evergreen_notes_output/Section_6_notes.json


Processing chapters:  67%|██████▋   | 6/9 [03:05<01:31, 30.60s/it]

Processing: Section 7



Generating notes:   0%|          | 0/3 [00:00<?, ?it/s][A
Generating notes:  33%|███▎      | 1/3 [00:08<00:17,  8.94s/it][A
Generating notes:  67%|██████▋   | 2/3 [00:18<00:09,  9.24s/it][A
Generating notes: 100%|██████████| 3/3 [00:27<00:00,  9.33s/it][A
                                                               [A

✓ Saved notes for 'Section 7' to ./evergreen_notes_output/Section_7_notes.txt and ./evergreen_notes_output/Section_7_notes.json


Processing chapters:  78%|███████▊  | 7/9 [03:40<01:04, 32.31s/it]

Processing: Section 8



Generating notes:   0%|          | 0/3 [00:00<?, ?it/s][A
Generating notes:  33%|███▎      | 1/3 [00:07<00:14,  7.44s/it][A
Generating notes:  67%|██████▋   | 2/3 [00:18<00:09,  9.72s/it][A
Generating notes: 100%|██████████| 3/3 [00:29<00:00, 10.00s/it][A
                                                               [A

✓ Saved notes for 'Section 8' to ./evergreen_notes_output/Section_8_notes.txt and ./evergreen_notes_output/Section_8_notes.json


Processing chapters:  89%|████████▉ | 8/9 [04:20<00:34, 34.49s/it]

Processing: Section 9



Generating notes:   0%|          | 0/3 [00:00<?, ?it/s][A
Generating notes:  33%|███▎      | 1/3 [00:08<00:16,  8.17s/it][A
Generating notes:  67%|██████▋   | 2/3 [00:18<00:09,  9.28s/it][A
Generating notes: 100%|██████████| 3/3 [00:24<00:00,  8.03s/it][A
                                                               [A

✓ Saved notes for 'Section 9' to ./evergreen_notes_output/Section_9_notes.txt and ./evergreen_notes_output/Section_9_notes.json


Processing chapters: 100%|██████████| 9/9 [04:52<00:00, 32.52s/it]

✓ Combined notes saved to ./evergreen_notes_output/all_notes.json
Processing complete! You can now download the evergreen_notes.zip file.





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Would you like to view the generated notes in the notebook?
View notes now? (y/n): y


In [None]:
# Install dependencies
!pip install pypdf openai

# Import libraries
import pypdf
import openai
import re
import os
from google.colab import files
import getpass

# Step 1: Upload PDF
def upload_pdf():
    """Prompt for a file upload in Colab and return the first uploaded file name."""
    print("Please upload your PDF file:")
    uploads = files.upload()
    # The upload result is iterated by file name; only the first one is used.
    return next(iter(uploads))

# Step 2: Extract first 10 pages (likely contains TOC)
def extract_initial_text(pdf_path, pages=10):
    """Return the text of the first ``pages`` pages of the PDF, one newline after each page.

    Fewer pages are read when the document is shorter than ``pages``.
    """
    reader = pypdf.PdfReader(pdf_path)
    page_count = min(pages, len(reader.pages))
    return "".join(
        reader.pages[index].extract_text() + "\n"
        for index in range(page_count)
    )

# Step 3: Use OpenAI model to extract TOC
def get_toc_from_llm(initial_text, api_key, model="gpt-4o-mini"):
    """Ask an OpenAI chat model to list the chapters found in ``initial_text``.

    The reply is expected to be a numbered list; each non-blank line has a
    leading "N." / "N)" marker removed and is returned as one chapter title.
    """
    client = openai.OpenAI(api_key=api_key)
    prompt = f"""You're given the initial pages of a book. Clearly list ONLY the chapters from the table of contents exactly as they appear. Number them clearly.

TEXT:
{initial_text}

Return ONLY the numbered list of chapters."""

    conversation = [
        {"role": "system", "content": "Extract chapters accurately from tables of contents."},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0
    )

    reply_text = response.choices[0].message.content.strip()

    chapters = []
    for raw_line in reply_text.splitlines():
        if not raw_line.strip():
            continue  # skip blank lines in the reply
        # Drop only the list numbering the model added at the start of the line.
        chapters.append(re.sub(r'^\d+[.)]\s*', '', raw_line).strip())
    return chapters

# Step 4: Extract full text and split by chapters
def split_chapters(pdf_path, chapters):
    """Split the PDF's full text into chapters keyed by chapter title.

    Each title is located by its first case-insensitive literal occurrence in
    the extracted text; a chapter runs from its own match to the next match
    (or to the end of the text). Titles that are not found are omitted.
    """
    reader = pypdf.PdfReader(pdf_path)

    # Whole-document text, one newline appended after every page.
    full_text = "".join(page.extract_text() + "\n" for page in reader.pages)

    # start offset -> title; a later title matching at the same offset replaces the earlier one.
    starts = {}
    for chapter in chapters:
        hit = re.search(re.escape(chapter), full_text, re.IGNORECASE)
        if hit is not None:
            starts[hit.start()] = chapter

    ordered = sorted(starts.items())
    # Every chapter ends where the next one starts; the last ends with the text.
    ends = [offset for offset, _ in ordered[1:]] + [len(full_text)]

    chapter_contents = {}
    for (begin, title), end in zip(ordered, ends):
        chapter_contents[title] = full_text[begin:end].strip()
    return chapter_contents

# Step 5: Estimate page numbers for chapters
def estimate_page_numbers(pdf_path, chapter_titles):
    """Map chapter titles to 1-based page numbers on which they appear.

    For every page, only the first title (in ``chapter_titles`` order) whose
    exact text occurs on that page is recorded; a title seen again on a later
    page has its page number overwritten by that later page.
    """
    reader = pypdf.PdfReader(pdf_path)
    page_numbers = {}

    for page_index, page in enumerate(reader.pages):
        page_text = page.extract_text()
        first_hit = next((title for title in chapter_titles if title in page_text), None)
        if first_hit is not None:
            page_numbers[first_hit] = page_index + 1  # 1-based page numbers

    return page_numbers

# Step 6: Save chapters to files with additional metadata
def save_chapters(chapter_contents, page_numbers, output_dir=None):
    """Write a summary file plus one text file per chapter and return the directory used.

    Args:
        chapter_contents: mapping of chapter title to chapter text.
        page_numbers: mapping of chapter title to page number; titles missing
            from it are reported as "Unknown".
        output_dir: target directory, created if needed; defaults to
            "extracted_chapters" when None.
    """
    target_dir = "extracted_chapters" if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    # Summary: one numbered line per chapter, in mapping order.
    summary_path = os.path.join(target_dir, "00_chapter_summary.txt")
    with open(summary_path, "w", encoding="utf-8") as summary:
        summary.write("CHAPTER SUMMARY\n")
        summary.write("==============\n\n")
        summary.writelines(
            f"{number}. {chapter_title} (Page {page_numbers.get(chapter_title, 'Unknown')})\n"
            for number, chapter_title in enumerate(chapter_contents, 1)
        )

    # One file per chapter, prefixed with its two-digit position.
    for number, (chapter_title, body) in enumerate(chapter_contents.items(), 1):
        # Keep word characters, whitespace and hyphens only; cap the slug at 50 chars.
        slug = re.sub(r'[^\w\s-]', '', chapter_title).replace(' ', '_')[:50]
        filename = f"{number:02d}_{slug}.txt"
        page = page_numbers.get(chapter_title, "Unknown")
        header = f"CHAPTER: {chapter_title}\n" + f"PAGE: {page}\n" + "=" * 50 + "\n\n"

        with open(os.path.join(target_dir, filename), 'w', encoding='utf-8') as chapter_file:
            chapter_file.write(header)
            chapter_file.write(body)

        print(f"Saved: {filename}")

    return target_dir

# Step 7: Create zip file and download it
def create_downloadable_zip(output_dir):
    """Zip ``output_dir`` into ``<output_dir>.zip`` and try to start Colab downloads.

    The archive is built with ``shutil.make_archive`` instead of an IPython
    ``!zip`` shell line, so the function is plain Python and directory names
    containing spaces or shell metacharacters are handled correctly. Entries
    are stored under the directory's own name, as ``zip -r`` would do.

    Downloading is best effort: any failure (including ``google.colab`` not
    being importable) is reported with a hint instead of raising.
    """
    import shutil

    zip_filename = f"{output_dir}.zip"

    if not os.path.isdir(output_dir):
        print(f"Error during download: directory not found: {output_dir}")
        return

    source_dir = os.path.abspath(output_dir)
    # make_archive appends ".zip" itself, so hand it the target path without the suffix.
    archive_base = os.path.abspath(zip_filename)[:-len(".zip")]
    shutil.make_archive(
        archive_base,
        "zip",
        root_dir=os.path.dirname(source_dir),
        base_dir=os.path.basename(source_dir),
    )

    try:
        # Imported inside the try so that running outside Colab falls through
        # to the manual-download hint below.
        from google.colab import files

        # Download the zip file
        files.download(zip_filename)
        print(f"Download started for {zip_filename}")

        # Also provide the summary file separately
        summary_file = os.path.join(output_dir, "00_chapter_summary.txt")
        files.download(summary_file)
        print(f"Download started for {summary_file}")
    except Exception as e:
        print(f"Error during download: {e}")
        print("If download doesn't start automatically, use the file browser to download the files.")
# Main workflow function
def extract_book_chapters():
    """Interactive end-to-end workflow: upload a PDF, detect chapters, save and zip them.

    Steps run in order: upload, API-key prompt (hidden input), table-of-contents
    extraction through the LLM, chapter splitting, page-number estimation,
    writing chapter files and creating the downloadable archive.
    """
    # 1. Upload PDF
    source_pdf = upload_pdf()
    print(f"Processing {source_pdf}...")

    # 2. Get API key
    secret_key = getpass.getpass('Enter your OpenAI API key: ')

    # 3 + 4. Read the opening pages and let the LLM list the chapters
    opening_pages = extract_initial_text(source_pdf)
    print("Extracting chapters using OpenAI...")
    titles = get_toc_from_llm(opening_pages, secret_key)

    print(f"\nFound {len(titles)} chapters:")
    for position, chapter_name in enumerate(titles, 1):
        print(f"{position}. {chapter_name}")

    # 5. Split content by chapters
    print("\nExtracting chapter contents...")
    contents_by_title = split_chapters(source_pdf, titles)

    # 6. Estimate page numbers
    print("Estimating page numbers...")
    pages_by_title = estimate_page_numbers(source_pdf, titles)

    # 7. Save chapters to files
    saved_dir = save_chapters(contents_by_title, pages_by_title)

    # 8. Create zip file for download
    print("\nCreating zip archive for download...")
    create_downloadable_zip(saved_dir)

    print(f"\nProcessing complete! {len(contents_by_title)} chapters extracted.")
    print(f"Look in the {saved_dir} folder for your extracted chapters.")

# Run the extraction process.
# The guard starts the interactive workflow only when this code is executed
# directly (module name "__main__"), not when it is imported.
if __name__ == "__main__":
    extract_book_chapters()

Please upload your PDF file:


Saving Jacques Philippe_ Helena Scott - Time for God_ A Guide to Mental Prayer (2008).pdf to Jacques Philippe_ Helena Scott - Time for God_ A Guide to Mental Prayer (2008) (2).pdf
Processing Jacques Philippe_ Helena Scott - Time for God_ A Guide to Mental Prayer (2008) (2).pdf...
Enter your OpenAI API key: ··········
Extracting chapters using OpenAI...

Found 33 chapters:
1. Mental Prayer is Not a Technique But a Grace
2. 1. Mental Prayer is Not a Kind of Christian Yoga
3. 2. Some Immediate Consequences
4. 3. Faith and Trust as the Basis for Mental Prayer
5. 4. Fidelity and Perseverance
6. 5. Purity of Intention
7. 6. Humility and Poverty of Heart
8. 7. Determination to Persevere
9. 8. Total Self-Giving to God
10. How to Use the Time of Mental Prayer
11. 1. Introductory Ideas
12. 2. When the Question Does Not Arise
13. 3. Primacy of God’s Action
14. 4. Primacy of Love
15. 5. God Gives Himself Through the Humanity of Jesus
16. 6. God Dwells in Our Hearts
17. The Development of the Life 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download started for extracted_chapters.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download started for extracted_chapters/00_chapter_summary.txt

Processing complete! 32 chapters extracted.
Look in the extracted_chapters folder for your extracted chapters.


In [None]:
!pip install PyPDF2 --upgrade



In [None]:
import PyPDF2 as pdf_library
import openai
import re
import os
import time
import json
from google.colab import files
import getpass

class EvergreenNotesGenerator:
    """Generate titled "evergreen" notes for every chapter of a PDF book.

    ``process_book`` drives the pipeline: an OpenAI model lists the chapters
    from the first pages, the extracted PDF text is split at those titles,
    notes are generated per chapter and written as Markdown files into
    ``self.output_dir``, and the directory is zipped for download.
    """

    def __init__(self, pdf_path, api_key, notes_per_chapter=3, use_mixed_models=True):
        """
        Initialize the Evergreen Notes Generator

        Creates the OpenAI client and makes sure the output directory exists.

        Args:
            pdf_path (str): Path to the PDF file
            api_key (str): OpenAI API key
            notes_per_chapter (int): Number of notes to generate per chapter
            use_mixed_models (bool): Whether to use mixed models for cost efficiency
        """
        self.pdf_path = pdf_path
        self.api_key = api_key
        self.notes_per_chapter = notes_per_chapter
        self.use_mixed_models = use_mixed_models
        self.client = openai.OpenAI(api_key=api_key)

        # Models configuration
        self.primary_model = "gpt-4o"
        self.lighter_model = "gpt-4o-mini"  # For less intensive tasks

        # Initialize output directory
        self.output_dir = "evergreen_notes"
        os.makedirs(self.output_dir, exist_ok=True)

    def extract_initial_text(self, pages=10):
        """Extract text from the first ``pages`` pages (for TOC detection).

        A newline is appended after each page; shorter documents yield all pages.
        """
        reader = pdf_library.PdfReader(self.pdf_path)
        page_count = min(pages, len(reader.pages))
        return "".join(
            reader.pages[index].extract_text() + "\n"
            for index in range(page_count)
        )

    def get_toc_from_llm(self, initial_text):
        """Use the lighter model to extract the chapter list from ``initial_text``.

        Returns the non-blank lines of the reply with a leading "N." / "N)"
        list marker removed from each.
        """
        model = self.lighter_model  # Use lighter model for TOC extraction

        prompt = f"""You're given the initial pages of a book. Clearly list ONLY the chapters from the table of contents exactly as they appear. Number them clearly.

TEXT:
{initial_text}

Return ONLY the numbered list of chapters."""

        response = self.client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Extract chapters accurately from tables of contents."},
                {"role": "user", "content": prompt},
            ],
            temperature=0
        )

        toc_raw = response.choices[0].message.content.strip()
        chapters = [
            re.sub(r'^\d+[.)]\s*', '', line).strip()
            for line in toc_raw.splitlines()
            if line.strip()
        ]
        return chapters

    def split_chapters(self, chapters):
        """Split the PDF text into ``{title: text}`` using the chapter titles.

        Each title is located by its first case-insensitive literal match in
        the full text; a chapter spans from its match to the next match (or
        the end of the text). Titles that never match are left out.
        """
        reader = pdf_library.PdfReader(self.pdf_path)

        # First extract full text, one newline appended per page
        full_text = "".join(page.extract_text() + "\n" for page in reader.pages)

        # Map match offsets to chapter titles
        splits = {}
        for chapter in chapters:
            match = re.search(re.escape(chapter), full_text, re.IGNORECASE)
            if match:
                splits[match.start()] = chapter

        # Sort by position and extract chapter contents
        sorted_splits = sorted(splits.items())
        chapter_contents = {}

        for i, (start_pos, title) in enumerate(sorted_splits):
            end_pos = sorted_splits[i + 1][0] if i + 1 < len(sorted_splits) else len(full_text)
            chapter_contents[title] = full_text[start_pos:end_pos].strip()

        return chapter_contents

    def estimate_page_numbers(self, chapter_titles):
        """Estimate 1-based page numbers for chapter titles.

        On each page only the first title (in ``chapter_titles`` order) found
        verbatim is recorded; a later page containing the same title
        overwrites the earlier page number.
        """
        reader = pdf_library.PdfReader(self.pdf_path)
        page_numbers = {}

        # For each page, check if it contains any chapter title
        for page_num in range(len(reader.pages)):
            page_text = reader.pages[page_num].extract_text()

            for title in chapter_titles:
                if title in page_text:
                    page_numbers[title] = page_num + 1  # 1-based page numbers
                    break

        return page_numbers

    def generate_evergreen_notes(self, chapter_title, chapter_content):
        """Generate titled evergreen notes for a single chapter.

        Args:
            chapter_title (str): Title used in the prompt and in fallback text.
            chapter_content (str): Chapter text; only the first 50,000
                characters are sent to the model.

        Returns:
            list[dict]: At most ``self.notes_per_chapter`` dicts with "title"
            and "content" keys. API or parsing failures are reported with
            ``print`` and replaced by placeholder notes rather than raised.
        """
        if self.use_mixed_models:
            note_model = self.primary_model
            title_model = self.lighter_model
        else:
            note_model = title_model = self.primary_model

        # Request exactly the configured number of notes. This used to be
        # min(self.notes_per_chapter, 1), which asked the model for a single
        # note and filled the remainder with "Note content unavailable.".
        notes_to_generate = self.notes_per_chapter
        if notes_to_generate <= 0:
            return []

        placeholder_note = f"Note for chapter: {chapter_title}"

        # Comprehensive prompt for generating high-quality evergreen notes
        prompt = f"""From the following chapter content, generate exactly {notes_to_generate} insightful evergreen notes that follow the principles of "evergreen notes" by Andy Matuschak.

CHAPTER: {chapter_title}

CONTENT:
{chapter_content[:50000]}

EVERGREEN NOTES PRINCIPLES:
1. Evergreen notes should be atomic (about one thing, and one thing only)
2. Evergreen notes should be concept-oriented (focused on ideas rather than facts)
3. Evergreen notes should be densely linked (though this is handled elsewhere in our system)
4. Evergreen notes should make assertions (they should have a clear opinion/stance)

DETAILED INSTRUCTIONS:
- Create notes that could be valuable to revisit years from now
- Focus on timeless principles, mental models, and insights rather than specific examples or temporary facts
- Each note should capture one complete idea in sufficient detail to make sense on its own
- Phrase notes as complete statements, not as topics or questions
- Make them precise, clear, and with a strong "insight density"
- Avoid superficial observations or simple summaries - dig deeper for insights
- Use clear, accessible language that will be understandable without the original context
- Include enough context so the note can stand alone
- Prioritize unique, counter-intuitive or profound insights over obvious ones

You MUST generate EXACTLY {notes_to_generate} notes, no more and no less.

Return your response in JSON format with an array of notes. For example if generating one note:
{{"notes": [
  {{"content": "Your note content here"}}
]}}

Or if generating multiple notes:
{{"notes": [
  {{"content": "First note content here"}},
  {{"content": "Second note content here"}},
  {{"content": "Third note content here"}}
]}}"""

        # Bound before the try so the error handler can tell "the request
        # failed" apart from "a reply arrived but could not be processed".
        response = None

        # Generate notes
        try:
            response = self.client.chat.completions.create(
                model=note_model,
                messages=[
                    {"role": "system", "content": "You are an expert at knowledge synthesis and creating high-quality evergreen notes in the style of Andy Matuschak. You extract deep, nuanced insights from content and express them as standalone, atomic conceptual units that will remain valuable over time."},
                    {"role": "user", "content": prompt}
                ],
                response_format={"type": "json_object"},
                temperature=0.7
            )

            # Parse the JSON response
            response_content = response.choices[0].message.content
            try:
                notes_data = json.loads(response_content)
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON: {e}")
                print(f"Raw response: {response_content[:100]}...")
                notes = [placeholder_note] * notes_to_generate
            else:
                # Handle different possible response formats
                if isinstance(notes_data, list):
                    notes = [note.get("content", "") for note in notes_data]
                elif "notes" in notes_data and isinstance(notes_data["notes"], list):
                    notes = [note.get("content", "") for note in notes_data["notes"]]
                else:
                    # Unexpected shape: collect any "content" field one level down
                    notes = []
                    for value in notes_data.values():
                        if isinstance(value, dict) and "content" in value:
                            notes.append(value["content"])
                        elif isinstance(value, list):
                            notes.extend(
                                item["content"]
                                for item in value
                                if isinstance(item, dict) and "content" in item
                            )

                # If we still have no notes, create a fallback
                if not notes:
                    print(f"Warning: Could not parse notes from response: {response_content[:100]}...")
                    notes = [placeholder_note] * notes_to_generate

            # Handle edge case where fewer notes were returned than requested
            if len(notes) < notes_to_generate:
                print(f"Warning: Only received {len(notes)} notes instead of {notes_to_generate}.")
                notes.extend(["Note content unavailable."] * (notes_to_generate - len(notes)))
        except Exception as e:
            print(f"Error generating notes: {e}")
            # Salvage the raw reply text when a reply exists at all
            raw_content = None
            if response is not None:
                try:
                    raw_content = response.choices[0].message.content
                except Exception:
                    raw_content = None

            if isinstance(raw_content, str) and len(raw_content) > 50:
                # Substantial text came back: keep it as a single note and pad
                notes = [raw_content]
                notes.extend(
                    [f"Additional note for chapter: {chapter_title}"]
                    * (notes_to_generate - len(notes))
                )
            else:
                # Fallback: generic placeholder notes
                notes = [placeholder_note] * notes_to_generate

        # Generate titles for each note
        titled_notes = []
        for note in notes:
            # Generate a title using the lighter model to save costs
            title_prompt = f"""Create a concise, descriptive title (3-7 words) for this evergreen note:

NOTE CONTENT: {note}

TITLE GUIDELINES:
- Make it specific enough to distinguish it from other notes
- Use language that resonates with the note's key insight
- Phrase it as a noun phrase or complete thought, not a question
- Avoid generic words like "Introduction" or "Overview"
- Include the most distinctive concept from the note
- Make it memorable and easy to reference
- Aim for clarity over cleverness

Respond with a JSON object containing only a 'title' field, like this:
{{"title": "Your Title Here"}}"""

            try:
                title_response = self.client.chat.completions.create(
                    model=title_model,
                    messages=[
                        {"role": "system", "content": "You are an expert at creating precise, distinctive titles for evergreen notes in the Zettelkasten tradition. Your titles are specific, conceptual, and optimized for future discoverability."},
                        {"role": "user", "content": title_prompt}
                    ],
                    response_format={"type": "json_object"},
                    temperature=0.5
                )

                try:
                    title_data = json.loads(title_response.choices[0].message.content)
                    title = title_data.get("title", f"Note on {chapter_title}")
                except json.JSONDecodeError:
                    # If JSON parsing fails, try to extract title with regex
                    content = title_response.choices[0].message.content
                    match = re.search(r'"title"\s*:\s*"([^"]+)"', content)
                    if match:
                        title = match.group(1)
                    else:
                        title = f"Note on {chapter_title}"
                    print(f"Extracted title: {title}")
            except Exception as e:
                print(f"Error generating title: {e}")
                title = f"Note on {chapter_title}"

            titled_notes.append({"title": title, "content": note})

        # Return at most the requested number of notes
        return titled_notes[:notes_to_generate]

    def save_notes(self, chapter_contents, page_numbers):
        """Generate notes for all chapters and write them to ``self.output_dir``.

        Writes "00_notes_index.md", one Markdown file per note and
        "book_metadata.md". A chapter whose processing raises is reported and
        skipped. Returns the total number of notes generated.
        """
        all_notes = []
        chapter_metadata = []

        # Create index file
        index_path = os.path.join(self.output_dir, "00_notes_index.md")
        with open(index_path, "w", encoding="utf-8") as index_file:
            index_file.write("# Evergreen Notes Index\n\n")

            # Process each chapter
            for i, (chapter_title, content) in enumerate(chapter_contents.items(), 1):
                print(f"\nProcessing Chapter {i}: {chapter_title}")
                page = page_numbers.get(chapter_title, "Unknown")
                chapter_metadata.append({"number": i, "title": chapter_title, "page": page})

                # Generate notes for this chapter
                try:
                    chapter_notes = self.generate_evergreen_notes(chapter_title, content)

                    # Add chapter info to notes
                    for note in chapter_notes:
                        note["chapter"] = chapter_title
                        note["chapter_number"] = i
                        note["page"] = page
                        all_notes.append(note)

                    # Add chapter header to the index file
                    index_file.write(f"### Chapter {i}: {chapter_title}\n\n")

                    for j, note in enumerate(chapter_notes, 1):
                        # Combine chapter number, chapter title, and note title for the filename
                        combined_title = f"Chapter {i}: {chapter_title} - {note['title']}"
                        # The "NN_NN_" prefix keeps names unique even when the
                        # sanitised title is truncated.
                        note_filename = f"{i:02d}_{j:02d}_{self.safe_filename(combined_title)}.md"

                        # Only write exactly the number of notes per chapter requested
                        if j <= self.notes_per_chapter:
                            index_file.write(f"{i}.{j} [{combined_title}](./{note_filename})\n\n")

                            # Save individual note file
                            note_path = os.path.join(self.output_dir, note_filename)
                            with open(note_path, "w", encoding="utf-8") as note_file:
                                note_file.write(f"# {combined_title}\n\n")
                                note_file.write(f"*Page {page}*\n\n")
                                note_file.write(note['content'])
                                note_file.write("\n")

                    print(f"  Generated {len(chapter_notes)} notes")

                except Exception as e:
                    print(f"  Error generating notes for chapter: {e}")

        # Create metadata file
        meta_path = os.path.join(self.output_dir, "book_metadata.md")
        with open(meta_path, "w", encoding="utf-8") as meta_file:
            meta_file.write(f"# Book: {os.path.basename(self.pdf_path)}\n\n")
            meta_file.write(f"* Total Chapters: {len(chapter_contents)}\n")
            meta_file.write(f"* Total Notes: {len(all_notes)}\n\n")

            meta_file.write("## Chapter List\n\n")
            for chapter in chapter_metadata:
                meta_file.write(f"{chapter['number']}. {chapter['title']} (Page {chapter['page']})\n")

        return len(all_notes)

    def safe_filename(self, text):
        """Convert text to a safe filename fragment of at most 50 characters."""
        # Keep word characters, whitespace and hyphens; spaces become underscores
        safe = re.sub(r'[^\w\s-]', '', text).strip().replace(' ', '_')
        # Limit length to avoid overly long filenames
        return safe[:50]

    def create_zip_archive(self):
        """Zip the output directory and try to start a Colab download of the archive."""
        # Create a zip file
        zip_filename = f"{self.output_dir}.zip"

        # Shell out to zip; both names derive from the fixed output directory name
        os.system(f"zip -r {zip_filename} {self.output_dir}")

        # Make files available for download in Colab (best effort)
        try:
            files.download(zip_filename)
            print(f"Download started for {zip_filename}")
        except Exception as e:
            print(f"Error during download: {e}")
            print("If download doesn't start automatically, use the file browser to download the files.")

    def process_book(self):
        """Main processing function: TOC extraction, splitting, note generation, zipping."""
        print(f"Processing {self.pdf_path}...")

        # Step 1: Extract initial text for TOC analysis
        initial_text = self.extract_initial_text()

        # Step 2: Use LLM to extract chapter list
        print("Extracting chapters using OpenAI...")
        chapters = self.get_toc_from_llm(initial_text)

        print(f"\nFound {len(chapters)} chapters:")
        for idx, chap in enumerate(chapters, 1):
            print(f"{idx}. {chap}")

        # Step 3: Split content by chapters
        print("\nExtracting chapter contents...")
        chapter_contents = self.split_chapters(chapters)

        # Step 4: Estimate page numbers
        print("Estimating page numbers...")
        page_numbers = self.estimate_page_numbers(chapters)

        # Step 5: Generate and save notes
        print(f"\nGenerating {self.notes_per_chapter} notes per chapter...")
        total_notes = self.save_notes(chapter_contents, page_numbers)

        # Step 6: Create zip archive
        print("\nCreating downloadable ZIP archive...")
        self.create_zip_archive()

        print(f"\nProcessing complete! Generated {total_notes} notes across {len(chapter_contents)} chapters.")
        print(f"Notes saved in the '{self.output_dir}' folder.")


def run_in_colab(pdf_path, api_key, notes_per_chapter=3, use_mixed_models=True):
    """Build an EvergreenNotesGenerator for ``pdf_path`` and process the book.

    Args:
        pdf_path (str): Path to the PDF file.
        api_key (str): OpenAI API key.
        notes_per_chapter (int): Number of notes to generate per chapter.
        use_mixed_models (bool): Forwarded to the generator unchanged.
    """
    settings = {
        "pdf_path": pdf_path,
        "api_key": api_key,
        "notes_per_chapter": notes_per_chapter,
        "use_mixed_models": use_mixed_models,
    }
    EvergreenNotesGenerator(**settings).process_book()


# Main execution code
if __name__ == "__main__":
    import getpass
    import os
    import subprocess
    import sys

    from google.colab import files

    try:
        # Try to import PyPDF2, and if that fails, try pypdf
        import PyPDF2 as pdf_library
    except ImportError:
        try:
            import pypdf as pdf_library
        except ImportError:
            # Neither is available, so install one. pip.main() is not part of
            # pip's supported API; run pip as a subprocess of this interpreter.
            subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2"])
            import PyPDF2 as pdf_library

    # Upload PDF file
    print("Please upload your PDF book:")
    uploaded = files.upload()
    if not uploaded:
        # Cancelling the upload dialog leaves nothing to process
        raise SystemExit("No file was uploaded; nothing to process.")
    pdf_filename = next(iter(uploaded))

    # Get OpenAI API key without echoing it, so the secret is not stored in
    # the notebook's saved cell output
    api_key = getpass.getpass("Enter your OpenAI API key: ").strip()

    # Set number of notes per chapter (blank or invalid input -> default of 3)
    raw_count = input("How many notes per chapter? (default: 3): ").strip()
    try:
        notes_per_chapter = int(raw_count) if raw_count else 3
    except ValueError:
        print(f"'{raw_count}' is not a whole number; using the default of 3.")
        notes_per_chapter = 3

    # Ask if user wants to use mixed models (anything other than 'n' means yes)
    use_mixed = input("Use GPT-4o-mini for title generation to save costs? (y/n, default: y): ").strip().lower() != 'n'

    # Run the generator
    print(f"Starting to process {pdf_filename} with {notes_per_chapter} notes per chapter...")
    run_in_colab(pdf_filename, api_key, notes_per_chapter=notes_per_chapter, use_mixed_models=use_mixed)

Please upload your PDF book:


Saving Jacques Philippe - Interior Freedom-Scepter Publishers (2010).pdf to Jacques Philippe - Interior Freedom-Scepter Publishers (2010) (5).pdf
Enter your OpenAI API key: [REDACTED - a live key was captured in this saved output; revoke it and clear notebook outputs before sharing]
How many notes per chapter? (default: 3): 1
Use GPT-4o-mini for title generation to save costs? (y/n, default: y): y
Starting to process Jacques Philippe - Interior Freedom-Scepter Publishers (2010) (5).pdf with 1 notes per chapter...
Processing Jacques Philippe - Interior Freedom-Scepter Publishers (2010) (5).pdf...
Extracting chapters using OpenAI...



Found 33 chapters:
1. The Search for Freedom
2. Accepting Ourselves
3. Accepting Suffering
4. Accepting Other People
5. Freedom and the Present Moment
6. “To Love” Has a Present Tense Only
7. We Can Suffer for Only One Moment
8. “Let the Day’s Own Trouble Be Sufficient for the Day”
9. Tomorrow Can Take Care of Itself
10. Live, Instead of Waiting to Live
11. Availability to Other People
12. Psychological Time and Interior Time
13. The Theological Virtues
14. The Three Outpourings of the Holy Spirit
15. Vocation and the Gift of Faith
16. St. Peter’s Tears, and the Gift of Hope
17. Pentecost and the Gift of Charity
18. The Fire That Lights Up, Burns, and Transfigures
19. The Dynamism of the Theological Virtues
20. Love Needs Hope; Hope is Based on Faith
21. The Key Role of Hope
22. Dynamism of Sin, Dynamism of Grace
23. Hope and Purity of Heart
24. Law and Grace
25. “Where the Spirit Rules, There is Freedom.” The Difference Between Freedom and Licentiousness
26. The Trap of the Law
27. L

  Generated 1 notes

Processing Chapter 2: Accepting Ourselves


  Generated 1 notes

Processing Chapter 3: Accepting Suffering


  Generated 1 notes

Processing Chapter 4: Accepting Other People


  Generated 1 notes

Processing Chapter 5: Freedom and the Present Moment


  Generated 1 notes

Processing Chapter 6: “To Love” Has a Present Tense Only


  Generated 1 notes

Processing Chapter 7: We Can Suffer for Only One Moment


  Generated 1 notes

Processing Chapter 8: “Let the Day’s Own Trouble Be Sufficient for the Day”


  Generated 1 notes

Processing Chapter 9: Tomorrow Can Take Care of Itself


  Generated 1 notes

Processing Chapter 10: Live, Instead of Waiting to Live


  Generated 1 notes

Processing Chapter 11: Availability to Other People


  Generated 1 notes

Processing Chapter 12: Psychological Time and Interior Time


  Generated 1 notes

Processing Chapter 13: The Theological Virtues


  Generated 1 notes

Processing Chapter 14: The Three Outpourings of the Holy Spirit


  Generated 1 notes

Processing Chapter 15: Vocation and the Gift of Faith


  Generated 1 notes

Processing Chapter 16: St. Peter’s Tears, and the Gift of Hope


  Generated 1 notes

Processing Chapter 17: Pentecost and the Gift of Charity


  Generated 1 notes

Processing Chapter 18: The Fire That Lights Up, Burns, and Transfigures


  Generated 1 notes

Processing Chapter 19: The Dynamism of the Theological Virtues


  Generated 1 notes

Processing Chapter 20: Love Needs Hope; Hope is Based on Faith


  Generated 1 notes

Processing Chapter 21: The Key Role of Hope


  Generated 1 notes

Processing Chapter 22: Hope and Purity of Heart


  Generated 1 notes

Processing Chapter 23: Law and Grace


  Generated 1 notes

Processing Chapter 24: The Trap of the Law


  Generated 1 notes

Processing Chapter 25: Learning to Love: Giving and Receiving Freely


  Generated 1 notes

Processing Chapter 26: The Need to Be


  Generated 1 notes

Processing Chapter 27: Pride and Spiritual Poverty


  Generated 1 notes

Processing Chapter 28: Spiritual Trials


  Generated 1 notes

Processing Chapter 29: Relying on Mercy Alone


  Generated 1 notes

Processing Chapter 30: The Truly Free Person is the One Who Has Nothing Left to Lose


  Generated 1 notes

Processing Chapter 31: Happy Are the Poor


  Generated 1 notes

Processing Chapter 32: Dynamism of Sin, Dynamism of Grace


  Generated 1 notes

Creating downloadable ZIP archive...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download started for evergreen_notes.zip

Processing complete! Generated 32 notes across 32 chapters.
Notes saved in the 'evergreen_notes' folder.


In [34]:
import PyPDF2 as pdf_library
import openai
import re
import os
import time
import json
from google.colab import files
import getpass

class EvergreenNotesGenerator:
    def __init__(self, pdf_path, api_key, notes_per_chapter=3, use_mixed_models=True):
        """
        Set up the generator and create its output folder.

        Args:
            pdf_path (str): Path to the PDF file
            api_key (str): OpenAI API key
            notes_per_chapter (int): Number of notes to generate per chapter
            use_mixed_models (bool): Whether to use mixed models for cost efficiency
        """
        self.pdf_path = pdf_path
        self.api_key = api_key
        self.notes_per_chapter = notes_per_chapter
        self.use_mixed_models = use_mixed_models
        self.client = openai.OpenAI(api_key=api_key)

        # Two model tiers: the primary one, and a lighter one for less intensive tasks
        self.primary_model, self.lighter_model = "gpt-4o", "gpt-4o-mini"

        # Folder-friendly book name: file stem with punctuation removed and
        # spaces turned into underscores
        stem, _extension = os.path.splitext(os.path.basename(pdf_path))
        without_punctuation = re.sub(r'[^\w\s-]', '', stem)
        self.book_folder = without_punctuation.strip().replace(' ', '_')

        # The timestamp suffix keeps folders from separate runs apart
        run_stamp = time.strftime("%Y%m%d_%H%M%S")
        self.output_dir = "_".join(("evergreen_notes", self.book_folder, run_stamp))
        os.makedirs(self.output_dir, exist_ok=True)

    def extract_initial_text(self, pages=10):
        """Return the text of the first ``pages`` pages, used for TOC detection.

        Each page's text is followed by a newline; fewer pages are read when
        the PDF is shorter than ``pages``.
        """
        reader = pdf_library.PdfReader(self.pdf_path)
        page_limit = min(pages, len(reader.pages))
        return "".join(
            reader.pages[index].extract_text() + "\n" for index in range(page_limit)
        )

    def get_toc_from_llm(self, initial_text):
        """Use LLM to extract table of contents"""
        model = self.lighter_model  # Use lighter model for TOC extraction

        prompt = f"""You're given the initial pages of a book. Clearly list ONLY the chapters from the table of contents exactly as they appear. Number them clearly.

TEXT:
{initial_text}

Return ONLY the numbered list of chapters."""

        response = self.client.chat.completions.create(
            model=model,
            messages=[{"role": "system", "content": "Extract chapters accurately from tables of contents."},
                    {"role": "user", "content": prompt}],
            temperature=0
        )

        toc_raw = response.choices[0].message.content.strip()
        chapters = [re.sub(r'^\d+[.)]\s*', '', line).strip() for line in toc_raw.splitlines() if line.strip()]
        return chapters

    def split_chapters(self, chapters):
        """Split the PDF into per-chapter text using each title's first occurrence.

        A chapter starts on the first page whose text contains its title and
        runs up to (not including) the start page of the next located chapter
        in TOC order; the last chapter runs to the end of the document.

        Args:
            chapters (list[str]): Chapter titles in table-of-contents order.

        Returns:
            dict[str, str]: Chapter title -> extracted text, for the chapters
            whose title was found somewhere in the PDF.
        """
        reader = pdf_library.PdfReader(self.pdf_path)
        total_pages = len(reader.pages)
        print(f"PDF has {total_pages} pages")

        # Extract every page exactly once and reuse the text for both the
        # title search and the content assembly. The `or ""` is a defensive
        # guard for pages that yield no text.
        page_texts = [reader.pages[page_num].extract_text() or "" for page_num in range(total_pages)]

        # First, find the approximate page numbers for each chapter
        chapter_pages = {}
        for page_num, page_text in enumerate(page_texts):
            for chapter in chapters:
                if chapter not in chapter_pages and chapter in page_text:
                    chapter_pages[chapter] = page_num
                    print(f"Found chapter '{chapter}' on page {page_num+1}")

        # Keep chapters in their TOC order (not by page number)
        sorted_chapters = [chapter for chapter in chapters if chapter in chapter_pages]

        missing_chapters = [c for c in chapters if c not in chapter_pages]
        if missing_chapters:
            print(f"Warning: Could not find these chapters: {', '.join(missing_chapters)}")

        # Extract content for each chapter
        # NOTE(review): when the next chapter was located on the same or an
        # earlier page (e.g. every title first matches on the TOC page), the
        # range below is empty and the chapter text is "" - confirm whether
        # that is acceptable.
        chapter_contents = {}
        for i, chapter in enumerate(sorted_chapters):
            start_page = chapter_pages[chapter]

            # For the end page, use the next chapter's start or the end of the document
            if i < len(sorted_chapters) - 1:
                end_page = chapter_pages[sorted_chapters[i + 1]]
            else:
                end_page = total_pages

            chapter_contents[chapter] = "".join(
                page_texts[page_num] + "\n" for page_num in range(start_page, end_page)
            )
            print(f"Extracted content for '{chapter}' from pages {start_page+1}-{end_page}")

        print(f"Successfully processed {len(chapter_contents)}/{len(chapters)} chapters")
        return chapter_contents

    def generate_evergreen_notes(self, chapter_title, chapter_content):
        """Generate evergreen notes for a single chapter using structured outputs"""
        if self.use_mixed_models:
            note_model = self.primary_model
            title_model = self.lighter_model
        else:
            note_model = title_model = self.primary_model

        # Make sure we don't try to generate more notes than requested
        notes_to_generate = min(self.notes_per_chapter, 1)

        # Comprehensive prompt for generating high-quality evergreen notes
        prompt = f"""Generate exactly {notes_to_generate} evergreen notes following Andy Matuschak's approach from the following chapter content.

CHAPTER: {chapter_title}

CONTENT:
{chapter_content[:50000]}

EVERGREEN NOTE PRINCIPLES:
1. Each note should be atomic - about one thing and one thing only, capturing that concept completely
2. Each note should be concept-oriented rather than fact-oriented
3. Each note should make a clear, declarative assertion
4. Notes should be written for yourself, not for an audience
5. Notes should be densely linked (though links will be added elsewhere)

DETAILED INSTRUCTIONS:
- Create notes that contain a single, clear, complete idea that will remain valuable for years
- Focus on timeless principles and mental models, not temporary facts
- Each note should represent a unit of knowledge that can stand on its own
- Aim for insight density - avoid obvious observations or simple summaries
- Write from a position of understanding, not just collecting information
- Make the note complete enough that your future self would understand it without the book

You MUST generate EXACTLY {notes_to_generate} notes, no more and no less.

Return your response in JSON format with an array of notes. For example:
{{"notes": [
  {{"content": "Your note content here - several paragraphs that fully explore the idea"}}
]}}"""

        # Generate notes
        try:
            response = self.client.chat.completions.create(
                model=note_model,
                messages=[
                    {"role": "system", "content": "You are an expert at knowledge synthesis and creating high-quality evergreen notes in the style of Andy Matuschak. You extract deep, nuanced insights from content and express them as standalone, atomic conceptual units that will remain valuable over time."},
                    {"role": "user", "content": prompt}
                ],
                response_format={"type": "json_object"},
                temperature=0.7
            )

            # Parse the JSON response
            try:
                response_content = response.choices[0].message.content
                notes_data = json.loads(response_content)

                # Handle different possible response formats
                if isinstance(notes_data, list):
                    notes = [note.get("content", "") for note in notes_data]
                elif "notes" in notes_data and isinstance(notes_data["notes"], list):
                    notes = [note.get("content", "") for note in notes_data["notes"]]
                else:
                    # If we can't find notes in the expected format, look for any content field
                    notes = []
                    for key, value in notes_data.items():
                        if isinstance(value, dict) and "content" in value:
                            notes.append(value["content"])
                        elif isinstance(value, list):
                            for item in value:
                                if isinstance(item, dict) and "content" in item:
                                    notes.append(item["content"])

                # If we still have no notes, create a fallback
                if not notes:
                    print(f"Warning: Could not parse notes from response: {response_content[:100]}...")
                    notes = [f"Note for chapter: {chapter_title}" for _ in range(notes_to_generate)]
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON: {e}")
                print(f"Raw response: {response_content[:100]}...")
                notes = [f"Note for chapter: {chapter_title}" for _ in range(self.notes_per_chapter)]

            # Handle edge case where fewer notes were returned than requested
            if len(notes) < self.notes_per_chapter:
                print(f"Warning: Only received {len(notes)} notes instead of {self.notes_per_chapter}.")
                # Pad with empty notes if necessary
                while len(notes) < self.notes_per_chapter:
                    notes.append("Note content unavailable.")
        except Exception as e:
            print(f"Error generating notes: {e}")
            # Try to directly extract content if possible
            try:
                content = response.choices[0].message.content
                if content and len(content) > 50:  # If we got some substantial content
                    # Just use the raw content as a single note
                    notes = [content]
                    # Pad with empty notes if necessary
                    while len(notes) < self.notes_per_chapter:
                        notes.append(f"Additional note for chapter: {chapter_title}")
                else:
                    # Fallback: generate generic placeholder notes
                    notes = [f"Note for chapter: {chapter_title}" for _ in range(notes_to_generate)]
            except:
                # Fallback if all else fails
                notes = [f"Note for chapter: {chapter_title}" for _ in range(notes_to_generate)]

        # Generate titles for each note
        titled_notes = []
        for note in notes:
            # Generate a title using the lighter model to save costs
            title_prompt = f"""Create a title for an evergreen note following Andy Matuschak's approach.

NOTE CONTENT: {note}

TITLE REQUIREMENTS:
1. Phrase as a complete declarative statement that makes a clear assertion
2. The title should function as an "API" for the note's content
3. Be specific and precise enough that someone could understand the core idea just from the title
4. Use active phrasing that emphasizes the claim being made
5. The title should be memorable and distinctive
6. Prefer positive framing over negative framing
7. Include the most distinctive concept from the note
8. Avoid questions or vague phrasings like "Introduction to X" or "Notes on Y"

Examples of excellent evergreen note titles:
- "Enabling environments focus on doing things rather than understanding things"
- "Evergreen notes permit smooth incremental progress in writing"
- "Spaced repetition systems can be used to program attention"
- "People seem to have different intrinsic capacities for different kinds of thinking"
- "Knowledge work should accrete"

Respond with a JSON object containing only a 'title' field, like this:
{{"title": "Your declarative title that serves as an API for the note"}}"""

            try:
                title_response = self.client.chat.completions.create(
                    model=title_model,
                    messages=[
                        {"role": "system", "content": "You are an expert at creating precise, distinctive titles for evergreen notes in the Zettelkasten tradition. Your titles are specific, conceptual, and optimized for future discoverability."},
                        {"role": "user", "content": title_prompt}
                    ],
                    response_format={"type": "json_object"},
                    temperature=0.5
                )

                try:
                    title_data = json.loads(title_response.choices[0].message.content)
                    title = title_data.get("title", f"Note on {chapter_title}")
                except json.JSONDecodeError:
                    # If JSON parsing fails, try to extract title with regex
                    content = title_response.choices[0].message.content
                    match = re.search(r'"title"\s*:\s*"([^"]+)"', content)
                    if match:
                        title = match.group(1)
                    else:
                        title = f"Note on {chapter_title}"
                    print(f"Extracted title: {title}")
            except Exception as e:
                print(f"Error generating title: {e}")
                title = f"Note on {chapter_title}"

            titled_notes.append({"title": title, "content": note})

        # Return exactly the requested number of notes
        return titled_notes[:self.notes_per_chapter]

    def save_notes(self, chapter_contents, page_numbers):
        """Generate notes for every chapter and write all output files.

        Writes into ``self.output_dir``: an index (``00_index.md``), one
        Markdown file per note, ``book_metadata.md``, and the consolidated
        ``all_notes.md`` / ``all_notes.txt``. A chapter whose note generation
        raises is reported on stdout and skipped.

        Args:
            chapter_contents (dict): Chapter title -> chapter text.
            page_numbers (dict): Chapter title -> page; missing titles are
                recorded as "Unknown".

        Returns:
            int: Number of notes collected across all chapters.
        """
        collected_notes = []
        chapter_rows = []

        # Pieces of the consolidated document, joined with newlines at the end
        merged_parts = [
            f"# Evergreen Notes from {os.path.basename(self.pdf_path)}\n",
            f"Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n",
        ]

        # The index stays open while chapters are processed so entries can be
        # appended as notes are produced
        with open(os.path.join(self.output_dir, "00_index.md"), "w", encoding="utf-8") as index_file:
            index_file.write("# Evergreen Notes\n\n")
            index_file.write(f"Source: {os.path.basename(self.pdf_path)}\n\n")
            index_file.write(f"Created: {time.strftime('%Y-%m-%d')}\n\n")
            index_file.write("## Notes\n\n")

            for chapter_no, entry in enumerate(chapter_contents.items(), start=1):
                chapter_title, content = entry
                print(f"\nProcessing Chapter {chapter_no}: {chapter_title}")
                page = page_numbers.get(chapter_title, "Unknown")
                chapter_rows.append({"number": chapter_no, "title": chapter_title, "page": page})

                # Chapter heading goes into both the index and the consolidated text
                index_file.write(f"### Chapter {chapter_no}: {chapter_title}\n\n")
                merged_parts.append(f"## Chapter {chapter_no}: {chapter_title} (Page {page})\n\n")

                try:
                    chapter_notes = self.generate_evergreen_notes(chapter_title, content)

                    for note_no, note in enumerate(chapter_notes, start=1):
                        note["chapter"] = chapter_title
                        note["chapter_number"] = chapter_no
                        note["page"] = page
                        collected_notes.append(note)

                        combined_title = f"Chapter {chapter_no}: {chapter_title} - {note['title']}"
                        note_filename = f"{chapter_no:02d}_{note_no:02d}_{self.safe_filename(combined_title)}.md"

                        # Surplus notes are counted but not written anywhere
                        if note_no > self.notes_per_chapter:
                            continue

                        index_file.write(f"{chapter_no}.{note_no} [{combined_title}](./{note_filename})\n\n")

                        # Individual note file
                        with open(os.path.join(self.output_dir, note_filename), "w", encoding="utf-8") as note_file:
                            note_file.writelines((
                                f"# {combined_title}\n\n",
                                f"*Page {page}*\n\n",
                                note['content'],
                                "\n",
                            ))

                        # Same note in the consolidated text
                        merged_parts.append(f"### {combined_title}\n\n")
                        merged_parts.append(note['content'] + "\n\n")

                    print(f"  Generated {len(chapter_notes)} notes")

                except Exception as e:
                    print(f"  Error generating notes for chapter: {e}")

        # Book-level metadata
        with open(os.path.join(self.output_dir, "book_metadata.md"), "w", encoding="utf-8") as meta_file:
            meta_file.write(f"# Book: {os.path.basename(self.pdf_path)}\n\n")
            meta_file.write(f"* Process Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
            meta_file.write(f"* Total Chapters: {len(chapter_contents)}\n")
            meta_file.write(f"* Total Notes: {len(collected_notes)}\n\n")

            meta_file.write("## Chapter List\n\n")
            for row in chapter_rows:
                meta_file.write(f"{row['number']}. {row['title']} (Page {row['page']})\n")

        # Consolidated notes, identical content in Markdown and plain text
        merged_text = "\n".join(merged_parts)
        for consolidated_name in ("all_notes.md", "all_notes.txt"):
            with open(os.path.join(self.output_dir, consolidated_name), "w", encoding="utf-8") as consolidated_file:
                consolidated_file.write(merged_text)

        return len(collected_notes)


    def safe_filename(self, text):
        """Turn ``text`` into a filename fragment of at most 80 characters.

        Only the first 100 characters are considered. Everything except word
        characters, whitespace and hyphens is removed, surrounding whitespace
        is trimmed, and spaces become underscores. Empty or missing text
        yields "Untitled".
        """
        source = text if text else "Untitled"
        cleaned = re.sub(r'[^\w\s-]', '', source[:100])
        return cleaned.strip().replace(' ', '_')[:80]

    def create_zip_archive(self):
        """Create a ZIP archive of the output folder and offer it for download.

        The archive is named after the output directory
        (``<output_dir>.zip``), is written next to it, and contains the folder
        itself as its top-level entry. Problems are reported on stdout instead
        of being raised.
        """
        import shutil  # local import: only this method needs it

        output_dir = os.path.normpath(self.output_dir)
        zip_filename = f"{output_dir}.zip"

        # Build the archive in-process rather than interpolating the paths into
        # a shell command: folder names containing whitespace or shell
        # metacharacters would otherwise break (or be executed by) the shell.
        try:
            shutil.make_archive(
                output_dir,
                "zip",
                root_dir=os.path.dirname(output_dir) or os.curdir,
                base_dir=os.path.basename(output_dir),
            )
        except OSError as e:
            print(f"Error creating ZIP archive: {e}")
            return

        # Make files available for download in Colab
        try:
            files.download(zip_filename)
            print(f"Download started for {zip_filename}")
        except Exception as e:
            # Best effort: the archive is still available on disk
            print(f"Error during download: {e}")
            print("If download doesn't start automatically, use the file browser to download the files.")


    def generate_summary(self, max_words=1000):
        """
        Generate a concise summary from the saved evergreen notes, structured by themes.

        The individual note files in ``self.output_dir`` are read back in
        filename order, combined into one prompt, and sent to the primary
        model. The result is saved as ``book_summary.md`` in the output folder.

        Args:
            max_words (int): Approximate length requested from the model.

        Returns:
            str: Generated summary, or "Summary generation failed." when the
            request or the file write raises.
        """
        # Generated files that are not individual notes (including a summary
        # left over from an earlier run on the same folder)
        non_note_files = {"00_index.md", "all_notes.md", "book_metadata.md", "book_summary.md"}

        # Helper to load notes from markdown files in the output directory
        def load_notes():
            notes = []
            # Sort so notes reach the model in chapter order; os.listdir
            # returns entries in arbitrary order
            note_files = sorted(
                f for f in os.listdir(self.output_dir)
                if f.endswith(".md") and f not in non_note_files
            )

            for filename in note_files:
                filepath = os.path.join(self.output_dir, filename)
                with open(filepath, "r", encoding="utf-8") as f:
                    content = f.read()

                title_match = re.search(r"# (.+)", content)
                title = title_match.group(1).strip() if title_match else "Untitled"

                page_match = re.search(r"\*Page (\d+)\*", content)
                page = page_match.group(1).strip() if page_match else "Unknown"

                # Note files are named "<chapter>_<note>_Chapter_...", so the
                # chapter number is the leading digit run before the first "_"
                chapter_number_match = re.match(r"(\d+)_", filename)
                chapter_number = int(chapter_number_match.group(1)) if chapter_number_match else "Unknown"

                chapter_title_match = re.match(r"\d+_\d+_Chapter_\d+_(.+?)_-_", filename)
                chapter_title = chapter_title_match.group(1).replace("_", " ") if chapter_title_match else "Unknown Chapter"

                # Clean note content: drop the leading title/page header,
                # whether the page is a number or "Unknown"
                note_body = re.sub(r"\A# .+\n\n\*Page [^*\n]+\*\n\n", "", content, count=1).strip()

                notes.append({
                    "title": title,
                    "content": note_body,
                    "page": page,
                    "chapter": chapter_title,
                    "chapter_number": chapter_number
                })

            return notes

        # Load notes
        notes = load_notes()

        # Prepare aggregated notes content
        aggregated_notes = "\n\n".join(
            f"{note['title']} (Chapter {note['chapter_number']}): {note['content']}"
            for note in notes
        )

        # Create prompt for generating summary
        summary_prompt = f"""
        You have evergreen notes extracted from a book. Your task is to create a coherent, insightful summary of approximately {max_words} words.

        The summary must:

        1. Highlight the 3-5 core themes emerging from these notes.
        2. Reference explicitly the original evergreen note titles with corresponding chapter numbers.
        3. Create logical bridges between ideas, expanding briefly with examples or analogies when helpful.
        4. Be insightful, clear, and structured logically into sections based on themes.
        5. Include a short introduction and integrative conclusion.

        EVERGREEN NOTES CONTENT:
        {aggregated_notes}

        Return the summary in Markdown format clearly structured into an introduction, thematic sections, and conclusion.
        """

        # Call OpenAI API
        try:
            response = self.client.chat.completions.create(
                model=self.primary_model,
                messages=[
                    {"role": "system", "content": "You expertly synthesize knowledge into coherent, insightful, and concise summaries suitable for deep understanding."},
                    {"role": "user", "content": summary_prompt}
                ],
                temperature=0.5,
                max_tokens=2000
            )

            summary = response.choices[0].message.content.strip()

            # Save summary to markdown file
            summary_path = os.path.join(self.output_dir, "book_summary.md")
            with open(summary_path, "w", encoding="utf-8") as f:
                f.write(summary)

            print(f"Summary generated and saved to {summary_path}")

        except Exception as e:
            print(f"Error generating summary: {e}")
            summary = "Summary generation failed."

        return summary

    def process_book(self):
        """Run the full pipeline for one book, including the summary.

        Reads the opening pages, asks the LLM for the chapter list, extracts
        the chapter texts, estimates page numbers, generates and saves the
        notes, writes the book summary as plain text, and builds the
        downloadable ZIP archive. Progress is reported on stdout; nothing is
        returned.
        """
        print(f"Processing {self.pdf_path}...")

        # Chapter discovery: the LLM only sees the opening pages of the book
        front_matter = self.extract_initial_text()
        print("Extracting chapters using OpenAI...")
        chapters = self.get_toc_from_llm(front_matter)

        print(f"\nFound {len(chapters)} chapters:")
        for position, chapter_name in enumerate(chapters, start=1):
            print(f"{position}. {chapter_name}")

        # Chapter text and page lookup
        print("\nExtracting chapter contents...")
        chapter_contents = self.extract_chapters_from_pdf(chapters)
        print("Estimating page numbers...")
        page_numbers = self.estimate_page_numbers(chapters)

        # Note generation
        print(f"\nGenerating {self.notes_per_chapter} notes per chapter...")
        total_notes = self.save_notes(chapter_contents, page_numbers)

        # Summary: generate_summary() returns the text, which is stored here
        # as a plain-text copy
        print("\nGenerating book summary...")
        summary = self.generate_summary()
        summary_txt_path = os.path.join(self.output_dir, "book_summary.txt")
        with open(summary_txt_path, "w", encoding="utf-8") as summary_txt_file:
            summary_txt_file.write(summary)
        print(f"Summary saved to {os.path.basename(summary_txt_path)}")

        # Packaging
        print("\nCreating downloadable ZIP archive...")
        self.create_zip_archive()

        processed_chapters = len(chapter_contents)
        print(f"\nProcessing complete! Generated {total_notes} notes across {processed_chapters} chapters.")
        print(f"Notes and summary saved in the '{self.output_dir}' folder.")

    @staticmethod
    def run_in_colab(pdf_path, api_key, notes_per_chapter=3, use_mixed_models=True):
        """Run the Evergreen Notes Generator in Google Colab.

        Declared as a static method because it sits in the class body but
        takes no ``self``: without the decorator, calling it on an instance
        would pass the instance as ``pdf_path``. Calling it through the class
        works as before.

        Args:
            pdf_path (str): Path to the PDF file.
            api_key (str): OpenAI API key.
            notes_per_chapter (int): Number of notes to generate per chapter.
            use_mixed_models (bool): Whether to use mixed models for cost efficiency.
        """
        generator = EvergreenNotesGenerator(
            pdf_path=pdf_path,
            api_key=api_key,
            notes_per_chapter=notes_per_chapter,
            use_mixed_models=use_mixed_models
        )
        generator.process_book()

    # Robust LLM-driven PDF chapter extraction (defined directly in the EvergreenNotesGenerator class body)

    def extract_chapters_from_pdf(self, chapter_titles):
        """
        Extract the text of each chapter by asking the LLM for its page range.

        Text from the first (up to 30) and last (up to 5) pages is sent to
        self.lighter_model together with the chapter titles. The model replies
        with 1-based start/end pages per chapter, and the text of those pages
        is then read from the PDF.

        Args:
            chapter_titles (list): Chapter titles, in table-of-contents order

        Returns:
            dict: Chapter title -> extracted chapter text. A key is the entry
            of chapter_titles that the LLM's title was matched to, or the
            LLM's own title when no match exists. If the LLM call fails,
            yields no chapters, or yields malformed data, the result of
            fallback_extract_chapters(chapter_titles) is returned instead.
        """
        reader = pdf_library.PdfReader(self.pdf_path)
        total_pages = len(reader.pages)
        print(f"PDF has {total_pages} pages")

        def labelled_sample(page_indices):
            # Guard against extract_text() giving None so the literal word
            # "None" never ends up in the prompt.
            return "".join(
                f"--- PAGE {i+1} ---\n{reader.pages[i].extract_text() or ''}\n\n"
                for i in page_indices
            )

        def resolve_title(llm_title):
            # Map the LLM's title back onto the caller's titles. An exact
            # (case-insensitive) match wins over substring containment, so a
            # short title such as "AI" cannot capture "Responsible AI".
            wanted = llm_title.strip().lower()
            if not wanted:
                # "" is a substring of everything; never let a blank title
                # claim the first chapter.
                return llm_title
            for candidate in chapter_titles:
                if candidate.strip().lower() == wanted:
                    return candidate
            for candidate in chapter_titles:
                known = candidate.strip().lower()
                if known and (wanted in known or known in wanted):
                    return candidate
            return llm_title

        # Sample text from the first 30 pages
        sample_page_count = min(30, total_pages)
        beginning_text = labelled_sample(range(sample_page_count))

        # Sample text from the last 5 pages
        sample_end_count = min(5, total_pages)
        ending_text = labelled_sample(range(total_pages - sample_end_count, total_pages))

        numbered_titles = ', '.join(f"{i+1}. {title}" for i, title in enumerate(chapter_titles))

        # Prepare the prompt for LLM (samples are truncated to bound prompt size)
        prompt = f"""
    You are analyzing a PDF book. Provide EXACT start and end page numbers for each chapter title listed below.

    Book Title: {os.path.basename(self.pdf_path)}
    Total Pages: {total_pages}

    Chapters from TOC:
    {numbered_titles}

    First {sample_page_count} pages:
    {beginning_text[:10000]}

    Last {sample_end_count} pages:
    {ending_text[:5000]}

    Return JSON ONLY in this format:
    {{
      "chapters": [
        {{ "title": "Chapter Title", "start_page": 1, "end_page": 10 }},
        ...
      ]
    }}"""

        try:
            print("Using LLM to identify chapter page ranges...")
            response = self.client.chat.completions.create(
                model=self.lighter_model,
                messages=[
                    {"role": "system", "content": "Identify PDF chapter boundaries with high accuracy."},
                    {"role": "user", "content": prompt}
                ],
                response_format={"type": "json_object"},
                temperature=0
            )

            mappings = json.loads(response.choices[0].message.content)
            chapter_mappings = mappings.get("chapters", [])

            if not chapter_mappings:
                print("No chapters found, reverting to fallback.")
                return self.fallback_extract_chapters(chapter_titles)

            print(f"Identified page ranges for {len(chapter_mappings)} chapters.")

            chapter_contents = {}
            for mapping in chapter_mappings:
                title = mapping["title"]
                # int() tolerates page numbers returned as "12" or 12.0;
                # anything unparseable raises and triggers the fallback below.
                first_page = int(mapping["start_page"])
                last_page = int(mapping["end_page"])

                # Convert the 1-based inclusive range into a 0-based half-open
                # range clamped to the document, always covering >= 1 page.
                start_page = max(0, min(first_page - 1, total_pages - 1))
                end_page = max(start_page + 1, min(last_page, total_pages))

                page_texts = []
                for page_num in range(start_page, end_page):
                    try:
                        page_texts.append((reader.pages[page_num].extract_text() or "") + "\n")
                    except Exception as e:
                        # Best effort: skip the unreadable page, keep the rest
                        print(f"  Text extraction error on page {page_num}: {e}")

                chapter_contents[resolve_title(title)] = "".join(page_texts)

            # Report in table-of-contents order so the message is deterministic
            missing_chapters = [t for t in chapter_titles if t not in chapter_contents]
            if missing_chapters:
                print(f"Chapters missing from extraction: {', '.join(missing_chapters)}")

            return chapter_contents

        except Exception as e:
            print(f"LLM extraction failed: {e}\nUsing fallback method.")
            return self.fallback_extract_chapters(chapter_titles)

    def fallback_extract_chapters(self, chapter_titles):
        """
        Fallback chapter extraction: split the PDF's text into equal slices.

        The text of all pages is joined (pages with no text are skipped) and
        cut into len(chapter_titles) character ranges of equal size; the last
        range also takes any remainder. Slices are assigned to the titles in
        order, so the boundaries are positional, not real chapter boundaries.

        Args:
            chapter_titles (list): Chapter titles, in the order their text
                should be assigned

        Returns:
            dict: Chapter title -> stripped slice of the book text. Empty when
            chapter_titles is empty.
        """
        print("Fallback extraction: dividing PDF into equal sections...")

        num_chapters = len(chapter_titles)
        if num_chapters == 0:
            # No titles to assign text to; returning early also avoids a
            # division by zero when computing the section size.
            print("Divided text into 0 sections.")
            return {}

        reader = pdf_library.PdfReader(self.pdf_path)
        # Extract each page once, then drop pages that produced no text
        page_texts = (page.extract_text() for page in reader.pages)
        full_text = "\n".join(text for text in page_texts if text)
        total_length = len(full_text)
        section_size = total_length // num_chapters

        chapter_contents = {}
        for i, title in enumerate(chapter_titles):
            start = i * section_size
            # The final section absorbs the characters left over by the
            # integer division above.
            end = start + section_size if i < num_chapters - 1 else total_length
            chapter_contents[title] = full_text[start:end].strip()

        print(f"Divided text into {num_chapters} sections.")
        return chapter_contents

    # Bind the two extraction functions onto the class so instances can call
    # them as regular methods.
    EvergreenNotesGenerator.extract_chapters_from_pdf = extract_chapters_from_pdf
    EvergreenNotesGenerator.fallback_extract_chapters = fallback_extract_chapters

    print("Methods 'extract_chapters_from_pdf' and 'fallback_extract_chapters' successfully attached.")

    def estimate_page_numbers(self, chapter_titles):
        """
        Estimate the 1-based page number on which each chapter begins.

        Pages are scanned in order; a title is assigned the first page whose
        extracted text contains it verbatim (case-sensitive), with at most one
        new title recorded per page. Titles never found get a position-based
        guess that spreads them evenly through the document.

        Args:
            chapter_titles (list): Chapter titles, in table-of-contents order

        Returns:
            dict: Chapter title -> page number (always >= 1)
        """
        reader = pdf_library.PdfReader(self.pdf_path)
        total_pages = len(reader.pages)
        page_numbers = {}

        # For each page, check if it contains any chapter title
        for page_index, page in enumerate(reader.pages):
            # Treat a page that yields no text (None) as empty instead of
            # letting the substring test raise TypeError.
            page_text = page.extract_text() or ""

            for title in chapter_titles:
                if title in page_text and title not in page_numbers:
                    page_numbers[title] = page_index + 1  # 1-based page numbers
                    break  # only one new title is recorded per page

        # If we're missing any chapters after scanning, assign an estimated page
        pages_per_slot = total_pages // (len(chapter_titles) + 1)
        for idx, title in enumerate(chapter_titles):
            if title not in page_numbers:
                # Estimate from the position in the TOC. Clamp to 1 because the
                # integer division yields 0 when the PDF has fewer pages than
                # len(chapter_titles) + 1, and page 0 is invalid here.
                estimated_page = max(1, (idx + 1) * pages_per_slot)
                page_numbers[title] = estimated_page
                print(f"Estimated page for '{title}': {estimated_page}")

        return page_numbers

# Main execution code: interactive Colab driver that uploads a PDF, collects
# settings from the user, and runs the generator.
if __name__ == "__main__":
    import getpass
    import importlib
    import os
    import subprocess
    import sys

    from google.colab import files

    try:
        # Prefer PyPDF2; fall back to pypdf if only that is installed
        import PyPDF2 as pdf_library
    except ImportError:
        try:
            import pypdf as pdf_library
        except ImportError:
            # Neither is available, so install PyPDF2. pip.main() is no longer
            # part of pip (removed in pip 10); run pip as a module of the
            # current interpreter instead.
            subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2"])
            importlib.invalidate_caches()  # make the fresh install importable
            import PyPDF2 as pdf_library

    # Upload PDF file
    print("Please upload your PDF book:")
    uploaded = files.upload()
    if not uploaded:
        raise SystemExit("No PDF was uploaded; nothing to process.")
    pdf_filename = next(iter(uploaded))

    # Get OpenAI API key. getpass does not echo the value, so the secret is
    # not stored in the notebook's saved cell output.
    api_key = getpass.getpass("Enter your OpenAI API key: ")

    # Set number of notes per chapter; blank or invalid input uses the default
    raw_notes = input("How many notes per chapter? (default: 3): ").strip()
    try:
        notes_per_chapter = int(raw_notes) if raw_notes else 3
    except ValueError:
        notes_per_chapter = 0
    if notes_per_chapter < 1:
        print(f"'{raw_notes}' is not a positive whole number; using the default of 3.")
        notes_per_chapter = 3

    # Ask if user wants to use mixed models (anything other than 'n' means yes)
    use_mixed = input("Use GPT-4o-mini for title generation to save costs? (y/n, default: y): ").strip().lower() != 'n'

    # Run the generator
    print(f"Starting to process {pdf_filename} with {notes_per_chapter} notes per chapter...")
    run_in_colab(pdf_filename, api_key, notes_per_chapter=notes_per_chapter, use_mixed_models=use_mixed)

Methods 'extract_chapters_from_pdf' and 'fallback_extract_chapters' successfully attached.
Please upload your PDF book:


Saving Toju Duke_ Paolo Giudici - Responsible AI in Practice _ A Practical Guide to Safe and Human AI-Apress (2025).pdf to Toju Duke_ Paolo Giudici - Responsible AI in Practice _ A Practical Guide to Safe and Human AI-Apress (2025) (6).pdf
Enter your OpenAI API key: [REDACTED — this key was exposed in saved notebook output and must be revoked]
How many notes per chapter? (default: 3): 1
Use GPT-4o-mini for title generation to save costs? (y/n, default: y): y
Starting to process Toju Duke_ Paolo Giudici - Responsible AI in Practice _ A Practical Guide to Safe and Human AI-Apress (2025) (6).pdf with 1 notes per chapter...
Processing Toju Duke_ Paolo Giudici - Responsible AI in Practice _ A Practical Guide to Safe and Human AI-Apress (2025) (6).pdf...
Extracting chapters using OpenAI...



Found 8 chapters:
1. Responsible AI and AI Governance
2. Accuracy
3. Robustness
4. Explainability
5. Fairness and Human Rights
6. Privacy
7. Sustainability
8. Human-Centered AI

Extracting chapter contents...
PDF has 194 pages
Using LLM to identify chapter page ranges...


Identified page ranges for 8 chapters.
Estimating page numbers...
Estimated page for 'Responsible AI and AI Governance': 21
Estimated page for 'Fairness and Human Rights': 105

Generating 1 notes per chapter...

Processing Chapter 1: Responsible AI and AI Governance


  Generated 1 notes

Processing Chapter 2: Accuracy


  Generated 1 notes

Processing Chapter 3: Robustness


  Generated 1 notes

Processing Chapter 4: Explainability


  Generated 1 notes

Processing Chapter 5: Fairness and Human Rights


  Generated 1 notes

Processing Chapter 6: Privacy


  Generated 1 notes

Processing Chapter 7: Sustainability


  Generated 1 notes

Processing Chapter 8: Human-Centered AI


  Generated 1 notes

Generating book summary...


Summary generated and saved to evergreen_notes_Toju_Duke__Paolo_Giudici_-_Responsible_AI_in_Practice___A_Practical_Guide_to_Safe_and_Human_AI-Apress_2025_6_20250323_143800/book_summary.md
Summary saved to book_summary.txt

Creating downloadable ZIP archive...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download started for evergreen_notes_Toju_Duke__Paolo_Giudici_-_Responsible_AI_in_Practice___A_Practical_Guide_to_Safe_and_Human_AI-Apress_2025_6_20250323_143800.zip

Processing complete! Generated 8 notes across 8 chapters.
Notes and summary saved in the 'evergreen_notes_Toju_Duke__Paolo_Giudici_-_Responsible_AI_in_Practice___A_Practical_Guide_to_Safe_and_Human_AI-Apress_2025_6_20250323_143800' folder.
