# Initial Setup and Environment Variables

In [1]:
# Initial setup and configuration
# ------------------------------
# This notebook performs ETL operations on Notion data and sets up vector storage
# Required imports for core functionality
import os
import sys
import warnings
from getpass import getpass
import nest_asyncio
from IPython.display import Markdown, display
import openai

# Enable async code execution in notebook environment
# (the notebook kernel already runs an event loop; nest_asyncio allows nesting)
nest_asyncio.apply()

# Load and validate environment variables
# -------------------------------------
# Notion API credentials and database IDs
NOTION_TOKEN = os.getenv("NOTION_TOKEN")
PROJECTS_DATABASE_ID = os.getenv("NOTION_PROJECTS_DATABASE_ID")
EXPERIENCE_DATABASE_ID = os.getenv("NOTION_EXPERIENCE_DATABASE_ID")

# Vector store (Qdrant) connection details
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# The Notion settings have no interactive fallback, so an unset variable would
# otherwise stay None and only surface later as an opaque API error.
# Report every missing name up front instead.
_missing_notion_env = [
    env_name
    for env_name, env_value in {
        "NOTION_TOKEN": NOTION_TOKEN,
        "NOTION_PROJECTS_DATABASE_ID": PROJECTS_DATABASE_ID,
        "NOTION_EXPERIENCE_DATABASE_ID": EXPERIENCE_DATABASE_ID,
    }.items()
    if not env_value
]
if _missing_notion_env:
    warnings.warn(
        "Missing environment variables: "
        + ", ".join(_missing_notion_env)
        + ". The Notion extraction cells will fail until they are set.",
        stacklevel=2,
    )

# API keys for ML services
# Fallback to interactive prompt if env vars not set
CO_API_KEY = os.getenv("CO_API_KEY") or getpass("Enter your Cohere API key: ")
openai.api_key = os.getenv("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")

# Set up the LLM, Embeddings and Vector Store

In [3]:
# Import required libraries for LLM, embeddings and vector store functionality
from llama_index.embeddings.openai import OpenAIEmbedding
from qdrant_client import QdrantClient, AsyncQdrantClient
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.settings import Settings

# Collection names come from the environment (None when a variable is unset)
COLLECTION_TEXT, COLLECTION_KEYWORD = (
    os.getenv(env_name) for env_name in ("COLLECTION_TEXT", "COLLECTION_KEYWORD")
)

# OpenAI chat model: gpt-4o-mini with a low temperature for focused answers.
# It is registered as the global default LLM in the same statement.
Settings.llm = llm_openai = OpenAI(
    api_key=openai.api_key,
    model="gpt-4o-mini",
    temperature=0.3,
)

# OpenAI embedding model (text-embedding-3-large), also registered globally.
Settings.embed_model = embed_model_openai = OpenAIEmbedding(
    api_key=openai.api_key,
    model="text-embedding-3-large",
)

# Qdrant clients: a synchronous one and an asynchronous one, both pointed at
# the same endpoint with the same credentials.
client, aclient = (
    client_cls(location=QDRANT_URL, api_key=QDRANT_API_KEY)
    for client_cls in (QdrantClient, AsyncQdrantClient)
)

# One vector store per collection; both stores share the two clients above.
vector_store_text, vector_store_keywords = (
    QdrantVectorStore(client=client, aclient=aclient, collection_name=collection)
    for collection in (COLLECTION_TEXT, COLLECTION_KEYWORD)
)

Both client and aclient are provided. If using `:memory:` mode, the data between clients is not synced.
Both client and aclient are provided. If using `:memory:` mode, the data between clients is not synced.


# Set Up the Notion Processor

In [4]:
import os
import json
from typing import Dict, List, Optional, Tuple
from notion_client import Client
import pandas as pd
from datetime import datetime

class NotionProcessor:
    """
    A class to process Notion databases and pages, extracting structured content and metadata.
    Handles nested databases, headers, lists, and various text block types.
    """
    def __init__(self, auth_token: str):
        """Create the Notion API client that every other method goes through.

        Args:
            auth_token (str): Token passed to the Notion client as ``auth``
        """
        api_client = Client(auth=auth_token)
        self.notion = api_client
    
    def get_database_pages(self, database_id: str, parent_properties: Dict = None) -> List[Dict]:
        """
        Retrieve all pages from a Notion database with pagination support.
        
        Features:
        - Handles pagination automatically using Notion's cursor-based system
        - Optionally merges parent properties with each page's properties
        - Processes all pages in database before returning
        
        Args:
            database_id (str): Notion database ID to query
            parent_properties (Dict, optional): Properties to inherit from parent
        
        Returns:
            List[Dict]: All pages in database with merged properties
        """
        pages = []
        cursor = None
        
        while True:
            response = self.notion.databases.query(
                database_id=database_id,
                start_cursor=cursor
            )
            
            # If parent properties exist, merge them with each page
            if parent_properties:
                for page in response['results']:
                    self._merge_parent_properties(page, parent_properties)
            
            pages.extend(response['results'])
            
            if not response.get('has_more'):
                break
                
            cursor = response['next_cursor']
            
        return pages
    
    def _merge_parent_properties(self, page: Dict, parent_properties: Dict):
        """
        Merge parent database properties into individual page properties.
        
        Special handling for different property types:
        - Name: Combines parent and child names with separator
        - Description: Preserves child description over parent
        - Tags: Merges parent and child tags, removing duplicates
        - Other: Inherits parent property if not present in child
        
        Args:
            page (Dict): Page object to update
            parent_properties (Dict): Properties from parent database
        """
        for key, value in parent_properties.items():
            if key == 'Name':
                # Handle name merging only if child page has a name
                if 'Name' in page['properties']:
                    child_name = self._get_rich_text_content(page['properties']['Name'].get('title', []))
                    if child_name:
                        merged_name = f"{value} - {child_name}"
                        page['properties']['Name'] = {
                            'type': 'title',
                            'title': [{
                                'type': 'text',
                                'text': {'content': merged_name},
                                'plain_text': merged_name
                            }]
                        }
            elif key == 'Description':
                # Skip Description property - keep child's description if it exists
                continue
            elif key == 'Tags':
                # Merge tags, removing duplicates
                parent_tags = set(value.split(', ')) if value else set()
                if 'Tags' in page['properties']:
                    child_tags = set(tag['name'] for tag in page['properties']['Tags'].get('multi_select', []))
                    merged_tags = parent_tags.union(child_tags)
                    page['properties']['Tags'] = {
                        'type': 'multi_select',
                        'multi_select': [{'name': tag} for tag in sorted(merged_tags)]
                    }
            else:
                # For all other properties, inherit from parent if not present in child
                if key not in page['properties']:
                    page['properties'][key] = {
                        'type': 'rich_text',
                        'rich_text': [{
                            'type': 'text',
                            'text': {'content': str(value)},
                            'plain_text': str(value)
                        }]
                    }
    
    
    def extract_properties(self, page: Dict) -> Dict:
        """
        Extract and normalize page properties from Notion's API response.
        
        Handles various Notion property types:
        - title: Page titles
        - rich_text: Multi-line text
        - select: Single select options
        - multi_select: Multiple select options (converted to comma-separated string)
        - date: Date fields (extracts start date)
        - number/checkbox: Basic data types
        
        Args:
            page (Dict): Raw Notion page object
        
        Returns:
            Dict: Normalized properties with consistent data types
        """
        properties = {}
        
        for prop_name, prop_data in page['properties'].items():
            prop_type = prop_data['type']
            
            if prop_type == 'title':
                properties[prop_name] = self._get_rich_text_content(prop_data['title'])
            elif prop_type == 'rich_text':
                properties[prop_name] = self._get_rich_text_content(prop_data['rich_text'])
            elif prop_type == 'select':
                if prop_data['select']:
                    properties[prop_name] = prop_data['select']['name']
            elif prop_type == 'multi_select':
                # Convert multi-select to comma-separated string
                properties[prop_name] = ', '.join(sorted(item['name'] for item in prop_data['multi_select']))
            elif prop_type == 'date':
                if prop_data['date']:
                    properties[prop_name] = prop_data['date']['start']
            elif prop_type in ['number', 'checkbox']:
                properties[prop_name] = prop_data[prop_type]
                
        return properties
    
    def _normalize_text(self, text: str) -> str:
        """
        Normalize text content for consistent formatting.
        
        Performs the following operations:
        1. Replaces multiple spaces with single space
        2. Removes spaces before colons
        3. Strips whitespace from start/end of lines
        4. Removes empty lines
        5. Joins cleaned lines with newlines
        
        Args:
            text (str): Raw text content to normalize
        
        Returns:
            str: Cleaned and normalized text
        """
        # Replace multiple spaces with single space
        text = ' '.join(text.split())
        
        # Remove spaces before colons
        text = text.replace(' :', ':')
        
        # Split into lines and process each line
        lines = text.split('\n')
        cleaned_lines = []
        
        for line in lines:
            # Clean each line individually
            cleaned_line = line.strip()
            if cleaned_line:  # Only keep non-empty lines
                cleaned_lines.append(cleaned_line)
        
        # Join lines back together
        return '\n'.join(cleaned_lines)
    
    def _get_rich_text_content(self, rich_text: List) -> str:
        """
        Extract plain text content from Notion's rich text format.
        
        Features:
        - Combines multiple text segments
        - Preserves plain text content
        - Normalizes whitespace and formatting
        
        Args:
            rich_text (List): Notion rich text array
        
        Returns:
            str: Normalized plain text content
        """        
        text = ' '.join([text['plain_text'] for text in rich_text if text.get('plain_text')])
        return self._normalize_text(text)
    
    def get_block_children(self, block_id: str, level: int = 0) -> List[Tuple[Dict, int]]:
        """
        Recursively retrieve all child blocks of a given block.
        
        Features:
        - Handles nested block structure
        - Tracks nesting level for proper content organization
        - Supports pagination for large block collections
        - Skips recursion for child databases (handled separately)
        
        Args:
            block_id (str): ID of block to get children for
            level (int): Current nesting level (default: 0)
        
        Returns:
            List[Tuple[Dict, int]]: List of (block, nesting_level) pairs
        """
        blocks = []
        cursor = None
        
        while True:
            response = self.notion.blocks.children.list(
                block_id=block_id,
                start_cursor=cursor
            )
            
            for block in response['results']:
                blocks.append((block, level))
                
                if block.get('has_children'):
                    if block['type'] != 'child_database':
                        child_blocks = self.get_block_children(block['id'], level + 1)
                        blocks.extend(child_blocks)
            
            if not response.get('has_more'):
                break
                
            cursor = response['next_cursor']
            
        return blocks
    
    def process_blocks(self, blocks: List[Tuple[Dict, int]]) -> Tuple[Dict, List[str]]:
        """
        Process blocks to extract structured content organized by headers.
        
        Content organization:
        - Level 1 headers start new sections
        - Sub-headers are included in section content
        - Bullet points are grouped and merged
        - Paragraphs are added to current section
        
        Args:
            blocks: List of (block, level) tuples to process
        
        Returns:
            Tuple[Dict, List[str]]: 
                - Dict mapping headers to content section indices
                - List of processed content sections
        """
        current_header = None
        current_content = []
        headers = {}
        content_sections = []
        current_bullet_group = []
        
        def save_current_section():
            """Helper function to save current section's content."""
            nonlocal current_content, current_bullet_group, content_sections, current_header, headers
            
            if current_bullet_group:
                current_content.append(self._merge_bullet_group(current_bullet_group))
                current_bullet_group = []
            
            if current_header is not None and current_content:
                # Join content and normalize the entire section
                section_content = self._normalize_text('\n'.join(filter(None, current_content)))
                content_sections.append(section_content)
                headers[current_header] = len(content_sections) - 1
        
        for block, level in blocks:
            block_type = block['type']
            
            # Handle headers
            if block_type.startswith('heading_'):
                header_text = self._get_rich_text_content(block[block_type]['rich_text'])
                header_level = int(block_type[-1])
                
                if header_level == 1:
                    # Save current section before starting new one
                    save_current_section()
                    current_content = []
                    current_header = header_text
                else:
                    # Treat sub-headers as text content with line break
                    if current_bullet_group:
                        current_content.append(self._merge_bullet_group(current_bullet_group))
                        current_bullet_group = []
                    current_content.append(f"{header_text}\n")
            
            # Handle child database
            elif block_type == 'child_database':
                if current_bullet_group:
                    current_content.append(self._merge_bullet_group(current_bullet_group))
                    current_bullet_group = []
                current_content.append(f"[Database: {block['id']}]")
            
            # Handle bullet points and numbered lists
            elif block_type in ['bulleted_list_item', 'numbered_list_item']:
                text_content = self._get_rich_text_content(block[block_type]['rich_text'])
                
                if level == 0:
                    if current_bullet_group:
                        current_content.append(self._merge_bullet_group(current_bullet_group))
                        current_bullet_group = []
                    current_bullet_group = [(text_content, level)]
                else:
                    current_bullet_group.append((text_content, level))
            
            # Handle regular paragraphs
            elif block_type == 'paragraph':
                if current_bullet_group:
                    current_content.append(self._merge_bullet_group(current_bullet_group))
                    current_bullet_group = []
                
                text_content = self._get_rich_text_content(block[block_type]['rich_text'])
                if text_content:
                    current_content.append(text_content)
        
        # Save final section
        save_current_section()
        
        return headers, content_sections
    
    def _merge_bullet_group(self, bullet_group: List[Tuple[str, int]]) -> str:
        """
        Merge a group of bullet points into a single coherent text chunk.
        
        Handles nested bullet points by:
        - Keeping main (level 0) bullets as separate lines
        - Merging sub-bullets inline with their parent bullet
        - Preserving the hierarchical relationship in the final text
        
        Args:
            bullet_group: List of (text, level) tuples representing bullet hierarchy
        
        Returns:
            str: Merged bullet points as normalized text
        """
        if not bullet_group:
            return ""
        
        result = []
        current_main_bullet = []
        
        for text, level in bullet_group:
            if level == 0:
                if current_main_bullet:
                    result.append(self._normalize_text(' '.join(current_main_bullet)))
                current_main_bullet = [text]
            else:
                current_main_bullet.append(text)
        
        if current_main_bullet:
            result.append(self._normalize_text(' '.join(current_main_bullet)))
        
        return '\n'.join(result)
    
    def process_page_whole(self, page: Dict, parent_properties: Dict = None) -> List[Dict]:
        """
        Process a page as a single document without splitting by headers.
        
        Features:
        - Preserves headers as part of content
        - Converts bullets to text with bullet markers
        - Maintains paragraph structure
        - Recursively processes child databases
        
        Args:
            page (Dict): Notion page to process
            parent_properties (Dict, optional): Properties to inherit
        
        Returns:
            List[Dict]: List containing single document with full page content
        """
        results = []
        
        # Extract properties
        properties = self.extract_properties(page)
        
        # Merge with parent properties if they exist
        if parent_properties:
            for key, value in parent_properties.items():
                if key not in ['Name', 'Description', 'Tags'] and key not in properties:
                    properties[key] = value
        
        # Process page blocks
        blocks = self.get_block_children(page['id'])
        content_parts = []
        
        for block, _ in blocks:
            block_type = block['type']
            
            if block_type.startswith('heading_'):
                # Add headers as text with line breaks
                header_text = self._get_rich_text_content(block[block_type]['rich_text'])
                content_parts.append(f"{header_text}\n")
                
            elif block_type in ['bulleted_list_item', 'numbered_list_item']:
                text_content = self._get_rich_text_content(block[block_type]['rich_text'])
                content_parts.append(f"• {text_content}")
                
            elif block_type == 'paragraph':
                text_content = self._get_rich_text_content(block[block_type]['rich_text'])
                if text_content:
                    content_parts.append(text_content)
        
        # Combine all content
        full_content = self._normalize_text('\n'.join(content_parts))
        
        if full_content:
            results.append({
                'properties': properties,
                'content': full_content
            })
        
        # Process child databases
        for block, _ in blocks:
            if block['type'] == 'child_database':
                child_pages = self.get_database_pages(block['id'], properties)
                for child_page in child_pages:
                    results.extend(self.process_page_whole(child_page, properties))
        
        return results
    
    def process_page(self, page: Dict, parent_properties: Dict = None) -> List[Dict]:
        """
        Process a page by splitting content at headers.
        
        Features:
        - Creates separate chunks for each header section
        - Preserves header hierarchy in properties
        - Handles nested databases recursively
        - Merges inherited properties appropriately
        
        Args:
            page (Dict): Notion page to process
            parent_properties (Dict, optional): Properties to inherit
        
        Returns:
            List[Dict]: List of content chunks with associated properties
        """
        results = []
        
        # Extract properties before any merging
        properties = self.extract_properties(page)
        
        # Merge with parent properties if they exist
        if parent_properties:
            # Skip special properties handling here as it's done in _merge_parent_properties
            # Only handle properties that weren't merged during database query
            for key, value in parent_properties.items():
                if key not in ['Name', 'Description', 'Tags'] and key not in properties:
                    properties[key] = value
        
        # Process page blocks
        blocks = self.get_block_children(page['id'])
        headers, content_sections = self.process_blocks(blocks)
        
        # Create entries for each section
        for header, section_index in headers.items():
            section_properties = properties.copy()
            section_properties['header'] = header
            
            if 0 <= section_index < len(content_sections):
                results.append({
                    'properties': section_properties,
                    'content': content_sections[section_index]
                })
        
        # Process child databases
        for block, _ in blocks:
            if block['type'] == 'child_database':
                child_pages = self.get_database_pages(block['id'], properties)
                for child_page in child_pages:
                    results.extend(self.process_page(child_page, properties))
        
        return results

    def process_page_granular(self, page: Dict, parent_properties: Dict = None) -> List[Dict]:
        """
        Process a single page with granular text extraction for optimal chunking.
        
        Key features:
        - Plain text blocks are merged into one chunk per header section
        - List items are combined with their nested content into separate chunks
        - Headers are preserved as metadata properties
        - Handles nested databases recursively
        
        Args:
            page (Dict): Notion page object to process
            parent_properties (Dict, optional): Properties inherited from parent database
        
        Returns:
            List[Dict]: List of processed chunks with properties and content
        """
        results = []
        properties = self.extract_properties(page)
        
        # Merge with parent properties if they exist
        if parent_properties:
            for key, value in parent_properties.items():
                if key not in ['Name', 'Description', 'Tags'] and key not in properties:
                    properties[key] = value
        
        blocks = self.get_block_children(page['id'])
        current_header = "Main"
        current_text_chunk = []
        current_list = []
        in_list = False
        
        def save_text_chunk():
            """Helper to save accumulated text chunk"""
            nonlocal current_text_chunk, results, properties, current_header
            if current_text_chunk:
                chunk_properties = properties.copy()
                chunk_properties['header'] = current_header
                results.append({
                    'properties': chunk_properties,
                    'content': self._normalize_text('\n'.join(current_text_chunk))
                })
                current_text_chunk = []
        
        def save_list_chunk():
            """Helper to save accumulated list chunk"""
            nonlocal current_list, results, properties, current_header
            if current_list:
                chunk_properties = properties.copy()
                chunk_properties['header'] = current_header
                results.append({
                    'properties': chunk_properties,
                    'content': self._normalize_text('\n'.join(current_list))
                })
                current_list = []
        
        prev_level = 0
        for block, level in blocks:
            block_type = block['type']
            
            # Handle headers
            if block_type.startswith('heading_'):
                save_text_chunk()
                save_list_chunk()
                in_list = False
                current_header = self._get_rich_text_content(block[block_type]['rich_text'])
                
            # Handle list items
            elif block_type in ['bulleted_list_item', 'numbered_list_item']:
                text_content = self._get_rich_text_content(block[block_type]['rich_text'])
                
                # If this is a new list (not in a list or level decreased)
                if not in_list or level < prev_level:
                    save_text_chunk()
                    save_list_chunk()
                    current_list.append(text_content)
                    in_list = True
                else:
                    # Continue existing list
                    current_list.append(text_content)
                
                prev_level = level
                
            # Handle paragraphs
            elif block_type == 'paragraph':
                text_content = self._get_rich_text_content(block[block_type]['rich_text'])
                if text_content:
                    if in_list:
                        # If we're in a list, append to the current list item
                        current_list.append(text_content)
                    else:
                        # Otherwise, add to text chunk
                        current_text_chunk.append(text_content)
        
        # Save any remaining chunks
        save_text_chunk()
        save_list_chunk()
        
        # Process child databases
        for block, _ in blocks:
            if block['type'] == 'child_database':
                child_pages = self.get_database_pages(block['id'], properties)
                for child_page in child_pages:
                    results.extend(self.process_page_granular(child_page, properties))
        
        return results

    def process_database(self, database_id: str, extraction_mode: str = 'header') -> List[Dict]:
        """
        Process entire database and return structured data.
        
        Args:
            database_id (str): The ID of the Notion database to process
            extraction_mode (str): Controls how content is extracted and chunked:
                - 'header': splits pages by headers (default)
                - 'whole': processes each page as a single document
                - 'granular': extracts text blocks and list items separately
        
        Returns:
            List[Dict]: List of processed content chunks, each containing:
                - properties: Dict of page metadata
                - content: Extracted and normalized text content
        """
        processed_data = []
        pages = self.get_database_pages(database_id)
        
        for page in pages:
            if extraction_mode == 'header':
                processed_data.extend(self.process_page(page))
            elif extraction_mode == 'whole':
                processed_data.extend(self.process_page_whole(page))
            elif extraction_mode == 'granular':
                processed_data.extend(self.process_page_granular(page))
            else:
                raise ValueError("extraction_mode must be one of: 'header', 'whole', 'granular'")
            
        return processed_data

# Extract the data from the Notion database

In [5]:
# Projects: one chunk per level-1 header section of each project page.
# Each returned record is a dict with 'properties' and 'content'.
processor_projects = NotionProcessor(auth_token=NOTION_TOKEN)
processed_data_projects = processor_projects.process_database(
    PROJECTS_DATABASE_ID,
    extraction_mode='header',
)

In [6]:
# Work experience: every page is kept as a single document ('whole' mode),
# since these entries are short enough not to need splitting.
# Each returned record is a dict with 'properties' and 'content'.
processor_experience = NotionProcessor(auth_token=NOTION_TOKEN)
processed_data_experience = processor_experience.process_database(
    EXPERIENCE_DATABASE_ID,
    extraction_mode='whole',
)

## Process the data into LlamaIndex Documents

In [5]:
from llama_index.core import Document

# Metadata keys hidden from both the embedding text and the LLM context for
# project documents. All other properties (e.g. 'Employer', 'Description',
# 'Name', 'header') stay visible.
excluded_metadata_keys_projects = ['Project Size', 'When', 'Position', 'Tags']

# Wrap every processed project record in a Document, applying the same
# exclusion list to the embedding view and the LLM view.
documents_projects = list(
    map(
        lambda record: Document(
            text=record['content'],
            metadata=record['properties'],
            excluded_embed_metadata_keys=excluded_metadata_keys_projects,
            excluded_llm_metadata_keys=excluded_metadata_keys_projects,
        ),
        processed_data_projects,
    )
)

NameError: name 'processed_data_projects' is not defined

In [None]:
# Metadata keys hidden from both the embedding text and the LLM context for
# experience documents. Only 'When' is hidden; every other property stays visible.
excluded_metadata_keys_experience = ['When']

# Wrap every processed experience record in a Document, applying the same
# exclusion list to the embedding view and the LLM view.
documents_experience = list(
    map(
        lambda record: Document(
            text=record['content'],
            metadata=record['properties'],
            excluded_embed_metadata_keys=excluded_metadata_keys_experience,
            excluded_llm_metadata_keys=excluded_metadata_keys_experience,
        ),
        processed_data_experience,
    )
)

In [9]:
# Combine both document sets into one list: projects first, then experience
documents = [*documents_projects, *documents_experience]

In [6]:
# Verify the content and metadata that the LLM and Embedding model see
from llama_index.core.schema import MetadataMode

# Both previews render the first merged document; only the metadata mode differs,
# so any difference in output comes from the per-mode metadata exclusions.
print("The LLM sees this: \n", documents[0].get_content(metadata_mode=MetadataMode.LLM))
print("The Embedding model sees this: \n", documents[0].get_content(metadata_mode=MetadataMode.EMBED))

NameError: name 'documents' is not defined

# Produce Keywords to embed


In [11]:
# Import required llama-index components
from llama_index.core import PromptTemplate
from llama_index.core.extractors import KeywordExtractor
from llama_index.core.schema import Node, NodeRelationship, RelatedNodeInfo, MetadataMode
from llama_index.core.node_parser import SentenceSplitter
from prompts import KEYWORD_PROMPT
from typing import List, Tuple
from tqdm import tqdm

# Wrap the project's KEYWORD_PROMPT text in a llama-index PromptTemplate
keyword_prompt = PromptTemplate(KEYWORD_PROMPT)

# Shared keyword extractor used by extract_keywords() below.
# NOTE(review): confirm that the installed KeywordExtractor honours a `prompt`
# argument - the llama-index API reference names this option `prompt_template`.
keyword_extractor = KeywordExtractor(
    llm=llm_openai,                   # OpenAI LLM configured earlier in the notebook
    prompt=keyword_prompt,
    keywords=10,                      # Number of keywords requested per node
    metadata_mode=MetadataMode.NONE,  # Node content is read without its metadata
    is_text_node_only=True,           # Restrict extraction to text nodes
)

def extract_keywords(node: Node) -> str:
    """
    Extract keywords from a single node using the configured keyword extractor.

    Args:
        node: Input node containing text to extract keywords from

    Returns:
        The value stored under ``'excerpt_keywords'`` in the extractor's
        metadata for this node. This is one string holding the keywords as
        written by the LLM (not a list); callers use it directly as node text.

    Raises:
        KeyError: If the extractor returns no ``'excerpt_keywords'`` entry for
            the node (presumably when the node is skipped because
            ``is_text_node_only`` is set - verify against the extractor).

    Note:
        The extractor is called with exactly one node per invocation, so each
        call handles a single node.
    """
    # extract() returns one metadata dict per input node; only one node was sent
    node_metadata = keyword_extractor.extract(nodes=[node])[0]
    return node_metadata['excerpt_keywords']

def process_documents(documents: List[Document]) -> Tuple[List[Node], List[Node]]:
    """
    Split documents into text chunks and build one keyword node per chunk.

    Args:
        documents: Documents to split and extract keywords from

    Returns:
        A ``(text_nodes, keyword_nodes)`` tuple:
        - text_nodes: chunks produced by the sentence splitter
        - keyword_nodes: one node per text node, in the same order, whose text
          is the extracted keywords and whose PARENT relationship points at
          the originating text node

    Note:
        Keywords are extracted one text node at a time (one extractor call per
        chunk) while a tqdm progress bar reports progress.
        NOTE(review): confirm that the installed ``Node`` class accepts the
        ``text`` and ``node_id`` constructor arguments used here.
    """
    # Chunking: 512-unit chunks with a 32-unit overlap; a triple newline is
    # treated as the paragraph separator
    splitter = SentenceSplitter(
        chunk_size=512,
        chunk_overlap=32,
        paragraph_separator="\n\n\n",
    )
    text_nodes = splitter.get_nodes_from_documents(documents)

    # One keyword node per chunk, linked back to that chunk as its parent
    keyword_nodes = [
        Node(
            text=extract_keywords(chunk),
            node_id=f"kw_{chunk.node_id}",
            relationships={
                NodeRelationship.PARENT: RelatedNodeInfo(node_id=chunk.node_id)
            },
        )
        for chunk in tqdm(text_nodes, desc="Generating keywords")
    ]

    return text_nodes, keyword_nodes


In [None]:
# Chunk the merged documents and generate one keyword node per chunk.
# process_documents extracts keywords chunk by chunk and shows a tqdm
# progress bar while it runs.
text_nodes, keyword_nodes = process_documents(documents)

# Report how many nodes of each kind were produced. The two counts should
# match, because every text node yields exactly one keyword node.
node_counts = dict(
    text_nodes=len(text_nodes),
    keyword_nodes=len(keyword_nodes),
)
print(f"Created {node_counts['text_nodes']} text nodes and {node_counts['keyword_nodes']} keyword nodes")

In [None]:
# Import required llama-index components
from llama_index.core import VectorStoreIndex, ServiceContext, StorageContext
from llama_index.core.ingestion import IngestionPipeline

# Keyword side: bind the keyword vector store to a storage context, then
# index the keyword nodes into it so extracted keywords can be searched.
storage_context_keywords = StorageContext.from_defaults(vector_store=vector_store_keywords)
keyword_index = VectorStoreIndex(
    nodes=keyword_nodes,
    storage_context=storage_context_keywords,
    embed_model=embed_model_openai,
    show_progress=True,  # Progress bar while indexing
)

# Text side: same pattern for the document chunks themselves, so the actual
# content can be searched.
storage_context_text = StorageContext.from_defaults(vector_store=vector_store_text)
text_index = VectorStoreIndex(
    nodes=text_nodes,
    storage_context=storage_context_text,
    embed_model=embed_model_openai,
    show_progress=True,  # Progress bar while indexing
)
