In [2]:
import os
import sys
from getpass import getpass
import nest_asyncio
from IPython.display import Markdown, display

# Patch asyncio so event loops can be nested. Presumably needed because the
# notebook kernel already runs a loop and later cells use async clients -- confirm.
nest_asyncio.apply()


In [3]:
import os
import json
from typing import Dict, List, Optional, Tuple
from notion_client import Client
import pandas as pd
from datetime import datetime

class NotionProcessor:
    """Flatten a Notion database into ``{'properties', 'content'}`` records.

    Each page is split at its level-1 headings: every such heading yields one
    record holding the page's extracted properties (plus a ``header`` entry)
    and the normalized text found beneath the heading. Databases embedded in a
    page are processed recursively, with the parent page's properties merged
    into every child page.
    """

    def __init__(self, auth_token: str):
        """Initialize the Notion client with authentication token."""
        self.notion = Client(auth=auth_token)

    def get_database_pages(self, database_id: str, parent_properties: Optional[Dict] = None) -> List[Dict]:
        """
        Retrieve all pages from a Notion database, following pagination.

        Args:
            database_id: ID of the database to query.
            parent_properties: Optional already-extracted properties of a parent
                page. When given (and non-empty) they are merged in place into
                every returned page via ``_merge_parent_properties``.

        Returns:
            The page objects from all result pages, in the order received.
        """
        pages: List[Dict] = []
        cursor = None

        while True:
            # Only send start_cursor once we actually have one; the first
            # request must not carry a null cursor.
            query_args = {'database_id': database_id}
            if cursor is not None:
                query_args['start_cursor'] = cursor
            response = self.notion.databases.query(**query_args)

            results = response['results']
            if parent_properties:
                for page in results:
                    self._merge_parent_properties(page, parent_properties)
            pages.extend(results)

            # Stop when there is nothing more, or when no cursor was supplied
            # (otherwise the first page would be requested forever).
            if not response.get('has_more') or not response.get('next_cursor'):
                break
            cursor = response['next_cursor']

        return pages

    def _merge_parent_properties(self, page: Dict, parent_properties: Dict):
        """
        Merge parent properties into ``page['properties']`` in place.

        Rules per key:
            * ``Name``: if the child has a non-empty title, it becomes
              ``"<parent name> - <child name>"``; otherwise nothing changes.
            * ``Description``: never inherited; the child's own value is kept.
            * ``Tags``: union of the parent's comma-separated tags and the
              child's multi-select tags, sorted. A child without a ``Tags``
              property inherits the parent's tags.
            * Anything else: copied as a rich-text property, but only when the
              child does not already define that key.

        Args:
            page: Raw Notion page object; mutated in place.
            parent_properties: Flat ``{name: value}`` mapping as produced by
                ``extract_properties``.
        """
        child_props = page['properties']

        for key, value in parent_properties.items():
            if key == 'Name':
                # Handle name merging only if child page has a name
                if 'Name' in child_props:
                    child_name = self._get_rich_text_content(child_props['Name'].get('title', []))
                    if child_name:
                        merged_name = f"{value} - {child_name}"
                        child_props['Name'] = {
                            'type': 'title',
                            'title': [{
                                'type': 'text',
                                'text': {'content': merged_name},
                                'plain_text': merged_name
                            }]
                        }
            elif key == 'Description':
                # Skip Description property - keep child's description if it exists
                continue
            elif key == 'Tags':
                # Merge tags, removing duplicates
                parent_tags = set(value.split(', ')) if value else set()
                tags_prop = child_props.get('Tags')
                if tags_prop is not None:
                    child_tags = set(tag['name'] for tag in tags_prop.get('multi_select', []))
                else:
                    child_tags = set()
                merged_tags = parent_tags.union(child_tags)
                # process_page leaves 'Tags' to this method, so a child with no
                # Tags property must still receive the parent's tags here.
                if tags_prop is not None or merged_tags:
                    child_props['Tags'] = {
                        'type': 'multi_select',
                        'multi_select': [{'name': tag} for tag in sorted(merged_tags)]
                    }
            else:
                # For all other properties, inherit from parent if not present in child
                if key not in child_props:
                    child_props[key] = {
                        'type': 'rich_text',
                        'rich_text': [{
                            'type': 'text',
                            'text': {'content': str(value)},
                            'plain_text': str(value)
                        }]
                    }

    def extract_properties(self, page: Dict) -> Dict:
        """
        Extract a flat ``{name: value}`` mapping from a page's properties.

        Handled property types: title and rich_text (normalized text), select
        (option name), multi_select (sorted names joined by ``', '``), date
        (start value), number and checkbox (raw value). Empty select/date
        values and all other property types are omitted from the result.
        """
        properties = {}

        for prop_name, prop_data in page['properties'].items():
            prop_type = prop_data['type']

            if prop_type == 'title':
                properties[prop_name] = self._get_rich_text_content(prop_data['title'])
            elif prop_type == 'rich_text':
                properties[prop_name] = self._get_rich_text_content(prop_data['rich_text'])
            elif prop_type == 'select':
                if prop_data['select']:
                    properties[prop_name] = prop_data['select']['name']
            elif prop_type == 'multi_select':
                # Convert multi-select to comma-separated string
                properties[prop_name] = ', '.join(sorted(item['name'] for item in prop_data['multi_select']))
            elif prop_type == 'date':
                if prop_data['date']:
                    properties[prop_name] = prop_data['date']['start']
            elif prop_type in ['number', 'checkbox']:
                properties[prop_name] = prop_data[prop_type]

        return properties

    def _collapse_whitespace(self, text: str) -> str:
        """Collapse every whitespace run (newlines included) to one space and drop spaces before colons."""
        return ' '.join(text.split()).replace(' :', ':')

    def _normalize_text(self, text: str) -> str:
        """
        Normalize text content line by line:
        1. Replace runs of whitespace inside a line with a single space
        2. Remove spaces before colons
        3. Strip leading/trailing whitespace of every line
        4. Remove empty lines

        Line breaks between non-empty lines are preserved.
        """
        # Split on newlines *before* collapsing whitespace: str.split() with no
        # argument also swallows '\n', which would flatten everything to one line.
        cleaned_lines = []
        for line in text.split('\n'):
            cleaned_line = self._collapse_whitespace(line)
            if cleaned_line:  # Only keep non-empty lines
                cleaned_lines.append(cleaned_line)

        return '\n'.join(cleaned_lines)

    def _get_rich_text_content(self, rich_text: List) -> str:
        """Join the ``plain_text`` of every rich-text fragment with spaces and normalize the result."""
        text = ' '.join([fragment['plain_text'] for fragment in rich_text if fragment.get('plain_text')])
        return self._normalize_text(text)

    def get_block_children(self, block_id: str, level: int = 0) -> List[Tuple[Dict, int]]:
        """
        Retrieve all descendant blocks of a block, depth first, with nesting levels.

        Children of ``child_database`` blocks are not expanded here; those
        databases are handled separately in ``process_page``.

        Args:
            block_id: ID of the block (or page) whose children are listed.
            level: Nesting level assigned to the direct children.

        Returns:
            ``(block, level)`` tuples; each block is followed immediately by
            its own descendants.
        """
        blocks: List[Tuple[Dict, int]] = []
        cursor = None

        while True:
            # As in get_database_pages: never send a null start_cursor.
            list_args = {'block_id': block_id}
            if cursor is not None:
                list_args['start_cursor'] = cursor
            response = self.notion.blocks.children.list(**list_args)

            for block in response['results']:
                blocks.append((block, level))

                if block.get('has_children') and block['type'] != 'child_database':
                    blocks.extend(self.get_block_children(block['id'], level + 1))

            if not response.get('has_more') or not response.get('next_cursor'):
                break
            cursor = response['next_cursor']

        return blocks

    def process_blocks(self, blocks: List[Tuple[Dict, int]]) -> Tuple[Dict, List[str]]:
        """
        Group block text into sections delimited by level-1 headings.

        Only headings, paragraphs, bulleted/numbered list items and
        ``child_database`` blocks contribute text; other block types are
        ignored. Sub-headers (level 2+) become plain lines of their section.
        Content that appears before the first level-1 heading is discarded,
        and a heading with no content produces no section.

        Args:
            blocks: ``(block, level)`` tuples as returned by ``get_block_children``.

        Returns:
            ``(headers, content_sections)`` where ``headers`` maps a level-1
            heading text to the index of its text in ``content_sections``.
            If the same heading text occurs twice, the later index wins.
        """
        current_header = None
        current_content = []
        headers = {}
        content_sections = []
        current_bullet_group = []

        def flush_bullets():
            """Move the pending bullet group, if any, into the current section."""
            nonlocal current_bullet_group
            if current_bullet_group:
                current_content.append(self._merge_bullet_group(current_bullet_group))
                current_bullet_group = []

        def save_current_section():
            """Store the current section's content under its header."""
            flush_bullets()

            if current_header is not None and current_content:
                # Join content and normalize the entire section
                section_content = self._normalize_text('\n'.join(filter(None, current_content)))
                content_sections.append(section_content)
                headers[current_header] = len(content_sections) - 1

        for block, level in blocks:
            block_type = block['type']

            # Handle headers
            if block_type.startswith('heading_'):
                header_text = self._get_rich_text_content(block[block_type]['rich_text'])
                header_level = int(block_type[-1])

                if header_level == 1:
                    # Save current section before starting new one
                    save_current_section()
                    current_content = []
                    current_header = header_text
                else:
                    # Treat sub-headers as text content with line break
                    flush_bullets()
                    current_content.append(f"{header_text}\n")

            # Handle child database
            elif block_type == 'child_database':
                flush_bullets()
                current_content.append(f"[Database: {block['id']}]")

            # Handle bullet points and numbered lists
            elif block_type in ['bulleted_list_item', 'numbered_list_item']:
                text_content = self._get_rich_text_content(block[block_type]['rich_text'])

                if level == 0:
                    # A top-level item closes the previous group and opens a new one.
                    flush_bullets()
                    current_bullet_group = [(text_content, level)]
                else:
                    current_bullet_group.append((text_content, level))

            # Handle regular paragraphs
            elif block_type == 'paragraph':
                flush_bullets()

                text_content = self._get_rich_text_content(block[block_type]['rich_text'])
                if text_content:
                    current_content.append(text_content)

        # Save final section
        save_current_section()

        return headers, content_sections

    def _merge_bullet_group(self, bullet_group: List[Tuple[str, int]]) -> str:
        """
        Merge a group of bullets into one line per top-level bullet.

        Nested bullets (level > 0) are appended inline, space separated, to the
        preceding top-level bullet. Returns ``""`` for an empty group.
        """
        if not bullet_group:
            return ""

        result = []
        current_main_bullet = []

        for text, level in bullet_group:
            if level == 0:
                if current_main_bullet:
                    result.append(self._collapse_whitespace(' '.join(current_main_bullet)))
                current_main_bullet = [text]
            else:
                current_main_bullet.append(text)

        if current_main_bullet:
            # Collapse fully (newlines too) so each bullet stays on a single line.
            result.append(self._collapse_whitespace(' '.join(current_main_bullet)))

        return '\n'.join(result)

    def process_page(self, page: Dict, parent_properties: Optional[Dict] = None) -> List[Dict]:
        """
        Process a single page and, recursively, the databases embedded in it.

        Args:
            page: Raw Notion page object.
            parent_properties: Optional extracted properties of the parent
                page. Keys other than Name/Description/Tags that the page does
                not define itself are copied into the page's properties.

        Returns:
            One ``{'properties': ..., 'content': ...}`` record per level-1
            section of the page, followed by the records of all pages of its
            child databases.
        """
        results = []

        # Extract properties before any merging
        properties = self.extract_properties(page)

        # Merge with parent properties if they exist
        if parent_properties:
            # Name/Description/Tags are handled by _merge_parent_properties during
            # the database query; only fill in the remaining missing keys here.
            for key, value in parent_properties.items():
                if key not in ['Name', 'Description', 'Tags'] and key not in properties:
                    properties[key] = value

        # Process page blocks
        blocks = self.get_block_children(page['id'])
        headers, content_sections = self.process_blocks(blocks)

        # Create entries for each section
        for header, section_index in headers.items():
            section_properties = properties.copy()
            section_properties['header'] = header

            if 0 <= section_index < len(content_sections):
                results.append({
                    'properties': section_properties,
                    'content': content_sections[section_index]
                })

        # Process child databases
        for block, _ in blocks:
            if block['type'] == 'child_database':
                child_pages = self.get_database_pages(block['id'], properties)
                for child_page in child_pages:
                    results.extend(self.process_page(child_page, properties))

        return results

    def process_database(self, database_id: str) -> List[Dict]:
        """Process every page of a database and return the combined section records."""
        processed_data = []
        pages = self.get_database_pages(database_id)

        for page in pages:
            processed_data.extend(self.process_page(page))

        return processed_data

In [4]:
# Define environment variables
# Notion credentials are read leniently (None when unset).
NOTION_TOKEN = os.getenv("NOTION_TOKEN")
DATABASE_ID = os.getenv("NOTION_DATABASE_ID")

# os.getenv returns None for a missing variable, so the interactive prompt is
# reached both when CO_API_KEY is unset and when it is empty. Indexing
# os.environ directly would raise KeyError before the fallback could run.
CO_API_KEY = os.getenv("CO_API_KEY") or getpass("Enter your Cohere API key: ")

# Required settings: a missing variable raises KeyError immediately.
QDRANT_URL = os.environ['QDRANT_URL']
QDRANT_API_KEY = os.environ['QDRANT_API_KEY']

OPENAI_API_KEY = os.environ['OPENAI_API_KEY']


In [20]:
from llama_index.llms.cohere import Cohere
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.cohere import CohereEmbedding
from qdrant_client import QdrantClient, AsyncQdrantClient
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.qdrant import QdrantVectorStore

# Qdrant collection used by the vector store below.
COLLECTION_NAME = "Notion_vector_store"

# Keyword arguments for the OpenAI chat model. The system prompt is spelled
# with explicit escapes so its exact whitespace is visible.
model_kwargs = dict(
    model="gpt-4o-mini",
    temperature=0,
    api_key=OPENAI_API_KEY,
    system_prompt=(
        "Use ONLY the provided context and generate a complete, coherent answer to the user's query. \n"
        "                Your response must be grounded in the provided context and relevant to the essence of the user's query.\n"
        "                "
    ),
)

# Chat models: one per provider.
llm_openai = OpenAI(**model_kwargs)
llm_cohere = Cohere(model="command-r-plus", api_key=CO_API_KEY)

# Embedding models: one per provider.
embed_model_openai = OpenAIEmbedding(model="text-embedding-3-large", api_key=OPENAI_API_KEY)
embed_model_cohere = CohereEmbedding(model="embed-english-v3.0", api_key=CO_API_KEY)

# Sync and async Qdrant clients pointing at the same server, both handed to
# the vector store.
client = QdrantClient(location=QDRANT_URL, api_key=QDRANT_API_KEY)
aclient = AsyncQdrantClient(location=QDRANT_URL, api_key=QDRANT_API_KEY)

vector_store = QdrantVectorStore(
    client=client,
    aclient=aclient,
    collection_name=COLLECTION_NAME,
)



Both client and aclient are provided. If using `:memory:` mode, the data between clients is not synced.


In [26]:
# Process the database

from llama_index.core import Document
from llama_index.readers.notion import NotionPageReader

# Both values are read with os.getenv and are None when unset; fail here with
# a clear message instead of passing None on to the Notion client.
if not NOTION_TOKEN or not DATABASE_ID:
    raise ValueError(
        "NOTION_TOKEN and NOTION_DATABASE_ID must be set in the environment."
    )

processor = NotionProcessor(NOTION_TOKEN)
processed_data = processor.process_database(DATABASE_ID)

# One Document per extracted section: section text as the body, the page
# properties (plus the section header) as metadata.
documents = [Document(text=record['content'], metadata=record['properties']) for record in processed_data]
# Alternative loader, kept for reference:
# documents = NotionPageReader(integration_token=NOTION_TOKEN).load_data(
#     database_ids=[DATABASE_ID]
# )

In [36]:
# Ingest to Vector Database

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

sentence_splitter = SentenceSplitter(
    chunk_size=512,  # in tokens
    chunk_overlap=16,  # in tokens
    paragraph_separator="\n\n\n",
)

# Transformations run in order: split documents into chunks, then embed them.
transforms = [
    sentence_splitter,
    embed_model_openai,
]
# Backward-compatible alias for the original (misspelled) variable name.
tranforms = transforms

# Supply the documents exactly once. Passing them to the constructor AND to
# run() makes the pipeline concatenate both inputs, so every document would be
# split, embedded and written to the vector store twice.
nodes = IngestionPipeline(
    transformations=transforms,
    vector_store=vector_store,
).run(documents=documents)



In [38]:
# Build index over vector database
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex

# Retrieval / synthesis settings.
# NOTE(review): this dict is defined but not passed to as_query_engine() below,
# which receives only `llm`, so these settings are currently not applied.
query_engine_kwargs = dict(
    llm=llm_openai,
    embed_model=embed_model_openai,
    response_mode="compact",
    similarity_top_k=15,
    vector_store_query_mode="mmr",
    vector_store_kwargs={"mmr_threshold": 0.42},
)

# Storage context over the vector store; only the commented-out
# VectorStoreIndex(...) alternative below refers to it.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index on top of the already-populated vector store.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model_openai,
)

# Alternative ways of building the index, kept for reference:
# index = VectorStoreIndex.from_documents(
#     documents=documents,
#     embed_model=embed_model_openai,
#     vector_store=vector_store
#     )

# index = VectorStoreIndex(
#     documents,
#     show_progress=True,
#     store_nodes_override=True,
#     transformation=[sentence_splitter],
#     embed_model=embed_model_openai,
#     storage_context=storage_context,
# )

query_engine = index.as_query_engine(llm=llm_openai)
#query_engine = index.as_chat_engine()
#query_engine = index.as_retriever()



Generating embeddings: 100%|██████████| 13/13 [00:01<00:00, 11.00it/s]


In [43]:
# Display the first entry of `documents` as a sanity check.
documents[0]


Document(id_='f932adb4-0fd0-4fcb-9030-a55d8c4eae45', embedding=None, metadata={'page_id': 'f932adb4-0fd0-4fcb-9030-a55d8c4eae45'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="\nProject Overview:\nThe Automated Sports Betting Tracker and Data Extractor Project involves the development of a set of Python scripts to automate the extraction and processing of betting details from HTML files provided by two bookmakers, Bet365 and Fanduel. The project is designed to parse HTML content, extract relevant betting information, process the data, and save it into a structured CSV format for further analysis or reporting. See the GitHub for full details on the project.\nTasks Performed:\nInitial Setup and Configuration:\n\tCreated a configuration script (config.py) to define data models, classification templates, and helper functions.\n\tSet up an environment for using OpenAI's API for further processing and classification of betting data.\nDevelopment of 

In [33]:
# Query Pipeline
from llama_index.core.query_pipeline import InputComponent
from llama_index.core.query_pipeline import QueryPipeline

# Two-step chain: the input component is listed first, the query engine second.
input_component = InputComponent()
chain = [input_component, query_engine]

# verbose=True is passed so each module run is printed.
query_pipeline = QueryPipeline(chain=chain, verbose=True)

In [34]:
# Query

# Question sent through the query pipeline.
query = "Name projects and their associated challenges"

# Run the pipeline; the query string is supplied as the `input` keyword.
response = query_pipeline.run(input=query)

# Print the response.
print(response)

[1;3;38;2;155;135;227m> Running module 744b5471-615e-4289-8d76-ef4d373713d5 with input: 
input: Name projects and their associated challenges

[0m[1;3;38;2;155;135;227m> Running module 074ee98f-ad24-4823-bcba-8297f44081e5 with input: 
input: Name projects and their associated challenges

[0mThe projects and their associated challenges are as follows:

1. **Project on Software Unit Tests Generation**:
   - **Challenges**:
     - **GPU Cluster Issues**: Faced difficulties in distributing the model across several GPUs, which was attributed to a driver incompatibility problem. The solution involved using a single GPU at reduced precision.
     - **Poor Data Quality**: Encountered discrepancies in the dataset, such as mismatched variable names and commands. This was addressed by flagging issues, selecting high-quality examples, and producing documentation with recommendations for fixing them.
     - **Reproducibility**: Ensured near-identical results by setting a seed for consistent "ra