In [1]:
import os
import sys
from getpass import getpass
import nest_asyncio
from IPython.display import Markdown, display

nest_asyncio.apply()


In [2]:
# Add logging
import logging
import sys

#logging.basicConfig(stream=sys.stdout, level=logging.INFO)
#logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [3]:
import os
import json
from typing import Dict, List, Optional, Tuple
from notion_client import Client
import pandas as pd
from datetime import datetime

class NotionProcessor:
    def __init__(self, auth_token: str):
        """Initialize the Notion client with authentication token."""
        self.notion = Client(auth=auth_token)
    
    def get_database_pages(self, database_id: str, parent_properties: Dict = None) -> List[Dict]:
        """
        Retrieve all pages from a Notion database.
        If parent_properties provided, merge them with each page's properties.
        """
        pages = []
        cursor = None
        
        while True:
            response = self.notion.databases.query(
                database_id=database_id,
                start_cursor=cursor
            )
            
            # If parent properties exist, merge them with each page
            if parent_properties:
                for page in response['results']:
                    self._merge_parent_properties(page, parent_properties)
            
            pages.extend(response['results'])
            
            if not response.get('has_more'):
                break
                
            cursor = response['next_cursor']
            
        return pages
    
    def _merge_parent_properties(self, page: Dict, parent_properties: Dict):
        """
        Merge parent properties into page properties with specific handling for Name, Description, and Tags.
        """
        for key, value in parent_properties.items():
            if key == 'Name':
                # Handle name merging only if child page has a name
                if 'Name' in page['properties']:
                    child_name = self._get_rich_text_content(page['properties']['Name'].get('title', []))
                    if child_name:
                        merged_name = f"{value} - {child_name}"
                        page['properties']['Name'] = {
                            'type': 'title',
                            'title': [{
                                'type': 'text',
                                'text': {'content': merged_name},
                                'plain_text': merged_name
                            }]
                        }
            elif key == 'Description':
                # Skip Description property - keep child's description if it exists
                continue
            elif key == 'Tags':
                # Merge tags, removing duplicates
                parent_tags = set(value.split(', ')) if value else set()
                if 'Tags' in page['properties']:
                    child_tags = set(tag['name'] for tag in page['properties']['Tags'].get('multi_select', []))
                    merged_tags = parent_tags.union(child_tags)
                    page['properties']['Tags'] = {
                        'type': 'multi_select',
                        'multi_select': [{'name': tag} for tag in sorted(merged_tags)]
                    }
            else:
                # For all other properties, inherit from parent if not present in child
                if key not in page['properties']:
                    page['properties'][key] = {
                        'type': 'rich_text',
                        'rich_text': [{
                            'type': 'text',
                            'text': {'content': str(value)},
                            'plain_text': str(value)
                        }]
                    }
    
    
    def extract_properties(self, page: Dict) -> Dict:
        """Extract properties from a page."""
        properties = {}
        
        for prop_name, prop_data in page['properties'].items():
            prop_type = prop_data['type']
            
            if prop_type == 'title':
                properties[prop_name] = self._get_rich_text_content(prop_data['title'])
            elif prop_type == 'rich_text':
                properties[prop_name] = self._get_rich_text_content(prop_data['rich_text'])
            elif prop_type == 'select':
                if prop_data['select']:
                    properties[prop_name] = prop_data['select']['name']
            elif prop_type == 'multi_select':
                # Convert multi-select to comma-separated string
                properties[prop_name] = ', '.join(sorted(item['name'] for item in prop_data['multi_select']))
            elif prop_type == 'date':
                if prop_data['date']:
                    properties[prop_name] = prop_data['date']['start']
            elif prop_type in ['number', 'checkbox']:
                properties[prop_name] = prop_data[prop_type]
                
        return properties
    
    def _normalize_text(self, text: str) -> str:
        """
        Normalize text content by:
        1. Replacing multiple spaces with single space
        2. Removing spaces before newlines
        3. Removing spaces after newlines
        4. Removing empty lines
        """
        # Replace multiple spaces with single space
        text = ' '.join(text.split())
        
        # Remove spaces before colons
        text = text.replace(' :', ':')
        
        # Split into lines and process each line
        lines = text.split('\n')
        cleaned_lines = []
        
        for line in lines:
            # Clean each line individually
            cleaned_line = line.strip()
            if cleaned_line:  # Only keep non-empty lines
                cleaned_lines.append(cleaned_line)
        
        # Join lines back together
        return '\n'.join(cleaned_lines)
    
    def _get_rich_text_content(self, rich_text: List) -> str:
        """Extract text content from rich text array and normalize it."""
        text = ' '.join([text['plain_text'] for text in rich_text if text.get('plain_text')])
        return self._normalize_text(text)
    
    def get_block_children(self, block_id: str, level: int = 0) -> List[Tuple[Dict, int]]:
        """Retrieve all child blocks of a given block with their nesting level."""
        blocks = []
        cursor = None
        
        while True:
            response = self.notion.blocks.children.list(
                block_id=block_id,
                start_cursor=cursor
            )
            
            for block in response['results']:
                blocks.append((block, level))
                
                if block.get('has_children'):
                    if block['type'] != 'child_database':
                        child_blocks = self.get_block_children(block['id'], level + 1)
                        blocks.extend(child_blocks)
            
            if not response.get('has_more'):
                break
                
            cursor = response['next_cursor']
            
        return blocks
    
    def process_blocks(self, blocks: List[Tuple[Dict, int]]) -> Tuple[Dict, List[str]]:
        """
        Process blocks to extract headers and content.
        Sub-headers are treated as text content with line breaks.
        """
        current_header = None
        current_content = []
        headers = {}
        content_sections = []
        current_bullet_group = []
        
        def save_current_section():
            """Helper function to save current section's content."""
            nonlocal current_content, current_bullet_group, content_sections, current_header, headers
            
            if current_bullet_group:
                current_content.append(self._merge_bullet_group(current_bullet_group))
                current_bullet_group = []
            
            if current_header is not None and current_content:
                # Join content and normalize the entire section
                section_content = self._normalize_text('\n'.join(filter(None, current_content)))
                content_sections.append(section_content)
                headers[current_header] = len(content_sections) - 1
        
        for block, level in blocks:
            block_type = block['type']
            
            # Handle headers
            if block_type.startswith('heading_'):
                header_text = self._get_rich_text_content(block[block_type]['rich_text'])
                header_level = int(block_type[-1])
                
                if header_level == 1:
                    # Save current section before starting new one
                    save_current_section()
                    current_content = []
                    current_header = header_text
                else:
                    # Treat sub-headers as text content with line break
                    if current_bullet_group:
                        current_content.append(self._merge_bullet_group(current_bullet_group))
                        current_bullet_group = []
                    current_content.append(f"{header_text}\n")
            
            # Handle child database
            elif block_type == 'child_database':
                if current_bullet_group:
                    current_content.append(self._merge_bullet_group(current_bullet_group))
                    current_bullet_group = []
                current_content.append(f"[Database: {block['id']}]")
            
            # Handle bullet points and numbered lists
            elif block_type in ['bulleted_list_item', 'numbered_list_item']:
                text_content = self._get_rich_text_content(block[block_type]['rich_text'])
                
                if level == 0:
                    if current_bullet_group:
                        current_content.append(self._merge_bullet_group(current_bullet_group))
                        current_bullet_group = []
                    current_bullet_group = [(text_content, level)]
                else:
                    current_bullet_group.append((text_content, level))
            
            # Handle regular paragraphs
            elif block_type == 'paragraph':
                if current_bullet_group:
                    current_content.append(self._merge_bullet_group(current_bullet_group))
                    current_bullet_group = []
                
                text_content = self._get_rich_text_content(block[block_type]['rich_text'])
                if text_content:
                    current_content.append(text_content)
        
        # Save final section
        save_current_section()
        
        return headers, content_sections
    
    def _merge_bullet_group(self, bullet_group: List[Tuple[str, int]]) -> str:
        """Merge a group of bullets into a single line, with sub-bullets inline."""
        if not bullet_group:
            return ""
        
        result = []
        current_main_bullet = []
        
        for text, level in bullet_group:
            if level == 0:
                if current_main_bullet:
                    result.append(self._normalize_text(' '.join(current_main_bullet)))
                current_main_bullet = [text]
            else:
                current_main_bullet.append(text)
        
        if current_main_bullet:
            result.append(self._normalize_text(' '.join(current_main_bullet)))
        
        return '\n'.join(result)
    
    def process_page(self, page: Dict, parent_properties: Dict = None) -> List[Dict]:
        """Process a single page and its nested databases."""
        results = []
        
        # Extract properties before any merging
        properties = self.extract_properties(page)
        
        # Merge with parent properties if they exist
        if parent_properties:
            # Skip special properties handling here as it's done in _merge_parent_properties
            # Only handle properties that weren't merged during database query
            for key, value in parent_properties.items():
                if key not in ['Name', 'Description', 'Tags'] and key not in properties:
                    properties[key] = value
        
        # Process page blocks
        blocks = self.get_block_children(page['id'])
        headers, content_sections = self.process_blocks(blocks)
        
        # Create entries for each section
        for header, section_index in headers.items():
            section_properties = properties.copy()
            section_properties['header'] = header
            
            if 0 <= section_index < len(content_sections):
                results.append({
                    'properties': section_properties,
                    'content': content_sections[section_index]
                })
        
        # Process child databases
        for block, _ in blocks:
            if block['type'] == 'child_database':
                child_pages = self.get_database_pages(block['id'], properties)
                for child_page in child_pages:
                    results.extend(self.process_page(child_page, properties))
        
        return results

    def process_database(self, database_id: str) -> List[Dict]:
        """Process entire database and return structured data."""
        processed_data = []
        pages = self.get_database_pages(database_id)
        
        for page in pages:
            processed_data.extend(self.process_page(page))
            
        return processed_data

In [4]:
# Define environment variables
NOTION_TOKEN = os.getenv("NOTION_TOKEN")
DATABASE_ID = os.getenv("NOTION_DATABASE_ID")

CO_API_KEY = os.environ['CO_API_KEY'] or getpass("Enter your Cohere API key: ")

QDRANT_URL = os.environ['QDRANT_URL']
QDRANT_API_KEY = os.environ['QDRANT_API_KEY']

OPENAI_API_KEY = os.environ['OPENAI_API_KEY']


In [12]:
from llama_index.llms.cohere import Cohere
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.cohere import CohereEmbedding
from qdrant_client import QdrantClient, AsyncQdrantClient
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.qdrant import QdrantVectorStore

COLLECTION_NAME = "Notion_vector_store"

model_kwargs = {"model": "gpt-4o-mini", 
                "temperature": 0,
                "api_key": OPENAI_API_KEY,
                "system_prompt": """Use ONLY the provided context and generate a complete, coherent answer to the user's query. 
                Your response must be grounded in the provided context and relevant to the essence of the user's query.
                """                
                }

llm_openai = OpenAI(**model_kwargs)
llm_cohere = Cohere(model="command-r-plus", api_key=CO_API_KEY)

embed_model_openai = OpenAIEmbedding(model="text-embedding-3-large", api_key=OPENAI_API_KEY)
embed_model_cohere = CohereEmbedding(model="embed-english-v3.0", api_key=CO_API_KEY)

# set up the vector store
client = QdrantClient(location=QDRANT_URL, api_key=QDRANT_API_KEY)
aclient = AsyncQdrantClient(location=QDRANT_URL, api_key=QDRANT_API_KEY)

vector_store = QdrantVectorStore(
    client=client, 
    aclient=aclient, 
    collection_name=COLLECTION_NAME,
    enable_hybrid=True
    )



Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 5000.36it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 3310.42it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 9480.80it/s]


In [6]:
# Process the database

from llama_index.core import Document
from llama_index.readers.notion import NotionPageReader

from llama_index.core.storage.docstore import SimpleDocumentStore


processor = NotionProcessor(NOTION_TOKEN)
processed_data = processor.process_database(DATABASE_ID)

documents = [Document(text=record['content'], metadata=record['properties']) for record in processed_data]
# documents = NotionPageReader(integration_token=NOTION_TOKEN).load_data(
#     database_ids=[DATABASE_ID]
# )

docstore = SimpleDocumentStore()
docstore.add_documents(documents)


In [13]:
# Build index over vector database
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex

from llama_index.core.storage.index_store.simple_index_store import SimpleIndexStore


storage_context = StorageContext.from_defaults(
    docstore=docstore,
    index_store=SimpleIndexStore(),
    vector_store=vector_store
    )


In [14]:
from llama_index.core.node_parser import SentenceSplitter

sentence_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=16)

index = VectorStoreIndex.from_documents(
    documents=documents,
    embed_model=embed_model_openai,
    storage_context=storage_context,
    transformations=[sentence_splitter, embed_model_openai],
    )

In [15]:
QUERY_STRING = "What projects involved the usage of React?"

def test_retrievers(query=QUERY_STRING, index=index, **kwargs):
    retriever_engine = index.as_retriever(**kwargs)
    retrieved_docs = retriever_engine.retrieve(query)
    print(f"Retrieved {len(retrieved_docs)} nodes.")
    print("\n")
    for node in retrieved_docs:
        print(f"Score: {node.score:.2f} - {node.text}...\n-----\n")
    
mode_kwargs = {
    'default': {'vector_store_query_mode': 'default', 'similarity_top_k': 3},
    'bm25': {'vector_store_query_mode':'hybrid', 'alpha': 0.0, 'hybrid_top_k': 3}, 
    'hybrid': {'vector_store_query_mode':'hybrid', 'alpha': 0.25, 'hybrid_top_k': 3},
    'semantic_hybrid': {'vector_store_query_mode':'semantic_hybrid', 'alpha': 0.75, 'hybrid_top_k': 3},
    # 'sparse': {"sparse_top_k":5},
    'text_search': {'vector_store_query_mode':'text_search', 'similarity_top_k': 3},
}

for mode, kwargs in mode_kwargs.items():
    print(f"Retrieving nodes using: {mode} retrieval")
    test_retrievers(**kwargs)
    print(f"Retrieval with {mode} complete...")        
    print("\n")

Retrieving nodes using: default retrieval
Retrieved 3 nodes.


Score: 0.56 - React Frontend (UI) React: The primary framework used to build the user interface. It allows for component-based development, ensuring reusability and scalability. React Flow: A specialized library for visualizing nodes and edges, which is central to displaying the conversation flow and relationships between nodes. Bootstrap & Tailwind CSS: Used for styling the components and ensuring a clean, responsive UI. Bootstrap provides pre-built elements like buttons and panels, while Tailwind ensures utility-first, custom styling. Custom Hooks: Utilized to manage state, handle events, and communicate with Chrome storage and the background script efficiently. ELK.js: Provides automatic node layout functionality, ensuring that nodes are arranged hierarchically and without overlap....
-----

Score: 0.55 - Asynchronous Operations: Both the Chrome background script and the React app rely heavily on async/await to handle da

In [21]:
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core import PromptTemplate
from prompts import QUESTION_GEN_PROMPT

vector_retriever = index.as_retriever(similarity_top_k=1)

bm25_retriever = BM25Retriever.from_defaults(docstore=index.docstore, similarity_top_k=1)

QUERY_GEN_PROMPT_TEMPLATE=PromptTemplate(QUESTION_GEN_PROMPT)

retriever = QueryFusionRetriever(
    [vector_retriever, bm25_retriever],
    similarity_top_k=15,
    num_queries=3,  # set this to 1 to disable query generation
    mode="reciprocal_rerank",
    use_async=True,
    verbose=True,
    # query_gen_prompt=QUERY_GEN_PROMPT_TEMPLATE, 
)

DEBUG:bm25s:Building index from IDs objects


In [22]:
nodes_with_scores = retriever.retrieve(
    "How did you demonstrate leadership?"
)

Generated queries:
1. Examples of leadership skills in the workplace
2. Tips for showcasing leadership abilities in job interviews


In [23]:
for node in nodes_with_scores:
    print(f"Score: {node.score:.2f} - {node.text}...\n-----\n")

Score: 0.03 - Successfully created a minimum viable product (MVP) with essential logging functionalities. Implemented a user-friendly interface that allows seamless logging without disrupting the user's workflow. Integrated a robust reminder system with flexible settings for different user needs. Achieved a smooth user experience with optimized animations and transitions. Managed the entire project independently, from concept to final implementation, demonstrating strong project management and self-learning capabilities. Additional Development Insights: The project was conceptualized and developed independently, involving learning and applying new skills and technologies. Progressed through all software development phases, from initial concept to final implementation. Utilized GitHub Projects to keep track of tasks, maintain a work log, and ensure steady progress. Performed an initial assessment of technologies and selected Electron.js as the primary framework for its versatility and s

In [28]:
from llama_index.core.response_synthesizers import ResponseMode
from llama_index.core.query_pipeline import InputComponent

from llama_index.core.query_pipeline import QueryPipeline


from prompts import HYPE_ANSWER_GEN_PROMPT

input_component = InputComponent()

HYPE_ANSWER_GEN_PROMPT_TEMPLATE = PromptTemplate(HYPE_ANSWER_GEN_PROMPT)

rr_fusion_query_engine = RetrieverQueryEngine.from_args(
    retriever,
    response_mode = ResponseMode.COMPACT_ACCUMULATE,
    use_async = True,
    text_qa_template = HYPE_ANSWER_GEN_PROMPT_TEMPLATE
    )

rr_fusion_chain = [input_component, rr_fusion_query_engine]

rr_fusion_query_pipeline = QueryPipeline(
    rr_fusion_chain,
    verbose=True
    )

ImportError: cannot import name 'HYPE_ANSWER_GEN_PROMPT' from 'prompts' (c:\Users\rbt7r\OneDrive\Documents\VSCode Workspace\RAG_Implementation\prompts.py)

In [None]:
# Query

query = "Name projects and their associated challenges"

response = query_pipeline.run(input=query)

print(response)

In [8]:
# Automatically add more metadata to the documents

from llama_index.core.extractors import KeywordExtractor, QuestionsAnsweredExtractor
from llama_index.core.schema import MetadataMode
from llama_index.core.node_parser import SentenceSplitter

from prompts import KEYWORD_EXTRACT_PROMPT

sentence_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=16)

keyword_extractor = KeywordExtractor(
    keywords=5, 
    llm=llm_openai,
    prompt_template=KEYWORD_EXTRACT_PROMPT
    )

In [9]:
from llama_index.core.ingestion import IngestionPipeline

transformations = [sentence_splitter, keyword_extractor, embed_model_openai]


nodes = IngestionPipeline(
    documents=documents,
    transformations=transformations,
    vector_store=vector_store
    ).run(nodes=documents)

index = VectorStoreIndex.from_vector_store(
    embed_model=embed_model_openai,
    vector_store=vector_store,
    storage_context=storage_context,
    )


100%|██████████| 246/246 [00:32<00:00,  7.56it/s]


In [10]:
print(nodes[50].get_content(metadata_mode="all"))

[Excerpt from document]
Employer: McMaster University - Partnered with Cubic Transportation
Description: Deploying a Radar-Fusion Python script on Cubic’s GridSmart system, focusing on setting up the Python and ROS2 environment, interfacing external hardware, and ensuring stable network configuration between the Jetson Nano and Intel Processor
Project Size: Small
When: 2023-01-09
Position: Research Engineer II
Tags: Embedded System, Python, ROS2
Name: Cubic Gridsmart (GS3) Deployment
header: Tasks Performed:
excerpt_keywords: Radar-Fusion, Jetson Nano, ROS2, Embedded Systems, Documentation
Excerpt:
-----
ROS 2 Execution: Ran ROS 2 nodes and ensured proper communication between the hardware components and the GS3 system. Supervisory Role: Guidance and Support: Directed an undergraduate student, providing suggestions and assistance as needed. Documentation: Produced comprehensive documentation and user guides for setting up the GS3, accessing the Jetson Nano and Intel Processor, and conf

In [17]:
from llama_index.core.vector_stores.types import VectorStoreInfo, MetadataInfo

vector_store_info = VectorStoreInfo(
    content_info="High level overviews of engineering projects from a work portfolio.",
    metadata_info=[
        MetadataInfo(
            name="Name",
            type="str",
            description="Use this to identify the project name and create associations with other sections of the project."
        ),
        MetadataInfo(
            name="Tags",
            type="str",
            description="Tags associated with the overall project."
        ),
        MetadataInfo(
            name="excerpt_keywords",
            type="str",
            description="Keywords of specific skills demonstrated."
        ),
        MetadataInfo(
            name="Employer",
            type="str",
            description="Employer the project was done for."
        ),
        
        MetadataInfo(
            name="Position",
            type="str",
            description="Position the user held during the project."
        ),
        MetadataInfo(
            name="Description",
            type="str",
            description="Brief description of the project."
        ),
        MetadataInfo(
            name="header",
            type="str",
            description="Header of the section."
        ),
    ],
)

In [18]:
# Integrate with the Vector Index AutoRetriever

from typing import List, Tuple, Any
from pydantic import BaseModel, Field

class AutoRetrieveModel(BaseModel):
    query: str = Field(..., description="A question or statement on specific skills demonstrated in projects.")
    filter_key_list: List[str] = Field(
        ..., description="List of metadata filter field names"
    )
    filter_value_list: List[str] = Field(
        ...,
        description=(
            "List of metadata filter field values (corresponding to names specified in filter_key_list)"
        )
    )

In [23]:
# Auto Retriever Functional Tool

from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.vector_stores.types import MetadataFilter, FilterOperator, MetadataFilters

top_k = 10

def auto_retrieve_fn(
    query: str, filter_key_list: List[str], filter_value_list: List[str]):
    """Auto retrieval function.

    Performs auto-retrieval from a vector database, and then applies a set of filters.

    """
    query = query or "Query"

    # Create a list of metadata filters by zipping together the filter keys and values
    # Each filter checks if the metadata field contains the specified value
    # For example, if filter_key_list=["Role"] and filter_value_list=["Engineer"]
    # It will create a filter that checks if "Engineer" is contained in the "Role" metadata field
    contains_filters = [
        MetadataFilter(key=k, value=v, operator=FilterOperator.CONTAINS)
        for k, v in zip(filter_key_list, filter_value_list)
        ]

    retriever = VectorIndexRetriever(
        index, 
        vector_store_query_mode="hybrid",
        alpha=0.65,
        filters=MetadataFilters(filters=contains_filters),
        top_k=top_k
        )

    query_engine = RetrieverQueryEngine.from_args(
        retriever,
        response_mode="compact",
        verbose=True
        )

    response = query_engine.query(query)
    return str(response)

In [24]:
from llama_index.core.tools import FunctionTool

description = f"""\
Use this tool to answer the user query by retrieving relevant information from the vector database.
The vector database schema is given below, which you should use to find the right information to answer the user's query.:
{vector_store_info.model_dump_json()}
"""

auto_retrieve_tool = FunctionTool.from_defaults(
    fn=auto_retrieve_fn,
    name="work-portfolio-info",
    description=description,
    fn_schema=AutoRetrieveModel
)

In [29]:
description

'Use this tool to answer the user query by retrieving relevant information from the vector database.\nThe vector database schema is given below, which you should use to find the right information to answer the user\'s query.:\n{"metadata_info":[{"name":"Name","type":"str","description":"Use this to identify the project name and create associations with other sections of the project."},{"name":"Tags","type":"str","description":"Use this to identify the tags associated with the overall project."},{"name":"excerpt_keywords","type":"str","description":"Use this to identify the keywords associated with specific sections of the project."},{"name":"Employer","type":"str","description":"Use this when the user asks about the employer."},{"name":"Position","type":"str","description":"Use this when the user asks about the job position."},{"name":"Description","type":"str","description":"Use this to quickly identify relevant projects."}],"content_info":"High level overviews of engineering projects

In [25]:
from llama_index.agent.openai import OpenAIAgent

agent = OpenAIAgent.from_tools(
    tools=[auto_retrieve_tool],
    verbose=True,
)

In [30]:
agent.chat("What projects involved the usage of React?")


Added user message to memory: What projects involved the usage of React?
=== Calling Function ===
Calling function: work-portfolio-info with args: {"query":"projects involving the usage of React","filter_key_list":["excerpt_keywords"],"filter_value_list":["React"]}
Got output: The projects mentioned in the context involve the usage of React for software development.



AgentChatResponse(response='The projects mentioned involve the usage of React for software development.', sources=[ToolOutput(content='The projects mentioned in the context involve the usage of React for software development.', tool_name='work-portfolio-info', raw_input={'args': (), 'kwargs': {'query': 'projects involving the usage of React', 'filter_key_list': ['excerpt_keywords'], 'filter_value_list': ['React']}}, raw_output='The projects mentioned in the context involve the usage of React for software development.', is_error=False)], source_nodes=[], is_dummy_stream=False, metadata=None)

In [36]:
# Ingest to Vector Database

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

sentence_splitter = SentenceSplitter(
    chunk_size=512, # in tokens
    chunk_overlap=16, # in tokens
    paragraph_separator="\n\n\n"
)

tranforms = [
    sentence_splitter, 
    embed_model_openai
    ]

nodes = IngestionPipeline(
    documents=documents,
    transformations=tranforms,
    vector_store=vector_store
    )



In [None]:

query_engine_kwargs = {
    "llm": llm_openai,
    "embed_model": embed_model_openai,
    "response_mode": "compact",
    "similarity_top_k": 15,
    "vector_store_query_mode": "mmr",
    "vector_store_kwargs": {"mmr_threshold": 0.42}
    }

# index = VectorStoreIndex.from_vector_store(
#     embed_model=embed_model_openai,
#     vector_store=vector_store,
#     )

# index = VectorStoreIndex.from_documents(
#     documents=documents,
#     embed_model=embed_model_openai,
#     vector_store=vector_store
#     )

index = VectorStoreIndex(
    documents,
    show_progress=True,
    store_nodes_override=True,
    transformation=[sentence_splitter],
    embed_model=embed_model_openai,
    storage_context=storage_context,
)

query_engine = index.as_query_engine(llm=llm_openai)
#query_engine = index.as_chat_engine()
#query_engine = index.as_retriever()



In [None]:
documents[0]


In [33]:
# Query Pipeline
from llama_index.core.query_pipeline import InputComponent
from llama_index.core.query_pipeline import QueryPipeline

input_component = InputComponent()

chain = [input_component, query_engine]

query_pipeline = QueryPipeline(
    chain=chain,
    verbose=True,
    )

In [None]:
# Query

query = "Name projects and their associated challenges"

response = query_pipeline.run(input=query)

print(response)