## Prerequisites

1. Provide a valid `OPENAI_API_KEY` (and optionally `OPENAI_API_BASE` / `GOOGLE_API_KEY`) so embeddings and chat completions can run.
2. Store the PDF files you want to use in the `data/raw/` directory.

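Note: The vector-store cell creates `data/processed/qa_demo` as the location for a persisted FAISS index. The load/save logic in that cell is currently commented out, so as written the store is built from the RAPTOR tree each time the cell runs; re-enable that logic to reuse cached embeddings across reruns.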

In [1]:
import os
import sys
from pathlib import Path

# ROOT is the parent of the current working directory; it is put on sys.path
# so that the `src` package can be imported below.
ROOT = Path().resolve().parent
root_entry = str(ROOT)
if root_entry not in sys.path:
    sys.path.append(root_entry)

# Must come after the sys.path tweak above.
from src.settings import settings

settings.ensure_directories()

print("Workspace root:", ROOT)
print("QA model:", settings.qa_model)

# The environment variable is checked first, then the value held by settings.
key_source = os.environ.get("OPENAI_API_KEY") or settings.openai_api_key
if key_source:
    key_status = "set"
else:
    key_status = "missing - set OPENAI_API_KEY before invoking the chain."
print("OpenAI API key is", key_status)

In [2]:
from src.ingestion import DocumentIngestion
from src.settings import settings


# Resolve the raw-data folder against the workspace root (ROOT is defined in
# the first cell).
raw_data_path = ROOT / settings.raw_data_path

# Load everything found in the raw-data folder.
ingestor = DocumentIngestion()
doc_list = ingestor.load_directory(raw_data_path)

print(f"Loaded {len(doc_list)} documents from {raw_data_path}")

In [3]:
from src.raptor.tree_builder import TreeBuilder

# Build the RAPTOR tree from the documents loaded in the ingestion cell.
tree_builder = TreeBuilder()
raptor_tree = tree_builder.build_from_documents(doc_list)

In [4]:
from pathlib import Path

from langchain_core.documents import Document

from src.retrieval.vector_store import RaptorVectorStore


# Anchor the demo store to the workspace root, the same way the ingestion cell
# anchors the raw-data path. ROOT is the parent of the current working
# directory, so a relative `processed_data_path` would otherwise be created
# under the notebook's own folder. An absolute setting is unaffected, because
# joining onto an absolute path yields that absolute path.
demo_store_path = ROOT / settings.processed_data_path / "qa_demo"
demo_store_path.mkdir(parents=True, exist_ok=True)

# Active path: build the vector store from the RAPTOR tree produced in the
# tree-building cell.
vector_store = RaptorVectorStore.from_tree(raptor_tree)

# Alternative (disabled): a small hand-written corpus persisted under
# `demo_store_path`. Uncomment this whole section, and comment out the
# `from_tree` line above, to use a store that is loaded from disk when an
# index file already exists and built + saved otherwise.
#
# documents = [
#     Document(
#         page_content='RaptorRAGChain wires the RAPTOR vector store retriever with a chat model prompt so questions get grounded answers.',
#         metadata={'title': 'RaptorRAGChain'},
#     ),
#     Document(
#         page_content='RaptorVectorStore manages FAISS and Chroma backends while collapsing RaptorTree nodes into retrievable documents.',
#         metadata={'title': 'RaptorVectorStore'},
#     ),
#     Document(
#         page_content='RaptorTree organizes raw content into layers of summaries so the QA chain can focus on the most relevant clusters.',
#         metadata={'title': 'RaptorTree'},
#     ),
#     Document(
#         page_content='Embedding short passages with OpenAIEmbeddings and saving them under data/processed/qa_demo lets the demo reuse the index.',
#         metadata={'title': 'Embedding Workflow'},
#     ),
# ]
# vector_store = RaptorVectorStore(persist_directory=demo_store_path)
# index_file = demo_store_path / "index.faiss"

# if index_file.exists():
#     vector_store.load()
#     print("Loaded existing FAISS index from", index_file)
# else:
#     print("Building new index at", demo_store_path)
#     vector_store.add_documents(documents)
#     vector_store.save()

# print("Vector store is ready with", len(documents), "documents.")

In [5]:
from src.retrieval.rag_chain import RaptorRAGChain

rag_chain = RaptorRAGChain(vector_store=vector_store)

question = "What is a nested query? Explain with some examples"
print("Question:", question)

# Preview what the vector store returns for the question itself, so the
# printed context is relevant to the answer below. The previous hard-coded
# "RaptorRAGChain" query was a leftover from the hand-written demo corpus.
context_docs = vector_store.similarity_search(question)
for idx, doc in enumerate(context_docs, 1):
    # Metadata may not be a dict; fall back to a generic label in that case
    # or when no title is present.
    title = doc.metadata.get("title") if isinstance(doc.metadata, dict) else None
    print(f"\nContext {idx}: {title or 'Document'}")
    print(doc.page_content)

# Any failure is reported instead of raised, so the cell always finishes and
# prints guidance; this is a deliberate best-effort for an interactive demo.
try:
    answer = rag_chain.invoke(question)
except Exception as exc:
    print("\nQA call failed:", exc)
    answer = None

if answer:
    print("\nAnswer:\n", answer)
else:
    print("\nAnswer not available; verify your API key and rerun the cell.")

## Next Steps
1. Experiment with different `settings.qa_model` or `settings.top_k` values by editing `.env` and reloading the notebook.
2. When you need deterministic tests, mock `RaptorRAGChain.invoke` instead of calling the live API.