## Prerequisites

1. Provide a valid `OPENAI_API_KEY` (and optionally `OPENAI_API_BASE` / `GOOGLE_API_KEY`) so embeddings and chat completions can run.
2. Place the PDF files you want to index in the `data/raw/` directory.

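**Note:** The demo persists a FAISS index under `data/processed/qa_demo`, so rerunning the vector-store cell reuses the cached embeddings instead of rebuilding them. To force a rebuild, delete `data/processed/qa_demo/index.faiss` before rerunning that cell.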

In [1]:
import os
import sys
from pathlib import Path

# ROOT is the parent of the current working directory; it is put on sys.path
# so that the project's `src` package can be imported from the notebook.
ROOT = Path().resolve().parent
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

from src.settings import settings

settings.ensure_directories()

print("Workspace root:", ROOT)
print("QA model:", settings.qa_model)

# The environment variable takes precedence; the settings value is the fallback.
key_source = os.environ.get("OPENAI_API_KEY") or settings.openai_api_key
if key_source:
    key_status = "set"
else:
    key_status = "missing - set OPENAI_API_KEY before invoking the chain."
print("OpenAI API key is", key_status)

Workspace root: C:\Users\Doyoon\Desktop\Doyoon\2025 Fall\BKMS1\Assignment\Assignment 3\raptor-rag-langchain
QA model: google/gemini-2.5-flash-lite-preview-09-2025
OpenAI API key is set


In [2]:
from src.ingestion import DocumentIngestion
from src.settings import settings

# Resolve the configured raw-data location against the workspace root.
raw_data_path = ROOT / settings.raw_data_path

# Load everything found under the raw-data directory and report the count.
ingestor = DocumentIngestion()
doc_list = ingestor.load_directory(raw_data_path)
print("Loaded {} documents from {}".format(len(doc_list), raw_data_path))

Loaded 2191 documents from C:\Users\Doyoon\Desktop\Doyoon\2025 Fall\BKMS1\Assignment\Assignment 3\raptor-rag-langchain\data\raw


In [3]:
import warnings

from langchain_core.documents import Document

from src.raptor.tree_builder import TreeBuilder
from src.retrieval.vector_store import RaptorVectorStore

warnings.filterwarnings("ignore", category=UserWarning)

# Directory holding the persisted demo index.
demo_store_path = ROOT.joinpath(settings.processed_data_path) / "qa_demo"
demo_store_path.mkdir(parents=True, exist_ok=True)

index_file = demo_store_path / "index.faiss"

# Create the builder on both paths: the parameter-summary cell reads
# `tb.clustering`, which raised NameError whenever the cached index was loaded
# because `tb` used to be bound only in the rebuild branch.
tb = TreeBuilder()

if index_file.exists():
    # A persisted index is present: reuse it instead of re-embedding.
    vector_store = RaptorVectorStore(persist_directory=demo_store_path)
    vector_store.load()
    print("Loaded existing FAISS index from", index_file)
else:
    # Announce the build before starting it so the message precedes the work.
    print("Building new index at", demo_store_path)
    raptor_tree = tb.build_from_documents(doc_list)
    vector_store = RaptorVectorStore.from_tree(raptor_tree)
    vector_store.save(demo_store_path)

  from .autonotebook import tqdm as notebook_tqdm


Building new index at C:\Users\Doyoon\Desktop\Doyoon\2025 Fall\BKMS1\Assignment\Assignment 3\raptor-rag-langchain\data\processed\qa_demo


In [4]:
# Summarise the ingestion parameters in effect for this run.
print("Chunk_size: ", ingestor.chunk_size)
print("Chunk_overlap: ", ingestor.chunk_overlap)

# Summarise the clustering parameters held by the tree builder.
clustering_cfg = tb.clustering
print("GMM_threshold: ", clustering_cfg.threshold)
print("Reduction_dimension: ", clustering_cfg.reduction_dimension)

Chunk_size:  1000
Chunk_overlap:  50
GMM_threshold:  0.1
Reduction_dimension:  20


In [5]:
from src.retrieval.rag_chain import RaptorRAGChain

rag_chain = RaptorRAGChain(vector_store=vector_store)

question = "What is the advantage of B+ tree?"
print("Question:", question)

# Debug aid (disabled): print the documents returned by a similarity search.
# context_docs = vector_store.similarity_search("RaptorRAGChain")
# for idx, doc in enumerate(context_docs, 1):
#     title = doc.metadata.get("title") if isinstance(doc.metadata, dict) else None
#     print(f"\nContext {idx}: {title or 'Document'}")
#     print(doc.page_content)

# Start from "no answer" so a failed call leaves a well-defined value behind.
answer = None
try:
    answer = rag_chain.invoke(question)
except Exception as exc:
    # Demo boundary: report the failure and continue instead of aborting.
    print("\nQA call failed:", exc)

if not answer:
    print("\nAnswer not available; verify your API key and rerun the cell.")
else:
    print("\nAnswer:\n", answer)

Question: What is the advantage of B+ tree?

Answer:
 The advantage of the B+-tree index structure is that it maintains efficiency despite frequent data insertions and deletions, overcoming the performance degradation seen in index-sequential files which require undesirable reorganization.


## Next Steps
1. Experiment with different `settings.qa_model` or `settings.top_k` values by editing `.env` and reloading the notebook.
2. When you need deterministic tests, mock `RaptorRAGChain.invoke` instead of calling the live API.