# Step 2: Retrieve Documents

This notebook tests the document retrieval node. It loads a pre-built vector store and uses it to find document chunks that are semantically similar to a user's question.

In [None]:
import os
from langchain_community.embeddings import OllamaEmbeddings

# Add the project root to the Python path
import sys
sys.path.append('..')

from src.utils.vector_store_manager import VectorStoreManager

**Important:** Before running this notebook, you must first run the main Streamlit app (`streamlit run main.py`) and click the **"Build Vector Store"** button to create the `vector_store` index.

In [None]:
# --- Configuration ---
# Relative paths: resolved against the kernel's current working directory.
DOCS_PATH = "../documents"
VECTOR_STORE_PATH = "../vector_store"

# Name of the model handed to OllamaEmbeddings.
MODEL_NAME = "llama3"
embeddings = OllamaEmbeddings(model=MODEL_NAME)

# Construct the manager from the two paths and the embedding object,
# then ask it for the retriever used by the cells below.
manager = VectorStoreManager(DOCS_PATH, VECTOR_STORE_PATH, embeddings)
retriever = manager.get_retriever()

In [None]:
def retrieve_documents(state: dict, retriever) -> dict:
    """
    Run the retrieval step for the question stored in ``state``.

    Kept inside the notebook so the step can be exercised on its own.

    Args:
        state: Mapping that must contain a 'question' entry.
        retriever: Any object exposing ``invoke(query)``.

    Returns:
        A dict with one key, 'documents', holding whatever
        ``retriever.invoke`` returned for the question.

    Raises:
        KeyError: If ``state`` has no 'question' key (raised after the
            banner line has been printed).
    """
    print("---RETRIEVING DOCUMENTS---")
    # Look the question up only after the banner, matching the log order.
    hits = retriever.invoke(state['question'])
    return {"documents": hits}

In [None]:
# Sample query, wrapped in the state dict that retrieve_documents reads.
question = "What are the requirements for CEMS data quality?"
state = {"question": question}

result = retrieve_documents(state, retriever)

# Report how many chunks came back, then print each one with its source.
print(f"Found {len(result['documents'])} relevant documents:")
for i, doc in enumerate(result['documents']):
    print(f"--- Document {i+1} ---")
    print(f"Source: {doc.metadata.get('source')}")
    # end="\n\n" leaves one blank line after each chunk's content.
    print(doc.page_content, end="\n\n")