# Step 2: Retrieve Documents

This notebook tests the document retrieval node. It loads a pre-built vector store and uses it to find document chunks that are semantically similar to a user's question.

In [None]:
import os
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

# Make the parent directory importable. '..' is a relative path, so it is
# resolved against the kernel's current working directory (presumably the
# notebook's folder, making '..' the project root -- confirm).
import sys
sys.path.append('..')

**Important:** Before running this notebook, you must first run the main Streamlit app (`streamlit run main.py`) and click the **"Build Vector Store"** button. This notebook expects the resulting FAISS index files (`index.faiss` and `index.pkl`) in the `../vector_store` directory.

In [None]:
# --- Configuration ---
MODEL_NAME = "llama3"
VECTOR_STORE_PATH = "../vector_store"

# The same embeddings object is handed to FAISS.load_local below.
embeddings = OllamaEmbeddings(model=MODEL_NAME)

# --- Load Vector Store Directly ---
# A saved index consists of two files; both must be present and non-empty.
faiss_path = os.path.join(VECTOR_STORE_PATH, "index.faiss")
pkl_path = os.path.join(VECTOR_STORE_PATH, "index.pkl")

# Fail fast with an actionable message. A zero-byte index.pkl is just as
# unusable as a zero-byte index.faiss, so both files get the same check.
# isfile() runs first so getsize() is never called on a missing path.
if not all(
    os.path.isfile(path) and os.path.getsize(path) > 0
    for path in (faiss_path, pkl_path)
):
    raise FileNotFoundError(
        f"Vector store not found or is corrupted at '{VECTOR_STORE_PATH}'. "
        "Please run the main app and click 'Build Vector Store' first."
    )

print(f"Loading vector store from {VECTOR_STORE_PATH}...")
# SECURITY: allow_dangerous_deserialization=True lets load_local unpickle
# index.pkl, and unpickling can execute arbitrary code. Only load a vector
# store that you built yourself or otherwise trust.
vector_store = FAISS.load_local(
    VECTOR_STORE_PATH,
    embeddings,
    allow_dangerous_deserialization=True,
)
retriever = vector_store.as_retriever()
print("Retriever is ready.")

In [None]:
def retrieve_documents(state: dict, retriever) -> dict:
    """
    Look up documents for the question held in the state.

    Defined locally in this notebook for experimentation.

    Args:
        state: Mapping that must contain a 'question' key; its value is
            passed unchanged to the retriever.
        retriever: Any object exposing ``invoke(question)``.

    Returns:
        A new dict with a single key, "documents", holding whatever
        ``retriever.invoke`` returned.

    Raises:
        KeyError: If ``state`` has no 'question' entry.
    """
    print("---RETRIEVING DOCUMENTS---")
    return {"documents": retriever.invoke(state['question'])}

In [None]:
# Run one sample question through the retrieval step.
question = "What are the requirements for CEMS data quality?"
state = {"question": question}

result = retrieve_documents(state, retriever)

# Print every retrieved chunk with its source, numbered from 1.
print(f"Found {len(result['documents'])} relevant documents:")
for doc_number, doc in enumerate(result['documents'], start=1):
    print(f"--- Document {doc_number} ---")
    print(f"Source: {doc.metadata.get('source')}")
    # Content followed by one blank separator line.
    print(doc.page_content, end="\n\n")