# Retrieval Quality Test

This notebook tests:
- Indexing a PDF into Chroma
- Running similarity retrieval
- Inspecting citations and retrieved text snippets


In [None]:
import sys
from pathlib import Path
# Put the kernel's current working directory first on the import path so the
# `src.*` imports below can be resolved.
# NOTE(review): this presumably expects the notebook to be run from the project
# root (the folder that contains `src/`) — confirm against how it is launched.
sys.path.insert(0, str(Path.cwd()))

from src.config import settings
from src.indexing.index_manager import IndexManager
from src.retrieval.retriever import RAGRetriever

import logging
# Configure the root logger at INFO level for the rest of the notebook.
logging.basicConfig(level=logging.INFO)

## 1) Choose a PDF

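Place a PDF at `./data/sample_papers/sample_paper.pdf`, or change `pdf_path` in the cell below. The path is relative, so it is resolved against the notebook kernel's current working directory.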

In [None]:
# Location of the PDF to index and the name of the collection it is indexed
# into. Edit these two values to test a different paper.
pdf_path = "./data/sample_papers/sample_paper.pdf"
collection_name = "sample_paper_collection"

# Create the folder that is expected to contain the PDF. The folder is derived
# from `pdf_path` (instead of being hard-coded separately) so that changing the
# path above never leaves this cell creating an unrelated directory.
pdf_file = Path(pdf_path)
pdf_file.parent.mkdir(parents=True, exist_ok=True)

# Report whether the file is actually there before any indexing is attempted.
print("PDF exists:", pdf_file.exists())

## 2) Index the PDF

In [None]:
# Gather the indexing options from the project settings, then build the manager.
index_options = {
    "embedding_model": settings.EMBEDDING_MODEL,
    "chunk_size": settings.CHUNK_SIZE,
    "chunk_overlap": settings.CHUNK_OVERLAP,
    "persist_dir": settings.CHROMA_PERSIST_DIR,
}
index_manager = IndexManager(**index_options)

# Index only when the file is present; otherwise tell the reader where to put it.
if not Path(pdf_path).exists():
    print("Please add a PDF at:", pdf_path)
else:
    index_manager.index_pdf(pdf_path, collection_name=collection_name)
    print("Indexed into:", collection_name)

## 3) Run retrieval queries

In [None]:
# Reuse the vector store and embedder exposed by the index manager.
vector_store = index_manager.get_vector_store()
embedder = index_manager.get_embedder()

# Ensure the active collection.
# NOTE(review): presumably this selects (or creates) `collection_name` on the
# store — confirm it is safe to call when the collection already exists.
vector_store.create_collection(collection_name)

retriever = RAGRetriever(
    vector_store=vector_store,
    embedder=embedder,
    top_k=5,
    similarity_threshold=settings.SIMILARITY_THRESHOLD,
)

# Sample questions used to inspect retrieval results.
queries = [
    "What problem does the paper address?",
    "What datasets are used?",
    "Summarize the main contributions.",
]

# For every question: show the query, its citations, and a preview of the
# retrieved context limited to the first 500 characters.
for query in queries:
    result = retriever.retrieve(query)

    print("\n= QUERY =", query, sep="\n")

    print("\nSources:")
    for citation in result.get_citations():
        print("-", citation)

    print("\nTop snippet (first 500 chars):")
    context = result.get_context()
    # Append an ellipsis only when the context was actually cut short.
    tail = "..." if len(context) > 500 else ""
    print(context[:500] + tail)