# Test Retrieval on Kaggle with Existing Chroma DB

This notebook loads a prebuilt Chroma DB (an unzipped folder) from a Kaggle Input Dataset and runs retrieval against it using the E5 encoder.


In [None]:
pip install --quiet chromadb transformers torch pandas numpy tqdm "protobuf<5,>=3.20.3"


In [None]:
from pathlib import Path
import sys

# Root of the Kaggle Input dataset that holds the unzipped Chroma folder.
# Replace the placeholder with your dataset slug,
# e.g. /kaggle/input/science-paper-qa-chroma
INPUT_ROOT = Path("/kaggle/input", "<your-chroma-dataset>")
DB_DIR = INPUT_ROOT.joinpath("chroma_db")  # change if your folder is named differently

# Optional: if the repository code was uploaded as its own dataset, expose its
# `src` directory to the import system (prepended so it wins over other entries).
SRC_ROOT = Path("/kaggle/input", "<your-code-dataset>", "src")
if SRC_ROOT.exists():
    sys.path.insert(0, str(SRC_ROOT))

# Report each configured location together with whether it exists on disk.
for label, location in (("DB_DIR:", DB_DIR), ("SRC_ROOT:", SRC_ROOT)):
    print(label, location, location.exists())


In [None]:
# Configure the pipeline to point at the existing DB.
#
# The project code may be importable either as the `src.pipeline` package
# (when the directory containing `src/` is on the path) or as a top-level
# `pipeline` package (when `src/` itself is on the path), so try both.
try:
    from src.pipeline.config import paths
    from src.pipeline.retrieve import ChromaRetriever
except ModuleNotFoundError as exc:
    # Fall back only when the module that could not be found is project code.
    # If a third-party dependency is missing instead, re-raise so the real
    # cause is reported rather than a misleading "No module named 'pipeline'".
    if (exc.name or "").split(".")[0] not in {"src", "pipeline"}:
        raise
    from pipeline.config import paths
    from pipeline.retrieve import ChromaRetriever

# Override the configured persist directory with the Kaggle dataset folder.
paths.chroma_persist_dir = str(DB_DIR)
print("Using Chroma at:", paths.chroma_persist_dir)


In [None]:
# Run a sample retrieval and show a quick summary: number of hits plus the
# metadata and dense score of the top hit.
retriever = ChromaRetriever()
query = "What is the electroweak precision program?"
res = retriever.retrieve(query, top_k=20)

hits = res["hits"]
n_hits = len(hits)
if n_hits == 0:
    # An empty result would otherwise surface as an IndexError on hits[0].
    print("No hits returned - check that DB_DIR points to an intact Chroma folder.")

# Last expression is the cell output; falls back to placeholders when empty.
(n_hits, hits[0]["metadata"], hits[0]["dense_score"]) if n_hits else (0, None, None)


## Notes
- Set `INPUT_ROOT` to your dataset path on Kaggle that contains the unzipped Chroma folder.
- If you also upload the code as a dataset, set `SRC_ROOT` accordingly; otherwise, copy `src/` into the working directory.
- The install cell already pins `protobuf<5,>=3.20.3` to avoid protobuf errors; keep that pin if you edit the install cell.
- If retrieval returns no results, make sure the DB folder structure is intact: it should match the layout it had when you zipped it.
