# Test Retrieval on Kaggle (using existing Chroma DB)

This notebook copies the DB from Kaggle Input (read-only) to `/kaggle/working` (writable), points the pipeline to it, and runs a sample query.


In [None]:
pip install --quiet chromadb transformers torch pandas numpy tqdm "protobuf<5,>=3.20.3"


In [None]:
from pathlib import Path
import shutil, sys

# Set these to your Kaggle input datasets
INPUT_ROOT = Path("/kaggle/input/<your-chroma-dataset>")
DB_DIR = INPUT_ROOT / "chroma_db"  # change if your folder name differs

CODE_ROOT = Path("/kaggle/input/<your-code-dataset>") / "src"
if CODE_ROOT.exists():
    sys.path.insert(0, str(CODE_ROOT))

# Copy DB to working (writable)
WORKING_DB = Path("/kaggle/working/chroma_db")
if WORKING_DB.exists():
    shutil.rmtree(WORKING_DB)
shutil.copytree(DB_DIR, WORKING_DB)
print("Copied DB to:", WORKING_DB, WORKING_DB.exists())


In [None]:
# Point pipeline to working DB
try:
    from src.pipeline.config import paths
    from src.pipeline.retrieve import ChromaRetriever
except ModuleNotFoundError:
    from pipeline.config import paths
    from pipeline.retrieve import ChromaRetriever

paths.chroma_persist_dir = str(WORKING_DB)
print("Using Chroma at:", paths.chroma_persist_dir)


In [None]:
# Sample retrieval
retriever = ChromaRetriever()
query = "What is the electroweak precision program?"
res = retriever.retrieve(query, top_k=20)
print("hits:", len(res["hits"]))
print(res["hits"][0]["metadata"], res["hits"][0]["dense_score"])


# Test Retrieval on Kaggle with Existing Chroma DB

This notebook loads a prebuilt Chroma DB (unzipped folder) from a Kaggle Input Dataset and runs retrieval using E5 encoder.


In [1]:
pip install --quiet chromadb transformers torch pandas numpy tqdm "protobuf<5,>=3.20.3"


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.9/19.9 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m88.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━

In [2]:
from pathlib import Path
import sys

# Point to your Kaggle Input dataset root (contains the unzipped chroma folder)
# Example: /kaggle/input/science-paper-qa-chroma
INPUT_ROOT = Path("/kaggle/input/sciqagmain-repo/src/vector_db")
DB_DIR = INPUT_ROOT / "chroma_db"  # adjust if your folder name differs

# Add repo src to path if you also uploaded code as dataset
SRC_ROOT = Path("/kaggle/input/sciqagmain-repo") / "src"
if SRC_ROOT.exists():
    sys.path.insert(0, str(SRC_ROOT))

print("DB_DIR:", DB_DIR, DB_DIR.exists())
print("SRC_ROOT:", SRC_ROOT, SRC_ROOT.exists())


DB_DIR: /kaggle/input/sciqagmain-repo/src/vector_db/chroma_db True
SRC_ROOT: /kaggle/input/sciqagmain-repo/src True


In [3]:
# Configure pipeline to point to the existing DB
try:
    from src.pipeline.config import paths
    from src.pipeline.retrieve import ChromaRetriever
except ModuleNotFoundError:
    from pipeline.config import paths
    from pipeline.retrieve import ChromaRetriever

paths.chroma_persist_dir = str(DB_DIR)
print("Using Chroma at:", paths.chroma_persist_dir)


Using Chroma at: /kaggle/input/sciqagmain-repo/src/vector_db/chroma_db


In [4]:
# Run a sample retrieval
retriever = ChromaRetriever()
query = "What is the electroweak precision program?"
res = retriever.retrieve(query, top_k=20)
len(res["hits"]), res["hits"][0]["metadata"], res["hits"][0]["dense_score"]


InternalError: error returned from database: (code: 8) attempt to write a readonly database

## Notes
- Set `INPUT_ROOT` to your dataset path on Kaggle that contains the unzipped Chroma folder.
- If you also upload the code as a dataset, set `SRC_ROOT` accordingly; otherwise, copy `src/` into working dir.
- If protobuf errors appear, add to the install cell: `protobuf<5,>=3.20.3` (already included).
- If no results, ensure DB folder structure is intact: it should be the same as when you zipped it.
