# Build Chroma DB on Kaggle (E5 doc embeddings + topic metadata)

This notebook installs dependencies, loads your repo data from a Kaggle Input dataset, builds a Chroma collection at `/kaggle/working/chroma_db` using E5 document embeddings, stores `top_topic` in each entry's metadata, and zips the resulting directory for download.


In [None]:
pip install --quiet chromadb transformers torch pandas numpy tqdm gensim "protobuf<5,>=3.20.3"


In [None]:
from pathlib import Path
import sys, os

# Point to the Kaggle Input dataset containing the repo.
# Replace the <your-code-dataset> placeholder with the name of your uploaded dataset.
REPO_ROOT = Path("/kaggle/input/<your-code-dataset>")
assert REPO_ROOT.exists(), "Upload repo as Input Dataset and set REPO_ROOT accordingly"

# Make the repo's `src` directory importable. The membership check keeps this
# cell idempotent: re-running it does not stack duplicate sys.path entries.
SRC_DIR = REPO_ROOT / "src"
if SRC_DIR.exists() and str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

# Output dir for Chroma (created up front so later cells can rely on it existing)
CHROMA_DIR = Path("/kaggle/working/chroma_db")
CHROMA_DIR.mkdir(parents=True, exist_ok=True)
print("Chroma target:", CHROMA_DIR)


In [None]:
# Import the pipeline's config objects and the index builder. The package is
# tried under the `src.` prefix first, then as a top-level `pipeline` package.
try:
    from src.pipeline.config import paths, indexing_cfg
    from src.pipeline.index_chroma import build_chroma_collection
except ModuleNotFoundError:
    from pipeline.config import paths, indexing_cfg
    from pipeline.index_chroma import build_chroma_collection

# Redirect the pipeline's input and output locations to this Kaggle session.
CITATION_NET_DIR = REPO_ROOT / "src" / "citation_net"
paths.abs_metadata_path = str(REPO_ROOT / "data" / "abs_metadata.json")
paths.finetuned_questions_embeddings_csv = str(
    CITATION_NET_DIR / "finetune_embedding_model" / "combined_doi_questions_embeddings.csv"
)
paths.graphsage_embeddings_csv = str(
    CITATION_NET_DIR / "graphSAGE" / "graphsage_embeddings.csv"
)
paths.chroma_persist_dir = str(CHROMA_DIR)

# PDF fulltext toggle: not implemented in this notebook, kept as a placeholder
# for future use, so it is switched off here.
indexing_cfg.use_pdf_fulltext = False

# Echo the effective settings so the run log records what was used.
for label, setting in (
    ("abs_metadata:", paths.abs_metadata_path),
    ("questions CSV:", paths.finetuned_questions_embeddings_csv),
    ("graph CSV:", paths.graphsage_embeddings_csv),
    ("chroma_dir:", paths.chroma_persist_dir),
    ("use_pdf_fulltext:", indexing_cfg.use_pdf_fulltext),
):
    print(label, setting)


In [None]:
# Verify that the required input files are present before building the DB
from pathlib import Path

inputs = [
    Path(location)
    for location in (paths.abs_metadata_path, paths.finetuned_questions_embeddings_csv)
]

# Check each path once, then report one "<path> <exists>" line per input.
checks = [(p, p.exists()) for p in inputs]
for p, present in checks:
    print(p, present)

# Stop here if anything is missing rather than failing midway through the build.
assert all(found for (checked, found) in checks), "Missing required inputs. Upload the repo dataset correctly."



In [None]:
# Build DB (E5 doc embeddings + top_topic metadata)
# Runs the pipeline's index builder imported earlier in this notebook.
# NOTE(review): presumably reads the locations set on `paths` and writes the
# collection under paths.chroma_persist_dir — confirm against pipeline.index_chroma.
build_chroma_collection()


In [None]:
# Package the Chroma directory as a zip archive for download
import shutil

zip_path = Path("/kaggle/working/chroma_db.zip")

# Drop any archive left over from a previous run of this cell.
if zip_path.exists():
    zip_path.unlink()

# make_archive appends the ".zip" extension itself, so it is given the
# archive path without its suffix.
archive_base = zip_path.parent / zip_path.stem
shutil.make_archive(str(archive_base), format="zip", root_dir=CHROMA_DIR)

# Display whether the archive now exists, along with its path.
zip_path.exists(), zip_path
