# Build Chroma DB on Kaggle (E5 doc embeddings + topic metadata)

This notebook installs dependencies, loads your repo data from a Kaggle Input dataset, builds a Chroma collection at `/kaggle/working/chroma_db` using E5 doc embeddings, stores `top_topic` in metadata for topic-prefiltered RAG, and zips it for download.


In [None]:
pip install --quiet chromadb transformers torch pandas numpy tqdm gensim "protobuf<5,>=3.20.3"


In [None]:
from pathlib import Path
import sys

# Input dataset containing the repo (replace the placeholder with your dataset name)
REPO_ROOT = Path("/kaggle/input/<your-code-dataset>")
assert REPO_ROOT.exists(), "Upload repo as Input Dataset and set REPO_ROOT accordingly"

# Put BOTH the repo root and src/ on sys.path.
# Adding only src/ makes `pipeline...` importable, but not the `src.` package
# prefix used by the import right below; that one needs the repo root itself.
# The repo root is inserted first so src/ still ends up at the front, as before.
SRC_DIR = REPO_ROOT / "src"
for import_root in (REPO_ROOT, SRC_DIR):
    if import_root.exists() and str(import_root) not in sys.path:
        sys.path.insert(0, str(import_root))

from src.pipeline.config import paths, indexing_cfg

# Output DB dir
CHROMA_DIR = Path("/kaggle/working/chroma_db")
CHROMA_DIR.mkdir(parents=True, exist_ok=True)

# Map inputs from dataset
paths.abs_metadata_path = str(REPO_ROOT / "data" / "abs_metadata.json")
paths.finetuned_questions_embeddings_csv = str(REPO_ROOT / "src" / "citation_net" / "finetune_embedding_model" / "combined_doi_questions_embeddings.csv")
paths.graphsage_embeddings_csv = str(REPO_ROOT / "src" / "citation_net" / "graphSAGE" / "graphsage_embeddings.csv")
paths.chroma_persist_dir = str(CHROMA_DIR)

# Toggle fulltext option if you want (not implemented yet; placeholder)
indexing_cfg.use_pdf_fulltext = False

# Echo the effective configuration so a misconfigured path is visible before the build
print("abs_metadata:", paths.abs_metadata_path)
print("questions CSV:", paths.finetuned_questions_embeddings_csv)
print("graph CSV:", paths.graphsage_embeddings_csv)
print("chroma_dir:", paths.chroma_persist_dir)
print("use_pdf_fulltext:", indexing_cfg.use_pdf_fulltext)


In [None]:
# Sanity check: report whether each required input exists, then fail fast if any is missing
from pathlib import Path

required_inputs = (paths.abs_metadata_path, paths.finetuned_questions_embeddings_csv)
for input_path in required_inputs:
    print(input_path, Path(input_path).exists())
assert all(Path(input_path).exists() for input_path in required_inputs)


In [None]:
# Build
# Imports the project's indexing entry point and calls it with no arguments.
# NOTE(review): presumably it reads the `paths` / `indexing_cfg` values configured
# earlier (e.g. paths.chroma_persist_dir) — confirm in src/pipeline/index_chroma.py.
from src.pipeline.index_chroma import build_chroma_collection
build_chroma_collection()


In [None]:
# Zip output: package the contents of CHROMA_DIR into /kaggle/working/chroma_db.zip
import shutil

zip_path = Path("/kaggle/working/chroma_db.zip")
if zip_path.exists():
    # Remove an archive left over from a previous run before re-creating it
    zip_path.unlink()

# make_archive appends ".zip" itself, so hand it the path without the suffix
archive_base = zip_path.with_suffix('')
shutil.make_archive(base_name=str(archive_base), format='zip', root_dir=CHROMA_DIR)
print(zip_path, zip_path.exists())


# Build Chroma DB on Kaggle and Download

This notebook installs dependencies, loads your metadata and embeddings from the repo, builds a Chroma collection at `/kaggle/working/chroma_db`, and zips it for download.


In [1]:
pip install --quiet chromadb langchain langchain_community langchain_core transformers torch pandas numpy tqdm


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.9/19.9 MB[0m [31m76.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m449.8/449.8 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m82.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
from pathlib import Path

# REPO_ROOT: where the repo is mounted after uploading it to Kaggle as an Input Dataset.
# Point it at your own dataset if the name differs,
# e.g. REPO_ROOT = Path("/kaggle/input/science-paper-qa-rag")
REPO_ROOT = Path("/kaggle/input/sciqagmain-repo")

# CHROMA_DIR: target folder for the Chroma DB, created up front
CHROMA_DIR = Path("/kaggle/working/chroma_db")
CHROMA_DIR.mkdir(parents=True, exist_ok=True)

print(f"Chroma target dir: {CHROMA_DIR}")


Chroma target dir: /kaggle/working/chroma_db


In [3]:
# Expose the repo root and its src/ folder on sys.path so that both
# `src.pipeline...` and `pipeline...` import styles resolve
import sys

repo_root = str(REPO_ROOT)
repo_src = str(REPO_ROOT / "src")
# Each new entry goes to the front; entries already present are left where they are
for entry in (repo_root, repo_src):
    if entry in sys.path:
        continue
    sys.path.insert(0, entry)
print(f"Python path includes: {repo_root} and {repo_src}")


Python path includes: /kaggle/input/sciqagmain-repo and /kaggle/input/sciqagmain-repo/src


In [4]:
# Point the pipeline config at the Input Dataset (inputs) and the Kaggle working folder (output)
try:
    from src.pipeline.config import paths
except ModuleNotFoundError:
    from pipeline.config import paths
from pathlib import Path

# Inputs: all resolved relative to REPO_ROOT (often /kaggle/input/<dataset-name>)
citation_net_dir = REPO_ROOT / "src" / "citation_net"
metadata_json = REPO_ROOT / "data" / "abs_metadata.json"
questions_csv = citation_net_dir / "finetune_embedding_model" / "combined_doi_questions_embeddings.csv"
graphsage_csv = citation_net_dir / "graphSAGE" / "graphsage_embeddings.csv"

paths.abs_metadata_path = str(metadata_json)
paths.finetuned_questions_embeddings_csv = str(questions_csv)
paths.graphsage_embeddings_csv = str(graphsage_csv)

# Output: the persisted Chroma DB goes to the working dir
paths.chroma_persist_dir = str(CHROMA_DIR)

# Report the effective settings and whether each input file is actually present
print("Chroma persist dir set to:", paths.chroma_persist_dir)
for label, configured in (
    ("Emb CSV:", paths.finetuned_questions_embeddings_csv),
    ("Metadata:", paths.abs_metadata_path),
    ("GraphSAGE:", paths.graphsage_embeddings_csv),
):
    print(label, configured, Path(configured).exists())


Chroma persist dir set to: /kaggle/working/chroma_db
Emb CSV: /kaggle/input/sciqagmain-repo/src/citation_net/finetune_embedding_model/combined_doi_questions_embeddings.csv True
Metadata: /kaggle/input/sciqagmain-repo/data/abs_metadata.json True
GraphSAGE: /kaggle/input/sciqagmain-repo/src/citation_net/graphSAGE/graphsage_embeddings.csv True


In [5]:
# Verify the input files are present (mounted with the repo / uploaded as dataset)
from pathlib import Path

embeddings_dir = REPO_ROOT / "src" / "citation_net" / "finetune_embedding_model"
required_files = [
    REPO_ROOT / "data" / "abs_metadata.json",
    embeddings_dir / "combined_doi_questions_embeddings.csv",  # only for ID list filtering
]

# Print the status of every file first, so all missing inputs are visible at once
missing_files = []
for required in required_files:
    is_present = required.exists()
    print(required, "exists:", is_present)
    if not is_present:
        missing_files.append(required)

assert not missing_files, "Missing required input files in Kaggle environment. Upload the repo or set REPO_ROOT accordingly."


/kaggle/input/sciqagmain-repo/data/abs_metadata.json exists: True
/kaggle/input/sciqagmain-repo/src/citation_net/finetune_embedding_model/combined_doi_questions_embeddings.csv exists: True


In [6]:
# Build the Chroma collection
# The entry point is imported as `src.pipeline...` first, falling back to
# `pipeline...` on ModuleNotFoundError, so either sys.path layout works.
# NOTE(review): a ModuleNotFoundError raised by a missing dependency inside
# index_chroma would also take the fallback branch — read the final traceback carefully.
try:
    from src.pipeline.index_chroma import build_chroma_collection
except ModuleNotFoundError:
    from pipeline.index_chroma import build_chroma_collection

# Called with no arguments.
# NOTE(review): presumably it uses the `paths` overrides set earlier — confirm in index_chroma.
build_chroma_collection()


TypeError: Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates