# Build Chroma DB on Kaggle and Download

This notebook installs dependencies, loads your metadata and embeddings from the repo, builds a Chroma collection at `/kaggle/working/chroma_db`, and zips it for download.


In [None]:
pip install --quiet chromadb langchain langchain_community langchain_core transformers torch pandas numpy tqdm


In [None]:
import os
from pathlib import Path

# REPO_ROOT is where the repository's code and input data are read from.
# By default the repo is expected directly in Kaggle's working directory.
REPO_ROOT = Path("/kaggle/working")
# If the repo is attached as an input dataset instead, set for example:
# REPO_ROOT = Path("/kaggle/input/science-paper-qa-rag")

# Input datasets under /kaggle/input are read-only on Kaggle, so output must not be
# created below REPO_ROOT when it points there; fall back to the writable working
# directory in that case. For any other REPO_ROOT the output stays next to the repo.
KAGGLE_INPUT_DIR = Path("/kaggle/input")
repo_is_input_dataset = REPO_ROOT == KAGGLE_INPUT_DIR or KAGGLE_INPUT_DIR in REPO_ROOT.parents
WORK_DIR = Path("/kaggle/working") if repo_is_input_dataset else REPO_ROOT

# Directory the Chroma database is persisted to (created if missing).
CHROMA_DIR = WORK_DIR / "chroma_db"
CHROMA_DIR.mkdir(parents=True, exist_ok=True)

print("Chroma target dir:", CHROMA_DIR)


In [None]:
# Put the repo root and its src/ folder on sys.path so that both the
# `src.pipeline...` and the `pipeline...` import styles can be resolved.
import sys

repo_root = str(REPO_ROOT)
repo_src = str(REPO_ROOT / "src")
# Each missing entry is pushed to the front in turn, so src/ ends up ahead of
# the repo root when both are newly added.
for candidate in (repo_root, repo_src):
    if candidate in sys.path:
        continue
    sys.path.insert(0, candidate)
print("Python path includes:", repo_root, "and", repo_src)


In [None]:
# Redirect the pipeline's configured paths: input files are looked up under
# REPO_ROOT, and the persisted Chroma DB is written to CHROMA_DIR.
try:
    from src.pipeline.config import paths
except ModuleNotFoundError:
    # Fall back to the import style that works when src/ itself is on sys.path.
    from pipeline.config import paths
from pathlib import Path

# Both embedding CSVs sit below the same citation_net folder.
citation_net_dir = REPO_ROOT / "src" / "citation_net"

# Inputs (read from REPO_ROOT)
paths.abs_metadata_path = str(REPO_ROOT / "data" / "abs_metadata.json")
paths.finetuned_questions_embeddings_csv = str(
    citation_net_dir / "finetune_embedding_model" / "combined_doi_questions_embeddings.csv"
)
paths.graphsage_embeddings_csv = str(citation_net_dir / "graphSAGE" / "graphsage_embeddings.csv")

# Output (persisted Chroma collection)
paths.chroma_persist_dir = str(CHROMA_DIR)

print("Chroma persist dir set to:", paths.chroma_persist_dir)
# Echo each configured input path together with whether it exists on disk.
for label, configured_path in (
    ("Emb CSV:", paths.finetuned_questions_embeddings_csv),
    ("Metadata:", paths.abs_metadata_path),
    ("GraphSAGE:", paths.graphsage_embeddings_csv),
):
    print(label, configured_path, Path(configured_path).exists())


In [None]:
# Report whether each expected input file exists under REPO_ROOT, then stop
# with an AssertionError if any of them is missing.
from pathlib import Path

embedding_model_dir = REPO_ROOT / "src" / "citation_net" / "finetune_embedding_model"
required_files = [
    REPO_ROOT / "data" / "abs_metadata.json",
    embedding_model_dir / "combined_doi_questions_embeddings.csv",  # only for ID list filtering
]
for required in required_files:
    print(required, "exists:", required.exists())

inputs_present = all(map(Path.exists, required_files))
assert inputs_present, "Missing required input files in Kaggle environment. Upload the repo or set REPO_ROOT accordingly."


In [None]:
# Build the Chroma collection.
# The builder is imported under whichever package layout is importable:
# `src.pipeline.index_chroma` first, then `pipeline.index_chroma` if the first
# import raises ModuleNotFoundError.
try:
    from src.pipeline.index_chroma import build_chroma_collection
except ModuleNotFoundError:
    from pipeline.index_chroma import build_chroma_collection

# Called with no arguments; presumably it reads its input files and persist
# directory from the pipeline's `paths` config — TODO confirm in index_chroma.py.
build_chroma_collection()


In [None]:
# Zip the Chroma DB for download.
import shutil

# Place the archive next to the Chroma directory (the output location) instead of
# under REPO_ROOT, which may be a read-only input dataset.
zip_path = CHROMA_DIR.parent / "chroma_db.zip"
# Remove any archive left over from a previous run before writing a new one.
if zip_path.exists():
    zip_path.unlink()
# make_archive appends the ".zip" extension itself, so pass the path without its suffix.
shutil.make_archive(str(zip_path.with_suffix('')), 'zip', CHROMA_DIR)
# Display whether the archive was created, plus the archive and source locations.
zip_path.exists(), zip_path, CHROMA_DIR


## Hướng dẫn tải về trên Kaggle
- Sau khi chạy xong, file zip nằm ở `/kaggle/working/chroma_db.zip`.
- Vào tab Output/Files của notebook Kaggle để download.

## Tùy chỉnh
- Muốn thay vị trí DB: đổi `CHROMA_DIR` ở ô cấu hình đường dẫn (ô code thứ hai, ngay sau ô cài đặt thư viện).
- Nếu bạn upload repo dưới dạng Input Dataset ở Kaggle, hãy đặt `REPO_ROOT = Path("/kaggle/input/<dataset-name>")` và copy code/inputs vào `/kaggle/working` trước khi build nếu cần.
- Nếu cần độ đo khoảng cách khác (cosine/l2), sửa trong `src/pipeline/index_chroma.py` tại chỗ tạo collection.
