In [1]:
# 1. Install required packages (run once)
#!pip install langchain langchain-community unstructured chromadb
#!pip install langchain unstructured chromadb

In [27]:
# 2. Configuration — edit these:

# Git URL of the repository whose example code is indexed.
GITHUB_REPO_URL = "https://github.com/devitocodes/devito.git"

# Local directory holding the checkout; the loader cell reads from beneath it.
CLONE_DIR = "devito"

# NOTE(review): presumably the Chroma persistence directory — confirm against
# the indexing cell, which must reference this same name.
PERSIST_CODE = "chroma_code_cells"

# NOTE(review): not referenced by the visible cells; presumably intended for
# RecursiveCharacterTextSplitter — confirm before relying on them.
CHUNK_SIZE, CHUNK_OVERLAP = 500, 100

import os, shutil
from pathlib import Path

import nbformat
from langchain.schema import Document
from langchain_community.document_loaders.python import PythonLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from collections import Counter

In [2]:
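# 3. Clone (or re-clone) the repo — currently disabled; uncomment the lines below to run.
#    Note: when enabled, this deletes any existing CLONE_DIR before cloning.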
# if os.path.exists(CLONE_DIR):
#     shutil.rmtree(CLONE_DIR)
# !git clone {GITHUB_REPO_URL} {CLONE_DIR}

In [16]:
# 4. Load ONLY code cells as Documents (no markdown)
# Each Document carries its origin in metadata: "source" is the file path and
# "cell_index" is the cell position within a notebook (-1 for a whole script).
code_docs = []

# Root directory (inside the checkout) that is searched recursively.
Doc_path = CLONE_DIR+'/examples/seismic/'

# 4a. Notebook code cells
# Paths are sorted so the document order is reproducible across runs, and
# Jupyter autosave copies under .ipynb_checkpoints are skipped so a notebook
# is never indexed twice.
for nb_path in sorted(Path(Doc_path).rglob("*.ipynb")):
    if ".ipynb_checkpoints" in nb_path.parts:
        continue
    nb = nbformat.read(nb_path, as_version=4)
    for idx, cell in enumerate(nb.cells):
        # Keep only code cells that contain something other than whitespace.
        if cell.cell_type == "code" and cell.source.strip():
            code_docs.append(Document(
                page_content=cell.source,
                metadata={
                    "source": str(nb_path),
                    "cell_index": idx
                }
            ))

# 4b. Optional: treat whole .py files as single code “cell”
for py_path in sorted(Path(Doc_path).rglob("*.py")):
    # Decode explicitly as UTF-8 (the default encoding of Python source files)
    # instead of relying on the platform locale.
    text = py_path.read_text(encoding="utf-8")
    if text.strip():
        code_docs.append(Document(
            page_content=text,
            metadata={
                "source": str(py_path),
                "cell_index": -1      # indicate “script” vs notebook
            }
        ))

print(f"Loaded {len(code_docs)} code‐cell documents.")


Loaded 616 code‐cell documents.


In [17]:
# Report whether the OpenAI key is configured *before* reading it, so the
# status line is printed even when the key is missing.
key_present = "OPENAI_API_KEY" in os.environ
print("OPENAI_API_KEY found:", key_present)
if not key_present:
    # Same exception type as a failed os.environ[...] lookup, with a clearer hint.
    raise KeyError("OPENAI_API_KEY is not set; export it before starting the kernel")
api_key = os.environ["OPENAI_API_KEY"]


OPENAI_API_KEY found: True


In [25]:
# 5. Embed every code-cell Document and index it in a persisted Chroma store.
embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

# Deterministic ids built from the metadata set by the loader cell
# ("source" path + "cell_index"). With stable ids, re-running this cell should
# overwrite existing entries instead of appending duplicates that would skew
# the per-source hit counts later on.
# NOTE(review): relies on the LangChain Chroma wrapper upserting by id — confirm
# for the installed version.
doc_ids = [
    f'{doc.metadata["source"]}::{doc.metadata["cell_index"]}'
    for doc in code_docs
]

vectordb_cells = Chroma.from_documents(
    documents=code_docs,        # 📄 our list of code‐cell Documents
    embedding=embeddings,       # 🔢 the embedding function
    ids=doc_ids,                # 🔑 stable ids -> idempotent re-runs
    persist_directory=PERSIST_CODE  # 📂 where to persist (defined in the config cell)
)
vectordb_cells.persist()
print("Indexed code‐cells in Chroma at:", PERSIST_CODE)

Indexed code‐cells in Chroma at: chroma_code_cells


In [29]:
# 6. Find the notebook/script that best matches a query and print its code in order.
query = "forward wavefield simulation using devito"
# 6a. Get top 30 code cells matching the query
raw_hits = vectordb_cells.similarity_search(query, k=30)
if not raw_hits:
    # Without this guard the most_common(...)[0] lookup below fails with an
    # uninformative "list index out of range".
    raise IndexError(f"No documents matched query {query!r}; is the vector store empty?")

# 6b. Identify which notebook/script appears most often
# (ties are resolved in favour of the source encountered first among the hits)
counts = Counter(hit.metadata["source"] for hit in raw_hits)
best_source = counts.most_common(1)[0][0]
print(f"Best match source: {best_source}")

# 6c. Pull all code_docs from that source and sort by cell_index
# The full in-memory list is used, not just the hits, so cells that did not
# match the query themselves are still included.
selected = [
    d for d in code_docs 
    if d.metadata["source"] == best_source
]
selected.sort(key=lambda d: d.metadata["cell_index"])

# 6d. Print them in order, giving you the continuous code
print("\n--- Continuous code from top to bottom ---\n")
for doc in selected:
    print(doc.page_content)
    print("\n")      # spacer between cells


Best match source: devito/examples/seismic/tutorials/05_staggered_acoustic.ipynb

--- Continuous code from top to bottom ---

from devito import *
from examples.seismic.source import DGaussSource, TimeAxis
from examples.seismic import plot_image
import numpy as np

from sympy import init_printing, latex
init_printing(use_latex='mathjax')


# Initial grid: 1km x 1km, with spacing 100m
extent = (2000., 2000.)
shape = (81, 81)
x = SpaceDimension(name='x', spacing=Constant(name='h_x', value=extent[0]/(shape[0]-1)))
z = SpaceDimension(name='z', spacing=Constant(name='h_z', value=extent[1]/(shape[1]-1)))
grid = Grid(extent=extent, shape=shape, dimensions=(x, z))


# Timestep size from Eq. 7 with V_p=6000. and dx=100
t0, tn = 0., 200.
dt = 1e2*(1. / np.sqrt(2.)) / 60.
time_range = TimeAxis(start=t0, stop=tn, step=dt)

src = DGaussSource(name='src', grid=grid, f0=0.01, time_range=time_range, a=0.004)
src.coordinates.data[:] = [1000., 1000.]


#NBVAL_SKIP

src.show()


# Now we create the veloc