In [1]:
# 1. Install required packages (run once)
#!pip install langchain langchain-community unstructured chromadb
#!pip install langchain unstructured chromadb

In [2]:
# 2. Configuration — edit these:
GITHUB_REPO_URL = "https://github.com/devitocodes/devito.git"
CLONE_DIR        = "devito" 
PERSIST_DIR     = "chroma_index" 
CHUNK_SIZE       = 500
CHUNK_OVERLAP    = 100

import os, shutil
from pathlib import Path

import nbformat
from langchain.schema import Document
from langchain_community.document_loaders.python import PythonLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma


In [11]:
# 3. Clone (or re-clone) the repo
if os.path.exists(CLONE_DIR):
    shutil.rmtree(CLONE_DIR)
!git clone {GITHUB_REPO_URL} {CLONE_DIR}

In [3]:
# ─── 4. Load all .ipynb & .py into LangChain Documents ──────────────────────────
def load_repo_documents(repo_path: str):
    docs = []
    # 4a. Notebooks via nbformat
    for nb_path in Path(repo_path).rglob("*.ipynb"):
        nb = nbformat.read(nb_path, as_version=4)
        for idx, cell in enumerate(nb.cells):
            if cell.cell_type in ("markdown", "code"):
                docs.append(
                    Document(
                        page_content=cell.source,
                        metadata={
                            "source": str(nb_path),
                            "cell_type": cell.cell_type,
                            "cell_index": idx
                        }
                    )
                )
    # 4b. Python scripts via PythonLoader
    for py_path in Path(repo_path).rglob("*.py"):
        loader = PythonLoader(str(py_path))
        docs.extend(loader.load())
    return docs

Doc_path = CLONE_DIR+'/examples/seismic/'
docs = load_repo_documents(Doc_path)
print(f"Loaded {len(docs)} documents (cells + scripts).")

Loaded 1133 documents (cells + scripts).


In [5]:
api_key = os.environ["OPENAI_API_KEY"]
print("OPENAI_API_KEY found:", "OPENAI_API_KEY" in os.environ)


OPENAI_API_KEY found: True


In [6]:
# ─── 5. Chunk, embed, and persist in Chroma ──────────────────────────────────────
# 5a. Chunk
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)
chunks = splitter.split_documents(docs)
print(f"Split into {len(chunks)} chunks.")

# 5b. Embed & index
embeddings = OpenAIEmbeddings()  # needs OPENAI_API_KEY in your env
vectordb   = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=PERSIST_DIR
)
vectordb.persist()
print("Chroma index persisted.")

Split into 2671 chunks.


  embeddings = OpenAIEmbeddings()  # needs OPENAI_API_KEY in your env


Chroma index persisted.


  vectordb.persist()


In [7]:
# ─── Inspect your chunks ──────────────────────────────────────────────────────────
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

# (Re-load Chroma and embeddings if you restarted the kernel)
embeddings = OpenAIEmbeddings()  
vectordb   = Chroma(
    persist_directory=PERSIST_DIR, 
    embedding_function=embeddings
)

  vectordb   = Chroma(


In [8]:
# (Re-split your docs if `chunks` isn't in memory)
# ––– you can skip this block if you still have `chunks` from before –––
from pathlib import Path
import nbformat
from langchain.schema import Document
from langchain_community.document_loaders.python import PythonLoader

def load_repo_documents(repo_path: str):
    docs = []
    # Notebooks
    for nb_path in Path(repo_path).rglob("*.ipynb"):
        nb = nbformat.read(nb_path, as_version=4)
        for idx, cell in enumerate(nb.cells):
            if cell.cell_type in ("markdown", "code"):
                docs.append(Document(page_content=cell.source))
    # Python files
    for py_path in Path(repo_path).rglob("*.py"):
        docs.extend(PythonLoader(str(py_path)).load())
    return docs



In [9]:

docs    = load_repo_documents(Doc_path)
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks  = splitter.split_documents(docs)

# 1️⃣ How many chunks?  Length of each?
total = len(chunks)
lengths = [len(c.page_content) for c in chunks]

print(f"🔖 Total chunks: {total}")
for i in range(min(5, total)):
    print(f"  • Chunk {i}: {lengths[i]} characters")

# Show a passage from chunk 0
print("\n📄 Sample from chunk 0:\n")
print(chunks[0].page_content[:500].rstrip() + "\n…")

# 2️⃣ Embedding format & size
# Embed that same chunk to inspect the vector
vec = embeddings.embed_documents([chunks[0].page_content])[0]

print(f"\n🔢 Embedding type: {type(vec)}")
print(f"🔢 Embedding length (dimensionality): {len(vec)}")
print(f"🔢 First 10 dims: {vec[:10]}")

🔖 Total chunks: 2671
  • Chunk 0: 310 characters
  • Chunk 1: 141 characters
  • Chunk 2: 322 characters
  • Chunk 3: 213 characters
  • Chunk 4: 69 characters

📄 Sample from chunk 0:

import numpy as np
from scipy.special import hankel2
from examples.seismic.acoustic import AcousticWaveSolver
from examples.seismic import Model, RickerSource, Receiver, TimeAxis, AcquisitionGeometry
from devito import set_log_level

import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
…

🔢 Embedding type: <class 'list'>
🔢 Embedding length (dimensionality): 1536
🔢 First 10 dims: [0.007253124227135887, 0.018491968126319654, 0.016193363477338282, -0.024400555950403212, -0.015766059650513142, 0.03150264853009067, 0.013828455181687098, -0.01881612984280546, -0.010829956975887001, -0.0348916158243366]


In [30]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

vectordb = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=OpenAIEmbeddings()
)
results = vectordb.similarity_search("forward wavefield simulation using devito", k=10, filter={"cell_type": "code"})

In [31]:
print(len(results))

10


In [32]:
print(results[0])
print('\n')
print(results[1])
print('\n')
print(results[2])

page_content='from devito import *
from examples.seismic.source import WaveletSource, RickerSource, GaborSource, TimeAxis
from examples.seismic import plot_image
import numpy as np

from sympy import init_printing, latex
init_printing(use_latex='mathjax')' metadata={'cell_type': 'code', 'source': 'devito/examples/seismic/tutorials/06_elastic.ipynb', 'cell_index': 1}


page_content='from devito import Function
from examples.seismic import Receiver

# Serial FWI objective function
def fwi_objective_single_shot(model, geometry, d_obs):

    # Devito objects for gradient and data residual
    grad = Function(name="grad", grid=model.grid)
    residual = Receiver(name='rec', grid=model.grid,
                        time_range=geometry.time_axis, 
                        coordinates=geometry.rec_positions)
    solver = AcousticWaveSolver(model, geometry, space_order=4)' metadata={'cell_type': 'code', 'source': 'devito/examples/seismic/tutorials/04_dask.ipynb', 'cell_index': 23}


page_content

In [33]:
# # Post‐filter for lines you expect, e.g. look for "TimeFunction" or "Operator"
# sims = [d for d in results if "TimeFunction" in d.page_content or "Operator" in d.page_content]

# for i, doc in enumerate(sims[:3], 1):
#     print(f"\n=== Simulation snippet #{i} from {doc.metadata['source']} ===\n")
#     print(doc.page_content)


=== Simulation snippet #1 from devito/examples/seismic/tutorials/06_elastic_varying_parameters.ipynb ===

from devito import div45, grad45

all_node = [[NODE for _ in range(model.grid.dim)] for _ in range(model.grid.dim)]
all_vert = [model.grid.dimensions for _ in range(model.grid.dim)] 

so = 8
v_rsfd = VectorTimeFunction(name='vr', grid=model.grid, space_order=so, time_order=1, staggered=all_vert)
tau_rsfd = TensorTimeFunction(name='tr', grid=model.grid, space_order=so, time_order=1, staggered=all_node)

# The source injection term
src_xx = src.inject(field=v_rsfd.forward.diagonal(), expr=s*src)


In [35]:
keywords = ["demo_model", "TimeFunction", "Operator", "AcquisitionGeometry"]
sims = [d for d in results 
        if any(kw in d.page_content for kw in keywords)]

# 3. Print out the first few full snippets
for i, doc in enumerate(sims[:5], 1):
    print(f"\n=== Simulation snippet #{i} ===")
    print(f"Source: {doc.metadata['source']}  Cell: {doc.metadata['cell_index']}\n")
    print(doc.page_content)


=== Simulation snippet #1 ===
Source: devito/examples/seismic/tutorials/06_elastic_varying_parameters.ipynb  Cell: 37

from devito import div45, grad45

all_node = [[NODE for _ in range(model.grid.dim)] for _ in range(model.grid.dim)]
all_vert = [model.grid.dimensions for _ in range(model.grid.dim)] 

so = 8
v_rsfd = VectorTimeFunction(name='vr', grid=model.grid, space_order=so, time_order=1, staggered=all_vert)
tau_rsfd = TensorTimeFunction(name='tr', grid=model.grid, space_order=so, time_order=1, staggered=all_node)

# The source injection term
src_xx = src.inject(field=v_rsfd.forward.diagonal(), expr=s*src)

=== Simulation snippet #2 ===
Source: devito/examples/seismic/tutorials/17_fourier_mode.ipynb  Cell: 3

from devito import *

from examples.seismic import demo_model, AcquisitionGeometry, plot_velocity

import matplotlib.pyplot as plt
from IPython.display import Code
