In [2]:
# Optional one-time environment setup, left commented out; uncomment to install.
# NOTE(review): the last two lines overlap — the first langchain line lists a
# superset of the packages in the second.
#!pip install sentence-transformers
#!pip install langchain langchain-community unstructured chromadb
#!pip install langchain unstructured chromadb

In [8]:
# --- Configuration -----------------------------------------------------------
# Upstream repository and the local directory that holds its checkout.
GITHUB_REPO_URL = "https://github.com/devitocodes/devito.git"
CLONE_DIR = "devito"

# Directory in which the Chroma index of code cells is persisted.
PERSIST_CODE = "chroma_code_cells_hf"

# Chunking parameters.
# NOTE(review): not referenced by the cells visible here — presumably meant for
# RecursiveCharacterTextSplitter; confirm before relying on them.
CHUNK_SIZE, CHUNK_OVERLAP = 500, 100

import os, shutil
from pathlib import Path

import nbformat
from langchain.schema import Document
from langchain_community.document_loaders.python import PythonLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from collections import Counter

In [4]:
# # 3. Clone (or re-clone) the repo
# NOTE: this step is disabled, so the loading cell below expects CLONE_DIR to
# already contain a checkout of GITHUB_REPO_URL. Uncomment to fetch a fresh
# copy (any existing CLONE_DIR is deleted first).
# if os.path.exists(CLONE_DIR):
#     shutil.rmtree(CLONE_DIR)
# !git clone {GITHUB_REPO_URL} {CLONE_DIR}

In [9]:
# 4. Load ONLY code cells as Documents (no markdown)
#
# Builds `code_docs`, a list of Document objects whose metadata carries:
#   "source"     - path of the notebook / script the code came from
#   "cell_index" - position of the cell in its notebook, or -1 for a .py file
code_docs = []

Doc_path = CLONE_DIR+'/examples/seismic/'

# Fail fast: without this check a missing checkout only shows up as
# "Loaded 0 code-cell documents", with no hint about the real cause.
if not Path(Doc_path).is_dir():
    raise FileNotFoundError(
        f"{Doc_path!r} does not exist - clone {GITHUB_REPO_URL} into {CLONE_DIR!r} first."
    )

# 4a. Notebook code cells
# sorted(): rglob() order depends on the filesystem; sorting keeps the
# document order (and therefore the index contents) reproducible across runs.
for nb_path in sorted(Path(Doc_path).rglob("*.ipynb")):
    # Jupyter autosave copies would index the same cells a second time.
    if ".ipynb_checkpoints" in nb_path.parts:
        continue
    nb = nbformat.read(nb_path, as_version=4)
    for idx, cell in enumerate(nb.cells):
        # Keep non-empty code cells only; markdown/raw cells are skipped.
        if cell.cell_type == "code" and cell.source.strip():
            code_docs.append(Document(
                page_content=cell.source,
                metadata={
                    "source": str(nb_path),
                    "cell_index": idx
                }
            ))

# 4b. Optional: treat whole .py files as a single code "cell"
for py_path in sorted(Path(Doc_path).rglob("*.py")):
    if ".ipynb_checkpoints" in py_path.parts:
        continue
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # can fail on, or silently mangle, non-ASCII characters in source files.
    text = py_path.read_text(encoding="utf-8")
    if text.strip():
        code_docs.append(Document(
            page_content=text,
            metadata={
                "source": str(py_path),
                "cell_index": -1      # indicate "script" vs notebook
            }
        ))

print(f"Loaded {len(code_docs)} code‐cell documents.")


Loaded 616 code‐cell documents.


In [10]:
# ─── Replace OpenAIEmbeddings with HuggingFaceEmbeddings ───────────────────────
# Imported from langchain_community (which this notebook already uses for
# PythonLoader); the `langchain.embeddings` path is a deprecated re-export in
# recent LangChain releases. `Chroma` is already imported in the setup cell.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Create the HF embedding (no API key needed)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Stable per-cell ids. Without explicit ids every run of this cell stores the
# same cells again under fresh random ids in PERSIST_CODE, so the index fills
# up with duplicates that skew the per-source hit counts used at query time.
# With deterministic ids a re-run overwrites the existing entries instead.
# (source, cell_index) is unique per document as built by the loading cell.
cell_ids = [
    f'{doc.metadata["source"]}::{doc.metadata["cell_index"]}'
    for doc in code_docs
]

# Rebuild or reload your cell-level index exactly as before
vectordb_cells = Chroma.from_documents(
    documents=code_docs,
    embedding=embeddings,
    ids=cell_ids,
    persist_directory=PERSIST_CODE
)
# No explicit persist(): Chroma >= 0.4 writes to persist_directory
# automatically and the manual call is deprecated.
print("✅ Indexed code-cells locally with HuggingFaceEmbeddings.")


✅ Indexed code-cells locally with HuggingFaceEmbeddings.


  vectordb_cells.persist()


In [12]:
# ─── 6. Query + group + print continuous code ───────────────────────────
# Runs a similarity search over the persisted index, picks the source file
# that contributes the most matching cells, and prints that file's code cells
# in notebook order. Requires `embeddings` and `code_docs` from earlier cells.

# Reload with your HF embeddings
vectordb_cells = Chroma(
    persist_directory=PERSIST_CODE,
    embedding_function=embeddings
)

query = "forward wavefield simulation using devito"
# Grab more candidates so grouping is robust
raw_hits = vectordb_cells.similarity_search(query, k=30)
if not raw_hits:
    # Otherwise most_common(1)[0] below dies with an opaque IndexError.
    raise RuntimeError(
        f"No results for {query!r}; is the index at {PERSIST_CODE!r} empty?"
    )

# Find the source with the most hits.
# Each distinct cell is counted once, so duplicate index entries (e.g. left
# over from indexing the same cells more than once) cannot inflate a source.
# Hits are visited in rank order, so ties still go to the source whose first
# hit ranked highest (Counter.most_common keeps first-encountered order).
seen_cells = set()
source_counts = Counter()
for hit in raw_hits:
    cell_key = (hit.metadata["source"], hit.metadata.get("cell_index"))
    if cell_key in seen_cells:
        continue
    seen_cells.add(cell_key)
    source_counts[hit.metadata["source"]] += 1
best_source = source_counts.most_common(1)[0][0]
print(f"🏆 Best‐matching source: {best_source}\n")

# Pull & sort all its cells
selected = [d for d in code_docs if d.metadata["source"] == best_source]
selected.sort(key=lambda d: d.metadata["cell_index"])

# Print them continuously
print("--- Continuous code from top to bottom ---\n")
for doc in selected:
    print(doc.page_content)
    print("\n")  # spacer


  vectordb_cells = Chroma(


🏆 Best‐matching source: devito/examples/seismic/tutorials/06_elastic_varying_parameters.ipynb

--- Continuous code from top to bottom ---

from devito import *
from examples.seismic.source import RickerSource, Receiver, TimeAxis
from examples.seismic import plot_image, demo_model
import numpy as np

import matplotlib.pyplot as plt

from sympy import init_printing, latex
init_printing(use_latex='mathjax')

# Some ploting setup
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
plt.rc('xtick', labelsize=20) 
plt.rc('ytick', labelsize=20)


#NBVAL_IGNORE_OUTPUT
# Initial grid: 3km x 3km, with spacing 10m
nlayers = 5
so = 8
model = demo_model(preset='layers-elastic', nlayers=nlayers, shape=(301, 301), spacing=(10., 10.),
                   space_order=so)


#NBVAL_SKIP
aspect_ratio = model.shape[0]/model.shape[1]

plt_options_model = {'cmap': 'jet', 'extent': [model.origin[0], model.origin[0] + model.domain_size[0],
                                               model.origin[1] + m