In [None]:
# !pip -q uninstall -y langchain langchain-core langchain-community langchain-openai


In [1]:
!pip install -q "openai==1.52.0" "langchain==0.2.16" "langchain-community" "langchain-openai" "sentence-transformers==3.0.1" "qdrant-client==1.9.1" "faiss-cpu==1.8.0.post1" "beautifulsoup4==4.12.3" "lxml==5.3.0" "requests==2.32.3" "pypdf==4.3.1"

In [2]:
from bs4 import BeautifulSoup as Soup
import re, requests
from typing import List
from langchain_openai import ChatOpenAI

from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


In [3]:
SITES_HTML = [
    ("BIDS",     "https://bids-specification.readthedocs.io/en/stable/"),
    ("fMRIPrep", "https://fmriprep.readthedocs.io/en/stable/"),
]
SITES_PDF = [
    
    ("MRtrix", "https://media.readthedocs.org/pdf/mrtrix/latest/mrtrix.pdf"),
    ("SPM12",  "https://www.fil.ion.ucl.ac.uk/spm/doc/spm12_manual.pdf"),

]
MAX_DEPTH = 1  # keep tiny & fast

In [4]:
def clean_html_text(html: str) -> str:
    t = Soup(html, "html.parser").get_text(" ", strip=True)
    return re.sub(r"\s{2,}", " ", t)

def load_web_docs() -> List:
    docs = []
    for name, url in SITES_HTML:
        loader = RecursiveUrlLoader(url=url, max_depth=MAX_DEPTH,
                                    extractor=lambda x: clean_html_text(x))
        d = loader.load()
        for x in d:
            x.metadata["tool"] = name
        print(f"[{name}] pages:", len(d))
        docs.extend(d)
    return docs

def load_pdf_docs() -> List:
    docs = []
    for name, url in SITES_PDF:
        pdf_bytes = requests.get(url, timeout=120).content
        with open(f"{name}.pdf", "wb") as f: f.write(pdf_bytes)
        ld = PyPDFLoader(f"{name}.pdf").load()
        for x in ld:
            x.metadata["tool"] = name
            x.metadata["source"] = f"{name}.pdf"
        print(f"[{name}] pdf pages:", len(ld))
        docs.extend(ld)
    return docs

web_docs = load_web_docs()
pdf_docs = load_pdf_docs()
all_docs = web_docs + pdf_docs
print("TOTAL docs:", len(all_docs))


[BIDS] pages: 1
[fMRIPrep] pages: 1
[MRtrix] pdf pages: 378
[SPM12] pdf pages: 533
TOTAL docs: 913


In [5]:

section_splitter = RecursiveCharacterTextSplitter(
    separators=["\n## ", "\n### ", "\n# ", "\n\n", "\n", " "],
    chunk_size=1800, chunk_overlap=150
)

def sentenceish_pack(text, size=1100, overlap=150):
    sents = re.split(r'(?<=[\.\?\!])\s+', text.strip())
    cur, out = "", []
    for s in sents:
        if len(cur) + len(s) + 1 <= size:
            cur = (cur + " " + s).strip()
        else:
            if cur: out.append(cur)
            cur = s
    if cur: out.append(cur)
    stitched = []
    for i, ch in enumerate(out):
        tail = out[i-1][-overlap:] if i>0 else ""
        stitched.append((tail + " " + ch).strip())
    return stitched

def make_chunks(docs: List):
    sections = section_splitter.split_documents(docs)
    chunks = []
    for d in sections:
        for piece in sentenceish_pack(d.page_content):
            nd = d.copy()
            nd.page_content = piece
            chunks.append(nd)
    return chunks

chunks = make_chunks(all_docs)
print("TOTAL chunks:", len(chunks))

TOTAL chunks: 2794


In [6]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Initialize E5 embeddings (small & efficient)
embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/e5-small-v2",
    model_kwargs={'device': 'cpu'},  # Use 'cuda' if GPU available
    encode_kwargs={'normalize_embeddings': True}
)

# Create FAISS vector store from chunks
print("Creating embeddings... (this may take a few minutes)")
vectorstore = FAISS.from_documents(chunks, embeddings)

# Create retriever
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4}  # Return top 4 most relevant chunks
)

print("✓ Embeddings created and retriever ready!")

  embeddings = HuggingFaceEmbeddings(
  from tqdm.autonotebook import tqdm, trange


Creating embeddings... (this may take a few minutes)
✓ Embeddings created and retriever ready!


In [7]:
!pip -q install httpx==0.27.2

In [8]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

import os

# Set your OpenAI API key
os.environ["OPENAI_API_KEY"] = "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"
SYSTEM = """You are a precise RAG assistant for BIDS/fMRIPrep/MRtrix/SPM.
Use ONLY the provided context. If missing, say you don't know.
Be concise (<=6 sentences). Add a final 'Sources:' line with URLs/filenames."""

prompt = ChatPromptTemplate.from_messages([
    ("system", SYSTEM),
    ("human", "Question: {question}\n\nContext:\n{context}")
])

def format_docs(docs, max_chars=2000):
    out = []
    for d in docs:
        src = d.metadata.get("source") or d.metadata.get("url") or "unknown"
        txt = d.page_content[:max_chars]
        if len(d.page_content) > max_chars:
            txt = txt.rsplit(". ", 1)[0] + "."
        out.append(f"{txt}\n(Source: {src})")
    return "\n\n---\n\n".join(out)

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

def ask(q: str):
    # E5 works best with "query: " prefix for queries
    return chain.invoke("query: " + q)


In [11]:
print(os.environ.get("OPENAI_API_KEY")[:10])  # Shows first 10 chars

sk-proj-ew


In [9]:
# For MiniLM (no prefix)
print(chain.invoke("What outputs does fMRIPrep produce for BOLD runs?"))


fMRIPrep produces several outputs for BOLD runs, including preprocessed functional data, confound regressors, and visual reports. The preprocessed data is resampled onto standard spaces, such as EPI sampled to FreeSurfer surfaces and HCP Grayordinates. Additionally, fMRIPrep generates confound regressors that can be used in subsequent analyses, along with a description of these confounds. Visual reports are also created to facilitate quality control and interpretation of the preprocessing results. 

Sources: https://fmriprep.readthedocs.io/en/stable/


In [10]:
!pip -q install gradio

In [14]:


import gradio as gr

USE_E5 = True  # True only if you used E5 embeddings

custom_css = """
#title {font-size: 28px; font-weight: 800; letter-spacing:.2px}
#subtitle {opacity:.85; margin-top:-8px}
.gr-textbox textarea {font-size:16px}
"""

def answer_fn(q, hist):
    if not q.strip():
        return q, gr.update(), hist
    q_in = f"query: {q}" if USE_E5 else q
    ans = chain.invoke(q_in)

    new_block = f"**Q:** {q}\n\n{ans}"
    hist = [new_block] + hist          # <-- put newest at the TOP

    return "", "\n\n---\n\n".join(hist), hist

with gr.Blocks(title="Neuro-Docs RAG", css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("<div id='title'> Neuro-Docs RAG</div>"
                "<div id='subtitle'>BIDS · fMRIPrep · MRtrix · SPM</div>")
    state = gr.State([])  # keeps the running Q/A markdown

    # Input row (on top)
    with gr.Group():
        qbox = gr.Textbox(label=None, placeholder="Ask about BIDS / fMRIPrep…",
                          lines=2, autofocus=True)
        send = gr.Button("Ask", variant="primary")

    # Answers area (below input)
    out_md = gr.Markdown(label=None)

    qbox.submit(answer_fn, [qbox, state], [qbox, out_md, state])
    send.click(answer_fn, [qbox, state], [qbox, out_md, state])

demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://ddf0a2a58ea3a957fd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


