In [None]:
# Basic chunking method, since the tokenizer model (MiniLM) wasn't working

In [None]:
!pip install -q sentence-transformers faiss-cpu rank-bm25

In [2]:
from pathlib import Path
import json
# Path definitions: all artifacts are resolved relative to the current working directory.
ROOT = Path.cwd()
CHUNKS_PATH = ROOT / "artifacts/chunks/chunks.jsonl"
INDEX_DIR   = ROOT / "artifacts/index"
INDEX_DIR.mkdir(parents=True, exist_ok=True)

# Load chunk data: one JSON object per line (JSONL).
# texts[i] and meta[i] always describe the same chunk, so a single integer
# position identifies both the chunk text and its provenance.
texts, meta = [], []
with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank / whitespace-only lines; json.loads would raise
        # JSONDecodeError on them and abort the whole load.
        if not line.strip():
            continue
        r = json.loads(line)
        texts.append(r["text"])
        # "source_url" is optional in the record; the other keys are required.
        meta.append({"id": r["id"], "source_file": r["source_file"], "source_url": r.get("source_url","(unknown)")})

len(texts)


291

In [3]:
#creating model instance

In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np, faiss, json

# Embedding model used for both indexing (here) and query encoding (later cells).
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(MODEL_NAME)

# Encode every chunk. Embeddings are L2-normalized, so the inner product
# computed by the index below is the cosine similarity.
emb = model.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# Exact (flat) inner-product index sized to the embedding dimension.
embedding_dim = emb.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(emb)

# Persist the index together with the chunk metadata and the model name.
faiss_path = INDEX_DIR / "faiss.index"
faiss.write_index(index, str(faiss_path))

meta_path = INDEX_DIR / "meta.json"
meta_path.write_text(json.dumps(meta), encoding="utf-8")

model_name_path = INDEX_DIR / "model.txt"
model_name_path.write_text(MODEL_NAME, encoding="utf-8")

# Number of vectors stored in the index (displayed as the cell output).
index.ntotal


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

291

In [5]:
# Adding a BM25 lexical ranker, fused with the dense results (hybrid retrieval)

In [6]:
from rank_bm25 import BM25Okapi
import re, numpy as np


# Single source of truth for tokenization: the corpus and every query MUST be
# tokenized the same way, otherwise BM25 term matching silently degrades.
_TOKEN_RE = re.compile(r"\w+")

def _tokenize(text):
    r"""Lowercase ``text`` and return its ``\w+`` word tokens as a list."""
    return _TOKEN_RE.findall(text.lower())

# Index-time tokenization of every chunk, in the same order as ``texts``.
tokenized = [_tokenize(t) for t in texts]
bm25 = BM25Okapi(tokenized)

def bm25_search(q, k=50):
    """Return the ``k`` highest-scoring chunks for query ``q`` under BM25.

    Args:
        q: Query string; tokenized with the same tokenizer as the corpus.
        k: Maximum number of hits to return.

    Returns:
        A list of ``{"idx": int, "bm25": float}`` dicts sorted by descending
        BM25 score, where ``idx`` is a position into ``texts``. Chunks with a
        score of 0 are still returned if fewer than ``k`` chunks score higher.
    """
    scores = bm25.get_scores(_tokenize(q))
    # argsort is ascending; reverse it and keep the first k positions.
    idx = np.argsort(scores)[::-1][:k]
    return [{"idx": int(i), "bm25": float(scores[i])} for i in idx]

def dense_search(q, k=50):
    """Return up to ``k`` nearest chunks for query ``q`` from the FAISS index.

    Args:
        q: Query string; encoded with ``model`` using normalized embeddings.
        k: Number of neighbours requested from the index.

    Returns:
        A list of ``{"idx": int, "dense": float}`` dicts in the order FAISS
        returned them, where ``idx`` is the vector's position in the index and
        ``dense`` is the similarity score reported by the index. The list may
        be shorter than ``k`` when the index holds fewer than ``k`` vectors.
    """
    qv = model.encode([q], convert_to_numpy=True, normalize_embeddings=True)
    D, I = index.search(qv, k)
    # FAISS pads the label array with -1 when it cannot find k results.
    # Drop those: used as a Python index, -1 would silently select the LAST
    # chunk instead of failing.
    return [{"idx": int(i), "dense": float(s)} for i, s in zip(I[0], D[0]) if i >= 0]

def rrf_fuse(q, k=10, K=60, n_candidates=50):
    """Fuse BM25 and dense rankings for ``q`` with Reciprocal Rank Fusion.

    Each ranker contributes ``1 / (K + position)`` (position starting at 1)
    for every chunk it returns; contributions are summed per chunk.

    Args:
        q: Query string.
        k: Number of fused results to return.
        K: RRF smoothing constant; larger values flatten the rank weighting.
        n_candidates: Minimum number of hits requested from each ranker. The
            actual depth is ``max(n_candidates, k)`` so that a large ``k`` is
            not capped by the candidate pool.

    Returns:
        Up to ``k`` dicts with keys ``score_rrf``, ``source_url``,
        ``source_file`` and ``text``, sorted by descending fused score.
    """
    depth = max(n_candidates, k)
    # A BM25 score <= 0 means the chunk gained nothing from the query terms;
    # such hits are ordered arbitrarily among themselves, so giving them rank
    # credit would only inject noise into the fusion.
    lexical = [h for h in bm25_search(q, k=depth) if h["bm25"] > 0.0]
    # FAISS pads with idx -1 when it has fewer than `depth` vectors; -1 must
    # never reach meta[...] / texts[...] (it would select the last chunk).
    semantic = [h for h in dense_search(q, k=depth) if h["idx"] >= 0]

    rank = {}
    for results in (lexical, semantic):
        for pos, item in enumerate(results, start=1):
            rank[item["idx"]] = rank.get(item["idx"], 0.0) + 1.0/(K + pos)

    order = sorted(rank.items(), key=lambda x: x[1], reverse=True)[:k]
    return [
        {
            "score_rrf": float(score),
            "source_url": meta[i]["source_url"],
            "source_file": meta[i]["source_file"],
            "text": texts[i]
        }
        for i, score in order
    ]


In [7]:
from sentence_transformers import CrossEncoder

# Cross-encoder used to re-score (query, chunk text) pairs.
reranker = CrossEncoder("BAAI/bge-reranker-base")

def rerank(query, candidates, topk=5):
    """Re-score ``candidates`` against ``query`` and return the best ``topk``.

    Args:
        query: Query string.
        candidates: List of dicts, each with at least a ``"text"`` key. Every
            dict is updated in place with a ``"rerank"`` float score.
        topk: Maximum number of candidates to return.

    Returns:
        A new list with up to ``topk`` of the candidate dicts, sorted by
        descending ``"rerank"`` score. Empty if ``candidates`` is empty.
    """
    # Nothing to score: return early instead of sending an empty batch to the
    # cross-encoder, which is not guaranteed to accept one.
    if not candidates:
        return []
    pairs = [(query, c["text"]) for c in candidates]
    scores = reranker.predict(pairs)
    for c, s in zip(candidates, scores):
        c["rerank"] = float(s)
    return sorted(candidates, key=lambda x: x["rerank"], reverse=True)[:topk]


In [8]:
def search(q, k=5, use_hybrid=True, use_rerank=True):
    """Retrieve the top ``k`` chunks for query ``q``.

    Args:
        q: Query string.
        k: Number of results to return.
        use_hybrid: If True, candidates come from ``rrf_fuse`` (BM25 + dense);
            otherwise from the dense FAISS index only.
        use_rerank: If True, candidates are re-scored with ``rerank`` before
            truncation; otherwise the first ``k`` candidates are kept as is.

    Returns:
        A list of dicts with keys ``source_file``, ``source_url``, ``score``
        and ``text`` (first 400 characters, newlines replaced by spaces,
        ``"..."`` appended when truncated). ``score`` is the rerank score when
        reranking ran, else the dense score, else the RRF score.
    """
    n_cands = max(k*4, 20)   # get a wider candidate set
    if use_hybrid:
        cands = rrf_fuse(q, k=n_cands)
    else:
        # dense only
        qv = model.encode([q], convert_to_numpy=True, normalize_embeddings=True)
        D, I = index.search(qv, n_cands)
        # FAISS pads labels with -1 when it has fewer than n_cands vectors;
        # skip them so texts[-1] / meta[-1] are never returned by accident.
        cands = [{"text": texts[i], "source_url": meta[i]["source_url"], "source_file": meta[i]["source_file"], "score": float(s)}
                 for i, s in zip(I[0], D[0]) if i >= 0]
    if use_rerank and cands:
        cands = rerank(q, cands, topk=k)
    else:
        cands = cands[:k]
    # pretty view
    results = []
    for c in cands:
        # Fused candidates carry "score_rrf" rather than "score"; include it in
        # the fallback chain so hybrid search without reranking does not
        # report 0.0 for every hit.
        score = c.get("rerank", c.get("score", c.get("score_rrf", 0.0)))
        results.append({
            "source_file": c["source_file"],
            "source_url": c["source_url"],
            "score": float(score),
            "text": c["text"][:400].replace("\n", " ") + ("..." if len(c["text"])>400 else "")
        })
    return results

# quick tests
# Smoke-test queries: run the full hybrid + rerank pipeline and print the
# score, source file, source URL and text preview of the top 3 hits for each.
SAMPLE_QUERIES = (
    "what does recon-all -all do?",
    "how to run fmriprep participant level",
    "what outputs are generated by fmriprep",
)

for q in SAMPLE_QUERIES:
    print("\nQ:", q)
    hits = search(q, k=3, use_hybrid=True, use_rerank=True)
    for h in hits:
        header = f"  • {h['score']:.3f} | {h['source_file']}"
        url_line = f"    {h['source_url']}"
        print(header)
        print(url_line)
        print("   ", h["text"])



Q: what does recon-all -all do?
  • 0.896 | fmriprep.org_en_stable_faq.html.md
    (unknown-url) fmriprep.org_en_stable_faq.html.md
    the Linux kernel to kill processes as a response to running out of memory. Depending on the process killed by the kernel, *fMRIPrep* may crash with a `BrokenProcessPool` error or hang indefinitely, depending on settings. While we are working on finding a solution that does not run up against this bug, this may take some time. This can be most easily resolved by allocating more memory to the proces...
  • 0.814 | fmriprep.org_en_stable_workflows.html.md
    (unknown-url) fmriprep.org_en_stable_workflows.html.md
    ### Surface preprocessing[](https://fmriprep.org#surface-preprocessing) *fMRIPrep* uses [FreeSurfer](https://surfer.nmr.mgh.harvard.edu/) to reconstruct surfaces from T1w/T2w structural images. If enabled, several steps in the *fMRIPrep* pipeline are added or replaced. All surface preprocessing may be disabled with the `--fs-no-reconall` fla