In [None]:
# rag_pipeline.py
"""
End‑to‑end Retrieval‑Augmented Generation (RAG) pipeline
======================================================
This single script can be run top‑to‑bottom or imported as a module.  It
covers the **full stack** you need to build a local proof‑of‑concept QA
system:

1. **Data ingestion** – reads the pre‑cleaned paragraph file (combo_long.pkl)
2. **Chunking**        – word‑bounded windows with overlap
3. **Dual retrieval**  – BM25 (sparse) + MiniLM (dense) hybrid
4. **Vector store**    – FAISS index persisted to disk
5. **Lite evaluator**  – recall@k and MRR on a YAML gold file
6. **FastAPI server**  – /search endpoint returns passages & (optional) LLM answer

Dependencies
------------
```bash
pip install pandas numpy nltk faiss-cpu             \
            sentence-transformers rank-bm25         \
            pyyaml fastapi uvicorn[standard]         \
            tiktoken openai                         # openai is optional (LLM step); tiktoken is always imported
```
"""

In [2]:
!pip install pandas numpy nltk faiss-cpu sentence-transformers rank-bm25 pyyaml fastapi uvicorn[standard] tiktoken openai  

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp310-cp310-win_amd64.whl (15.0 MB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl (95 kB)
Collecting uvicorn[standard]
  Downloading uvicorn-0.34.2-py3-none-any.whl (62 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp310-cp310-win_amd64.whl (894 kB)
Collecting openai
  Downloading openai-1.77.0-py3-none-any.whl (662 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
Collecting scipy
  Downloading scipy-1.15.2-cp310-cp310-win_amd64.whl (41.2 MB)
Collecting starlette<0.47.0,>=0.40.0
  Downloading starlette-0.46.2-py3-none-any.whl (72 kB)
Collecting pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4
  Downloading pydantic-2.11.4-py3-none-any.whl (443 kB)
Collecting h11>=0.8
  Down

You should consider upgrading via the 'C:\Users\hp\Desktop\feb research\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [3]:
from __future__ import annotations
import os, re, json, unicodedata, functools, textwrap
from pathlib import Path
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
import faiss
from rank_bm25 import BM25Okapi
import tiktoken  # for accurate token counts per model

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# ============ CONFIG =========================================================
# Base directory for every artefact below. Taken from the VENV_ROOT environment
# variable when set; otherwise the parent of "venv" resolved against the
# current working directory.
VENV_ROOT      = Path(os.environ.get("VENV_ROOT", Path('venv').resolve().parent))
# Input: pickled paragraph table read by load_paragraphs().
PARA_PKL       = VENV_ROOT / "combo_long.pkl"
# Persisted FAISS index written/read by HybridRetriever.
FAISS_PATH     = VENV_ROOT / "para_index.faiss"
# JSON list with one metadata record per indexed chunk.
META_PATH      = VENV_ROOT / "para_meta.json"
# Tokenised chunks saved with np.save (pickled objects); BM25 is rebuilt from
# them on load.
BM25_PATH      = VENV_ROOT / "bm25.npy"
# Chat model name, overridable through the LLM_MODEL environment variable.
LLM_MODEL_NAME = os.environ.get("LLM_MODEL", "gpt-4o-mini")  # if using OpenAI
# Sentence-transformers model used for the dense embeddings.
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Default chunk size and overlap handed to split_text(). NOTE(review): despite
# the names, split_text() slices whitespace-separated words, not model tokens.
CHUNK_TOKEN    = 350
OVERLAP_TOKEN  = 40

In [5]:
def ensure_nltk():
    """Download the NLTK tokenizer data used by ``word_tokenize`` if missing.

    Both ``punkt`` and ``punkt_tab`` are checked: recent NLTK releases load
    the ``punkt_tab`` resource in ``word_tokenize``, while older releases use
    ``punkt``. Each resource is looked up locally first and only downloaded
    when the lookup fails.
    """
    for resource in ("punkt", "punkt_tab"):
        try:
            nltk.data.find(f"tokenizers/{resource}")
        except LookupError:
            # nltk.download reports failures (e.g. a resource unknown to an
            # older NLTK index) by returning False rather than raising, so a
            # missing optional resource does not abort the pipeline here.
            nltk.download(resource)

In [6]:
# 1. LOAD & PREP DATA
# -----------------------------------------------------------------------------

def load_paragraphs(path: Path = PARA_PKL) -> pd.DataFrame:
    """Read the pickled paragraph table from ``path``.

    Returns whatever ``pd.read_pickle`` yields for that file.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    # NOTE(security): unpickling can execute arbitrary code; only point this
    # at pickle files you produced yourself.
    if path.exists():
        return pd.read_pickle(path)
    raise FileNotFoundError(f"Expected paragraph pickle at {path}")

In [7]:
# 2. CHUNKING UTILITIES
# -----------------------------------------------------------------------------
_token_cache = {}


def _get_encoder(model: str):
    """Return the tiktoken encoding for ``model``, memoised per model name.

    The first request for a model name calls ``tiktoken.encoding_for_model``
    and stores the result in ``_token_cache``; later requests reuse it.
    """
    if model in _token_cache:
        return _token_cache[model]
    encoder = tiktoken.encoding_for_model(model)
    _token_cache[model] = encoder
    return encoder

def count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
    """Return the number of tokens ``text`` encodes to for ``model``."""
    token_ids = _get_encoder(model).encode(text)
    return len(token_ids)

def split_text(text: str, max_tokens: int = CHUNK_TOKEN, overlap: int = OVERLAP_TOKEN) -> List[str]:
    """Split text into overlapping chunks of whitespace-separated words.

    Lengths are counted in words (``text.split()``), not model tokens; the
    parameter names are kept for backward compatibility.

    Args:
        text: the text to split.
        max_tokens: maximum number of words per chunk; must be positive.
        overlap: number of words shared by consecutive chunks; must be
            smaller than ``max_tokens`` so the window always advances.

    Returns:
        The chunks in reading order. Empty or whitespace-only text yields an
        empty list. The last chunk is the first window that reaches the end
        of the text, so no chunk is entirely contained in its predecessor.

    Raises:
        ValueError: if ``max_tokens`` is not positive or ``overlap`` is not
            smaller than ``max_tokens`` (the window would never advance).
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if overlap >= max_tokens:
        raise ValueError("overlap must be smaller than max_tokens")

    words = text.split()
    step = max_tokens - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + max_tokens
        chunks.append(" ".join(words[start:end]))
        # Once a window reaches the end, a further window would only repeat
        # the tail that this chunk already contains.
        if end >= len(words):
            break
    return chunks


In [14]:

# 3. BUILD RETRIEVERS (BM25 + FAISS)
# -----------------------------------------------------------------------------
class HybridRetriever:
    """Combines sparse BM25 and dense MiniLM scores.

    ``build`` chunks the paragraph table, creates a BM25 model and a FAISS
    inner-product index over the chunk embeddings, and persists the FAISS
    index, the chunk metadata and the tokenised chunks to ``FAISS_PATH``,
    ``META_PATH`` and ``BM25_PATH``. ``search`` scores a query against both
    retrievers and returns the best chunks' metadata.
    """

    def __init__(self, emb_model: str = EMB_MODEL_NAME):
        """Load the sentence-transformers model; indexes stay empty until ``build``."""
        self.model = SentenceTransformer(emb_model)
        self.bm25  = None   # BM25Okapi over the tokenised chunks
        self.faiss = None   # FAISS index over the chunk embeddings
        self.meta  = []     # one metadata dict per chunk, aligned with both indexes

    # ── index construction ─────────────────────────────────────────────────
    def build(self, df: pd.DataFrame, force_rebuild=False):
        """Load the persisted indexes, or build and persist them from ``df``.

        Args:
            df: paragraph table; the ``Para_list``, ``DOI`` and ``Para_id``
                columns are read for every row.
            force_rebuild: rebuild even when all persisted artefacts exist.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: if ``df`` yields no chunks at all.
        """
        # search() tokenises queries with word_tokenize, so the tokenizer data
        # is required on the cache-load path as well as on the build path.
        ensure_nltk()

        # _load_index reads all three files, so all three must be present.
        cached = FAISS_PATH.exists() and META_PATH.exists() and BM25_PATH.exists()
        if cached and not force_rebuild:
            self._load_index()
            return self

        all_chunks, meta = [], []
        for _, row in df.iterrows():
            for chunk in split_text(row["Para_list"]):
                all_chunks.append(unicodedata.normalize("NFKD", chunk))
                meta.append({"doi": row.DOI, "para_id": int(row.Para_id)})
        if not all_chunks:
            # BM25Okapi divides by the corpus size; fail with a clear message.
            raise ValueError("No text chunks produced; cannot build an index from an empty corpus")

        # BM25
        tokenized = [word_tokenize(c.lower()) for c in all_chunks]
        self.bm25 = BM25Okapi(tokenized)
        # Token lists have different lengths; recent NumPy refuses to convert
        # such ragged input implicitly, so fill an object array explicitly.
        token_arr = np.empty(len(tokenized), dtype=object)
        for pos, tokens in enumerate(tokenized):
            token_arr[pos] = tokens
        np.save(BM25_PATH, token_arr, allow_pickle=True)

        # Dense
        emb = self.model.encode(
            all_chunks, batch_size=64, show_progress_bar=True, convert_to_numpy=True
        ).astype("float32")
        index = faiss.IndexFlatIP(emb.shape[1])
        index.add(emb)
        # The FAISS binding takes a plain string file name, not a Path.
        faiss.write_index(index, str(FAISS_PATH))

        # meta
        META_PATH.write_text(json.dumps(meta))
        self.faiss, self.meta = index, meta
        return self

    def _load_index(self):
        """Restore the FAISS index, metadata and BM25 model from disk.

        Raises:
            RuntimeError: if the three persisted artefacts do not describe
                the same number of chunks.
        """
        self.faiss = faiss.read_index(str(FAISS_PATH))
        self.meta  = json.loads(META_PATH.read_text())
        # NOTE(security): allow_pickle=True unpickles the file; only load
        # artefacts written by build().
        tokenized  = [list(doc) for doc in np.load(BM25_PATH, allow_pickle=True)]
        if not (self.faiss.ntotal == len(self.meta) == len(tokenized)):
            raise RuntimeError(
                "Persisted index files are out of sync; rebuild with force_rebuild=True"
            )
        self.bm25  = BM25Okapi(tokenized)

    # ── querying ───────────────────────────────────────────────────────────
    def search(self, query: str, top_k: int = 6, bm25_weight: float = 0.4):
        """Return up to ``top_k`` chunks ranked by a dense/BM25 score mix.

        The ``top_k`` dense hits are scored as
        ``(1 - bm25_weight) * dense + bm25_weight * bm25``. If the dense
        search yields fewer than ``top_k`` hits, the best remaining BM25
        chunks fill the gap with their raw BM25 score.

        NOTE(review): the two scores are combined without normalisation, and
        BM25 fill-ins are not weighted; confirm this is the intended ranking.

        Args:
            query: free-text query.
            top_k: maximum number of results; non-positive values give ``[]``.
            bm25_weight: weight of the BM25 score in the linear combination.

        Returns:
            A list of dicts, best first: the chunk's stored metadata plus a
            ``"score"`` key. The chunk text itself is not part of the
            metadata written by ``build``.

        Raises:
            RuntimeError: if called before ``build``.
        """
        if self.faiss is None or self.bm25 is None:
            raise RuntimeError("HybridRetriever.search() called before build()")
        if top_k <= 0:
            return []

        q_emb = self.model.encode([query], convert_to_numpy=True).astype("float32")
        dense_dist, dense_idx = self.faiss.search(q_emb, top_k)
        # FAISS pads missing results with index -1; drop those placeholders so
        # they cannot be used as (wrapping) list indices below.
        dense_scores = {
            int(i): float(s) for i, s in zip(dense_idx[0], dense_dist[0]) if i >= 0
        }

        sparse_scores = self.bm25.get_scores(word_tokenize(query.lower()))

        # linear combination
        hybrid = {
            idx: float((1 - bm25_weight) * dscore + bm25_weight * sparse_scores[idx])
            for idx, dscore in dense_scores.items()
        }

        # fill extras from BM25 if fewer than k
        if len(hybrid) < top_k:
            for idx in np.argsort(-sparse_scores):
                if len(hybrid) >= top_k:
                    break
                idx = int(idx)
                if idx not in hybrid:
                    hybrid[idx] = float(sparse_scores[idx])

        # return sorted
        best = sorted(hybrid.items(), key=lambda x: x[1], reverse=True)[:top_k]
        return [{**self.meta[idx], "score": score} for idx, score in best]
