<a href="https://colab.research.google.com/github/prrmzz/RAG-for-Iranian-High-School-Biology-Textbook/blob/main/RAG_biology_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- PIP (ML + Persian text utils + FAISS) ---
!pip -q install -U "transformers==4.44.2" "sentence-transformers==3.0.1" \
                   "bitsandbytes>=0.43.1,<0.47" "accelerate>=0.33.0" \
                   faiss-cpu rank-bm25 arabic-reshaper python-bidi

# --- APT (OCR fallback & PDF tools; optional but recommended) ---
!apt -yq install ocrmypdf tesseract-ocr-fas poppler-utils >/dev/null

# Sanity print
import importlib.metadata as im, torch, sys
def v(p):
    try: return im.version(p)
    except: return "not-installed"
print("torch:", torch.__version__)
print("transformers:", v("transformers"))
print("sentence-transformers:", v("sentence-transformers"))
print("faiss-cpu:", v("faiss-cpu"))
print("bitsandbytes:", v("bitsandbytes"))
print("accelerate:", v("accelerate"))
print("arabic-reshaper:", v("arabic-reshaper"), "| python-bidi:", v("python-bidi"))
print("Python:", sys.version)
print("CUDA available:", torch.cuda.is_available())




debconf: apt-extracttemplates failed: No such file or directory
E: Sub-process /usr/sbin/dpkg-preconfigure --apt || true received signal 2.
E: Failure running script /usr/sbin/dpkg-preconfigure --apt || true
torch: 2.8.0+cu126
transformers: 4.44.2
sentence-transformers: 3.0.1
faiss-cpu: 1.12.0
bitsandbytes: 0.46.1
accelerate: 1.10.1
arabic-reshaper: 3.0.0 | python-bidi: 0.6.6
Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
CUDA available: True


In [2]:
from pathlib import Path
import os, json, re, shutil, subprocess, uuid
import fitz  # PyMuPDF
from tqdm import tqdm
import arabic_reshaper
from bidi.algorithm import get_display

# --- Path to your Drive folder (change if needed) ---
FOLDER = "/content/drive/MyDrive/biologybooks"  # e.g. contains: 1.pdf, 2.pdf, 3.pdf
OUT_DIR = Path("/content/rag_fa")
OCR_DIR = OUT_DIR / "ocr_cache"
OUT_DIR.mkdir(parents=True, exist_ok=True)
OCR_DIR.mkdir(parents=True, exist_ok=True)

def fa_normalize(s: str) -> str:
    # Basic Persian normalization (no hazm needed)
    s = s.replace("\u064a", "ی").replace("\u0649", "ی").replace("\u06cc", "ی")  # yeh
    s = s.replace("\u0643", "ک").replace("\u06a9", "ک")  # keh
    # remove diacritics
    s = re.sub(r"[\u0610-\u061A\u064B-\u065F\u06D6-\u06ED]", "", s)
    # collapse spaces
    s = re.sub(r"[ \t\r\f\v]+", " ", s)
    s = re.sub(r"\s*\n\s*", "\n", s)
    return s.strip()

def fa_shape_bidi(s: str) -> str:
    # Many extractors already return correct order; but if you see reversed text,
    # enable shaping+bidi. We keep it on by default for consistency.
    try:
        reshaped = arabic_reshaper.reshape(s)
        return get_display(reshaped)
    except Exception:
        return s

def extract_with_pymupdf(pdf_path: Path) -> list:
    """Return list[str] per page (UTF-8)."""
    pages = []
    with fitz.open(pdf_path) as doc:
        for p in range(len(doc)):
            text = doc[p].get_text("text") or ""
            text = fa_normalize(text)
            text = fa_shape_bidi(text)
            pages.append(text)
    return pages

def ocr_pdf(src: Path, dst: Path):
    # Fast, language=fas; keep vector, deskew, clean
    cmd = [
        "ocrmypdf",
        "--language", "fas",
        "--skip-text",               # don't OCR pages that already have text
        "--rotate-pages", "--deskew",
        "--optimize", "1",
        "--output-type", "pdf",
        str(src), str(dst)
    ]
    subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

def maybe_ocr_then_extract(pdf_path: Path) -> list:
    pages = extract_with_pymupdf(pdf_path)
    total_chars = sum(len(p) for p in pages)
    if total_chars >= 500:   # likely digitally searchable already
        return pages

    # OCR fallback (scanned PDF)
    ocr_path = OCR_DIR / f"{pdf_path.stem}.ocr.pdf"
    if not ocr_path.exists():
        try:
            ocr_pdf(pdf_path, ocr_path)
        except subprocess.CalledProcessError as e:
            print(f"⚠️ OCR failed for {pdf_path.name}: {e}. Returning raw extract.")
            return pages
    return extract_with_pymupdf(ocr_path)

def chunk_pages(pages: list, doc_name: str, max_chars=900, overlap=150):
    """Simple paragraph-based chunking with overlap."""
    chunks = []
    for i, page in enumerate(pages, start=1):
        # split by paragraph boundaries
        parts = [p.strip() for p in re.split(r"\n{2,}", page) if p.strip()]
        buf, clen = [], 0
        for par in parts:
            if clen + len(par) + 1 <= max_chars:
                buf.append(par); clen += len(par) + 1
            else:
                if buf:
                    text = "\n".join(buf).strip()
                    if text:
                        chunks.append({"doc": doc_name, "page": i, "text": text})
                    # start new buffer with overlap from the end
                    tail = text[-overlap:] if overlap and len(text) > overlap else ""
                    buf = [tail, par] if tail else [par]
                    clen = sum(len(s)+1 for s in buf)
                else:
                    # very long single paragraph
                    for j in range(0, len(par), max_chars - overlap):
                        piece = par[j:j + (max_chars - overlap)]
                        if j > 0 and overlap:
                            piece = (par[max(0, j-overlap):j] + piece)
                        piece = piece.strip()
                        if piece:
                            chunks.append({"doc": doc_name, "page": i, "text": piece})
                    buf, clen = [], 0
        if buf:
            text = "\n".join(buf).strip()
            if text:
                chunks.append({"doc": doc_name, "page": i, "text": text})
    return chunks

# --- Run extraction over folder ---
folder = Path(FOLDER)
pdfs = sorted([p for p in folder.iterdir() if p.suffix.lower()==".pdf"])
print("PDFs:", [p.name for p in pdfs])

all_chunks = []
report = []
for pdf in tqdm(pdfs, desc="Processing PDFs"):
    pages = maybe_ocr_then_extract(pdf)
    chs = chunk_pages(pages, pdf.name, max_chars=900, overlap=140)
    all_chunks.extend(chs)
    report.append((pdf.name, sum(len(x) for x in pages), len(pages), len(chs)))

with open(OUT_DIR/"chunks.jsonl", "w", encoding="utf-8") as f:
    for c in all_chunks:
        f.write(json.dumps(c, ensure_ascii=False) + "\n")

print("Extract report:", report)
print("Total chunks:", len(all_chunks))
print("Saved:", str(OUT_DIR/"chunks.jsonl"))


PDFs: ['1.pdf', '2.pdf', '3.pdf']


Processing PDFs: 100%|██████████| 3/3 [00:05<00:00,  1.77s/it]

Extract report: [('1.pdf', 196869, 120, 313), ('2.pdf', 230954, 168, 377), ('3.pdf', 228017, 136, 365)]
Total chunks: 1055
Saved: /content/rag_fa/chunks.jsonl





In [3]:
import os, json
import numpy as np
import faiss
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# If you haven't already done login() in another cell:
# from getpass import getpass
# from huggingface_hub import login
# login(getpass("HF token: "))

# --- Load chunks ---
CHUNKS_PATH = "/content/rag_fa/chunks.jsonl"
chunks = [json.loads(line) for line in open(CHUNKS_PATH, encoding="utf-8")]
texts = [c["text"] for c in chunks]

# --- BM25 (light lexical recall; optional but cheap) ---
bm25 = BM25Okapi([t.split() for t in texts])

# --- Embeddings: GTE multilingual (strong for Persian) ---
device = "cuda" if torch.cuda.is_available() else "cpu"
embedder = SentenceTransformer("Alibaba-NLP/gte-multilingual-base", device=device, trust_remote_code=True)
# direct sentence list; normalize for IP similarity
emb = embedder.encode(texts, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
emb = emb.astype("float32")

# --- FAISS index (inner product on normalized vectors == cosine) ---
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)
print("FAISS index size:", index.ntotal)

# --- Reranker (cross-encoder; multilingual, efficient) ---
from sentence_transformers import CrossEncoder
reranker = CrossEncoder("BAAI/bge-reranker-v2-m3", device=device, trust_remote_code=True)

# --- Qwen loader with safe fallbacks (4-bit -> 8-bit -> fp16 -> cpu) ---
def load_qwen(prefer_7b=True):
    primary = "Qwen/Qwen2.5-7B-Instruct"
    secondary = "Qwen/Qwen2.5-3B-Instruct"
    order = [primary, secondary] if prefer_7b else [secondary, primary]
    has_gpu = torch.cuda.is_available()

    # 4-bit
    try:
        import bitsandbytes as bnb  # noqa
        if has_gpu:
            qcfg = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            )
            for mid in order:
                tok = AutoTokenizer.from_pretrained(mid, use_fast=True, trust_remote_code=True)
                llm = AutoModelForCausalLM.from_pretrained(
                    mid, device_map="auto", quantization_config=qcfg,
                    torch_dtype=torch.bfloat16, trust_remote_code=True
                )
                return tok, llm, mid, "4bit"
    except Exception as e:
        print("⚠️  4-bit skipped:", e)

    # 8-bit
    try:
        import bitsandbytes as bnb  # noqa
        if has_gpu:
            qcfg = BitsAndBytesConfig(load_in_8bit=True)
            for mid in order:
                tok = AutoTokenizer.from_pretrained(mid, use_fast=True, trust_remote_code=True)
                llm = AutoModelForCausalLM.from_pretrained(
                    mid, device_map="auto", quantization_config=qcfg,
                    torch_dtype=torch.float16, trust_remote_code=True
                )
                return tok, llm, mid, "8bit"
    except Exception as e:
        print("⚠️  8-bit skipped:", e)

    # fp16 (GPU, no quantization) — prefer 3B to avoid OOM
    if has_gpu:
        try:
            mid = "Qwen/Qwen2.5-3B-Instruct"
            tok = AutoTokenizer.from_pretrained(mid, use_fast=True, trust_remote_code=True)
            llm = AutoModelForCausalLM.from_pretrained(
                mid, device_map="auto", torch_dtype=torch.float16,
                low_cpu_mem_usage=True, trust_remote_code=True
            )
            return tok, llm, mid, "fp16"
        except Exception as e:
            print("⚠️  fp16 skipped:", e)

    # CPU fallback
    mid = "Qwen/Qwen2.5-3B-Instruct"
    tok = AutoTokenizer.from_pretrained(mid, use_fast=True, trust_remote_code=True)
    llm = AutoModelForCausalLM.from_pretrained(
        mid, device_map="cpu", torch_dtype=torch.float32, trust_remote_code=True
    )
    return tok, llm, mid, "cpu32"

tok, llm, LLM_ID, MODE = load_qwen(prefer_7b=True)
print(f"✅ LLM: {LLM_ID} | mode={MODE}")

# --- Retrieval (BM25 + FAISS hybrid) ---
def retrieve(query: str, top_k_vec=30, top_k_bm25=10, fuse_k=30):
    q_norm = query.strip()
    # vector
    qv = embedder.encode([q_norm], normalize_embeddings=True)
    sims, idxs = index.search(qv.astype("float32"), top_k_vec)
    vec_hits = set(idxs[0].tolist())
    # bm25
    bm = bm25.get_top_n(q_norm.split(), list(range(len(texts))), n=top_k_bm25)
    bm_hits = set(bm)
    # union then cut
    cand = list(vec_hits.union(bm_hits))
    # rerank cross-encoder
    pairs = [[q_norm, texts[i]] for i in cand]
    scores = reranker.predict(pairs, batch_size=64, show_progress_bar=False)
    order = np.argsort(-scores)[:fuse_k]
    ranked = [(cand[i], float(scores[i])) for i in order]
    return ranked

# --- Answer generation (Persian) ---
def generate_answer(question: str, k=6, max_new_tokens=384, temperature=0.2, top_p=0.9):
    ranked = retrieve(question, fuse_k=max(k, 6))
    ctx_items = []
    for i, (idx, sc) in enumerate(ranked[:k], 1):
        c = chunks[idx]
        cite = f"- {c['doc']} | صفحه {c['page']}"
        ctx_items.append(cite + "\n" + c["text"])

    context = "\n\n---\n\n".join(ctx_items)
    # ChatML via tokenizer (works with Qwen)
    messages = [
        {"role": "system", "content":
         "شما یک کمک‌یار فارسی دقیق و بی‌طرف هستید. فقط بر اساس متن‌های «ارجاع‌شده» پاسخ دهید. "
         "اگر مطمئن نبودید، صادقانه بگویید که در منابع موجود پاسخ را پیدا نکرده‌اید."},
        {"role": "user", "content":
         f"پرسش:\n{question}\n\n"
         f"متن‌های مرتبط:\n{context}\n\n"
         "خروجی را به فارسی و با این ساختار بده:\n"
         "۱) خلاصه یک‌جمله‌ای\n۲) تعریف علمی کوتاه (۲–۳ جمله)\n۳) مراحل گام‌به‌گام (بولت)\n۴) نکات کلیدی (بولت)\n۵) ارجاعات (فقط نام فایل و شماره صفحه)"},
    ]
    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tok([prompt], return_tensors="pt").to(llm.device)
    with torch.no_grad():
        out = llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=temperature > 0,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.05,
            eos_token_id=tok.eos_token_id,
        )
    text = tok.decode(out[0], skip_special_tokens=True)
    # keep only the assistant part (after last user)
    return text.split(messages[-1]["content"])[-1].strip(), [chunks[i]["page"] for i,_ in ranked[:k]]

# --- Example usage ---
q = "هدف میتوز چیست و نتیجهٔ نهایی آن از نظر تعداد و شباهت یاخته‌ها چگونه است؟"
answer, pages = generate_answer(q, k=6, max_new_tokens=320)
print(answer)


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceC

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

FAISS index size: 1055
⚠️  4-bit skipped: There was a specific connection error when trying to load Qwen/Qwen2.5-7B-Instruct:
401 Client Error: Unauthorized for url: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/resolve/main/model.safetensors (Request ID: Root=1-68f35ded-35831dd666bbd1c55619a2d8;66f4103e-e212-4b4d-ab0d-7dfc230f699f)

Invalid credentials in Authorization header
⚠️  8-bit skipped: There was a specific connection error when trying to load Qwen/Qwen2.5-7B-Instruct:
401 Client Error: Unauthorized for url: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/resolve/main/model.safetensors (Request ID: Root=1-68f35dee-103331171e3c9e3b2da26b36;937260a4-edf8-4bc5-b9dd-3259822987ab)

Invalid credentials in Authorization header


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ LLM: Qwen/Qwen2.5-3B-Instruct | mode=fp16
assistant
۱) خلاصه یک‌جمله‌ای:
هدف میتوز گسترش یک یاخته به یک یکسان است و نتیجه آن تولید یک یاخته‌ی جدید با شباهت بالایی به یاخته‌های قبلی است.

۲) تعریف علمی کوتاه:
هدف میتوز گسترش یک یاخته به یک یکسان است و نتیجه آن تولید یک یاخته‌ی جدید با شباهت بالایی به یاخته‌های قبلی است. این مراحل شامل چندین مرحله‌ی مختلف مانند گسترش یک یاخته به دو یاخته، سازگاری یاخته‌ها و تولید یاخته‌ی جدید است.

۳) مراحل گام‌به‌گام:
مرحله اول: گسترش یک یاخته به دو یاخته (پروکاریوت‌ها و اکسون‌ها).
مرحله دوم: سازگاری یاخته‌ها (پروکاریوت‌ها و اینترون‌ها).
مرحله سوم: ت
