# Agent #2: The Librarian

In [None]:
# ============================================================
# ADVANCED MULTI-HOP AGENTIC PDF QA (DSPy SAFE)
# + Markdown Export
# + Character Count Report (COUNT ONLY)
# + Runtime Measurement
# ============================================================

import fitz
import re
import json
import hashlib
import requests
import numpy as np
import faiss
import dspy
import time

from typing import List, Dict
from pathlib import Path
from tqdm import tqdm

# ============================================================
# CONFIG
# ============================================================

# Source PDF that gets chunked, embedded and indexed.
PDF_PATH = "doc/dsi2566.pdf"

# Directory that holds every generated artifact; created on import if missing.
WORKDIR = Path("./rag_store")
WORKDIR.mkdir(exist_ok=True)

# File-name stem of the PDF, used as the prefix of the artifact files below.
PDF_NAME = Path(PDF_PATH).stem

MD_PATH    = WORKDIR / f"{PDF_NAME}_document.md"    # Markdown export of the chunks
META_PATH  = WORKDIR / f"{PDF_NAME}_chunks.jsonl"   # one JSON chunk record per line
EMB_PATH   = WORKDIR / f"{PDF_NAME}_embeddings.npy" # cached chunk embedding matrix
INDEX_PATH = WORKDIR / f"{PDF_NAME}_faiss.index"    # cached FAISS index

# HTTP endpoint that embed_texts()/embed_query() POST to, and the model they request.
# NOTE(review): port 11434 looks like a local Ollama server — confirm.
EMBEDDING_API = "http://localhost:11434/api/embeddings"
EMBED_MODEL   = "bge-m3:567m"

# Chat model name handed to OllamaDSPyLM.
LLM_MODEL = "gpt-oss:120b-cloud"

BASE_TOP_K   = 3     # hits requested for the user's original question
EXPAND_TOP_K = 6     # hits requested for each generated sub-question
SCORE_TH     = 0.15  # minimum FAISS score a hit needs to be kept by retrieve()

# ============================================================
# DSPy LLM (RAW TEXT ONLY)
# ============================================================

class OllamaDSPyLM(dspy.LM):
    """DSPy language-model adapter that sends chat requests through ``ollama.chat``.

    Calling an instance returns the assistant reply as a plain string
    (``r.message.content``), which is what the agents in this file consume.
    """

    def __init__(self, model: str):
        """Remember the Ollama model name and initialise the DSPy base class."""
        super().__init__(model=model)
        self.model = model

    def __call__(self, prompt=None, messages=None, **kwargs):
        """Send a chat request and return the reply text.

        Args:
            prompt: Text sent as a single user message when ``messages`` is None.
            messages: Ready-made chat message list; takes precedence over ``prompt``.
            **kwargs: Accepted for call compatibility; not forwarded to Ollama.

        Returns:
            The content of the reply message.

        Raises:
            ValueError: If neither ``prompt`` nor ``messages`` is supplied.
        """
        # Imported at call time rather than at module import.
        from ollama import chat

        if messages is None:
            if prompt is None:
                # Fail early with a clear message instead of sending a
                # message whose content is None.
                raise ValueError("either 'prompt' or 'messages' must be provided")
            messages = [{"role": "user", "content": prompt}]

        r = chat(model=self.model, messages=messages)
        return r.message.content

# Install the Ollama-backed LM as DSPy's default; the agents below reach it via dspy.settings.lm.
dspy.configure(lm=OllamaDSPyLM(LLM_MODEL), track_usage=True)

# ============================================================
# UTILS
# ============================================================

def sha1(text: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.sha1()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()

def normalize_query(q: str) -> str:
    """Strip surrounding whitespace and leading list markers from a query line.

    Leading bullets (``-``, ``*``, ``•``), dots, whitespace and numbered list
    markers such as ``1.`` or ``2)`` are removed. A leading number that is NOT
    a list marker (for example a year like ``2566`` or a decimal like ``3.5``)
    is kept, because it is part of the question itself.

    Args:
        q: Raw query text; may be empty or None.

    Returns:
        The cleaned query, or ``""`` when ``q`` is empty/None.
    """
    if not q:
        return ""
    # A digit run only counts as a list marker when it is directly followed by
    # "." or ")" and that punctuation is not followed by another digit
    # (so "3.5" and "1.2" are left alone).
    cleaned = re.sub(r"^(?:[-*•.\s]|\d+[.)](?!\d))+", "", q.strip())
    return cleaned.strip()

# ---------- character counting ----------

def count_pdf_chars(path: str) -> int:
    """Return the total number of characters of plain text extracted from a PDF.

    Args:
        path: Location of the PDF file to open with PyMuPDF.

    Returns:
        Sum of ``len(page.get_text("text"))`` over all pages.
    """
    # The context manager closes the document even if text extraction raises.
    with fitz.open(path) as doc:
        return sum(len(page.get_text("text") or "") for page in doc)

def count_md_chars(path: Path) -> int:
    """Return the character count of a UTF-8 text file, or 0 if it does not exist."""
    if path.exists():
        content = path.read_text(encoding="utf-8")
        return len(content)
    return 0

def count_jsonl_chars(path: Path) -> int:
    """Return the summed length of the ``"text"`` fields in a JSON Lines file.

    Args:
        path: JSONL file with one JSON object per line.

    Returns:
        Total characters across all ``"text"`` values; 0 if the file is
        missing. Records without a ``"text"`` key (or with a null one)
        contribute 0.
    """
    if not path.exists():
        return 0
    total = 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. after hand-editing) are not records; skipping
            # them avoids a JSONDecodeError.
            if not line.strip():
                continue
            total += len(json.loads(line).get("text") or "")
    return total

# ============================================================
# 1) LOAD PDF
# ============================================================

def load_pdf_pages(path: str):
    """Extract the plain text of every non-empty page of a PDF.

    Args:
        path: Location of the PDF file to open with PyMuPDF.

    Returns:
        A list of ``{"page": <1-based page number>, "text": <page text>}``
        dicts, in document order. Pages whose text is empty or whitespace-only
        are omitted (their page numbers are still consumed).
    """
    pages = []
    # The context manager closes the document even if text extraction raises.
    with fitz.open(path) as doc:
        for i, page in enumerate(doc):
            text = page.get_text("text")
            if text and text.strip():
                pages.append({"page": i + 1, "text": text})
    return pages

# ============================================================
# 2) PAGE → CHUNKS
# ============================================================

def page_to_chunks(page: Dict) -> List[Dict]:
    """Split one page record into chunks, starting a new chunk at each heading.

    A line is treated as a heading when it starts with "บทที่"/"Chapter"
    followed by a number, or when it is shorter than 60 characters and
    ``str.isupper()`` is true for it. Blank lines are dropped. Text before the
    first heading is filed under the section "(no section)".

    Args:
        page: Dict with the keys ``"page"`` (page number) and ``"text"``.

    Returns:
        A list of chunk dicts with the keys ``id`` (SHA-1 of the chunk text),
        ``page``, ``section`` and ``text``. Heading lines are kept inside the
        chunk text with a ``"## "`` prefix.
    """
    page_num = page["page"]
    heading_re = re.compile(r"^(บทที่|Chapter)\s+\d+")

    # Each group is [section title, collected lines]; the first one gathers
    # whatever precedes the first heading.
    groups = [["(no section)", []]]
    for raw_line in page["text"].splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        looks_like_heading = bool(heading_re.match(stripped)) or (
            len(stripped) < 60 and stripped.isupper()
        )
        if looks_like_heading:
            groups.append([stripped, [f"## {stripped}"]])
        else:
            groups[-1][1].append(stripped)

    chunks = []
    for section, body_lines in groups:
        if not body_lines:
            continue
        text = "\n".join(body_lines).strip()
        chunks.append({
            "id": sha1(text),
            "page": page_num,
            "section": section,
            "text": text
        })
    return chunks

# ============================================================
# 3) BUILD CHUNKS
# ============================================================

def build_chunks() -> List[Dict]:
    """Load the configured PDF and return the chunks of all its pages, in page order."""
    return [
        chunk
        for page in load_pdf_pages(PDF_PATH)
        for chunk in page_to_chunks(page)
    ]

# ============================================================
# 4) SAVE / LOAD FILES
# ============================================================

def save_chunks_jsonl(chunks: List[Dict]):
    """Write the chunks to ``META_PATH`` as JSON Lines (one record per line, non-ASCII kept as-is)."""
    with open(META_PATH, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(chunk, ensure_ascii=False) + "\n" for chunk in chunks
        )

def save_markdown(chunks: List[Dict]):
    """Write the chunks to ``MD_PATH`` as Markdown.

    Each chunk is preceded by an HTML comment carrying its page and section
    and followed by a blank line.
    """
    with open(MD_PATH, "w", encoding="utf-8") as out:
        for chunk in chunks:
            marker = f"<!-- page:{chunk['page']} section:{chunk['section']} -->\n"
            body = f"{chunk['text']}\n\n"
            out.write(marker + body)

def load_chunks() -> List[Dict]:
    """Read the chunk records back from ``META_PATH``.

    Returns:
        One dict per non-blank line of the JSON Lines file, in file order.
    """
    with open(META_PATH, encoding="utf-8") as f:
        # Blank lines are not records; skipping them avoids a JSONDecodeError.
        return [json.loads(line) for line in f if line.strip()]

# ============================================================
# 5) CHARACTER COUNT REPORT
# ============================================================

def report_char_count(pdf_chars: int, md_chars: int, json_chars: int):
    """Print a small table with the three character counts, using thousands separators."""
    rule = "-" * 50
    rows = [
        "\nCHARACTER COUNT REPORT",
        rule,
        f"PDF (raw text)      : {pdf_chars:,} chars",
        f"Markdown (.md)      : {md_chars:,} chars",
        f"Metadata (.jsonl)   : {json_chars:,} chars",
        rule,
    ]
    print("\n".join(rows))

# ============================================================
# 6) EMBEDDING + FAISS
# ============================================================

def embed_texts(texts: List[str]) -> np.ndarray:
    """Embed each text with one POST to ``EMBEDDING_API`` and stack the results.

    A tqdm progress bar tracks the requests. Any non-2xx response raises via
    ``raise_for_status``.

    Args:
        texts: Strings to embed, one request each.

    Returns:
        A float32 array built from the returned ``"embedding"`` lists, in
        input order. The vectors are not normalised here.
    """
    def _embed_one(text):
        response = requests.post(
            EMBEDDING_API,
            json={"model": EMBED_MODEL, "prompt": text},
            timeout=120
        )
        response.raise_for_status()
        return response.json()["embedding"]

    progress = tqdm(texts, desc="Embedding", unit="chunk")
    return np.array([_embed_one(text) for text in progress], dtype="float32")

def embed_query(query: str, dim: int):
    """Embed a single query for searching the FAISS index.

    Args:
        query: Raw query text; it is passed through ``normalize_query`` first.
        dim: Vector dimension the index expects.

    Returns:
        A ``(1, dim)`` float32 array, L2-normalised in place, or ``None`` when
        the normalised query is empty or the returned embedding has a
        different dimension.
    """
    cleaned = normalize_query(query)
    if not cleaned:
        return None

    payload = {"model": EMBED_MODEL, "prompt": cleaned}
    response = requests.post(EMBEDDING_API, json=payload, timeout=60)
    response.raise_for_status()

    vector = np.array([response.json()["embedding"]], dtype="float32")
    if vector.shape[1] == dim:
        faiss.normalize_L2(vector)
        return vector
    return None

def build_or_load_embeddings(chunks):
    """Return the L2-normalised embedding matrix for ``chunks``, using the on-disk cache when valid.

    The cache at ``EMB_PATH`` is reused only if it is a 2-D array with exactly
    one row per chunk. Otherwise the embeddings are recomputed and saved, and
    any FAISS index at ``INDEX_PATH`` is deleted because it was built from
    different vectors.

    Args:
        chunks: Chunk dicts; the ``"text"`` of each one is embedded.

    Returns:
        A float32 array with one normalised row per chunk.
    """
    if EMB_PATH.exists():
        cached = np.load(EMB_PATH)
        # Row i must describe chunks[i]; a cache left over from a different
        # chunking would silently return the wrong passages.
        if cached.ndim == 2 and cached.shape[0] == len(chunks):
            return cached
        print("Cached embeddings do not match the current chunks; rebuilding.")

    emb = embed_texts([c["text"] for c in chunks])
    faiss.normalize_L2(emb)
    np.save(EMB_PATH, emb)

    # An index built from earlier embeddings is stale now.
    if INDEX_PATH.exists():
        INDEX_PATH.unlink()
    return emb

def build_or_load_index(embeddings):
    """Return an inner-product FAISS index over ``embeddings``, using the on-disk cache when valid.

    The index at ``INDEX_PATH`` is reused only if its dimension and vector
    count match ``embeddings``; otherwise a new ``IndexFlatIP`` is built and
    written to disk.

    Args:
        embeddings: 2-D float32 array, one row per chunk.

    Returns:
        A FAISS index whose row ``i`` corresponds to ``embeddings[i]``.
    """
    if INDEX_PATH.exists():
        cached = faiss.read_index(str(INDEX_PATH))
        # Guard against an index left over from a different document/chunking.
        if cached.d == embeddings.shape[1] and cached.ntotal == embeddings.shape[0]:
            return cached
        print("Cached FAISS index does not match the embeddings; rebuilding.")

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, str(INDEX_PATH))
    return index

# ============================================================
# 7) RETRIEVAL
# ============================================================

def retrieve(query: str, chunks, index, top_k):
    """Return the chunks whose FAISS score for ``query`` is at least ``SCORE_TH``.

    Args:
        query: Search text.
        chunks: Chunk list, positionally aligned with the rows of ``index``.
        index: FAISS index to search.
        top_k: Number of nearest neighbours to request.

    Returns:
        Matching chunk dicts in the order FAISS returned them; an empty list
        when the query could not be embedded.
    """
    q_emb = embed_query(query, index.d)
    if q_emb is None:
        return []

    scores, idxs = index.search(q_emb, top_k)
    hits = []
    for score, position in zip(scores[0], idxs[0]):
        if score >= SCORE_TH:
            hits.append(chunks[position])
    return hits

# ============================================================
# 8) MULTI-HOP AGENTS
# ============================================================

class QueryExpansionAgent(dspy.Module):
    """Asks the configured LM to break a question into short sub-questions."""

    def forward(self, question: str):
        """Return the normalised, non-trivial lines of the LM's reply.

        The (Thai) prompt asks for 2-3 sub-questions, one short line each.
        Every reply line is passed through ``normalize_query``; lines of three
        characters or fewer after normalisation are discarded.
        """
        prompt = f"""
แตกคำถามต่อไปนี้เป็นคำถามย่อย 2-3 ข้อ
ตอบเป็นบรรทัดสั้น ๆ เท่านั้น

[Question]
{question}
"""
        reply = dspy.settings.lm(prompt)

        sub_questions = []
        for line in reply.splitlines():
            candidate = normalize_query(line)
            if len(candidate) > 3:
                sub_questions.append(candidate)
        return sub_questions

class DocumentQAAgent(dspy.Module):
    """Answers a question from supplied context by prompting the configured LM."""

    def forward(self, context: str, question: str):
        """Build the (Thai) instruction prompt and return the LM reply.

        The prompt tells the model to use only the given context, cite page
        numbers, reply "ไม่พบข้อมูลในเอกสาร" when the context has no answer,
        and answer in the language of the question.

        Returns:
            ``dspy.Prediction`` whose ``answer`` field holds the raw LM output.
        """
        prompt = f"""
คุณคือ AI สำหรับตอบคำถามจากเอกสาร

กติกา:
- ใช้เฉพาะข้อมูลใน Context เท่านั้น
- ต้องอ้างเลขหน้า
- ถ้าไม่มีข้อมูล ให้ตอบว่า "ไม่พบข้อมูลในเอกสาร"
- **ตอบด้วยภาษาเดียวกับภาษาของคำถามผู้ใช้**
  - ถ้าคำถามเป็นภาษาไทย ให้ตอบภาษาไทย
  - ถ้าคำถามเป็นภาษาอังกฤษ ให้ตอบภาษาอังกฤษ

[Context]
{context}

[Question]
{question}

[Answer]
"""
        language_model = dspy.settings.lm
        reply = language_model(prompt)
        return dspy.Prediction(answer=reply)

# ============================================================
# 9) ORCHESTRATOR
# ============================================================

def ask_agent(question: str, chunks, index, max_context_chunks: int = 6):
    """Answer ``question`` from the indexed document using multi-hop retrieval.

    Steps: retrieve for the original question, let ``QueryExpansionAgent``
    produce sub-questions, retrieve for each of them, de-duplicate by chunk
    id, and pass the first ``max_context_chunks`` chunks to
    ``DocumentQAAgent``.

    Args:
        question: The user's question.
        chunks: Chunk list aligned with ``index``.
        index: FAISS index over the chunk embeddings.
        max_context_chunks: Upper bound on chunks placed in the LLM context.

    Returns:
        The answer text, or "ไม่พบข้อมูลในเอกสาร" when the question is blank,
        nothing relevant is retrieved, or the model's answer is 10 characters
        or shorter.
    """
    not_found = "ไม่พบข้อมูลในเอกสาร"

    # A blank question cannot be embedded; skip the LLM expansion call too.
    if not question or not question.strip():
        return not_found

    base = retrieve(question, chunks, index, BASE_TOP_K)

    subs = QueryExpansionAgent()(question)
    per_sub_hits = [retrieve(q, chunks, index, EXPAND_TOP_K) for q in subs]

    # Interleave the sub-question results rank by rank. Appending them one
    # sub-question after another would let the first sub-question fill the
    # context budget and crowd out all later ones.
    depth = max((len(hits) for hits in per_sub_hits), default=0)
    expanded = [
        hits[rank]
        for rank in range(depth)
        for hits in per_sub_hits
        if rank < len(hits)
    ]

    # Dict insertion order keeps the first occurrence's position per chunk id.
    final = list({c["id"]: c for c in base + expanded}.values())
    if not final:
        return not_found

    context = "\n\n---\n\n".join(
        f"[หน้า {c['page']} | {c['section']}]\n{c['text']}"
        for c in final[:max_context_chunks]
    )

    out = DocumentQAAgent()(context=context, question=question)
    # Tolerate an LM that returns no content at all.
    ans = (out.answer or "").strip()
    return ans if len(ans) > 10 else not_found

# ============================================================
# MAIN
# ============================================================

if __name__ == "__main__":
    # Script entry point: prepare chunks, report sizes, build/load the vector
    # index, then answer questions interactively until the user types "exit".
    t_start = time.perf_counter()
    print("Agent #2: The Librarian")

    # Chunk the PDF only when no chunk file exists yet; otherwise reuse it.
    if not META_PATH.exists():
        chunks = build_chunks()
        save_chunks_jsonl(chunks)
        save_markdown(chunks)
    else:
        chunks = load_chunks()

    pdf_chars  = count_pdf_chars(PDF_PATH)
    md_chars   = count_md_chars(MD_PATH)
    json_chars = count_jsonl_chars(META_PATH)

    report_char_count(pdf_chars, md_chars, json_chars)

    t_embed_start = time.perf_counter()
    embeddings = build_or_load_embeddings(chunks)
    index = build_or_load_index(embeddings)
    t_embed_end = time.perf_counter()

    print(f"⏱ Data preparation time : {t_embed_start - t_start:.2f} sec")
    print(f"⏱ Embedding + Index time: {t_embed_end - t_embed_start:.2f} sec")

    print("\nAgent Ready (type 'exit')")

    while True:
        try:
            q = input("\nQuestion: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C ends the session without a traceback.
            print()
            break

        # Ignore empty input instead of spending an LLM round-trip on it.
        if not q:
            continue
        if q.lower() == "exit":
            break

        t_q = time.perf_counter()
        ans = ask_agent(q, chunks, index)
        t_q2 = time.perf_counter()

        print("\nAnswer:\n", ans)
        print(f"⏱ Query processing time : {t_q2 - t_q:.2f} sec")


Agent #2: The Librarian

CHARACTER COUNT REPORT
--------------------------------------------------
PDF (raw text)      : 138,990 chars
Markdown (.md)      : 141,236 chars
Metadata (.jsonl)   : 132,285 chars
--------------------------------------------------


Embedding: 100%|██████████████████████████████████████████████████████████████████| 220/220 [05:19<00:00,  1.45s/chunk]

⏱ Data preparation time : 0.91 sec
⏱ Embedding + Index time: 319.66 sec

Agent Ready (type 'exit')






Question:  นอกเหนือจากหน่วยงานภายใน มธ. แล้ว หลักสูตร DSI (พ.ศ. 2566) ยังร่วมมือกับบริษัทเอกชนใดบ้าง?





Answer:
 นอกจากหน่วยงานภายในมหาวิทยาลัยธรรมศาสตร์แล้ว หลักสูตร DSI (พ.ศ. 2566) ยังได้รับความร่วมมือจากบริษัทเอกชนสองแห่งคือ  

* **บริษัท อี‑ซี‑โอ‑พี (ประเทศไทย) จำกัด**  
* **บริษัท กสิกร แล็ป จำกัด**  

(ข้อมูลอ้างอิงจาก [หน้า 3]​)
⏱ Query processing time : 10.22 sec



Question:  หลักสูตรวิทยาศาสตร์และนวัตกรรมข้อมูล พ.ศ. 2566 เป็นการปรับปรุงจากหลักสูตรเดิมในปี พ.ศ. ใด?





Answer:
 หลักสูตรวิทยาศาสตร์และนวัตกรรมข้อมูล พ.ศ. 2566 ได้รับการปรับปรุงจากหลักสูตรเดิมในปี พ.ศ. 2561 (ระบุในหน้า 3)
⏱ Query processing time : 7.56 sec



Question:  นักศึกษาหลักสูตร DSI (พ.ศ. 2566) ต้องศึกษาทั้งหมดกี่หน่วยกิตจึงจะสำเร็จการศึกษา?





Answer:
 นักศึกษาหลักสูตร DSI (พ.ศ. 2566) ต้องสะสมหน่วยกิตอย่างน้อย **100 หน่วยกิต** จึงจะสามารถสำเร็จการศึกษาได้  
(รวม 12 หน่วยกิต วิชาแกนร่วมคณะ + 21 หน่วยกิต วิชาพื้นฐาน + 36 หน่วยกิต วิชาบังคับในสาขา + 12 หน่วยกิต วิชาเลือกในสาขา + 1 หน่วยกิต ฝึกปฏิบัติงาน + 12 หน่วยกิต โครงงาน‑สหกิจศึกษา + 6 หน่วยกิต วิชาเลือกเสรี) [หน้า 11]​, [หน้า 14]​.
⏱ Query processing time : 11.10 sec
