# 🌱 RAG Chatbot Demo Notebook
本 Notebook 展示如何：
- 上傳與解析 PDF / PPT / 圖片文件
- 建立向量庫並查詢語意段落
- 呼叫 OpenAI GPT 回答問題
- 顯示引用來源與 token 使用統計


In [None]:
# 初始化：載入模組與模型
from sentence_transformers import SentenceTransformer
import faiss, numpy as np, openai
from app.config import *
from app.loader import load_document_text
from app.sensitive import contains_sensitive
from pprint import pprint

# Configure the OpenAI client (key presumably comes from the app.config star-import).
openai.api_key = OPENAI_API_KEY

# Sentence-embedding model plus a flat L2 faiss index sized to its embedding dimension.
model = SentenceTransformer(EMBEDDING_MODEL)
index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())

# Chunk texts and their per-chunk metadata; both start empty and are filled by later cells.
chunks = []
meta = []

In [None]:
# Load and parse a document (PDF / PPT / image)
file_path = "../data/documents/demo.pdf"  # replace with the path to your own file
text = load_document_text(file_path)

def split_and_embed(text, filename):
    """Split *text* into chunks, record their metadata, and index their embeddings.

    Lines are stripped and packed greedily into the current chunk while the
    running length stays below ``CHUNK_SIZE``; otherwise the current chunk is
    flushed and a new one is started with that line.  Each new chunk is
    appended to the module-level ``chunks`` list, a matching entry is appended
    to ``meta``, and one embedding per new chunk is added to ``index``.

    Only the chunks produced by *this* call are encoded, so calling the
    function for several documents keeps faiss ids aligned with the positions
    in ``chunks`` / ``meta``.  Empty chunks are never stored.

    Args:
        text: Full document text; split on newlines. Empty/None adds nothing.
        filename: Label stored as the ``"source"`` of every chunk.
    """
    if not text:
        return

    # Remember where this document starts so only its chunks get embedded.
    first_new = len(chunks)

    current = ""
    for i, line in enumerate(text.split("\n")):
        if len(current) + len(line) < CHUNK_SIZE:
            current += line.strip() + " "
            continue
        # Flush the accumulated chunk; skip it when it is blank (e.g. the very
        # first line is already longer than CHUNK_SIZE).
        flushed = current.strip()
        if flushed:
            chunks.append(flushed)
            meta.append({"source": filename, "position": f"第{i+1}行附近"})
        current = line.strip() + " "

    tail = current.strip()
    if tail:
        chunks.append(tail)
        meta.append({"source": filename, "position": "結尾"})

    new_chunks = chunks[first_new:]
    if not new_chunks:
        # Nothing to embed; avoid handing faiss an empty/ill-shaped array.
        return

    vecs = model.encode(new_chunks)
    # faiss expects a 2-D float32 matrix.
    index.add(np.asarray(vecs, dtype="float32"))

import os

# Label the chunks with the actual file name so citations stay correct when
# file_path is pointed at a different document.
split_and_embed(text, os.path.basename(file_path))
print(f"向量數量：{index.ntotal}")

In [None]:
# Ask a question and retrieve the most similar chunks
query = "SBTi 目標為何選擇 near-term 而不是 net-zero？"
qvec = model.encode([query])

# Never ask for more neighbours than the index holds: faiss pads missing
# results with -1, and chunks[-1] would silently return the last chunk.
top_k = min(3, index.ntotal)
if top_k > 0:
    D, I = index.search(np.asarray(qvec, dtype="float32"), top_k)
else:
    # Empty index: keep D / I defined with zero hits.
    D = np.empty((1, 0), dtype="float32")
    I = np.empty((1, 0), dtype="int64")

# Keep only valid ids so chunk text and metadata stay paired.
hits = [int(i) for i in I[0] if i >= 0]
top_chunks = [chunks[i] for i in hits]
references = [meta[i] for i in hits]

print("\n\n【相關段落】")
for i, (chunk, ref) in enumerate(zip(top_chunks, references)):
    print(f"\n--- Top {i+1} ---\n{chunk[:300]}...\n來源：{ref['source']}（{ref['position']}）")

In [None]:
# Ask OpenAI GPT to answer from the retrieved chunks, then show sources and token usage
if USE_OPENAI:
    # Build the user prompt: instruction, retrieved chunks separated by rules, then the question.
    context = "\n---\n".join(top_chunks)
    prompt = "根據以下內容回答問題：\n" + context + f"\n\n問題：{query}"

    messages = [
        {"role": "system", "content": "你是專業永續顧問，請根據資料回答問題。"},
        {"role": "user", "content": prompt},
    ]
    response = openai.ChatCompletion.create(model=OPENAI_MODEL, messages=messages)

    answer = response.choices[0].message.content.strip()
    usage = response.usage

    print("\n【GPT 回答】\n", answer)

    print("\n📎 引用來源：")
    for r in references:
        print(f"- {r['source']}（{r['position']}）")

    print("\n📊 Token 使用：")
    pprint(dict(usage))