In [5]:
"""13_RAPTOR/
├─ data/
├─ outputs/              # 여기 있는 결과를 재사용
├─ notebooks/
│  ├─ 01_day1_tree_build.ipynb
│  └─ 02_day2_retrieval.ipynb   ← 여기 맨 위에 1번 셀 붙이기

"""

'13_RAPTOR/\n├─ data/\n├─ outputs/              # 여기 있는 결과를 재사용\n├─ notebooks/\n│  ├─ 01_day1_tree_build.ipynb\n│  └─ 02_day2_retrieval.ipynb   ← 여기 맨 위에 1번 셀 붙이기\n\n'

In [2]:
import os, json
from pathlib import Path
import numpy as np
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # 경고 끄기(선택)


In [6]:
#Step 2. 데이터 불러오기

In [7]:
# Resolve the project root: use the current directory when it is already
# "13_RAPTOR", otherwise look for a sibling/child named "13_RAPTOR" next to it.
BASE = Path.cwd().parents[0] / "13_RAPTOR" if (Path.cwd().name != "13_RAPTOR") else Path.cwd()
if not (BASE / "outputs").is_dir():
    # Fallback for a kernel started inside the project (e.g. 13_RAPTOR/notebooks),
    # where the expression above would point at 13_RAPTOR/13_RAPTOR.
    BASE = next((p for p in Path.cwd().parents if p.name == "13_RAPTOR"), BASE)
OUT  = BASE / "outputs"

chunks_path = OUT / "chunks.jsonl"
summ_smoke  = OUT / "chunk_summaries_smoke.jsonl"
summ_all    = OUT / "chunk_summaries.jsonl"
nodes_path  = OUT / "tree_nodes.jsonl"

# Prefer the smoke-test summaries when that file exists; otherwise use the full set.
summ_path = summ_smoke if summ_smoke.exists() else summ_all


def _read_jsonl(path):
    """Parse a JSON Lines file into a list of records, skipping blank lines.

    The file is opened in a ``with`` block so the handle is closed promptly,
    and every line is decoded exactly once.
    """
    with open(path, encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


# Raw chunk text, keyed by chunk id
chunk_text = {rec["chunk_id"]: rec["text"] for rec in _read_jsonl(chunks_path)}
# Leaf (per-chunk) summaries, keyed by chunk id
leaf_summary = {rec["chunk_id"]: rec["summary"] for rec in _read_jsonl(summ_path)}
# Tree nodes: node_id -> (level, children, summary)
nodes = _read_jsonl(nodes_path)
node_info = {nd["node_id"]: (nd["level"], nd["children"], nd["summary"]) for nd in nodes}

# Search corpus: node summaries first, then leaf summaries (ids and texts stay aligned)
corpus_ids, corpus_txt = [], []
for nid, (_,_,summ) in node_info.items():
    corpus_ids.append(nid); corpus_txt.append(summ)
for cid, summ in leaf_summary.items():
    corpus_ids.append(cid); corpus_txt.append(summ)

print("✅ 로드 완료:", len(corpus_ids), "개 요약")


✅ 로드 완료: 11 개 요약


In [8]:
#Step 3. 간단 임베딩 인덱스

In [9]:
# Build the search index over corpus_txt. Sentence-BERT is preferred; if it
# cannot be imported, downloaded or run, fall back to a TF-IDF index instead.
try:
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2")
    emb_mat = model.encode(corpus_txt, normalize_embeddings=True, show_progress_bar=True)
    backend = "sbert"
    print("✅ SBERT 사용")
except Exception as exc:
    # `Exception` (not a bare `except`) so Ctrl-C / SystemExit still propagate,
    # and the reason for the fallback is visible instead of silently swallowed.
    print(f"⚠️ SBERT unavailable ({type(exc).__name__}: {exc})")
    vect = TfidfVectorizer(ngram_range=(1,2), max_features=50000)
    emb_mat = vect.fit_transform(corpus_txt)
    backend = "tfidf"
    print("✅ TF-IDF 사용")

def topk_in_corpus(query, k=5):
    """Return the ``k`` corpus entries most similar to ``query``.

    Uses whichever index the module-level ``backend`` flag selects:
    the SBERT embedding matrix (dot product against the encoded query) or
    the TF-IDF matrix (cosine similarity against the transformed query).

    Returns a list of ``(corpus_id, score)`` tuples, highest score first.
    """
    use_sbert = backend == "sbert"
    if use_sbert:
        query_vec = model.encode([query], normalize_embeddings=True)[0]
        scores = emb_mat @ query_vec
    else:
        query_vec = vect.transform([query])
        scores = cosine_similarity(emb_mat, query_vec).ravel()

    # Negate so that argsort's ascending order yields best-first positions.
    best_positions = np.argsort(-scores)[:k]
    ranked = []
    for pos in best_positions:
        ranked.append((corpus_ids[pos], float(scores[pos])))
    return ranked


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ SBERT 사용


In [11]:
#Step 4. Retrieval & 답변

In [24]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 간단 전처리
def _clean(s: str) -> str:
    s = re.sub(r"\bM\s+r\.", "Mr.", s)      # "M r." -> "Mr."
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip()

def _split_sents(text: str):
    # 따옴표/마침표 기준 문장 분할
    sents = re.split(r'(?<=[.!?]")\s+|(?<=[.!?])\s+', text)
    return [s.strip() for s in sents if s.strip()]

def _keywords(q: str):
    stop = {"the","a","an","and","or","of","to","in","on","at","for","with","is","are","was","were","do","does","did","what","who","where","when","how","why"}
    toks = re.findall(r"[A-Za-z']+", q.lower())
    return sorted({t for t in toks if t not in stop and len(t) >= 3})

def is_chunk_id(x):
    """Return True when ``x`` is a string whose first character is "C"."""
    if not isinstance(x, str):
        return False
    return x[:1] == "C"

def descend_to_chunks(ids, max_hops=3):
    """Walk from the given ids down the summary tree and collect leaf (chunk) ids.

    Args:
        ids: iterable of starting ids (tree node ids and/or chunk ids).
        max_hops: maximum number of parent -> children expansions to follow.

    Returns:
        Leaf ids in first-seen order, without duplicates. An id counts as a
        leaf when ``is_chunk_id`` accepts it, or when it is not a tree node
        but has an entry in ``leaf_summary``. Ids that are neither a node nor
        a leaf are ignored.
    """
    out, frontier = [], list(ids)
    for _ in range(max_hops):
        nxt = []
        for _id in frontier:
            if is_chunk_id(_id):
                out.append(_id)
            elif _id in node_info:
                _, children, _ = node_info[_id]
                nxt.extend(children)
            elif _id in leaf_summary:  # a leaf that only has a summary entry
                out.append(_id)
        frontier = nxt
        if not frontier:
            break

    # Ids produced by the final expansion are still sitting in `frontier`.
    # Collect the leaves among them; without this step, chunks exactly
    # `max_hops` levels below a starting node would be silently dropped.
    for _id in frontier:
        if is_chunk_id(_id) or (_id not in node_info and _id in leaf_summary):
            out.append(_id)

    # De-duplicate while keeping first-seen order.
    return list(dict.fromkeys(out))

def raptor_retrieve(query: str, topk_nodes=6, topk_chunks=5):
    """Retrieve candidate summaries, descend to their chunks, and re-rank the chunks.

    Steps:
      1. ``topk_in_corpus`` picks the ``topk_nodes`` best-matching summaries
         (tree nodes and/or leaves).
      2. ``descend_to_chunks`` expands those hits into chunk ids.
      3. The cleaned raw text of those chunks is re-ranked against the query
         with a TF-IDF model fitted on just these candidates.

    Returns:
        ``{"nodes": [(id, score), ...], "chunks": [(chunk_id, score), ...]}``
        with at most ``topk_chunks`` chunks, best first. ``"chunks"`` is empty
        when no candidate has an entry in ``chunk_text``.
    """
    hits = topk_in_corpus(query, k=topk_nodes)              # [(id, score), ...]
    hit_ids = [hid for hid, _ in hits]

    # Chunk-id hits are already collected by descend_to_chunks on its first
    # pass, so no separate "hits that are chunks" fallback is needed.
    candidate_chunks = descend_to_chunks(hit_ids, max_hops=3)

    # Keep only candidates whose raw text is available.
    cids, ctexts = [], []
    for cid in candidate_chunks:
        if cid in chunk_text:
            cids.append(cid)
            ctexts.append(_clean(chunk_text[cid]))
    if not ctexts:
        return {"nodes": hits, "chunks": []}

    vect = TfidfVectorizer(ngram_range=(1,2), min_df=1, max_features=50000)
    try:
        M = vect.fit_transform(ctexts)
    except ValueError:
        # Empty vocabulary (e.g. every candidate text is blank): nothing to
        # rank on, so keep the retrieval order with a neutral score.
        return {"nodes": hits, "chunks": [(cid, 0.0) for cid in cids[:topk_chunks]]}

    qv   = vect.transform([query])
    sims = cosine_similarity(M, qv).ravel()
    # Stable sort: equal scores keep the order in which candidates were found.
    order = np.argsort(-sims, kind="stable")[:topk_chunks]
    top_chunks = [(cids[i], float(sims[i])) for i in order]
    return {"nodes": hits, "chunks": top_chunks}

def answer_query(query: str, topk_nodes=6, topk_chunks=5, sent_per_chunk=2, max_chars=400):
    """Build an extractive answer from the retrieved chunks.

    For each retrieved chunk, sentences containing a query keyword are
    preferred (all sentences are used when none match); those sentences are
    ranked against the query with TF-IDF and the best ``sent_per_chunk`` are
    quoted, prefixed with ``[chunk_id]``.

    Args:
        query: the question text.
        topk_nodes, topk_chunks: forwarded to ``raptor_retrieve``.
        sent_per_chunk: number of sentences quoted per chunk.
        max_chars: soft budget. Chunks stop being added once the joined
            snippets exceed it, so the answer can be longer than this value.

    Returns:
        ``{"retrieval": <raptor_retrieve result>, "answer": str}``. The answer
        is ``"(no matching evidence)"`` when nothing could be extracted.
    """
    res = raptor_retrieve(query, topk_nodes, topk_chunks)
    kws = _keywords(query)
    snippets = []

    for cid, _ in res.get("chunks", []):
        text = _clean(chunk_text.get(cid, ""))
        if not text:
            continue
        sents = _split_sents(text)
        if not sents:
            continue

        # 1) Prefer sentences that contain at least one query keyword.
        key_sents = [s for s in sents if any(k in s.lower() for k in kws)] or sents

        # 2) Re-rank only those sentences (TF-IDF) and keep the top n.
        vect = TfidfVectorizer(ngram_range=(1,2), min_df=1, max_features=20000)
        try:
            M = vect.fit_transform(key_sents)
        except ValueError:
            # Empty vocabulary (sentences with no indexable tokens):
            # fall back to the first sentences in document order.
            idxs = range(len(key_sents))[:sent_per_chunk]
        else:
            q = vect.transform([query])
            sims = cosine_similarity(M, q).ravel()
            # Stable sort: equally scored sentences keep document order.
            idxs = np.argsort(-sims, kind="stable")[:sent_per_chunk]

        picked = " ".join(key_sents[i] for i in idxs)
        snippets.append(f"[{cid}] {picked}")

        if len(" ".join(snippets)) > max_chars:
            break

    # Fallback: nothing extracted, so quote up to two leaf summaries instead.
    if not snippets:
        for hid, _ in topk_in_corpus(query, k=6):
            if is_chunk_id(hid) and hid in leaf_summary:
                snippets.append(f"[{hid}-summary] {leaf_summary[hid]}")
                if len(snippets) >= 2:
                    break

    answer = " ".join(snippets).strip()
    return {"retrieval": res, "answer": answer if answer else "(no matching evidence)"}


In [25]:
#Step 5. 테스트!

In [26]:
def pretty_answer(query, topk_nodes=6, topk_chunks=5, sent_per_chunk=2):
    """Run ``answer_query`` and print the question, the answer and the evidence ids.

    The evidence line is printed only when at least one chunk was retrieved;
    a separator rule of 80 dashes closes every report.
    """
    result = answer_query(query, topk_nodes, topk_chunks, sent_per_chunk)
    evidence_ids = [cid for cid, _score in result["retrieval"]["chunks"]]

    print("🔎 Q:", query)
    print("💬 A:", result["answer"])
    if evidence_ids:
        print("📑 Evidence:", evidence_ids)
    print("-" * 80)


In [27]:
# Run two sample questions through the full retrieve-and-answer pipeline.
for sample_query in (
    "Who is Harry Potter's best friend?",
    "What strange events happened on Privet Drive?",
):
    pretty_answer(sample_query)


🔎 Q: Who is Harry Potter's best friend?
💬 A: [C0003] He was sure there were lots of people called Potter who had a son called Harry. Come to think of it, he wasn’t even sure his nephew was called Harry. [C0001] Potter was Mrs. None of them noticed a large, tawny owl flutter past the window. [C0004] Rejoice, for You-Know-Who has gone at last! Dursley; she always got so upset at any mention of her sister. [C0002] Dursley couldn’t bear people who dressed in funny clothes — the getups you saw on young people! Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls.
📑 Evidence: ['C0003', 'C0001', 'C0004', 'C0002', 'C0005']
--------------------------------------------------------------------------------
🔎 Q: What strange events happened on Privet Drive?
💬 A: [C0002] There was a tabby cat standing on the corner of Privet Drive, but there wasn’t a map in sight. It was now reading the sign that said Privet 