In [8]:
"""node_info, chunk_text 로드

Cell 1~5 실행 (레벨 인덱스 → 임베딩 → 노드 랭킹 → 리프 확장 → 검색/답변)

pretty_answer("질문")으로 바로 테스트
"""

'node_info, chunk_text 로드\n\nCell 1~5 실행 (레벨 인덱스 → 임베딩 → 노드 랭킹 → 리프 확장 → 검색/답변)\n\npretty_answer("질문")으로 바로 테스트\n'

In [3]:
import json
from pathlib import Path

# Use the current directory when it is already named 13_RAPTOR; otherwise
# point at a 13_RAPTOR folder that sits next to the current directory.
BASE = Path.cwd().parents[0] / "13_RAPTOR" if (Path.cwd().name != "13_RAPTOR") else Path.cwd()
OUT  = BASE / "outputs"

nodes_path = OUT / "tree_nodes.jsonl"
chunks_path = OUT / "chunks.jsonl"


def _read_jsonl(path):
    """Parse a JSON Lines file into a list of objects.

    The file is opened in a ``with`` block so the handle is closed
    deterministically, and blank lines (e.g. a trailing newline at the end
    of the file) are skipped instead of raising ``JSONDecodeError``.
    """
    with open(path, encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


# 1) Load node records: node_id -> (level, children, summary)
nodes = _read_jsonl(nodes_path)
node_info = {nd["node_id"]: (nd["level"], nd["children"], nd["summary"]) for nd in nodes}

# 2) Load chunk texts: chunk_id -> text (each line is parsed only once)
chunk_text = {rec["chunk_id"]: rec["text"] for rec in _read_jsonl(chunks_path)}

print("✅ node_info:", len(node_info), "nodes")
print("✅ chunk_text:", len(chunk_text), "chunks")

✅ node_info: 6 nodes
✅ chunk_text: 227 chunks


In [4]:
# 1 — Level index

In [5]:
from collections import defaultdict
# Group the tree nodes by level so each level can be ranked on its own.
levels = defaultdict(list)  # level -> [(node_id, summary)]
# node_info maps node_id -> (level, children, summary); children are not needed here.
for nid, (lvl, children, summ) in node_info.items():
    levels[lvl].append((nid, summ))

In [9]:
# 2 — Embedding (simple TF-IDF)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def _fit(texts):
    """Fit a TF-IDF vectorizer on ``texts``.

    Uses unigrams and bigrams with at most 60000 features.

    Returns:
        A ``(vectorizer, matrix)`` pair: the fitted vectorizer and the
        TF-IDF matrix produced by ``fit_transform(texts)``.
    """
    vectorizer = TfidfVectorizer(max_features=60000, ngram_range=(1, 2))
    matrix = vectorizer.fit_transform(texts)
    return vectorizer, matrix
def _score(M, v, q):
    """Return the cosine similarity of every row of ``M`` to the query ``q``.

    Args:
        M: Matrix produced by the vectorizer ``v`` (one row per text).
        v: Fitted vectorizer used to transform the query.
        q: Query string.

    Returns:
        A flattened 1-D array with one similarity score per row of ``M``.
    """
    query_vec = v.transform([q])
    similarities = cosine_similarity(M, query_vec)
    return similarities.ravel()

In [11]:
# 3 — Level selection + node ranking

In [12]:
import re, numpy as np
def select_start_level(q):
    """Pick the tree level at which retrieval should start for query ``q``.

    The choice is driven by the number of ``\\w+`` tokens in the query:
      * 5 or fewer tokens  -> the largest level key,
      * 12 or fewer tokens -> the second-largest level key (when more than
        one level exists),
      * otherwise          -> the smallest level key.

    Returns:
        The chosen level key, or ``None`` when ``levels`` is empty.
    """
    ordered = sorted(levels.keys())
    if not ordered:
        return None

    n_tokens = len(re.findall(r"\w+", q))
    if n_tokens <= 5:
        return ordered[-1]
    if len(ordered) > 1 and n_tokens <= 12:
        return ordered[-2]
    return ordered[0]

def rank_nodes_at_level(q, L, topk=6):
    """Rank the nodes of level ``L`` by TF-IDF cosine similarity to ``q``.

    Args:
        q: Query string.
        L: Level key to look up in ``levels``.
        topk: Maximum number of node IDs to return.

    Returns:
        Up to ``topk`` node IDs ordered from most to least similar. An empty
        list is returned when the level has no nodes.
    """
    items = levels.get(L, [])
    if not items:
        # Fitting TF-IDF on an empty corpus raises ValueError, so bail out
        # early for unknown or empty levels.
        return []
    ids = [node_id for node_id, _ in items]
    summaries = [summary for _, summary in items]
    v, M = _fit(summaries)
    sims = _score(M, v, q)
    # Negate so argsort yields descending similarity.
    idx = np.argsort(-sims)[:topk]
    return [ids[i] for i in idx]

In [13]:
# 4 — Expand → leaves → rerank

In [14]:
def _is_chunk(x): return isinstance(x,str) and x.startswith("C")

def expand_to_leaves(seed_ids, hops=2):
    out, fr = [], list(seed_ids)
    for _ in range(hops):
        nxt=[]
        for nid in fr:
            if _is_chunk(nid): out.append(nid)
            elif nid in node_info: nxt += node_info[nid][1]
        fr = nxt
        if not fr: break
    # 중복 제거
    seen, uniq = set(), []
    for cid in out:
        if cid not in seen: uniq.append(cid); seen.add(cid)
    return uniq

def rerank_chunks(q, cids, topk=5):
    """Rerank candidate chunk IDs by TF-IDF cosine similarity to ``q``.

    IDs that are not keys of ``chunk_text`` are ignored.

    Args:
        q: Query string.
        cids: Iterable of candidate chunk IDs.
        topk: Maximum number of results to return.

    Returns:
        A list of ``(chunk_id, score)`` tuples ordered from most to least
        similar, or an empty list when no candidate has text.
    """
    known = [cid for cid in cids if cid in chunk_text]
    if not known:
        return []

    texts = [chunk_text[cid] for cid in known]
    vectorizer, matrix = _fit(texts)
    sims = _score(matrix, vectorizer, q)
    # Negate so argsort yields descending similarity.
    order = np.argsort(-sims)[:topk]
    return [(known[i], float(sims[i])) for i in order]

In [15]:
# 5 — Main search + simple answer

In [16]:
def raptor_search(q, topk_nodes=6, topk_chunks=5, hops=2):
    """Run the full retrieval pipeline for query ``q``.

    Steps: choose a start level, rank that level's nodes, expand the best
    nodes down to chunk IDs, then rerank those chunks against the query.
    When the expansion yields nothing, the seed IDs themselves are handed
    to the reranker.

    Returns:
        A dict with keys ``"level"`` (start level or ``None``), ``"nodes"``
        (seed node IDs) and ``"chunks"`` (list of ``(chunk_id, score)``).
    """
    start_level = select_start_level(q)
    if start_level is None:
        return {"level": None, "nodes": [], "chunks": []}

    seeds = rank_nodes_at_level(q, start_level, topk=topk_nodes)
    leaves = expand_to_leaves(seeds, hops=hops)
    candidates = leaves if leaves else seeds
    ranked = rerank_chunks(q, candidates, topk=topk_chunks)
    return {"level": start_level, "nodes": seeds, "chunks": ranked}

def pretty_answer(q, sent_per_chunk=2):
    """Search for ``q`` and print a short extractive answer.

    For every retrieved chunk, the text is split into sentences after
    ``.``, ``!`` or ``?``, the sentences are scored against the query with
    TF-IDF cosine similarity, and the ``sent_per_chunk`` best ones are
    quoted (prefixed with the chunk ID). Prints the query, the start level,
    the seed nodes and the assembled answer; returns nothing.
    """
    import re, numpy as np

    res = raptor_search(q)
    picks = []
    for cid, _ in res["chunks"]:
        pieces = re.split(r'(?<=[.!?])\s+', chunk_text[cid])
        sentences = [piece.strip() for piece in pieces if piece.strip()]
        if not sentences:
            continue
        vectorizer, matrix = _fit(sentences)
        sims = _score(matrix, vectorizer, q)
        # Highest-scoring sentences first.
        best = np.argsort(-sims)[:sent_per_chunk]
        quoted = " ".join(sentences[i] for i in best)
        picks.append(f"[{cid}] " + quoted)
    print(f"🔎 Q: {q}\n📍 start level: L{res['level']}\n📑 nodes: {res['nodes']}\n💬 A: {' '.join(picks) if picks else '(no matching evidence)'}")

# Usage example
pretty_answer("What strange events happened on Privet Drive?")

🔎 Q: What strange events happened on Privet Drive?
📍 start level: L2
📑 nodes: ['L2_N0002', 'L2_N0001']
💬 A: (no matching evidence)


In [17]:
# Call the search directly to inspect the raw result: which seed nodes were
# selected and which chunk IDs (first 10 at most) came back from reranking.
res = raptor_search("What strange events happened on Privet Drive?")
print("seeds:", res["nodes"])
print("cand chunks (topN):", [c for c,_ in res["chunks"]][:10])

seeds: ['L2_N0002', 'L2_N0001']
cand chunks (topN): []


In [18]:
def debug_children(nids, depth=2):
    """Print the children reachable from each seed, one hop at a time.

    Args:
        nids: Iterable of seed IDs to inspect.
        depth: Maximum number of hops to descend from each seed.

    For every visited ID a line is printed with the hop index, the number of
    children and the first six child IDs. IDs missing from ``node_info`` are
    reported with zero children.
    """
    for nid in nids:
        print(f"\n[seed] {nid}")
        frontier = [nid]
        for d in range(depth):
            upcoming = []
            for x in frontier:
                entry = node_info.get(x)
                kids = entry[1] if entry is not None else []
                print(f"  hop{d}: {x} -> children({len(kids)}): {kids[:6]}")
                upcoming.extend(kids)
            frontier = upcoming
            if not frontier:
                break

# Run once before raptor_search:
seeds = ['L2_N0002','L2_N0001']  # try the seeds that were just printed
debug_children(seeds, depth=2)



[seed] L2_N0002
  hop0: L2_N0002 -> children(1): ['L1_N0003']
  hop1: L1_N0003 -> children(1): ['C0005']

[seed] L2_N0001
  hop0: L2_N0001 -> children(2): ['L1_N0001', 'L1_N0002']
  hop1: L1_N0001 -> children(2): ['C0001', 'C0002']
  hop1: L1_N0002 -> children(2): ['C0003', 'C0004']
