In [2]:
import json
from pathlib import Path

BASE = Path.cwd().parents[0] / "13_RAPTOR" if (Path.cwd().name != "13_RAPTOR") else Path.cwd()
OUT  = BASE / "outputs"

nodes_path = OUT / "tree_nodes.jsonl"
chunks_path = OUT / "chunks.jsonl"

# 1) 노드 정보 불러오기
nodes = [json.loads(l) for l in open(nodes_path, encoding="utf-8")]
node_info = {nd["node_id"]: (nd["level"], nd["children"], nd["summary"]) for nd in nodes}

# 2) 청크 텍스트 불러오기
chunk_text = {json.loads(l)["chunk_id"]: json.loads(l)["text"] for l in open(chunks_path, encoding="utf-8")}

print("✅ node_info:", len(node_info), "nodes")
print("✅ chunk_text:", len(chunk_text), "chunks")

✅ node_info: 6 nodes
✅ chunk_text: 227 chunks


In [6]:
# (꼭) node_info를 로드한 "바로 다음 셀"에 실행
from collections import defaultdict
levels = defaultdict(list)
for nid, (lvl, children, summ) in node_info.items():
    levels[lvl].append((nid, summ))

In [7]:
# 3) ✅ 패치 2: 확장 함수 새 버전
def _norm_id(x):  return x.strip() if isinstance(x, str) else x
def _is_chunk(x): return isinstance(x, str) and x.startswith("C")

def expand_to_leaves(seed_ids, hops=2):
    out, fr = [], list(map(_norm_id, seed_ids))
    for _ in range(hops):
        nxt=[]
        for nid in fr:
            if _is_chunk(nid):
                out.append(nid)
            elif nid in node_info:
                kids = [ _norm_id(c) for c in node_info[nid][1] ]
                nxt += kids
        fr = nxt
        if not fr: break
    if out:
        seen, uniq = set(), []
        for cid in out:
            if cid not in seen: uniq.append(cid); seen.add(cid)
        return uniq
    # 폴백들
    one_hop=[]
    for nid in map(_norm_id, seed_ids):
        if nid in node_info:
            one_hop += [ _norm_id(c) for c in node_info[nid][1] ]
    if one_hop: return one_hop
    return list(map(_norm_id, seed_ids))

In [8]:
# === 최소 유틸 & 인덱스 ===
import re, json, numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def _fit(texts):
    v = TfidfVectorizer(ngram_range=(1,2), max_features=60000)
    return v, v.fit_transform(texts)

def _score(M, v, q):
    return cosine_similarity(M, v.transform([q])).ravel()

def _is_chunk(x): return isinstance(x, str) and x.startswith("C")
def _norm_id(x):  return x.strip() if isinstance(x, str) else x

# 레벨 인덱스 (level -> [(node_id, summary)])
levels = defaultdict(list)
for nid, (lvl, children, summ) in node_info.items():
    levels[lvl].append((nid, summ))

# leaf summary (본문 없을 때 폴백용)
try:
    summ_smoke = OUT / "chunk_summaries_smoke.jsonl"
    summ_all   = OUT / "chunk_summaries.jsonl"
    summ_path  = summ_smoke if summ_smoke.exists() else summ_all
    leaf_summary = {json.loads(l)["chunk_id"]: json.loads(l)["summary"]
                    for l in open(summ_path, encoding="utf-8")}
except Exception:
    leaf_summary = {}


In [9]:
def select_start_level(q):
    ks = sorted(levels.keys())
    if not ks: return None
    qlen = len(re.findall(r"\w+", q))
    return (ks[-1] if qlen<=5 else (ks[-2] if len(ks)>1 and qlen<=12 else ks[0]))

def rank_nodes_at_level(q, L, topk=6):
    items = levels.get(L, [])
    if not items: return []
    ids  = [i for i,_ in items]
    txts = [t for _,t in items]
    v, M = _fit(txts); sims = _score(M, v, q)
    idx  = np.argsort(-sims)[:topk]
    return [ids[i] for i in idx]

def expand_to_leaves(seed_ids, hops=2):
    out, fr = [], list(map(_norm_id, seed_ids))
    for _ in range(hops):
        nxt=[]
        for nid in fr:
            if _is_chunk(nid): out.append(nid)
            elif nid in node_info: nxt += [ _norm_id(c) for c in node_info[nid][1] ]
        fr = nxt
        if not fr: break
    # 중복 제거
    seen, uniq = set(), []
    for cid in out:
        if cid not in seen: uniq.append(cid); seen.add(cid)
    return uniq

def rerank_chunks_with_fallback(q, candidate_cids, topk=5):
    # 1) 본문 기준
    ids, txts = [], []
    for cid in candidate_cids:
        if cid in chunk_text:
            ids.append(cid); txts.append(chunk_text[cid])
    if ids:
        v, M = _fit(txts); sims = _score(M, v, q)
        idx = np.argsort(-sims)[:topk]
        return [(ids[i], float(sims[i])) for i in idx]
    # 2) 폴백: 요약 기준
    ids, txts = [], []
    for cid in candidate_cids:
        if cid in leaf_summary:
            ids.append(cid); txts.append(leaf_summary[cid])
    if not ids: return []
    v, M = _fit(txts); sims = _score(M, v, q)
    idx = np.argsort(-sims)[:topk]
    return [(ids[i], float(sims[i])) for i in idx]

def raptor_search(q, topk_nodes=6, topk_chunks=5, hops=2):
    L = select_start_level(q)
    if L is None: return {"level": None, "nodes": [], "chunks": []}
    seeds = rank_nodes_at_level(q, L, topk=topk_nodes)
    cand  = expand_to_leaves(seeds, hops=hops) or seeds
    chunks = rerank_chunks_with_fallback(q, cand, topk=topk_chunks)
    return {"level": L, "nodes": seeds, "chunks": chunks}

def pretty_answer(q, sent_per_chunk=2):
    res = raptor_search(q)
    picks=[]
    for cid,_ in res["chunks"]:
        text = chunk_text.get(cid, "")
        sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
        if not sents: continue
        v, M = _fit(sents); sims = _score(M, v, q)
        idx = np.argsort(-sims)[:sent_per_chunk]
        picks.append(f"[{cid}] " + " ".join(sents[i] for i in idx))
    print(f"🔎 Q: {q}\n📍 start level: L{res['level']}\n📑 nodes: {res['nodes']}\n💬 A: {' '.join(picks) if picks else '(no matching evidence)'}")

# 사용 예시
pretty_answer("What strange events happened on Privet Drive?")


🔎 Q: What strange events happened on Privet Drive?
📍 start level: L2
📑 nodes: ['L2_N0002', 'L2_N0001']
💬 A: (no matching evidence)


In [10]:
q = "What strange events happened on Privet Drive?"
L = select_start_level(q)
seeds = rank_nodes_at_level(q, L, topk=6)
cand  = expand_to_leaves(seeds, hops=2)

print("start level:", L)
print("seeds:", seeds)
print("cand:", cand)

print("len(chunk_text):", len(chunk_text))
print("have chunk_text:", [c for c in cand if c in chunk_text])

try:
    print("len(leaf_summary):", len(leaf_summary))
    print("have leaf_summary:", [c for c in cand if c in leaf_summary])
except NameError:
    print("leaf_summary not loaded")


start level: 2
seeds: ['L2_N0002', 'L2_N0001']
cand: []
len(chunk_text): 227
have chunk_text: []
len(leaf_summary): 5
have leaf_summary: []
