In [7]:
import json
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# === 1. 데이터 불러오기 ===
chunks = []
with open("outputs/chunks.jsonl", "r") as f:
    for line in f:
        chunks.append(json.loads(line))

tree = []
with open("outputs/tree_nodes.jsonl", "r") as f:
    for line in f:
        tree.append(json.loads(line))

In [15]:
# === 2. 간단 검색 함수 ===
def raptor_search(query, tree, chunks, topk=2):
    node_ids = [n["id"] for n in tree]
    summaries = [n["summary"] for n in tree]

    # TF-IDF 기반 검색
    vec = TfidfVectorizer().fit(summaries + [query])
    q_vec = vec.transform([query])
    sims = cosine_similarity(q_vec, vec.transform(summaries))[0]

    # 상위 노드 뽑기
    top_idx = sims.argsort()[-topk:][::-1]
    results = []
    for idx in top_idx:
        node = tree[idx]
        child_chunks = [c for c in node.get("children", []) if c.startswith("C")]
        chunk_texts = [c["text"] for c in chunks if c["id"] in child_chunks]
        results.append({
            "node": node["id"],
            "summary": node["summary"],
            "chunks": chunk_texts
        })
    return results# === 3. 실행 ===
query = "What strange events happened on Privet Drive?"
res = raptor_search(query, tree, chunks)

for r in res:
    print("📌 Node:", r["node"])
    print("📝 Summary:", r["summary"])
    print("📑 Chunks:", r["chunks"][:2])  # 일부만 출력
    print("---")

In [16]:
# === 3. 실행 ===
query = "What strange events happened on Privet Drive?"
res = raptor_search(query, tree, chunks)

for r in res:
    print("📌 Node:", r["node"])
    print("📝 Summary:", r["summary"])
    print("📑 Chunks:", r["chunks"][:2])  # 일부만 출력
    print("---")

KeyError: 'id'

In [31]:
def detect_node2chunks_from_linkfile(path):
    """node_chunks.jsonl 류 (node_id -> [chunk_id,...]) 형태 탐지"""
    node2chunks = defaultdict(list)
    try:
        for obj in load_jsonl(path):
            nid = obj.get("node_id")
            cids = None
            for k in NODE_CHUNK_KEYS:
                if k in obj:
                    cids = obj[k]
                    break
            if nid and cids:
                if not isinstance(cids, list): cids = [cids]
                for cid in cids:
                    if cid in chunk_by_id:
                        node2chunks[nid].append(cid)
    except Exception:
        pass
    return dict(node2chunks)

In [32]:
def detect_node2chunks_from_reverse_linkfile(path):
    """chunk_node_links.jsonl 류 (chunk_id -> node_id/...) 형태 탐지 후 역인덱스"""
    node2chunks = defaultdict(list)
    try:
        for obj in load_jsonl(path):
            cid = obj.get("chunk_id")
            if not cid or cid not in chunk_by_id: 
                continue
            ref = None
            for k in CHUNK_NODE_KEYS:
                if k in obj:
                    ref = obj[k]; break
            if ref is None: 
                continue
            if not isinstance(ref, list): ref = [ref]
            for nid in ref:
                if nid in node_by_id:
                    node2chunks[nid].append(cid)
    except Exception:
        pass
    return dict(node2chunks)

node2chunks = {}

In [33]:
# 3) 링크 파일에서 먼저 찾아보기
for lf in link_files:
    node2chunks = detect_node2chunks_from_linkfile(lf)
    if node2chunks:
        print(f"✅ 링크 감지 (node->chunks): {lf.name}")
        break
    node2chunks = detect_node2chunks_from_reverse_linkfile(lf)
    if node2chunks:
        print(f"✅ 링크 감지 (chunk->nodes): {lf.name}")
        break


In [34]:
# 4) 링크 파일이 없거나 텅 비면, children/청크필드에서 휴리스틱 시도
if not node2chunks:
    # (A) children 안에 chunk_id 패턴이 섞인 경우
    chunk_like = re.compile(r"^c[\w-]+", re.I)
    for nid, nd in node_by_id.items():
        kids = nd.get("children") or []
        cids = [x for x in kids if isinstance(x, str) and chunk_like.match(x)]
        if cids:
            node2chunks[nid] = [cid for cid in cids if cid in chunk_by_id]

if not node2chunks:
    # (B) chunks.jsonl 안에 소속 노드 힌트가 들어있는 경우
    for ch in chunks:
        cid = ch.get("chunk_id")
        if not cid: 
            continue
        for k in ["node_id","belongs_to","owner_node","nodes","parents","node_path"]:
            if k in ch:
                ref = ch[k]
                if not isinstance(ref, list): ref = [ref]
                for nid in ref:
                    if nid in node_by_id:
                        node2chunks.setdefault(nid, []).append(cid)
                break

# 5) 보여주기: 청크가 붙은 노드 하나 찾아서 요약 + 청크 1~2개 미리보기
def preview_text(t, n=160):
    s = t if isinstance(t, str) else str(t)
    return (s[:n] + "...") if len(s) > n else s


In [35]:
if node2chunks:
    # 청크 달린 노드들만
    candidates = [(nid, cids) for nid, cids in node2chunks.items() if cids]
    if candidates:
        nid, cids = candidates[0]  # 첫 번째만 보여줌
        nd = node_by_id[nid]
        print("\n[노드]")
        print(" ID   :", nid)
        print(" 레벨 :", nd.get("level"))
        print(" 자식 :", nd.get("children"))
        print(" 요약 :", nd.get("summary"))
        print("\n[연결된 청크 미리보기]")
        for cid in cids[:2]:
            ch = chunk_by_id.get(cid, {})
            print(f" - {cid}:", preview_text(ch.get("text", "")))
    else:
        print("\n⚠️ 링크는 감지했지만, 실제 연결된 청크 리스트가 비어있습니다.")
else:
    print("\n⚠️ 청크-노드 연결을 찾지 못했습니다.")
    print("   - outputs 폴더에 'node_chunks.jsonl' 또는 'chunk_node_links.jsonl' 같은 파일이 있는지 확인")
    print("   - 파일 안에서 'node_id', 'chunk_ids/chunks/sources' 등의 키가 있는지 확인")
    print("   - 키 이름이 다르면 위 NODE_CHUNK_KEYS / CHUNK_NODE_KEYS에 추가 후 다시 실행")


[노드]
 ID   : L1_N0001
 레벨 : 1
 자식 : ['C0001', 'C0002']
 요약 : This is the story of the Dursleys and the Potters.

[연결된 청크 미리보기]
 - C0001: M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d exp...
 - C0002: Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. “Little tyke,”...


In [36]:
#특정 노드 하나만 보기

In [37]:
nid = "L1_N0001"  # 보고 싶은 노드ID로 변경
lvl, kids, summ = node_by_id[nid].get("level"), node_by_id[nid].get("children"), node_by_id[nid].get("summary")
print(f"[노드ID] {nid}\n[레벨] {lvl}\n[자식] {kids}\n[요약] {summ}")
for cid in node2chunks.get(nid, [])[:3]:  # 청크 3개 미리보기
    print(f"- {cid}: {chunk_by_id[cid]['text'][:160]}...")

[노드ID] L1_N0001
[레벨] 1
[자식] ['C0001', 'C0002']
[요약] This is the story of the Dursleys and the Potters.
- C0001: M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d exp...
- C0002: Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. “Little tyke,”...


In [39]:
#2) “루트→자식→leaf” 따라가서 첫 청크까지 보기

In [40]:
def first_leaf_with_chunks(root_id):
    stack = [root_id]
    seen = set()
    while stack:
        nid = stack.pop()
        if nid in seen: 
            continue
        seen.add(nid)
        cids = node2chunks.get(nid, [])
        if cids:  # 청크 달린 leaf(또는 중간노드) 발견
            return nid, cids
        for child in node_by_id[nid].get("children", [])[::-1]:
            if child in node_by_id:
                stack.append(child)
    return None, []

root = "L1_N0001"  # 시작 노드
nid, cids = first_leaf_with_chunks(root)
print("→ 찾은 노드:", nid)
for cid in cids[:2]:
    print(f"- {cid}: {chunk_by_id[cid]['text'][:160]}...")


→ 찾은 노드: L1_N0001
- C0001: M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d exp...
- C0002: Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. “Little tyke,”...


In [41]:
# 3) 레벨별로 “한 노드씩” 샘플 들여다보기

In [42]:
from collections import defaultdict

levels = defaultdict(list)
for nid, nd in node_by_id.items():
    levels[nd.get("level")].append(nid)

for lvl in sorted(levels):
    sample = levels[lvl][0]
    print(f"\n=== 레벨 {lvl} 샘플: {sample} ===")
    print("요약:", node_by_id[sample].get("summary"))
    for cid in node2chunks.get(sample, [])[:1]:
        print(f"- {cid}: {chunk_by_id[cid]['text'][:160]}...")



=== 레벨 1 샘플: L1_N0001 ===
요약: This is the story of the Dursleys and the Potters.
- C0001: M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d exp...

=== 레벨 2 샘플: L2_N0001 ===
요약: Harry Potter and the Philosopher’s Stone by JK Rowling

=== 레벨 3 샘플: L3_N0001 ===
요약: All images are copyrighted.


In [13]:
# 청크가 연결된 노드를 하나 찾기
chunked_nodes = [nid for nid, (_, _, _, cids) in node_info.items() if cids]

if chunked_nodes:
    nid = chunked_nodes[0]   # 첫 번째로 청크가 달린 노드
    level, children, summary, node_chunks = node_info[nid]

    print(f"\n[노드ID] {nid}")
    print(f"[레벨]   {level}")
    print(f"[자식]   {children}")
    print(f"[요약]   {summary}")

    print("\n=== 연결된 청크 예시 ===")
    for cid in node_chunks[:2]:  # 앞에서 2개만
        print(f"chunk {cid}: {chunks.get(cid,'(없음)')[:120]}...")
else:
    print("⚠️ 청크가 붙은 노드가 하나도 없음")

⚠️ 청크가 붙은 노드가 하나도 없음


In [25]:
#LABEL INDEX

In [26]:
from collections import defaultdict
levels = defaultdict(list)  # level -> [(node_id, summary)]
for nid, (lvl, children, summ) in node_info.items():
    levels[lvl].append((nid, summ))

In [27]:
#TF-IDF 유틸

In [28]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def _fit(texts):
    v = TfidfVectorizer(ngram_range=(1,2), max_features=60000)
    return v, v.fit_transform(texts)

def _score(M, v, q):
    return cosine_similarity(M, v.transform([q])).ravel()


In [29]:
#시작 레벨 선택

In [30]:
import re
def select_start_level(q:str):
    ks = sorted(levels.keys())
    if not ks: return None
    qn = len(re.findall(r"\w+", q))
    return (ks[-1] if qn<=5 else (ks[-2] if len(ks)>1 and qn<=12 else ks[0]))


In [31]:
#5 — 레벨 내 노드 랭킹

In [32]:
def rank_nodes_at_level(q, L, topk=6):
    items = levels.get(L, [])
    if not items: return []
    ids  = [i for i,_ in items]
    txts = [t for _,t in items]
    v, M = _fit(txts); sims = _score(M, v, q)
    idx = np.argsort(-sims)[:topk]
    return [ids[i] for i in idx]


In [33]:
#확장(FIX: 공백정규화 + 폴백)

In [34]:
def _norm_id(x):  return x.strip() if isinstance(x, str) else x
def _is_chunk(x): return isinstance(x, str) and x.startswith("C")

def expand_to_leaves(seed_ids, hops=2):
    out, fr = [], list(map(_norm_id, seed_ids))
    for _ in range(hops):
        nxt=[]
        for nid in fr:
            if _is_chunk(nid): out.append(nid)
            elif nid in node_info: nxt += [_norm_id(c) for c in node_info[nid][1]]
        fr = nxt
        if not fr: break
    if out:
        seen, uniq = set(), []
        for cid in out:
            if cid not in seen: uniq.append(cid); seen.add(cid)
        return uniq
    # 폴백1: 한 hop 자식 반환(L2->L1 등)
    one_hop=[]
    for nid in map(_norm_id, seed_ids):
        if nid in node_info: one_hop += [_norm_id(c) for c in node_info[nid][1]]
    if one_hop: return one_hop
    # 폴백2: seed 그대로
    return list(map(_norm_id, seed_ids))


In [35]:
# 4) L1 → C 강제 변환
def to_chunks(ids, max_hops=3):
    out, fr = [], list(ids)
    for _ in range(max_hops):
        nxt=[]
        for x in fr:
            if isinstance(x, str) and x.startswith("C"):
                out.append(x)
            elif x in node_info:
                nxt += node_info[x][1]
        fr = nxt
        if not fr: break
    seen, uniq = set(), []
    for c in out:
        if c not in seen: uniq.append(c); seen.add(c)
    return uniq


In [36]:
#Cell 7 — 리프 요약 로드(폴백용)

In [37]:
try:
    summ_smoke = OUT / "chunk_summaries_smoke.jsonl"
    summ_all   = OUT / "chunk_summaries.jsonl"
    summ_path  = summ_smoke if summ_smoke.exists() else summ_all
    leaf_summary = {json.loads(l)["chunk_id"]: json.loads(l)["summary"] for l in open(summ_path, encoding="utf-8")}
except Exception:
    leaf_summary = {}
print("✅ leaf_summary:", len(leaf_summary))


✅ leaf_summary: 5


In [38]:
#Cell 8 — 재랭킹(본문 우선, 요약 폴백)

In [39]:
def rerank_chunks_with_fallback(q, candidate_cids, topk=5):
    # 본문 기준
    ids, txts = [], []
    for cid in candidate_cids:
        if cid in chunk_text:
            ids.append(cid); txts.append(chunk_text[cid])
    if ids:
        v, M = _fit(txts); sims = _score(M, v, q)
        idx = np.argsort(-sims)[:topk]
        return [(ids[i], float(sims[i])) for i in idx]
    # 요약 폴백
    ids, txts = [], []
    for cid in candidate_cids:
        if cid in leaf_summary:
            ids.append(cid); txts.append(leaf_summary[cid])
    if not ids: return []
    v, M = _fit(txts); sims = _score(M, v, q)
    idx = np.argsort(-sims)[:topk]
    return [(ids[i], float(sims[i])) for i in idx]


In [40]:
#Cell 9 — RAPTOR 검색 + 답변

In [41]:
def raptor_search(q, topk_nodes=6, topk_chunks=5, hops=2):
    L = select_start_level(q)
    if L is None: return {"level": None, "nodes": [], "chunks": []}
    seeds = rank_nodes_at_level(q, L, topk=topk_nodes)
    cand  = expand_to_leaves(seeds, hops=hops) or seeds
    cand  = to_chunks(cand, max_hops=3) or cand   # ⭐ 여기를 추가: L1 → C로 강제 변환
    chunks = rerank_chunks_with_fallback(q, cand, topk=topk_chunks)
    return {"level": L, "nodes": seeds, "chunks": chunks}

def pretty_answer(q, sent_per_chunk=3):
    res = raptor_search(q)
    picks=[]
    for cid,_ in res["chunks"]:
        text = chunk_text.get(cid, "")
        sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
        if not sents: continue
        v, M = _fit(sents); sims = _score(M, v, q)
        idx = np.argsort(-sims)[:sent_per_chunk]
        picks.append(f"[{cid}] " + " ".join(sents[i] for i in idx))
    print(f"🔎 Q: {q}\n📍 start level: L{res['level']}\n📑 nodes: {res['nodes']}\n💬 A: {' '.join(picks) if picks else '(no matching evidence)'}")


In [42]:
pretty_answer("What strange events happened on Privet Drive?")
pretty_answer("Describe the Dursleys and how they feel about magic.")

🔎 Q: What strange events happened on Privet Drive?
📍 start level: L2
📑 nodes: ['L2_N0002', 'L2_N0001']
💬 A: [C0002] There was a tabby cat standing on the corner of Privet Drive, but there wasn’t a map in sight. It was now reading the sign that said Privet Drive — no, looking at the sign; cats couldn’t read maps or signs. What could he have been thinking of? [C0001] Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. Dursley woke up on the dull, gray Tuesday our story starts, there was nothing about the cloudy sky outside to suggest that strange and mysterious things would soon be happening all over the country. The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. [C0003] It was on his way back past them, clutching a large doughnut in a bag, that he caught a few words of what they were saying. “The Potters, that’s right, that’s what I heard —”

“ — yes, their son, Harry —”

Mr. Dursl

In [43]:
# ===================== RAPTOR Day3 : 왕초보용 검색 스켈레톤 =====================
# 쿼리 → 관련 노드(요약) → 연결 청크 → 간단 요약 답변
# - 외부 라이브러리 없이 동작 (정규식/중복 단어 겹침 기반)
# - 파일: tree_nodes.jsonl, chunks.jsonl, (node_chunks.jsonl 또는 유사 링크 파일)
# ================================================================================

import json, re
from pathlib import Path
from collections import defaultdict, Counter

# --------------------------- 유틸: 텍스트 전처리 ---------------------------
WORD_RE = re.compile(r"[A-Za-z가-힣0-9']+")

def tokenize(text: str):
    if not text:
        return []
    return [w.lower() for w in WORD_RE.findall(text)]

def wordset(text: str):
    return set(tokenize(text))

def sent_split(text: str):
    # 아주 단순한 문장 분리 (., !, ? 기준)
    if not isinstance(text, str):
        return []
    parts = re.split(r"(?<=[.!?\n])\s+", text.strip())
    # 빈 문장 제거
    return [s.strip() for s in parts if s.strip()]

def overlap_score(query_ws: set, text: str):
    if not text: 
        return 0
    ws = wordset(text)
    if not ws:
        return 0
    return len(query_ws & ws)

# --------------------------- JSONL 로드 ---------------------------
def load_jsonl(path: Path):
    with open(path, encoding="utf-8") as f:
        for line in f:
            line=line.strip()
            if line:
                yield json.loads(line)

# --------------------------- 경로 추정 ---------------------------
def find_base():
    CWD = Path.cwd()
    if CWD.name == "13_RAPTOR":
        return CWD
    cands = [CWD/"13_RAPTOR", CWD.parent/"13_RAPTOR", CWD/"09_Mini_Project"/"13_RAPTOR"]
    for p in cands:
        if p.exists():
            return p
    return CWD

BASE = find_base()
OUT = BASE / "outputs"

# --------------------------- 데이터 읽기 ---------------------------
nodes_path  = OUT / "tree_nodes.jsonl"
chunks_path = OUT / "chunks.jsonl"

nodes  = list(load_jsonl(nodes_path))
chunks = list(load_jsonl(chunks_path))

node_by_id  = {nd["node_id"]: nd for nd in nodes if "node_id" in nd}
chunk_by_id = {ch["chunk_id"]: ch for ch in chunks if "chunk_id" in ch}

# --------------------------- 노드↔청크 링크 자동 감지 ---------------------------
# (outputs 폴더에서 map/link 파일을 찾아 node2chunks를 구성)
NODE_CHUNK_KEYS = ["chunk_ids", "chunks", "sources"]
CHUNK_NODE_KEYS = ["node_id", "belongs_to", "owner_node", "nodes", "parents", "node_path"]

def detect_node2chunks():
    node2chunks = defaultdict(list)

    # 1) 별도 링크 파일 스캔
    for p in OUT.glob("*"):
        if p.suffix.lower() not in {".jsonl", ".json", ".ndjson"}:
            continue
        if not re.search(r"(map|link|assign|attach|node.*chunk|chunk.*node|leaf)", p.name, re.I):
            continue
        # (a) node -> chunks 구조
        try:
            for obj in load_jsonl(p):
                nid = obj.get("node_id")
                if not nid:
                    continue
                cids = None
                for k in NODE_CHUNK_KEYS:
                    if k in obj:
                        cids = obj[k]
                        break
                if cids:
                    if not isinstance(cids, list):
                        cids = [cids]
                    for cid in cids:
                        if cid in chunk_by_id:
                            node2chunks[nid].append(cid)
        except Exception:
            pass

        # 링크가 생겼다면 OK
        if node2chunks:
            return dict(node2chunks)

        # (b) chunk -> nodes 구조 (역인덱스)
        try:
            tmp = defaultdict(list)
            for obj in load_jsonl(p):
                cid = obj.get("chunk_id")
                if not cid or cid not in chunk_by_id:
                    continue
                ref = None
                for k in CHUNK_NODE_KEYS:
                    if k in obj:
                        ref = obj[k]
                        break
                if ref is None:
                    continue
                if not isinstance(ref, list):
                    ref = [ref]
                for nid in ref:
                    if nid in node_by_id:
                        tmp[nid].append(cid)
            if tmp:
                return dict(tmp)
        except Exception:
            pass

    # 2) 링크 파일이 없으면 휴리스틱들

    # (A) children에 chunk_id가 섞인 경우 (예: 'C0001' 패턴)
    chunk_like = re.compile(r"^c[\w-]+", re.I)
    for nid, nd in node_by_id.items():
        kids = nd.get("children") or []
        cids = [x for x in kids if isinstance(x, str) and chunk_like.match(x)]
        for cid in cids:
            if cid in chunk_by_id:
                node2chunks[nid].append(cid)

    if node2chunks:
        return dict(node2chunks)

    # (B) 청크 쪽에 소속 노드 힌트가 있는 경우
    tmp = defaultdict(list)
    for ch in chunks:
        cid = ch.get("chunk_id")
        if not cid:
            continue
        for k in CHUNK_NODE_KEYS:
            if k in ch:
                ref = ch[k]
                if not isinstance(ref, list):
                    ref = [ref]
                for nid in ref:
                    if nid in node_by_id:
                        tmp[nid].append(cid)
                break
    if tmp:
        return dict(tmp)

    # 아무것도 못 찾음
    return {}

node2chunks = detect_node2chunks()

# --------------------------- 노드 스코어링 (요약 기반) ---------------------------
def rank_nodes_by_query(query: str, top_k: int = 3):
    qset = wordset(query)
    scored = []
    for nid, nd in node_by_id.items():
        summ = nd.get("summary", "")
        score = overlap_score(qset, summ)
        # 보너스: 제목/요약이 짧을수록 가점 (간단한 길이 보정)
        score += max(0, 5 - len(summ.split())//40)
        if score > 0:
            scored.append((score, nid))
    scored.sort(reverse=True)
    return [nid for _, nid in scored[:top_k]]

# --------------------------- 청크 스코어링 (내용 기반) ---------------------------
def rank_chunks_for_nodes(query: str, nids, per_node: int = 3):
    qset = wordset(query)
    results = []
    for nid in nids:
        cids = node2chunks.get(nid, [])
        scored = []
        for cid in cids:
            text = chunk_by_id.get(cid, {}).get("text", "")
            sc = overlap_score(qset, text)
            if sc > 0:
                scored.append((sc, cid))
        scored.sort(reverse=True)
        results.extend([cid for _, cid in scored[:per_node]])
    # 중복 제거, 간단 정렬 유지
    seen, ordered = set(), []
    for cid in results:
        if cid not in seen:
            seen.add(cid)
            ordered.append(cid)
    return ordered

# --------------------------- 간단 요약 생성 (추출식) ---------------------------
def make_answer(query: str, chunk_texts, max_sents: int = 5, max_chars: int = 800):
    # 문장 단위로 나눠서, 쿼리와 겹치는 문장을 우선 선택
    qset = wordset(query)
    cand_sents = []
    for txt in chunk_texts:
        for s in sent_split(txt):
            sc = overlap_score(qset, s)
            cand_sents.append((sc, s))

    # 점수 높은 문장부터 뽑되, 너무 비슷한 문장 반복 방지(간단 중복 제거)
    cand_sents.sort(key=lambda x: x[0], reverse=True)
    picked, seen = [], set()
    for sc, s in cand_sents:
        norm = " ".join(tokenize(s))[:120]  # 간단한 중복 키
        if norm in seen:
            continue
        picked.append(s)
        seen.add(norm)
        if len(picked) >= max_sents:
            break

    if not picked:
        # 백업: 첫 청크 앞부분을 잘라서 반환
        fallback = (chunk_texts[0] if chunk_texts else "").strip()
        return (fallback[:max_chars] + "...") if len(fallback) > max_chars else fallback

    ans = " ".join(picked)
    return (ans[:max_chars] + "...") if len(ans) > max_chars else ans

# --------------------------- 메인: raptor_search ---------------------------
def raptor_search(query: str, top_k_nodes=3, chunks_per_node=3, max_sents=5, max_chars=800):
    # 1) 관련 노드 고르기 (요약 매칭)
    cand_nodes = rank_nodes_by_query(query, top_k=top_k_nodes)
    # 백업: 점수 0이면 루트/레벨 낮은 순서에서 몇 개라도 선택
    if not cand_nodes:
        lvl_pairs = sorted([(nd.get("level", 999), nid) for nid, nd in node_by_id.items()])[:top_k_nodes]
        cand_nodes = [nid for _, nid in lvl_pairs]

    # 2) 노드에 연결된 청크 중에서 관련도 높은 것 추리기
    cand_chunks = rank_chunks_for_nodes(query, cand_nodes, per_node=chunks_per_node)
    if not cand_chunks:
        # 백업: 노드에 달린 첫 청크라도 사용
        for nid in cand_nodes:
            for cid in node2chunks.get(nid, [])[:chunks_per_node]:
                cand_chunks.append(cid)
        # 그래도 없으면 전체 청크에서 일부
        if not cand_chunks:
            cand_chunks = list(chunk_by_id.keys())[:chunks_per_node]

    # 3) 답변 만들기 (추출식)
    chunk_texts = [chunk_by_id[cid]["text"] for cid in cand_chunks if cid in chunk_by_id]
    answer = make_answer(query, chunk_texts, max_sents=max_sents, max_chars=max_chars)

    # 4) 디버그/설명용 메타
    used = {
        "nodes": [{"id": nid, "summary": node_by_id[nid].get("summary","")} for nid in cand_nodes],
        "chunks": [{"id": cid, "preview": (chunk_by_id[cid].get("text","")[:160] + "...")} for cid in cand_chunks]
    }
    return answer, used

# ============================ 사용 예시 ============================
# query = "Who are the Dursleys?"
# ans, meta = raptor_search(query)
# print("[Answer]\n", ans)
# print("\n[Used nodes]")
# for it in meta["nodes"]: print("-", it["id"], ":", it["summary"])
# print("\n[Used chunks]")
# for it in meta["chunks"]: print("-", it["id"], ":", it["preview"])
# ==================================================================


In [9]:
import json
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# === 1. 데이터 불러오기 ===
with open("hp_chunks_100tok.dedup.jsonl", "r") as f:
    chunks = [json.loads(line) for line in f]

with open("raptor_tree.json", "r") as f:
    tree = json.load(f)  # Day2에서 만든 노드-요약 구조

# === 2. 간단 검색 함수 ===
def raptor_search(query, tree, chunks, topk=2):
    # 노드 요약 텍스트 모음
    node_ids = list(tree.keys())
    summaries = [tree[n]["summary"] for n in node_ids]

    # TF-IDF 기반 검색
    vec = TfidfVectorizer().fit(summaries + [query])
    q_vec = vec.transform([query])
    sims = cosine_similarity(q_vec, vec.transform(summaries))[0]

    # 상위 노드 뽑기
    top_idx = sims.argsort()[-topk:][::-1]
    results = []
    for idx in top_idx:
        node_id = node_ids[idx]
        node = tree[node_id]
        child_chunks = [c for c in node["children"] if c.startswith("C")]
        chunk_texts = [c["text"] for c in chunks if c["id"] in child_chunks]
        results.append({
            "node": node_id,
            "summary": node["summary"],
            "chunks": chunk_texts
        })
    return results

# === 3. 실행 ===
query = "What strange events happened on Privet Drive?"
res = raptor_search(query, tree, chunks)

for r in res:
    print("📌 Node:", r["node"])
    print("📝 Summary:", r["summary"])
    print("📑 Chunks:", r["chunks"][:2])  # 일부만 출력
    print("---")


FileNotFoundError: [Errno 2] No such file or directory: 'raptor_tree.json'