In [5]:
from pathlib import Path
# BASE stops at the project directory, so the "outputs/" segment is
# appended exactly once for each data file below.
BASE = Path.home().joinpath("gitclone/NLP_study/09_Mini_Project/13_RAPTOR")

# Raw records: leaf chunks first, then the summary-tree nodes.
chunks_raw = read_jsonl(BASE.joinpath("outputs/chunks.jsonl"))
nodes_raw = read_jsonl(BASE.joinpath("outputs/tree_nodes.jsonl"))

In [6]:
import json, itertools
from collections import Counter

def read_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of the file; anything ``open`` accepts
            (for example a ``str`` or a ``pathlib.Path``).

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform's locale encoding, which breaks non-ASCII text on systems
    # whose default is not UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        # Blank / whitespace-only lines are skipped rather than parsed.
        return [json.loads(s) for s in map(str.strip, f) if s]

def sniff_keys(recs, n=50):
    """Tally how often each key appears among the first ``n`` records.

    Useful for getting a quick look at which keys a list of dict records
    actually carries. Returns a ``Counter`` mapping key -> occurrence count.
    """
    return Counter(key for rec in recs[:n] for key in rec.keys())

def pick_first_key(d, candidates, default=None):
    """Return the first entry of ``candidates`` that is a key of ``d``.

    Candidates are tried in the order given; only key presence matters,
    not the stored value. ``default`` is returned when none is present.
    """
    return next((name for name in candidates if name in d), default)

def standardize_nodes(nodes):
    """Normalize tree-node records of varying schema into one uniform shape.

    For each field, the first key from the list below that is present in
    the record is used:
      - id: 'id', 'node_id', 'nid', 'uid', 'name'
      - summary: 'summary', 'node_summary', 'text', 'desc', 'description', 'title'
      - children: 'children', 'child_ids', 'kids', 'edges', 'links'

    Args:
        nodes: Iterable of dict records.

    Returns:
        A list of ``{"id", "summary", "children"}`` dicts where ``summary``
        is always a ``str`` and ``children`` is always a ``list``. Records
        whose id is missing, ``None`` or ``""`` are dropped.
    """
    std = []
    for n in nodes:
        id_key = pick_first_key(n, ['id', 'node_id', 'nid', 'uid', 'name'])
        sum_key = pick_first_key(n, ['summary', 'node_summary', 'text', 'desc', 'description', 'title'])
        ch_key = pick_first_key(n, ['children', 'child_ids', 'kids', 'edges', 'links'])

        node_id = n.get(id_key)
        # Only None / "" mean "no id". A plain truthiness test would also
        # throw away legitimate falsy ids such as the integer 0.
        if node_id is None or node_id == "":
            continue

        summary = n.get(sum_key)
        if summary is None:
            # A present-but-null summary must not turn into the text "None".
            summary = ""
        elif not isinstance(summary, str):
            summary = str(summary)

        children = n.get(ch_key)
        if children is None:
            children = []
        elif isinstance(children, (tuple, set, frozenset)):
            children = list(children)
        elif not isinstance(children, list):
            # A single child given as a bare value (a string id, a number, ...)
            # is wrapped so that callers can always iterate over a list.
            children = [children]

        std.append({
            "id": node_id,
            "summary": summary,
            "children": children,
        })
    return std

def standardize_chunks(chunks):
    """Normalize chunk records of varying schema into an id -> text mapping.

    For each field, the first key from the list below that is present in
    the record is used:
      - id: 'id', 'chunk_id', 'cid', 'name'
      - text: 'text', 'content', 'raw', 'body'

    Args:
        chunks: Iterable of dict records.

    Returns:
        A dict ``{chunk_id: text}`` where ``text`` is always a ``str``.
        Records whose id is missing, ``None`` or ``""`` are skipped; when an
        id occurs more than once, the last record wins.
    """
    out = {}
    for c in chunks:
        id_key = pick_first_key(c, ['id', 'chunk_id', 'cid', 'name'])
        txt_key = pick_first_key(c, ['text', 'content', 'raw', 'body'])

        cid = c.get(id_key)
        # Only None / "" mean "no id". A plain truthiness test would also
        # skip legitimate falsy ids such as the integer 0.
        if cid is None or cid == "":
            continue

        txt = c.get(txt_key)
        if txt is None:
            # A present-but-null text must not turn into the text "None".
            txt = ""
        elif not isinstance(txt, str):
            txt = str(txt)
        out[cid] = txt
    return out


In [8]:
# Load the raw records
chunks_raw = read_jsonl(BASE / "outputs/chunks.jsonl")
nodes_raw = read_jsonl(BASE / "outputs/tree_nodes.jsonl")

# Inspect the schema (informational output only)
print("== chunks.jsonl key freq (head) ==")
print(sniff_keys(chunks_raw, n=20))
print("\n== tree_nodes.jsonl key freq (head) ==")
print(sniff_keys(nodes_raw, n=20))

# Normalize both record sets
nodes = standardize_nodes(nodes_raw)
chunk_map = standardize_chunks(chunks_raw)

# Spot-check one sample of each
print("\n[Sample node] ->", nodes[0] if nodes else "NO NODES")
# Chunk ids are not guaranteed to be strings, so check the type before
# calling startswith (a non-string id would otherwise raise AttributeError).
some_chunk_id = next(
    (cid for cid in chunk_map if isinstance(cid, str) and cid.startswith("C")),
    None,
)
print("[Sample chunk text exists?]", some_chunk_id is not None)


== chunks.jsonl key freq (head) ==
Counter({'chunk_id': 20, 'text': 20, 'tokens': 20})

== tree_nodes.jsonl key freq (head) ==
Counter({'node_id': 6, 'level': 6, 'children': 6, 'summary': 6})

[Sample node] -> {'id': 'L1_N0001', 'summary': 'This is the story of the Dursleys and the Potters.', 'children': ['C0001', 'C0002']}
[Sample chunk text exists?] True


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def raptor_search(query, nodes, chunk_map, topk_nodes=2, max_chunks_per_node=3):
    """Find the nodes whose summaries best match ``query`` and gather their chunks.

    Node summaries and the query are embedded with a TF-IDF vectorizer
    fitted on the summaries plus the query, and nodes are ranked by cosine
    similarity to the query.

    Args:
        query: Search text.
        nodes: List of dicts with "id", "summary" and "children" keys.
        chunk_map: Mapping of chunk id -> chunk text.
        topk_nodes: Maximum number of nodes to return. A value <= 0
            returns an empty list.
        max_chunks_per_node: Maximum number of chunks reported per node.

    Returns:
        A list (best match first) of dicts with keys "node_id",
        "node_summary", "linked_chunk_ids" and "chunk_texts".
        "linked_chunk_ids" and "chunk_texts" are parallel lists: entry i of
        one belongs to entry i of the other. Child ids that have no entry in
        ``chunk_map`` are left out of both.
    """
    # The slice [-topk_nodes:] below degenerates to "everything" for 0
    # ([-0:] == [0:]), so a non-positive k is handled up front.
    if topk_nodes <= 0:
        return []

    # 1) Rank nodes by their summaries
    sums = [n["summary"] or "" for n in nodes]

    # Nothing to rank when there are no nodes or every summary is empty
    if not any(sums):
        return []

    vec = TfidfVectorizer().fit(sums + [query])
    qv = vec.transform([query])
    nv = vec.transform(sums)
    sims = cosine_similarity(qv, nv)[0]

    # Indices of the k highest similarities, highest first
    top_idx = sims.argsort()[-topk_nodes:][::-1]

    results = []
    for idx in top_idx:
        node = nodes[idx]
        # 2) Keep only children that look like chunk ids (assumed to be
        #    strings starting with "C") and that actually have text.
        #    Filtering before truncating keeps ids and texts aligned.
        linked_ids = [
            c for c in node["children"]
            if isinstance(c, str) and c.startswith("C") and c in chunk_map
        ][:max_chunks_per_node]
        # 3) Collect the matching chunk texts, in the same order as the ids
        results.append({
            "node_id": node["id"],
            "node_summary": node["summary"],
            "linked_chunk_ids": linked_ids,
            "chunk_texts": [chunk_map[cid] for cid in linked_ids],
        })
    return results

# Example run
query = "What strange events happened on Privet Drive?"
res = raptor_search(query, nodes, chunk_map, topk_nodes=2, max_chunks_per_node=2)

for hit in res:
    print("📌 Node:", hit["node_id"])
    # Summaries are cut at 180 characters; the ellipsis marks a cut.
    node_summary = hit["node_summary"]
    summary_tail = "..." if len(node_summary) > 180 else ""
    print("📝 Summary:", node_summary[:180], summary_tail)
    print("🔗 Chunks:", hit["linked_chunk_ids"])
    # Chunk previews are cut at 160 characters and numbered from 1.
    for rank, chunk_text in enumerate(hit["chunk_texts"], start=1):
        chunk_tail = "..." if len(chunk_text) > 160 else ""
        print(f"   [{rank}] {chunk_text[:160]}{chunk_tail}")
    print("-" * 60)


📌 Node: L1_N0003
📝 Summary: Dudley and Petunia Dursley had a strange day. 
🔗 Chunks: ['C0005']
   [1] When Dudley had been put to bed, he went into the living room in time to catch the last report on the evening news:

“And finally, bird-watchers everywhere have...
------------------------------------------------------------
📌 Node: L3_N0001
📝 Summary: All images are copyrighted. 
🔗 Chunks: []
------------------------------------------------------------
