In [174]:
"""
 BASE = 13_RAPTOR/
 ├── DATA   → 13_RAPTOR/data
 │    └── harry_potter.txt   (원본 문서)
 │
 ├── SRC    → 13_RAPTOR/src
 │    ├── chunking.py
 │    ├── summarize_chunks.py
 │    └── build_tree.py      (코드 모듈들)
 │
 └── OUT    → 13_RAPTOR/outputs
      ├── chunks.jsonl            (Step 1 결과: 문서 → 청크)
      ├── chunk_summaries.jsonl   (Step 2 결과: 청크 → 요약)
      ├── tree_nodes.jsonl        (Step 3 결과: 트리 구조 노드)
      └── tree_root.json          (Step 3 결과: 최종 루트 요약)

"""

'\n BASE = 13_RAPTOR/\n ├── DATA   → 13_RAPTOR/data\n │    └── harry_potter.txt   (원본 문서)\n │\n ├── SRC    → 13_RAPTOR/src\n │    ├── chunking.py\n │    ├── summarize_chunks.py\n │    └── build_tree.py      (코드 모듈들)\n │\n └── OUT    → 13_RAPTOR/outputs\n      ├── chunks.jsonl            (Step 1 결과: 문서 → 청크)\n      ├── chunk_summaries.jsonl   (Step 2 결과: 청크 → 요약)\n      ├── tree_nodes.jsonl        (Step 3 결과: 트리 구조 노드)\n      └── tree_root.json          (Step 3 결과: 최종 루트 요약)\n\n'

In [175]:
!pip install -q sentencepiece tokenizers transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [176]:
#Step 0. 준비

In [177]:
import json
from pathlib import Path
from tqdm.auto import tqdm

In [178]:
# 현재 노트북이 13_RAPTOR 안에서 열려 있으니, CWD 기준으로 고정
BASE = Path.cwd()                 # /.../09_Mini_Project/13_RAPTOR
DATA = BASE / "data"
OUT  = BASE / "outputs"
SRC  = BASE / "src"
OUT.mkdir(parents=True, exist_ok=True)

In [179]:
print("BASE:", BASE)
print("DATA:", DATA)
print("OUT :", OUT)

BASE: /Users/jessicahong/gitclone/NLP_study/09_Mini_Project/13_RAPTOR
DATA: /Users/jessicahong/gitclone/NLP_study/09_Mini_Project/13_RAPTOR/data
OUT : /Users/jessicahong/gitclone/NLP_study/09_Mini_Project/13_RAPTOR/outputs


In [180]:
#Step 1. Chunking

In [181]:
# === 문장 분리 함수 ===
def split_sentences(text: str):
    """간단한 문장 단위 분리 (마침표, 물음표, 느낌표 기준)"""
    sents = re.split(r'(?<=[.!?])\s+', text.strip())
    return [s for s in sents if s]

In [182]:
# === 청크 생성 함수 ===
def chunk_by_sentences(sents, max_chars=2000):
    """문장을 이어 붙이다가 max_chars 넘으면 새 청크 시작"""
    chunks, cur, cur_len = [], [], 0
    for s in sents:
        if cur_len + len(s) > max_chars and cur:
            chunks.append(" ".join(cur))
            cur, cur_len = [], 0
        cur.append(s); cur_len += len(s) + 1
    if cur:
        chunks.append(" ".join(cur))
    return chunks

In [183]:
#2) Step 1: 문서 로드 → 청크 저장

In [184]:
# === 문서 경로 지정 (현재 위치에서 ../../11_data/ 안에 있음) ===
# 문서 경로: 현재 위치에서 ../../11_data/ 아래에 파일이 있음
DOC_NAME = "01 Harry Potter and the Sorcerers Stone.txt"
doc_path = Path("../../11_data") / DOC_NAME

In [185]:
# 문서 읽기 → 문장 분리 → 청크 생성
text   = doc_path.read_text(encoding="utf-8")
sents  = split_sentences(text)
chunks = chunk_by_sentences(sents, max_chars=2000)  # Jessica가 2000 사용

In [186]:
# === 문장 분리 & 청크 생성 ===
# chunks.jsonl 저장
chunk_path = OUT / "chunks.jsonl"
with chunk_path.open("w", encoding="utf-8") as f:
    for i, ch in enumerate(chunks, 1):
        f.write(json.dumps({
            "chunk_id": f"C{i:04d}",
            "text": ch,
            "tokens": len(ch.split())
        }, ensure_ascii=False) + "\n")

In [187]:
print("✅ chunks.jsonl 저장 완료:", chunk_path)
print("총 문장 수:", len(sents))
print("생성된 청크 수:", len(chunks))
print("첫 청크 미리보기:\n", chunks[0][:300], "...")

✅ chunks.jsonl 저장 완료: /Users/jessicahong/gitclone/NLP_study/09_Mini_Project/13_RAPTOR/outputs/chunks.jsonl
총 문장 수: 5003
생성된 청크 수: 227
첫 청크 미리보기:
 M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense. Mr. Dursley was the director of a fi ...


In [188]:
#✅ 셀 3 — PEGASUS 준비

In [189]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [190]:
model_name = "google/pegasus-xsum"
device = "mps" if torch.backends.mps.is_available() else "cpu"

In [191]:
# 로컬에 있으면 로컬로 로드, 없으면 자동 다운로드
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [192]:
#Step 2 — Chunk Summarization

In [193]:
def summarize_pegasus(text, max_in=512, max_out=64, num_beams=4):
    inputs = tok(text, return_tensors="pt", truncation=True, max_length=max_in).to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_length=max_out,
            num_beams=num_beams,
            early_stopping=True,
            no_repeat_ngram_size=3,
        )
    return tok.decode(out[0], skip_special_tokens=True)

print(f"✅ PEGASUS 준비 완료 (device={device})")


✅ PEGASUS 준비 완료 (device=mps)


In [194]:
if have_local_model(model_name):
    print("✅ PEGASUS 로컬에 이미 있습니다.")
else:
    print("⬇️ PEGASUS 다운로드 중… (인터넷 필요)")
    tok = AutoTokenizer.from_pretrained(model_name)             # 다운로드
    mdl = AutoModelForSeq2SeqLM.from_pretrained(model_name)     # 다운로드
    print("✅ 다운로드 완료")

device = "mps" if torch.backends.mps.is_available() else "cpu"
print("사용 장치:", device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ PEGASUS 로컬에 이미 있습니다.
사용 장치: mps


In [195]:
#3) 요약 함수 (PEGASUS)

In [196]:
from tqdm.auto import tqdm
import pandas as pd
chunks_path = OUT / "chunks.jsonl"

In [197]:
# --- 4-1) 스모크 테스트: 앞 5개만 ---
summ_smoke = OUT / "chunk_summaries_smoke.jsonl"
sample = []
with open(chunks_path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f, 1):
        sample.append(json.loads(line))
        if i >= 5: break

In [198]:
with open(summ_smoke, "w", encoding="utf-8") as fout:
    for obj in tqdm(sample, desc="Smoke summarizing (5)"):
        cid, text = obj["chunk_id"], obj["text"]
        summ = summarize_pegasus(text, max_in=512, max_out=96, num_beams=4)  # 살짝 길게
        item = {
            "chunk_id": cid,
            "summary": summ,
            "key_points": [s.strip() for s in summ.split(". ") if s.strip()][:4]
        }
        fout.write(json.dumps(item, ensure_ascii=False) + "\n")

print("✅ 스모크 요약 저장:", summ_smoke)

Smoke summarizing (5):   0%|          | 0/5 [00:00<?, ?it/s]

✅ 스모크 요약 저장: /Users/jessicahong/gitclone/NLP_study/09_Mini_Project/13_RAPTOR/outputs/chunk_summaries_smoke.jsonl


In [199]:
pd.set_option("display.max_colwidth", None)
df_smoke = pd.read_json(summ_smoke, lines=True)
display(df_smoke)

Unnamed: 0,chunk_id,summary,key_points
0,C0001,This is the story of the Dursleys and the Potters.,[This is the story of the Dursleys and the Potters.]
1,C0002,"The Dursleys left the house for the day, with Mr. Dursley couldn’t bear people who dressed in funny clothes — the getups you saw on young people!","[The Dursleys left the house for the day, with Mr, Dursley couldn’t bear people who dressed in funny clothes — the getups you saw on young people!]"
2,C0003,"On the morning of the first day of school, Mr.","[On the morning of the first day of school, Mr.]"
3,C0004,The first thing Mr.,[The first thing Mr.]
4,C0005,Dudley and Petunia Dursley had a strange day.,[Dudley and Petunia Dursley had a strange day.]


In [217]:
# 3. meta_summarize 선택 (멀티뉴스 시도 → 실패하면 XSum 폴백)
TRY_MULTINEWS = False  # 멀티뉴스에서 에러 많이 났으니 우선 False 권장. OK면 True로 바꿔도 됨.

# summarize_pegasus가 위에서 정의돼 있어야 XSum 폴백이 작동해요.
if TRY_MULTINEWS:
    try:
        meta_summarize = _build_meta_with_multinews()
        print("✅ meta: pegasus-multi_news 사용")
    except Exception as e:
        print(f"ℹ️ multi_news 로드 실패 → XSum으로 폴백: {e}")
        meta_summarize = _build_meta_with_xsum()
        print("✅ meta: pegasus-xsum(프롬프트 강화) 사용")
else:
    meta_summarize = _build_meta_with_xsum()
    print("✅ meta: pegasus-xsum(프롬프트 강화) 사용")

# 4. fanout 설정 (스모크=2, 전량=6)
fanout = 2 if len(leaves) <= 10 else 6
print(f"fanout = {fanout}, leaves = {len(leaves)}")


✅ meta: pegasus-xsum(프롬프트 강화) 사용
fanout = 2, leaves = 5


In [200]:
#트리빌드

In [218]:
# 5. 트리 빌드
level, nodes, current = 0, [], leaves
while len(current) > 1:
    level += 1
    grouped = [current[i:i+fanout] for i in range(0, len(current), fanout)]
    next_level = []
    for gi, group in enumerate(grouped, 1):
        children = [cid for cid,_ in group]
        texts    = [t   for _,t   in group]
        summ = meta_summarize(texts)
        node_id = f"L{level}_N{gi:04d}"
        nodes.append({"node_id": node_id, "level": level, "children": children, "summary": summ})
        next_level.append((node_id, summ))
    current = next_level

In [219]:
## 6. 저장 + 루트 미리보기
root_id, root_text = current[0]
nodes_path.write_text("\n".join(json.dumps(n, ensure_ascii=False) for n in nodes), encoding="utf-8")
root_path.write_text(json.dumps({"root_id": root_id, "summary": root_text}, ensure_ascii=False, indent=2), encoding="utf-8")

print("✅ tree_nodes.jsonl:", nodes_path)
print("✅ tree_root.json :", root_path)
print("\n📌 Root Summary:\n", root_path.read_text(encoding="utf-8"))

✅ tree_nodes.jsonl: /Users/jessicahong/gitclone/NLP_study/09_Mini_Project/13_RAPTOR/outputs/tree_nodes.jsonl
✅ tree_root.json : /Users/jessicahong/gitclone/NLP_study/09_Mini_Project/13_RAPTOR/outputs/tree_root.json

📌 Root Summary:
 {
  "root_id": "L3_N0001",
  "summary": "Do you think you know what is going on in the story?"
}


In [220]:
def _build_meta_with_xsum():
    # summarize_pegasus() 가 이미 정의돼 있다고 가정 (google/pegasus-xsum)
    def _fn(texts, max_in=512, max_out=220, num_beams=8):
        prompt = (
            "Summarize the following bullet points into a cohesive 4–6 sentence paragraph. "
            "Write declarative sentences only (no questions, no instructions). "
            "Include main characters, setting, key events/conflict, and why it matters.\n\n"
            + "\n".join(f"- {t}" for t in texts)
        )
        out = summarize_pegasus(prompt, max_in=max_in, max_out=max_out, num_beams=num_beams)
        bad = out.strip().endswith("?") or out.strip().lower().startswith(("how ", "do you ", "what ")) or len(out.split()) < 35
        if bad:  # 한 번 재시도
            out = summarize_pegasus(prompt, max_in=max_in, max_out=max_out+40, num_beams=num_beams+2)
        return out
    return _fn


In [221]:
# --- meta_summarize 준비 (multi_news → 실패 시 xsum) ---
try:
    meta_summarize = _build_meta_with_multinews()
    print("✅ meta: pegasus-multi_news 사용")
except Exception as e:
    print(f"ℹ️ multi_news 로드 실패, XSum으로 폴백: {e}")
    meta_summarize = _build_meta_with_xsum()
    print("✅ meta: pegasus-xsum(프롬프트 강화) 사용")

ℹ️ multi_news 로드 실패, XSum으로 폴백: Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.
✅ meta: pegasus-xsum(프롬프트 강화) 사용


In [130]:
#RAPTOR retrieval)

In [222]:
import os, json, math
from pathlib import Path
from typing import List, Dict, Tuple
import numpy as np
from tqdm.auto import tqdm

In [223]:
# SBERT 임베딩 우선, 실패시 TF-IDF로 폴백
_USE_SBERT = True
try:
    from sentence_transformers import SentenceTransformer
except Exception:
    _USE_SBERT = False
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [56]:
#3) 파이프라인 검증: 스모크 5개로 트리 만들어보기