In [44]:
from pathlib import Path
import json, hashlib, statistics as stats
import re, unicodedata
import nltk
import tiktoken

In [45]:
# ========= Paths / settings =========
# Input document, output files and chunking parameters used by the cells below.
DOC_NAME = "01 Harry Potter and the Sorcerers Stone.txt"
DOC_PATH = Path("../../11_data") / DOC_NAME             # change only this path if needed
OUT_JSONL = Path("hp_chunks_100tok.jsonl")
OUT_JSONL_DEDUP = Path("hp_chunks_100tok.dedup.jsonl")
MAX_TOKENS = 100
OVERLAP_TOKENS = 0  # (optional) 10-20 recommended; not required by the paper.

In [46]:
# ========= 전처리 =========
# Characters that may split an abbreviation in extracted/OCR text: ordinary
# Unicode whitespace plus invisible separators that ``\s`` does not match.
_GAP = r"[\s\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000\uFEFF\u200B\u200C\u200D\u2060]"

# (first letter, remaining letters, canonical spelling) of each title we repair.
_TITLE_ABBREVIATIONS = (
    ("m", "r", "Mr."),
    ("m", "rs", "Mrs."),
    ("m", "s", "Ms."),
    ("d", "r", "Dr."),
    ("p", "rof", "Prof."),
)

# One pattern per title.  Only the abbreviation itself is matched, so whatever
# surrounds it (spaces, quotes, brackets, paragraph breaks) is never consumed.
# The lookbehind keeps the match from starting inside a word ("Tom R." must
# not become "ToMr.") or right after an apostrophe ("I'm r...").
_TITLE_PATTERNS = tuple(
    (
        re.compile(rf"(?<![\w'’]){head}{_GAP}*{tail}{_GAP}*\.", re.IGNORECASE),
        canonical,
    )
    for head, tail, canonical in _TITLE_ABBREVIATIONS
)


def clean_text(text: str) -> str:
    """Normalize raw book text before sentence splitting.

    Steps, in order:

    1. NFKC-normalize and unify line endings to ``\\n``.
    2. Replace NBSP / zero-width characters with a space and collapse runs of
       spaces and tabs.
    3. Re-join words hyphenated across a line break (``some-\\nthing`` ->
       ``something``).  Only a hyphen sitting directly between two word
       characters is joined, so a dash used as punctuation at the end of a
       line and paragraph breaks are left alone.
    4. Collapse 2+ newlines to exactly two (paragraph break) and turn single
       newlines into spaces.
    5. Repair split or mis-cased title abbreviations (``M r.`` -> ``Mr.``,
       ``M rs.`` -> ``Mrs.``, likewise ``Ms.``, ``Dr.``, ``Prof.``).
    6. Insert the missing space when a title is glued to the next word
       (``Mr.Dursley`` -> ``Mr. Dursley``).
    7. Close the gap in spaced initials (``H .`` -> ``H.``).

    Args:
        text: Raw document text.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace.
    """
    t = unicodedata.normalize("NFKC", text)

    # Unify line endings.
    t = t.replace("\r\n", "\n").replace("\r", "\n")

    # NBSP / zero-width characters -> plain space, then collapse runs.
    t = re.sub(r"[\u00A0\u200B\u200C\u200D]", " ", t)
    t = re.sub(r"[ \t]{2,}", " ", t)

    # De-hyphenate line-wrapped words without touching dashes or paragraphs.
    t = re.sub(r"(?<=\w)-[ \t]*\n[ \t]*(?=\w)", "", t)

    # Single newline -> space; two or more newlines stay a paragraph break.
    t = re.sub(r"\n{2,}", "\n\n", t)
    t = re.sub(r"(?<!\n)\n(?!\n)", " ", t)
    t = re.sub(r"[ \t]{2,}", " ", t).strip()

    # Repair the abbreviations first ...
    for pattern, canonical in _TITLE_PATTERNS:
        t = pattern.sub(canonical, t)

    # ... then add the missing space, so freshly repaired titles get it too.
    t = re.sub(r"\b(Mr|Mrs|Ms|Dr|Prof)\.(?=[A-Za-z])", r"\1. ", t)

    # Initials: "H ." -> "H."
    t = re.sub(r"(?i)(?<![A-Za-z])([A-Z])[\s\u00A0\u200B\u200C\u200D]+\.", r"\1.", t)

    return t

In [47]:
# ========= Tokenizer =========
# Ask tiktoken for the encoding it associates with the "gpt-4" model; if that
# lookup raises for any reason, fall back to requesting "cl100k_base" by name.
# `enc` is the module-level encoder used by the token-counting/chunking code.
try:
    enc = tiktoken.encoding_for_model("gpt-4")
except Exception:
    enc = tiktoken.get_encoding("cl100k_base")

def tok_len(text: str) -> int:
    """Return how many tokens the module-level encoder ``enc`` yields for *text*."""
    token_ids = enc.encode(text)
    return len(token_ids)


In [48]:
# ========= 청크 함수 (문장 중간 절대 자르지 않음) =========
def chunk_by_tokens_sentence_safe(sents, max_tokens=100, overlap_tokens=0):
    """Pack whole sentences into chunks of at most ``max_tokens`` tokens.

    Sentences are appended one by one; when the next sentence would push the
    running total past ``max_tokens`` the current chunk is closed and a new
    one is started.  A sentence is never split: a single sentence longer than
    ``max_tokens`` becomes a chunk of its own (the only case in which a chunk
    is meant to exceed the limit).

    When ``overlap_tokens > 0``, the tail of the chunk just closed is
    prepended to the next one.  The overlap is capped at the room left after
    the sentence that opens the new chunk, so adding overlap does not push the
    chunk over ``max_tokens``.

    Args:
        sents: Iterable of sentence strings.
        max_tokens: Token budget per chunk.
        overlap_tokens: Maximum number of trailing tokens of the previous
            chunk to repeat at the start of the next one (0 disables overlap).

    Returns:
        A list of ``{"text": str, "tokens": int}`` dicts.  ``tokens`` is the
        sum of the per-piece token counts (overlap text + sentences), not a
        re-encoding of the joined text.
    """
    chunks = []
    cur_texts, cur_tokens = [], 0

    def flush():
        # Close the chunk being built (if any) and reset the accumulators.
        nonlocal cur_texts, cur_tokens
        if not cur_texts:
            return
        text = " ".join(cur_texts).strip()
        chunks.append({"text": text, "tokens": cur_tokens})
        cur_texts, cur_tokens = [], 0

    for s in sents:
        n = tok_len(s)

        # 1) An oversized sentence is emitted alone, exceeding the limit.
        if n > max_tokens:
            flush()
            chunks.append({"text": s.strip(), "tokens": n})
            continue

        # 2) The sentence does not fit: close the chunk and start a new one.
        if cur_tokens > 0 and (cur_tokens + n > max_tokens):
            flush()
            # Only as much overlap as still fits next to the new sentence.
            budget = min(overlap_tokens, max_tokens - n)
            if budget > 0 and chunks:
                tail_ids = enc.encode(chunks[-1]["text"])
                ov_text = enc.decode(tail_ids[-budget:]).strip()
                if ov_text:
                    cur_texts = [ov_text]
                    # NOTE(review): re-encoding the stripped overlap text is
                    # assumed to stay within `budget` tokens - confirm for
                    # the tokenizer in use.
                    cur_tokens = tok_len(ov_text)

        # 3) Add the sentence to the current chunk.
        cur_texts.append(s)
        cur_tokens += n

    flush()
    return chunks

In [36]:
# ========= 청크 함수 (문장 중간 절대 자르지 않음) =========
def chunk_by_tokens_sentence_safe(sents, max_tokens=100, overlap_tokens=0):
    """Pack whole sentences into chunks of at most ``max_tokens`` tokens.

    Sentences are appended one by one; when the next sentence would push the
    running total past ``max_tokens`` the current chunk is closed and a new
    one is started.  A sentence is never split: a single sentence longer than
    ``max_tokens`` becomes a chunk of its own (the only case in which a chunk
    is meant to exceed the limit).

    When ``overlap_tokens > 0``, the tail of the chunk just closed is
    prepended to the next one.  The overlap is capped at the room left after
    the sentence that opens the new chunk, so adding overlap does not push the
    chunk over ``max_tokens``.

    Args:
        sents: Iterable of sentence strings.
        max_tokens: Token budget per chunk.
        overlap_tokens: Maximum number of trailing tokens of the previous
            chunk to repeat at the start of the next one (0 disables overlap).

    Returns:
        A list of ``{"text": str, "tokens": int}`` dicts.  ``tokens`` is the
        sum of the per-piece token counts (overlap text + sentences), not a
        re-encoding of the joined text.
    """
    chunks = []
    cur_texts, cur_tokens = [], 0

    def flush():
        # Close the chunk being built (if any) and reset the accumulators.
        nonlocal cur_texts, cur_tokens
        if not cur_texts:
            return
        text = " ".join(cur_texts).strip()
        chunks.append({"text": text, "tokens": cur_tokens})
        cur_texts, cur_tokens = [], 0

    for s in sents:
        n = tok_len(s)

        # 1) An oversized sentence is emitted alone, exceeding the limit.
        if n > max_tokens:
            flush()
            chunks.append({"text": s.strip(), "tokens": n})
            continue

        # 2) The sentence does not fit: close the chunk and start a new one.
        if cur_tokens > 0 and (cur_tokens + n > max_tokens):
            flush()
            # Only as much overlap as still fits next to the new sentence.
            budget = min(overlap_tokens, max_tokens - n)
            if budget > 0 and chunks:
                tail_ids = enc.encode(chunks[-1]["text"])
                ov_text = enc.decode(tail_ids[-budget:]).strip()
                if ov_text:
                    cur_texts = [ov_text]
                    # NOTE(review): re-encoding the stripped overlap text is
                    # assumed to stay within `budget` tokens - confirm for
                    # the tokenizer in use.
                    cur_tokens = tok_len(ov_text)

        # 3) Add the sentence to the current chunk.
        cur_texts.append(s)
        cur_tokens += n

    flush()
    return chunks

In [49]:
# ========= 해시(ID) =========
def cid16(text: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of *text* (UTF-8 encoded)."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    full_hex = hasher.hexdigest()
    return full_hex[:16]

In [50]:
# ========= 메인 =========
def main():
    """Chunk the configured document and write the JSONL outputs.

    Reads ``DOC_PATH``, cleans it with ``clean_text``, splits it into
    sentences with NLTK, packs the sentences into chunks of at most
    ``MAX_TOKENS`` tokens, prints summary statistics and a short preview,
    and writes two files: ``OUT_JSONL`` (all chunks) and ``OUT_JSONL_DEDUP``
    (first occurrence of each distinct ``cid`` only).

    Returns early, after printing a message, when the input file is missing
    or when no chunk could be produced (e.g. an empty document).
    """
    # (A) Make sure the NLTK sentence-tokenizer data is available.
    # Newer NLTK releases load the model from "punkt_tab" instead of "punkt",
    # so both resources are ensured.
    for resource in ("punkt", "punkt_tab"):
        try:
            nltk.data.find(f"tokenizers/{resource}")
        except LookupError:
            nltk.download(resource)

    # (B) Check for and load the input file.
    if not DOC_PATH.exists():
        print(f"[오류] 파일을 찾을 수 없습니다: {DOC_PATH.resolve()}")
        print("→ DOC_PATH 경로를 확인하거나 파일을 해당 위치로 옮겨주세요.")
        return
    text = DOC_PATH.read_text(encoding="utf-8", errors="ignore")

    # (C) Clean the text and split it into sentences.
    text = clean_text(text)
    sents = nltk.sent_tokenize(text)

    # (D) Build the chunks.
    chunks = chunk_by_tokens_sentence_safe(
        sents, max_tokens=MAX_TOKENS, overlap_tokens=OVERLAP_TOKENS
    )
    if not chunks:
        # The statistics below (mean/min/max, percentage) are undefined for
        # an empty list, so stop here instead of crashing.
        print(f"[오류] 생성된 청크가 없습니다(입력이 비어 있음): {DOC_PATH.resolve()}")
        return

    # (E) Summary statistics.
    lens = [c["tokens"] for c in chunks]
    print("=== CHUNKS SUMMARY ===")
    print(f"총 청크 수: {len(chunks)}")
    print(
        "토큰수(평균/중앙/최소/최대): "
        f"{round(stats.mean(lens),2)} / {stats.median(lens)} / {min(lens)} / {max(lens)}"
    )
    # "Near the limit" = top 10% of the budget (90~100 when MAX_TOKENS is 100).
    near_lo = MAX_TOKENS * 9 // 10
    near_count = sum(1 for x in lens if near_lo <= x <= MAX_TOKENS)
    pct_near = round(near_count / len(lens) * 100, 2)
    print(f"{MAX_TOKENS} 토큰 근접({near_lo}~{MAX_TOKENS}) 비율: {pct_near}%")
    topk = sorted(enumerate(lens, 1), key=lambda x: x[1], reverse=True)[:3]
    print("\n가장 긴 청크 Top3 (id, tokens):", topk)

    # (F) Build the records once (1-based id + content hash) and save as JSONL.
    records = [
        {
            "id": i,
            "cid": cid16(ch["text"]),
            "tokens": ch["tokens"],
            "text": ch["text"],
        }
        for i, ch in enumerate(chunks, 1)
    ]
    with OUT_JSONL.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in records)

    # (G) Preview the first three records.
    print("\n--- 미리보기 (앞 3개) ---")
    for o in records[:3]:
        print(f"[{o['id']}] cid={o['cid']} · {o['tokens']} tokens")
        print(o["text"][:200].replace("\n", " ") + "...\n")

    # (H) Also save an exact-duplicate-free copy (keep the first of each cid).
    seen, kept = set(), []
    for o in records:
        if o["cid"] in seen:
            continue
        seen.add(o["cid"])
        kept.append(o)
    with OUT_JSONL_DEDUP.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(o, ensure_ascii=False) + "\n" for o in kept)
    print(f"정확중복 제거본 저장: {OUT_JSONL_DEDUP} (원본 {len(chunks)} → {len(kept)})")

if __name__ == "__main__":
    main()

=== CHUNKS SUMMARY ===
총 청크 수: 1222
토큰수(평균/중앙/최소/최대): 85.51 / 89.0 / 7 / 294
100 토큰 근접(90~100) 비율: 46.56%

가장 긴 청크 Top3 (id, tokens): [(739, 294), (263, 236), (72, 178)]

--- 미리보기 (앞 3개) ---
[1] cid=978646441d4a7b5a · 79 tokens
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or...

[2] cid=c8797b0f0bde0730 · 87 tokens
He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as s...

[3] cid=ca7fd7fb387dd3d3 · 45 tokens
The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn’t think they could bear it if anyone found out about the Pot...

정확중복 제거본 저장: hp_chunks_100tok.dedup.jsonl (원본 1222 → 1222)


In [39]:
# ========= 메인 =========
def main():
    """Chunk the configured document and write the JSONL outputs.

    Reads ``DOC_PATH``, cleans it with ``clean_text``, splits it into
    sentences with NLTK, packs the sentences into chunks of at most
    ``MAX_TOKENS`` tokens, prints summary statistics and a short preview,
    and writes two files: ``OUT_JSONL`` (all chunks) and ``OUT_JSONL_DEDUP``
    (first occurrence of each distinct ``cid`` only).

    Returns early, after printing a message, when the input file is missing
    or when no chunk could be produced (e.g. an empty document).
    """
    # (A) Make sure the NLTK sentence-tokenizer data is available.
    # Newer NLTK releases load the model from "punkt_tab" instead of "punkt",
    # so both resources are ensured.
    for resource in ("punkt", "punkt_tab"):
        try:
            nltk.data.find(f"tokenizers/{resource}")
        except LookupError:
            nltk.download(resource)

    # (B) Check for and load the input file.
    if not DOC_PATH.exists():
        print(f"[오류] 파일을 찾을 수 없습니다: {DOC_PATH.resolve()}")
        print("→ DOC_PATH 경로를 확인하거나 파일을 해당 위치로 옮겨주세요.")
        return
    text = DOC_PATH.read_text(encoding="utf-8", errors="ignore")

    # (C) Clean the text and split it into sentences.
    text = clean_text(text)
    sents = nltk.sent_tokenize(text)

    # (D) Build the chunks.
    chunks = chunk_by_tokens_sentence_safe(
        sents, max_tokens=MAX_TOKENS, overlap_tokens=OVERLAP_TOKENS
    )
    if not chunks:
        # The statistics below (mean/min/max, percentage) are undefined for
        # an empty list, so stop here instead of crashing.
        print(f"[오류] 생성된 청크가 없습니다(입력이 비어 있음): {DOC_PATH.resolve()}")
        return

    # (E) Summary statistics.
    lens = [c["tokens"] for c in chunks]
    print("=== CHUNKS SUMMARY ===")
    print(f"총 청크 수: {len(chunks)}")
    print(
        "토큰수(평균/중앙/최소/최대): "
        f"{round(stats.mean(lens),2)} / {stats.median(lens)} / {min(lens)} / {max(lens)}"
    )
    # "Near the limit" = top 10% of the budget (90~100 when MAX_TOKENS is 100).
    near_lo = MAX_TOKENS * 9 // 10
    near_count = sum(1 for x in lens if near_lo <= x <= MAX_TOKENS)
    pct_near = round(near_count / len(lens) * 100, 2)
    print(f"{MAX_TOKENS} 토큰 근접({near_lo}~{MAX_TOKENS}) 비율: {pct_near}%")
    topk = sorted(enumerate(lens, 1), key=lambda x: x[1], reverse=True)[:3]
    print("\n가장 긴 청크 Top3 (id, tokens):", topk)

    # (F) Build the records once (1-based id + content hash) and save as JSONL.
    records = [
        {
            "id": i,
            "cid": cid16(ch["text"]),
            "tokens": ch["tokens"],
            "text": ch["text"],
        }
        for i, ch in enumerate(chunks, 1)
    ]
    with OUT_JSONL.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in records)

    # (G) Preview the first three records.
    print("\n--- 미리보기 (앞 3개) ---")
    for o in records[:3]:
        print(f"[{o['id']}] cid={o['cid']} · {o['tokens']} tokens")
        print(o["text"][:200].replace("\n", " ") + "...\n")

    # (H) Also save an exact-duplicate-free copy (keep the first of each cid).
    seen, kept = set(), []
    for o in records:
        if o["cid"] in seen:
            continue
        seen.add(o["cid"])
        kept.append(o)
    with OUT_JSONL_DEDUP.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(o, ensure_ascii=False) + "\n" for o in kept)
    print(f"정확중복 제거본 저장: {OUT_JSONL_DEDUP} (원본 {len(chunks)} → {len(kept)})")

if __name__ == "__main__":
    main()

=== CHUNKS SUMMARY ===
총 청크 수: 1219
토큰수(평균/중앙/최소/최대): 85.69 / 89 / 7 / 294
100 토큰 근접(90~100) 비율: 47.17%

가장 긴 청크 Top3 (id, tokens): [(737, 294), (261, 236), (70, 178)]

--- 미리보기 (앞 3개) ---
[1] cid=70ea7b1bb3a13b4d · 100 tokens
Mr.and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or ...

[2] cid=eca2e02b6f307ca6 · 91 tokens
Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Durs...

[3] cid=a3c0d6f207742a58 · 79 tokens
They didn’t think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t met for several years; in fact, Mrs. Dursley pretended she didn’t hav...

정확중복 제거본 저장: hp_chunks_100tok.dedup.jsonl (원본 1219 → 1219)


In [51]:
import re
from pathlib import Path

# Sanity check on the cleaned text: count title abbreviations that are still
# broken after clean_text().  DOC_PATH is reused (instead of repeating the
# literal path) so this check always inspects the file the pipeline reads.
raw = DOC_PATH.read_text(encoding="utf-8", errors="ignore")
txt = clean_text(raw)

# Genuine "M r." (at least one whitespace-like character between M and r)
print("M r. 유형:", len(re.findall(r"(?i)m[\s\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000\uFEFF\u200B\u200C\u200D\u2060]+r\s*\.", txt)))

# Mr./Mrs./Ms./Dr./Prof. immediately followed by a letter (missing space)
print("Mr.뒤 공백 없음:", len(re.findall(r"\b(Mr|Mrs|Ms|Dr|Prof)\.(?=[A-Za-z])", txt)))


M r. 유형: 0
Mr.뒤 공백 없음: 0
