In [54]:
"""
Build chunks with different sizes (e.g., 256/512/1024 tokens [approx words]).
- Compare retrieval performance across chunk sizes.
- Try Fusion Retriever: BM25 + Embedding (late fusion with weighted sum).
"""

'\nBuild chunks with different sizes (e.g., 256/512/1024 tokens [approx words]).\n- Compare retrieval performance across chunk sizes.\n- Try Fusion Retriever: BM25 + Embedding (late fusion with weighted sum).\n'

In [22]:
import os
os.environ["TRANSFORMERS_NO_TF"]="1"
os.environ["TRANSFORMERS_NO_FLAX"]="1"

In [23]:
from pathlib import Path
import shutil, os

In [55]:
# 1) set your file path (absolute path is safest)

In [74]:
#get current working directory
root = Path.cwd()
# set the source file using relative path (go 2 levels up, then into 11_data)
src = Path("../../../11_data/01 Harry Potter and the Sorcerers Stone.txt")
#make a "corpus" folder inside current working directory
corpus_dir = root / "corpus"
corpus_dir.mkdir(exist_ok=True, parents=True)

In [75]:
#check the source file really exists, stop if nots
assert src.exists(), f"File not found: {src}"


# copy the file into the corpus folder (only if not already copied)
dst = corpus_dir / src.name
if not dst.exists():
    shutil.copy2(src, dst)
dst, list(p.name for p in corpus_dir.glob("*"))

(PosixPath('/Users/jessicahong/gitclone/NLP_study/15_llm/Applications/RAG/corpus/01 Harry Potter and the Sorcerers Stone.txt'),
 ['01 Harry Potter and the Sorcerers Stone.txt'])

In [77]:
# 3) simple loader for utf-8 txt files
def load_txt(path: Path) -> str:
    """Read a UTF-8 text file and return as a string"""
    return path.read_text(encoding="utf-8", errors="ignore")

# check again that the file exists (optional, for debugging)
print(src.exists(), src)

True ../../../11_data/01 Harry Potter and the Sorcerers Stone.txt


In [78]:
# 4) actually read the text into memory
text = load_txt(src)

# show number of characters and preview
print("num chars:", len(text))
print("\n=== preview (first 600 chars) ===\n", text[:600])


num chars: 439478

=== preview (first 600 chars) ===
 M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.

Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, sp


In [79]:
# --- Clean up the raw text ---

import re, unicodedata

def normalize_text(t: str) -> str:
    # 1) normalize unicode (e.g., full-width to normal width)
    t = unicodedata.normalize("NFKC", t)

    # 2) replace multiple spaces or tabs with a single space
    t = re.sub(r"[ \t]+", " ", t)

    # 3) if there are 3 or more newlines, shrink to just 2 newlines
    t = re.sub(r"\n{3,}", "\n\n", t)

    # 4) remove extra spaces at start and end
    return t.strip()

# apply the cleaning function
text = normalize_text(text)

In [81]:
# check new length and preview first 400 characters
print("num chars after clean:", len(text))
print(text[:400])

num chars after clean: 439606
M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.

Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he d


In [65]:
#chunking by words
# we split the text into pieces (chunks) of N words
# to keep some context, we overlap chunks by a small amount

In [82]:
from typing import List

# pattern to split into words and punctuation
WORD_SPLIT = re.compile(r"\w+|[^\w\s]")  

def simple_tokenize(t: str) -> List[str]:
    """Turn text into a list of lowercase tokens (words + punctuation)."""
    return WORD_SPLIT.findall(t.lower())


In [83]:
def chunk_by_words(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """
    Split text into chunks by word count.
    - Each chunk has about `chunk_size` words
    - Neighbor chunks overlap by `overlap` words
    """
    # 1) tokenize the text
    words = simple_tokenize(text)
    chunks = []

    # 2) step size = chunk_size - overlap
    step = max(1, chunk_size - overlap)

    # 3) slide through the words list with this step size
    for i in range(0, len(words), step):
        # take words from i to i+chunk_size
        w = words[i:i+chunk_size]
        if not w:
            break

        # 4) join words back into a string (collapse spaces)
        chunk = re.sub(r"\s+", " ", " ".join(w)).strip()
        chunks.append(chunk)

        # 5) stop if we reach the end
        if i + chunk_size >= len(words):
            break

    return chunks

In [84]:
# --- Test the chunking function ---

# choose chunk sizes to compare
CHUNK_SIZES = [256, 512, 1024]
OVERLAP = 50

chunked_map = {}
for sz in CHUNK_SIZES:
    ch = chunk_by_words(text, chunk_size=sz, overlap=OVERLAP)
    chunked_map[sz] = ch
    print(f"chunk_size={sz} -> number of chunks: {len(ch)}")

# preview: show the first 1–2 chunks for each size
for sz in CHUNK_SIZES:
    print(f"\n=== Sample from chunk_size={sz} ===")
    print("Chunk 0:", chunked_map[sz][0][:200], "...")
    if len(chunked_map[sz]) > 1:
        print("Chunk 1:", chunked_map[sz][1][:200], "...")


chunk_size=256 -> number of chunks: 502
chunk_size=512 -> number of chunks: 224
chunk_size=1024 -> number of chunks: 107

=== Sample from chunk_size=256 ===
Chunk 0: m r . and mrs . dursley , of number four , privet drive , were proud to say that they were perfectly normal , thank you very much . they were the last people you ’ d expect to be involved in anything  ...
Chunk 1: sister , but they hadn ’ t met for several years ; in fact , mrs . dursley pretended she didn ’ t have a sister , because her sister and her good - for - nothing husband were as undursleyish as it was ...

=== Sample from chunk_size=512 ===
Chunk 0: m r . and mrs . dursley , of number four , privet drive , were proud to say that they were perfectly normal , thank you very much . they were the last people you ’ d expect to be involved in anything  ...
Chunk 1: . dursley as he left the house . he got into his car and backed out of number four ’ s drive . it was on the corner of the street that he noticed the first 

In [85]:
import pandas as pd
from pathlib import Path

# 1) make an output folder (rag_lab_outputs)
out_dir = Path("rag_lab_outputs")
out_dir.mkdir(exist_ok=True, parents=True)

# 2) collect chunk info into a list of dicts
rows = []
for sz, chunks in chunked_map.items():
    for i, ch in enumerate(chunks):
        rows.append({
            "doc_name": src.name,                     # which file
            "chunk_size": sz,                         # size setting
            "chunk_idx": i,                           # index number
            "chunk_preview": ch[:160].replace("\n"," "), # first 160 chars
            "chunk_len_words": len(simple_tokenize(ch)), # number of words
        })

# 3) convert list into a pandas DataFrame (table)
df = pd.DataFrame(rows)

# 4) save DataFrame to CSV file
csv_path = out_dir / "chunking_summary.csv"
df.to_csv(csv_path, index=False, encoding="utf-8")

print("✅ saved to:", csv_path)

# 5) show the first 8 rows as a quick preview
df.head(8)


✅ saved to: rag_lab_outputs/chunking_summary.csv


Unnamed: 0,doc_name,chunk_size,chunk_idx,chunk_preview,chunk_len_words
0,01 Harry Potter and the Sorcerers Stone.txt,256,0,"m r . and mrs . dursley , of number four , pri...",256
1,01 Harry Potter and the Sorcerers Stone.txt,256,1,"sister , but they hadn ’ t met for several yea...",256
2,01 Harry Potter and the Sorcerers Stone.txt,256,2,", mr . dursley picked up his briefcase , pecke...",256
3,01 Harry Potter and the Sorcerers Stone.txt,256,3,"said privet drive — no , looking at the sign ;...",256
4,01 Harry Potter and the Sorcerers Stone.txt,256,4,- green cloak ! the nerve of him ! but then it...,256
5,01 Harry Potter and the Sorcerers Stone.txt,256,5,and walk across the road to buy himself a bun ...,256
6,01 Harry Potter and the Sorcerers Stone.txt,256,6,. he put the receiver back down and stroked hi...,256
7,01 Harry Potter and the Sorcerers Stone.txt,256,7,that the man was wearing a violet cloak . he d...,256


In [87]:
# --- Simple statistics by chunk size ---

# 1) count how many chunks were created for each chunk_size
cnt = (
    df.groupby("chunk_size")["chunk_idx"]
      .count()
      .rename("num_chunks")
      .reset_index()
)

# 2) compute the average number of words in chunks (rounded to 1 decimal)
avg = (
    df.groupby("chunk_size")["chunk_len_words"]
      .mean()
      .round(1)
      .rename("avg_words")
      .reset_index()
)

# 3) print the tables
print("=== number of chunks ===\n", cnt, "\n")
print("=== average words per chunk ===\n", avg)


=== number of chunks ===
    chunk_size  num_chunks
0         256         502
1         512         224
2        1024         107 

=== average words per chunk ===
    chunk_size  avg_words
0         256      255.9
1         512      511.4
2        1024     1015.9


In [34]:
#[R1] BM25 인덱스 만들기

In [35]:
from rank_bm25 import BM25Okapi

bm25_indexes = {}
for sz, chunks in chunked_map.items():
    toks = [simple_tokenize(c) for c in chunks]
    bm25_indexes[sz] = BM25Okapi(toks) if toks else None
print("✅ BM25 built for:", list(bm25_indexes.keys()))


✅ BM25 built for: [256, 512]


In [37]:
#[R2] 임베딩 인덱스 만들기

In [36]:
import numpy as np
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer

EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # 빠르고 안정적
embedder = SentenceTransformer(EMB_MODEL_NAME)

emb_indexes = {}
for sz, chunks in chunked_map.items():
    if not chunks:
        emb_indexes[sz] = np.zeros((0, 384))
        continue
    E = embedder.encode(chunks, batch_size=64, convert_to_numpy=True, show_progress_bar=True)
    emb_indexes[sz] = normalize(E)
for sz, E in emb_indexes.items():
    print(f"{sz}: emb shape {E.shape}")


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

256: emb shape (502, 384)
512: emb shape (224, 384)


In [38]:
#[R3] 검색 함수 (BM25 / Embedding / Fusion)

In [39]:
import numpy as np

def bm25_search(query: str, sz: int, k: int = 5):
    bm = bm25_indexes[sz]
    if bm is None: return []
    scores = bm.get_scores(simple_tokenize(query))
    idx = np.argsort(scores)[::-1][:k]
    return [(int(i), float(scores[i])) for i in idx]

def emb_search(query: str, sz: int, k: int = 5):
    E = emb_indexes[sz]
    if E.shape[0] == 0: return []
    q = embedder.encode([query], convert_to_numpy=True)
    q = normalize(q)
    sims = (E @ q.T).ravel()
    idx = np.argsort(sims)[::-1][:k]
    return [(int(i), float(sims[i])) for i in idx]

def fusion_search(query: str, sz: int, k: int = 5, alpha: float = 0.6):
    """late fusion: score = alpha*BM25_norm + (1-alpha)*Emb_norm"""
    # 전체 점수 얻기
    n = len(chunked_map[sz])
    if n == 0: return []
    # BM25 전체
    bm_scores = np.zeros(n)
    bm_all = bm25_search(query, sz, k=n)
    for i, s in bm_all: bm_scores[i] = s
    # Emb 전체
    em_scores = np.zeros(n)
    em_all = emb_search(query, sz, k=n)
    for i, s in em_all: em_scores[i] = s
    # min-max 정규화
    def minmax(a):
        if a.size == 0: return a
        mn, mx = a.min(), a.max()
        return np.zeros_like(a) if mx - mn < 1e-9 else (a - mn) / (mx - mn)
    fused = alpha*minmax(bm_scores) + (1-alpha)*minmax(em_scores)
    idx = np.argsort(fused)[::-1][:k]
    return [(int(i), float(fused[i])) for i in idx]


In [40]:
#[R4] 데모 쿼리 실행 (HP 예시)

In [41]:
DEMO_QUERIES = [
    "Who is the headmaster of Hogwarts?",
    "What is the Sorcerer's Stone?",
    "Describe the relationship between Harry and Hagrid."
]

TOPK = 5
ALPHA = 0.6

results = []
for q in DEMO_QUERIES:
    for sz in chunked_map.keys():
        # 각 방법 결과
        bm = bm25_search(q, sz, k=TOPK)
        em = emb_search(q, sz, k=TOPK)
        fu = fusion_search(q, sz, k=TOPK, alpha=ALPHA)
        # 행 적재
        for method, arr in [("bm25", bm), ("embed", em), ("fusion", fu)]:
            for rank, (i, score) in enumerate(arr, start=1):
                ch = chunked_map[sz][i]
                results.append({
                    "query": q,
                    "method": method,
                    "chunk_size": sz,
                    "rank": rank,
                    "score": score,
                    "chunk_idx": i,
                    "chunk_preview": ch[:180].replace("\n", " ")
                })

import pandas as pd
res_df = pd.DataFrame(results)
print("Rows:", len(res_df))
res_df.head(10)


Rows: 90


Unnamed: 0,query,method,chunk_size,rank,score,chunk_idx,chunk_preview
0,Who is the headmaster of Hogwarts?,bm25,256,1,13.64968,166,", inside them , you know , to collect — famous..."
1,Who is the headmaster of Hogwarts?,bm25,256,2,13.024994,82,’ you ’ ve kept it from him all these years ? ...
2,Who is the headmaster of Hogwarts?,bm25,256,3,12.414052,83,"floor , hut - on - the - rock , the sea . he p..."
3,Who is the headmaster of Hogwarts?,bm25,256,4,11.601054,96,i ’ ve read those letters and he needs all sor...
4,Who is the headmaster of Hogwarts?,bm25,256,5,10.229471,79,", don ’ worry . ” he passed the sausages to ha..."
5,Who is the headmaster of Hogwarts?,embed,256,1,0.476788,96,i ’ ve read those letters and he needs all sor...
6,Who is the headmaster of Hogwarts?,embed,256,2,0.451792,431,but a voice suddenly rang across the hall . “ ...
7,Who is the headmaster of Hogwarts?,embed,256,3,0.441645,83,"floor , hut - on - the - rock , the sea . he p..."
8,Who is the headmaster of Hogwarts?,embed,256,4,0.440015,197,"have been chosen and not put in slytherin , he..."
9,Who is the headmaster of Hogwarts?,embed,256,5,0.438926,126,"ve heard of him . he ’ s a sort of servant , i..."


In [42]:
#[R5] CSV 저장 (깃허브 커밋용)

In [46]:
from pathlib import Path
out_dir = Path("rag_lab_outputs")
out_dir.mkdir(exist_ok=True, parents=True)

res_path = out_dir / "retrieval_results.csv"
res_df.to_csv(res_path, index=False, encoding="utf-8")
print("✅ Saved →", res_path, "| rows:", len(res_df))



✅ Saved → rag_lab_outputs/retrieval_results.csv | rows: 90


In [48]:
#2) Top-1 요약 테이블 (쿼리×방법×청크크기별 최고 1개만)

In [49]:
top1 = (res_df.sort_values(["query","method","chunk_size","rank"])
              .groupby(["query","method","chunk_size"])
              .first()
              .reset_index()[["query","method","chunk_size","chunk_idx","score","chunk_preview"]])
print("Top-1 rows:", len(top1))
top1.head(20)


Top-1 rows: 18


Unnamed: 0,query,method,chunk_size,chunk_idx,score,chunk_preview
0,Describe the relationship between Harry and Ha...,bm25,256,120,14.327207,"even deeper , passing an underground lake wher..."
1,Describe the relationship between Harry and Ha...,bm25,512,115,12.736408,"open — they piled through it , shut it quickly..."
2,Describe the relationship between Harry and Ha...,embed,256,99,0.611663,hp 1 - harry potter and the sorcerer ' s stone...
3,Describe the relationship between Harry and Ha...,embed,512,36,0.612445,"hagrid simply waved his hand and said , “ abou..."
4,Describe the relationship between Harry and Ha...,fusion,256,120,0.945285,"even deeper , passing an underground lake wher..."
5,Describe the relationship between Harry and Ha...,fusion,512,138,0.853807,"snape . “ i know a jinx when i see one , hagri..."
6,What is the Sorcerer's Stone?,bm25,256,365,18.250279,told them what he ’ d seen and heard . “ so we...
7,What is the Sorcerer's Stone?,bm25,512,162,14.845405,beast of hagrid ’ s yet ? ” “ b - b - but seve...
8,What is the Sorcerer's Stone?,embed,256,352,0.687569,was looking for . “ i knew it ! i knew it ! ” ...
9,What is the Sorcerer's Stone?,embed,512,157,0.661211,” said ron grumpily . hermione ignored him . “...


In [51]:
#3) 빠른 비교: 방법별 평균 랭크 & 점수

In [50]:
# 각 (query, method, chunk_size)에서 rank의 평균이 낮을수록 좋음 (1이 최고)
rank_stats = (res_df.groupby(["query","method","chunk_size"])["rank"]
                    .mean().rename("avg_rank").reset_index())

score_stats = (res_df.groupby(["query","method","chunk_size"])["score"]
                     .mean().rename("avg_score").reset_index())

print("=== 평균 랭크 ===")
display(rank_stats.sort_values(["query","method","chunk_size"]))

print("\n=== 평균 점수 ===")
display(score_stats.sort_values(["query","method","chunk_size"], ascending=[True,True,True]))


=== 평균 랭크 ===


Unnamed: 0,query,method,chunk_size,avg_rank
0,Describe the relationship between Harry and Ha...,bm25,256,3.0
1,Describe the relationship between Harry and Ha...,bm25,512,3.0
2,Describe the relationship between Harry and Ha...,embed,256,3.0
3,Describe the relationship between Harry and Ha...,embed,512,3.0
4,Describe the relationship between Harry and Ha...,fusion,256,3.0
5,Describe the relationship between Harry and Ha...,fusion,512,3.0
6,What is the Sorcerer's Stone?,bm25,256,3.0
7,What is the Sorcerer's Stone?,bm25,512,3.0
8,What is the Sorcerer's Stone?,embed,256,3.0
9,What is the Sorcerer's Stone?,embed,512,3.0



=== 평균 점수 ===


Unnamed: 0,query,method,chunk_size,avg_score
0,Describe the relationship between Harry and Ha...,bm25,256,14.057053
1,Describe the relationship between Harry and Ha...,bm25,512,12.284921
2,Describe the relationship between Harry and Ha...,embed,256,0.59653
3,Describe the relationship between Harry and Ha...,embed,512,0.58955
4,Describe the relationship between Harry and Ha...,fusion,256,0.873418
5,Describe the relationship between Harry and Ha...,fusion,512,0.788779
6,What is the Sorcerer's Stone?,bm25,256,16.641119
7,What is the Sorcerer's Stone?,bm25,512,13.959215
8,What is the Sorcerer's Stone?,embed,256,0.594142
9,What is the Sorcerer's Stone?,embed,512,0.522457


In [52]:
#선택: Alpha 바꿔 재실험(퓨전 효과 확인)

In [53]:
for ALPHA in [0.3, 0.6]:
    fu_rows = []
    for q in DEMO_QUERIES:
        for sz in chunked_map.keys():
            fu = fusion_search(q, sz, k=TOPK, alpha=ALPHA)
            for rank, (i, score) in enumerate(fu, start=1):
                fu_rows.append({"ALPHA": ALPHA, "query": q, "chunk_size": sz,
                                "rank": rank, "score": score, "chunk_idx": i})
    fu_df = pd.DataFrame(fu_rows)
    print(f"\n=== Fusion ALPHA={ALPHA} 요약 ===")
    display(fu_df.groupby(["query","chunk_size"])["rank"].mean().rename("avg_rank").reset_index())



=== Fusion ALPHA=0.3 요약 ===


Unnamed: 0,query,chunk_size,avg_rank
0,Describe the relationship between Harry and Ha...,256,3.0
1,Describe the relationship between Harry and Ha...,512,3.0
2,What is the Sorcerer's Stone?,256,3.0
3,What is the Sorcerer's Stone?,512,3.0
4,Who is the headmaster of Hogwarts?,256,3.0
5,Who is the headmaster of Hogwarts?,512,3.0



=== Fusion ALPHA=0.6 요약 ===


Unnamed: 0,query,chunk_size,avg_rank
0,Describe the relationship between Harry and Ha...,256,3.0
1,Describe the relationship between Harry and Ha...,512,3.0
2,What is the Sorcerer's Stone?,256,3.0
3,What is the Sorcerer's Stone?,512,3.0
4,Who is the headmaster of Hogwarts?,256,3.0
5,Who is the headmaster of Hogwarts?,512,3.0
