
# K-IFRS RAG — HF v2 (From Scratch Builder)

**원본 v2 로직**을 그대로 사용하면서, **임베딩을 처음부터 생성**하도록 구성한 버전입니다.  
- `REBUILD=True`로 두면, 캐시가 있어도 **항상 다시 계산**합니다.  
- 임베딩: `intfloat/multilingual-e5-large` (무료, 다국어)  
- 재랭커(옵션): `jinaai/jina-reranker-v2-base-multilingual` (`trust_remote_code=True`)  
- 기능: **Alias 확장 + Two-Track 라우팅 + Vector-only Fallback + BM25+벡터 하이브리드**


In [None]:

# 0) 설치 (최초 1회 필요)
!pip install -qU sentence-transformers transformers accelerate


In [None]:

from pathlib import Path
import json, os, math, numpy as np
from collections import defaultdict, Counter
from typing import List

# ===== Paths / settings =====
# NOTE(review): '.../raws/raws_K-ifrs' looks like a placeholder; edit it to the
# real data directory before running, because CACHE_DIR.mkdir below creates
# whatever path is configured here.
BASE = Path('.../raws/raws_K-ifrs')  # modify if needed
JSON_PATH = BASE / 'kifrs_cleaned_final.json'  # standards JSON
CACHE_DIR = BASE / 'hf_cache'
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Set to True to always rebuild the embeddings from scratch (ignores any cache)
REBUILD = True

# Echo the effective configuration so a wrong path is noticed early
print('JSON exists:', JSON_PATH.exists(), JSON_PATH)
print('Cache dir:', CACHE_DIR)
print('REBUILD =', REBUILD)


In [None]:

# ===== Load JSON =====
# Reads the whole standards file into memory; raises if JSON_PATH is missing.
with JSON_PATH.open(encoding='utf-8') as f:
    data = json.load(f)

# Each entry of 'documents' is read below via its 'standard_no', 'title',
# 'source_file' and 'paragraphs' keys; a missing 'documents' key yields [].
docs = data.get('documents', [])
total_paras = sum(len(d.get('paragraphs', [])) for d in docs)
print('Loaded standards:', len(docs), '| Total paragraphs:', total_paras)

# ===== 토큰화 & BM25 =====
import re

# Runs of Hangul syllables, ASCII letters and digits form one token.
HAN_ENG_NUM = re.compile(r'[가-힣A-Za-z0-9]+', re.UNICODE)
# Korean function words dropped before BM25 indexing / querying.
STOP = {'그리고', '등', '및', '또는', '그러나', '이는', '그', '이', '저', '것', '수', '등의'}


def normalize(t: str) -> str:
    """Strip surrounding whitespace and lower-case the text."""
    stripped = t.strip()
    return stripped.lower()


def tokenize(t: str):
    """Split normalized text into Hangul / alphanumeric tokens."""
    cleaned = normalize(t)
    return [match.group(0) for match in HAN_ENG_NUM.finditer(cleaned)]


def filter_tokens(tokens: List[str]) -> List[str]:
    """Drop stop words and single-character tokens, keeping the original order."""
    kept = []
    for tok in tokens:
        if tok in STOP or len(tok) <= 1:
            continue
        kept.append(tok)
    return kept

class BM25:
    """Okapi-BM25 scorer over a pre-tokenized corpus.

    Args:
        docs_tokens: sequence of token lists, one per document.
        k1: term-frequency saturation parameter.
        b: document-length normalization parameter.
    """

    def __init__(self, docs_tokens, k1=1.5, b=0.75):
        self.docs_tokens = docs_tokens
        self.N = len(docs_tokens)
        self.k1 = k1
        self.b = b
        total_len = sum(len(tokens) for tokens in docs_tokens)
        # max(1, N) keeps the average defined for an empty corpus.
        self.avgdl = total_len / max(1, self.N)
        # Per-document term frequencies.
        self.tf = [Counter(tokens) for tokens in docs_tokens]
        # Document frequency: each term counts once per document containing it.
        self.df = Counter()
        for counts in self.tf:
            self.df.update(counts.keys())
        self.idf = {
            term: math.log(1 + (self.N - n_docs + 0.5) / (n_docs + 0.5))
            for term, n_docs in self.df.items()
        }

    def _score_doc(self, q_tokens, i):
        """Return the BM25 score of document ``i`` for the query tokens."""
        counts = self.tf[i]
        doc_len = len(self.docs_tokens[i])
        total = 0.0
        for term in q_tokens:
            weight = self.idf.get(term)
            if weight is None:
                continue
            freq = counts.get(term, 0)
            if freq == 0:
                continue
            # Length normalization is evaluated only for matching terms, so an
            # all-empty corpus (avgdl == 0) never reaches this division.
            denom = freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
            total += weight * (freq * (self.k1 + 1)) / denom
        return total

    def _rank(self, q_tokens, candidates, topk):
        """Score ``candidates`` in iteration order and return the best ``topk`` non-zero hits."""
        scored = ((i, self._score_doc(q_tokens, i)) for i in candidates)
        hits = [pair for pair in scored if pair[1] != 0.0]
        hits.sort(key=lambda pair: pair[1], reverse=True)
        return hits[:topk]

    def search(self, q_tokens, topk=50):
        """Rank every document; returns ``[(doc_index, score), ...]`` best first."""
        return self._rank(q_tokens, range(self.N), topk)

    def search_subset(self, q_tokens, allowed: set, topk=50):
        """Rank only the documents whose indices are in ``allowed``."""
        return self._rank(q_tokens, allowed, topk)

# ===== Routing index =====
# title_keys[i]          : (standard_no, title, source_file) tuple identifying title i
# title_texts[i]         : text representing title i for BM25 and for embedding
# title_to_para_indices  : title key -> list of indices into `paragraphs`
# paragraphs             : flat list of paragraph records across all documents
title_keys = []
title_texts = []
title_to_para_indices = defaultdict(list)
paragraphs = []

for d in docs:
    std = d.get('standard_no')
    ttl = d.get('title') or ''
    src = d.get('source_file') or ''
    key = (std, ttl, src)
    if key not in title_keys:  # register each distinct key once (linear scan of the list)
        title_keys.append(key)
        # Title representation: title, source file, and up to 1000 characters
        # of text joined from the document's first three paragraphs.
        head = (d.get('paragraphs', [{}])[:3])
        head_txt = " ".join([(p.get('text') or '') for p in head])
        title_texts.append(f"{ttl}\n{src}\n{head_txt[:1000]}")
    # Index in `paragraphs` that this document's first paragraph will receive.
    base_idx = len(paragraphs)
    for p in d.get('paragraphs', []):
        paragraphs.append({
            "std": std, "title": ttl, "source": src,
            "page": p.get('page'), "para_id": p.get('para_id'), "text": p.get('text') or ''
        })
        title_to_para_indices[key].append(base_idx); base_idx += 1

# BM25 index over the title representations (used for routing).
title_tokens = [filter_tokens(tokenize(t)) for t in title_texts]
bm25_title = BM25(title_tokens)

# BM25 index over every paragraph; positions match `paragraphs`.
para_texts = [p['text'] for p in paragraphs]
para_tokens = [filter_tokens(tokenize(t)) for t in para_texts]
bm25_para = BM25(para_tokens)

print('Titles:', len(title_keys), '| Paragraphs:', len(paragraphs))


In [None]:

# ===== Embedding model =====
from sentence_transformers import SentenceTransformer, CrossEncoder

EMBED_MODEL_NAME = "intfloat/multilingual-e5-large"
RERANK_MODEL_NAME = "jinaai/jina-reranker-v2-base-multilingual"  # optional; loaded lazily by ensure_reranker()

# Loaded eagerly at cell execution time.
embed_model = SentenceTransformer(EMBED_MODEL_NAME)

def embed_queries(texts: List[str]) -> np.ndarray:
    """Embed search queries with ``embed_model``.

    Each text is prefixed with ``"query: "`` before encoding; embeddings are
    requested normalized and returned as a NumPy array, one row per text.
    """
    prefixed = [f"query: {text}" for text in texts]
    encode_opts = {
        "normalize_embeddings": True,
        "convert_to_numpy": True,
        "batch_size": 64,
    }
    return embed_model.encode(prefixed, **encode_opts)

def embed_passages(texts: List[str]) -> np.ndarray:
    """Embed corpus passages with ``embed_model``.

    Each text is prefixed with ``"passage: "`` before encoding; embeddings are
    requested normalized and returned as a NumPy array, one row per text.
    """
    prefixed = [f"passage: {text}" for text in texts]
    encode_opts = {
        "normalize_embeddings": True,
        "convert_to_numpy": True,
        "batch_size": 64,
    }
    return embed_model.encode(prefixed, **encode_opts)


In [None]:

# ===== Cache paths & (re)build =====
# File names embed the model name ('/' replaced by '_') so caches produced by
# different embedding models do not collide.
TITLE_EMB_PATH = (CACHE_DIR / f"title_emb_{EMBED_MODEL_NAME.replace('/','_')}.npy")
PARA_EMB_PATH  = (CACHE_DIR / f"para_emb_{EMBED_MODEL_NAME.replace('/','_')}.npy")

def build_or_load_embeddings(rebuild: bool = False):
    """Return ``(title_vecs, para_vecs)``, encoding or loading from the cache.

    Args:
        rebuild: when True, always re-encode and overwrite the cache files.

    Returns:
        Tuple of two arrays: embeddings of ``title_texts`` and of ``para_texts``.

    A cached file is reused only if its row count matches the current number
    of texts. Otherwise it is treated as stale (for example, the source JSON
    changed since the cache was written) and re-encoded, because mismatched
    rows would make paragraph indices point at the wrong vectors.
    """
    def _load_or_encode(path, texts, label):
        # Try the cache first unless a rebuild was requested.
        if not rebuild and path.exists():
            cached = np.load(path)
            if cached.ndim >= 1 and cached.shape[0] == len(texts):
                return cached
            print(f"Cached {label} embeddings do not match the corpus "
                  f"({cached.shape[0] if cached.ndim else 0} rows vs {len(texts)} texts); re-encoding.")
        print(f"Encoding {label} from scratch...")
        vecs = embed_passages(texts)
        np.save(path, vecs)
        return vecs

    title_vecs = _load_or_encode(TITLE_EMB_PATH, title_texts, "titles")
    para_vecs = _load_or_encode(PARA_EMB_PATH, para_texts, "paragraphs")
    return title_vecs, para_vecs

# Build (or load) both embedding matrices; REBUILD=True forces re-encoding.
title_vecs, para_vecs = build_or_load_embeddings(REBUILD)
title_vecs.shape, para_vecs.shape  # bare expression: shown as the cell output in a notebook


In [None]:

# ===== 유사도 =====
def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors, or 0.0 if either has zero norm."""
    scale = float(np.linalg.norm(a) * np.linalg.norm(b))
    return float(np.dot(a, b) / scale) if scale != 0 else 0.0

def topk_by_cosine(q_vec: np.ndarray, mat: np.ndarray, k: int = 50):
    """Return the ``k`` rows of ``mat`` most similar to ``q_vec``.

    Similarity is the plain dot product ``mat @ q_vec``, which equals cosine
    similarity only when both sides are already L2-normalized (the caller's
    responsibility).

    Args:
        q_vec: query vector.
        mat: matrix with one candidate vector per row.
        k: maximum number of hits to return.

    Returns:
        List of ``(row_index, similarity)`` sorted by similarity, best first.
        Empty when ``mat`` has no rows or ``k <= 0``.
    """
    sims = np.asarray(mat @ q_vec)
    n = sims.shape[0] if sims.ndim else 0
    # Guard: argpartition raises on an empty array, and a non-positive k
    # would otherwise slice incorrectly.
    if k <= 0 or n == 0:
        return []
    if k < n:
        # After partitioning at k-1, the first k positions hold the k largest sims.
        idxs = np.argpartition(-sims, kth=k - 1)[:k]
    else:
        idxs = np.arange(n)
    idxs = idxs[np.argsort(-sims[idxs])]
    return [(int(i), float(sims[i])) for i in idxs]


In [None]:

# ===== Alias =====
# Query term -> related terms appended to the query when the term occurs in it.
ALIAS = {
    "개발비": ["개발활동", "무형자산"],
    "감가상각": ["정액법", "체감잔액법", "생산량비례법"],
    "매출": ["수익"],
    "선수수익": ["계약부채"],
    "미수수익": ["계약자산"],
    "비연결재무제표": ["별도재무제표"],
}


def expand_query(q: str) -> str:
    """Append the aliases of every ALIAS key found as a substring of ``q``.

    Added terms are de-duplicated, sorted, and joined to the query with single
    spaces; the query is returned unchanged when no key matches.
    """
    extra = {syn for key, syns in ALIAS.items() if key in q for syn in syns}
    if not extra:
        return q
    return " ".join([q, *sorted(extra)])


In [None]:

# ===== 라우팅 & 하이브리드 검색 =====
def route_titles(query: str, topn: int = 5, w_bm25=0.4, w_vec=0.6, debug=False):
    """Pick the ``topn`` titles most relevant to ``query`` by BM25 + vector fusion.

    The alias-expanded query is scored against the title index with BM25 and
    with embedding similarity; candidates from either list are merged using
    ``w_bm25 * bm25 + w_vec * vector`` (a missing component counts as 0.0).

    Returns:
        List of ``(title_index, fused_score, bm25_score, vector_score)``,
        best first.
    """
    expanded = expand_query(query)
    pool = max(topn*6, 30)  # candidate pool size per retriever
    lexical = bm25_title.search(filter_tokens(tokenize(expanded)), topk=pool)

    q_vec = embed_queries([expanded])[0]
    dense = topk_by_cosine(q_vec, title_vecs, k=pool)
    dense_lookup = dict(dense)

    # BM25 hits first (with their vector score if any), then vector-only hits.
    fused = []
    for i, s_bm in lexical:
        s_vec = dense_lookup.get(i, 0.0)
        fused.append((i, w_bm25*s_bm + w_vec*s_vec, s_bm, s_vec))
    lexical_ids = {i for i, _ in lexical}
    fused.extend((i, w_vec*s_vec, 0.0, s_vec)
                 for i, s_vec in dense if i not in lexical_ids)

    fused.sort(key=lambda row: row[1], reverse=True)
    routed = fused[:topn]
    if debug:
        print("[Routing] top candidates:")
        for i, s, sb, sv in routed:
            std, ttl, src = title_keys[i]
            print(f" - {std} | {ttl} | score={s:.3f} (bm25={sb:.3f}, vec={sv:.3f})")
    return routed

def ensure_reranker():
    """Return the module-level ``reranker``, loading it on first use.

    The CrossEncoder is created from RERANK_MODEL_NAME with
    ``trust_remote_code=True`` and stored in the global ``reranker``.
    """
    global reranker
    # .get() covers both "never defined" and "defined but None".
    if globals().get('reranker') is None:
        print("Loading reranker:", RERANK_MODEL_NAME)
        from sentence_transformers import CrossEncoder
        reranker = CrossEncoder(RERANK_MODEL_NAME, trust_remote_code=True)
    return reranker

def hybrid_search_within_titles(query: str, title_idxs: List[int],
                                topk_bm25=150, topk_final=12,
                                w_bm25=0.4, w_vec=0.6,
                                use_reranker: bool = True, rerank_top: int = 20,
                                debug=False):
    """Hybrid (BM25 + vector) paragraph search restricted to the routed titles.

    Args:
        query: user query; alias expansion is applied internally.
        title_idxs: indices into ``title_keys`` whose paragraphs may be returned.
        topk_bm25: number of BM25 candidates taken from the allowed paragraphs.
        topk_final: number of results kept after score fusion.
        w_bm25, w_vec: fusion weights for the BM25 and vector scores.
        use_reranker: re-order the results with the cross-encoder when True.
        rerank_top: number of leading results passed to the reranker.
        debug: print diagnostic messages.

    Returns:
        List of paragraph dicts extended with ``score``, ``bm25`` and
        ``vector`` keys, best first. If the routed titles own no paragraphs,
        the result of ``global_hybrid_search`` is returned instead.
    """
    allowed = set()
    for ti in title_idxs:
        allowed.update(title_to_para_indices[title_keys[ti]])
    if debug:
        print(f"[Within] allowed paragraphs: {len(allowed)}")

    if not allowed:
        if debug: print("[Within] allowed empty -> fallback to global search")
        return global_hybrid_search(query, topk_bm25=200, topk_final=15,
                                    w_bm25=w_bm25, w_vec=w_vec,
                                    use_reranker=use_reranker, rerank_top=rerank_top,
                                    debug=debug)

    q_exp = expand_query(query)
    qtok = filter_tokens(tokenize(q_exp))
    bm_hits = bm25_para.search_subset(qtok, allowed, topk=topk_bm25)

    if not bm_hits:
        # No BM25 hit: rank the allowed paragraphs by vector similarity alone.
        if debug: print("[Within] BM25 hits=0 -> vector-only ranking")
        qv = embed_queries([q_exp])[0]
        idxs = sorted(allowed)  # fixed order -> deterministic tie-breaking
        sims = np.dot(para_vecs[idxs], qv)
        order = np.argsort(-sims)[:topk_final]
        # `sims` is positional over `idxs`: look the score up by position and
        # map the position back to the global paragraph index via idxs[pos].
        results = [{**paragraphs[idxs[pos]],
                    "score": float(sims[pos]), "bm25": 0.0, "vector": float(sims[pos])}
                   for pos in order]
    else:
        qv = embed_queries([q_exp])[0]
        combined = []
        for i, s_bm in bm_hits:
            s_vec = float(np.dot(para_vecs[i], qv))
            combined.append((i, w_bm25*s_bm + w_vec*s_vec, s_bm, s_vec))
        combined.sort(key=lambda x: x[1], reverse=True)
        combined = combined[:topk_final]
        results = [{**paragraphs[i], "score": s, "bm25": s_bm, "vector": s_vec}
                   for (i, s, s_bm, s_vec) in combined]

    if use_reranker and len(results) > 1:
        model = ensure_reranker()
        # NOTE(review): only the first `rerank_top` results are kept after
        # reranking; anything beyond that is dropped. This matches
        # global_hybrid_search - confirm that the truncation is intended.
        head = results[:rerank_top]
        scores = model.predict([(q_exp, r["text"]) for r in head])
        order = np.argsort(-scores)
        results = [head[int(i)] for i in order]
    return results

def global_hybrid_search(query: str, topk_bm25=200, topk_final=15,
                         w_bm25=0.4, w_vec=0.6, use_reranker=True, rerank_top=20, debug=False):
    """Hybrid (BM25 + vector) paragraph search over the whole corpus.

    BM25 candidates for the alias-expanded query are re-scored as
    ``w_bm25 * bm25 + w_vec * vector``; when BM25 finds nothing, all
    paragraphs are ranked by vector similarity alone. The leading
    ``rerank_top`` results are optionally re-ordered by the cross-encoder
    (results beyond ``rerank_top`` are not returned in that case).

    Returns:
        List of paragraph dicts extended with ``score``, ``bm25`` and
        ``vector`` keys, best first.
    """
    expanded = expand_query(query)
    lexical = bm25_para.search(filter_tokens(tokenize(expanded)), topk=topk_bm25)

    if not lexical and debug:
        print("[Global] BM25 hits=0 -> vector-only ranking")
    q_vec = embed_queries([expanded])[0]

    if lexical:
        # Fuse BM25 and vector scores for the BM25 candidates only.
        results = []
        for i, s_bm in lexical:
            s_vec = float(np.dot(para_vecs[i], q_vec))
            results.append({**paragraphs[i],
                            "score": w_bm25*s_bm + w_vec*s_vec,
                            "bm25": s_bm, "vector": s_vec})
        results.sort(key=lambda r: r["score"], reverse=True)
        results = results[:topk_final]
    else:
        # No BM25 hit: rank every paragraph by vector similarity alone.
        sims = np.dot(para_vecs, q_vec)
        best = np.argsort(-sims)[:topk_final]
        results = [{**paragraphs[i], "score": float(sims[i]), "bm25": 0.0, "vector": float(sims[i])}
                   for i in best]

    if use_reranker and len(results) > 1:
        model = ensure_reranker()
        head = results[:rerank_top]
        scores = model.predict([(expanded, r["text"]) for r in head])
        results = [head[int(i)] for i in np.argsort(-scores)]
    return results

def hierarchical_search_two_track(query: str, top_titles=4, **kw):
    """Route the query to titles, then search paragraphs within them.

    Falls back to ``global_hybrid_search`` when routing yields no titles.
    Extra keyword arguments are forwarded to whichever search function runs.

    Returns:
        ``(routed, results)``: the routing tuples from ``route_titles`` and
        the list of paragraph results.
    """
    print(f"[Query] {query}")
    routed = route_titles(query, topn=top_titles, debug=True)
    if not routed:
        print("[Two-Track] routed empty -> GLOBAL")
        return routed, global_hybrid_search(query, **kw)
    title_idx = [entry[0] for entry in routed]
    return routed, hybrid_search_within_titles(query, title_idx, **kw)

def print_titles(routed):
    """Print one ranked line per routed title, with fused and component scores."""
    for rank, entry in enumerate(routed, start=1):
        i, s, s_bm, s_vec = entry
        std, ttl, src = title_keys[i]
        print(f"{rank:>2}. [{std}] {ttl}  <{src}>  score={s:.3f} (bm25={s_bm:.3f}, vec={s_vec:.3f})")


In [None]:

# ===== Demo =====
# Sample K-IFRS questions run through the two-track search below.
queries = [
    "감가상각 방법에는 무엇이 있나?",
    "수익 인식의 5단계는?",
    "개발비 자산 인식 요건은?",
    "선수수익과 계약부채 관계",
    "환불부채 회계처리는 어떻게 되나?",
]

for q in queries:
    print("\n=== Q:", q)
    # rerank_top equals topk_final here, so every fused result is reranked.
    routed, results = hierarchical_search_two_track(q, top_titles=4,
                                                    topk_bm25=150,
                                                    topk_final=10,
                                                    w_bm25=0.4, w_vec=0.6,
                                                    use_reranker=True, rerank_top=10)
    print("Top titles:")
    print_titles(routed)
    # Show the five best paragraphs with a one-line snippet (max 240 characters).
    for i, r in enumerate(results[:5], 1):
        snippet = r['text'].replace('\n', ' ')
        if len(snippet) > 240: snippet = snippet[:240] + '…'
        print(f"  {i:>2}. [{r['std']}:{r['para_id']}] ({r['title']}) p.{r.get('page','?')} score={r['score']:.3f}")
        print("     ", snippet)
