In [1]:
import pandas as pd

# 기존 파일 로드
df = pd.read_csv("queries_scored.csv")

# query_id 새로 매기기 (q1, q2, q3 … 순서대로)
df["query_id"] = [f"q{i+1}" for i in range(len(df))]

# 새 파일 저장
df.to_csv("queries_scored_renumbered.csv", index=False, encoding="utf-8-sig")

print(f"✅ 완료: queries_scored_renumbered.csv (총 {len(df)}개)")

✅ 완료: queries_scored_renumbered.csv (총 160개)


In [5]:
import pandas as pd
import numpy as np
from konlpy.tag import Okt
from rank_bm25 import BM25Okapi
import chromadb
from sentence_transformers import SentenceTransformer

# 데이터 로드
LABEL_CSV = "./queries_scored_renumbered.csv"
CHROMA_DIR = "../chroma_google_api_db"
EVAL_K = 5

df = pd.read_csv(LABEL_CSV)

# 라벨 파싱 함수
def parse_relevant(s):
    out = {}
    for pair in str(s).split(";"):
        if "=" in pair:
            d, g = pair.split("=")
            out[d.strip()] = int(g)
    return out

# BM25 검색기
okt = Okt()
client = chromadb.PersistentClient(path=CHROMA_DIR)
col = client.list_collections()[0]
collection = client.get_collection(col.name)

# 문서 불러오기
docs = collection.get(include=["documents"])
doc_ids, doc_texts = docs["ids"], docs["documents"]

tokenized_docs = [okt.morphs(t) for t in doc_texts]
bm25 = BM25Okapi(tokenized_docs)

def bm25_search(query, top_k=5):
    q_tok = okt.morphs(query)
    scores = bm25.get_scores(q_tok)
    order = np.argsort(-scores)[:top_k]
    return [doc_ids[i] for i in order]

# Dense 검색기
model = SentenceTransformer("BAAI/bge-m3")

def dense_search(query, top_k=5):
    q_emb = model.encode([query], normalize_embeddings=True).tolist()
    res = collection.query(query_embeddings=q_emb, n_results=top_k)
    return list(res["ids"][0])

# RRF 결합
def rrf_fuse(a, b, k=60):
    s = {}
    for r,d in enumerate(a): s[d] = s.get(d,0) + 1/(k+r+1)
    for r,d in enumerate(b): s[d] = s.get(d,0) + 1/(k+r+1)
    return [d for d,_ in sorted(s.items(), key=lambda x:x[1], reverse=True)]

# 평가 지표
def compute_metrics(pred_ids, rel_dict, k=EVAL_K):
    rel_set = set(rel_dict.keys())
    topk = pred_ids[:k]
    hits = sum(1 for d in topk if d in rel_set)

    precision = hits / k
    recall = hits / len(rel_set) if rel_set else 0

    rr = 0
    for i,d in enumerate(pred_ids):
        if d in rel_set:
            rr = 1/(i+1); break

    num=0; precs=[]
    for i,d in enumerate(topk):
        if d in rel_set:
            num += 1
            precs.append(num/(i+1))
    ap = np.mean(precs) if precs else 0

    return precision, recall, rr, ap

def evaluate_all(method_fn, label_df, k=EVAL_K):
    P=R=RR=AP=0; n=0
    for _, row in label_df.iterrows():
        q = row["query_text"]
        rel = parse_relevant(row["relevant_doc_ids"])
        preds = method_fn(q)
        p,r,rr,ap = compute_metrics(preds, rel, k)
        P+=p; R+=r; RR+=rr; AP+=ap; n+=1
    return {"P@K":P/n, "R@K":R/n, "MRR":RR/n, "MAP":AP/n}

# 평가 실행
bm25_m = evaluate_all(lambda q: bm25_search(q, EVAL_K), df, k=EVAL_K)
dense_m = evaluate_all(lambda q: dense_search(q, EVAL_K), df, k=EVAL_K)
rrf_m = evaluate_all(lambda q: rrf_fuse(bm25_search(q,EVAL_K), dense_search(q,EVAL_K)), df, k=EVAL_K)

summary = pd.DataFrame({
    "Metric": ["P@K","R@K","MRR","MAP"],
    "BM25"  : [bm25_m["P@K"], bm25_m["R@K"], bm25_m["MRR"], bm25_m["MAP"]],
    "Dense" : [dense_m["P@K"], dense_m["R@K"], dense_m["MRR"], dense_m["MAP"]],
    "RRF"   : [rrf_m["P@K"],  rrf_m["R@K"],  rrf_m["MRR"],  rrf_m["MAP"]],
})

summary

Unnamed: 0,Metric,BM25,Dense,RRF
0,P@K,0.2325,0.1425,0.21375
1,R@K,0.444792,0.275,0.41875
2,MRR,0.453854,0.337917,0.522445
3,MAP,0.439392,0.321667,0.48441


In [6]:
def weighted_rrf(bm25_list, dense_list, w_bm25=0.7, w_dense=0.3, k=60):
    scores = {}
    for rank, doc in enumerate(bm25_list):
        scores[doc] = scores.get(doc, 0) + w_bm25 / (k + rank + 1)
    for rank, doc in enumerate(dense_list):
        scores[doc] = scores.get(doc, 0) + w_dense / (k + rank + 1)
    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [doc for doc, _ in ranked]

In [7]:
weighted_rrf_results = {}

for idx, row in df.iterrows():
    qid = row['query_id']
    query_text = row['query_text']

    bm25_top20 = bm25_search(query_text, top_k=20)
    dense_top20 = dense_search(query_text, top_k=20)

    weighted_rrf_results[qid] = weighted_rrf(
        bm25_top20, dense_top20, w_bm25=0.7, w_dense=0.3
    )[:5]

In [8]:
bm25_metrics = evaluate_all(lambda q: bm25_search(q, 5), df)
dense_metrics = evaluate_all(lambda q: dense_search(q, 5), df)
rrf_metrics = evaluate_all(lambda q: rrf_fuse(bm25_search(q, 5), dense_search(q, 5)), df)
weighted_rrf_metrics = evaluate_all(lambda q: weighted_rrf(bm25_search(q, 20), dense_search(q, 20), w_bm25=0.7, w_dense=0.3), df)

metrics_df = pd.DataFrame({
    'Metric': ['P@5', 'R@5', 'MRR', 'MAP'],
    'BM25': [bm25_metrics['P@K'], bm25_metrics['R@K'], bm25_metrics['MRR'], bm25_metrics['MAP']],
    'Dense': [dense_metrics['P@K'], dense_metrics['R@K'], dense_metrics['MRR'], dense_metrics['MAP']],
    'RRF': [rrf_metrics['P@K'], rrf_metrics['R@K'], rrf_metrics['MRR'], rrf_metrics['MAP']],
    'Weighted RRF': [weighted_rrf_metrics['P@K'], weighted_rrf_metrics['R@K'], weighted_rrf_metrics['MRR'], weighted_rrf_metrics['MAP']]
})

metrics_df

Unnamed: 0,Metric,BM25,Dense,RRF,Weighted RRF
0,P@5,0.2325,0.1425,0.21375,0.25125
1,R@5,0.444792,0.275,0.41875,0.479167
2,MRR,0.453854,0.337917,0.522445,0.54937
3,MAP,0.439392,0.321667,0.48441,0.510885


### evaluate_all 호출을 여러 가중치 조합으로

In [9]:
weights = [(0.5, 0.5), (0.7, 0.3), (0.3, 0.7), (0.8, 0.2)]

results = {}
for w_bm25, w_dense in weights:
    label = f"W-RRF({w_bm25:.1f}/{w_dense:.1f})"
    metrics = evaluate_all(
        lambda q: weighted_rrf(bm25_search(q, 20), dense_search(q, 20),
                               w_bm25=w_bm25, w_dense=w_dense), df
    )
    results[label] = metrics

metrics_df = pd.DataFrame({
    'Metric': ['P@5', 'R@5', 'MRR', 'MAP'],
    'BM25': [bm25_metrics['P@K'], bm25_metrics['R@K'], bm25_metrics['MRR'], bm25_metrics['MAP']],
    'Dense': [dense_metrics['P@K'], dense_metrics['R@K'], dense_metrics['MRR'], dense_metrics['MAP']],
    'RRF': [rrf_metrics['P@K'], rrf_metrics['R@K'], rrf_metrics['MRR'], rrf_metrics['MAP']],
})

# weighted RRF 추가
for label, m in results.items():
    metrics_df[label] = [m['P@K'], m['R@K'], m['MRR'], m['MAP']]

metrics_df

Unnamed: 0,Metric,BM25,Dense,RRF,W-RRF(0.5/0.5),W-RRF(0.7/0.3),W-RRF(0.3/0.7),W-RRF(0.8/0.2)
0,P@5,0.2325,0.1425,0.21375,0.24,0.25125,0.19125,0.25375
1,R@5,0.444792,0.275,0.41875,0.46875,0.479167,0.36875,0.483333
2,MRR,0.453854,0.337917,0.522445,0.542387,0.54937,0.473785,0.55989
3,MAP,0.439392,0.321667,0.48441,0.499618,0.510885,0.43592,0.517951
