# Retriever 성능 평가 및 RRF 분석 (Hit@k)

이 노트북은 BM25, Vector, Hybrid(RRF) 검색의 성능을 정량적으로 평가(Hit@k)하고 비교 분석합니다.

In [None]:
# 환경 설정
import sys
import os
from dotenv import load_dotenv
import pandas as pd

# Resolve the project root (two directories above the current working
# directory) and make it importable so the `src.*` packages can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
# Notebook cells get re-run often; avoid piling up duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Load environment variables from the .env file located at the project root.
load_dotenv(os.path.join(project_root, ".env"))
print(f"Project Root: {project_root}")

In [None]:
from src.retrieval.bm25_retriever import BM25Retriever
from src.retrieval.vector_retriever import VectorRetriever
from src.retrieval.hybrid_search import HybridRetriever

# Build the three retrievers under comparison; all of them receive the same
# version and collection name.
_shared_config = dict(version="v3", collection_name="care_guides")

bm25 = BM25Retriever(**_shared_config)
vector = VectorRetriever(**_shared_config)
hybrid = HybridRetriever(**_shared_config)

print("✅ All Retrievers initialized.")

## 1. Ground Truth 데이터셋 정의
평가를 위한 [질문, 정답(품종명)] 쌍을 정의합니다.

In [None]:
# Evaluation set: each entry pairs a search query with the breed name that is
# expected to show up in the retrieved results.
# NOTE(review): some queries spell the breed differently from the target
# ("렉돌" vs "랙돌", "메인쿤" vs "메인 쿤") — confirm this is intentional
# (e.g. to probe typo/spacing robustness) and not a data-entry mistake.
ground_truth = [
    {"query": "메인쿤 특징", "target": "메인 쿤"},
    {"query": "렉돌 성격", "target": "랙돌"},
    {"query": "페르시안 고양이", "target": "페르시안"},
    {"query": "털 안빠지는 고양이", "target": "스핑크스"}, # Example: the Sphynx is expected to be retrieved
    {"query": "다리 짧은 고양이", "target": "먼치킨"},
    {"query": "귀 접힌 고양이", "target": "스코티시 폴드"}
]

# Report how many labelled queries the evaluation will use.
print(f"Dataset size: {len(ground_truth)}")

## 2. Hit@k 평가 함수 구현

Hit@k는 전체 질의 중 상위 k개 검색 결과 안에 정답 품종이 하나 이상 포함된 질의의 비율입니다.

In [None]:
async def calculate_hit_at_k(retriever_func, k_list=(1, 3, 5), dataset=None):
    """Compute Hit@k for a retriever over a labelled query set.

    A query counts as a hit at ``k`` when at least one of its first ``k``
    results contains the target string in its ``name_ko`` or ``name_en``
    field (substring match).

    Args:
        retriever_func: Callable invoked as ``retriever_func(query, limit=n)``.
            It may return the results directly or return an awaitable that
            resolves to them. Each result must support ``.get(key)``.
        k_list: Cut-off ranks to evaluate. Duplicate values are counted once.
        dataset: Iterable of ``{"query": ..., "target": ...}`` items. When
            omitted, the module-level ``ground_truth`` list is used.

    Returns:
        A dict mapping ``"Hit@{k}"`` to the hit ratio rounded to two decimals.
        An empty ``k_list`` yields ``{}``; an empty dataset yields ``0.0`` for
        every ``k`` instead of dividing by zero.
    """
    import inspect

    items = list(ground_truth if dataset is None else dataset)
    hit_counts = {k: 0 for k in k_list}
    if not hit_counts:
        return {}

    # Loop-invariant: fetch only as many results as the largest cut-off needs.
    max_k = max(hit_counts)

    for item in items:
        query = item["query"]
        target = item["target"]

        # Accept both synchronous and asynchronous retrievers.
        search_results = retriever_func(query, limit=max_k)
        if inspect.isawaitable(search_results):
            search_results = await search_results
        if search_results is None:
            search_results = []

        # Only the best (first) matching rank matters for Hit@k. A field that
        # is present but None is treated like a missing field.
        first_hit = next(
            (
                rank
                for rank, res in enumerate(search_results, start=1)
                if target in (res.get("name_ko") or "")
                or target in (res.get("name_en") or "")
            ),
            None,
        )
        if first_hit is None:
            continue

        # Iterate the dict keys so a duplicated k is never counted twice.
        for k in hit_counts:
            if first_hit <= k:
                hit_counts[k] += 1

    total = len(items)
    return {
        f"Hit@{k}": round(count / total, 2) if total else 0.0
        for k, count in hit_counts.items()
    }

# Confirm that the cell defining the Hit@k helper has been executed.
print("Evaluator ready.")

## 3. Retriever별 성능 비교

In [None]:
results = []

# 1. BM25
print("Running BM25 evaluation...")
bm25_metrics = await calculate_hit_at_k(lambda q, limit: bm25.search(q, limit=limit))
bm25_metrics["Retriever"] = "BM25"
results.append(bm25_metrics)

# 2. Vector
print("Running Vector evaluation...")
vector_metrics = await calculate_hit_at_k(lambda q, limit: vector.search(q, limit=limit))
vector_metrics["Retriever"] = "Vector"
results.append(vector_metrics)

# 3. Hybrid (RRF)
print("Running Hybrid evaluation...")
hybrid_metrics = await calculate_hit_at_k(lambda q, limit: hybrid.search(q, limit=limit))
hybrid_metrics["Retriever"] = "Hybrid (RRF)"
results.append(hybrid_metrics)

# 결과 출력
df = pd.DataFrame(results).set_index("Retriever")
df = df[["Hit@1", "Hit@3", "Hit@5"]]
display(df)