In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../")

In [None]:
import json

# Pipeline output file to evaluate; later cells read it with load_jsonl (one JSON record per line).
output_file_path = "/workspace/final_project/outputs/GENexaone_ENCbge_500_0_top5_generated_newsqa_qa_pairs_20251218_162943.json"

def load_jsonl(path):
    """Read a JSON Lines file and return its decoded records as a list.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped and every remaining line is parsed with
    ``json.loads``. The file is opened as UTF-8 text.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]

In [26]:
def extract_qrel_prediction(
    results: list[dict],
) -> list[dict]:
    """
    Build one relevance record per query from the retrieval results.

    Each returned dict carries the query id ('qid'), the ground-truth
    document id ('gt', read from the record's 'docid') and the retrieved
    document ids ('retrieved', read from 'original_docid' of every entry in
    'retrieval_results', in their original order).
    """
    return [
        {
            'qid': record['qid'],
            'gt': record['docid'],
            'retrieved': [hit['original_docid'] for hit in record['retrieval_results']],
        }
        for record in results
    ]

In [None]:
import math
from collections import Counter
from typing import List, Dict, Union, Tuple

Example = Dict[str, object]  # {'qid': int, 'gt': int, 'retrieved': List[int]}

def dcg_at_k(rels: List[int], k: int) -> float:
    """Discounted cumulative gain over the first ``k`` ranking positions.

    ``rels`` holds one graded relevance value (0/1/2/...) per ranking
    position. A position with positive relevance contributes
    ``(2**rel - 1) / log2(position + 1)`` using 1-based positions; positions
    with non-positive relevance contribute nothing.
    """
    total = 0.0
    # enumerate is 0-based here, so the 1-based discount log2(position + 1)
    # becomes log2(index + 2).
    for index, gain in enumerate(rels[:k]):
        if gain > 0:
            discount = math.log2(index + 2)
            total += (2**gain - 1) / discount
    return total

def ndcg_at_k(retrieved: List[int], gt: int, k: int) -> float:
    """
    NDCG@k with binary relevance against a single ground-truth id.

    Every top-k position whose id equals ``gt`` counts as relevant, so
    duplicated occurrences of gt each add gain. The ideal ranking used for
    normalisation places exactly those occurrences (only the ones found
    inside the top-k) at the first positions.
    """
    top_k_rels = []
    for docid in retrieved[:k]:
        top_k_rels.append(1 if docid == gt else 0)
    actual_dcg = dcg_at_k(top_k_rels, k)

    # Only occurrences inside the top-k can contribute to DCG.
    relevant_count = sum(top_k_rels)
    if relevant_count == 0:
        return 0.0

    # Ideal ordering: all relevant occurrences first, padded with zeros up to k.
    ideal_rels = [1] * relevant_count + [0] * (k - relevant_count)
    ideal_dcg = dcg_at_k(ideal_rels, k)
    if ideal_dcg > 0:
        return actual_dcg / ideal_dcg
    return 0.0

def recall_at_k(
    retrieved: List[int],
    gt: int,
    k: int,
    total_relevant: Union[int, str] = "count_in_retrieved",
) -> float:
    """
    Recall@k with duplicate-aware hits.

    Args:
        retrieved: retrieved document ids in ranking order.
        gt: ground-truth document id.
        k: cut-off; only ``retrieved[:k]`` is searched for hits.
        total_relevant: denominator of the recall.
            * ``"count_in_retrieved"`` (default): the number of times ``gt``
              occurs anywhere in ``retrieved``. This is the behaviour of the
              three-argument call.
            * a number: a fixed count of relevant documents per query
              (``1`` turns the metric into hit@k for a single ground truth).

    Returns:
        ``hits / denominator`` capped at 1.0, or 0.0 when the denominator is
        not positive (e.g. gt never appears in ``retrieved``).

    Raises:
        ValueError: if ``total_relevant`` is a string other than
            ``"count_in_retrieved"``.
    """
    hits = sum(1 for docid in retrieved[:k] if docid == gt)

    if isinstance(total_relevant, str):
        if total_relevant != "count_in_retrieved":
            raise ValueError(
                f"unsupported total_relevant: {total_relevant!r} "
                "(expected a number or 'count_in_retrieved')"
            )
        denom = sum(1 for docid in retrieved if docid == gt)
    else:
        denom = total_relevant

    # edge: no relevant document to find -> define recall as 0
    if denom <= 0:
        return 0.0

    # duplicates of gt inside the top-k can exceed a fixed denominator, so cap at 1
    return min(1.0, hits / denom)

def evaluate(
    data: "List[Example]",
    ks: Tuple[int, ...] = (1, 3, 5),
    total_relevant: Union[int, str] = 1
) -> Dict[str, Dict[int, float]]:
    """
    Returns mean metrics across queries:
    {
      "recall": {k: mean},
      "ndcg": {k: mean}
    }

    Args:
        data: one dict per query with the keys 'gt' (ground-truth id) and
            'retrieved' (retrieved ids in ranking order).
        ks: cut-offs to evaluate. Any iterable of ints is accepted; it is
            materialised once so sets and one-shot iterables behave the same.
        total_relevant: forwarded unchanged to ``recall_at_k`` as its
            ``total_relevant`` keyword argument.

    Returns:
        The nested dict shown above. For empty ``data`` every mean is 0.0
        instead of raising ZeroDivisionError.
    """
    # Callers in this notebook pass a set; freeze the iteration order once so
    # the accumulation and the averaging loops see the same cut-offs.
    ks = tuple(ks)

    out = {
        "recall": {k: 0.0 for k in ks},
        "ndcg": {k: 0.0 for k in ks},
    }

    n = len(data)
    if n == 0:
        # Nothing to average over.
        return out

    for ex in data:
        gt = ex["gt"]
        retrieved = ex["retrieved"]

        for k in ks:
            # recall_at_k is expected to accept the `total_relevant` keyword.
            out["recall"][k] += recall_at_k(retrieved, gt, k, total_relevant=total_relevant)
            out["ndcg"][k] += ndcg_at_k(retrieved, gt, k)

    for k in ks:
        out["recall"][k] /= n
        out["ndcg"][k] /= n

    return out

# -----------------------
# Usage example:
# metrics = evaluate(your_list, ks=(1,3,5), total_relevant="count_in_retrieved")
# print(metrics)


In [None]:
# NOTE(review): no cell above this one assigns `metrics`; it appears to be set only by the
# evaluation cells further down, so this cell would fail on a fresh top-to-bottom run.
print(metrics)

{'recall': {1: 0.21995000000000012, 3: 0.4858500000000001, 5: 0.662}, 'ndcg': {1: 0.377, 3: 0.48219963085343265, 5: 0.5175911066368606}}


In [28]:
# Score the GENexaone_ENCbge_500_0 output file with the exact-id Recall@k / nDCG@k metrics.
output_file_path = "/workspace/final_project/outputs/GENexaone_ENCbge_500_0_top5_generated_newsqa_qa_pairs_20251218_162943.json"
results = load_jsonl(output_file_path)
qrels = extract_qrel_prediction(results)

# ks is passed as a set although evaluate() annotates a tuple; evaluate() only iterates over it.
metrics = evaluate(qrels, ks={1,3,5})
print(metrics)

{'recall': {1: 0.377, 3: 0.563, 5: 0.662}, 'ndcg': {1: 0.377, 3: 0.48219963085343265, 5: 0.5175911066368606}}


In [32]:
# Score the GENexaone_ENCbge_500_200 output file with the same metrics.
# This overwrites `output_file_path`, `results`, `qrels` and `metrics` from the previous cell.
output_file_path = "/workspace/final_project/outputs/GENexaone_ENCbge_500_200_top5_generated_newsqa_qa_pairs_20251218_165655.json"
results = load_jsonl(output_file_path)
qrels = extract_qrel_prediction(results)

metrics = evaluate(qrels, ks={1,3,5})
print(metrics)

{'recall': {1: 0.385, 3: 0.562, 5: 0.658}, 'ndcg': {1: 0.385, 3: 0.4843192298741273, 5: 0.5193263856746497}}


In [None]:
# Score the GENmidm_ENCbge_500_0 output file with the same metrics.
# NOTE(review): `results` / `qrels` are overwritten by each evaluation cell, so the later
# LLM-judge cells operate on whichever of these cells was executed last.
output_file_path = "/workspace/final_project/outputs/GENmidm_ENCbge_500_0_top5_generated_newsqa_qa_pairs_20251218_150820.json"
results = load_jsonl(output_file_path)
qrels = extract_qrel_prediction(results)

metrics = evaluate(qrels, ks={1,3,5})
print(metrics)

{'recall': {1: 0.377, 3: 0.563, 5: 0.662}, 'ndcg': {1: 0.377, 3: 0.48219963085343265, 5: 0.5175911066368606}}


### LLM as judge

In [35]:
from pipeline.generator.llm import OpenAIGenerator

# Generator (gpt-4o-mini) used as the LLM judge by the scoring loops in the cells below.
generator = OpenAIGenerator(model_name="gpt-4o-mini")

In [42]:
from pipeline.generator.prompter import PromptGenerator

# Build one relevance-judge prompt per retrieved chunk, grouped by query id.
prompts_by_qids = {}
for record_dict in results:
    prompts = []

    for ret_result in record_dict['retrieval_results']:
        # `doc` must receive the retrieved chunk text and `question` the query text;
        # passing them the other way round asks the judge the wrong question.
        prompt = PromptGenerator.generate_relevance_judge(
            doc=ret_result['chunked_text'],
            question=record_dict['question'],
        )
        prompts.append(prompt)

    prompts_by_qids[record_dict['qid']] = prompts
    


In [44]:
# Inspect the extracted qid / gt / retrieved records.
qrels

[{'qid': 0, 'gt': 20952, 'retrieved': [15784, 20952, 6039, 24557, 13307]},
 {'qid': 1, 'gt': 3648, 'retrieved': [21644, 3648, 15716, 11373, 25656]},
 {'qid': 2, 'gt': 819, 'retrieved': [819, 18571, 862, 7605, 26203]},
 {'qid': 3, 'gt': 24299, 'retrieved': [18906, 2864, 5051, 26520, 7800]},
 {'qid': 4, 'gt': 9012, 'retrieved': [9012, 27158, 27158, 9012, 27158]},
 {'qid': 5, 'gt': 8024, 'retrieved': [25960, 12481, 15096, 11345, 17582]},
 {'qid': 6, 'gt': 7314, 'retrieved': [7314, 16971, 20104, 16971, 20104]},
 {'qid': 7, 'gt': 4572, 'retrieved': [4572, 21088, 17059, 25086, 22466]},
 {'qid': 8, 'gt': 24132, 'retrieved': [14648, 7342, 16151, 24132, 11531]},
 {'qid': 9, 'gt': 3358, 'retrieved': [10531, 11616, 7925, 4686, 17106]},
 {'qid': 10, 'gt': 22174, 'retrieved': [13904, 17289, 22174, 22568, 9794]},
 {'qid': 11, 'gt': 24270, 'retrieved': [15372, 24270, 26328, 23923, 24270]},
 {'qid': 12, 'gt': 17870, 'retrieved': [17870, 17870, 17870, 22267, 4940]},
 {'qid': 13, 'gt': 2848, 'retrieved'

In [50]:
# Ask the LLM judge for one relevance score per retrieved chunk of each query.
rel_scores = []

for qid, prompts in prompts_by_qids.items():
    scores = generator.generate(prompts)
    scores_parsed = []

    for score in scores:
        try:
            score = float(score.strip())
        except (AttributeError, TypeError, ValueError):
            # Only parsing problems fall back to 0.0; KeyboardInterrupt and other
            # unrelated errors are no longer swallowed by a bare `except`.
            print(score)
            score = 0.0  # default to 0.0 if parsing fails
        scores_parsed.append(score)

    rel_score = {
        'qid': qid,
        'rel_scores': scores_parsed
    }
    rel_scores.append(rel_score)
    # Stop once qid 5 has been scored (presumably to limit API calls while experimenting).
    if qid == 5:
        break

[2025-12-18 17:48:19] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 17:48:20] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 17:48:21] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 17:48:22] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 17:48:23] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 17:48:24] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 17:48:31] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 17:48:34] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 17:48:35] INFO _client.py:1025: HTTP Request: POST h

In [51]:
# Inspect the parsed judge scores per query.
rel_scores

[{'qid': 0, 'rel_scores': [1.0, 1.0, 1.0, 1.0, 1.0]},
 {'qid': 1, 'rel_scores': [1.0, 1.0, 1.0, 1.0, 1.0]},
 {'qid': 2, 'rel_scores': [1.0, 1.0, 1.0, 1.0, 0.0]},
 {'qid': 3, 'rel_scores': [1.0, 1.0, 1.0, 1.0, 1.0]},
 {'qid': 4, 'rel_scores': [1.0, 1.0, 1.0, 1.0, 1.0]},
 {'qid': 5, 'rel_scores': [1.0, 0.0, 0.0, 1.0, 0.0]}]

## generation result 평가

In [52]:
from pipeline.eval.utils import answer_extractor


# Sanity-check answer_extractor on the first record's generated answer.
answer_extractor(results[0]['generated_answer'])

'서울과 부산 구간의 1Gbps 양자암호통신 서비스 이용료는 월 3억 8650만원입니다.'

In [54]:
from pipeline.dataset.newsqa import load_news_qa_dataset

# Reference QA data; the next cell reads each entry's 'qid' and 'answer'.
newsqa = load_news_qa_dataset()

Loading NewsQA dataset from /workspace/final_project/qa/newsqa.json


In [74]:
# Build one generation-judge prompt per question, pairing the reference answer from
# `newsqa` with the answer extracted from the pipeline output in `results`.
prompts = []
for qid, newsqa_dict in enumerate(newsqa):
    # Both lists must be aligned by position; fail fast if the qids disagree.
    assert newsqa_dict['qid'] == results[qid]['qid']

    parsed_answer = answer_extractor(results[qid]['generated_answer'])

    prompt = PromptGenerator.generate_generation_judge(
        question = results[qid]['question'],
        answer = newsqa_dict['answer'],
        prediction = parsed_answer
        )
    prompts.append(prompt)

In [82]:
# Ask the LLM judge to score each generated answer, one prompt per request.
generation_scores = []

for qid, prompt in enumerate(prompts):
    score = generator.generate([prompt])[0]
    try:
        score = float(score.strip())
    except (AttributeError, TypeError, ValueError):
        # Only parsing problems fall back to 0.0; KeyboardInterrupt and other
        # unrelated errors are no longer swallowed by a bare `except`.
        print(score)
        score = 0.0  # default to 0.0 if parsing fails

    generation_score = {
        'qid': qid,
        'generation_score': score
    }
    generation_scores.append(generation_score)
    # Stop after index 100, i.e. 101 prompts (presumably to limit API calls while experimenting).
    if qid == 100:
        break

[2025-12-18 18:12:32] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:12:36] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:12:37] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:12:38] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:12:39] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:12:39] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:12:43] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:12:47] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:12:48] INFO _client.py:1025: HTTP Request: POST h

In [83]:
# Inspect the per-question judge scores.
generation_scores

[{'qid': 0, 'generation_score': 1.0},
 {'qid': 1, 'generation_score': 1.0},
 {'qid': 2, 'generation_score': 1.0},
 {'qid': 3, 'generation_score': 1.0},
 {'qid': 4, 'generation_score': 0.0},
 {'qid': 5, 'generation_score': 1.0},
 {'qid': 6, 'generation_score': 1.0},
 {'qid': 7, 'generation_score': 1.0},
 {'qid': 8, 'generation_score': 1.0},
 {'qid': 9, 'generation_score': 1.0},
 {'qid': 10, 'generation_score': 1.0},
 {'qid': 11, 'generation_score': 1.0},
 {'qid': 12, 'generation_score': 1.0},
 {'qid': 13, 'generation_score': 1.0},
 {'qid': 14, 'generation_score': 1.0},
 {'qid': 15, 'generation_score': 1.0},
 {'qid': 16, 'generation_score': 1.0},
 {'qid': 17, 'generation_score': 1.0},
 {'qid': 18, 'generation_score': 1.0},
 {'qid': 19, 'generation_score': 1.0},
 {'qid': 20, 'generation_score': 1.0},
 {'qid': 21, 'generation_score': 1.0},
 {'qid': 22, 'generation_score': 1.0},
 {'qid': 23, 'generation_score': 1.0},
 {'qid': 24, 'generation_score': 1.0},
 {'qid': 25, 'generation_score': 1.

In [89]:
# Show the full judge prompt built for index 40.
print(prompts[40])


        다음 질문에 대한 답변이 올바른지 판단해줘.
        생성된 답변이 올바르면 1, 올바르지 않으면 0으로 답해줘.
        반드시 " 없이 1이나 0으로만 답변해줘.
        
        질문: 정김경숙(로이스 김)은 몇 살에 구글 본사로 옮겼나?
        올바른 답변: 50세
        생성된 답변: 정김경숙은 쉰 살(54세)에 구글 본사로 옮겼습니다.
        
        출력: 
        


In [90]:
# Mean judge score over the scored questions (the fraction judged correct when scores are 0/1).
hit = 0
for scores in generation_scores:
    hit+=scores['generation_score']

# Raises ZeroDivisionError if generation_scores is empty.
hit/len(generation_scores)

0.8910891089108911

# 모듈화 확인

In [None]:
from pipeline.eval.evaluator import LLMRelevanceEvaluator, AtkEvaluator



In [None]:
# Evaluate the GENexaone_ENCbge_500_0 output file through the packaged AtkEvaluator.
output_file_path = "/workspace/final_project/outputs/GENexaone_ENCbge_500_0_top5_generated_newsqa_qa_pairs_20251218_162943.json"
atk_evaluator = AtkEvaluator()
atk_evaluator.evaluate(output_file_path)

{'recall': {5: 0.662}, 'ndcg': {5: 0.5175911066368606}}

In [103]:
# Evaluate the file at `output_file_path` (set in the previous cell) with the packaged LLM relevance judge.
llm_relevance_evaluator = LLMRelevanceEvaluator(model_name="gpt-4o-mini")
llm_relevance_evaluator.evaluate(output_file_path)

[2025-12-18 18:45:47] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:45:48] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:45:50] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:45:51] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:45:51] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:45:52] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:45:53] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:45:53] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:45:55] INFO _client.py:1025: HTTP Request: POST h

{'recall': {5: 0.7000000000000001}, 'ndcg': {5: 0.9632319662651642}}

In [116]:
from pipeline.eval.evaluator import LLMGenerationEvaluator
# Instantiate the packaged generation judge; the captured output shows it loading the NewsQA dataset.
llm_generation_evaluator = LLMGenerationEvaluator()

Loading NewsQA dataset from /workspace/final_project/qa/newsqa.json


In [117]:
# Run the generation judge on the file at `output_file_path` and keep its return value.
gen_score = llm_generation_evaluator.evaluate(output_file_path)

[2025-12-18 18:55:35] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:55:36] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:55:37] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:55:38] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:55:39] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:55:39] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:55:40] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:55:41] INFO _client.py:1025: HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[2025-12-18 18:55:42] INFO _client.py:1025: HTTP Request: POST h

In [118]:
# Inspect the per-question scores stored on the evaluator.
llm_generation_evaluator.generation_scores

[{'qid': 0, 'generation_score': 0.0},
 {'qid': 1, 'generation_score': 1.0},
 {'qid': 2, 'generation_score': 1.0},
 {'qid': 3, 'generation_score': 1.0},
 {'qid': 4, 'generation_score': 0.0},
 {'qid': 5, 'generation_score': 0.0},
 {'qid': 6, 'generation_score': 1.0},
 {'qid': 7, 'generation_score': 1.0},
 {'qid': 8, 'generation_score': 1.0},
 {'qid': 9, 'generation_score': 1.0},
 {'qid': 10, 'generation_score': 1.0},
 {'qid': 11, 'generation_score': 0.0},
 {'qid': 12, 'generation_score': 1.0},
 {'qid': 13, 'generation_score': 0.0},
 {'qid': 14, 'generation_score': 0.0},
 {'qid': 15, 'generation_score': 1.0},
 {'qid': 16, 'generation_score': 0.0},
 {'qid': 17, 'generation_score': 0.0},
 {'qid': 18, 'generation_score': 1.0},
 {'qid': 19, 'generation_score': 0.0},
 {'qid': 20, 'generation_score': 1.0}]

In [130]:
# Spot-check one question: print the question, the reference answer and the raw generated answer.
qid = 5
print(llm_generation_evaluator.newsqa[qid]['question'])
print(llm_generation_evaluator.newsqa[qid]['answer'])
print(llm_generation_evaluator.results[qid]['generated_answer'])

5일 국회 회의에서 류성걸 위원장이 밝힌 월세 세액공제 확대 상한은 몇 %인가?
15%
{"answer": "12~15%"}
