# Session 6 — 평가 & A/B Test 파이프 구축

다양한 프롬프트 버전에 대해 **정량·정성 평가**, **A/B 테스트**, **LLM‑as‑a‑Judge** 실험을 수행합니다.

---

## 🔍 실습 목표
- V0~V4 버전별 DataFrame 준비
- 정량적 지표 계산 (정답률, Latency, Cost)
- Judge 평가를 통한 종합 Score 산출
- 우승 Prompt 결정


## 📦 패키지 설치

In [None]:
!pip install -r ../requirements.txt

zsh:1: command not found: pip


## 🗂 평가 데이터셋 전처리

In [None]:
# ────────────────────────────────────────────────
# 📦 기본 유틸리티
# ────────────────────────────────────────────────
import re
import time
import asyncio
from pathlib import Path

# ────────────────────────────────────────────────
# 📊 데이터 처리 및 환경설정
# ────────────────────────────────────────────────
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import nest_asyncio

# ────────────────────────────────────────────────
# 📈 평가 지표
# ────────────────────────────────────────────────
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

# ────────────────────────────────────────────────
# 🤖 Langfuse + OpenAI 연동
# ────────────────────────────────────────────────
from langfuse import Langfuse
from langfuse.openai import AsyncOpenAI


load_dotenv()

nest_asyncio.apply()
langfuse = Langfuse()
client = AsyncOpenAI()

# 결과 폴더 경로 설정
result_dir = Path("../data/01_order_delivery/answer_results")

# 파일 리스트 가져오기
excel_files = list(result_dir.glob("Scenario_QA_V*_gpt-4o-mini_*.xlsx"))

# 버전별 최신 파일 추출
latest_files = {}
pattern = re.compile(r"Scenario_QA_(V\d+)_gpt-4o-mini_(\d{8}_\d{6})\.xlsx")

for f in excel_files:
    match = pattern.match(f.name)
    if match:
        version, timestamp = match.groups()
        if version not in latest_files or timestamp > latest_files[version][0]:
            latest_files[version] = (timestamp, f)

# DataFrame으로 읽기
dataframes = {}
for version, (_, filepath) in latest_files.items():
    df = pd.read_excel(filepath)
    dataframes[version] = df
    print(f"✅ Loaded {version} from {filepath.name}, shape: {df.shape}")

# 예시: V1 최신 결과 보기
dataframes["V1"].head()


✅ Loaded V1 from Scenario_QA_V1_gpt-4o-mini_20250615_190609.xlsx, shape: (10, 34)
✅ Loaded V4 from Scenario_QA_V4_gpt-4o-mini_20250615_190709.xlsx, shape: (10, 34)
✅ Loaded V2 from Scenario_QA_V2_gpt-4o-mini_20250615_190642.xlsx, shape: (10, 34)
✅ Loaded V0 from Scenario_QA_V0_gpt-4o-mini_20250615_190459.xlsx, shape: (10, 35)
✅ Loaded V3 from Scenario_QA_V3_gpt-4o-mini_20250615_190659.xlsx, shape: (10, 34)


Unnamed: 0,scenario_id,customer_id,customer_name,question,best_answer,customer_name_cust,contact,email,primary_address_id,register_date,...,is_default,issue_id,issue_date,issue_description,resolution_status,updated_date,answer,latency_ms,prompt_tokens,completion_tokens
0,S01,C001,Alice Kim,왜 아직도 ‘대기’예요? 주소도 수정했는데 반영도 안 돼서 화나요!,"안녕하세요, Alice Kim 고객님. 배송 지원 담당 이성재입니다.\n\n주문하신...",Alice Kim,010-1456-3540,alice@example.com,A001,2024-08-27,...,1,I001,2025-06-13 00:00:00,Shipping stalled at pending; address change no...,open,2025-06-13 00:00:00,"안녕하세요, Alice Kim 고객님. 주문번호 O001의 상품 Product J3...",2196.408208,326,116
1,S02,C002,Bob Lee,배송이 늦어지는지 확인 부탁드려요.,"Bob Lee 고객님, 안녕하세요. 요청 주신 [Product I586] (주문번호...",Bob Lee,010-8074-5410,bob@example.com,A002,2024-10-25,...,1,,,,,,"안녕하세요, Bob Lee 고객님. 주문번호 O002의 상품 Product I586...",1864.174333,324,129
2,S03,C003,Cathy Park,주소를 바꿨는데 시스템엔 안 뜨네요. 괜찮을까요?,"안녕하세요, Cathy Park 고객님. 새 주소 반영 여부가 걱정되실 것 같아 바...",Cathy Park,010-9966-1836,cathy@example.com,A003,2024-12-08,...,1,I003,2025-06-13 00:00:00,No shipping company assigned; address change p...,open,2025-06-13 00:00:00,"Cathy Park님, 주문번호 O003의 상품 Product I404는 현재 배송...",1828.594334,323,111
3,S04,C004,David Choi,제 주문 어디쯤 왔나요?,"David Choi 고객님, [Product Z277] (주문번호 O004) 의 현...",David Choi,010-1633-6893,david@example.com,A004,2025-01-10,...,1,,,,,,"안녕하세요, David Choi님. \n\n주문번호 O004의 상품 Product ...",2102.826958,322,143
4,S05,C005,Eunji Han,운송장이 조회가 안 돼요. 뭔가 오류죠?,"안녕하세요, Eunji Han 고객님. 송장 조회 오류로 불편을 드려 죄송합니다.\...",Eunji Han,010-8612-8148,eunji@example.com,A005,2024-09-24,...,1,I005,2025-06-13 00:00:00,Tracking number not found in carrier system.,open,2025-06-13 00:00:00,"안녕하세요, Eunji Han 고객님. 주문번호 O005의 상품 Product L4...",2110.625,332,121


In [41]:
dataframes["V0"].columns, dataframes["V1"].columns

(Index(['scenario_id', 'customer_id', 'customer_name', 'question',
        'best_answer', 'customer_name_customer', 'contact', 'email',
        'primary_address_id', 'register_date', 'notes', 'order_id',
        'product_name', 'order_date', 'shipping_status', 'shipping_company',
        'tracking_number', 'last_update', 'address_id', 'address_type',
        'address_line1', 'address_line2', 'postal_code', 'city', 'is_default',
        'issue_id', 'issue_date', 'issue_description', 'resolution_status',
        'updated_date', 'answer_gpt-4o-mini', 'latency_ms_gpt-4o-mini',
        'prompt_tok_gpt-4o-mini', 'completion_tok_gpt-4o-mini',
        'usd_cost_gpt-4o-mini'],
       dtype='object'),
 Index(['scenario_id', 'customer_id', 'customer_name', 'question',
        'best_answer', 'customer_name_cust', 'contact', 'email',
        'primary_address_id', 'register_date', 'notes', 'order_id',
        'product_name', 'order_date', 'shipping_status', 'shipping_company',
        'tracking_numb

In [52]:
###############################################################################
# 1. DataFrame 준비 ------------------------------------------------------------
###############################################################################
# 앞서 로드해 둔 dict 형태: {'V0': df0, 'V1': df1, ...}
dfs = dataframes          # ← 이전 답변 코드에서 나온 결과를 그대로 씁니다.

# ── 공통 전처리 함수 ────────────────────────────────────────────────────────────
def _standardize(df: pd.DataFrame, version: str) -> pd.DataFrame:
    df = df.copy()

    # ① 답변 컬럼 통일
    ans_col = (
        "answer_gpt-4o-mini" if version == "V0"
        else "answer"
    )
    df.rename(columns={ans_col: "answer"}, inplace=True)

    # ② latency → 초
    lat_col = (
        "latency_ms_gpt-4o-mini" if version == "V0"
        else "latency_ms"
    )
    df["latency_s"] = df[lat_col] / 1000.0

    # ③ 토큰·비용
    if version == "V0":
        df["prompt_tokens"]     = df["prompt_tok_gpt-4o-mini"]
        df["completion_tokens"] = df["completion_tok_gpt-4o-mini"]
        df["total_tokens"]      = df["prompt_tokens"] + df["completion_tokens"]
        df["usd_cost"]          = df["usd_cost_gpt-4o-mini"]      # 이미 계산되어 있음
    else:
        df["total_tokens"] = df["prompt_tokens"] + df["completion_tokens"]
        # 비용 계산
        df["usd_cost"] = (
            df["prompt_tokens"] * 0.15 + df["completion_tokens"] * 0.6
        ) / 1_000_000

    # ④ 공통 컬럼 keep
    keep_cols = [
        "scenario_id", "customer_id", "question", "best_answer", "answer",
        "latency_s", "prompt_tokens", "completion_tokens",
        "total_tokens", "usd_cost"
    ]
    return df[keep_cols].assign(version=version)

# ── 버전별 표준화
std_dfs = [_standardize(dfs[v], v) for v in ["V0", "V1", "V2", "V3", "V4"]]
full_df = pd.concat(std_dfs, ignore_index=True)

###############################################################################
# 2. 정량적 유사도 지표 계산 ------------------------------------------------------
###############################################################################
# BLEU-4
_bleu_smooth = SmoothingFunction().method1
def bleu_score(ref, hyp):
    ref_toks = ref.split()
    hyp_toks = hyp.split()
    return sentence_bleu(
        [ref_toks], hyp_toks, smoothing_function=_bleu_smooth
    )

# ROUGE-L
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
def rouge_l_f(ref, hyp):
    return rouge.score(ref, hyp)["rougeL"].fmeasure

# Sentence-BERT (cosine sim)
sbert = SentenceTransformer("all-mpnet-base-v2")
def cosine_sim(ref, hyp):
    emb = sbert.encode([ref, hyp], convert_to_tensor=True, normalize_embeddings=True)
    return float(util.cos_sim(emb[0], emb[1]))

# ── 계산
full_df["bleu"]     = full_df.apply(lambda r: bleu_score(r.best_answer, r.answer), axis=1)
full_df["rougeL"]   = full_df.apply(lambda r: rouge_l_f(r.best_answer, r.answer), axis=1)
full_df["cosine"]   = full_df.apply(lambda r: cosine_sim(r.best_answer, r.answer), axis=1)

###############################################################################
# 3. GPT-4o 정성 평가 (Langfuse 기반)  <<— 수정 버전
###############################################################################

PRICING = {"input": 0.15 / 1_000_000, "output": 0.60 / 1_000_000}
full_df["gpt_score"] = np.nan   # 50행 전부 초기화

@observe()
async def call_openai(
    system_prompt: str,
    user_prompt: str,
    user_id: str,
    scenario_id: str,
    model: str = "gpt-4o-mini",
):
    """GPT-4o judge 호출 → gpt_score(int) 포함 dict 반환"""
    tic = time.perf_counter_ns()

    # Langfuse trace 업데이트
    langfuse_context.update_current_trace(
        name="smart_cs_evaluator",
        user_id=user_id,
        session_id=scenario_id,
        tags=["evaluator", "smart_cs"],
        metadata={"model": model},
    )

    # --- OpenAI 호출 ---------------------------------------------------------
    resp = await client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )

    toc_ms = (time.perf_counter_ns() - tic) / 1_000_000
    usage = resp.usage

    # 원문·토큰·비용
    answer_raw = resp.choices[0].message.content.strip()
    try:
        gpt_score = int(re.search(r"\b([0-5])\b", answer_raw).group(1))
    except AttributeError:
        gpt_score = None  # 파싱 실패 시 NaN 처리 용도

    cost = (
        usage.prompt_tokens * PRICING["input"]
        + usage.completion_tokens * PRICING["output"]
    )

    return {"gpt_score": gpt_score}



def build_judge_prompt(_row) -> str:
    return (
        "You are a customer-service QA judge. "
        "Score the assistant's answer on how well it satisfies the customer's "
        "question compared to the provided best_answer. "
        "Return only an integer 0-5 (0=poor, 5=perfect)."
    )


async def run_judge_version(df: pd.DataFrame, build_system_prompt):
    """
    df (한 버전만 필터링된 DataFrame) → gpt_score 컬럼 채워서 반환
    """
    row_ids, tasks = [], []

    for idx, row in df.iterrows():
        row_ids.append(idx)
        user_prompt = (
            f"BEST_ANSWER:\n{row.best_answer}\n\n"
            f"ASSISTANT_ANSWER:\n{row.answer}"
        )
        tasks.append(
            call_openai(
                build_system_prompt(row),
                user_prompt,
                user_id=str(row.customer_id),
                scenario_id=str(row.scenario_id),
            )
        )

    results = await asyncio.gather(*tasks)

    # --- gpt_score 덮어쓰기 (index 정확히 유지) -------------------------------
    for idx, res in zip(row_ids, results):
        df.loc[idx, "gpt_score"] = res["gpt_score"]

    return df



# ── 버전별 채점 --------------------------------------------------------------
for v in ["V0", "V1", "V2", "V3", "V4"]:
    idx_mask = full_df["version"] == v
    df_scored = asyncio.get_event_loop().run_until_complete(
        run_judge_version(full_df.loc[idx_mask].copy(), build_judge_prompt)
    )
    # 채점 결과를 원본 full_df에 반영
    full_df.loc[idx_mask, "gpt_score"] = df_scored["gpt_score"].values


###############################################################################
# 4. V0 대비 비용 비율 ------------------------------------------------------------
###############################################################################
# V0 비용을 시나리오별로 매핑
v0_cost = full_df.query("version=='V0'")[["scenario_id", "usd_cost"]].set_index("scenario_id")["usd_cost"]
full_df["cost_ratio_vs_V0"] = full_df.apply(
    lambda r: r.usd_cost / v0_cost.get(r.scenario_id, np.nan),
    axis=1
)

###############################################################################
# 5. 결과 Pivot (버전별 한눈에) ----------------------------------------------------
###############################################################################
pivot_cols = [
    "latency_s", "bleu", "rougeL", "cosine", "gpt_score",
    "prompt_tokens", "completion_tokens", "total_tokens",
    "usd_cost", "cost_ratio_vs_V0"
]
summary = full_df.pivot_table(
    index="version",
    values=pivot_cols,
    aggfunc="mean"
)

print("✅ summary DataFrame ready:", summary.shape)
# 

✅ summary DataFrame ready: (5, 10)


In [54]:
summary

Unnamed: 0_level_0,bleu,completion_tokens,cosine,cost_ratio_vs_V0,gpt_score,latency_s,prompt_tokens,rougeL,total_tokens,usd_cost
version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
V0,0.005491,123.3,0.579788,1.0,2.1,2.227642,27.7,0.052991,151.0,7.8e-05
V1,0.031286,118.2,0.685668,1.86669,3.4,2.203607,324.5,0.454969,442.7,0.00012
V2,0.015503,99.9,0.661857,2.009888,3.4,1.864886,463.5,0.526619,563.4,0.000129
V3,0.014371,107.5,0.641338,2.152802,3.2,1.942878,492.5,0.451534,600.0,0.000138
V4,0.020603,107.7,0.681569,2.270868,3.1,2.139483,548.5,0.383717,656.2,0.000147


In [None]:
# summary.to_excel("../data/01_order_delivery/answer_results/scenario_comparison_summary.xlsx")  # 필요하면 저장

In [None]:
# full_df.to_excel("../data/01_order_delivery/answer_results/scenario_comparison_full.xlsx") # 필요하면 저장

## 🏆 종합 Score 계산 및 우승 Prompt 선정

In [55]:
# 1. 기준 메트릭 정의
maximize_metrics = ["bleu", "rougeL", "cosine", "gpt_score"]
minimize_metrics = ["latency_s", "total_tokens", "usd_cost", "cost_ratio_vs_V0"]

# 선택적으로 사용할 수 있는 가중치 (모두 1로 두면 동등 반영)
weights = {
    "bleu": 1.0,
    "rougeL": 1.0,
    "cosine": 1.0,
    "gpt_score": 2.0,  # GPT 평가를 더 중요하게 반영할 수도 있음
    "latency_s": 1.0,
    "total_tokens": 0.5,
    "usd_cost": 1.0,
    "cost_ratio_vs_V0": 0.5,
}

# 2. 전체 DataFrame 복사 및 정규화
norm_df = summary.copy()

# 최대화 지표: 0~1로 정규화 (높을수록 좋음)
for col in maximize_metrics:
    min_v = norm_df[col].min()
    max_v = norm_df[col].max()
    norm_df[col + "_score"] = (norm_df[col] - min_v) / (max_v - min_v)

# 최소화 지표: 반대로 1 - 정규화
for col in minimize_metrics:
    min_v = norm_df[col].min()
    max_v = norm_df[col].max()
    norm_df[col + "_score"] = 1.0 - (norm_df[col] - min_v) / (max_v - min_v)

# 3. 종합 점수 계산 (가중 합산)
norm_df["final_score"] = 0
for col in maximize_metrics + minimize_metrics:
    score_col = col + "_score"
    norm_df["final_score"] += norm_df[score_col] * weights.get(col, 1.0)

# 4. 최종 우승자
winner = norm_df["final_score"].idxmax()
winner_score = norm_df["final_score"].max()

print(f"🏆 최종 우승 Prompt 버전: {winner} (score: {winner_score:.4f})")

🏆 최종 우승 Prompt 버전: V1 (score: 5.6823)


---

## ✍️ 개인 실습
- 프롬프트 버전을 추가하거나 수정해보세요.
- 다양한 Judge 기준으로 평가를 변경해보세요.
- 평가 지표(가중치 등)를 조정해보세요.
