## Datathon 클래스

In [1]:
import re
import os
import pathlib
import pandas as pd
from typing import Any, List, Dict
from typing import Optional, Dict, Any, List, Union
from abc import ABC, abstractmethod
from langchain.prompts import ChatPromptTemplate  # 프롬프트 템플릿 처리용
from langevaluate.config import ModelConfig # LLM 설정용
from langevaluate.llmfactory import LLMFactory  # LLM 팩토리용
from tqdm.asyncio import tqdm_asyncio
import asyncio

class DatathonProcessor(ABC):
    """
    Base class that wires a prompt template to an LLM for the datathon tasks.

    Subclasses implement ``preprocess_data``, ``get_prompt_template`` and
    ``postprocess_result`` (and may override ``get_model_name``).
    ``summarize`` then runs preprocess -> LLM call -> postprocess for every
    row of a DataFrame.
    """

    # Default LLM settings; only 'model_name' is replaced per subclass.
    DEFAULT_MODEL_CONFIG = {
        'model_name': 'LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-AWQ',
        'api_base': 'https://api.snubhai.org/api/v1/llm',
        'max_tokens': 2000,
        'seed': 777,
        'temperature': 0,
        'rpm': 10
    }

    def __init__(self, api_key: str):
        """
        Build the LLM client and the ``prompt | llm`` chain.

        Args:
            api_key: Key forwarded to ``ModelConfig`` for the LLM endpoint.
        """
        # Class defaults with the model name swapped for the subclass choice.
        settings = {
            **self.DEFAULT_MODEL_CONFIG,
            'model_name': self.get_model_name(),
        }

        model_config = ModelConfig(
            model_name=settings['model_name'],
            api_base=settings['api_base'],
            api_key=api_key,
            max_tokens=settings['max_tokens'],
            seed=settings['seed'],
            provider="openai",
        )
        self.llm = LLMFactory.create_llm(
            model_config,
            temperature=settings['temperature'],
            rpm=settings['rpm'],
        )

        # Prompt template piped into the LLM.
        template_text = self.get_prompt_template()
        self.prompt_template = ChatPromptTemplate.from_template(template_text)
        self.chain = self.prompt_template | self.llm

        # Result store (not written to by this class).
        self.results: List[str] = []

        # Metric store (not written to by this class).
        self.metrics: Dict[str, Any] = {}

    def get_model_name(self) -> str:
        """
        Return the model name to use.

        Subclasses may override this to select a different model; the default
        is the 'model_name' entry of ``DEFAULT_MODEL_CONFIG``.
        """
        return self.DEFAULT_MODEL_CONFIG['model_name']

    @abstractmethod
    async def preprocess_data(self, data: Any) -> Dict[str, Any]:
        """Turn one input row into the variables passed to the prompt chain."""
        ...

    @abstractmethod
    def get_prompt_template(self) -> str:
        """Return the prompt template string (implemented by the subclass)."""
        ...

    @abstractmethod
    async def postprocess_result(self, result: Any) -> str:
        """Turn the content of one LLM response into the final output string."""
        ...

    async def summarize(self, data: pd.DataFrame) -> List[str]:
        """
        Run the full pipeline over every row of ``data``.

        Each stage (preprocess, LLM call, postprocess) is gathered through
        ``tqdm_asyncio.gather`` so a progress bar is shown per stage.

        Args:
            data: DataFrame whose rows are passed one by one to
                ``preprocess_data``.

        Returns:
            One postprocessed result per row, in row order.
        """
        # Stage 1: preprocess every row.
        prompt_inputs = await tqdm_asyncio.gather(
            *(self.preprocess_data(row) for _, row in data.iterrows())
        )

        # Stage 2: one LLM coroutine per preprocessed row.
        llm_responses = await tqdm_asyncio.gather(
            *(self.chain.ainvoke(prompt_vars) for prompt_vars in prompt_inputs)
        )

        # Stage 3: postprocess the content of every response.
        return await tqdm_asyncio.gather(
            *(self.postprocess_result(response.content) for response in llm_responses)
        )

  from .autonotebook import tqdm as notebook_tqdm


## 자체평가

In [None]:
import pandas as pd
import numpy as np
import torch
import asyncio
from typing import Any, Dict, List
from bert_score import BERTScorer
from langchain.prompts import ChatPromptTemplate
from langevaluate.config import ModelConfig
from langevaluate.llmfactory import LLMFactory
import time
import re

# 대회 제공 BertScore 클래스 (정확히 동일)
class BertScore:
    """Callable wrapper around ``BERTScorer`` that returns the F values as a list."""

    def __init__(self, model_type="distilbert-base-uncased", batch_size=16):
        """
        Create the underlying scorer.

        Args:
            model_type: Model identifier forwarded to ``BERTScorer``.
            batch_size: Batch size forwarded to the ``BERTScorer`` constructor.
        """
        scorer_options = {"model_type": model_type, "batch_size": batch_size}
        # The scorer is constructed with gradient tracking disabled.
        with torch.no_grad():
            self.bert_scorer = BERTScorer(**scorer_options)

    def __call__(self, refs, hyps):
        """
        Score candidate texts against reference texts.

        Args:
            refs: Reference texts.
            hyps: Candidate texts, passed to the scorer as ``cands``.

        Returns:
            The third value returned by ``BERTScorer.score`` (the F measure),
            converted with ``.tolist()``.
        """
        # NOTE: scoring always uses batch_size=8, independent of the value
        # given to the constructor.
        _precision, _recall, f_measure = self.bert_scorer.score(
            cands=hyps,
            refs=refs,
            verbose=False,
            batch_size=8,
        )
        return f_measure.tolist()

# 대회 제공 FairnessScore 클래스 (정확히 동일)
class FairnessScore:
    """
    Group fairness as the ratio of the lowest to the highest per-group mean score.

    Groups are either the raw labels (``type="sex"``) or age bins of
    ``bin_width`` years (``type="age"``).  Details of the most recent call are
    kept in ``last_stats``.
    """

    def __init__(self, bin_width: int = 10, min_samples_per_group: int = 1):
        """
        Args:
            bin_width: Width of one age bin.
            min_samples_per_group: Groups with fewer members are ignored.
        """
        self.bin_width = int(bin_width)
        self.min_samples_per_group = int(min_samples_per_group)
        # Filled by __call__ with by_group / gap / min / max.
        self.last_stats = None

    @staticmethod
    def _ensure_1d(a) -> np.ndarray:
        """Return ``a`` as a 1-D array, flattening an (N, 1) column; raise otherwise."""
        arr = np.asarray(a)
        is_single_column = arr.ndim == 2 and arr.shape[1] == 1
        if is_single_column:
            arr = arr[:, 0]
        if arr.ndim == 1:
            return arr
        raise ValueError("Input must be 1D or (N,1) shaped.")

    def _bin_ages(self, ages) -> np.ndarray:
        """Map each age to a ``"start-end"`` label of its ``bin_width``-wide bin."""
        values = self._ensure_1d(ages).astype(float)
        if np.isnan(values).any():
            raise ValueError("ages contain NaN.")
        width = self.bin_width
        if width <= 0:
            raise ValueError("bin_width must be positive.")
        lower = (np.floor(values / width) * width).astype(int)
        upper = lower + width
        return np.array(
            [f"{lo:d}-{hi:d}" for lo, hi in zip(lower, upper)],
            dtype=object,
        )

    def _groups_from_type(self, groups, type: str) -> np.ndarray:
        """Return group labels: raw values for 'sex', age-bin labels for 'age'."""
        kind = (type or "sex").lower()
        if kind == "sex":
            return self._ensure_1d(groups)
        if kind == "age":
            return self._bin_ages(groups)
        raise ValueError("type must be 'sex' or 'age'.")

    def __call__(self, groups, scores, type: str = "sex", sample_weight=None) -> float:
        """
        Compute the fairness ratio ``min(group mean) / max(group mean)``.

        Args:
            groups: Group value per sample (labels, or ages when type='age').
            scores: Score per sample; clipped to [0, 1] before averaging.
            type: 'sex' or 'age' (an empty value is treated as 'sex').
            sample_weight: Optional per-sample weights; defaults to all ones.

        Returns:
            A value in [0, 1].  1.0 is returned when at most one group
            qualifies or when the highest group mean is 0.

        Raises:
            ValueError: On shape problems, length mismatch, NaN ages, a
                non-positive bin width or an unknown ``type``.
        """
        labels = self._groups_from_type(groups, type=type)
        values = self._ensure_1d(scores).astype(float)
        if values.shape[0] != labels.shape[0]:
            raise ValueError("groups and scores must have the same length.")

        if sample_weight is not None:
            weights = self._ensure_1d(sample_weight).astype(float)
            if weights.shape[0] != values.shape[0]:
                raise ValueError("sample_weight length must match scores.")
        else:
            weights = np.ones_like(values, dtype=float)

        values = np.clip(values, 0.0, 1.0)

        group_means = []
        by_group = {}
        for label in np.unique(labels):
            members = (labels == label)
            # Skip groups that are too small or carry no positive weight.
            if np.sum(members) < self.min_samples_per_group:
                continue
            if np.sum(weights[members]) <= 0:
                continue
            mean_score = float(np.average(values[members], weights=weights[members]))
            group_means.append(mean_score)
            by_group[str(label)] = mean_score

        if len(group_means) <= 1:
            # Nothing to compare against: report perfect fairness.
            self.last_stats = {"by_group": by_group, "gap": 0.0, "min": None, "max": None}
            return 1.0

        highest = float(np.max(group_means))
        lowest = float(np.min(group_means))
        if highest == 0.0:
            ratio = 1.0
        else:
            ratio = float(lowest / highest)
        ratio = float(np.clip(ratio, 0.0, 1.0))

        self.last_stats = {
            "by_group": by_group,
            "gap": highest - lowest,
            "min": lowest,
            "max": highest,
        }
        return ratio

# TaskB Processor 
class TaskBProcessor(DatathonProcessor):
    """
    Task B processor: summarize a radiology report into an IMPRESSION.

    ``preprocess_data`` extracts and cleans the findings text that fills the
    ``{user_input}`` slot of the prompt; ``postprocess_result`` tightens the
    model output (prefix removal, verbose-phrase removal, numbering, and a
    word-count cap).
    """

    # Prompt input used when a row carries no usable report text.
    _EMPTY_REPORT_INPUT = 'Normal examination without acute abnormalities.'
    # Output used when the model returned no usable text.
    _EMPTY_RESULT_OUTPUT = "No acute abnormalities identified."

    # (start marker, stop markers) tried in order; the first start marker found
    # wins, and the first stop marker *in listed order* that is present cuts
    # the section.
    _SECTION_RULES = (
        ('FINDINGS:', ('IMPRESSION:', 'CONCLUSION:', 'ASSESSMENT:')),
        ('FINDINGS', ('IMPRESSION', 'CONCLUSION')),
        ('INTERPRETATION:', ('IMPRESSION:',)),
    )

    # Terms that mark a sentence as medically relevant in the fallback scan.
    _MEDICAL_TERMS = (
        'normal', 'abnormal', 'mass', 'lesion', 'consolidation', 'effusion',
        'edema', 'hemorrhage', 'fracture', 'dislocation', 'stenosis',
        'dilatation', 'enhancement', 'atelectasis', 'pneumonia', 'cardiomegaly',
        'opacity', 'density', 'nodule', 'calcification',
    )

    # Leading labels stripped from the model output (first match only).
    _IMPRESSION_PREFIXES = (
        'IMPRESSION:', 'Impression:', 'impression:', 'OPTIMAL IMPRESSION:',
        'CONCLUSION:', 'ASSESSMENT:', 'SUMMARY:',
    )

    # Verbose phrase -> concise replacement, applied in this order.
    # Every pattern is anchored on word boundaries so that longer words such
    # as "demonstrated", "showed" or "indicated" are left intact.
    _CONCISENESS_RULES = tuple(
        (re.compile(pattern, re.IGNORECASE), replacement)
        for pattern, replacement in (
            (r'\bthere (?:is|are) evidence of\b', ''),
            (r'\bfindings (?:are )?consistent with(?:\s+and\s+compatible\s+with)?\b', ''),
            (r'\b(?:appears to|seems to) (?:be|demonstrate|show)\b', ''),
            (r'\bcompatible with(?:\s+a\s+diagnosis\s+of)?\b', ''),
            (r'\bsuggestive of(?:\s+the\s+presence\s+of)?\b', ''),
            (r'\bconcerning for(?:\s+the\s+possibility\s+of)?\b', 'concerning for'),
            (r'\bno evidence of(?:\s+any)?\b', 'no'),
            (r'\bthere is no(?:\s+evidence\s+of)?\b', 'no'),
            (r'\bdemonstrates?(?:\s+evidence\s+of)?\b', ''),
            (r'\bshows?(?:\s+signs\s+of)?\b', ''),
            (r'\breveals?(?:\s+the\s+presence\s+of)?\b', ''),
            (r'\bindicates?(?:\s+the\s+presence\s+of)?\b', ''),
            (r'\bmild(?:\s+degree\s+of)?\b', 'mild'),
            (r'\bmoderate(?:\s+degree\s+of)?\b', 'moderate'),
            (r'\bsevere(?:\s+degree\s+of)?\b', 'severe'),
            (r'\bsmall(?:\s+amount\s+of)?\b', 'small'),
            (r'\blarge(?:\s+amount\s+of)?\b', 'large'),
            (r'\b(?:some\s+)?degree\s+of\s+', ''),
            (r'\b(?:a\s+)?finding\s+of\s+', ''),
            (r'\bpresence\s+of\s+', ''),
            (r'\b(?:most\s+)?likely\s+represents?\b', 'likely'),
            (r'\bprobably\s+represents?\b', 'probably'),
            (r'\bpossibly\s+represents?\b', 'possibly'),
        )
    )

    # Terms that raise a sentence's priority when the output must be shortened.
    _PRIORITY_TERMS = (
        'fracture', 'mass', 'tumor', 'hemorrhage', 'infarction', 'pneumonia',
        'effusion', 'pneumothorax', 'cardiomegaly', 'consolidation', 'embolism',
        'stenosis', 'occlusion', 'aneurysm', 'dissection', 'malignancy',
    )

    # A sentence ends at a period followed by whitespace or end of text, so
    # decimals such as "2.3 cm" are never split.
    _SENTENCE_END = re.compile(r'\.(?:\s+|$)')

    # Word-count limits used when shortening an over-long impression.
    _MAX_WORDS_BEFORE_TRIM = 50
    _TRIM_WORD_BUDGET = 45
    _TRIM_ENOUGH_WORDS = 25

    def get_model_name(self) -> str:
        """Return the model identifier used for Task B."""
        return "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-AWQ"

    def get_prompt_template(self) -> str:
        """Return the prompt; ``{user_input}`` is filled with the findings text."""
        return """You are a board-certified radiologist with 20+ years of experience creating diagnostic impressions. Generate a precise, clinically actionable IMPRESSION that maximizes diagnostic clarity and conciseness.

CRITICAL OPTIMIZATION FOR EVALUATION:
- QUALITY: Capture ALL key findings with complete accuracy - miss nothing important
- CLINICAL CLARITY: Use precise radiological terminology that clinicians can act upon immediately  
- CONCISENESS: Zero redundancy, no verbose phrases, every word essential for diagnosis
- ACCURACY: Only document what is explicitly stated - absolutely no assumptions or hallucinations

OPTIMAL IMPRESSION STRUCTURE:
1. Primary/acute findings first (most clinically significant)
2. Secondary findings (supportive/chronic conditions)  
3. Explicit negatives for critical differentials when mentioned
4. Numbered format for multiple distinct findings (1., 2., 3.)

CONCISENESS OPTIMIZATION EXAMPLES:

VERBOSE: "There appears to be some degree of mild enlargement of the cardiac silhouette that suggests possible cardiomegaly"
CONCISE: "Mild cardiomegaly"

VERBOSE: "The findings are consistent with and compatible with pneumonia involving the right lower lobe"
CONCISE: "Right lower lobe pneumonia"

VERBOSE: "No evidence of any acute fractures or significant osseous abnormalities are identified"
CONCISE: "No acute fractures"

EXEMPLAR CASES FOR MAXIMUM SCORES:

CT CHEST HIGH-COMPLEXITY:
FINDINGS: Multiple bilateral pulmonary nodules, the largest measuring 2.3 cm in the right upper lobe with spiculated margins and adjacent pleural thickening. Moderate right pleural effusion with loculations. Mild mediastinal lymphadenopathy with nodes up to 1.2 cm. Left lower lobe consolidation with air bronchograms. No pericardial effusion.
IMPRESSION: 1. Right upper lobe spiculated nodule (2.3 cm) concerning for malignancy with pleural involvement.
2. Loculated right pleural effusion.  
3. Mediastinal lymphadenopathy.
4. Left lower lobe pneumonia.

CHEST X-RAY OPTIMIZATION:
FINDINGS: Heart size is enlarged. There is bilateral lower lobe airspace opacification consistent with consolidation or atelectasis. Small bilateral pleural effusions are present. The mediastinal contours appear normal. No pneumothorax is evident.
IMPRESSION: 1. Cardiomegaly with bilateral lower lobe consolidation.
2. Small bilateral pleural effusions.

MR HEAD PRECISION:
FINDINGS: There is a 4.2 cm enhancing extra-axial mass centered in the right frontal region with adjacent dural thickening and enhancement. Mild surrounding vasogenic edema extends into the frontal white matter with 3 mm of leftward midline shift. No restricted diffusion or hemorrhage.
IMPRESSION: 1. Right frontal extra-axial enhancing mass (4.2 cm) consistent with meningioma.
2. Mild mass effect with 3 mm leftward midline shift.
3. No acute infarction or hemorrhage.

Now generate optimal IMPRESSION for:
FINDINGS: {user_input}
IMPRESSION:"""

    async def preprocess_data(self, data: Any) -> Dict[str, Any]:
        """
        Extract and clean the findings text of one radiology report.

        Args:
            data: Row-like object exposing ``get('radiology report', default)``
                (a dict or a pandas Series both work).

        Returns:
            ``{'user_input': text}`` where ``text`` is the cleaned findings
            section, or a neutral placeholder when no report text is available.
        """
        try:
            report = data.get('radiology report', '')
        except AttributeError:
            # Not a mapping-like row: treat as "no report".
            report = ''

        # NaN (float) and None are not strings, so they are covered here too.
        if not isinstance(report, str) or not report.strip():
            return {'user_input': self._EMPTY_REPORT_INPUT}

        findings = self._extract_findings(report)

        # Drop leading colons/whitespace, mask redactions, collapse whitespace.
        findings = re.sub(r'^[:\s]*', '', findings)
        findings = re.sub(r'\b___+\b', '[REDACTED]', findings)
        findings = re.sub(r'\[\*+[^\]]*\*+\]', '[REDACTED]', findings)
        findings = re.sub(r'\s+', ' ', findings).strip()

        # Section too short to be useful: scan the whole report for sentences
        # that mention a known medical term.
        if len(findings) < 20:
            findings = self._salvage_medical_sentences(report) or findings

        return {'user_input': findings or self._EMPTY_REPORT_INPUT}

    def _extract_findings(self, report: str) -> str:
        """Return the findings section of ``report``, or the whole report if none is marked."""
        for start_marker, stop_markers in self._SECTION_RULES:
            if start_marker not in report:
                continue
            section = report.split(start_marker)[1]
            for stop_marker in stop_markers:
                if stop_marker in section:
                    section = section.split(stop_marker)[0]
                    break
            return section.strip()
        return report

    def _salvage_medical_sentences(self, report: str) -> str:
        """Join sentences of ``report`` that mention a medical term (about 400 chars max)."""
        picked: List[str] = []
        # Split only where the punctuation ends a sentence so decimals survive.
        for sentence in re.split(r'[.!?]+(?:\s+|$)', report):
            sentence = sentence.strip()
            if len(sentence) <= 15:
                continue
            lowered = sentence.lower()
            if any(term in lowered for term in self._MEDICAL_TERMS):
                picked.append(sentence)
                if len(' '.join(picked)) > 400:
                    break
        return '. '.join(picked)

    async def postprocess_result(self, result: str) -> str:
        """
        Tighten the model output into the final impression text.

        Steps: strip a leading "IMPRESSION:"-style label, ensure a final
        period, remove verbose phrases, normalize spacing, number multiple
        findings when the text is not already numbered, and shorten outputs
        longer than 50 words.

        Args:
            result: Raw text content of the LLM response.

        Returns:
            The cleaned impression, or a neutral placeholder when ``result``
            is empty or not a string.
        """
        if not result or not isinstance(result, str):
            return self._EMPTY_RESULT_OUTPUT

        text = result.strip()
        for prefix in self._IMPRESSION_PREFIXES:
            if text.startswith(prefix):
                text = text[len(prefix):].strip()
                break
        if not text:
            return self._EMPTY_RESULT_OUTPUT

        if not text.endswith('.'):
            text += '.'

        for pattern, replacement in self._CONCISENESS_RULES:
            text = pattern.sub(
                lambda match, rep=replacement: self._match_case(rep, match.group(0)),
                text,
            )

        # Collapse whitespace, remove space before punctuation, and add a
        # space after punctuation that is glued to a capital letter.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'\s+([,.])', r'\1', text)
        text = re.sub(r'([,.])([A-Z])', r'\1 \2', text)
        text = text.strip()

        text = self._number_findings(text)

        if len(text.split()) > self._MAX_WORDS_BEFORE_TRIM:
            text = self._trim_to_key_findings(text)

        text = text.strip()
        return text or self._EMPTY_RESULT_OUTPUT

    @staticmethod
    def _match_case(replacement: str, matched: str) -> str:
        """Capitalize ``replacement`` when the text it replaces started with a capital."""
        if replacement and matched[:1].isupper():
            return replacement[0].upper() + replacement[1:]
        return replacement

    @classmethod
    def _split_sentences(cls, text: str) -> List[str]:
        """Split ``text`` at sentence-ending periods; pieces are stripped, empties dropped."""
        return [piece.strip() for piece in cls._SENTENCE_END.split(text) if piece.strip()]

    def _number_findings(self, text: str) -> str:
        """Number the findings of an un-numbered text that holds two or more of them."""
        if text.startswith(('1.', '2.', '3.')):
            return text
        if not ('. ' in text or ';' in text or ',' in text):
            return text

        if ';' in text:
            items = [part.strip() for part in text.split(';') if part.strip()]
        elif len(self._SENTENCE_END.split(text)) > 2:
            # At least two sentence-ending periods: one finding per sentence.
            items = [s for s in self._split_sentences(text) if len(s) > 5]
        else:
            items = [part.strip() for part in text.split(',') if len(part.strip()) > 10]

        # Only number when there are 2+ substantial findings.
        if len(items) < 2 or not all(len(item) > 8 for item in items):
            return text

        return ' '.join(
            f"{number}. {item if item.endswith('.') else item + '.'}"
            for number, item in enumerate(items, start=1)
        )

    def _trim_to_key_findings(self, text: str) -> str:
        """
        Shorten ``text`` by keeping its highest-priority sentences.

        Sentences are ranked by priority-term hits (2 points each) plus the
        number of words longer than six characters.  Kept sentences are
        emitted in their original order, and renumbered when the input was
        numbered.
        """
        fragments = self._split_sentences(text)
        # Bare "1", "2", ... fragments are list enumerators, not findings.
        findings = [fragment for fragment in fragments if not fragment.isdigit()]
        was_numbered = len(findings) != len(fragments)
        if not findings:
            return text

        def priority(sentence: str) -> int:
            lowered = sentence.lower()
            term_points = sum(2 for term in self._PRIORITY_TERMS if term in lowered)
            long_words = sum(1 for word in sentence.split() if len(word) > 6)
            return term_points + long_words

        scores = [priority(sentence) for sentence in findings]
        # Stable sort: equally scored sentences keep their document order.
        ranked = sorted(range(len(findings)), key=lambda i: scores[i], reverse=True)

        kept: List[int] = []
        word_total = 0
        for index in ranked:
            sentence_words = len(findings[index].split())
            if word_total + sentence_words <= self._TRIM_WORD_BUDGET:
                kept.append(index)
                word_total += sentence_words
            if word_total >= self._TRIM_ENOUGH_WORDS:
                break
        if not kept:
            return text

        # Restore document order so the primary finding stays first.
        kept.sort()
        if was_numbered:
            return ' '.join(
                f"{number}. {findings[index] if findings[index].endswith('.') else findings[index] + '.'}"
                for number, index in enumerate(kept, start=1)
            )

        shortened = '. '.join(findings[index] for index in kept)
        if not shortened.endswith('.'):
            shortened += '.'
        return shortened




# 대회와 정확히 동일한 평가 함수
async def exact_competition_evaluation(
    train_csv_path: str,
    api_key: str,
    max_samples: int = 300,
    batch_size: int = 8,
    wait_seconds: float = 70,
) -> Dict[str, Any]:
    """
    Run the Task B pipeline on a CSV and print a self-evaluation report.

    The CSV is expected to have the columns 'radiology report', 'target',
    'gender' and 'anchor_age'.  Rows missing the report or the target are
    dropped, the first ``max_samples`` remaining rows are summarized in
    batches, and BERTScore plus sex/age fairness are computed on the result.

    Args:
        train_csv_path: Path of the CSV file to evaluate on.
        api_key: API key passed to ``TaskBProcessor``.
        max_samples: Maximum number of rows to evaluate (first rows are used).
        batch_size: Number of concurrent LLM requests per batch.
        wait_seconds: Pause between batches, to stay under the API rate limit.

    Returns:
        Dict with the BERTScore mean/std and per-sample scores, both fairness
        values, the point total and grade, the predictions and references,
        the number of evaluated samples and the generation time in seconds.

    Raises:
        ValueError: If ``max_samples`` or ``batch_size`` is not positive, or
            if no row has both a report and a target.
    """
    if max_samples <= 0:
        raise ValueError("max_samples must be positive.")
    if batch_size <= 0:
        raise ValueError("batch_size must be positive.")

    print("=" * 80)
    print("🏆 대회 정확한 평가 조건 시뮬레이션 - Task B")
    print("=" * 80)

    # 1. Load the full data set.
    print("1. 전체 Test 데이터 로드 중...")
    test_df = pd.read_csv(train_csv_path)

    # Report and drop rows with missing report/target.
    print("   데이터 품질 확인 중...")
    print(f"   전체 데이터: {len(test_df)}개")

    nan_count = test_df['radiology report'].isna().sum()
    print(f"   NaN 값: {nan_count}개")

    test_df = test_df.dropna(subset=['radiology report', 'target'])
    print(f"   유효 데이터: {len(test_df)}개")

    total_samples = len(test_df)
    if total_samples == 0:
        # Fail early with a clear message instead of deep inside the scoring.
        raise ValueError(
            "No rows with both 'radiology report' and 'target' in "
            f"{train_csv_path!r}."
        )

    # 2. Evaluation size (default 300 per the original comment on test-set size).
    eval_samples = min(max_samples, total_samples)

    # 3. Use the first rows, in file order.
    eval_df = test_df.iloc[:eval_samples].copy()
    print(f"평가 샘플: {eval_samples}개 (연속 샘플, 대회 Test 세트와 동일한 크기)")

    # 4. Distribution of the evaluated rows.
    print("\n📊 평가 데이터 분포:")
    print(f"   성별 분포: {eval_df['gender'].value_counts().to_dict()}")
    print(f"   연령 분포: 평균 {eval_df['anchor_age'].mean():.1f}세 (범위: {eval_df['anchor_age'].min()}-{eval_df['anchor_age'].max()})")

    # 5. Create the processor; the log names the model actually in use.
    processor = TaskBProcessor(api_key)
    print(f"\n2. TaskB 처리기 초기화 완료 (모델: {processor.get_model_name()})")

    # 6. Generate predictions batch by batch.
    print("3. AI 예측 생성 중 (API 제한 준수)...")
    start_time = time.time()

    data_batch = [{'radiology report': report} for report in eval_df['radiology report']]
    total_batches = (len(data_batch) - 1) // batch_size + 1

    results = []
    for offset in range(0, len(data_batch), batch_size):
        batch = data_batch[offset:offset + batch_size]
        print(f"   배치 {offset // batch_size + 1}/{total_batches} 처리 중...")

        # Preprocess
        preprocessed = [await processor.preprocess_data(row) for row in batch]

        # LLM calls of one batch run concurrently.
        responses = await asyncio.gather(
            *(processor.chain.ainvoke(prompt_vars) for prompt_vars in preprocessed)
        )

        # Postprocess
        results.extend(
            [await processor.postprocess_result(response.content) for response in responses]
        )

        # Pause between batches (not after the last one).
        if offset + batch_size < len(data_batch):
            print(f"   API 제한 준수를 위해 {wait_seconds:g}초 대기...")
            await asyncio.sleep(wait_seconds)

    predictions = results
    generation_time = time.time() - start_time
    print(f"예측 생성 완료 (총 소요 시간: {generation_time:.1f}초)")

    # 7. References
    references = eval_df['target'].tolist()

    # 8. BERTScore
    print("\n4. 대회 BERTScore 계산 중...")
    bert_scorer = BertScore(model_type="distilbert-base-uncased", batch_size=16)
    bert_scores = bert_scorer(refs=references, hyps=predictions)
    bert_mean = np.mean(bert_scores)
    bert_std = np.std(bert_scores)

    # 9. Fairness (last_stats is overwritten per call, so read it right away).
    print("5. 대회 공정성 지표 계산 중...")
    fairness_scorer = FairnessScore(bin_width=10, min_samples_per_group=1)

    gender_fairness = fairness_scorer(
        groups=eval_df['gender'].tolist(),
        scores=bert_scores,
        type='sex'
    )
    gender_stats = fairness_scorer.last_stats

    age_fairness = fairness_scorer(
        groups=eval_df['anchor_age'].tolist(),
        scores=bert_scores,
        type='age'
    )
    age_stats = fairness_scorer.last_stats

    # 10. Report
    print("\n" + "=" * 80)
    print("🎯 대회 정확한 평가 결과 - Task B (Test 데이터)")
    print("=" * 80)

    print("📊 BERTScore (대회 공식 계산)")
    print(f"   평균: {bert_mean:.6f}")
    print(f"   표준편차: {bert_std:.6f}")
    print(f"   최고: {max(bert_scores):.6f}")
    print(f"   최저: {min(bert_scores):.6f}")
    print(f"   중앙값: {np.median(bert_scores):.6f}")

    print("\n⚖️ 공정성 지표 (대회 공식 계산)")
    print(f"   성별 공정성: {gender_fairness:.6f}")
    print(f"   성별별 성능: {gender_stats['by_group']}")
    print(f"   성별 격차: {gender_stats['gap']:.6f}")
    print("   ")
    print(f"   연령 공정성: {age_fairness:.6f}")
    print(f"   연령대별 성능: {age_stats['by_group']}")
    print(f"   연령 격차: {age_stats['gap']:.6f}")

    # 11. Point estimate.
    # NOTE(review): the 0.85 / 0.95 scaling targets are this notebook's own
    # estimate; confirm against the official scoring rubric.
    print("\n🏆 대회 정량 평가 점수")

    # BERTScore points (max 3)
    bert_score_points = min(3.0, max(0.0, (bert_mean / 0.85) * 3.0))

    # Fairness points (max 2)
    fairness_avg = (gender_fairness + age_fairness) / 2.0
    fairness_points = min(2.0, max(0.0, (fairness_avg / 0.95) * 2.0))

    total_quantitative = bert_score_points + fairness_points

    print(f"   BERTScore: {bert_score_points:.3f}/3.000 점")
    print(f"   공정성 지표: {fairness_points:.3f}/2.000 점")
    print(f"   정량 총점: {total_quantitative:.3f}/5.000 점")
    print(f"   정량 달성률: {total_quantitative/5.0*100:.1f}%")

    # 12. Grade: first band whose threshold is reached, else the lowest grade.
    print("\n🎖️ 성능 등급")
    grade_bands = (
        (4.5, "S급 (최우수)", "즉시 제출 권장"),
        (4.0, "A급 (우수)", "제출 권장"),
        (3.5, "B급 (양호)", "소폭 개선 후 제출"),
        (3.0, "C급 (보통)", "개선 필요"),
    )
    grade, recommendation = "D급 (미흡)", "대폭 개선 필요"
    for threshold, band_grade, band_recommendation in grade_bands:
        if total_quantitative >= threshold:
            grade, recommendation = band_grade, band_recommendation
            break

    print(f"   등급: {grade}")
    print(f"   권장사항: {recommendation}")

    # 13. Best / worst samples (at most two each; fewer if the run is tiny).
    print("\n📝 예측 품질 샘플 (상위/하위 각 2개)")
    print("-" * 80)

    sorted_indices = np.argsort(bert_scores)
    shown = min(2, len(sorted_indices))

    print("🏆 최고 성능 샘플:")
    for idx in sorted_indices[::-1][:shown]:
        print(f"샘플 {idx} (BERTScore: {bert_scores[idx]:.4f})")
        print(f"예측: {predictions[idx][:120]}...")
        print(f"정답: {references[idx][:120]}...")
        print()

    print("⚠️ 최저 성능 샘플:")
    for idx in sorted_indices[:shown]:
        print(f"샘플 {idx} (BERTScore: {bert_scores[idx]:.4f})")
        print(f"예측: {predictions[idx][:120]}...")
        print(f"정답: {references[idx][:120]}...")
        print()

    return {
        'bert_score_mean': bert_mean,
        'bert_score_std': bert_std,
        'bert_scores': bert_scores,
        'gender_fairness': gender_fairness,
        'age_fairness': age_fairness,
        'total_score': total_quantitative,
        'grade': grade,
        'predictions': predictions,
        'references': references,
        'evaluation_samples': eval_samples,
        'processing_time': generation_time
    }

# 실행 (taskB_test.csv 사용)
API_KEY = "cfa06ca698c85aa9c9d4b55440aeef0f85ed94f644cd7b931fdd69f2421c6ecb"
TEST_CSV_PATH = "../data/taskB_train.csv"

# 대회 정확한 조건으로 Test 데이터 평가 실행
test_results = await exact_competition_evaluation(
    train_csv_path=TEST_CSV_PATH,
    api_key=API_KEY
)

print("\n🎉 TaskB Test 데이터 평가 완료!")
print(f"최종 예상 점수: {test_results['total_score']:.3f}/5.000 점")


🏆 대회 정확한 평가 조건 시뮬레이션 - Task B
1. 전체 Test 데이터 로드 중...
   데이터 품질 확인 중...
   전체 데이터: 1000개
   NaN 값: 11개
   유효 데이터: 989개
평가 샘플: 300개 (연속 샘플, 대회 Test 세트와 동일한 크기)

📊 평가 데이터 분포:
   성별 분포: {'M': 154, 'F': 146}
   연령 분포: 평균 63.6세 (범위: 19-91)

2. TaskB 처리기 초기화 (Llama 모델)...
3. AI 예측 생성 중 (API 제한 준수)...
   배치 1/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 2/38 처리 중...
API Error: Error code: 429 - {'error': {'message': 'Rate limit exceeded. Token bucket: 0.00/10.0 tokens. Wait 60s.', 'type': 'rate_limit_error', 'param': None, 'code': 'rate_limit_exceeded'}}, retry 1/3
API Error: Error code: 429 - {'error': {'message': 'Rate limit exceeded. Token bucket: 0.00/10.0 tokens. Wait 60s.', 'type': 'rate_limit_error', 'param': None, 'code': 'rate_limit_exceeded'}}, retry 1/3
API Error: Error code: 429 - {'error': {'message': 'Rate limit exceeded. Token bucket: 0.00/10.0 tokens. Wait 60s.', 'type': 'rate_limit_error', 'param': None, 'code': 'rate_limit_exceeded'}}, retry 1/3
API Error: Error code: 429 - {'