## Datathon 클래스

In [1]:
import re
import os
import pathlib
import pandas as pd
from typing import Any, List, Dict
from typing import Optional, Dict, Any, List, Union
from abc import ABC, abstractmethod
from langchain.prompts import ChatPromptTemplate  # 프롬프트 템플릿 처리용
from langevaluate.config import ModelConfig # LLM 설정용
from langevaluate.llmfactory import LLMFactory  # LLM 팩토리용
from tqdm.asyncio import tqdm_asyncio
import asyncio

class DatathonProcessor(ABC):
    """
    Unified AI processing base class for the datathon.

    Subclasses implement ``preprocess_data``, ``get_prompt_template`` and
    ``postprocess_result`` (and may override ``get_model_name``); this base
    class builds the LLM chain and runs the whole batch through it.
    """

    # Default LLM settings; only ``model_name`` is overridden per subclass
    # (through ``get_model_name``).
    DEFAULT_MODEL_CONFIG = {
        'model_name': 'LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-AWQ',
        'api_base': 'https://api.snubhai.org/api/v1/llm',
        'max_tokens': 2000,
        'seed': 777,
        'temperature': 0,
        'rpm': 10
    }

    def __init__(
        self,
        api_key : str,
    ):
        """Build the LLM client and the prompt | llm chain.

        Args:
            api_key: Key passed to ``ModelConfig`` for the LLM endpoint.
        """
        # Work on a copy so the class-level defaults are never mutated.
        config = self.DEFAULT_MODEL_CONFIG.copy()

        # Only the model name is customised per subclass.
        config['model_name'] = self.get_model_name()

        custom_config = ModelConfig(
            model_name=config['model_name'],
            api_base=config['api_base'],
            api_key=api_key,
            max_tokens=config['max_tokens'],
            seed=config['seed'],
            provider="openai"
        )

        self.llm = LLMFactory.create_llm(
            custom_config,
            temperature=config['temperature'],
            rpm=config['rpm']
        )

        # The template text comes from the subclass.
        self.prompt_template = ChatPromptTemplate.from_template(self.get_prompt_template())
        self.chain = self.prompt_template | self.llm

        # Post-processed outputs of the most recent ``summarize`` call.
        self.results: List[str] = []

        # Metric store (not written by this base class).
        self.metrics: Dict[str, Any] = {}

    def get_model_name(self) -> str:
        """
        Return the model name to use.

        Subclasses may override this to select a different model.
        """
        return self.DEFAULT_MODEL_CONFIG['model_name']

    @abstractmethod
    async def preprocess_data(self, data: Any) -> Dict[str, Any]:
        """Turn one input row into the variables for the prompt template."""
        pass

    @abstractmethod
    def get_prompt_template(self) -> str:
        """Return the prompt template text (implemented by the subclass)."""
        pass

    @abstractmethod
    async def postprocess_result(self, result: Any) -> str:
        """Turn one raw LLM response content into the final output string."""
        pass

    async def summarize(
        self,
        data: pd.DataFrame
    ) -> List[str]:
        """
        Run every row of ``data`` through preprocess -> LLM -> postprocess.

        Each stage is awaited concurrently with ``tqdm_asyncio.gather`` (one
        progress bar per stage); the output order follows the row order.

        Args:
            data: One input record per row; each row is handed to
                ``preprocess_data``.

        Returns:
            The post-processed results, also stored in ``self.results``.
        """
        # Nothing to do for an empty frame: skip the progress bars entirely.
        if data.empty:
            self.results = []
            return self.results

        preprocess_tasks = [self.preprocess_data(row) for _, row in data.iterrows()]
        preprocessed_data = await tqdm_asyncio.gather(*preprocess_tasks)

        # One coroutine per request.
        llm_tasks = [self.chain.ainvoke(prompt_vars) for prompt_vars in preprocessed_data]
        responses = await tqdm_asyncio.gather(*llm_tasks)

        postprocess_tasks = [self.postprocess_result(r.content) for r in responses]
        results = await tqdm_asyncio.gather(*postprocess_tasks)

        # Keep the latest outputs on the instance, as the attribute promises.
        self.results = results
        return results

  from .autonotebook import tqdm as notebook_tqdm


## 자체평가

In [3]:
import pandas as pd
import numpy as np
import asyncio
import time
from typing import Any, Dict, List
from langchain.prompts import ChatPromptTemplate
from langevaluate.config import ModelConfig
from langevaluate.llmfactory import LLMFactory
import re
from scipy.optimize import linear_sum_assignment

# 대회 제공 ICDScore 클래스 (100% 동일)
def parse_icd_codes(icd_string: str) -> List[str]:
    """Parse a string of ICD codes into a list of upper-cased codes.

    Two layouts are accepted: a list literal such as ``"['I213', 'Z9561']"``
    (brackets and quotes are stripped, items split on commas) and a free-form
    string whose codes are separated by commas and/or whitespace.

    Args:
        icd_string: Raw code string; ``None``, ``NaN`` and ``""`` are allowed.

    Returns:
        The codes, stripped and upper-cased, in input order. Empty for
        missing input.
    """
    if not icd_string or pd.isna(icd_string):
        return []

    if icd_string.startswith('[') and icd_string.endswith(']'):
        codes = icd_string.strip('[]').replace("'", "").replace('"', '').split(',')
    else:
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        codes = re.split(r'[,\s]+', icd_string)

    # Splitting can leave empty fragments (leading/trailing separators).
    return [code.strip().upper() for code in codes if code.strip()]

def icd_similarity(code1: str, code2: str) -> float:
    """Return the hierarchy-based similarity of two ICD-10 codes.

    Dots are ignored. The score is the length of the shared leading prefix
    divided by the length of the longer code, so identical codes score 1.0
    and codes that differ in the first character score 0.0.
    """
    if not (code1 and code2):
        return 0.0

    left = code1.replace('.', '')
    right = code2.replace('.', '')

    longest = max(len(left), len(right))
    if longest <= 0:
        # Both codes consisted of dots only.
        return 0.0

    # Count matching characters from the start until the first mismatch.
    shared = 0
    for a, b in zip(left, right):
        if a != b:
            break
        shared += 1

    return shared / longest

def hierarchical_f1(y_true: List[str], y_pred: List[str]) -> float:
    """Compute an F1 score that gives partial credit for near-miss ICD codes.

    Every (true, predicted) pair is scored with ``icd_similarity``; a
    one-to-one assignment maximising the total similarity is found with
    ``linear_sum_assignment`` and that total is used as a "partial true
    positive" count for precision and recall.

    Args:
        y_true: Reference codes for one sample.
        y_pred: Predicted codes for one sample.

    Returns:
        1.0 when both lists are empty, 0.0 when exactly one is empty or
        nothing matches, otherwise the harmonic mean of partial precision
        and partial recall.
    """
    n_true, n_pred = len(y_true), len(y_pred)
    if n_true == 0 and n_pred == 0:
        return 1.0
    if n_true == 0 or n_pred == 0:
        return 0.0

    sim_matrix = np.array(
        [[icd_similarity(true_code, pred_code) for pred_code in y_pred]
         for true_code in y_true],
        dtype=float,
    )

    # linear_sum_assignment minimises cost, so negate to maximise similarity.
    row_ind, col_ind = linear_sum_assignment(-sim_matrix)
    partial_tp = sim_matrix[row_ind, col_ind].sum()

    # Both lengths are non-zero here, so the divisions are safe.
    precision = partial_tp / n_pred
    recall = partial_tp / n_true

    if precision + recall == 0:
        return 0.0

    return float(2 * precision * recall / (precision + recall))

class ICDScore:
    """Callable evaluator producing the hierarchical ICD-10 F1 score per sample."""

    def __init__(self):
        pass

    def __call__(self, refs: List[List[str]], hyps: List[List[str]]) -> List[float]:
        """Score each (reference, prediction) pair with ``hierarchical_f1``.

        Raises:
            ValueError: If ``refs`` and ``hyps`` differ in length.
        """
        if len(refs) != len(hyps):
            raise ValueError(f"참조와 예측 데이터의 길이가 다릅니다: {len(refs)} vs {len(hyps)}")

        return [hierarchical_f1(ref, hyp) for ref, hyp in zip(refs, hyps)]

# 대회 제공 FairnessScore 클래스 (100% 동일)
class FairnessScore:
    """Fairness metric: ratio of the worst to the best per-group mean score.

    Groups are either taken as given (``type="sex"``) or derived by binning
    ages into ``bin_width``-wide intervals (``type="age"``). Details of the
    last call are kept in ``last_stats``.
    """

    def __init__(self, bin_width: int = 10, min_samples_per_group: int = 1):
        self.bin_width = int(bin_width)
        self.min_samples_per_group = int(min_samples_per_group)
        # Filled by __call__: per-group means plus gap/min/max.
        self.last_stats = None

    @staticmethod
    def _ensure_1d(a) -> np.ndarray:
        """Return ``a`` as a 1-D array, flattening an (N, 1) column."""
        arr = np.asarray(a)
        is_column = arr.ndim == 2 and arr.shape[1] == 1
        if is_column:
            arr = arr[:, 0]
        if arr.ndim != 1:
            raise ValueError("Input must be 1D or (N,1) shaped.")
        return arr

    def _bin_ages(self, ages) -> np.ndarray:
        """Map each age to a "start-end" label of its ``bin_width`` interval."""
        ages_arr = self._ensure_1d(ages).astype(float)
        if np.isnan(ages_arr).any():
            raise ValueError("ages contain NaN.")
        if self.bin_width <= 0:
            raise ValueError("bin_width must be positive.")
        width = self.bin_width
        lower = (np.floor(ages_arr / width) * width).astype(int)
        upper = lower + width
        return np.array(
            [f"{lo:d}-{hi:d}" for lo, hi in zip(lower, upper)], dtype=object
        )

    def _groups_from_type(self, groups, type: str) -> np.ndarray:
        """Build the group label array for ``type`` ("sex" or "age")."""
        kind = (type or "sex").lower()
        if kind == "sex":
            return self._ensure_1d(groups)
        if kind == "age":
            return self._bin_ages(groups)
        raise ValueError("type must be 'sex' or 'age'.")

    def __call__(self, groups, scores, type: str = "sex", sample_weight=None) -> float:
        """Return min(group mean) / max(group mean), clipped to [0, 1].

        Scores are clipped to [0, 1] first. Groups with fewer than
        ``min_samples_per_group`` members or non-positive total weight are
        skipped. With at most one remaining group the result is 1.0.
        """
        labels = self._groups_from_type(groups, type=type)
        values = self._ensure_1d(scores).astype(float)
        if values.shape[0] != labels.shape[0]:
            raise ValueError("groups and scores must have the same length.")

        if sample_weight is not None:
            weights = self._ensure_1d(sample_weight).astype(float)
            if weights.shape[0] != values.shape[0]:
                raise ValueError("sample_weight length must match scores.")
        else:
            weights = np.ones_like(values, dtype=float)

        values = np.clip(values, 0.0, 1.0)

        group_means = []
        by_group = {}
        for label in np.unique(labels):
            member = (labels == label)
            if np.sum(member) < self.min_samples_per_group:
                continue
            group_weights = weights[member]
            if np.sum(group_weights) <= 0:
                continue
            mean_score = float(np.average(values[member], weights=group_weights))
            group_means.append(mean_score)
            by_group[str(label)] = mean_score

        if len(group_means) < 2:
            # Nothing to compare against: treated as perfectly fair.
            self.last_stats = {"by_group": by_group, "gap": 0.0, "min": None, "max": None}
            return 1.0

        best = float(np.max(group_means))
        worst = float(np.min(group_means))
        if best == 0.0:
            ratio = 1.0
        else:
            ratio = float(worst / best)
        ratio = float(np.clip(ratio, 0.0, 1.0))

        self.last_stats = {"by_group": by_group, "gap": best - worst, "min": worst, "max": best}
        return ratio

# TaskC Processor (앞서 작성한 코드)
class TaskCProcessor(DatathonProcessor):
    """Task C: ICD code prediction from a hospital course note."""

    # Prompt input used when a record has no usable hospital course text.
    _DEFAULT_INPUT = 'Patient admitted for medical evaluation and management.'

    # Code returned when no ICD code can be extracted from the model output.
    _DEFAULT_CODE = 'Z515'

    # Letter + two digits, optional dotted extension, then any alphanumerics.
    # The leading \b keeps the pattern from matching the tail of an ordinary
    # word (e.g. "COVID19" must not yield "D19").
    _ICD_PATTERN = r'\b[A-Z]\d{2}(?:\.[A-Z0-9]{1,4})?[A-Z0-9]*'

    # Last-resort pattern used only when nothing passes validation.
    _FALLBACK_PATTERN = r'[A-Z]+\d+[A-Z0-9]*'

    def get_model_name(self) -> str:
        """Return the model identifier used for Task C."""
        # Alternative model id left by the author: meta-llama/Llama-3.1-8B-Instruct
        return "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-AWQ"

    def get_prompt_template(self) -> str:
        """Return the few-shot ICD-10 coding prompt; ``{user_input}`` is the note."""
        return """You are a certified professional medical coder (CPC) with 15+ years of experience in ICD-10-CM coding. Analyze the hospital course and assign precise ICD-10 codes based on documented diagnoses and procedures.

CRITICAL ICD-10 CODING REQUIREMENTS:
- Use EXACT ICD-10-CM format with appropriate specificity
- Code only DOCUMENTED diagnoses and conditions
- Apply ICD-10 hierarchical coding rules strictly
- Include laterality, episode of care, and severity when specified
- Prioritize primary reason for admission first
- Follow ICD-10 Official Guidelines for Coding and Reporting

SYSTEMATIC CODING APPROACH:
1. IDENTIFY: Primary diagnosis from chief complaint/admission reason
2. EXTRACT: All documented secondary diagnoses and comorbidities  
3. VERIFY: Each diagnosis against ICD-10-CM classification
4. APPLY: Appropriate seventh characters for injuries/external causes
5. SEQUENCE: Codes by clinical significance and coding guidelines

ENHANCED EXAMPLES WITH EXACT ICD-10 MATCHING:

HOSPITAL COURSE: 67-year-old male with acute ST-elevation myocardial infarction...
Service: CARDIOVASCULAR
Chief Complaint: Chest pain
History: Sudden onset severe chest pain, found to have anterior STEMI...
Cardiac catheterization: 100% occlusion of LAD, successfully stented...
CODES: I213, Z9561

HOSPITAL COURSE: 45-year-old female with type 2 diabetes and diabetic nephropathy...
Service: ENDOCRINOLOGY  
Chief Complaint: Poorly controlled diabetes
History: Long-standing T2DM with complications, presenting with hyperglycemia...
Labs: HbA1c 11.2%, creatinine elevated, proteinuria present...
CODES: E1122, N083

HOSPITAL COURSE: 23-year-old male with closed fracture right tibia from motor vehicle accident...
Service: ORTHOPEDICS
Chief Complaint: Right leg pain after MVA
Imaging: Closed fracture of shaft of right tibia, no displacement...
Treatment: Closed reduction and casting...
CODES: S82201A, V499XXA

HOSPITAL COURSE: 78-year-old female with community-acquired pneumonia and sepsis...
Service: MEDICINE
Chief Complaint: Shortness of breath and fever
History: 3-day history of cough, fever, dyspnea...
CXR: Right lower lobe consolidation consistent with pneumonia...
Blood cultures: Streptococcus pneumoniae isolated...
CODES: J13, R6510

HOSPITAL COURSE: 55-year-old male with chronic kidney disease stage 4 and anemia...
Service: NEPHROLOGY
Chief Complaint: Fatigue and decreased urine output
History: Progressive CKD secondary to hypertension...
Labs: eGFR 25 mL/min, hemoglobin 8.5 g/dL consistent with CKD anemia...
CODES: N184, D631

Now analyze this hospital course and provide precise ICD-10 codes:

HOSPITAL COURSE: {user_input}

CODES:"""

    def _extract_sections(self, hospital_course: str) -> List[str]:
        """Pull the coding-relevant sections out of a note, in a fixed order."""
        sections: List[str] = []

        # Chief complaint (hint for the primary diagnosis).
        cc_match = re.search(r'Chief Complaint:\s*([^\n]+)', hospital_course)
        if cc_match and cc_match.group(1).strip():
            sections.append(f"Chief Complaint: {cc_match.group(1).strip()}")

        # Service line.
        service_match = re.search(r'Service:\s*([^\n]+)', hospital_course)
        if service_match and service_match.group(1).strip():
            sections.append(f"Service: {service_match.group(1).strip()}")

        # History of present illness, capped at 800 characters.
        hpi_match = re.search(
            r'History of Present Illness:\s*(.*?)(?=\n\n|\nPast Medical|Physical Exam|$)',
            hospital_course, re.DOTALL)
        if hpi_match and hpi_match.group(1).strip():
            sections.append(f"Clinical History: {hpi_match.group(1).strip()[:800]}")

        # Past medical history (comorbidities), capped at 600 characters.
        pmh_match = re.search(
            r'Past Medical History:\s*(.*?)(?=\n\n|PAST SURGICAL|Social History|$)',
            hospital_course, re.DOTALL)
        if pmh_match and pmh_match.group(1).strip():
            sections.append(f"Past Medical History: {pmh_match.group(1).strip()[:600]}")

        # Procedures; an explicit "none" carries no coding information.
        proc_match = re.search(
            r'Major Surgical or Invasive Procedure:\s*(.*?)(?=\n\n|History of Present|$)',
            hospital_course, re.DOTALL)
        if proc_match:
            proc = proc_match.group(1).strip()
            if proc and proc.lower() not in ('none', 'none.'):
                sections.append(f"Procedures: {proc}")

        # Up to two lab-like findings.
        lab_findings = re.findall(
            r'(Lab[s]?.*?:.*?(?:[^\n]{50,200}))', hospital_course, re.IGNORECASE | re.DOTALL)
        for i, finding in enumerate(lab_findings[:2]):
            sections.append(f"Diagnostic Finding {i+1}: {finding.strip()}")

        # Up to two imaging impressions, each capped at 300 characters.
        impressions = re.findall(
            r'IMPRESSION:\s*(.*?)(?=\n\n|\n[A-Z_]|\Z)', hospital_course, re.DOTALL)
        for i, imp in enumerate(impressions[:2]):
            sections.append(f"Imaging {i+1}: {imp.strip()[:300]}")

        # Discharge diagnosis; skipped when the captured text is empty so an
        # empty header never becomes the only content sent to the model.
        dd_match = re.search(
            r'(?:Discharge Diagnosis|DISCHARGE DIAGNOSIS):\s*(.*?)(?=\n\n|\n[A-Z]|\Z)',
            hospital_course, re.DOTALL)
        if dd_match and dd_match.group(1).strip():
            sections.append(f"Discharge Diagnosis: {dd_match.group(1).strip()}")

        return sections

    async def preprocess_data(self, data: Any) -> Dict[str, Any]:
        """Condense one record's hospital course into the prompt input.

        Args:
            data: A mapping-like record (dict or pandas row) exposing
                ``get('hospital_course')``.

        Returns:
            ``{'user_input': text}`` where ``text`` is the extracted sections
            (or the first 2500 characters of the note when no section is
            found), with ``___`` runs replaced by ``[REDACTED]``, whitespace
            collapsed and the result capped at 3000 characters. A generic
            sentence is used when the record has no usable text.
        """
        # Records without a .get (None, plain strings, ...) get the default
        # prompt instead of raising.
        getter = getattr(data, 'get', None)
        hospital_course = getter('hospital_course', '') if callable(getter) else ''

        # NaN and other non-string values fail the isinstance check.
        if not isinstance(hospital_course, str) or not hospital_course.strip():
            return {'user_input': self._DEFAULT_INPUT}

        try:
            sections = self._extract_sections(hospital_course)
            if sections:
                processed_text = '\n\n'.join(sections)
            else:
                # No recognisable section: fall back to the head of the note.
                processed_text = hospital_course[:2500]

            processed_text = re.sub(r'___+', '[REDACTED]', processed_text)
            processed_text = re.sub(r'\s+', ' ', processed_text)
            processed_text = processed_text.strip()[:3000]
        except Exception:
            # Best effort: one malformed note must not abort a whole batch,
            # so the raw note is sent instead.
            processed_text = hospital_course

        return {'user_input': processed_text if processed_text else self._DEFAULT_INPUT}

    async def postprocess_result(self, result: str) -> str:
        """Extract ICD codes from the raw model output.

        Args:
            result: Text returned by the model.

        Returns:
            Up to six distinct codes joined by ``", "`` in order of
            appearance, or ``'Z515'`` when nothing can be extracted.
        """
        if not isinstance(result, str) or not result:
            return self._DEFAULT_CODE

        text = result.strip()
        if text.startswith(('CODES:', 'codes:', 'Codes:')):
            text = text.split(':', 1)[1].strip()
        if not text:
            return self._DEFAULT_CODE

        text = text.upper()

        unique_codes: List[str] = []
        seen = set()
        for code in re.findall(self._ICD_PATTERN, text):
            # The scorer in this file ignores dots, so "I21.3" and "I213"
            # are the same code; keep only the first spelling.
            key = code.replace('.', '')
            if key not in seen and self._validate_icd_code(code):
                unique_codes.append(code)
                seen.add(key)

        # Nothing valid: retry with the loose pattern, first three hits only.
        if not unique_codes:
            for code in re.findall(self._FALLBACK_PATTERN, text)[:3]:
                if code not in seen and len(code) >= 3:
                    unique_codes.append(code)
                    seen.add(code)

        final_codes = unique_codes[:6]
        return ', '.join(final_codes) if final_codes else self._DEFAULT_CODE

    def _validate_icd_code(self, code: str) -> bool:
        """Check the basic ICD-10 shape: a letter, two digits, 3-8 characters."""
        if not code or not 3 <= len(code) <= 8:
            return False
        return code[0].isalpha() and code[1:3].isdigit()


# 리더보드 동일 평가 함수
async def exact_taskc_evaluation(
    train_csv_path: str,
    api_key: str,
    max_samples: int = 300,
    batch_size: int = 8,
    wait_seconds: float = 70,
):
    """Run a local Task C evaluation and print a leaderboard-style report.

    Loads the training CSV, predicts ICD codes for the first ``max_samples``
    usable rows in rate-limited batches, scores them with ``ICDScore`` and
    ``FairnessScore`` (sex and 10-year age bins) and prints the results.

    Args:
        train_csv_path: CSV with ``hospital_course``, ``target``, ``gender``
            and ``anchor_age`` columns.
        api_key: Key handed to ``TaskCProcessor``.
        max_samples: Upper bound on the number of rows evaluated.
        batch_size: Number of requests sent concurrently per batch.
        wait_seconds: Pause between batches, to stay within the API rate limit.

    Returns:
        A dict with the mean/std/per-sample ICD scores, both fairness values,
        the estimated point total and grade, the predictions and references,
        the sample count and the generation time in seconds.

    Raises:
        ValueError: If ``batch_size`` or ``max_samples`` is not positive, or
            if no row has both ``hospital_course`` and ``target``.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive.")
    if max_samples <= 0:
        raise ValueError("max_samples must be positive.")

    print("=" * 80)
    print("🏆 Task C 리더보드 정확 평가 시뮬레이션")
    print("=" * 80)

    # 1. Load the data and keep only rows that can be evaluated.
    print("1. 데이터 로드 중...")
    df = pd.read_csv(train_csv_path)
    df = df.dropna(subset=['hospital_course', 'target'])

    eval_samples = min(max_samples, len(df))
    if eval_samples == 0:
        # Fail before any API call; every statistic below needs >= 1 sample.
        raise ValueError("No rows with both 'hospital_course' and 'target' to evaluate.")
    eval_df = df.iloc[:eval_samples].copy()
    print(f"평가 샘플: {eval_samples}개")

    print("\n📊 데이터 분포:")
    print(f"성별: {eval_df['gender'].value_counts().to_dict()}")
    print(f"연령: 평균 {eval_df['anchor_age'].mean():.1f}세")

    # 2. Build the processor.
    print("\n2. TaskC 처리기 초기화 (EXAONE 모델)...")
    processor = TaskCProcessor(api_key)

    # 3. Generate predictions batch by batch.
    print("3. ICD 코드 예측 생성 중...")
    start_time = time.time()

    data_batch = [
        {'hospital_course': note} for note in eval_df['hospital_course'].tolist()
    ]
    total_batches = (len(data_batch) - 1) // batch_size + 1

    predictions = []
    for start in range(0, len(data_batch), batch_size):
        batch = data_batch[start:start + batch_size]
        print(f"   배치 {start // batch_size + 1}/{total_batches} 처리 중...")

        preprocessed = [await processor.preprocess_data(row) for row in batch]

        # One concurrent request per record of the batch.
        responses = await asyncio.gather(
            *(processor.chain.ainvoke(prep) for prep in preprocessed)
        )

        predictions.extend(
            [await processor.postprocess_result(r.content) for r in responses]
        )

        # Pause between batches (not after the last one) for the rate limit.
        if start + batch_size < len(data_batch):
            print(f"   API 제한 준수를 위해 {wait_seconds}초 대기...")
            await asyncio.sleep(wait_seconds)

    generation_time = time.time() - start_time
    print(f"예측 생성 완료 (총 소요 시간: {generation_time:.1f}초)")

    # 4. Parse references and predictions with the same parser.
    references = eval_df['target'].tolist()

    print("\n4. ICD 코드 파싱 중...")
    ref_codes = [parse_icd_codes(ref) for ref in references]
    pred_codes = [parse_icd_codes(pred) for pred in predictions]

    print(f"파싱 완료: 정답 {len(ref_codes)}개, 예측 {len(pred_codes)}개")

    # 5. ICDScore.
    print("5. ICDScore 계산 중...")
    icd_scorer = ICDScore()
    icd_scores = icd_scorer(refs=ref_codes, hyps=pred_codes)
    icd_mean = np.mean(icd_scores)
    icd_std = np.std(icd_scores)

    # 6. Fairness; last_stats is overwritten per call, so read it right away.
    print("6. 공정성 지표 계산 중...")
    fairness_scorer = FairnessScore(bin_width=10, min_samples_per_group=1)

    gender_fairness = fairness_scorer(
        groups=eval_df['gender'].tolist(),
        scores=icd_scores,
        type='sex'
    )
    gender_stats = fairness_scorer.last_stats

    age_fairness = fairness_scorer(
        groups=eval_df['anchor_age'].tolist(),
        scores=icd_scores,
        type='age'
    )
    age_stats = fairness_scorer.last_stats

    # 7. Report.
    print("\n" + "=" * 80)
    print("🎯 Task C 리더보드 정확 평가 결과")
    print("=" * 80)

    print("📊 ICDScore (대회 공식 계산)")
    print(f"   평균: {icd_mean:.6f}")
    print(f"   표준편차: {icd_std:.6f}")
    print(f"   최고: {max(icd_scores):.6f}")
    print(f"   최저: {min(icd_scores):.6f}")
    print(f"   중앙값: {np.median(icd_scores):.6f}")

    print("\n⚖️ 공정성 지표 (대회 공식 계산)")
    print(f"   성별 공정성: {gender_fairness:.6f}")
    print(f"   성별별 성능: {gender_stats['by_group']}")
    print(f"   성별 격차: {gender_stats['gap']:.6f}")
    print("   ")
    print(f"   연령 공정성: {age_fairness:.6f}")
    print(f"   연령대별 성능: {age_stats['by_group']}")
    print(f"   연령 격차: {age_stats['gap']:.6f}")

    # 8. Point estimate (Task C quantitative part is out of 9).
    print("\n🏆 Task C 정량 평가 점수")

    # ICDScore: up to 6 points, full marks at a mean of 0.70.
    icd_score_points = min(6.0, max(0.0, (icd_mean / 0.70) * 6.0))

    # Fairness: up to 3 points, full marks at an average of 0.95.
    fairness_avg = (gender_fairness + age_fairness) / 2.0
    fairness_points = min(3.0, max(0.0, (fairness_avg / 0.95) * 3.0))

    total_quantitative = icd_score_points + fairness_points

    print(f"   ICDScore: {icd_score_points:.3f}/6.000 점")
    print(f"   공정성 지표: {fairness_points:.3f}/3.000 점")
    print(f"   정량 총점: {total_quantitative:.3f}/9.000 점")
    print(f"   정량 달성률: {total_quantitative/9.0*100:.1f}%")

    # 9. Grade.
    print("\n🎖️ 성능 등급")
    if total_quantitative >= 7.5:
        grade = "S급 (최우수)"
        recommendation = "즉시 제출 권장"
    elif total_quantitative >= 6.5:
        grade = "A급 (우수)"
        recommendation = "제출 권장"
    elif total_quantitative >= 5.5:
        grade = "B급 (양호)"
        recommendation = "소폭 개선 후 제출"
    else:
        grade = "C급 (보통)"
        recommendation = "개선 필요"

    print(f"   등급: {grade}")
    print(f"   권장사항: {recommendation}")

    # 10. Best and worst samples.
    print("\n📝 예측 품질 샘플 (상위/하위 각 3개)")
    print("-" * 80)

    sorted_indices = np.argsort(icd_scores)
    # Never index past the available samples when fewer than 3 were scored.
    shown = min(3, len(sorted_indices))

    print("🏆 최고 성능 샘플:")
    for rank in range(shown):
        idx = sorted_indices[-(rank + 1)]
        print(f"샘플 {idx} (ICDScore: {icd_scores[idx]:.4f})")
        print(f"정답: {ref_codes[idx]}")
        print(f"예측: {pred_codes[idx]}")
        print()

    print("⚠️ 최저 성능 샘플:")
    for rank in range(shown):
        idx = sorted_indices[rank]
        print(f"샘플 {idx} (ICDScore: {icd_scores[idx]:.4f})")
        print(f"정답: {ref_codes[idx]}")
        print(f"예측: {pred_codes[idx]}")
        print()

    return {
        'icd_score_mean': icd_mean,
        'icd_score_std': icd_std,
        'icd_scores': icd_scores,
        'gender_fairness': gender_fairness,
        'age_fairness': age_fairness,
        'total_score': total_quantitative,
        'grade': grade,
        'predictions': predictions,
        'references': references,
        'evaluation_samples': eval_samples,
        'processing_time': generation_time
    }

# 실행
API_KEY = "cfa06ca698c85aa9c9d4b55440aeef0f85ed94f644cd7b931fdd69f2421c6ecb"
TRAIN_CSV_PATH = "../data/taskC_train.csv"

# Task C 리더보드 정확 평가 실행
taskc_results = await exact_taskc_evaluation(
    train_csv_path=TRAIN_CSV_PATH,
    api_key=API_KEY
)

print(f"\n🎉 Task C 리더보드 평가 완료!")
print(f"최종 예상 점수: {taskc_results['total_score']:.3f}/9.000 점")


🏆 Task C 리더보드 정확 평가 시뮬레이션
1. 데이터 로드 중...
평가 샘플: 300개

📊 데이터 분포:
성별: {'M': 155, 'F': 145}
연령: 평균 63.4세

2. TaskC 처리기 초기화 (EXAONE 모델)...
3. ICD 코드 예측 생성 중...
   배치 1/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 2/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 3/38 처리 중...
API Error: Error code: 429 - {'error': {'message': 'Rate limit exceeded. Token bucket: 0.00/10.0 tokens. Wait 60s.', 'type': 'rate_limit_error', 'param': None, 'code': 'rate_limit_exceeded'}}, retry 1/3
API Error: Error code: 429 - {'error': {'message': 'Rate limit exceeded. Token bucket: 0.00/10.0 tokens. Wait 60s.', 'type': 'rate_limit_error', 'param': None, 'code': 'rate_limit_exceeded'}}, retry 1/3
API Error: Error code: 429 - {'error': {'message': 'Rate limit exceeded. Token bucket: 0.00/10.0 tokens. Wait 60s.', 'type': 'rate_limit_error', 'param': None, 'code': 'rate_limit_exceeded'}}, retry 1/3
API Error: Error code: 429 - {'error': {'message': 'Rate limit exceeded. Token bucket: 0.00/10.0 tokens. Wait 60s.', 'type':