## Datathon 클래스

In [2]:
import re
import os
import pathlib
import pandas as pd
from typing import Any, List, Dict
from typing import Optional, Dict, Any, List, Union
from abc import ABC, abstractmethod
from langchain.prompts import ChatPromptTemplate  # 프롬프트 템플릿 처리용
from langevaluate.config import ModelConfig # LLM 설정용
from langevaluate.llmfactory import LLMFactory  # LLM 팩토리용
from tqdm.asyncio import tqdm_asyncio
import asyncio

class DatathonProcessor(ABC):
    """
    Abstract base for the datathon LLM pipelines.

    The constructor builds an LLM client and chains it behind a prompt
    template. Subclasses provide ``preprocess_data``, ``get_prompt_template``
    and ``postprocess_result`` (and may override ``get_model_name``);
    ``summarize`` then runs preprocess -> LLM -> postprocess over every row
    of a DataFrame.
    """

    # Default LLM settings. ``model_name`` is replaced by get_model_name()
    # when an instance is created.
    DEFAULT_MODEL_CONFIG = {
        'model_name': 'LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-AWQ',
        'api_base': 'https://api.snubhai.org/api/v1/llm',
        'max_tokens': 2000,
        'seed': 777,
        'temperature': 0,
        'rpm': 10
    }

    def __init__(
        self,
        api_key : str,
    ):
        """
        Create the LLM client and the ``prompt | llm`` chain.

        Args:
            api_key: API key forwarded to ``ModelConfig``.
        """
        # Work on a copy so the class-level defaults are never modified.
        settings = dict(self.DEFAULT_MODEL_CONFIG)
        settings['model_name'] = self.get_model_name()

        llm_config = ModelConfig(
            model_name=settings['model_name'],
            api_base=settings['api_base'],
            api_key=api_key,
            max_tokens=settings['max_tokens'],
            seed=settings['seed'],
            provider="openai"
        )
        self.llm = LLMFactory.create_llm(
            llm_config,
            temperature=settings['temperature'],
            rpm=settings['rpm']
        )

        # The template text comes from the subclass.
        template_text = self.get_prompt_template()
        self.prompt_template = ChatPromptTemplate.from_template(template_text)
        self.chain = self.prompt_template | self.llm

        # Containers for results and metrics; nothing in this class fills them.
        self.results: List[str] = []
        self.metrics: Dict[str, Any] = {}

    def get_model_name(self) -> str:
        """
        Return the model name to use.

        Subclasses may override this to select a different model; the base
        implementation returns the default from ``DEFAULT_MODEL_CONFIG``.
        """
        return self.DEFAULT_MODEL_CONFIG['model_name']

    @abstractmethod
    async def preprocess_data(self, data: Any) -> Dict[str, Any]:
        """Turn one input record into the variables passed to the chain."""

    @abstractmethod
    def get_prompt_template(self) -> str:
        """Return the prompt template text (implemented by the subclass)."""

    @abstractmethod
    async def postprocess_result(self, result: Any) -> str:
        """Turn the content of one LLM response into the final output string."""

    async def summarize(
        self,
        data: pd.DataFrame
    ) -> List[str]:
        """
        Run the full pipeline over every row of ``data``.

        Each stage (preprocess, LLM call, postprocess) is awaited as one
        ``tqdm_asyncio.gather`` batch, so each stage shows its own progress bar.

        Args:
            data: DataFrame whose rows are passed one by one to
                ``preprocess_data``.

        Returns:
            The post-processed outputs as returned by the final gather.
        """
        # Stage 1: build the prompt variables for every row.
        prompt_inputs = await tqdm_asyncio.gather(
            *[self.preprocess_data(record) for _, record in data.iterrows()]
        )

        # Stage 2: one chain invocation per row, awaited together.
        responses = await tqdm_asyncio.gather(
            *[self.chain.ainvoke(prompt_vars) for prompt_vars in prompt_inputs]
        )

        # Stage 3: post-process the ``content`` of every response.
        outputs = await tqdm_asyncio.gather(
            *[self.postprocess_result(response.content) for response in responses]
        )

        return outputs

  from .autonotebook import tqdm as notebook_tqdm


## Few-shot용 데이터 추출

In [19]:
# Task C 데이터 분석 및 Few-shot 예시 추출
import pandas as pd
import re
from collections import Counter
import numpy as np

def analyze_taskc_for_optimal_examples(train_csv_path: str):
    """Load the Task C training CSV and derive per-row ICD/text statistics.

    Rows missing ``hospital_course`` or ``target`` are dropped. The columns
    ``parsed_icd``, ``icd_count``, ``text_length`` and ``word_count`` are
    added, and summary statistics are printed.

    Args:
        train_csv_path: Path of a CSV file that has ``hospital_course`` and
            ``target`` columns.

    Returns:
        The filtered DataFrame with the derived columns added.
    """

    df = pd.read_csv(train_csv_path)
    print(f"전체 데이터: {len(df)}개")

    # Keep only rows that have both the note text and the label.
    df = df.dropna(subset=['hospital_course', 'target'])
    print(f"유효 데이터: {len(df)}개")

    def parse_icd_codes(icd_string):
        """Split a target cell into a list of upper-cased ICD codes."""
        # Checking the type first also covers NaN/None and avoids calling
        # pd.isna on list-like values, whose truth value is ambiguous.
        if not isinstance(icd_string, str):
            return []

        if icd_string.startswith('[') and icd_string.endswith(']'):
            # Stringified Python list, e.g. "['I609', 'R001']".
            codes = icd_string.strip('[]').replace("'", "").replace('"', '').split(',')
        else:
            # Raw string: '\s' in a normal string literal is an invalid escape.
            codes = re.split(r'[,\s]+', icd_string)

        return [code.strip().upper() for code in codes if code.strip()]

    df['parsed_icd'] = df['target'].apply(parse_icd_codes)
    df['icd_count'] = df['parsed_icd'].apply(len)
    df['text_length'] = df['hospital_course'].str.len()
    df['word_count'] = df['hospital_course'].str.split().str.len()

    print(f"\n📊 기본 통계:")
    print(f"ICD 코드 개수: 평균 {df['icd_count'].mean():.1f}, 범위 {df['icd_count'].min()}-{df['icd_count'].max()}")
    print(f"텍스트 길이: 평균 {df['text_length'].mean():.0f}자, 범위 {df['text_length'].min()}-{df['text_length'].max()}")
    print(f"단어 수: 평균 {df['word_count'].mean():.0f}개, 범위 {df['word_count'].min()}-{df['word_count'].max()}")

    return df

def extract_high_quality_icd_examples(df, top_n=20):
    """Score every row with a heuristic quality score and return the best rows.

    The frame is modified in place: ``quality_score`` and ``icd_categories``
    columns are added to ``df``.

    Points awarded per row:
        +5  ``icd_count`` between 2 and 6 (inclusive)
        +3  ``text_length`` between 1000 and 8000 (inclusive)
        +2  note contains the literal ``Chief Complaint:``
        +3  codes start with at least two different first characters
        +2  note contains ``Past Medical History``, ``Physical Exam`` or
            ``History of Present Illness``

    Args:
        df: Frame with ``hospital_course``, ``parsed_icd``, ``icd_count`` and
            ``text_length`` columns.
        top_n: Number of rows to return.

    Returns:
        The ``top_n`` rows with the largest ``quality_score``, restricted to
        the columns relevant for inspecting examples.
    """

    def count_leading_letters(icd_list):
        # Categories are approximated by the first character of each code.
        return len({code[0] for code in icd_list if code})

    df['quality_score'] = 0
    df['icd_categories'] = df['parsed_icd'].apply(count_leading_letters)

    notes = df['hospital_course']
    scoring_rules = [
        ((df['icd_count'] >= 2) & (df['icd_count'] <= 6), 5),
        ((df['text_length'] >= 1000) & (df['text_length'] <= 8000), 3),
        (notes.str.contains('Chief Complaint:', na=False), 2),
        (df['icd_categories'] >= 2, 3),
        (notes.str.contains('Past Medical History|Physical Exam|History of Present Illness', na=False), 2),
    ]
    for matches, points in scoring_rules:
        df.loc[matches, 'quality_score'] += points

    best = df.nlargest(top_n, 'quality_score')
    report_columns = ['hospital_course', 'parsed_icd', 'quality_score',
                      'icd_count', 'icd_categories', 'text_length']
    return best[report_columns]

def extract_examples_by_complexity(df, n_each=5):
    """Pick the best examples from each complexity tier (simple/medium/complex).

    ``complexity_score`` is added to ``df`` in place. Rows are split at the
    33% and 67% quantiles of that score; inside each tier a simplified
    quality score is computed on a copy and the top ``n_each`` rows are kept.

    Args:
        df: Frame with ``hospital_course``, ``parsed_icd``, ``icd_count``,
            ``text_length`` and ``icd_categories`` columns.
        n_each: Number of rows to keep per tier.

    Returns:
        Dict mapping tier name to a DataFrame; empty tiers are omitted.
    """

    # Complexity combines code count, text length and category diversity.
    df['complexity_score'] = (
        df['icd_count'] * 2 +
        (df['text_length'] / 1000) +
        df['icd_categories'] * 1.5
    )

    lower_cut, upper_cut = df['complexity_score'].quantile([0.33, 0.67]).values
    complexity = df['complexity_score']

    tiers = [
        ('simple', df[complexity <= lower_cut]),
        ('medium', df[(complexity > lower_cut) & (complexity <= upper_cut)]),
        ('complex', df[complexity > upper_cut]),
    ]

    selected = {}
    for tier_name, tier_rows in tiers:
        if len(tier_rows) == 0:
            continue

        # Score on a copy so the tier-local score never reaches ``df``.
        ranked = tier_rows.copy()
        ranked['quality_score'] = 0

        code_count_ok = (ranked['icd_count'] >= 1) & (ranked['icd_count'] <= 8)
        ranked.loc[code_count_ok, 'quality_score'] += 3

        has_headers = ranked['hospital_course'].str.contains('Chief Complaint|History of Present', na=False)
        ranked.loc[has_headers, 'quality_score'] += 2

        best = ranked.nlargest(n_each, 'quality_score')
        selected[tier_name] = best[['hospital_course', 'parsed_icd', 'complexity_score', 'quality_score']]

    return selected

def extract_common_icd_patterns(df, top_n=15):
    """Return the rows that best cover the most frequent ICD codes.

    The 20 most frequent codes across ``parsed_icd`` are collected. Each row
    gets ``common_code_count`` (how many distinct frequent codes it contains)
    and ``pattern_score = common_code_count * 2 + quality_score``; both
    columns are added to ``df`` in place.

    Args:
        df: Frame with ``hospital_course``, ``parsed_icd`` and
            ``quality_score`` columns.
        top_n: Number of rows to return.

    Returns:
        The ``top_n`` rows with the largest ``pattern_score``.

    Raises:
        KeyError: If ``df`` has no ``quality_score`` column.
    """
    # Fail before touching the frame instead of half-way through the mutation.
    if 'quality_score' not in df.columns:
        raise KeyError("quality_score")

    # Frequency of every code over all rows.
    code_counts = Counter(code for codes in df['parsed_icd'] for code in codes)
    common_codes = [code for code, _ in code_counts.most_common(20)]

    print(f"\n상위 20개 ICD 코드: {common_codes[:10]}...")

    # Build the lookup set once rather than once per row.
    common_lookup = frozenset(common_codes)

    def count_common_codes(icd_list):
        return len(common_lookup.intersection(icd_list))

    df['common_code_count'] = df['parsed_icd'].apply(count_common_codes)

    # Favour rows that contain many frequent codes and also scored well.
    df['pattern_score'] = df['common_code_count'] * 2 + df['quality_score']

    top_patterns = df.nlargest(top_n, 'pattern_score')

    return top_patterns[['hospital_course', 'parsed_icd', 'common_code_count', 'pattern_score']]

# 1. Analyze the full training set.
df = analyze_taskc_for_optimal_examples('./data/taskC_train.csv')

# 2. Extract the 15 highest-quality examples and print the first five.
high_quality = extract_high_quality_icd_examples(df, top_n=15)
for rank, (_, example) in enumerate(high_quality.head(5).iterrows(), start=1):
    summary_lines = [
        f'=== 고품질 예시 {rank} (점수: {example["quality_score"]}) ===',
        f'ICD 개수: {len(example["parsed_icd"])}, 카테고리: {example["icd_categories"]}',
        f'ICD 코드: {example["parsed_icd"]}',
        f'텍스트 길이: {example["text_length"]}자',
        f'병원 기록: {example["hospital_course"][:200]}...',
        '-' * 80,
    ]
    for line in summary_lines:
        print(line)

# 3. Examples per complexity tier (relies on columns added in step 2).
by_complexity = extract_examples_by_complexity(df, n_each=3)
for tier_name, tier_examples in by_complexity.items():
    print(f'\n=== {tier_name.upper()} 복잡도 예시 ===')
    print(tier_examples[['parsed_icd', 'complexity_score']].head(2))

# 4. Examples covering the most frequent codes.
common_patterns = extract_common_icd_patterns(df, top_n=10)
print(common_patterns[['parsed_icd', 'common_code_count']].head())


전체 데이터: 1000개
유효 데이터: 1000개

📊 기본 통계:
ICD 코드 개수: 평균 1.6, 범위 1-6
텍스트 길이: 평균 6192자, 범위 984-32945
단어 수: 평균 890개, 범위 126-4437
=== 고품질 예시 1 (점수: 15) ===
ICD 개수: 2, 카테고리: 2
ICD 코드: ['S066X1A', 'W1830XA']
텍스트 길이: 1365자
병원 기록: Name:  ___               Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: NEUROSURGERY
 
Allergies: 
No Known Allergies / ...
--------------------------------------------------------------------------------
=== 고품질 예시 2 (점수: 15) ===
ICD 개수: 2, 카테고리: 2
ICD 코드: ['M5489', 'R339']
텍스트 길이: 5299자
병원 기록: Name:  ___                 Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: MEDICINE
 
Allergies: 
lisinopril
 
Attending:...
--------------------------------------------------------------------------------
=== 고품질 예시 3 (점수: 15) ===
ICD 개수: 2, 카테고리: 2
ICD 코드: ['I609', 'R001']
텍스트 길이: 4136자
병원 기록: Name:  ___                Un

## 프롬프팅 적용

In [20]:
class TaskCProcessor(DatathonProcessor):
    """Task C processor: predict ICD-10 codes from a hospital-course note.

    ``preprocess_data`` condenses a note into its key sections,
    ``get_prompt_template`` supplies a few-shot coding prompt and
    ``postprocess_result`` reduces the model reply to a comma-separated list
    of codes.
    """

    # Hard cap (characters) on the text substituted into ``{user_input}``.
    _MAX_INPUT_CHARS = 2000
    # Maximum number of codes returned per note.
    _MAX_CODES = 5
    # Returned when the reply contains nothing code-like.
    # NOTE(review): Z51.5 is "Encounter for palliative care" in ICD-10-CM, not
    # an aftercare code as the previous comment said; confirm the intended default.
    _DEFAULT_CODE = 'Z515'
    # One letter + two digits + optional alphanumerics, optionally followed by
    # a dotted extension ("I60.9"). The lookbehind stops matches that begin
    # inside a longer token (e.g. "D10" inside "ICD10").
    _ICD_PATTERN = r'(?<![A-Z0-9])[A-Z]\d{2}[A-Z0-9]*(?:\.[A-Z0-9]{1,4})?'
    # Looser letters+digits pattern, used only when the strict one finds nothing.
    _FALLBACK_PATTERN = r'[A-Z]+\d+[A-Z0-9]*'

    def get_model_name(self) -> str:
        """Return the model used for ICD coding."""
        return "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-AWQ"

    def get_prompt_template(self) -> str:
        """Return the few-shot ICD-10 coding prompt with a ``{user_input}`` slot."""
        # NOTE(review): the third example's summary says "acute chest pain"
        # while its chief complaint and CT describe a headache / subarachnoid
        # hemorrhage. The text is left unchanged so prompts stay reproducible.
        return """You are an expert medical coder with 10+ years of experience in ICD-10 coding. Analyze the hospital course and assign the most appropriate ICD-10 codes.

CRITICAL REQUIREMENTS:
- Focus on PRIMARY diagnoses and significant conditions only
- Use exact ICD-10 format (e.g., I82431, S066X1A)
- Maintain consistent coding standards regardless of patient demographics
- Prioritize conditions that required active treatment during admission
- Consider hierarchical relationships in ICD-10 classification

CODING METHODOLOGY:
1. Identify Chief Complaint and primary reason for admission
2. Extract documented diagnoses from medical record
3. Prioritize active conditions over chronic stable conditions
4. Apply appropriate specificity and laterality codes
5. Include significant complications or comorbidities

EXAMPLES:

HOSPITAL COURSE: Patient with traumatic brain injury following fall...
Service: NEUROSURGERY
Chief Complaint: Head trauma
History: Fall from ladder with loss of consciousness...
Physical Exam: GCS 14, focal neurological deficits...
Imaging: CT head shows subdural hematoma...
CODES: S066X1A, W1830XA

HOSPITAL COURSE: Elderly female with urinary retention and back pain...  
Service: MEDICINE
Chief Complaint: Unable to urinate, back pain
History: Progressive back pain over 2 weeks, now with urinary retention...
Past Medical History: Osteoporosis, hypertension...
MRI: Lumbar spinal stenosis at L4-5...
CODES: M5489, R339

HOSPITAL COURSE: Middle-aged male presents with acute chest pain...
Service: NEUROSURGERY  
Chief Complaint: Sudden severe headache
History: Sudden onset worst headache of life, found down at home...
CT: Subarachnoid hemorrhage, no aneurysm identified...
CODES: I609, R001

HOSPITAL COURSE: Young female with psychiatric history presents with overdose...
Service: MEDICINE
Chief Complaint: Intentional overdose
History: Depression with medication overdose, found by roommate...
Toxicology: Elevated drug levels consistent with overdose...
Psychiatry: Adjustment of medications, safety planning...
CODES: F308, T380X5A, Y9289

HOSPITAL COURSE: Male patient with fever and respiratory symptoms...
Service: MEDICINE  
Chief Complaint: Fever, shortness of breath
History: 3-day history of fever, cough, dyspnea...
Labs: Elevated WBC, abnormal inflammatory markers...
CXR: Bilateral infiltrates consistent with pneumonia...
CODES: D72829, J189, R509

Now analyze this hospital course and provide ICD-10 codes:

HOSPITAL COURSE: {user_input}

CODES:"""

    async def preprocess_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Condense one record's hospital course into the prompt input.

        Extracts Chief Complaint, Service, History of Present Illness (500
        chars), Past Medical History (300), admission physical exam (400) and
        up to two IMPRESSION blocks (200 each). Redaction markers are replaced,
        whitespace is collapsed and the result is capped at 2000 characters.
        When no section is recognised, the raw note is used so the model never
        receives an empty input for a non-empty note.

        Args:
            data: Mapping (dict or DataFrame row) with a ``hospital_course`` entry.

        Returns:
            ``{'user_input': text}``; the text is empty when the value is not
            a string (for example NaN).
        """
        import re

        hospital_course = data['hospital_course']

        # A str is never NA, so the type check alone covers NaN/None as well.
        if not isinstance(hospital_course, str):
            return {'user_input': ''}

        def first_match(pattern, flags=0, limit=None):
            """Return the stripped, length-capped first capture, or None."""
            found = re.search(pattern, hospital_course, flags)
            if found is None:
                return None
            return found.group(1).strip()[:limit]

        # (output label, pattern, regex flags, max characters kept)
        section_rules = (
            ("Chief Complaint", r'Chief Complaint:\s*([^\n]+)', 0, None),
            ("Service", r'Service:\s*([^\n]+)', 0, None),
            ("History",
             r'History of Present Illness:\s*(.*?)(?=\n\n|\nPast Medical|$)',
             re.DOTALL, 500),
            ("Past Medical History",
             r'Past Medical History:\s*(.*?)(?=\n\n|PAST SURGICAL|Social History|$)',
             re.DOTALL, 300),
        )

        important_sections = []
        for label, pattern, flags, limit in section_rules:
            content = first_match(pattern, flags, limit)
            if content is not None:
                important_sections.append(f"{label}: {content}")

        # NOTE(review): the guard looks for "Physical Exam:" while the pattern
        # needs the upper-case "ADMISSION PHYSICAL EXAM:" sub-header, so both
        # must be present. Kept as-is; confirm against the note format.
        if 'Physical Exam:' in hospital_course:
            exam = first_match(
                r'ADMISSION PHYSICAL EXAM:\s*(.*?)(?=DISCHARGE|Pertinent|$)',
                re.DOTALL, 400)
            if exam is not None:
                important_sections.append(f"Physical Exam: {exam}")

        # Imaging: keep only the first two IMPRESSION blocks.
        impressions = re.findall(r'IMPRESSION:\s*(.*?)(?=\n\n|\n[A-Z_]|\Z)', hospital_course, re.DOTALL)
        for number, impression in enumerate(impressions[:2], start=1):
            important_sections.append(f"Imaging {number}: {impression.strip()[:200]}")

        # Fall back to the raw note when no known header was found.
        if important_sections:
            source_text = '\n\n'.join(important_sections)
        else:
            source_text = hospital_course

        processed_text = re.sub(r'___+', '[REDACTED]', source_text)
        processed_text = re.sub(r'\s+', ' ', processed_text)
        processed_text = processed_text[:self._MAX_INPUT_CHARS]

        return {'user_input': processed_text.strip()}

    async def postprocess_result(self, result: str) -> str:
        """Reduce a model reply to a comma-separated list of ICD codes.

        A leading ``CODES:`` label is removed, codes are matched
        case-insensitively, dots are dropped (``I60.9`` -> ``I609``) and
        duplicates are removed while keeping first-seen order. At most five
        codes are returned. If the strict pattern finds nothing, a looser
        pattern is tried (at most three codes); if that also fails the default
        code is returned.

        Args:
            result: Raw text content of the model response.

        Returns:
            Codes joined by ``', '``, or the default code.
        """
        import re

        text = result.strip()

        if text.startswith(('CODES:', 'codes:', 'Codes:')):
            text = text.split(':', 1)[1].strip()
        text = text.upper()

        # Normalise dotted codes to the dot-free form used in the prompt.
        codes = [match.replace('.', '') for match in re.findall(self._ICD_PATTERN, text)]
        unique_codes = list(dict.fromkeys(codes))  # de-duplicate, keep order

        if not unique_codes:
            loose_codes = re.findall(self._FALLBACK_PATTERN, text)
            unique_codes = list(dict.fromkeys(loose_codes))[:3]

        final_codes = unique_codes[:self._MAX_CODES]

        if not final_codes:
            return self._DEFAULT_CODE

        return ', '.join(final_codes)


## 자체 평가(EXAONE)

In [21]:
import pandas as pd
import numpy as np
import asyncio
import time
from typing import Any, Dict, List
from langchain.prompts import ChatPromptTemplate
from langevaluate.config import ModelConfig
from langevaluate.llmfactory import LLMFactory
import re
from scipy.optimize import linear_sum_assignment

# 대회 제공 ICDScore 클래스 (100% 동일)
def parse_icd_codes(icd_string: str) -> List[str]:
    """Parse an ICD code string into a list of upper-cased codes.

    Accepts either a stringified list (``"['I609', 'R001']"``) or codes
    separated by commas and/or whitespace.

    Args:
        icd_string: Raw code string; empty or NaN values give an empty list.

    Returns:
        The non-empty codes, stripped and upper-cased, in input order.
    """
    if not icd_string or pd.isna(icd_string):
        return []

    if icd_string.startswith('[') and icd_string.endswith(']'):
        codes = icd_string.strip('[]').replace("'", "").replace('"', '').split(',')
    else:
        # Raw string: '\s' in a normal string literal is an invalid escape.
        codes = re.split(r'[,\s]+', icd_string)

    cleaned_codes = [code.strip().upper() for code in codes if code.strip()]
    return cleaned_codes

def icd_similarity(code1: str, code2: str) -> float:
    """Return the hierarchy-based similarity of two ICD-10 codes.

    Dots are ignored. The score is the length of the common prefix divided
    by the length of the longer code, so identical codes score 1.0 and codes
    that differ in the first character score 0.0.

    Args:
        code1: First code.
        code2: Second code.

    Returns:
        Similarity in ``[0.0, 1.0]``; 0.0 when either code is empty.
    """
    if not (code1 and code2):
        return 0.0

    left = code1.replace('.', '')
    right = code2.replace('.', '')

    longest = max(len(left), len(right))
    if longest <= 0:
        return 0.0

    # Count matching leading characters; zip stops at the shorter code.
    shared = 0
    for char_left, char_right in zip(left, right):
        if char_left != char_right:
            break
        shared += 1

    return shared / longest

def hierarchical_f1(y_true: List[str], y_pred: List[str]) -> float:
    """Compute an F1 score that gives partial credit for near-miss codes.

    Every true code is paired with at most one predicted code so that the
    summed ``icd_similarity`` is maximal; that sum acts as a fractional
    true-positive count for precision and recall.

    Args:
        y_true: Reference codes.
        y_pred: Predicted codes.

    Returns:
        1.0 when both lists are empty, 0.0 when exactly one is empty or no
        similarity is found, otherwise the harmonic mean of the partial
        precision and recall.
    """
    n_true, n_pred = len(y_true), len(y_pred)
    if n_true == 0 and n_pred == 0:
        return 1.0
    if n_true == 0 or n_pred == 0:
        return 0.0

    # Rows are true codes, columns are predicted codes.
    similarities = np.zeros((n_true, n_pred))
    for row, true_code in enumerate(y_true):
        similarities[row, :] = [icd_similarity(true_code, pred_code) for pred_code in y_pred]

    # linear_sum_assignment minimises cost, so negate to maximise similarity.
    matched_rows, matched_cols = linear_sum_assignment(-similarities)
    partial_tp = similarities[matched_rows, matched_cols].sum()

    precision = partial_tp / n_pred
    recall = partial_tp / n_true

    denominator = precision + recall
    if denominator == 0:
        return 0.0

    return 2 * precision * recall / denominator

class ICDScore:
    """Callable evaluator producing one hierarchical F1 score per sample."""

    def __init__(self):
        # Stateless: nothing to configure.
        pass

    def __call__(self, refs: List[List[str]], hyps: List[List[str]]) -> List[float]:
        """Score each (reference, prediction) pair with ``hierarchical_f1``.

        Args:
            refs: Reference code lists, one per sample.
            hyps: Predicted code lists, aligned with ``refs``.

        Returns:
            One score per sample, in input order.

        Raises:
            ValueError: If ``refs`` and ``hyps`` differ in length.
        """
        if len(refs) != len(hyps):
            raise ValueError(f"참조와 예측 데이터의 길이가 다릅니다: {len(refs)} vs {len(hyps)}")

        return [hierarchical_f1(ref, hyp) for ref, hyp in zip(refs, hyps)]

# 대회 제공 FairnessScore 클래스 (100% 동일)
class FairnessScore:
    """Fairness metric: ratio of the worst to the best per-group mean score.

    Groups are either raw labels (``type="sex"``) or age bins of width
    ``bin_width`` (``type="age"``). Details of the last call are kept in
    ``last_stats``.
    """

    def __init__(self, bin_width: int = 10, min_samples_per_group: int = 1):
        """
        Args:
            bin_width: Width of the age bins used when ``type="age"``.
            min_samples_per_group: Groups with fewer samples are ignored.
        """
        self.bin_width = int(bin_width)
        self.min_samples_per_group = int(min_samples_per_group)
        # Filled by __call__ with per-group means, gap, min and max.
        self.last_stats = None

    @staticmethod
    def _ensure_1d(a) -> np.ndarray:
        """Return ``a`` as a 1-D array, flattening an (N, 1) column.

        Raises:
            ValueError: If the input is neither 1-D nor (N, 1) shaped.
        """
        arr = np.asarray(a)
        is_single_column = arr.ndim == 2 and arr.shape[1] == 1
        flat = arr[:, 0] if is_single_column else arr
        if flat.ndim != 1:
            raise ValueError("Input must be 1D or (N,1) shaped.")
        return flat

    def _bin_ages(self, ages) -> np.ndarray:
        """Map ages to ``"start-end"`` bin labels of width ``bin_width``.

        Raises:
            ValueError: If ages contain NaN or ``bin_width`` is not positive.
        """
        values = self._ensure_1d(ages).astype(float)
        if np.isnan(values).any():
            raise ValueError("ages contain NaN.")
        width = self.bin_width
        if width <= 0:
            raise ValueError("bin_width must be positive.")
        lower_edges = (np.floor(values / width) * width).astype(int)
        upper_edges = lower_edges + width
        return np.array(
            [f"{low:d}-{high:d}" for low, high in zip(lower_edges, upper_edges)],
            dtype=object,
        )

    def _groups_from_type(self, groups, type: str) -> np.ndarray:
        """Return group labels: raw values for ``sex``, binned for ``age``.

        Raises:
            ValueError: If ``type`` is neither ``'sex'`` nor ``'age'``.
        """
        kind = (type or "sex").lower()
        if kind == "sex":
            return self._ensure_1d(groups)
        if kind == "age":
            return self._bin_ages(groups)
        raise ValueError("type must be 'sex' or 'age'.")

    def __call__(self, groups, scores, type: str = "sex", sample_weight=None) -> float:
        """Compute the fairness ratio ``min(group mean) / max(group mean)``.

        Scores are clipped to ``[0, 1]`` before averaging. Groups smaller
        than ``min_samples_per_group`` or with non-positive total weight are
        skipped.

        Args:
            groups: Group value per sample (labels or ages).
            scores: Score per sample.
            type: ``"sex"`` to use ``groups`` as-is, ``"age"`` to bin them.
            sample_weight: Optional per-sample weights; defaults to ones.

        Returns:
            A value in ``[0, 1]``; 1.0 when fewer than two groups remain or
            when the best group mean is 0.

        Raises:
            ValueError: On length mismatches or invalid ``type``/inputs.
        """
        labels = self._groups_from_type(groups, type=type)
        values = self._ensure_1d(scores).astype(float)
        if values.shape[0] != labels.shape[0]:
            raise ValueError("groups and scores must have the same length.")

        if sample_weight is not None:
            weights = self._ensure_1d(sample_weight).astype(float)
            if weights.shape[0] != values.shape[0]:
                raise ValueError("sample_weight length must match scores.")
        else:
            weights = np.ones_like(values, dtype=float)

        values = np.clip(values, 0.0, 1.0)

        group_means = []
        by_group = {}
        for label in np.unique(labels):
            members = (labels == label)
            if np.sum(members) < self.min_samples_per_group:
                continue
            member_weights = weights[members]
            if np.sum(member_weights) <= 0:
                continue
            mean_score = float(np.average(values[members], weights=member_weights))
            group_means.append(mean_score)
            by_group[str(label)] = mean_score

        # With zero or one usable group there is nothing to compare.
        if len(group_means) <= 1:
            self.last_stats = {"by_group": by_group, "gap": 0.0, "min": None, "max": None}
            return 1.0

        best = float(np.max(group_means))
        worst = float(np.min(group_means))
        if best == 0.0:
            fairness = 1.0
        else:
            fairness = float(worst / best)
        fairness = float(np.clip(fairness, 0.0, 1.0))

        self.last_stats = {"by_group": by_group, "gap": best - worst, "min": worst, "max": best}
        return fairness

# TaskC Processor (앞서 작성한 코드)
class TaskCProcessor:
    """Standalone Task C processor used by the local evaluation.

    Builds its own LLM chain, condenses each hospital course into key
    sections, and reduces the model reply to a comma-separated code list.
    """

    # Hard cap (characters) on the text substituted into ``{user_input}``.
    _MAX_INPUT_CHARS = 2000
    # Maximum number of codes returned per note.
    _MAX_CODES = 5
    # Returned when the reply contains nothing code-like.
    # NOTE(review): Z51.5 is "Encounter for palliative care" in ICD-10-CM;
    # confirm this is the intended default.
    _DEFAULT_CODE = 'Z515'
    # One letter + two digits + optional alphanumerics, optionally followed by
    # a dotted extension ("I60.9"). The lookbehind stops matches that begin
    # inside a longer token (e.g. "D10" inside "ICD10").
    _ICD_PATTERN = r'(?<![A-Z0-9])[A-Z]\d{2}[A-Z0-9]*(?:\.[A-Z0-9]{1,4})?'
    # Looser letters+digits pattern, used only when the strict one finds nothing.
    _FALLBACK_PATTERN = r'[A-Z]+\d+[A-Z0-9]*'

    def __init__(self, api_key: str):
        """Create the LLM client and the ``prompt | llm`` chain.

        Args:
            api_key: API key forwarded to ``ModelConfig``.
        """
        self.api_key = api_key

        config = ModelConfig(
            model_name="LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-AWQ",
            # model_name="meta-llama/Llama-3.1-8B-Instruct",
            api_base="https://api.snubhai.org/api/v1/llm",
            api_key=api_key,
            max_tokens=2000,
            seed=777,
            provider="openai"
        )

        self.llm = LLMFactory.create_llm(config, temperature=0, rpm=10)
        self.prompt_template = ChatPromptTemplate.from_template(self.get_prompt_template())
        self.chain = self.prompt_template | self.llm

    def get_prompt_template(self) -> str:
        """Return the few-shot ICD-10 coding prompt with a ``{user_input}`` slot."""
        # NOTE(review): the third example's summary says "acute chest pain"
        # while its chief complaint and CT describe a headache / subarachnoid
        # hemorrhage. The text is left unchanged so prompts stay reproducible.
        return """You are an expert medical coder with 10+ years of experience in ICD-10 coding. Analyze the hospital course and assign the most appropriate ICD-10 codes.

CRITICAL REQUIREMENTS:
- Focus on PRIMARY diagnoses and significant conditions only
- Use exact ICD-10 format (e.g., I82431, S066X1A)
- Maintain consistent coding standards regardless of patient demographics
- Prioritize conditions that required active treatment during admission
- Consider hierarchical relationships in ICD-10 classification

CODING METHODOLOGY:
1. Identify Chief Complaint and primary reason for admission
2. Extract documented diagnoses from medical record
3. Prioritize active conditions over chronic stable conditions
4. Apply appropriate specificity and laterality codes
5. Include significant complications or comorbidities

EXAMPLES:

HOSPITAL COURSE: Patient with traumatic brain injury following fall...
Service: NEUROSURGERY
Chief Complaint: Head trauma
History: Fall from ladder with loss of consciousness...
Physical Exam: GCS 14, focal neurological deficits...
Imaging: CT head shows subdural hematoma...
CODES: S066X1A, W1830XA

HOSPITAL COURSE: Elderly female with urinary retention and back pain...  
Service: MEDICINE
Chief Complaint: Unable to urinate, back pain
History: Progressive back pain over 2 weeks, now with urinary retention...
Past Medical History: Osteoporosis, hypertension...
MRI: Lumbar spinal stenosis at L4-5...
CODES: M5489, R339

HOSPITAL COURSE: Middle-aged male presents with acute chest pain...
Service: NEUROSURGERY  
Chief Complaint: Sudden severe headache
History: Sudden onset worst headache of life, found down at home...
CT: Subarachnoid hemorrhage, no aneurysm identified...
CODES: I609, R001

Now analyze this hospital course and provide ICD-10 codes:

HOSPITAL COURSE: {user_input}

CODES:"""

    async def preprocess_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Condense one record's hospital course into the prompt input.

        Extracts Chief Complaint, Service, History of Present Illness (500
        chars), Past Medical History (300) and up to two IMPRESSION blocks
        (200 each). Redaction markers are replaced, whitespace is collapsed
        and the result is capped at 2000 characters. When no section is
        recognised, the raw note is used so the model never receives an empty
        input for a non-empty note.

        Args:
            data: Mapping (dict or DataFrame row) with a ``hospital_course`` entry.

        Returns:
            ``{'user_input': text}``; the text is empty when the value is not
            a string (for example NaN).
        """
        hospital_course = data['hospital_course']

        # A str is never NA, so the type check alone covers NaN/None as well.
        if not isinstance(hospital_course, str):
            return {'user_input': ''}

        def first_match(pattern, flags=0, limit=None):
            """Return the stripped, length-capped first capture, or None."""
            found = re.search(pattern, hospital_course, flags)
            if found is None:
                return None
            return found.group(1).strip()[:limit]

        # (output label, pattern, regex flags, max characters kept)
        section_rules = (
            ("Chief Complaint", r'Chief Complaint:\s*([^\n]+)', 0, None),
            ("Service", r'Service:\s*([^\n]+)', 0, None),
            ("History",
             r'History of Present Illness:\s*(.*?)(?=\n\n|\nPast Medical|$)',
             re.DOTALL, 500),
            ("Past Medical History",
             r'Past Medical History:\s*(.*?)(?=\n\n|PAST SURGICAL|Social History|$)',
             re.DOTALL, 300),
        )

        important_sections = []
        for label, pattern, flags, limit in section_rules:
            content = first_match(pattern, flags, limit)
            if content is not None:
                important_sections.append(f"{label}: {content}")

        # Imaging: keep only the first two IMPRESSION blocks.
        impressions = re.findall(r'IMPRESSION:\s*(.*?)(?=\n\n|\n[A-Z_]|\Z)', hospital_course, re.DOTALL)
        for number, impression in enumerate(impressions[:2], start=1):
            important_sections.append(f"Imaging {number}: {impression.strip()[:200]}")

        # Fall back to the raw note when no known header was found.
        if important_sections:
            source_text = '\n\n'.join(important_sections)
        else:
            source_text = hospital_course

        processed_text = re.sub(r'___+', '[REDACTED]', source_text)
        processed_text = re.sub(r'\s+', ' ', processed_text)
        processed_text = processed_text[:self._MAX_INPUT_CHARS]

        return {'user_input': processed_text.strip()}

    async def postprocess_result(self, result: str) -> str:
        """Reduce a model reply to a comma-separated list of ICD codes.

        A leading ``CODES:`` label is removed, codes are matched
        case-insensitively, dots are dropped (``I60.9`` -> ``I609``) and
        duplicates are removed while keeping first-seen order. At most five
        codes are returned. If the strict pattern finds nothing, a looser
        pattern is tried (at most three codes); if that also fails the default
        code is returned.

        Args:
            result: Raw text content of the model response.

        Returns:
            Codes joined by ``', '``, or the default code.
        """
        text = result.strip()

        if text.startswith(('CODES:', 'codes:', 'Codes:')):
            text = text.split(':', 1)[1].strip()
        text = text.upper()

        # Normalise dotted codes to the dot-free form used in the prompt.
        codes = [match.replace('.', '') for match in re.findall(self._ICD_PATTERN, text)]
        unique_codes = list(dict.fromkeys(codes))  # de-duplicate, keep order

        if not unique_codes:
            loose_codes = re.findall(self._FALLBACK_PATTERN, text)
            unique_codes = list(dict.fromkeys(loose_codes))[:3]

        final_codes = unique_codes[:self._MAX_CODES]

        if not final_codes:
            return self._DEFAULT_CODE

        return ', '.join(final_codes)

# 리더보드 동일 평가 함수
async def exact_taskc_evaluation(train_csv_path: str, api_key: str):
    """Run a local simulation of the Task C leaderboard evaluation.

    Loads the CSV, predicts ICD codes for up to the first 300 rows that have
    both ``hospital_course`` and ``target``, scores the predictions with
    ``ICDScore`` and ``FairnessScore`` and prints a report.

    Args:
        train_csv_path: Path to a CSV containing the columns
            ``hospital_course``, ``target``, ``gender`` and ``anchor_age``.
        api_key: API key passed to ``TaskCProcessor``.

    Returns:
        A dict with the keys ``icd_score_mean``, ``icd_score_std``,
        ``icd_scores``, ``gender_fairness``, ``age_fairness``,
        ``total_score``, ``grade``, ``predictions``, ``references``,
        ``evaluation_samples`` and ``processing_time``.

    Raises:
        ValueError: If no row has both ``hospital_course`` and ``target``.
    """
    batch_size = 8
    wait_seconds = 70

    print("=" * 80)
    print("🏆 Task C 리더보드 정확 평가 시뮬레이션")
    print("=" * 80)

    # 1. Load the data and drop rows that cannot be evaluated.
    print("1. 데이터 로드 중...")
    df = pd.read_csv(train_csv_path)
    df = df.dropna(subset=['hospital_course', 'target'])

    eval_samples = min(300, len(df))
    if eval_samples == 0:
        # Fail early with a clear message instead of crashing later on
        # max()/min() of an empty score list.
        raise ValueError(
            f"평가할 샘플이 없습니다: {train_csv_path} 에 'hospital_course'와 'target'이 모두 있는 행이 없습니다."
        )
    eval_df = df.iloc[:eval_samples].copy()
    print(f"평가 샘플: {eval_samples}개")

    print("\n📊 데이터 분포:")
    print(f"성별: {eval_df['gender'].value_counts().to_dict()}")
    print(f"연령: 평균 {eval_df['anchor_age'].mean():.1f}세")

    # 2. Build the processor.
    print("\n2. TaskC 처리기 초기화 (EXAONE 모델)...")
    processor = TaskCProcessor(api_key)

    # 3. Generate predictions batch by batch.
    print("3. ICD 코드 예측 생성 중...")
    start_time = time.time()

    data_batch = [{'hospital_course': course} for course in eval_df['hospital_course']]
    total_batches = (len(data_batch) - 1) // batch_size + 1

    results = []
    for i in range(0, len(data_batch), batch_size):
        batch = data_batch[i:i + batch_size]
        print(f"   배치 {i//batch_size + 1}/{total_batches} 처리 중...")

        # Pre-process
        preprocessed = [await processor.preprocess_data(row) for row in batch]

        # The API calls of one batch run concurrently.
        responses = await asyncio.gather(
            *(processor.chain.ainvoke(prep) for prep in preprocessed)
        )

        # Post-process
        results.extend(
            [await processor.postprocess_result(r.content) for r in responses]
        )

        # Pause between batches (not after the last one) to respect the
        # API rate limit.
        if i + batch_size < len(data_batch):
            print(f"   API 제한 준수를 위해 {wait_seconds}초 대기...")
            await asyncio.sleep(wait_seconds)

    predictions = results
    generation_time = time.time() - start_time
    print(f"예측 생성 완료 (총 소요 시간: {generation_time:.1f}초)")

    # 4. Parse references and predictions into code lists.
    references = eval_df['target'].tolist()

    print("\n4. ICD 코드 파싱 중...")
    ref_codes = [parse_icd_codes(ref) for ref in references]
    pred_codes = [parse_icd_codes(pred) for pred in predictions]

    print(f"파싱 완료: 정답 {len(ref_codes)}개, 예측 {len(pred_codes)}개")

    # 5. Per-sample ICDScore.
    print("5. ICDScore 계산 중...")
    icd_scorer = ICDScore()
    icd_scores = icd_scorer(refs=ref_codes, hyps=pred_codes)
    icd_mean = np.mean(icd_scores)
    icd_std = np.std(icd_scores)

    # 6. Fairness across sex and 10-year age bins. last_stats is overwritten
    #    by every call, so it is captured right after each one.
    print("6. 공정성 지표 계산 중...")
    fairness_scorer = FairnessScore(bin_width=10, min_samples_per_group=1)

    gender_fairness = fairness_scorer(
        groups=eval_df['gender'].tolist(),
        scores=icd_scores,
        type='sex'
    )
    gender_stats = fairness_scorer.last_stats

    age_fairness = fairness_scorer(
        groups=eval_df['anchor_age'].tolist(),
        scores=icd_scores,
        type='age'
    )
    age_stats = fairness_scorer.last_stats

    # 7. Report the raw metrics.
    print("\n" + "=" * 80)
    print("🎯 Task C 리더보드 정확 평가 결과")
    print("=" * 80)

    print("📊 ICDScore (대회 공식 계산)")
    print(f"   평균: {icd_mean:.6f}")
    print(f"   표준편차: {icd_std:.6f}")
    print(f"   최고: {max(icd_scores):.6f}")
    print(f"   최저: {min(icd_scores):.6f}")
    print(f"   중앙값: {np.median(icd_scores):.6f}")

    print("\n⚖️ 공정성 지표 (대회 공식 계산)")
    print(f"   성별 공정성: {gender_fairness:.6f}")
    print(f"   성별별 성능: {gender_stats['by_group']}")
    print(f"   성별 격차: {gender_stats['gap']:.6f}")
    print("   ")
    print(f"   연령 공정성: {age_fairness:.6f}")
    print(f"   연령대별 성능: {age_stats['by_group']}")
    print(f"   연령 격차: {age_stats['gap']:.6f}")

    # 8. Convert the metrics to points (9 in total).
    print("\n🏆 Task C 정량 평가 점수")

    # ICDScore is worth up to 6 points, scaled against a 0.70 target.
    icd_score_points = min(6.0, max(0.0, (icd_mean / 0.70) * 6.0))

    # Fairness is worth up to 3 points, scaled against 0.95.
    fairness_avg = (gender_fairness + age_fairness) / 2.0
    fairness_points = min(3.0, max(0.0, (fairness_avg / 0.95) * 3.0))

    total_quantitative = icd_score_points + fairness_points

    print(f"   ICDScore: {icd_score_points:.3f}/6.000 점")
    print(f"   공정성 지표: {fairness_points:.3f}/3.000 점")
    print(f"   정량 총점: {total_quantitative:.3f}/9.000 점")
    print(f"   정량 달성률: {total_quantitative/9.0*100:.1f}%")

    # 9. Map the total to a grade.
    print("\n🎖️ 성능 등급")
    if total_quantitative >= 7.5:
        grade = "S급 (최우수)"
        recommendation = "즉시 제출 권장"
    elif total_quantitative >= 6.5:
        grade = "A급 (우수)"
        recommendation = "제출 권장"
    elif total_quantitative >= 5.5:
        grade = "B급 (양호)"
        recommendation = "소폭 개선 후 제출"
    else:
        grade = "C급 (보통)"
        recommendation = "개선 필요"

    print(f"   등급: {grade}")
    print(f"   권장사항: {recommendation}")

    # 10. Show the best and worst samples (up to three each).
    print("\n📝 예측 품질 샘플 (상위/하위 각 3개)")
    print("-" * 80)

    sorted_indices = np.argsort(icd_scores)

    print("🏆 최고 성능 샘플:")
    for idx in sorted_indices[::-1][:3]:
        print(f"샘플 {idx} (ICDScore: {icd_scores[idx]:.4f})")
        print(f"정답: {ref_codes[idx]}")
        print(f"예측: {pred_codes[idx]}")
        print()

    print("⚠️ 최저 성능 샘플:")
    for idx in sorted_indices[:3]:
        print(f"샘플 {idx} (ICDScore: {icd_scores[idx]:.4f})")
        print(f"정답: {ref_codes[idx]}")
        print(f"예측: {pred_codes[idx]}")
        print()

    return {
        'icd_score_mean': icd_mean,
        'icd_score_std': icd_std,
        'icd_scores': icd_scores,
        'gender_fairness': gender_fairness,
        'age_fairness': age_fairness,
        'total_score': total_quantitative,
        'grade': grade,
        'predictions': predictions,
        'references': references,
        'evaluation_samples': eval_samples,
        'processing_time': generation_time
    }

# 실행
API_KEY = "cfa06ca698c85aa9c9d4b55440aeef0f85ed94f644cd7b931fdd69f2421c6ecb"
TRAIN_CSV_PATH = "./data/taskC_train.csv"

# Task C 리더보드 정확 평가 실행
taskc_results = await exact_taskc_evaluation(
    train_csv_path=TRAIN_CSV_PATH,
    api_key=API_KEY
)

print(f"\n🎉 Task C 리더보드 평가 완료!")
print(f"최종 예상 점수: {taskc_results['total_score']:.3f}/9.000 점")


🏆 Task C 리더보드 정확 평가 시뮬레이션
1. 데이터 로드 중...
평가 샘플: 300개

📊 데이터 분포:
성별: {'M': 155, 'F': 145}
연령: 평균 63.4세

2. TaskC 처리기 초기화 (EXAONE 모델)...
3. ICD 코드 예측 생성 중...
   배치 1/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 2/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 3/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 4/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 5/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 6/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 7/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 8/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 9/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 10/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 11/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 12/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 13/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 14/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 15/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 16/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 17/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 18/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 1

## 자체평가(Llama)

In [22]:
import pandas as pd
import numpy as np
import asyncio
import time
from typing import Any, Dict, List
from langchain.prompts import ChatPromptTemplate
from langevaluate.config import ModelConfig
from langevaluate.llmfactory import LLMFactory
import re
from scipy.optimize import linear_sum_assignment

# 대회 제공 ICDScore 클래스 (100% 동일)
def parse_icd_codes(icd_string: str) -> List[str]:
    """Parse a string of ICD codes into a list of upper-cased codes.

    Two input shapes are handled: a stringified list such as
    ``"['I609', 'R001']"`` (brackets and quotes are removed, items are split
    on commas) and free text whose codes are separated by commas and/or
    whitespace.

    Args:
        icd_string: The raw code string. ``None``, NaN and the empty string
            are accepted.

    Returns:
        The codes, stripped and upper-cased, with empty items removed.
        ``[]`` for empty / ``None`` / NaN input.
    """
    if not icd_string or pd.isna(icd_string):
        return []

    if icd_string.startswith('[') and icd_string.endswith(']'):
        codes = icd_string.strip('[]').replace("'", "").replace('"', '').split(',')
    else:
        # Raw string: "\s" inside a normal literal is an invalid escape
        # sequence and triggers a SyntaxWarning on recent Python versions.
        codes = re.split(r'[,\s]+', icd_string)

    return [code.strip().upper() for code in codes if code.strip()]

def icd_similarity(code1: str, code2: str) -> float:
    """Score how much of their leading characters two ICD-10 codes share.

    Dots are ignored. The score is the length of the common prefix divided
    by the length of the longer code, so identical codes give 1.0 and codes
    that differ in the first character give 0.0.

    Args:
        code1: First ICD code.
        code2: Second ICD code.

    Returns:
        A value in [0.0, 1.0]; 0.0 when either code is empty.
    """
    if not (code1 and code2):
        return 0.0

    left = code1.replace('.', '')
    right = code2.replace('.', '')

    longest = max(len(left), len(right))
    if longest <= 0:
        # Both codes consisted of dots only.
        return 0.0

    # Count matching characters until the first mismatch; zip stops at the
    # end of the shorter code.
    shared = 0
    for ch_left, ch_right in zip(left, right):
        if ch_left != ch_right:
            break
        shared += 1

    return shared / longest

def hierarchical_f1(y_true: List[str], y_pred: List[str]) -> float:
    """Compute an F1 score that gives partial credit for near-miss codes.

    Every reference code is paired with at most one predicted code so that
    the summed ``icd_similarity`` of the pairs is maximal; that sum acts as a
    fractional true-positive count for precision and recall.

    Args:
        y_true: Reference ICD codes.
        y_pred: Predicted ICD codes.

    Returns:
        1.0 when both lists are empty, 0.0 when exactly one is empty or
        nothing matches, otherwise the harmonic mean of the partial
        precision and recall.
    """
    n_true, n_pred = len(y_true), len(y_pred)
    if n_true == 0 and n_pred == 0:
        return 1.0
    if n_true == 0 or n_pred == 0:
        return 0.0

    # Pairwise similarity, rows = references, columns = predictions.
    similarity = np.array(
        [[icd_similarity(ref, hyp) for hyp in y_pred] for ref in y_true],
        dtype=float,
    )

    # linear_sum_assignment minimises cost, so negate to maximise similarity.
    rows, cols = linear_sum_assignment(-similarity)
    partial_tp = similarity[rows, cols].sum()

    precision = partial_tp / n_pred
    recall = partial_tp / n_true

    if precision + recall == 0:
        return 0.0

    return 2 * precision * recall / (precision + recall)

class ICDScore:
    """Callable scorer that returns one hierarchical F1 value per sample."""

    def __init__(self):
        """No configuration is needed."""

    def __call__(self, refs: List[List[str]], hyps: List[List[str]]) -> List[float]:
        """Score each (reference, prediction) pair.

        Args:
            refs: Reference code lists, one per sample.
            hyps: Predicted code lists, aligned with ``refs``.

        Returns:
            One ``hierarchical_f1`` score per sample, in input order.

        Raises:
            ValueError: If ``refs`` and ``hyps`` differ in length.
        """
        n_refs, n_hyps = len(refs), len(hyps)
        if n_refs != n_hyps:
            raise ValueError(f"참조와 예측 데이터의 길이가 다릅니다: {len(refs)} vs {len(hyps)}")

        return [hierarchical_f1(reference, hypothesis)
                for reference, hypothesis in zip(refs, hyps)]

# 대회 제공 FairnessScore 클래스 (100% 동일)
class FairnessScore:
    """Fairness metric: ratio of the worst to the best per-group mean score.

    Groups are either taken as given (``type='sex'``) or derived by binning
    ages into fixed-width intervals (``type='age'``). Details of the most
    recent call are stored in ``last_stats``.
    """

    def __init__(self, bin_width: int = 10, min_samples_per_group: int = 1):
        """Store the age-bin width and the minimum group size.

        Args:
            bin_width: Width of an age bin.
            min_samples_per_group: Groups with fewer samples are ignored.
        """
        self.bin_width = int(bin_width)
        self.min_samples_per_group = int(min_samples_per_group)
        # Filled in by __call__: by_group / gap / min / max.
        self.last_stats = None

    @staticmethod
    def _ensure_1d(a) -> np.ndarray:
        """Return ``a`` as a 1-D array, flattening an (N, 1) column.

        Raises:
            ValueError: For any other shape.
        """
        arr = np.asarray(a)
        is_single_column = arr.ndim == 2 and arr.shape[1] == 1
        if is_single_column:
            arr = arr[:, 0]
        if arr.ndim != 1:
            raise ValueError("Input must be 1D or (N,1) shaped.")
        return arr

    def _bin_ages(self, ages) -> np.ndarray:
        """Map ages to ``"start-end"`` labels of width ``bin_width``.

        Raises:
            ValueError: If an age is NaN or ``bin_width`` is not positive.
        """
        values = self._ensure_1d(ages).astype(float)
        if np.any(np.isnan(values)):
            raise ValueError("ages contain NaN.")
        if self.bin_width <= 0:
            raise ValueError("bin_width must be positive.")
        width = self.bin_width
        lower = (np.floor(values / width) * width).astype(int)
        upper = lower + width
        return np.array(
            [f"{lo:d}-{hi:d}" for lo, hi in zip(lower, upper)],
            dtype=object,
        )

    def _groups_from_type(self, groups, type: str) -> np.ndarray:
        """Return group labels for ``type`` ('sex' or 'age', case-insensitive).

        A falsy ``type`` is treated as 'sex'.

        Raises:
            ValueError: For any other ``type``.
        """
        kind = (type or "sex").lower()
        if kind not in ("sex", "age"):
            raise ValueError("type must be 'sex' or 'age'.")
        if kind == "age":
            return self._bin_ages(groups)
        return self._ensure_1d(groups)

    def __call__(self, groups, scores, type: str = "sex", sample_weight=None) -> float:
        """Compute the fairness ratio ``min(group mean) / max(group mean)``.

        Args:
            groups: Group label per sample, or ages when ``type='age'``.
            scores: Score per sample; values are clipped to [0, 1].
            type: ``'sex'`` or ``'age'``.
            sample_weight: Optional per-sample weights (default: all ones).

        Returns:
            A value in [0, 1]. 1.0 when at most one group qualifies or when
            the best group mean is 0.

        Raises:
            ValueError: On mismatched lengths or invalid inputs.
        """
        labels = self._groups_from_type(groups, type=type)
        values = self._ensure_1d(scores).astype(float)
        if values.shape[0] != labels.shape[0]:
            raise ValueError("groups and scores must have the same length.")

        if sample_weight is not None:
            weights = self._ensure_1d(sample_weight).astype(float)
            if weights.shape[0] != values.shape[0]:
                raise ValueError("sample_weight length must match scores.")
        else:
            weights = np.ones_like(values, dtype=float)

        values = np.clip(values, 0.0, 1.0)

        group_means = []
        by_group = {}
        for label in np.unique(labels):
            members = (labels == label)
            # Skip groups that are too small or carry no weight.
            if np.sum(members) < self.min_samples_per_group:
                continue
            if np.sum(weights[members]) <= 0:
                continue
            mean_score = float(np.average(values[members], weights=weights[members]))
            group_means.append(mean_score)
            by_group[str(label)] = mean_score

        if len(group_means) <= 1:
            # Nothing to compare against: treated as perfectly fair.
            self.last_stats = {"by_group": by_group, "gap": 0.0, "min": None, "max": None}
            return 1.0

        best = float(np.max(group_means))
        worst = float(np.min(group_means))
        ratio = float(worst / best) if best != 0.0 else 1.0
        ratio = float(np.clip(ratio, 0.0, 1.0))

        self.last_stats = {"by_group": by_group, "gap": best - worst, "min": worst, "max": best}
        return ratio

# TaskC Processor (앞서 작성한 코드)
class TaskCProcessor:
    """Predicts ICD-10 codes for a hospital course with a prompted LLM.

    The processor condenses a clinical note (``preprocess_data``), feeds it
    through ``self.chain`` (prompt template piped into the LLM) and cleans
    the model's answer (``postprocess_result``).
    """

    def __init__(self, api_key: str):
        """Create the LLM client and the prompt chain.

        Args:
            api_key: API key forwarded to ``ModelConfig``.
        """
        self.api_key = api_key

        # Alternative model: "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-AWQ"
        config = ModelConfig(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            api_base="https://api.snubhai.org/api/v1/llm",
            api_key=api_key,
            max_tokens=2000,
            seed=777,
            provider="openai"
        )

        # presumably rpm is a requests-per-minute limit; verify against LLMFactory
        self.llm = LLMFactory.create_llm(config, temperature=0, rpm=10)
        self.prompt_template = ChatPromptTemplate.from_template(self.get_prompt_template())
        self.chain = self.prompt_template | self.llm

    def get_prompt_template(self) -> str:
        """Return the few-shot prompt; ``{user_input}`` receives the note."""
        # NOTE(review): the third example's first line mentions "acute chest
        # pain" while the rest of that example describes a headache /
        # subarachnoid haemorrhage -- confirm whether that is intended.
        return """You are an expert medical coder with 10+ years of experience in ICD-10 coding. Analyze the hospital course and assign the most appropriate ICD-10 codes.

CRITICAL REQUIREMENTS:
- Focus on PRIMARY diagnoses and significant conditions only
- Use exact ICD-10 format (e.g., I82431, S066X1A)
- Maintain consistent coding standards regardless of patient demographics
- Prioritize conditions that required active treatment during admission
- Consider hierarchical relationships in ICD-10 classification

CODING METHODOLOGY:
1. Identify Chief Complaint and primary reason for admission
2. Extract documented diagnoses from medical record
3. Prioritize active conditions over chronic stable conditions
4. Apply appropriate specificity and laterality codes
5. Include significant complications or comorbidities

EXAMPLES:

HOSPITAL COURSE: Patient with traumatic brain injury following fall...
Service: NEUROSURGERY
Chief Complaint: Head trauma
History: Fall from ladder with loss of consciousness...
Physical Exam: GCS 14, focal neurological deficits...
Imaging: CT head shows subdural hematoma...
CODES: S066X1A, W1830XA

HOSPITAL COURSE: Elderly female with urinary retention and back pain...  
Service: MEDICINE
Chief Complaint: Unable to urinate, back pain
History: Progressive back pain over 2 weeks, now with urinary retention...
Past Medical History: Osteoporosis, hypertension...
MRI: Lumbar spinal stenosis at L4-5...
CODES: M5489, R339

HOSPITAL COURSE: Middle-aged male presents with acute chest pain...
Service: NEUROSURGERY  
Chief Complaint: Sudden severe headache
History: Sudden onset worst headache of life, found down at home...
CT: Subarachnoid hemorrhage, no aneurysm identified...
CODES: I609, R001

Now analyze this hospital course and provide ICD-10 codes:

HOSPITAL COURSE: {user_input}

CODES:"""

    async def preprocess_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Condense a clinical note into the sections used by the prompt.

        Extracts Chief Complaint, Service, History of Present Illness (first
        500 characters), Past Medical History (first 300) and up to two
        imaging IMPRESSION sections (200 each). Runs of three or more
        underscores become ``[REDACTED]``, whitespace is collapsed and the
        result is cut to 2000 characters. When none of the sections is
        found, the note itself is used so the model never receives an empty
        prompt.

        Args:
            data: Mapping with a ``hospital_course`` entry.

        Returns:
            ``{'user_input': <condensed text>}``; the text is empty when
            ``hospital_course`` is not a string (e.g. NaN or ``None``).
        """
        hospital_course = data['hospital_course']

        # NaN / None / any non-text value cannot be processed.
        if not isinstance(hospital_course, str):
            return {'user_input': ''}

        important_sections = []

        # Chief Complaint (single line)
        cc_match = re.search(r'Chief Complaint:\s*([^\n]+)', hospital_course)
        if cc_match:
            important_sections.append(f"Chief Complaint: {cc_match.group(1).strip()}")

        # Service (single line)
        service_match = re.search(r'Service:\s*([^\n]+)', hospital_course)
        if service_match:
            important_sections.append(f"Service: {service_match.group(1).strip()}")

        # History of Present Illness: up to a blank line or the next section
        hpi_match = re.search(r'History of Present Illness:\s*(.*?)(?=\n\n|\nPast Medical|$)',
                              hospital_course, re.DOTALL)
        if hpi_match:
            hpi = hpi_match.group(1).strip()[:500]
            important_sections.append(f"History: {hpi}")

        # Past Medical History
        pmh_match = re.search(r'Past Medical History:\s*(.*?)(?=\n\n|PAST SURGICAL|Social History|$)',
                              hospital_course, re.DOTALL)
        if pmh_match:
            pmh = pmh_match.group(1).strip()[:300]
            important_sections.append(f"Past Medical History: {pmh}")

        # Imaging IMPRESSION sections (first two only)
        impressions = re.findall(r'IMPRESSION:\s*(.*?)(?=\n\n|\n[A-Z_]|\Z)', hospital_course, re.DOTALL)
        for i, imp in enumerate(impressions[:2]):
            important_sections.append(f"Imaging {i+1}: {imp.strip()[:200]}")

        if important_sections:
            processed_text = '\n\n'.join(important_sections)
        else:
            # No recognised section header: fall back to the raw note.
            processed_text = hospital_course

        processed_text = re.sub(r'___+', '[REDACTED]', processed_text)
        processed_text = re.sub(r'\s+', ' ', processed_text)
        processed_text = processed_text[:2000]

        return {'user_input': processed_text.strip()}

    async def postprocess_result(self, result: str) -> str:
        """Turn raw LLM output into a comma-separated list of ICD-10 codes.

        Args:
            result: Raw text returned by the model. ``None`` is treated as
                an empty string.

        Returns:
            Up to five unique, upper-cased codes joined by ``', '`` in order
            of first appearance. Codes written with a dot (``I60.9``) are
            normalised to the undotted form (``I609``). When nothing that
            looks like a code is found, ``'Z515'`` is returned.
        """
        text = (result or '').strip()

        # Drop a leading "CODES:" label echoed back from the prompt.
        if text.startswith(('CODES:', 'codes:', 'Codes:')):
            text = text.split(':', 1)[1].strip()
        text = text.upper()

        # Letter + two digits + optional suffix. The optional dotted tail
        # keeps "I60.9" together instead of truncating it to "I60".
        icd_pattern = r'[A-Z]\d{2}[A-Z0-9]*(?:\.[A-Z0-9]+)?'
        codes = [match.replace('.', '') for match in re.findall(icd_pattern, text)]
        # dict.fromkeys de-duplicates while keeping first-seen order.
        unique_codes = list(dict.fromkeys(codes))

        if not unique_codes:
            # Looser pattern for malformed output; keep at most three hits.
            fallback_pattern = r'[A-Z]+\d+[A-Z0-9]*'
            codes = re.findall(fallback_pattern, text)
            unique_codes = list(dict.fromkeys(codes))[:3]

        final_codes = unique_codes[:5]

        if not final_codes:
            return 'Z515'

        return ', '.join(final_codes)

# 리더보드 동일 평가 함수
async def exact_taskc_evaluation(train_csv_path: str, api_key: str):
    """Run a local simulation of the Task C leaderboard evaluation.

    Loads the CSV, predicts ICD codes for up to the first 300 rows that have
    both ``hospital_course`` and ``target``, scores the predictions with
    ``ICDScore`` and ``FairnessScore`` and prints a report.

    Args:
        train_csv_path: Path to a CSV containing the columns
            ``hospital_course``, ``target``, ``gender`` and ``anchor_age``.
        api_key: API key passed to ``TaskCProcessor``.

    Returns:
        A dict with the keys ``icd_score_mean``, ``icd_score_std``,
        ``icd_scores``, ``gender_fairness``, ``age_fairness``,
        ``total_score``, ``grade``, ``predictions``, ``references``,
        ``evaluation_samples`` and ``processing_time``.

    Raises:
        ValueError: If no row has both ``hospital_course`` and ``target``.
    """
    batch_size = 8
    wait_seconds = 70

    print("=" * 80)
    print("🏆 Task C 리더보드 정확 평가 시뮬레이션")
    print("=" * 80)

    # 1. Load the data and drop rows that cannot be evaluated.
    print("1. 데이터 로드 중...")
    df = pd.read_csv(train_csv_path)
    df = df.dropna(subset=['hospital_course', 'target'])

    eval_samples = min(300, len(df))
    if eval_samples == 0:
        # Fail early with a clear message instead of crashing later on
        # max()/min() of an empty score list.
        raise ValueError(
            f"평가할 샘플이 없습니다: {train_csv_path} 에 'hospital_course'와 'target'이 모두 있는 행이 없습니다."
        )
    eval_df = df.iloc[:eval_samples].copy()
    print(f"평가 샘플: {eval_samples}개")

    print("\n📊 데이터 분포:")
    print(f"성별: {eval_df['gender'].value_counts().to_dict()}")
    print(f"연령: 평균 {eval_df['anchor_age'].mean():.1f}세")

    # 2. Build the processor. The message names Llama because the
    #    TaskCProcessor of this cell is configured with Llama-3.1, not EXAONE.
    print("\n2. TaskC 처리기 초기화 (Llama 모델)...")
    processor = TaskCProcessor(api_key)

    # 3. Generate predictions batch by batch.
    print("3. ICD 코드 예측 생성 중...")
    start_time = time.time()

    data_batch = [{'hospital_course': course} for course in eval_df['hospital_course']]
    total_batches = (len(data_batch) - 1) // batch_size + 1

    results = []
    for i in range(0, len(data_batch), batch_size):
        batch = data_batch[i:i + batch_size]
        print(f"   배치 {i//batch_size + 1}/{total_batches} 처리 중...")

        # Pre-process
        preprocessed = [await processor.preprocess_data(row) for row in batch]

        # The API calls of one batch run concurrently.
        responses = await asyncio.gather(
            *(processor.chain.ainvoke(prep) for prep in preprocessed)
        )

        # Post-process
        results.extend(
            [await processor.postprocess_result(r.content) for r in responses]
        )

        # Pause between batches (not after the last one) to respect the
        # API rate limit.
        if i + batch_size < len(data_batch):
            print(f"   API 제한 준수를 위해 {wait_seconds}초 대기...")
            await asyncio.sleep(wait_seconds)

    predictions = results
    generation_time = time.time() - start_time
    print(f"예측 생성 완료 (총 소요 시간: {generation_time:.1f}초)")

    # 4. Parse references and predictions into code lists.
    references = eval_df['target'].tolist()

    print("\n4. ICD 코드 파싱 중...")
    ref_codes = [parse_icd_codes(ref) for ref in references]
    pred_codes = [parse_icd_codes(pred) for pred in predictions]

    print(f"파싱 완료: 정답 {len(ref_codes)}개, 예측 {len(pred_codes)}개")

    # 5. Per-sample ICDScore.
    print("5. ICDScore 계산 중...")
    icd_scorer = ICDScore()
    icd_scores = icd_scorer(refs=ref_codes, hyps=pred_codes)
    icd_mean = np.mean(icd_scores)
    icd_std = np.std(icd_scores)

    # 6. Fairness across sex and 10-year age bins. last_stats is overwritten
    #    by every call, so it is captured right after each one.
    print("6. 공정성 지표 계산 중...")
    fairness_scorer = FairnessScore(bin_width=10, min_samples_per_group=1)

    gender_fairness = fairness_scorer(
        groups=eval_df['gender'].tolist(),
        scores=icd_scores,
        type='sex'
    )
    gender_stats = fairness_scorer.last_stats

    age_fairness = fairness_scorer(
        groups=eval_df['anchor_age'].tolist(),
        scores=icd_scores,
        type='age'
    )
    age_stats = fairness_scorer.last_stats

    # 7. Report the raw metrics.
    print("\n" + "=" * 80)
    print("🎯 Task C 리더보드 정확 평가 결과")
    print("=" * 80)

    print("📊 ICDScore (대회 공식 계산)")
    print(f"   평균: {icd_mean:.6f}")
    print(f"   표준편차: {icd_std:.6f}")
    print(f"   최고: {max(icd_scores):.6f}")
    print(f"   최저: {min(icd_scores):.6f}")
    print(f"   중앙값: {np.median(icd_scores):.6f}")

    print("\n⚖️ 공정성 지표 (대회 공식 계산)")
    print(f"   성별 공정성: {gender_fairness:.6f}")
    print(f"   성별별 성능: {gender_stats['by_group']}")
    print(f"   성별 격차: {gender_stats['gap']:.6f}")
    print("   ")
    print(f"   연령 공정성: {age_fairness:.6f}")
    print(f"   연령대별 성능: {age_stats['by_group']}")
    print(f"   연령 격차: {age_stats['gap']:.6f}")

    # 8. Convert the metrics to points (9 in total).
    print("\n🏆 Task C 정량 평가 점수")

    # ICDScore is worth up to 6 points, scaled against a 0.70 target.
    icd_score_points = min(6.0, max(0.0, (icd_mean / 0.70) * 6.0))

    # Fairness is worth up to 3 points, scaled against 0.95.
    fairness_avg = (gender_fairness + age_fairness) / 2.0
    fairness_points = min(3.0, max(0.0, (fairness_avg / 0.95) * 3.0))

    total_quantitative = icd_score_points + fairness_points

    print(f"   ICDScore: {icd_score_points:.3f}/6.000 점")
    print(f"   공정성 지표: {fairness_points:.3f}/3.000 점")
    print(f"   정량 총점: {total_quantitative:.3f}/9.000 점")
    print(f"   정량 달성률: {total_quantitative/9.0*100:.1f}%")

    # 9. Map the total to a grade.
    print("\n🎖️ 성능 등급")
    if total_quantitative >= 7.5:
        grade = "S급 (최우수)"
        recommendation = "즉시 제출 권장"
    elif total_quantitative >= 6.5:
        grade = "A급 (우수)"
        recommendation = "제출 권장"
    elif total_quantitative >= 5.5:
        grade = "B급 (양호)"
        recommendation = "소폭 개선 후 제출"
    else:
        grade = "C급 (보통)"
        recommendation = "개선 필요"

    print(f"   등급: {grade}")
    print(f"   권장사항: {recommendation}")

    # 10. Show the best and worst samples (up to three each).
    print("\n📝 예측 품질 샘플 (상위/하위 각 3개)")
    print("-" * 80)

    sorted_indices = np.argsort(icd_scores)

    print("🏆 최고 성능 샘플:")
    for idx in sorted_indices[::-1][:3]:
        print(f"샘플 {idx} (ICDScore: {icd_scores[idx]:.4f})")
        print(f"정답: {ref_codes[idx]}")
        print(f"예측: {pred_codes[idx]}")
        print()

    print("⚠️ 최저 성능 샘플:")
    for idx in sorted_indices[:3]:
        print(f"샘플 {idx} (ICDScore: {icd_scores[idx]:.4f})")
        print(f"정답: {ref_codes[idx]}")
        print(f"예측: {pred_codes[idx]}")
        print()

    return {
        'icd_score_mean': icd_mean,
        'icd_score_std': icd_std,
        'icd_scores': icd_scores,
        'gender_fairness': gender_fairness,
        'age_fairness': age_fairness,
        'total_score': total_quantitative,
        'grade': grade,
        'predictions': predictions,
        'references': references,
        'evaluation_samples': eval_samples,
        'processing_time': generation_time
    }

# 실행
API_KEY = "cfa06ca698c85aa9c9d4b55440aeef0f85ed94f644cd7b931fdd69f2421c6ecb"
TRAIN_CSV_PATH = "./data/taskC_train.csv"

# Task C 리더보드 정확 평가 실행
taskc_results = await exact_taskc_evaluation(
    train_csv_path=TRAIN_CSV_PATH,
    api_key=API_KEY
)

print(f"\n🎉 Task C 리더보드 평가 완료!")
print(f"최종 예상 점수: {taskc_results['total_score']:.3f}/9.000 점")


🏆 Task C 리더보드 정확 평가 시뮬레이션
1. 데이터 로드 중...
평가 샘플: 300개

📊 데이터 분포:
성별: {'M': 155, 'F': 145}
연령: 평균 63.4세

2. TaskC 처리기 초기화 (EXAONE 모델)...
3. ICD 코드 예측 생성 중...
   배치 1/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 2/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 3/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 4/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 5/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 6/38 처리 중...
   API 제한 준수를 위해 70초 대기...
   배치 7/38 처리 중...
API Error: Error code: 429 - {'error': {'message': 'Rate limit exceeded. Token bucket: 0.00/10.0 tokens. Wait 60s.', 'type': 'rate_limit_error', 'param': None, 'code': 'rate_limit_exceeded'}}, retry 1/3
API Error: Error code: 429 - {'error': {'message': 'Rate limit exceeded. Token bucket: 0.00/10.0 tokens. Wait 60s.', 'type': 'rate_limit_error', 'param': None, 'code': 'rate_limit_exceeded'}}, retry 1/3
   API 제한 준수를 위해 70초 대기...
   배치 8/38 처리 중...
API Error: Error code: 429 - {'error': {'message': 'Rate limit exceeded. Token bucket: 0.00/10.0 