In [None]:
# python version 3.10 설치해주세요.
# pip install langevaluate==0.2.14

In [1]:
from langevaluate.config import ModelConfig
from langevaluate.llmfactory import LLMFactory

  from .autonotebook import tqdm as notebook_tqdm


이메일로 전달 받은 api key를 입력해주세요. SEED와 temperature는 777과 0으로 고정해주세요.

In [2]:
# Connection settings shared by every model client in this notebook.
# The API key is a secret, so it is read from the DATATHON_API_KEY environment
# variable instead of being stored in the notebook (set it before starting Jupyter).
import os

API_KEY = os.environ.get('DATATHON_API_KEY', '')
if not API_KEY:
    # Warn instead of raising so the remaining constants are still defined.
    print("DATATHON_API_KEY 환경 변수에 이메일로 전달 받은 api key를 설정해주세요.")
API_BASE = 'https://api.snubhai.org/api/v1/llm'
# Seed and temperature are fixed by the competition rules (777 and 0).
SEED = 777
TEMPERATURE = 0

In [3]:
def _make_model_config(model_name, max_tokens=2000):
    """Build a ModelConfig for ``model_name`` using the shared endpoint, key and seed."""
    return ModelConfig(
        model_name=model_name,
        api_base=API_BASE,
        api_key=API_KEY,
        max_tokens=max_tokens,
        seed=SEED,
        provider="openai",
    )


# Two candidate generation models and one larger model used as the judge.
exaone_model_config = _make_model_config("LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-AWQ")
llama_model_config = _make_model_config("meta-llama/Llama-3.1-8B-Instruct")
validation_model_config = _make_model_config("openai/gpt-oss-120b", max_tokens=10000)


# Generation clients use the fixed TEMPERATURE constant rather than a repeated literal.
exaone_llm = LLMFactory.create_llm(exaone_model_config, temperature=TEMPERATURE, rpm=10, max_retries=3)
llama_llm = LLMFactory.create_llm(llama_model_config, temperature=TEMPERATURE, rpm=10, max_retries=3)
# NOTE(review): the judge model runs at temperature 0.2 and rpm 5, not the fixed values — confirm this is intentional.
validation_llm = LLMFactory.create_llm(validation_model_config, temperature=0.2, rpm=5, max_retries=3)

### langevaluate의 llmfactory

langevaluate의 llmfactory는 langchain의 ChatModel에 rate limiter를 단 것입니다. 대회는 분당 호출 제한 10회가 있으니 rpm을 10으로 두고 사용하도록 합니다.

In [4]:
# Smoke test: send one prompt through the EXAONE client and display the reply.
exaone_llm.invoke('who are you?')

AIMessage(content='I am EXAONE 3.5, developed by LG AI Research. My role is to assist users by providing helpful and informative responses based on my training data up to April 2024. How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 50, 'prompt_tokens': 23, 'total_tokens': 73, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-AWQ', 'system_fingerprint': None, 'id': 'chatcmpl-7d62db9564ba4bc88bc9b68efe19e68b', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None}, id='run--76f317b1-cf0c-4b69-977e-ffbdb22a6c2c-0', usage_metadata={'input_tokens': 23, 'output_tokens': 50, 'total_tokens': 73, 'input_token_details': {}, 'output_token_details': {}})

In [5]:
# Smoke test: send one prompt through the Llama client and display the reply.
llama_llm.invoke('who are you?')

AIMessage(content='I\'m an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 23, 'prompt_tokens': 39, 'total_tokens': 62, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'meta-llama/Llama-3.1-8B-Instruct', 'system_fingerprint': None, 'id': 'chatcmpl-9f184c1330ba437f9db4b80f08e599cd', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None}, id='run--5a059010-aeb2-4402-8f75-543085c31b15-0', usage_metadata={'input_tokens': 39, 'output_tokens': 23, 'total_tokens': 62, 'input_token_details': {}, 'output_token_details': {}})

In [6]:
# Smoke test: send one prompt through the judge (validation) client and display the reply.
validation_llm.invoke('who are you?')

AIMessage(content='I’m ChatGPT, an AI language model created by OpenAI. I can help answer questions, brainstorm ideas, explain concepts, draft text, troubleshoot problems, and more—just let me know what you need!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 73, 'prompt_tokens': 75, 'total_tokens': 148, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'openai/gpt-oss-120b', 'system_fingerprint': None, 'id': 'chatcmpl-3aa5be96e39f4723bf265729b6fa08f3', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None}, id='run--5fe8a056-7bca-4700-b7f7-b3485e7afd7a-0', usage_metadata={'input_tokens': 75, 'output_tokens': 73, 'total_tokens': 148, 'input_token_details': {}, 'output_token_details': {}})

---

# 개요
이 노트북은 의료 데이터톤의 3개 주요 Task를 처리하는 완전한 워크플로우를 제공합니다

- **Task A**: 의료 기록 → Brief Hospital Course 작성
- **Task B**: Radiology → IMPRESSION 요약  
- **Task C**: 퇴원 기록 → ICD-10 코드 예측

In [2]:
import re
import os
import pathlib
import pandas as pd
from typing import Any, List, Dict
from typing import Optional, Dict, Any, List, Union
from abc import ABC, abstractmethod
from langchain.prompts import ChatPromptTemplate  # 프롬프트 템플릿 처리용
from langevaluate.config import ModelConfig # LLM 설정용
from langevaluate.llmfactory import LLMFactory  # LLM 팩토리용
from tqdm.asyncio import tqdm_asyncio
import asyncio

class DatathonProcessor(ABC):
    """
    Base class for the datathon task processors.

    On construction it creates a rate-limited chat model through ``LLMFactory``
    and pipes a ``ChatPromptTemplate`` into it. :meth:`summarize` then runs
    preprocess -> LLM call -> postprocess for every row of a DataFrame.

    Subclasses implement ``get_prompt_template``, ``preprocess_data`` and
    ``postprocess_result``; ``get_model_name`` may be overridden to select a
    different model.
    """

    # Default LLM settings; only ``model_name`` is replaced per subclass.
    DEFAULT_MODEL_CONFIG = {
        'model_name': 'LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-AWQ',
        'api_base': 'https://api.snubhai.org/api/v1/llm',
        'max_tokens': 2000,
        'seed': 777,
        'temperature': 0,
        'rpm': 10
    }

    def __init__(
        self,
        api_key : str,
    ):
        """
        Build the LLM client and the prompt chain.

        Args:
            api_key: API key forwarded to ``ModelConfig``.
        """
        # Copy so the class-level defaults are never mutated.
        config = self.DEFAULT_MODEL_CONFIG.copy()

        # Only the model name is subclass-specific.
        config['model_name'] = self.get_model_name()

        custom_config = ModelConfig(
            model_name=config['model_name'],
            api_base=config['api_base'],
            api_key=api_key,
            max_tokens=config['max_tokens'],
            seed=config['seed'],
            provider="openai"
        )

        self.llm = LLMFactory.create_llm(
            custom_config,
            temperature=config['temperature'],
            rpm=config['rpm']
        )

        # Prompt template piped into the model; invoked once per input row.
        self.prompt_template = ChatPromptTemplate.from_template(self.get_prompt_template())
        self.chain = self.prompt_template | self.llm

        # Outputs of the most recent summarize() call.
        self.results: List[str] = []

        # Metric store (not written by this class).
        self.metrics: Dict[str, Any] = {}

    def get_model_name(self) -> str:
        """
        Return the model name to use.

        Subclasses may override this to select a specific model; the default
        is the ``model_name`` entry of ``DEFAULT_MODEL_CONFIG``.
        """
        return self.DEFAULT_MODEL_CONFIG['model_name']

    @abstractmethod
    async def preprocess_data(self, data: Any) -> Dict[str, Any]:
        """Turn one input row into the variables expected by the prompt template."""
        pass

    @abstractmethod
    def get_prompt_template(self) -> str:
        """Return the prompt template string (implemented by the subclass)."""
        pass

    @abstractmethod
    async def postprocess_result(self, result: Any) -> str:
        """Turn the raw model output text into the final answer string."""
        pass

    @staticmethod
    async def _resolve(value: Any) -> Any:
        """Await ``value`` if it is awaitable, otherwise return it unchanged.

        Lets subclasses implement the pre/post-processing hooks either as
        ``async def`` or as plain functions; a plain dict handed to ``gather``
        would otherwise fail with "object dict can't be used in 'await'".
        """
        import inspect

        if inspect.isawaitable(value):
            return await value
        return value

    async def summarize(
        self,
        data: pd.DataFrame
    ) -> List[str]:
        """
        Run the full pipeline over every row of ``data``.

        Args:
            data: DataFrame whose rows are passed one by one to ``preprocess_data``.

        Returns:
            One post-processed string per row, in row order. The same list is
            also stored on ``self.results``.
        """
        # Stage 1: preprocessing, one task per row.
        preprocess_tasks = [
            self._resolve(self.preprocess_data(row)) for _, row in data.iterrows()
        ]
        preprocessed_data = await tqdm_asyncio.gather(*preprocess_tasks)

        # Stage 2: one LLM call per row, gathered with a progress bar.
        llm_tasks = [self.chain.ainvoke(prompt_vars) for prompt_vars in preprocessed_data]
        responses = await tqdm_asyncio.gather(*llm_tasks)

        # Stage 3: postprocessing of the message text.
        postprocess_tasks = [
            self._resolve(self.postprocess_result(response.content)) for response in responses
        ]
        results = await tqdm_asyncio.gather(*postprocess_tasks)

        self.results = list(results)
        return results

  from .autonotebook import tqdm as notebook_tqdm


# Task A, Task B, Task C 작성

참가자들은 TaskAProcessor, TaskBProcessor, TaskCProcessor의 get_prompt_template, preprocess_data, postprocess_result 메서드를 작성하여 .py로 제출합니다.

- get_prompt_template의 prompt만 수정하여 제출하여도 충분합니다!

# Task A: Brief Hospital Course 작성

In [None]:
class TaskAProcessor(DatathonProcessor):
    """Task A: write a Brief Hospital Course from a medical record."""

    def get_model_name(self) -> str:
        """Return the model used for Task A."""
        return "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-AWQ"
    
    def get_prompt_template(self) -> str:
        """Return the prompt; ``{user_input}`` is filled with the medical record."""
        return """
        당신은 의료 전문가입니다. 다음 의료 기록을 바탕으로 Brief Hospital Course를 작성해주세요.
        
        의료 기록:
        {user_input}
        
        위 의료 기록을 바탕으로 환자의 Brief Hospital Course를 간결하고 명확하게 작성해주세요.
        주요 진단, 치료 과정, 경과를 포함하여 작성하세요:
        """
    
    async def preprocess_data(self, data: Any) -> Dict[str, Any]:
        """Map one input row to the prompt variables.

        Reads the ``'medical record'`` field of ``data`` and returns it as a
        string under ``'user_input'``.
        """
        medical_record = data['medical record']
        return {'user_input': str(medical_record)}
    
    async def postprocess_result(self, result: str) -> str:
        """Return the model output with surrounding whitespace removed."""
        return result.strip()

# Task B: Radiology Impression 요약


In [None]:
class TaskBProcessor(DatathonProcessor):
    """Task B: summarize a radiology report into an IMPRESSION."""

    def get_model_name(self) -> str:
        """Return the model used for Task B."""
        return "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-AWQ"
    
    def get_prompt_template(self) -> str:
        """Return the prompt; ``{user_input}`` is filled with the radiology report."""
        return """
        당신은 방사선과 전문의입니다. 다음 방사선 검사 보고서를 바탕으로 IMPRESSION을 작성해주세요.
        
        방사선 검사 보고서:
        {user_input}
        
        위 검사 결과를 바탕으로 의학적으로 정확한 IMPRESSION을 작성해주세요.
        주요 소견과 임상적 의미를 포함하여 작성하세요:
        """
    
    async def preprocess_data(self, data: Any) -> Dict[str, Any]:
        """Map one input row to the prompt variables.

        Reads the ``'radiology report'`` field of ``data`` and returns it as a
        string under ``'user_input'``.
        """
        radiology_report = data['radiology report']
        return {'user_input': str(radiology_report)}
    
    async def postprocess_result(self, result: str) -> str:
        """Return the model output with surrounding whitespace removed."""
        return result.strip()

---

# Task C: ICD 10 코드 예측


In [None]:

class TaskCProcessor(DatathonProcessor):
    """Task C: predict ICD codes from a discharge summary."""

    def get_model_name(self) -> str:
        """Return the model used for Task C."""
        return "meta-llama/Llama-3.1-8B-Instruct"
    
    def get_prompt_template(self) -> str:
        """Return the prompt; ``{user_input}`` is filled with the hospital course text."""
        return """
        당신은 의료 코딩 전문가입니다. 다음 퇴원 요약을 바탕으로 적절한 ICD 코드를 예측해주세요.
        
        퇴원 요약:
        {user_input}
        
        위 퇴원 요약을 바탕으로 적절한 ICD 코드를 예측해주세요.
        주 진단과 부 진단을 구분하여 작성하세요:
        """
    
    async def preprocess_data(self, data: Any) -> Dict[str, Any]:
        """Map one input row to the prompt variables.

        Reads the ``'hospital_course'`` field of ``data`` and returns it as a
        string under ``'user_input'``.
        """
        # This line was over-indented before, which made the whole cell a syntax error.
        hospital_course = data['hospital_course']
        return {'user_input': str(hospital_course)}
    
    async def postprocess_result(self, result: str) -> str:
        """Return the model output with surrounding whitespace removed."""
        return result.strip()

---

# 1. 데이터 로딩 함수들

In [11]:
def _load_task_csv(data_dir, task_letter, split):
    """Shared loader behind the three ``load_task_*_data`` functions.

    Args:
        data_dir: directory holding the CSV files (``str`` or ``pathlib.Path``).
        task_letter: ``"A"``, ``"B"`` or ``"C"``; selects ``task<letter>_*.csv``.
        split: ``"train"`` loads the train file; any other value loads the test file.

    Returns:
        The loaded DataFrame, or ``None`` when the file does not exist.
    """
    split_name = "train" if split == "train" else "test"
    file_path = pathlib.Path(data_dir) / f"task{task_letter}_{split_name}.csv"

    if not file_path.exists():
        print(f"파일을 찾을 수 없습니다: {file_path}")
        return None

    # Read first so the "load complete" message is only printed after a successful read.
    frame = pd.read_csv(file_path)
    print(f"Task {task_letter} {split} 데이터 로드 완료: {file_path}")
    return frame


def load_task_a_data(data_dir, split="test"):
    """
    Load the Task A CSV for the given split.

    Returns the DataFrame, or None when the file is missing.
    """
    return _load_task_csv(data_dir, "A", split)


def load_task_b_data(data_dir, split="test"):
    """
    Load the Task B CSV for the given split.

    Returns the DataFrame, or None when the file is missing.
    """
    return _load_task_csv(data_dir, "B", split)


def load_task_c_data(data_dir, split="test"):
    """
    Load the Task C CSV for the given split.

    Returns the DataFrame, or None when the file is missing.
    """
    return _load_task_csv(data_dir, "C", split)

---

## 2. 데이터 로드 및 정렬

In [12]:
# Directory holding the task CSV files (change to the real location).
data_dir = pathlib.Path('./data/')  # actual data path

# Number of leading rows kept from each split for this walkthrough.
N_SAMPLES = 10


def _first_rows(frame, n=N_SAMPLES):
    """Return the first ``n`` rows, or None when the loader found no file.

    The loaders return None for a missing file and later cells check for
    None, so slicing must not raise on it.
    """
    return None if frame is None else frame[:n]


train_data_a = _first_rows(load_task_a_data(data_dir, split="train"))
train_data_b = _first_rows(load_task_b_data(data_dir, split="train"))
train_data_c = _first_rows(load_task_c_data(data_dir, split="train"))

test_data_a = _first_rows(load_task_a_data(data_dir, split="test"))
test_data_b = _first_rows(load_task_b_data(data_dir, split="test"))
test_data_c = _first_rows(load_task_c_data(data_dir, split="test"))

Task A train 데이터 로드 완료: data/taskA_train.csv
Task B train 데이터 로드 완료: data/taskB_train.csv
Task C train 데이터 로드 완료: data/taskC_train.csv
Task A test 데이터 로드 완료: data/taskA_test.csv
Task B test 데이터 로드 완료: data/taskB_test.csv
Task C test 데이터 로드 완료: data/taskC_test.csv


---

## 3. 프로세서 초기화

In [13]:
# Create one processor instance per task; each builds its own LLM client from API_KEY.
task_a = TaskAProcessor(api_key=API_KEY)
task_b = TaskBProcessor(api_key=API_KEY)
task_c = TaskCProcessor(api_key=API_KEY)

---

## 4. Task별 프롬프트 예제 확인

In [14]:
print("Task별 프롬프트 예제 확인\n")

# One entry per task: (test frame, title line, input label, text column, output line).
_examples = [
    (test_data_a, "Task A: Brief Hospital Course 작성", "Medical Record",
     'medical record', "출력: Brief Hospital Course (입원경과 요약)\n"),
    (test_data_b, "Task B: Radiology Impression 요약", "Radiology Report",
     'radiology report', "출력: IMPRESSION (방사선 소견 요약)\n"),
    (test_data_c, "Task C: ICD-10 코드 예측", "Hospital Course",
     'hospital_course', "출력: ICD-10 코드들 (예: A01, B02, C03 )\n"),
]

for _frame, _title, _input_label, _text_column, _output_line in _examples:
    # Skip tasks whose test file was missing or empty.
    if _frame is None or len(_frame) == 0:
        continue
    sample_data = _frame.iloc[0]
    print(_title)
    print(f"입력: {_input_label} (sample_id: {sample_data['sample_id']})")
    # Only the first 150 characters of the input text are shown.
    print(f"샘플: {str(sample_data[_text_column])[:150]}...")
    print(_output_line)

Task별 프롬프트 예제 확인

Task A: Brief Hospital Course 작성
입력: Medical Record (sample_id: 0)
샘플: Name:  ___                  Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   M
 
Serv...
출력: Brief Hospital Course (입원경과 요약)

Task B: Radiology Impression 요약
입력: Radiology Report (sample_id: 0)
샘플: EXAMINATION:  C-SPINE NON-TRAUMA ___ VIEWS

INDICATION:  ___ year old woman POD#5 LAMINECTOMY FUSION W/INSTRUMENTATION
C3-C7 now s/p drain removal// E...
출력: IMPRESSION (방사선 소견 요약)

Task C: ICD-10 코드 예측
입력: Hospital Course (sample_id: 0)
샘플: Name:  ___.                  Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Ser...
출력: ICD-10 코드들 (예: A01, B02, C03 )



---

## 5. LLM 데이터 처리

In [15]:
# Run preprocess -> LLM -> postprocess over the training rows, one task after another.
train_task_a_result = await task_a.summarize(train_data_a)
train_task_b_result = await task_b.summarize(train_data_b)
train_task_c_result = await task_c.summarize(train_data_c)

100%|██████████| 10/10 [00:00<00:00, 2166.82it/s]
100%|██████████| 10/10 [00:19<00:00,  1.92s/it]
100%|██████████| 10/10 [00:00<00:00, 14753.09it/s]
100%|██████████| 10/10 [00:00<00:00, 4083.24it/s]
100%|██████████| 10/10 [01:17<00:00,  7.71s/it]
100%|██████████| 10/10 [00:00<00:00, 17360.53it/s]
100%|██████████| 10/10 [00:00<00:00, 2433.03it/s]
100%|██████████| 10/10 [01:23<00:00,  8.34s/it]
100%|██████████| 10/10 [00:00<00:00, 13604.62it/s]


# 6. validation

#### 0. 추가 라이브러리

In [16]:
from typing import List
from langevaluate.llmtestcase import LLMTestCase
from langevaluate.llmdataset import LLMDataset

## 1. Validation 데이터 준비

In [26]:
import copy

task_categories = ['quality', 'clinical_clarity', 'conciseness', 'hallucination']


def _collect_test_cases(inputs, outputs, targets):
    """Build ``{category: [LLMTestCase, ...]}`` from parallel sequences.

    Every category receives its own deep copy of each case, so the lists do
    not share test-case objects.
    """
    cases_by_category = {category: [] for category in task_categories}
    for source_text, generated_text, reference_text in zip(inputs, outputs, targets):
        template_case = LLMTestCase(
            input=source_text,
            output=generated_text,
            expected_output=reference_text
        )
        for category in task_categories:
            cases_by_category[category].append(copy.deepcopy(template_case))
    return cases_by_category


task_a_test_cases = _collect_test_cases(
    train_data_a['medical record'],
    train_task_a_result,
    train_data_a['target'],
)

task_b_test_cases = _collect_test_cases(
    train_data_b['radiology report'],
    train_task_b_result,
    train_data_b['target'],
)



## 1-1 dataset 생성

In [27]:
# Wrap each category's test-case list in an LLMDataset, Task A first, then Task B.
task_a_datasets = {}
for category in task_categories:
    task_a_datasets[category] = LLMDataset(test_cases=task_a_test_cases[category])

task_b_datasets = {}
for category in task_categories:
    task_b_datasets[category] = LLMDataset(test_cases=task_b_test_cases[category])

---

## 2. LLM 채점용 template 확인

In [30]:
from langevaluate.metrics.summary_judge.summary_judge_metric import SummaryJudgeMetric
from langevaluate.metrics.summary_judge.summary_judge_template import SummaryJudgeTemplate

# Judge metric for Task A using the 'brief_hospital_course_quality' category,
# scored by the validation LLM.
task_a_quality = SummaryJudgeMetric(
    score_model=validation_llm,
    category='brief_hospital_course_quality',
    template_language='en',  # 'ko' or 'en'
    generate_template_type='reasoning'  # 'reasoning'
)

In [31]:
# Show the prompt template the judge LLM receives for the metric created above.
# (The previous name, metric_summarization_quality, is not defined in this notebook.)
print(task_a_quality.template_for_judge.messages[0].prompt.template)

<role>Evaluate the quality of the AI assistant’s brief hospital course of the clinical history shown below.</role>

<task>
Provide your reasoning and score with only the integer (0, 1, 2, 3, 4, 5).

Evaluation Rubric (compare the generated brief hospital course against BOTH the clinical history and the expected brief hospital course):
Question: The brief hospital course is irrelevant to the clinical history or fails to address the question.

5 : The brief hospital course is excellent — it captures all the key points clearly, concisely, and naturally.
4 : The brief hospital course is good — it conveys most of the key information, but may miss some nuance or clarity.
3 : The brief hospital course is fair — it includes some important points but omits or distorts others, leading to partial understanding.
2 : The brief hospital course is weak — it captures only a small portion of the original meaning, or is vague and confusing.
1 : The brief hospital course is poor — it barely addresses the

In [33]:
def _judge_metrics(category_prefix):
    """Create one SummaryJudgeMetric per entry of ``task_categories``.

    The metric category is ``f"{category_prefix}_{category}"``; all metrics
    share the validation LLM, the English template and the 'reasoning' type.
    """
    return {
        category: SummaryJudgeMetric(
            score_model=validation_llm,
            category=f'{category_prefix}_{category}',
            template_language='en',
            generate_template_type='reasoning',
        )
        for category in task_categories
    }


task_a_metrics = _judge_metrics('brief_hospital_course')
task_b_metrics = _judge_metrics('radiology_impression')

In [40]:
# Display the Task A 'quality' metric object to confirm it was created.
task_a_metrics['quality']

<langevaluate.metrics.summary_judge.summary_judge_metric.SummaryJudgeMetric at 0x177424940>

---

## 3. 평가 실행

In [None]:
result = await task_a_metrics['quality'].ameasure(datasets['summarization_quality'])

### 평균 점수

In [None]:
# Mean judge score over all evaluated cases; 0 when nothing was scored,
# matching how the ICD average is guarded further down.
scores = sum(int(case.score) for case in result) / len(result) if len(result) else 0

In [None]:
# Display the mean judge score computed in the previous cell.
scores

2.8

---

## 4. BertScore

In [53]:
from langevaluate.quantiative_metrics import BertScore

# Create the BertScore metric instance
bertscore_metric = BertScore(model_type="distilbert-base-uncased", batch_size=16)

# Toy hypotheses (generated texts)
test_hyps = [
    'Patient was admitted with chest pain and treated successfully.',
    'The radiology report shows normal findings.'
]

# Toy references, paired by position with test_hyps
test_refs = [
    'Patient admitted for chest pain, treatment was effective.',
    'Radiology report indicates normal results.'
]

# Compute BertScore; references and hypotheses are passed by keyword
bert_scores = bertscore_metric(refs=test_refs, hyps=test_hyps)
print(f"BertScore F1 결과: {bert_scores}")
print(f"평균 BertScore: {sum(bert_scores)/len(bert_scores):.4f}")

  0%|          | 0/10 [18:37<?, ?it/s]
Task exception was never retrieved
future: <Task finished name='Task-227' coro=<tqdm_asyncio.gather.<locals>.wrap_awaitable() done, defined at /Users/jin/Desktop/코딩/recent/MARS_challenge_2025/example/data_dummy/datathon/.venv/lib/python3.10/site-packages/tqdm/asyncio.py:75> exception=TypeError("object dict can't be used in 'await' expression")>
Traceback (most recent call last):
  File "/Users/jin/Desktop/코딩/recent/MARS_challenge_2025/example/data_dummy/datathon/.venv/lib/python3.10/site-packages/tqdm/asyncio.py", line 76, in wrap_awaitable
    return i, await f
TypeError: object dict can't be used in 'await' expression
Task exception was never retrieved
future: <Task finished name='Task-228' coro=<tqdm_asyncio.gather.<locals>.wrap_awaitable() done, defined at /Users/jin/Desktop/코딩/recent/MARS_challenge_2025/example/data_dummy/datathon/.venv/lib/python3.10/site-packages/tqdm/asyncio.py:75> exception=TypeError("object dict can't be used in

BertScore F1 결과: [0.9206523299217224, 0.9264420866966248]
평균 BertScore: 0.9235


In [None]:
# Task B BertScore computation
# NOTE(review): no cell in this notebook writes this CSV — presumably it is saved
# from the Task B results in a separate step; confirm before Restart & Run All.
submission_b_train = pd.read_csv('./result/submission_taskB_train.csv')
# Extract predictions and references as string lists
# NOTE(review): train_data_b was sliced to its first rows when loaded; this assumes
# the submission file holds the same rows in the same order — TODO confirm.
predictions = submission_b_train['target'].astype(str).tolist()
references = train_data_b['target'].astype(str).tolist()

# Compute BertScore per sample, then average
bert_scores_task_b = bertscore_metric(refs=references, hyps=predictions)
avg_bert_score_b = sum(bert_scores_task_b) / len(bert_scores_task_b)

print(f"평균 BertScore: {avg_bert_score_b:.4f}")

평균 BertScore: 0.7053


---

In [None]:
# Preview only the first rows instead of dumping the whole frame into the output.
train_data_a.head()

Unnamed: 0,sample_id,gender,anchor_age,anchor_year,anchor_year_group,dod,admittime,dischtime,deathtime,admission_type,...,dbp,sbp,temperature,pain,transfer_id,eventtype,careunit,intime,outtime,target
0,0,M,66,2189,2014 - 2016,2189-12-23,2189-12-18 12:58:00,2189-12-21 12:50:00,,URGENT,...,58.0,94.0,98.5,2,33399871,admit,Med/Surg,2189-12-18 15:26:00,2189-12-21 12:50:35,"___ year old man with with PMH of cirrhosis, H..."
1,1,F,91,2150,2014 - 2016,,2150-05-10 20:02:00,2150-05-13 16:40:00,,EW EMER.,...,55.0,145.0,98.9,0,31444432,transfer,Med/Surg,2150-05-11 22:24:12,2150-05-12 06:29:25,"___ with a PMH of severe Alzheimer's, HCV infe..."
2,2,M,91,2121,2017 - 2019,2121-03-24,2121-03-16 16:25:00,2121-03-20 17:52:00,,OBSERVATION ADMIT,...,79.0,154.0,98.0,2,32510227,ED,Emergency Department,2121-03-16 11:36:00,2121-03-16 17:53:00,TRANSITIONAL ISSUES \n==================== \nD...
3,3,M,59,2117,2011 - 2013,2126-11-20,2121-05-23 18:54:00,2121-05-27 12:45:00,,OBSERVATION ADMIT,...,104.0,151.0,102.7,9,33869192,admit,Transplant,2121-05-23 21:00:00,2121-05-24 10:26:24,Mr. ___ is a ___ year old man with history of ...
4,4,F,82,2131,2017 - 2019,2131-10-03,2131-08-25 23:59:00,2131-09-04 15:30:00,,OBSERVATION ADMIT,...,56.0,152.0,97.6,10,34923982,admit,Neuro Intermediate,2131-08-26 00:43:00,2131-08-27 11:36:02,SUMMARY\n============\n___ is a ___ year old f...


## 5. ICDScore

In [54]:
from langevaluate.quantiative_metrics import ICDScore

In [55]:
# Task C ICD score computation
# NOTE(review): no cell in this notebook writes this CSV — confirm where it is produced.
submission_c_train = pd.read_csv('./result/submission_taskC_train.csv')

# 예측값과 정답 추출 및 전처리
def parse_icd_codes(codes_str):
    """Split a comma-separated ICD code string into a list of trimmed codes.

    Missing values (as detected by ``pd.isna``) give an empty list; empty
    pieces between commas are dropped. Non-string input is converted with
    ``str`` first.
    """
    if pd.isna(codes_str):
        return []
    trimmed_pieces = (piece.strip() for piece in str(codes_str).split(','))
    return [piece for piece in trimmed_pieces if piece]

# Parse the code strings of both frames into lists of codes.
predictions_icd = submission_c_train['target'].apply(parse_icd_codes).tolist()
references_icd = train_data_c['target'].apply(parse_icd_codes).tolist()


# NOTE(review): arguments are positional here, while BertScore is called with
# refs=/hyps= keywords — confirm ICDScore's parameter order.
icd_scores = ICDScore()(predictions_icd, references_icd)
# Guard against an empty score list before averaging.
avg_icd_score = sum(icd_scores) / len(icd_scores) if icd_scores else 0

print(f"평균 F1 Score: {avg_icd_score:.4f}")

평균 F1 Score: 0.0700


## 6. 공정성 지표

In [56]:
from langevaluate.quantiative_metrics import FairnessScore

In [None]:
# Fairness metrics over the Task B per-sample BertScores (bert_scores_task_b
# comes from the Task B BertScore cell above).
fairness_score = FairnessScore()
genders = train_data_b['gender']
age = train_data_b['anchor_age']
# The same scores are evaluated once against gender and once against age;
# presumably `type` selects how the attribute is grouped — see FairnessScore docs.
sex_fairness_score = fairness_score(genders, bert_scores_task_b, type='sex')
age_fairness_score = fairness_score(age, bert_scores_task_b, type='age')

In [None]:
# Print the sex and age fairness scores computed in the previous cell.
print('성별 공정성 지표 : ', sex_fairness_score)
print('나이 공정성 지표 : ', age_fairness_score)