In [None]:
import os
import yaml

# Path to the YAML configuration file read by load_configurations() below
config_path = "./configuration/config.yaml"

def load_configurations(config_path):
    """Load a YAML configuration file and return its parsed contents.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file. The notebook
        indexes it as a mapping with "device", "model" and "evaluation" keys.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's default
    # locale encoding, which can mis-decode non-ASCII text in the config.
    with open(config_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)

# Load the configuration
configs = load_configurations(config_path)
print("Configurations Loaded Successfully:")
print(configs)

# Pull out the sections used by the rest of the notebook
device_config = configs["device"]
model_config = configs["model"]
evaluation_settings = configs["evaluation"]
active_metrics = evaluation_settings["metrics"]
output_dir = evaluation_settings["output_dir"]

# Build the result file name:
#   <device type>-<device model>_<llm name>-<quantization>_calib-<calibration>
name_config = "{}-{}_{}-{}_calib-{}".format(
    device_config["type"],
    device_config["model"],
    model_config["name"],
    model_config["quantization"],
    model_config.get("calibration", "none"),
)
result_folder = os.path.join(output_dir, "coqa")
result_file = os.path.join(result_folder, name_config + ".txt")

# Make sure the result folder exists
os.makedirs(result_folder, exist_ok=True)

# Generation only needs to run when no result file is present yet
proceed_calc = not os.path.exists(result_file)
if proceed_calc:
    print("Result file does not exist. Proceeding with computation...")
else:
    print(f"Result file already exists at: {result_file}. Skipping computation.")

Configurations Loaded Successfully:
{'device': {'type': 'GPU', 'model': 'A5000', 'count': 1}, 'model': {'name': 'llama3.3:70b', 'quantization': 'Q4_K_M', 'calibration': 'base'}, 'evaluation': {'task': 'chatbot', 'metrics': {'precision': True, 'recall': True, 'f1': True, 'tps': False, 'power_consumption': False, 'memory_usage': False}, 'output_dir': './results/chatbot'}}
Result file does not exist. Proceeding with computation...


## 답변 생성

In [None]:
from transformers import AutoTokenizer
import json
from src.chatbot.dataset import load_data, get_inputs
from llama_index.llms.ollama import Ollama

# Prompt list creation
# numTest caps how many prompts are sent to the LLM: generate_responses()
# only iterates over prompt_list[:numTest].
numTest = 300

def get_prompt_list(data_folder, eval_dataset, model_name, num_ctx, max_output_len):
    """Load an evaluation dataset and convert it into model inputs.

    Args:
        data_folder: Directory that contains the dataset file.
        eval_dataset: Dataset key; only "coqa" (mapped to "dev.json") is known.
        model_name: Currently unused; the tokenizer is always loaded from
            'nvidia/ChatQA-1.5-8B'.
        num_ctx: Forwarded unchanged to ``get_inputs``.
        max_output_len: Forwarded unchanged to ``get_inputs``.

    Returns:
        Whatever ``get_inputs`` returns; the notebook slices and iterates it
        as a sequence of prompt strings.

    Raises:
        ValueError: If ``eval_dataset`` is not a known dataset key.
    """
    dataset_paths = {"coqa": "dev.json"}

    # Validate the dataset name first so a typo fails immediately instead of
    # after the tokenizer has been loaded.
    if eval_dataset not in dataset_paths:
        raise ValueError("Invalid dataset name provided.")
    input_path = os.path.join(data_folder, dataset_paths[eval_dataset])

    tokenizer = AutoTokenizer.from_pretrained('nvidia/ChatQA-1.5-8B')

    data_list = load_data(input_path)
    print(f"Number of samples in dataset: {len(data_list)}")
    return get_inputs(data_list, eval_dataset, tokenizer, num_ctx=num_ctx, max_output_len=max_output_len)

# LLM response generation
def generate_responses(prompt_list, llm_model, max_tokens, bos_token="<|begin_of_text|>", num_samples=None):
    """Run prompts through an Ollama-served model and collect one-line answers.

    Args:
        prompt_list: Sequence of prompt strings.
        llm_model: Model name passed to ``Ollama(model=...)``.
        max_tokens: Passed to the ``Ollama`` constructor as ``max_tokens``.
        bos_token: String prepended to every prompt before completion.
        num_samples: Number of prompts, taken from the start of
            ``prompt_list``, to run. ``None`` (the default) falls back to the
            module-level ``numTest`` so existing callers behave as before.

    Returns:
        list[str]: One response per processed prompt, stripped of surrounding
        whitespace and with newlines replaced by spaces (so each response fits
        on one line of the result file).
    """
    if num_samples is None:
        num_samples = numTest

    # NOTE(review): confirm that this Ollama client actually applies
    # `max_tokens`; Ollama's own generation-length option is `num_predict`.
    llm = Ollama(
        model=llm_model,
        temperature=0,
        request_timeout=600,
        max_tokens=max_tokens
    )
    return [
        llm.complete(bos_token + prompt).text.strip().replace("\n", " ")
        for prompt in prompt_list[:num_samples]
    ]

if proceed_calc:
    # Build the CoQA prompts
    prompt_list = get_prompt_list(
        data_folder='./data/chatbot/coqa/',
        eval_dataset='coqa',
        model_name=model_config["name"],
        num_ctx=3,
        max_output_len=128,
    )

    # Query the model named in the configuration
    output_list = generate_responses(
        prompt_list,
        llm_model=model_config["name"],
        max_tokens=200,
    )

    # Save the results, one response per line
    with open(result_file, "w") as f:
        f.writelines(output + "\n" for output in output_list)

loading data from ./data/chatbot/coqa/dev.json
Number of samples in dataset: 7983


## Metric 뽑기

In [6]:
import os
import pandas as pd
import numpy as np
from src.chatbot.get_scores import evaluate_f1

# Leaderboard update function
def _parse_result_filename(filename):
    """Split a ``.txt`` result file name into its leaderboard metadata.

    The expected layout is
    ``<device-type>-<device-name>_<llm>-<quantization>_calib-<calibration>.txt``.

    Returns:
        dict with the keys "device-type", "device-name", "llm", "quantization"
        and "calibration", or ``None`` when the name does not follow the layout.
    """
    name_head = filename[: -len(".txt")]  # strip only the trailing extension
    head, calib_sep, calibration = name_head.partition("_calib-")
    device, device_sep, model = head.partition("_")
    device_type, type_sep, device_name = device.partition("-")
    # The quantization tag is whatever follows the LAST hyphen, so hyphenated
    # model names (e.g. "deepseek-r1:70b") are kept intact.
    llm, quant_sep, quantization = model.rpartition("-")

    separators = (calib_sep, device_sep, type_sep, quant_sep)
    fields = (device_type, device_name, llm, quantization, calibration)
    if not all(separators) or not all(fields):
        return None
    return {
        "device-type": device_type,
        "device-name": device_name,
        "llm": llm,
        "quantization": quantization,
        "calibration": calibration,
    }


def update_leaderboard(result_dir, leaderboard_file, active_metrics, ground_truth_file):
    """Score every new result file in ``result_dir`` and append it to the leaderboard CSV.

    Args:
        result_dir: Directory scanned for ``*.txt`` result files.
        leaderboard_file: CSV path; loaded if it exists, otherwise created.
        active_metrics: Metric column names used only to seed the columns of a
            newly created leaderboard.
        ground_truth_file: Passed as the first argument to ``evaluate_f1``.

    Notes:
        precision, recall and f1 are written for every new entry regardless of
        ``active_metrics``. Files whose name does not match the expected
        layout are reported and skipped. The CSV is written once, at the end.
    """
    key_columns = ["device-type", "device-name", "llm", "quantization", "calibration"]

    # Load the existing leaderboard, or start an empty one
    if os.path.exists(leaderboard_file):
        leaderboard = pd.read_csv(leaderboard_file)
    else:
        leaderboard = pd.DataFrame(columns=[
            *key_columns,
            *active_metrics  # only the active metrics seed the columns
        ])

    # Configurations already on the leaderboard (extended as new ones are added)
    known_keys = set(leaderboard[key_columns].itertuples(index=False, name=None))

    new_rows = []
    # sorted() keeps the row order deterministic across file systems
    for filename in sorted(os.listdir(result_dir)):
        if not filename.endswith(".txt"):
            continue

        # Recover the run metadata from the file name
        metadata = _parse_result_filename(filename)
        if metadata is None:
            print(f"Skipping {filename}, name does not match the expected pattern.")
            continue

        # Skip configurations that are already present
        key = tuple(metadata[column] for column in key_columns)
        if key in known_keys:
            print(f"Skipping {filename}, already in leaderboard.")
            continue

        # Compute precision, recall and F1 for this result file
        result_file = os.path.join(result_dir, filename)
        precision, recall, f1 = evaluate_f1(ground_truth_file, result_file)

        new_rows.append({
            **metadata,
            "precision": precision,
            "recall": recall,
            "f1": f1,
        })
        known_keys.add(key)
        print(f"Processed and added {filename} to leaderboard.")

    # Add all new rows in one step instead of concatenating once per file
    if new_rows:
        new_entries = pd.DataFrame(new_rows)
        if leaderboard.empty:
            # Concatenating onto an empty frame triggers a pandas FutureWarning;
            # keep the existing column order and append any new columns instead.
            columns = list(leaderboard.columns) + [
                column for column in new_entries.columns
                if column not in leaderboard.columns
            ]
            leaderboard = new_entries.reindex(columns=columns)
        else:
            leaderboard = pd.concat([leaderboard, new_entries], ignore_index=True)

    # Save the leaderboard
    leaderboard.to_csv(leaderboard_file, index=False)
    print(f"Leaderboard updated and saved to {leaderboard_file}")

# Run the leaderboard update
result_dir = "./results/chatbot/coqa"
ground_truth_file = "./data/chatbot/coqa/dev.json"
active_metrics = ["precision", "recall", "f1"]
# The leaderboard CSV lives next to the result files it summarises
leaderboard_file = os.path.join(result_dir, "leaderboard.csv")

update_leaderboard(
    result_dir=result_dir,
    leaderboard_file=leaderboard_file,
    active_metrics=active_metrics,
    ground_truth_file=ground_truth_file,
)

50 7983
Method: default; Precision: 0.5517; recall: 0.7088; f1: 0.6773
Processed and added GPU-A5000_llama3.3:70b-Q4_K_M_calib-base.txt to leaderboard.
Leaderboard updated and saved to ./results/chatbot/coqa/leaderboard.csv


  leaderboard = pd.concat([leaderboard, new_entry], ignore_index=True)
