In [1]:
import os
import yaml
from time import time

# Path to the YAML configuration file (relative to the notebook's working directory).
config_path = "./configuration/config.yaml"

def load_configurations(config_path):
    """Read the YAML file at ``config_path`` and return the parsed content.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file (a dict for the
        mapping-style configuration used by this notebook).

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding (the configuration may contain non-ASCII text).
    with open(config_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)

# Load the configuration and echo it for the record
configs = load_configurations(config_path)
print("Configurations Loaded Successfully:")
print(configs)

# Pull out the sections used by the rest of the notebook
device_config, model_config = configs["device"], configs["model"]
evaluation_settings = configs["evaluation"]
active_metrics = evaluation_settings["metrics"]
output_dir = evaluation_settings["output_dir"]

# Result file name: <type>-<device>_<llm>-<quantization>_calib-<calibration>
name_config = "{}-{}_{}-{}_calib-{}".format(
    device_config["type"],
    device_config["model"],
    model_config["name"],
    model_config["quantization"],
    model_config.get("calibration", "none"),
)
result_folder = os.path.join(output_dir, 'coqa')
result_file = os.path.join(result_folder, name_config + ".txt")

# Make sure the result folder is there before anything tries to write into it
os.makedirs(result_folder, exist_ok=True)

# Computation is only needed when no result file has been produced yet
proceed_calc = not os.path.exists(result_file)
if proceed_calc:
    print("Result file does not exist. Proceeding with computation...")
else:
    print(f"Result file already exists at: {result_file}. Skipping computation.")

Configurations Loaded Successfully:
{'device': {'type': 'NPU', 'model': 'RNGD', 'count': 1}, 'model': {'name': 'llama3.1-8B-Instruct', 'quantization': 'W8A8', 'calibration': 'base'}, 'evaluation': {'task': 'chatbot', 'metrics': {'precision': True, 'recall': True, 'f1': True, 'tps': True, 'power_consumption': False, 'memory_usage': False}, 'output_dir': './results/chatbot'}}
Result file does not exist. Proceeding with computation...


## 답변 생성

In [2]:
import os
from furiosa_llm import LLM, SamplingParams

# Request full Rust backtraces; presumably consumed by the native runtime
# underneath furiosa_llm — TODO confirm.
os.environ["RUST_BACKTRACE"] = "full"
# Load the model from pre-built artifacts and bind it to the device spec "npu:1:*".
# NOTE(review): the artifact directory is an absolute, machine-specific path and the
# device spec is hard-coded rather than taken from the loaded configuration.
furiosa_llm = LLM.from_artifacts("/home/elicer/renegade/Llama-3.1-8B-Instruct", devices="npu:1:*")
# Sampling settings shared by every generation call: temperature 0 and at most
# 400 generated tokens per request.
sampling_params = SamplingParams(temperature=0, max_tokens=400)


    PyTorch 2.5.1+cu121 with CUDA 1201 (you have 2.1.0+cu121)
    Python  3.10.15 (you have 3.10.15)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
INFO:2025-01-10 10:19:52 Prefill buckets: [Bucket(batch_size=1, attention_size=512), Bucket(batch_size=1, attention_size=1024)]
INFO:2025-01-10 10:19:52 Decode buckets: [Bucket(batch_size=64, attention_size=2048), Bucket(batch_size=128, attention_size=2048)]
INFO:2025-01-10 10:19:52 For some LLaMA V1 models, initializing the fast tokenizer may take a long time. To reduce the initialization time, consider using 'hf-internal-testing/llama-tokenizer' instead of the original tokenizer.


In [3]:
from transformers import AutoTokenizer
import json
from src.chatbot.dataset import load_data, get_inputs, get_inputs_npu
from nltk.tokenize import word_tokenize

# Build the prompt list
# NOTE(review): numTest is not referenced by any other visible cell; the generation
# cell passes len(prompt_list) instead.
numTest = 7983

def get_prompt_list(data_folder, eval_dataset, model_name, num_ctx, max_output_len):
    """Load an evaluation dataset and turn it into NPU-ready prompts.

    Args:
        data_folder: Directory that contains the dataset file.
        eval_dataset: Dataset key; only ``"coqa"`` (file ``dev.json``) is supported.
        model_name: Accepted for interface compatibility; not used by this function.
        num_ctx: Forwarded unchanged to ``get_inputs_npu``.
        max_output_len: Forwarded unchanged to ``get_inputs_npu``.

    Returns:
        The value returned by ``get_inputs_npu`` for the loaded samples.

    Raises:
        ValueError: If ``eval_dataset`` is not a supported dataset name.
    """
    dataset_paths = {"coqa": "dev.json"}

    # Validate the cheap argument first so a typo fails immediately instead of
    # after the tokenizer has been fetched and loaded.
    if eval_dataset not in dataset_paths:
        raise ValueError("Invalid dataset name provided.")

    input_path = os.path.join(data_folder, dataset_paths[eval_dataset])
    data_list = load_data(input_path)
    print(f"Number of samples in dataset: {len(data_list)}")

    # NOTE(review): the tokenizer is fixed to 'nvidia/ChatQA-1.5-8B' regardless of
    # model_name — confirm this is intended for every evaluated model.
    tokenizer = AutoTokenizer.from_pretrained('nvidia/ChatQA-1.5-8B')
    return get_inputs_npu(data_list, eval_dataset, tokenizer, num_ctx=num_ctx, max_output_len=max_output_len)

# Build the CoQA prompts from ./data/chatbot/coqa/dev.json.
# num_ctx and max_output_len are passed through to get_inputs_npu; their exact
# meaning is defined there (presumably context turns kept and an output-length
# budget — TODO confirm).
prompt_list = get_prompt_list(
    data_folder='./data/chatbot/coqa/',
    eval_dataset='coqa',
    model_name=model_config["name"],
    num_ctx=3,
    max_output_len=200
)



loading data from ./data/chatbot/coqa/dev.json
Number of samples in dataset: 7983


In [4]:
# Display the effective sampling parameters (including library defaults).
sampling_params

SamplingParams(n=1, best_of=1, temperature=0, top_p=1.0, top_k=-1, use_beam_search=False, length_penalty=1.0, early_stopping=False, max_tokens=400, min_tokens=0)

In [5]:
def npu_generate_responses(prompt_list, sampling_params, bos_token="<|begin_of_text|>", num_test=5, max_length=1024):
    """
    Generate one response per prompt on the NPU and measure generation throughput.

    Args:
    - prompt_list: list of prompt strings
    - sampling_params: sampling parameters forwarded to ``furiosa_llm.generate``
    - bos_token: accepted for backward compatibility; not used by this function
    - num_test: number of prompts (taken from the front of ``prompt_list``) to run
    - max_length: accepted for backward compatibility; the limit is NOT enforced
      here. A prompt the model rejects (e.g. "input sequence too long") raises
      ``ValueError`` inside ``generate`` and is recorded as ``"<ERROR>"``.

    Returns:
    - output_list: one response per processed prompt, in input order, with
      newlines replaced by spaces (``"<ERROR>"`` for failed prompts)
    - tps: generated tokens per second for each prompt (0 for failed prompts)
    """
    output_list = []
    tps = []

    for prompt in prompt_list[:num_test]:
        # Time only the generation call itself.
        start_time = time()
        try:
            output = furiosa_llm.generate(prompt, sampling_params)
        except ValueError as e:
            # Keep a placeholder so outputs stay aligned line-by-line with the prompts.
            print(f"Error generating response: {e}")
            output_list.append("<ERROR>")
            tps.append(0)
            continue
        elapsed_time = time() - start_time

        completion = output.outputs[0]

        # Drop the first two characters of the raw text, trim it, and flatten
        # newlines so each response occupies exactly one line in the result file.
        generated_text = completion.text[2:].strip().replace("\n", " ")

        # Throughput must count what was GENERATED, not the prompt: use the
        # completion's token ids when the backend exposes them, otherwise fall
        # back to a word-level tokenization of the generated text.
        token_ids = getattr(completion, "token_ids", None)
        if token_ids is not None:
            num_generated = len(token_ids)
        else:
            num_generated = len(word_tokenize(completion.text))
        tps.append(num_generated / elapsed_time if elapsed_time > 0 else 0)

        output_list.append(generated_text)

    return output_list, tps

In [6]:
if proceed_calc:
    output_list, tps_list = npu_generate_responses(
        prompt_list, sampling_params, num_test=len(prompt_list)
    )
    # Save the results: one response per line, in prompt order. UTF-8 is pinned so
    # the file does not depend on the platform's default encoding.
    with open(result_file, "w", encoding="utf-8") as f:
        f.writelines(f"{output}\n" for output in output_list)
else:
    # The setup cell found an existing result file: honor its decision instead of
    # regenerating everything and overwriting that file. No throughput samples
    # exist for this session, so tps_list is left empty.
    output_list, tps_list = [], []
    print(f"Using existing result file: {result_file}")

Error generating response: unsupported input: input sequence too long - max: 1024, received: 1116
Error generating response: unsupported input: input sequence too long - max: 1024, received: 1131
Error generating response: unsupported input: input sequence too long - max: 1024, received: 1143
Error generating response: unsupported input: input sequence too long - max: 1024, received: 1152
Error generating response: unsupported input: input sequence too long - max: 1024, received: 1147
Error generating response: unsupported input: input sequence too long - max: 1024, received: 1154
Error generating response: unsupported input: input sequence too long - max: 1024, received: 1153
Error generating response: unsupported input: input sequence too long - max: 1024, received: 1158
Error generating response: unsupported input: input sequence too long - max: 1024, received: 1158
Error generating response: unsupported input: input sequence too long - max: 1024, received: 1156
Error generating res

## Metric 뽑기

In [7]:
import os
import pandas as pd
import numpy as np
from src.chatbot.get_scores import evaluate_f1

# Leaderboard update helpers
_LEADERBOARD_KEYS = ("device-type", "device-name", "llm", "quantization", "calibration")


def _parse_result_filename(filename):
    """Split ``<type>-<device>_<llm>-<quant>_calib-<calibration>.txt`` into its fields.

    Returns a dict keyed by ``_LEADERBOARD_KEYS``, or ``None`` when the name does
    not follow that pattern.
    """
    name_head = os.path.splitext(filename)[0]
    head, _, calibration = name_head.partition("_calib-")
    device_part, _, model_part = head.partition("_")
    device_type, _, device_name = device_part.partition("-")
    # The model name may itself contain '-' (e.g. "llama3.1-8B-Instruct"), so the
    # quantization tag is whatever follows the LAST '-'.
    llm, _, quantization = model_part.rpartition("-")

    fields = (device_type, device_name, llm, quantization, calibration)
    if not all(fields):
        return None
    return dict(zip(_LEADERBOARD_KEYS, fields))


def update_leaderboard(result_dir, leaderboard_file, active_metrics, ground_truth_file, tps_values=None):
    """Score every new result file in ``result_dir`` and append it to the leaderboard CSV.

    Args:
        result_dir: Directory scanned for ``*.txt`` result files.
        leaderboard_file: CSV path; loaded if it exists, otherwise created.
        active_metrics: Either a mapping ``metric name -> enabled flag`` or an
            iterable of enabled metric names. Only enabled metrics are written.
        ground_truth_file: Reference file passed to ``evaluate_f1``.
        tps_values: Optional per-prompt tokens-per-second samples whose median is
            stored as ``tps``. When omitted, a notebook-level ``tps_list`` is used
            if one exists; otherwise ``tps`` is NaN.

    Side effects:
        Writes ``leaderboard_file`` (even when nothing new was added) and prints
        one progress line per result file.

    Notes:
        - The same TPS median is applied to every file added in one call; run
          this right after generating a single result file.
        - Rows written by the previous parser stored only the first ``-`` segment
          of a hyphenated model name; such rows will not match the full name and
          their files will be scored again.
    """
    # Resolve which metrics are switched on (flag mapping or plain list of names).
    if isinstance(active_metrics, dict):
        enabled = [name for name, is_on in active_metrics.items() if is_on]
    else:
        enabled = list(active_metrics)

    # Fall back to the notebook-level samples without raising NameError on a
    # kernel where generation did not run.
    if tps_values is None:
        tps_values = globals().get("tps_list")
    has_tps = tps_values is not None and len(tps_values) > 0
    median_tps = np.median(tps_values) if has_tps else np.nan

    # Load the existing leaderboard or start a new one
    if os.path.exists(leaderboard_file):
        leaderboard = pd.read_csv(leaderboard_file)
    else:
        leaderboard = pd.DataFrame(columns=[*_LEADERBOARD_KEYS, *enabled])

    # Keys already present; compared as strings so values that read_csv parsed as
    # numbers still match the text taken from file names.
    seen = set(zip(*(leaderboard[col].astype(str) for col in _LEADERBOARD_KEYS)))

    new_rows = []
    for filename in sorted(os.listdir(result_dir)):
        if not filename.endswith(".txt"):
            continue

        # Extract metadata from the file name
        metadata = _parse_result_filename(filename)
        if metadata is None:
            print(f"Skipping {filename}, name does not match the expected pattern.")
            continue

        key = tuple(metadata[col] for col in _LEADERBOARD_KEYS)
        if key in seen:
            print(f"Skipping {filename}, already in leaderboard.")
            continue

        # NOTE(review): the recorded run prints f1 above both precision and recall,
        # which an averaged F1 cannot be — confirm evaluate_f1's return order.
        result_path = os.path.join(result_dir, filename)
        precision, recall, f1 = evaluate_f1(ground_truth_file, result_path)
        computed = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "tps": median_tps,
        }

        new_rows.append({
            **metadata,
            **{name: value for name, value in computed.items() if name in enabled},
        })
        seen.add(key)
        print(f"Processed and added {filename} to leaderboard.")

    # Append all new rows in one step instead of concatenating inside the loop.
    if new_rows:
        additions = pd.DataFrame(new_rows)
        if leaderboard.empty:
            columns = list(leaderboard.columns) + [
                col for col in additions.columns if col not in leaderboard.columns
            ]
            leaderboard = additions.reindex(columns=columns)
        else:
            leaderboard = pd.concat([leaderboard, additions], ignore_index=True)

    # Save the leaderboard
    leaderboard.to_csv(leaderboard_file, index=False)
    print(f"Leaderboard updated and saved to {leaderboard_file}")

# Run the leaderboard update
# Scan the same config-derived folder the result file was written to, so a change
# of evaluation.output_dir cannot leave the leaderboard reading a stale directory.
result_dir = result_folder
leaderboard_file = os.path.join(result_dir, "leaderboard.csv")
ground_truth_file = "./data/chatbot/coqa/dev.json"

update_leaderboard(result_dir, leaderboard_file, active_metrics, ground_truth_file)

Skipping GPU-A100_llama3.1-Q4_K_M_calib-base.txt, already in leaderboard.
Skipping GPU-A100_llama3.1:70b-Q4_K_M_calib-base.txt, already in leaderboard.
Skipping GPU-A100_qwen2.5:72b-Q4_K_M_calib-base.txt, already in leaderboard.
Skipping GPU-A5000_llama3.1-Q4_K_M_calib-base.txt, already in leaderboard.
Skipping GPU-A5000_llama3.1:70b-Q4_K_M_calib-base.txt, already in leaderboard.
Skipping GPU-A5000_llama3.3:70b-Q4_K_M_calib-base.txt, already in leaderboard.
7983 7983
Method: default; Precision: 0.6995; recall: 0.7344; f1: 0.7929
Processed and added NPU-RNGD_llama3.1-8B-Instruct-W8A8_calib-base.txt to leaderboard.
Leaderboard updated and saved to ./results/chatbot/coqa/leaderboard.csv
