# 서비스 지표 계산

- 환경 탐지 (GPU, NPU-FuriosaAI RNGD)

- 서비스 실행 (예: 번역 + 모니터링)

- 스코어 계산하기 

- 리더보드에 자동 업데이트 

In [1]:
# Path to the YAML configuration file; it is the argument given to
# load_all_configurations() when the configuration is loaded.
config_path = "./configuration/config.yaml"

In [2]:
from time import time
from src.common.load_config import load_all_configurations
import os, json
import pandas as pd
import numpy as np

# Load every configuration section from the file at config_path.
all_configs = load_all_configurations(config_path)

# Monitoring is enabled through the ENABLE_MONITORING environment variable,
# and only when "power_consumption" is one of the active metrics.
os.environ["ENABLE_MONITORING"] = "true" if "power_consumption" in all_configs["active_metrics"] else "false"

# Print the loaded configurations
print("Configurations Loaded Successfully:")
print("Active Metrics:", all_configs["active_metrics"])
print("Device Config:", all_configs["device_config"])
print("Model Config:", all_configs["model_config"])
print("Evaluation Settings:", all_configs["evaluation_settings"])

# Look up the intermediate result file to avoid redundant computation
# (performance leaderboard). The file name encodes the run as
# "<device type>-<device model>_<model name>-<quantization>_calib-<calibration>";
# a missing "calibration" entry is written as "none".
name_config = (
    f'{all_configs["device_config"]["type"]}-'
    f'{all_configs["device_config"]["model"]}_'
    f'{all_configs["model_config"]["name"]}-'
    f'{all_configs["model_config"]["quantization"]}_'
    f'calib-{all_configs["model_config"].get("calibration", "none")}'
)
result_folder = os.path.join(all_configs["evaluation_settings"]["output_dir"], "Korean2English")
result_file = os.path.join(result_folder, f"{name_config}.json")

# Ensure result folder exists
os.makedirs(result_folder, exist_ok=True)

print("----------------------- \n")
# proceed_calc is a "true"/"false" string flag telling whether the
# computation still has to run. It must agree with the message printed next
# to it: an existing result file means the computation can be skipped.
if os.path.exists(result_file):
    print(f"Result file already exists at: {result_file}. Skipping computation.")
    proceed_calc = "false"
else:
    print("Result file does not exist. Proceeding with computation...")
    proceed_calc = "true"


Configurations Loaded Successfully:
Active Metrics: ['BLEU', 'METEOR', 'BERTScore', 'tps']
Device Config: {'type': 'NPU', 'model': 'RNGD', 'count': 1}
Model Config: {'name': 'llama3.1-8B-Instruct', 'quantization': 'W8A8', 'calibration': 'base'}
Evaluation Settings: {'task': 'translation', 'output_dir': './results/translation'}
----------------------- 

Result file already exists at: ./results/translation/Korean2English/NPU-RNGD_llama3.1-8B-Instruct-W8A8_calib-base.json. Skipping computation.


In [3]:
# Load task-specific modules based on the configuration.
# .get() returns None when "task" is missing from the evaluation settings,
# which then falls through to the ValueError at the bottom.
task = all_configs["evaluation_settings"].get("task")

# Each branch imports that task's helper functions from its src.<task>
# package together with the matching evaluate_* function from src.metrics.
# Only the names of the selected task are bound after this cell.
if task == "translation":
    from src.translation.one_translate import (
        initialize_translation_environment,
        translate_text,
        batch_translate_text,
    )
    from src.metrics.evaluate_translation import evaluate_translation
elif task == "summarization":
    from src.summarization.one_summary import (
        initialize_summarization_environment,
        summarize_text,
        batch_summarize_text,
    )
    from src.metrics.evaluate_summarization import evaluate_summarization
elif task == "multimodal":
    from src.multimodal.one_multimodal import (
        initialize_multimodal_environment,
        process_multimodal_data,
    )
    from src.metrics.evaluate_multimodal import evaluate_multimodal
elif task == "chatbot":
    from src.chatbot.one_chatbot import (
        initialize_chatbot_environment,
        generate_chat_response,
    )
    from src.metrics.evaluate_chatbot import evaluate_chatbot
else:
    # Any other value (including None) is rejected explicitly.
    raise ValueError(f"Undetermined task: {task}. Please check your configuration.")

# 벤치마킹 데이터셋 (FLORES-200)

- 한국어 -> 영어, 영어 -> 한국어 (양방향 번역)

In [3]:
def load_text_file(file_path):
    """Read a UTF-8 text file and return its lines with whitespace stripped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list holding one string per line of the file, in file order, each
        with leading and trailing whitespace (including the newline) removed.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return [raw_line.strip() for raw_line in handle]

# Location of the FLORES-200 data.
data_dir = "./data/translation/flores"

# Read the English and Korean devtest files, one list entry per line.
# NOTE(review): later cells index both lists with the same position, so the
# two files are presumably line-aligned — confirm against the dataset.
data_eng = load_text_file("/".join((data_dir, "devtest.eng_Latn")))
data_kor = load_text_file("/".join((data_dir, "devtest.kor_Hang")))

In [5]:
# Initialize the translation environment for the configured model.
try:
    initialize_translation_environment(llm_model=all_configs["model_config"]["name"])
except EnvironmentError as e:
    print(f"Failed to initialize translation environment: {e}")
    # exit() is an interactive-shell helper and should not be used in
    # programs; raising SystemExit stops the run with the same exit status.
    raise SystemExit(1) from e

# Number of sentences (taken from the start of each data list) to translate.
num_benchmark = 100

# Both directions run through the same steps:
# (source language, target language, source sentences, tag used in the log line)
translation_runs = (
    ("Korean", "English", data_kor, "ko2en"),
    ("English", "Korean", data_eng, "en2ko"),
)

for source_language, target_language, source_sentences, run_tag in translation_runs:
    start_time = time()  # start time
    batch_results = batch_translate_text(
        source_sentences[:num_benchmark],
        source_language=source_language,
        target_language=target_language,
    )
    batch_results["elapsed_time"] = time() - start_time  # Add elapsed time to the results

    result_folder = os.path.join(
        all_configs["evaluation_settings"]["output_dir"],
        f"{source_language}2{target_language}",
    )
    # Create the folder for this direction before writing: only the
    # Korean2English folder is created elsewhere, so the English2Korean
    # write would otherwise fail on a fresh output directory.
    os.makedirs(result_folder, exist_ok=True)
    result_file = os.path.join(result_folder, f"{name_config}.json")
    print(f"{run_tag} translation done")

    # Save intermediate result
    with open(result_file, "w", encoding="utf-8") as file:
        json.dump(batch_results, file, indent=4, ensure_ascii=False)  # Save the batch results in JSON format
    print(f"Results saved to {result_file}.")


2025-01-09 06:53:32,680 - ERROR - Failed to detect NPUs: [Errno 2] No such file or directory: 'furiosa-smi'
2025-01-09 06:53:32,681 - INFO - Translation environment detected: GPU. Using GPU for translations.


2025-01-09 06:53:57,995 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:54:02,329 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:54:05,155 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:54:11,222 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:54:15,049 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:54:17,992 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:54:19,483 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:54:22,646 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:54:24,841 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:54:26,460 - INFO - HTTP Request:

ko2en translation done
Results saved to ./results/translation/Korean2English/GPU-A5000_qwen2.5:72b-Q4_K_M_calib-base.json.


2025-01-09 06:58:45,736 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:58:50,987 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:58:56,475 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:59:05,247 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:59:11,010 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:59:22,216 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:59:23,936 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:59:29,435 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:59:32,654 - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-01-09 06:59:35,337 - INFO - HTTP Request:

ReadTimeout: timed out

# Compute metrics 

In [7]:
# Display the current leaderboard DataFrame as the cell output.
# NOTE: `leaderboard` is assigned by the leaderboard-update cells further
# down, so one of them must have been executed before this cell.
leaderboard

Unnamed: 0,device-type,device-name,llm,quantization,calibration,BLEU,METEOR,BERTScore,tps
0,GPU,A5000,llama3.1,Q4_K_M,base,0.206346,0.560355,0.848896,52.091613
1,GPU,A5000,llama3.3:70b,Q4_K_M,base,0.262889,0.625706,0.879954,10.143561
2,GPU,A5000,llama3.1:70b,Q4_K_M,base,0.254576,0.606528,0.874383,10.861105
3,GPU,A5000,qwen2:72b,Q4_K_M,base,0.241575,0.608767,0.872418,10.256634
4,GPU,A100,llama3.1:latest,Q4_K_M,base,0.201274,0.555541,0.847286,81.69721
5,GPU,A100,llama3.1:70b,Q4_K_M,base,0.250461,0.604116,0.874396,18.985956
6,GPU,A100,qwen2.5:72b,Q4_K_M,base,0.271871,0.629202,0.879257,15.049937


In [8]:
from src.metrics.evaluate_translation import evaluate_translation

# Configurations
source_lang = "Korean"  # source language
target_lang = "English"  # target language
translation_config = f'{source_lang}2{target_lang}'

# Read results from the configured output directory, i.e. the same place the
# translation step writes its JSON files, instead of a hard-coded path.
result_dir = os.path.join(all_configs["evaluation_settings"]["output_dir"], translation_config)
leaderboard_file = os.path.join(result_dir, "leaderboard.csv")
os.makedirs(result_dir, exist_ok=True)

# One row of the metrics matrix per active metric.
active_metrics = all_configs["active_metrics"]

# Load the existing leaderboard file, or start a new one.
if os.path.exists(leaderboard_file):
    leaderboard = pd.read_csv(leaderboard_file)
else:
    leaderboard = pd.DataFrame(columns=[
        "device-type", "device-name", "llm", "quantization", "calibration",
        "BLEU", "METEOR", "BERTScore", "tps"
    ])

# Process each JSON result file; sorted() keeps the row order reproducible
# because os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(result_dir)):
    if not filename.endswith(".json"):
        continue

    # Parse metadata from the file name, which has the form
    #   <device type>-<device name>_<llm>-<quantization>_calib-<calibration>.json
    # The llm name may itself contain "-" (e.g. "llama3.1-8B-Instruct") and
    # the quantization may contain "_" (e.g. "Q4_K_M"), so each separator is
    # split exactly once, from the side on which it is unambiguous.
    # NOTE: rows stored earlier with a truncated llm name will not match the
    # full name and are evaluated again.
    name_head = filename[:-len(".json")]
    try:
        run_part, calibration = name_head.rsplit("_calib-", 1)
        device_part, model_part = run_part.split("_", 1)
        device_type, device_name = device_part.split("-", 1)
        llm_name, quantization = model_part.rsplit("-", 1)
    except ValueError:
        print(f"Skipping {filename}, unexpected file name format.")
        continue
    metadata = {
        "device-type": device_type,
        "device-name": device_name,
        "llm": llm_name,
        "quantization": quantization,
        "calibration": calibration,
    }

    # Skip if already in leaderboard
    if ((leaderboard["device-type"] == metadata["device-type"]) &
        (leaderboard["device-name"] == metadata["device-name"]) &
        (leaderboard["llm"] == metadata["llm"]) &
        (leaderboard["quantization"] == metadata["quantization"]) &
        (leaderboard["calibration"] == metadata["calibration"])).any():
        print(f"Skipping {filename}, already in leaderboard.")
        continue

    # Load translation results
    with open(os.path.join(result_dir, filename), "r", encoding="utf-8") as file:
        json_data = json.load(file)
    translations = json_data.get("translations", [])

    # Nothing to average: do not add an all-NaN row to the leaderboard.
    if not translations:
        print(f"Skipping {filename}, no translations found.")
        continue

    # Evaluate translations. The matrix is sized from the active metrics so
    # that any number of configured metrics fits.
    metrics = np.full((len(active_metrics), len(translations)), np.nan)

    for i, result in enumerate(translations):
        translation = result.get("translation", "")
        elapsed_time = result.get("elapsed_time", 1e-6)  # Default time if not provided
        # The reference for a Korean -> English translation is the English
        # sentence at the same position; empty when there is none.
        ref_text = data_eng[i] if i < len(data_eng) else ""

        # Evaluate translation
        metric_result = evaluate_translation(
            translation, ref_text, target_lang, elapsed_time, active_metrics
        )
        for j, metric_name in enumerate(active_metrics):
            metrics[j, i] = metric_result.get(metric_name, np.nan)

    # Calculate averages, ignoring metrics missing for individual sentences.
    avg_metrics = {metric: np.nanmean(metrics[j, :]) for j, metric in enumerate(active_metrics)}

    # Add to leaderboard
    new_entry = pd.DataFrame([{
        **metadata,
        **avg_metrics
    }])  # Create a DataFrame for the new entry

    # Concatenate the new entry to the leaderboard
    leaderboard = pd.concat([leaderboard, new_entry], ignore_index=True)
    print(f"Processed and added {filename} to leaderboard.")

# Save the leaderboard as a CSV file.
leaderboard.to_csv(leaderboard_file, index=False)
print(f"Leaderboard updated and saved to {leaderboard_file}")



Processed and added GPU-A5000_qwen2.5:72b-Q4_K_M_calib-base.json to leaderboard.
Skipping GPU-A100_qwen2.5:72b-Q4_K_M_calib-base.json, already in leaderboard.
Processed and added NPU-RNGD_llama3.1-8B-Instruct-W8A8_calib-base.json to leaderboard.
Skipping GPU-A5000_qwen2:72b-Q4_K_M_calib-base.json, already in leaderboard.
Skipping GPU-A100_llama3.1:latest-Q4_K_M_calib-base.json, already in leaderboard.
Skipping GPU-A5000_llama3.1:70b-Q4_K_M_calib-base.json, already in leaderboard.
Skipping GPU-A100_llama3.1:70b-Q4_K_M_calib-base.json, already in leaderboard.
Skipping GPU-A5000_llama3.3:70b-Q4_K_M_calib-base.json, already in leaderboard.
Skipping GPU-A5000_llama3.1-Q4_K_M_calib-base.json, already in leaderboard.
Leaderboard updated and saved to ./results/translation/Korean2English/leaderboard.csv


In [9]:

# Configurations
source_lang = "English"  # source language
target_lang = "Korean"  # target language
translation_config = f'{source_lang}2{target_lang}'

# Read results from the configured output directory, i.e. the same place the
# translation step writes its JSON files, instead of a hard-coded path.
result_dir = os.path.join(all_configs["evaluation_settings"]["output_dir"], translation_config)
leaderboard_file = os.path.join(result_dir, "leaderboard.csv")
os.makedirs(result_dir, exist_ok=True)

# One row of the metrics matrix per active metric.
active_metrics = all_configs["active_metrics"]

# Load the existing leaderboard file, or start a new one.
if os.path.exists(leaderboard_file):
    leaderboard = pd.read_csv(leaderboard_file)
else:
    leaderboard = pd.DataFrame(columns=[
        "device-type", "device-name", "llm", "quantization", "calibration",
        "BLEU", "METEOR", "BERTScore", "tps"
    ])

# Process each JSON result file; sorted() keeps the row order reproducible
# because os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(result_dir)):
    if not filename.endswith(".json"):
        continue

    # Parse metadata from the file name, which has the form
    #   <device type>-<device name>_<llm>-<quantization>_calib-<calibration>.json
    # The llm name may itself contain "-" (e.g. "llama3.1-8B-Instruct") and
    # the quantization may contain "_" (e.g. "Q4_K_M"), so each separator is
    # split exactly once, from the side on which it is unambiguous.
    # NOTE: rows stored earlier with a truncated llm name will not match the
    # full name and are evaluated again.
    name_head = filename[:-len(".json")]
    try:
        run_part, calibration = name_head.rsplit("_calib-", 1)
        device_part, model_part = run_part.split("_", 1)
        device_type, device_name = device_part.split("-", 1)
        llm_name, quantization = model_part.rsplit("-", 1)
    except ValueError:
        print(f"Skipping {filename}, unexpected file name format.")
        continue
    metadata = {
        "device-type": device_type,
        "device-name": device_name,
        "llm": llm_name,
        "quantization": quantization,
        "calibration": calibration,
    }

    # Skip if already in leaderboard
    if ((leaderboard["device-type"] == metadata["device-type"]) &
        (leaderboard["device-name"] == metadata["device-name"]) &
        (leaderboard["llm"] == metadata["llm"]) &
        (leaderboard["quantization"] == metadata["quantization"]) &
        (leaderboard["calibration"] == metadata["calibration"])).any():
        print(f"Skipping {filename}, already in leaderboard.")
        continue

    # Load translation results
    with open(os.path.join(result_dir, filename), "r", encoding="utf-8") as file:
        json_data = json.load(file)
    translations = json_data.get("translations", [])

    # Nothing to average: do not add an all-NaN row to the leaderboard.
    if not translations:
        print(f"Skipping {filename}, no translations found.")
        continue

    # Evaluate translations. The matrix is sized from the active metrics so
    # that any number of configured metrics fits.
    metrics = np.full((len(active_metrics), len(translations)), np.nan)

    for i, result in enumerate(translations):
        translation = result.get("translation", "")
        elapsed_time = result.get("elapsed_time", 1e-6)  # Default time if not provided
        # The reference for an English -> Korean translation is the Korean
        # sentence at the same position; empty when there is none.
        ref_text = data_kor[i] if i < len(data_kor) else ""

        # Evaluate translation
        metric_result = evaluate_translation(
            translation, ref_text, target_lang, elapsed_time, active_metrics
        )
        for j, metric_name in enumerate(active_metrics):
            metrics[j, i] = metric_result.get(metric_name, np.nan)

    # Calculate averages, ignoring metrics missing for individual sentences.
    avg_metrics = {metric: np.nanmean(metrics[j, :]) for j, metric in enumerate(active_metrics)}

    # Add to leaderboard
    new_entry = pd.DataFrame([{
        **metadata,
        **avg_metrics
    }])  # Create a DataFrame for the new entry

    # Concatenate the new entry to the leaderboard
    leaderboard = pd.concat([leaderboard, new_entry], ignore_index=True)
    print(f"Processed and added {filename} to leaderboard.")

# Save the leaderboard as a CSV file.
leaderboard.to_csv(leaderboard_file, index=False)
print(f"Leaderboard updated and saved to {leaderboard_file}")

Skipping GPU-A100_qwen2.5:72b-Q4_K_M_calib-base.json, already in leaderboard.




Processed and added NPU-RNGD_llama3.1-8B-Instruct-W8A8_calib-base.json to leaderboard.
Skipping GPU-A5000_qwen2:72b-Q4_K_M_calib-base.json, already in leaderboard.
Skipping GPU-A100_llama3.1:latest-Q4_K_M_calib-base.json, already in leaderboard.
Skipping GPU-A5000_llama3.1:70b-Q4_K_M_calib-base.json, already in leaderboard.
Skipping GPU-A100_llama3.1:70b-Q4_K_M_calib-base.json, already in leaderboard.
Skipping GPU-A5000_llama3.3:70b-Q4_K_M_calib-base.json, already in leaderboard.
Skipping GPU-A5000_llama3.1-Q4_K_M_calib-base.json, already in leaderboard.
Leaderboard updated and saved to ./results/translation/English2Korean/leaderboard.csv
