In [None]:
import gc
import logging
import multiprocessing
import os
import re
import sys
import time
from concurrent.futures import ProcessPoolExecutor
from functools import partial

import datasets
import pandas as pd
import psutil
import pyarrow as pa
import pyarrow.parquet as pq
import torch
import vllm
from accelerate import Accelerator, DataLoaderConfiguration
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from vllm import LLM, SamplingParams

In [None]:
# Root directory that holds the locally stored model checkpoints.
MODEL_DIR = "../01.models/"

# Checkpoint sub-directories relative to MODEL_DIR. Later cells select one by
# index: 0 = Llama-3.1-8B-Instruct, 1 = Mistral-Nemo-Instruct-2407,
# 2 = Qwen3-8B, 3 = Qwen3-14B.
MODEL_NAME = [
    "Meta/meta-llama/Llama-3.1-8B-Instruct/",
    "Mistral/mistralai/Mistral-Nemo-Instruct-2407/",
    "Qwen/Qwen3-8B/",
    "Qwen/Qwen3-14B/",
]

In [None]:
# Load the five `TF_100` WikiBench files (original plus four shuffled
# variants) into pandas DataFrames.
# NOTE(review): the next cell rebinds BENCHMARK_DIR, BENCHMARK_NAME and
# BENCHMARKS to the `TF_500` files, so on a top-to-bottom run the frames loaded
# here are replaced before any model is evaluated.
BENCHMARK_DIR = "../00.data/02.WikiBench/"
BENCHMARK_NAME = [
    "00.original_benchmark_TF_100.parquet",
    "01.subject_shuffled_benchmark_TF_100.parquet",
    "02.object_shuffled_benchmark_TF_100.parquet",
    "03.property_scoped_subject_shuffled_benchmark_TF_100.parquet",
    "04.property_scoped_object_shuffled_benchmark_TF_100.parquet",
]
BENCHMARKS = [
    pq.read_table(BENCHMARK_DIR + file_name).to_pandas()
    for file_name in BENCHMARK_NAME
]

In [None]:
# Load the five `TF_500` WikiBench files (original plus four shuffled
# variants) into pandas DataFrames. This is the last assignment of BENCHMARKS
# in the notebook, so these are the frames the model cells below copy and run.
BENCHMARK_DIR = "../00.data/02.WikiBench/"
BENCHMARK_NAME = [
    "00.original_benchmark_TF_500.parquet",
    "01.subject_shuffled_benchmark_TF_500.parquet",
    "02.object_shuffled_benchmark_TF_500.parquet",
    "03.property_scoped_subject_shuffled_benchmark_TF_500.parquet",
    "04.property_scoped_object_shuffled_benchmark_TF_500.parquet",
]
BENCHMARKS = [
    pq.read_table(BENCHMARK_DIR + file_name).to_pandas()
    for file_name in BENCHMARK_NAME
]

In [None]:
# Root directory for benchmark outputs.
RESULT_DIR = "../02.results/00.Benchmark_Results/"

# Per-model output sub-directories relative to RESULT_DIR. The save cells use
# index 1 for Llama, 2 for Mistral, 3 for Qwen3-8B and 4 for Qwen3-14B;
# index 0 (Gemini) is not written by any cell visible in this notebook.
RESULT_NAME = [
    "00.Gemini/",
    "01.Llama/",
    "02.Mistral/",
    "03.Qwen/Qwen-8B/",
    "03.Qwen/Qwen-14B/",
]
               

In [None]:
# Language codes; each one selects a `wikibench_TF_<code>` prompt column and
# names the matching `response_TF_<code>` output column.
LANGUAGE_LIST = [
    "en", "fr", "de", "es",
    "it", "pt", "ko", "ja",
]

# WikiBench 수행

In [None]:
def load_model(model_path, tensor_parallel_size=4, gpu_memory_utilization=0.85,
               max_model_len=512, max_tokens=10):
    """Create a vLLM engine and the sampling settings used for the benchmark.

    The defaults reproduce the values that were previously hard-coded, so
    ``load_model(path)`` behaves exactly as before; the extra keyword arguments
    only make it possible to run on a machine with a different GPU layout.

    Args:
        model_path: Model location, forwarded to ``LLM(model=...)``.
        tensor_parallel_size: Forwarded to ``LLM(tensor_parallel_size=...)``.
        gpu_memory_utilization: Forwarded to
            ``LLM(gpu_memory_utilization=...)``.
        max_model_len: Forwarded to ``LLM(max_model_len=...)``.
        max_tokens: Forwarded to ``SamplingParams(max_tokens=...)``.

    Returns:
        tuple: ``(llm, sampling_params)`` -- the ``LLM`` instance and the
        ``SamplingParams`` to pass to ``llm.generate``.
    """
    llm = LLM(
        model=model_path,
        gpu_memory_utilization=gpu_memory_utilization,
        tensor_parallel_size=tensor_parallel_size,
        trust_remote_code=True,
        max_model_len=max_model_len,
        dtype=torch.float16,
        enable_prefix_caching=True,
    )
    sampling_params = SamplingParams(
        # temperature 0.0 with a single sample (n=1) per prompt.
        temperature=0.0,
        top_p=1,
        max_tokens=max_tokens,
        n=1,
        # Stop at a closing answer tag; the tag itself is excluded from the
        # returned text.
        stop=["</answer>", "</ANSWER>", "</Answer>"],
        include_stop_str_in_output=False,
    )
    return llm, sampling_params

In [None]:
def run_wikibench_TF(df, llm, params, languages=None):
    """Generate a response for every True/False prompt column of ``df``.

    For each language code the prompts are read from ``wikibench_TF_<lang>``,
    sent to ``llm.generate`` in one call, and the text of the first completion
    of each output is stored in ``response_TF_<lang>``.

    Args:
        df: DataFrame holding the ``wikibench_TF_<lang>`` prompt columns. It
            is modified in place (response columns are added or overwritten),
            so pass a copy if the original must stay untouched.
        llm: Object exposing ``generate(prompts, sampling_params=...)`` whose
            results provide ``.outputs[0].text``.
        params: Sampling parameters forwarded to ``llm.generate``.
        languages: Optional iterable of language codes. Defaults to the
            notebook-level ``LANGUAGE_LIST``.

    Returns:
        The same ``df`` object, with the response columns filled in.

    Raises:
        KeyError: If any required prompt column is missing. The check runs
            before any generation, so no work is wasted and ``df`` is not
            left partially updated.
    """
    if languages is None:
        languages = LANGUAGE_LIST
    languages = list(languages)

    # Fail fast: a missing column would otherwise only surface after the
    # preceding languages had already been generated and written.
    missing = [
        f"wikibench_TF_{lang}"
        for lang in languages
        if f"wikibench_TF_{lang}" not in df.columns
    ]
    if missing:
        raise KeyError(f"Missing prompt column(s): {missing}")

    for lang in languages:
        prompts = df[f"wikibench_TF_{lang}"].tolist()
        with torch.no_grad():
            outputs = llm.generate(prompts, sampling_params=params)
        df[f"response_TF_{lang}"] = [out.outputs[0].text for out in outputs]
    return df
            

# Llama-3.1-8B-Instruct

In [None]:
# Build the engine and sampling settings for MODEL_NAME[0]
# (Llama-3.1-8B-Instruct).
llm, params = load_model(MODEL_DIR + MODEL_NAME[0])

In [None]:
# Run every benchmark frame through the currently loaded model (Llama).
# Each frame is copied first because run_wikibench_TF adds its response
# columns in place; the shared BENCHMARKS frames stay untouched for the other
# models. Iterating over BENCHMARKS avoids hard-coding the number of files.
BENCHMARK_LLAMA = [
    run_wikibench_TF(benchmark.copy(), llm, params)
    for benchmark in BENCHMARKS
]

In [None]:
# Drop the references to the engine, then ask the garbage collector and the
# CUDA caching allocator to release what they can before the next model loads.
# NOTE(review): vLLM may still hold GPU memory (e.g. in tensor-parallel
# workers); confirm with nvidia-smi -- a kernel restart may still be required.
del llm, params
gc.collect()
torch.cuda.empty_cache()

# Mistral-Nemo-Instruct-2407

In [None]:
# Build the engine and sampling settings for MODEL_NAME[1]
# (Mistral-Nemo-Instruct-2407).
llm, params = load_model(MODEL_DIR + MODEL_NAME[1])

In [None]:
# Run every benchmark frame through the currently loaded model (Mistral).
# Each frame is copied first because run_wikibench_TF adds its response
# columns in place; the shared BENCHMARKS frames stay untouched for the other
# models. Iterating over BENCHMARKS avoids hard-coding the number of files.
BENCHMARK_MISTRAL = [
    run_wikibench_TF(benchmark.copy(), llm, params)
    for benchmark in BENCHMARKS
]

In [None]:
# Drop the references to the engine, then ask the garbage collector and the
# CUDA caching allocator to release what they can before the next model loads.
# NOTE(review): vLLM may still hold GPU memory (e.g. in tensor-parallel
# workers); confirm with nvidia-smi -- a kernel restart may still be required.
del llm, params
gc.collect()
torch.cuda.empty_cache()

# Qwen3-8B

In [None]:
# Build the engine and sampling settings for MODEL_NAME[2] (Qwen3-8B).
llm, params = load_model(MODEL_DIR + MODEL_NAME[2])

In [None]:
# Run every benchmark frame through the currently loaded model (Qwen3-8B).
# Each frame is copied first because run_wikibench_TF adds its response
# columns in place; the shared BENCHMARKS frames stay untouched for the other
# models. Iterating over BENCHMARKS avoids hard-coding the number of files.
BENCHMARK_QWEN3_8B = [
    run_wikibench_TF(benchmark.copy(), llm, params)
    for benchmark in BENCHMARKS
]

In [None]:
# Drop the references to the engine, then ask the garbage collector and the
# CUDA caching allocator to release what they can before the next model loads.
# NOTE(review): vLLM may still hold GPU memory (e.g. in tensor-parallel
# workers); confirm with nvidia-smi -- a kernel restart may still be required.
del llm, params
gc.collect()
torch.cuda.empty_cache()

# Qwen3-14B

In [None]:
# Build the engine and sampling settings for MODEL_NAME[3] (Qwen3-14B).
llm, params = load_model(MODEL_DIR + MODEL_NAME[3])

In [None]:
# Run every benchmark frame through the currently loaded model (Qwen3-14B).
# Each frame is copied first because run_wikibench_TF adds its response
# columns in place; the shared BENCHMARKS frames stay untouched for the other
# models. Iterating over BENCHMARKS avoids hard-coding the number of files.
BENCHMARK_QWEN3_14B = [
    run_wikibench_TF(benchmark.copy(), llm, params)
    for benchmark in BENCHMARKS
]

In [None]:
# Display the path built from the results root, the Qwen-14B result folder
# (RESULT_NAME[4]) and the first entry of BENCHMARK_NAME.
f"{RESULT_DIR}{RESULT_NAME[4]}{BENCHMARK_NAME[0]}"

In [None]:
# Display the results root directory.
RESULT_DIR

In [None]:
# Output file names (with a `0120_` tag) used by the save cells below.
# NOTE(review): this rebinds BENCHMARK_NAME, which the load cells above use for
# the *input* files. Re-running a load cell after this one would look for these
# tagged names inside BENCHMARK_DIR; a separate name for output files would
# avoid that hidden-state hazard.
BENCHMARK_NAME = [
    "00.0120_original_benchmark_TF_500.parquet",
    "01.0120_subject_shuffled_benchmark_TF_500.parquet",
    "02.0120_object_shuffled_benchmark_TF_500.parquet",
    "03.0120_property_scoped_subject_shuffled_benchmark_TF_500.parquet",
    "04.0120_property_scoped_object_shuffled_benchmark_TF_500.parquet",
]


In [None]:
# Write each Qwen3-14B result frame to its parquet file under the Qwen-14B
# result folder (RESULT_NAME[4]), pairing frames with BENCHMARK_NAME by
# position. The folder is created first so a fresh checkout does not fail on a
# missing directory.
qwen3_14b_dir = f"{RESULT_DIR}{RESULT_NAME[4]}"
os.makedirs(qwen3_14b_dir, exist_ok=True)
for result_df, file_name in zip(BENCHMARK_QWEN3_14B, BENCHMARK_NAME):
    result_df.to_parquet(f"{qwen3_14b_dir}{file_name}")

In [None]:
# Drop the references to the last engine, then ask the garbage collector and
# the CUDA caching allocator to release what they can.
# NOTE(review): vLLM may still hold GPU memory (e.g. in tensor-parallel
# workers); confirm with nvidia-smi -- a kernel restart may still be required.
del llm, params
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Per-model result lists, in the order Llama, Mistral, Qwen3-8B, Qwen3-14B.
BENCHMARKS_RESULTS = [
    BENCHMARK_LLAMA,
    BENCHMARK_MISTRAL,
    BENCHMARK_QWEN3_8B,
    BENCHMARK_QWEN3_14B,
]

# Collect every distinct response string across all models, languages and
# benchmark frames; frames lacking a given response column are skipped.
all_values = {
    value
    for model_results in BENCHMARKS_RESULTS
    for lang in LANGUAGE_LIST
    for df in model_results
    if f"response_TF_{lang}" in df.columns
    for value in df[f"response_TF_{lang}"].unique()
}

print(all_values)
print("총 개수:", len(all_values))


In [None]:
# ? After changing temperature to 0.0, suddenly..? (TODO: investigate why the responses changed)

In [None]:
# Pair each model's result list with its output folder under RESULT_DIR.
results_by_folder = [
    (RESULT_NAME[1], BENCHMARK_LLAMA),
    (RESULT_NAME[2], BENCHMARK_MISTRAL),
    (RESULT_NAME[3], BENCHMARK_QWEN3_8B),
    (RESULT_NAME[4], BENCHMARK_QWEN3_14B),
]

# Write every result frame to parquet, matching frames to BENCHMARK_NAME by
# position. Each folder is created first so a fresh checkout does not fail on
# a missing directory.
for folder, model_results in results_by_folder:
    output_dir = f"{RESULT_DIR}{folder}"
    os.makedirs(output_dir, exist_ok=True)
    for result_df, file_name in zip(model_results, BENCHMARK_NAME):
        result_df.to_parquet(f"{output_dir}{file_name}")

In [None]:
# Display the path built from the results root, the Llama result folder
# (RESULT_NAME[1]) and the first entry of BENCHMARK_NAME.
f"{RESULT_DIR}{RESULT_NAME[1]}{BENCHMARK_NAME[0]}"

In [None]:
# Display the Llama result frame for the first benchmark file (index 0).
BENCHMARK_LLAMA[0]