In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

In [None]:
# Root folder of the benchmark results. Every RESULT_NAME entry is a per-model
# sub-folder and keeps its trailing slash so paths can be built by plain
# string concatenation.
RESULT_DIR = "../02.results/00.Benchmark_Results/"
RESULT_NAME = [
    "00.Gemini/",
    "01.Llama/",
    "02.Mistral/",
    "03.Qwen/Qwen-8B/",
    "03.Qwen/Qwen-14B/",
]

In [None]:
# Folder of the WikiBench data files.
BENCHMARK_DIR = "../00.data/02.WikiBench/"

# Parquet file names of the five benchmark variants: the original benchmark
# followed by the four shuffled versions.
BENCHMARK_NAME = [
    "00.original_benchmark_TF_500.parquet",
    "01.subject_shuffled_benchmark_TF_500.parquet",
    "02.object_shuffled_benchmark_TF_500.parquet",
    "03.property_scoped_subject_shuffled_benchmark_TF_500.parquet",
    "04.property_scoped_object_shuffled_benchmark_TF_500.parquet",
]

In [None]:
def _load_model_results(model_subdir):
    """Read every benchmark parquet of one model sub-folder into a list of DataFrames.

    The returned list follows the order of BENCHMARK_NAME.
    """
    return [
        pq.read_table(f"{RESULT_DIR}{model_subdir}{name}").to_pandas()
        for name in BENCHMARK_NAME
    ]


# One list of per-benchmark DataFrames for each model (indices follow RESULT_NAME).
BENCHMARK_GEMINI = _load_model_results(RESULT_NAME[0])
BENCHMARK_LLAMA = _load_model_results(RESULT_NAME[1])
BENCHMARK_MISTRAL = _load_model_results(RESULT_NAME[2])
BENCHMARK_QWEN3_8B = _load_model_results(RESULT_NAME[3])
BENCHMARK_QWEN3_14B = _load_model_results(RESULT_NAME[4])

# NOTE: the model order here (Mistral, Llama, Gemini, Qwen3-8B, Qwen3-14B)
# differs from RESULT_NAME; any list of model names paired positionally with
# BENCHMARK_RESULTS has to follow this order.
BENCHMARK_RESULTS = [
    BENCHMARK_MISTRAL,
    BENCHMARK_LLAMA,
    BENCHMARK_GEMINI,
    BENCHMARK_QWEN3_8B,
    BENCHMARK_QWEN3_14B,
]

In [None]:
# Language codes used as suffixes of the per-language columns
# (e.g. "response_TF_<lang>").
LANGUAGE_LIST = [
    "en", "de", "fr", "es",
    "it", "pt", "ko", "ja",
]

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Gather every distinct raw response value over all models, benchmarks and
# languages; frames that lack a given language column are skipped.
all_values = {
    value
    for model_results in BENCHMARK_RESULTS
    for lang in LANGUAGE_LIST
    for df in model_results
    if f"response_TF_{lang}" in df.columns
    for value in df[f"response_TF_{lang}"].unique()
}

# Report how many distinct response values were found.
print("총 개수:", len(all_values))

In [None]:
def is_correct(df, language_list=None):
    """Grade the raw True/False responses of one benchmark frame, per language.

    For every language code ``lang`` the column ``response_TF_<lang>`` is read
    and an integer column ``correct_<lang>`` is written with one of:

    * ``0`` -- the response equals the expected label (a correct answer),
    * ``1`` -- the response is the opposite label (a wrong answer),
    * ``2`` -- the response is none of the recognised strings (unsure).

    The expected label is "True" when the ``kind`` value of the first row is
    ``'original'`` and "False" for any other kind (presumably because the
    shuffled benchmarks contain only false statements -- confirm against the
    benchmark construction). Recognised strings are exactly ``"True"``,
    ``"False"``, ``"<answer>True</answer>"`` and ``"<answer>False</answer>"``;
    matching is case- and whitespace-sensitive.

    Args:
        df: Result frame with a ``kind`` column and one ``response_TF_<lang>``
            column per language. It is modified in place.
        language_list: Language codes to grade. Defaults to the module-level
            ``LANGUAGE_LIST`` when omitted.

    Returns:
        The same ``df`` object, with the ``correct_<lang>`` columns added or
        overwritten.

    Raises:
        KeyError: If a ``response_TF_<lang>`` column is missing.
    """
    if language_list is None:
        language_list = LANGUAGE_LIST

    # An empty frame has no first row to read ``kind`` from; it also has no
    # responses to grade, so the choice of branch is irrelevant there.
    is_original = len(df) > 0 and df.iloc[0]['kind'] == 'original'

    # Code written for a "True" / "False" response. 0 always means "correct".
    code_if_true, code_if_false = (0, 1) if is_original else (1, 0)
    unsure = 2

    response_codes = {
        "True": code_if_true,
        "<answer>True</answer>": code_if_true,
        "False": code_if_false,
        "<answer>False</answer>": code_if_false,
    }

    for lang in language_list:
        responses = df[f"response_TF_{lang}"].tolist()
        # Non-string values (None, NaN, pd.NA, ...) can never be a recognised
        # answer, so they are graded as unsure without being looked up.
        df[f"correct_{lang}"] = [
            response_codes.get(response, unsure) if isinstance(response, str) else unsure
            for response in responses
        ]
    return df

In [None]:
import pandas as pd
from collections import defaultdict

def count_unique_response_TF(
    BENCHMARK_RESULTS,
    LANGUAGE_LIST,
    bench_labels=None
):
    """Count how often each raw response value occurs per model, benchmark and language.

    Args:
        BENCHMARK_RESULTS: Nested list, one inner list of DataFrames per model::

            [
              [df_bench0, df_bench1, ...],  # model 0
              [df_bench0, df_bench1, ...],  # model 1
              ...
            ]

        LANGUAGE_LIST: Language codes; the column ``response_TF_<lang>`` is
            inspected for each one. Frames lacking that column are skipped for
            that language.
        bench_labels: Optional labels, indexed by benchmark position. When
            omitted the integer position is used as the benchmark name.

    Returns:
        A long-format DataFrame with the columns
        ``[model_idx, bench, lang, response_TF_value, count]``. Missing values
        are counted as their own response value. The columns are present even
        when no row was produced.

    Raises:
        IndexError: If ``bench_labels`` has fewer entries than a model has
            benchmarks.
    """
    columns = ["model_idx", "bench", "lang", "response_TF_value", "count"]
    rows = []

    for model_idx, model_benchmarks in enumerate(BENCHMARK_RESULTS):
        for bench_idx, df in enumerate(model_benchmarks):
            bench_name = bench_idx if bench_labels is None else bench_labels[bench_idx]

            for lang in LANGUAGE_LIST:
                col = f"response_TF_{lang}"
                if col not in df.columns:
                    continue

                # dropna=False keeps missing responses visible as their own bucket.
                counts = df[col].value_counts(dropna=False)
                rows.extend(
                    {
                        "model_idx": model_idx,
                        "bench": bench_name,
                        "lang": lang,
                        "response_TF_value": val,
                        "count": cnt,
                    }
                    for val, cnt in counts.items()
                )

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(rows, columns=columns)
# Short labels for the benchmark variants, in the order the frames are stored
# inside each model's list.
bench_labels = ["ORG", "SS", "SO", "PSS", "PSO"]

# Frequency of each raw response value per model / benchmark / language.
tf_value_counts = count_unique_response_TF(
    BENCHMARK_RESULTS, LANGUAGE_LIST, bench_labels=bench_labels
)

tf_value_counts


In [None]:
# Truncate displayed DataFrames to at most 10 rows.
pd.options.display.max_rows = 10

In [None]:
# Grade every benchmark frame of every model; each graded frame is stored
# back into its original slot of the nested list.
for model_benchmarks in BENCHMARK_RESULTS:
    for bench_idx, bench_df in enumerate(model_benchmarks):
        model_benchmarks[bench_idx] = is_correct(bench_df)

In [None]:
import numpy as np
import pandas as pd

def build_acc_table_lang_as_cols_for_model(model_benchmarks, language_list, bench_labels):
    """Build one model's accuracy table with benchmarks as rows and languages as columns.

    Accuracy is the percentage of rows whose ``correct_<lang>`` value equals 0;
    every other value lowers the accuracy.

    Args:
        model_benchmarks: DataFrames of one model, one per benchmark, each with
            a ``correct_<lang>`` column for every requested language.
        language_list: Language codes; they become upper-cased column names.
        bench_labels: Row labels, one per frame in ``model_benchmarks``.

    Returns:
        DataFrame indexed by ``bench_labels`` plus a final ``"TOTAL"`` row that
        holds the unweighted mean of the benchmark rows for each language.
    """
    # Accuracy (%) per language for each benchmark.
    per_bench_accuracy = [
        {
            lang.upper(): (bench_df[f"correct_{lang}"] == 0).mean() * 100.0
            for lang in language_list
        }
        for bench_df in model_benchmarks
    ]
    table = pd.DataFrame(per_bench_accuracy, index=bench_labels)

    # Append the TOTAL row (mean over the benchmarks).
    table.loc["TOTAL"] = table.mean(axis=0)
    return table


def build_acc_table_lang_as_cols_all_models(BENCHMARK_RESULTS, MODEL_NAMES, language_list, bench_labels):
    """Stack the per-model accuracy tables into one (MODEL, BENCH)-indexed table.

    Args:
        BENCHMARK_RESULTS: One list of benchmark DataFrames per model.
        MODEL_NAMES: Display names, paired positionally with
            ``BENCHMARK_RESULTS`` (same length, same order).
        language_list: Language codes forwarded to the per-model builder.
        bench_labels: Benchmark row labels forwarded to the per-model builder.

    Returns:
        DataFrame with a ``("MODEL", "BENCH")`` MultiIndex and one accuracy
        column per language, as produced by
        ``build_acc_table_lang_as_cols_for_model`` for each model.

    Raises:
        ValueError: If ``MODEL_NAMES`` and ``BENCHMARK_RESULTS`` differ in
            length, or if there is no model to concatenate.
    """
    model_names = list(MODEL_NAMES)
    all_results = list(BENCHMARK_RESULTS)

    # zip() would silently drop the surplus entries and yield an incomplete or
    # mislabelled table, so a length mismatch is rejected up front.
    if len(model_names) != len(all_results):
        raise ValueError(
            f"MODEL_NAMES has {len(model_names)} entries but "
            f"BENCHMARK_RESULTS has {len(all_results)}"
        )

    frames = []
    for model_name, model_benchmarks in zip(model_names, all_results):
        df_model = build_acc_table_lang_as_cols_for_model(
            model_benchmarks, language_list, bench_labels
        )
        # Move the benchmark labels out of the index so that both keys can be
        # turned into a MultiIndex after concatenation.
        df_model["MODEL"] = model_name
        df_model["BENCH"] = df_model.index
        frames.append(df_model.reset_index(drop=True))

    return pd.concat(frames).set_index(["MODEL", "BENCH"])


# Usage example
# Benchmark labels in the order the frames are stored per model. The last
# label is "PSO", matching the labels used for the response value counts
# (it was previously misspelled "POS").
bench_labels = ["ORG", "SS", "SO", "PSS", "PSO"]
# Model names must follow the order of BENCHMARK_RESULTS.
MODEL_NAMES = ["Mistral", "LLaMA", "Gemini", "Qwen3-8B", "Qwen3-14B"]

acc_summary_cols = build_acc_table_lang_as_cols_all_models(
    BENCHMARK_RESULTS,
    MODEL_NAMES,
    LANGUAGE_LIST,
    bench_labels
)

acc_summary_cols


In [None]:
# Persist the accuracy summary (MODEL/BENCH index included) as CSV.
acc_summary_cols.to_csv(path_or_buf='../00.data/02.WikiBench/99.Accuracy.csv')