In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.ticker import MultipleLocator, FormatStrFormatter

# Root folder of the stored model outputs, followed by one sub-folder per model.
RESULT_DIR = '../02.results/00.Benchmark_Results/'
RESULT_NAME = [
    '00.Gemini/',
    '01.Llama/',
    '02.Mistral/',
    '03.Qwen/Qwen-8B/',
    '03.Qwen/Qwen-14B/',
]

# Benchmark folder and the parquet file name of every benchmark variant.
BENCHMARK_DIR = '../00.data/02.WikiBench/'
BENCHMARK_NAME = [
    '00.original_benchmark_TF_500.parquet',
    '01.subject_shuffled_benchmark_TF_500.parquet',
    '02.object_shuffled_benchmark_TF_500.parquet',
    '03.property_scoped_subject_shuffled_benchmark_TF_500.parquet',
    '04.property_scoped_object_shuffled_benchmark_TF_500.parquet',
]

# One list of DataFrames per model (one DataFrame per benchmark file).  The
# unpacking targets follow the order of RESULT_NAME, so the files are read in
# the same sequence as before: Gemini, Llama, Mistral, Qwen-8B, Qwen-14B.
(
    BENCHMARK_GEMINI,
    BENCHMARK_LLAMA,
    BENCHMARK_MISTRAL,
    BENCHMARK_QWEN3_8B,
    BENCHMARK_QWEN3_14B,
) = (
    [
        pq.read_table(f"{RESULT_DIR}{model_dir}{file_name}").to_pandas()
        for file_name in BENCHMARK_NAME
    ]
    for model_dir in RESULT_NAME
)

# NOTE: the order here (Mistral first) differs from RESULT_NAME on purpose or
# by accident -- it is kept exactly as it was.
BENCHMARK_RESULTS = [
    BENCHMARK_MISTRAL,
    BENCHMARK_LLAMA,
    BENCHMARK_GEMINI,
    BENCHMARK_QWEN3_8B,
    BENCHMARK_QWEN3_14B,
]
LANGUAGE_LIST = ['en', 'de', 'fr', 'es', 'it', 'pt', 'ko', 'ja']
def is_correct(df, language_list=None):
    """Add a ``correct_<lang>`` code column for every language, in place.

    Each ``response_TF_<lang>`` value is translated into an integer code:

    * ``kind`` of the first row is ``'original'``:
      "True" -> 0, "False" -> 1
    * any other ``kind``: "True" -> 1, "False" -> 0
    * anything that is not exactly ``"True"``, ``"False"``,
      ``"<answer>True</answer>"`` or ``"<answer>False</answer>"`` -> 2

    NOTE(review): 0 appears to mean "answered correctly" and 1 "answered
    incorrectly" (the plotting code counts 0 as a hit on the original
    benchmark) -- confirm against the benchmark definition.

    Args:
        df: DataFrame with a ``kind`` column and one ``response_TF_<lang>``
            column per language.  It is modified in place.
        language_list: Language codes to process.  Defaults to the
            module-level ``LANGUAGE_LIST`` when omitted.

    Returns:
        The same DataFrame object, with the ``correct_<lang>`` columns added.
        An empty DataFrame gets empty columns instead of raising IndexError.
    """
    languages = LANGUAGE_LIST if language_list is None else language_list

    # Only the first row is inspected, as before; an empty frame has no first
    # row, and since nothing will be mapped the choice of branch is irrelevant.
    is_original = (not df.empty) and df.iloc[0]['kind'] == 'original'
    code_if_true, code_if_false = (0, 1) if is_original else (1, 0)
    unsure = 2

    code_by_response = {
        "True": code_if_true,
        "<answer>True</answer>": code_if_true,
        "False": code_if_false,
        "<answer>False</answer>": code_if_false,
    }

    for lang in languages:
        responses = df[f"response_TF_{lang}"].tolist()
        # The isinstance guard keeps non-string (possibly unhashable) cells
        # out of the dict lookup; they are scored as "unsure".
        df[f"correct_{lang}"] = [
            code_by_response.get(response, unsure) if isinstance(response, str) else unsure
            for response in responses
        ]
    return df
# Score every benchmark DataFrame of every model and store the result back
# into the same slot of the per-model list.
for model_results in BENCHMARK_RESULTS:
    for bench_idx, bench_df in enumerate(model_results):
        model_results[bench_idx] = is_correct(bench_df)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# ===== Label colour of each language group =====
LANG_GROUP_TEXT_COLORS = dict(
    g1="#4A6FB3",  # EN/DE
    g2="#3A7F5C",  # FR/ES/IT/PT
    g3="#B24A4A",  # KO/JA
)

# Language code -> group key, expanded from the group membership lists.
LANG_TO_GROUP = {
    lang: group
    for group, members in (
        ("g1", ("en", "de")),
        ("g2", ("fr", "es", "it", "pt")),
        ("g3", ("ko", "ja")),
    )
    for lang in members
}

# Short model key -> name printed as the column title.
MODEL_DISPLAY = {
    "Mistral":   "Mistral-Nemo-Instruct-2407",
    "LLaMA":     "Llama 3.1 Instruct-8B",
    "Gemini":    "Gemini 2.5 Flash",
    "Qwen3-8B":  "Qwen3-8B",
    "Qwen3-14B": "Qwen3-14B",
}

def plot_spectrum_grid(
    BENCHMARK_RESULTS,
    LANGUAGE_LIST,
    BENCH_KEYS,
    MODEL_ORDER,
    CMAPS,
    save_path=None,
    figsize=(100, 50),
    dpi=300,
    # Layout options
    left=0.18,
    right=0.995,
    top=0.98,
    bottom=0.02,
    wspace=0.02,
    hspace=0.10,
    lang_gap=0.60,  # height of the spacer between language blocks (roughly 0.3-1.0)
):
    """Draw one colour strip per (language, benchmark, model) in a single figure.

    Columns are models; rows are grouped by language, with one strip per
    benchmark inside each group and an empty spacer row between groups.
    Every strip is a 1 x N image of the ``correct_<lang>`` codes of one
    benchmark DataFrame, ordered by a per-model ``row_id`` ranking that is
    computed from that model's first benchmark DataFrame.

    Args:
        BENCHMARK_RESULTS: Per model, a list of benchmark DataFrames holding
            ``row_id`` and ``correct_<lang>`` columns.
        LANGUAGE_LIST: Language codes; one block of rows per language.
        BENCH_KEYS: Row labels, one per benchmark, aligned by position with
            each model's DataFrame list.
        MODEL_ORDER: Model keys, aligned by position with BENCHMARK_RESULTS;
            used to look up the colormap and the column title.
        CMAPS: Mapping from model key to the colormap passed to ``imshow``.
        save_path: Path without extension; when given, ``.pdf`` and ``.svg``
            files are written.
        figsize, dpi: Forwarded to ``plt.figure``.
        left, right, top, bottom, wspace, hspace: Forwarded to
            ``fig.subplots_adjust``; ``left`` also positions the language
            labels.
        lang_gap: Height ratio of the spacer rows (strip rows have ratio 1.0).

    Returns:
        The matplotlib Figure (after ``plt.show()`` has been called).
    """
    n_models = len(BENCHMARK_RESULTS)
    n_bench = len(BENCH_KEYS)
    n_lang = len(LANGUAGE_LIST)

    # ----- per-model row_id ranking -----
    def rank_row_ids(original_df, languages):
        """Return row_ids sorted by the share of languages whose code is 0, best first."""
        share_of_zero = {}
        for row_id, rows in original_df.groupby("row_id"):
            first_codes = [rows[f"correct_{lang}"].iloc[0] for lang in languages]
            share_of_zero[row_id] = sum(code == 0 for code in first_codes) / len(first_codes)
        return sorted(share_of_zero, key=share_of_zero.get, reverse=True)

    def strip_for(bench_df, ordered_ids, lang):
        """Return the codes of one language as a (1, N) int array in ranking order."""
        first_per_row = bench_df.groupby("row_id")[f"correct_{lang}"].first()
        aligned = first_per_row.reindex(ordered_ids)
        codes = pd.to_numeric(aligned, errors="coerce").to_numpy(dtype=float)
        # row_ids missing from this benchmark (NaN after reindex) are drawn as code 2
        return np.nan_to_num(codes, nan=2.0).astype(int).reshape(1, -1)

    ordered_ids_per_model = [
        rank_row_ids(model_frames[0], LANGUAGE_LIST) for model_frames in BENCHMARK_RESULTS
    ]

    # ----- GridSpec rows: n_bench strips per language, one spacer between blocks -----
    height_ratios = []
    for block_idx in range(n_lang):
        if block_idx:
            height_ratios.append(lang_gap)  # spacer in front of every block but the first
        height_ratios.extend([1.0] * n_bench)

    fig = plt.figure(figsize=figsize, dpi=dpi)
    fig.subplots_adjust(
        left=left, right=right, top=top, bottom=bottom, wspace=wspace, hspace=hspace
    )
    gs = fig.add_gridspec(
        nrows=len(height_ratios), ncols=n_models, height_ratios=height_ratios
    )

    # Axes are kept so the language labels can be centred on their block later.
    axes = [[None] * n_models for _ in range(n_lang * n_bench)]

    # ----- strips -----
    for col, model_name in enumerate(MODEL_ORDER):
        ordered_ids = ordered_ids_per_model[col]
        model_frames = BENCHMARK_RESULTS[col]

        for li, lang in enumerate(LANGUAGE_LIST):
            for bi, bench_key in enumerate(BENCH_KEYS):
                strip_idx = li * n_bench + bi
                # every earlier language block occupies n_bench strips + 1 spacer row
                ax = fig.add_subplot(gs[li * (n_bench + 1) + bi, col])
                axes[strip_idx][col] = ax

                ax.imshow(
                    strip_for(model_frames[bi], ordered_ids, lang),
                    aspect="auto",
                    cmap=CMAPS[model_name],
                    interpolation="nearest",
                    vmin=0,
                    vmax=2,
                )
                ax.set_xticks([])
                ax.set_yticks([])
                for spine in ax.spines.values():
                    spine.set_visible(False)

                # Model title: only above the very first strip of each column.
                if strip_idx == 0:
                    ax.set_title(
                        '\n' + MODEL_DISPLAY.get(model_name, model_name),
                        fontsize=80,
                        pad=50,
                        fontweight="bold",
                    )
                # Benchmark label: on every row, but only next to the first column.
                if col == 0:
                    ax.text(
                        -0.03,
                        0.5,
                        bench_key,
                        transform=ax.transAxes,
                        ha="right",
                        va="center",
                        fontsize=50,
                    )

    # ----- language labels, placed in the left margin and coloured by group -----
    # 0.5 would be the exact middle of the left margin; 0.6 shifts the label
    # slightly towards the strips.
    label_x = left * 0.6

    for li, lang in enumerate(LANGUAGE_LIST):
        top_strip = li * n_bench
        bottom_strip = top_strip + (n_bench - 1)

        # vertical centre of the block, measured on the first model column
        upper_edge = axes[top_strip][0].get_position().y1
        lower_edge = axes[bottom_strip][0].get_position().y0

        fig.text(
            label_x,
            (upper_edge + lower_edge) / 2,
            lang.upper(),
            ha="center",
            va="center",
            fontsize=100,
            fontweight="bold",
            color=LANG_GROUP_TEXT_COLORS[LANG_TO_GROUP.get(lang, "g2")],
        )

    # Spacer rows never get an Axes, so nothing has to be hidden for them.

    if save_path is not None:
        for extension in (".pdf", ".svg"):
            fig.savefig(save_path + extension, bbox_inches="tight")

    plt.show()
    return fig

# ===== run =====
FIGURE_DIR = "../02.results/01.Figures/"

# The returned Figure is bound to a name on purpose: plot_spectrum_grid()
# already calls plt.show(), and leaving the call as the last bare expression
# of a notebook cell would make the notebook render the (very large) figure a
# second time as the cell's output value.
# BENCH_KEYS, MODEL_ORDER and CMAPS are expected to be defined elsewhere in
# the notebook.
spectrum_fig = plot_spectrum_grid(
    BENCHMARK_RESULTS=BENCHMARK_RESULTS,
    LANGUAGE_LIST=LANGUAGE_LIST,
    BENCH_KEYS=BENCH_KEYS,
    MODEL_ORDER=MODEL_ORDER,
    CMAPS=CMAPS,
    save_path=f"{FIGURE_DIR}99.Spectrum_TF_500_modelwise_sorted_v2",
    figsize=(100, 50),
    dpi=300,
    # Optional layout tweaks -- adjust as needed.
    left=0.1,
    lang_gap=0.45,
    # Lower than the function default (0.98) to leave headroom above the grid
    # (presumably for the model titles -- confirm).
    top=0.78,
)