In [7]:
#!/usr/bin/env python
# rq1_stats_table_rr_br.py  –  RR / BR  の Wilcoxon＋Holm＋Cliff’s Δ

import os, json, math, itertools, warnings
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests

# ────── 固定パラメータ ──────────────────────────────────────────
DATASETS   = ["c100", "tiny-imagenet"]
K          = 0
TGT_RANKS  = [1, 2, 3]
MISCLF_TPS = ["src_tgt", "tgt_fp", "tgt_fn"]
REPS       = range(5)
WNUM       = 472

ALPHA      = 10/11
ALPHA_STR  = f"alpha{ALPHA}_boundsArachne"
ROOT_TMPL  = "/src/src/out_vit_{ds}_fold{K}"

METHODS = {"reptran": "ours", "arachne": "bl", "random_r": "random_r", "random_a": "random_a"}
PAIRS = list(itertools.combinations(METHODS.keys(), 2))

METRIC_INFO = dict(
    T=("tot_time",  "Total Time"),
)

In [3]:
# ────── JSON 読み出し ───────────────────────────────────────────
def metric_value(ds, mtype, rank, rep, method_key, json_key):
    base = Path(ROOT_TMPL.format(ds=ds, K=K))
    jdir = base / f"misclf_top{rank}" / f"{mtype}_repair_weight_by_de"
    if method_key == "reptran":
        fn = f"exp-repair-4-1-metrics_for_repair_n{WNUM}_{ALPHA_STR}_ours_reps{rep}.json"
    elif method_key == "random_r":
        fn = f"exp-repair-4-1-metrics_for_repair_n{WNUM}_{ALPHA_STR}_random_reps{rep}.json"
    elif method_key == "random_a":
        fn = f"exp-repair-3-2-metrics_for_repair_{ALPHA_STR}_random_reps{rep}.json"
    elif method_key == "arachne":
        fn = f"exp-repair-3-1-metrics_for_repair_{ALPHA_STR}_bl_reps{rep}.json"
    with open(jdir / fn) as f:
        return json.load(f)[json_key]

# ────── Wilcoxon & Cliff’s Δ (対応あり) ──────────────────────────
def paired_cliffs_delta(v1: np.ndarray, v2: np.ndarray):
    """対応あり Cliff’s Δ  =  (n_pos - n_neg) / N"""
    diff = v1 - v2
    n_pos = np.sum(diff > 0)
    n_neg = np.sum(diff < 0)
    return (n_pos - n_neg) / diff.size if diff.size else 0.0

def make_tag(m1, m2):
    """例: make_tag('reptran', 'random_a') -> 'R vs. Rand_A'"""
    # タグ名変換辞書
    TAG_MAP = {
        "reptran": "R",
        "arachne": "A",
        "random_r": "Rand_R",
        "random_a": "Rand_A",
    }
    return f"{TAG_MAP[m1]} vs. {TAG_MAP[m2]}"

def wilcoxon_block(values):
    """values = {method: np.array(15)}   ->   {OvA_p, OvA_d, …}"""
    out = {}
    p_raw = []
    # 生 p と Δ をまず計算
    for m1, m2 in PAIRS:
        v1, v2 = values[m1], values[m2]
        if np.allclose(v1, v2):
            p = 1.0
        else:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=UserWarning)
                p = wilcoxon(v1, v2, zero_method="wilcox").pvalue
        d = paired_cliffs_delta(v1, v2)
        tag = make_tag(m1, m2)
        out[f"{tag}_p_raw"] = p
        out[f"{tag}_d"]     = d
        p_raw.append(p)

    # Holm 補正
    _, p_adj, _, _ = multipletests(p_raw, method="holm")
    for (m1, m2), p_c in zip(PAIRS, p_adj):
        tag = make_tag(m1, m2)
        out[f"{tag}_p"] = p_c
    return out

def star(p):
    return "***" if p <= .001 else "**" if p <= .01 else "*" if p <= .05 else ""

def cell(d, p):
    return f"'{d:+.2f} {star(p)}"      # +0.45 ** のように符号を残す

In [4]:
# ────── main ────────────────────────────────────────────────────
for metric_tag, (json_key, nice_name) in METRIC_INFO.items():
    rows = []
    for ds, mtype in itertools.product(DATASETS, MISCLF_TPS):
        # 15 データ点 × 3 手法
        vals = {m: [] for m in METHODS}
        for rank, rep in itertools.product(TGT_RANKS, REPS):
            for m, _ in METHODS.items():
                vals[m].append(
                    metric_value(ds, mtype, rank, rep, m, json_key)
                )
        vals = {m: np.array(v) for m, v in vals.items()}

        stat = wilcoxon_block(vals)
        print(stat.keys())
        # tag の一覧を得る（順序を固定）
        comparison_tags = [make_tag(m1, m2) for m1, m2 in PAIRS]

        row = {
            "dataset": ds,
            "misclf_type": mtype,
        }
        for tag in comparison_tags:
            row[tag] = cell(stat[f"{tag}_d"], stat[f"{tag}_p"])
        rows.append(row)

    # 並べ替え & 保存
    order = dict(dataset=DATASETS,
                 misclf_type=MISCLF_TPS)
    df = pd.DataFrame(rows).sort_values(
        ["dataset", "misclf_type"],
        key=lambda s: s.map({v: i for col in ["dataset","misclf_type"]
                                   for i,v in enumerate(order[col])})
    )
    out_csv = f"exp-repair-4-2-3_wilcoxon_cliffs_{metric_tag}.csv"
    df.to_csv(out_csv, index=False)
    print(f"[✓] {nice_name}  →  {out_csv}")


dict_keys(['R vs. A_p_raw', 'R vs. A_d', 'R vs. Rand_R_p_raw', 'R vs. Rand_R_d', 'R vs. Rand_A_p_raw', 'R vs. Rand_A_d', 'A vs. Rand_R_p_raw', 'A vs. Rand_R_d', 'A vs. Rand_A_p_raw', 'A vs. Rand_A_d', 'Rand_R vs. Rand_A_p_raw', 'Rand_R vs. Rand_A_d', 'R vs. A_p', 'R vs. Rand_R_p', 'R vs. Rand_A_p', 'A vs. Rand_R_p', 'A vs. Rand_A_p', 'Rand_R vs. Rand_A_p'])
dict_keys(['R vs. A_p_raw', 'R vs. A_d', 'R vs. Rand_R_p_raw', 'R vs. Rand_R_d', 'R vs. Rand_A_p_raw', 'R vs. Rand_A_d', 'A vs. Rand_R_p_raw', 'A vs. Rand_R_d', 'A vs. Rand_A_p_raw', 'A vs. Rand_A_d', 'Rand_R vs. Rand_A_p_raw', 'Rand_R vs. Rand_A_d', 'R vs. A_p', 'R vs. Rand_R_p', 'R vs. Rand_A_p', 'A vs. Rand_R_p', 'A vs. Rand_A_p', 'Rand_R vs. Rand_A_p'])
dict_keys(['R vs. A_p_raw', 'R vs. A_d', 'R vs. Rand_R_p_raw', 'R vs. Rand_R_d', 'R vs. Rand_A_p_raw', 'R vs. Rand_A_d', 'A vs. Rand_R_p_raw', 'A vs. Rand_R_d', 'A vs. Rand_A_p_raw', 'A vs. Rand_A_d', 'Rand_R vs. Rand_A_p_raw', 'Rand_R vs. Rand_A_d', 'R vs. A_p', 'R vs. Rand_R_p'

# Paolo コメント受けた修正版
8/22

In [15]:
#!/usr/bin/env python
# rq1_stats_table_rr_br.py  –  RR / BR  の Wilcoxon＋Holm＋Cliff's Δ
# Modified: misclf_type, ranksごとに5回のrepsの平均を取り、3x3=9サンプルで検定

import os, json, math, itertools, warnings
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests

# ────── 固定パラメータ ──────────────────────────────────────────
DATASETS   = ["c100", "tiny-imagenet"]
K          = 0
TGT_RANKS  = [1, 2, 3]
MISCLF_TPS = ["src_tgt", "tgt_fp", "tgt_fn"]
REPS       = range(5)
WNUM       = 472

ALPHA      = 10/11
ALPHA_STR  = f"alpha{ALPHA}_boundsArachne"
ROOT_TMPL  = "/src/src/out_vit_{ds}_fold{K}"

METHODS = {"reptran": "ours", "arachne": "bl", "random_r": "random_r", "random_a": "random_a"}
PAIRS = list(itertools.combinations(METHODS.keys(), 2))

METRIC_INFO = dict(
    T=("tot_time",  "Total Time"),
)

# ────── JSON 読み出し ───────────────────────────────────────────
def metric_value(ds, mtype, rank, rep, method_key, json_key):
    base = Path(ROOT_TMPL.format(ds=ds, K=K))
    jdir = base / f"misclf_top{rank}" / f"{mtype}_repair_weight_by_de"
    if method_key == "reptran":
        fn = f"exp-repair-4-1-metrics_for_repair_n{WNUM}_{ALPHA_STR}_ours_reps{rep}.json"
    elif method_key == "random_r":
        fn = f"exp-repair-4-1-metrics_for_repair_n{WNUM}_{ALPHA_STR}_random_reps{rep}.json"
    elif method_key == "random_a":
        fn = f"exp-repair-3-2-metrics_for_repair_{ALPHA_STR}_random_reps{rep}.json"
    elif method_key == "arachne":
        fn = f"exp-repair-3-1-metrics_for_repair_{ALPHA_STR}_bl_reps{rep}.json"
    with open(jdir / fn) as f:
        return json.load(f)[json_key]

# ────── データ収集（新しいロジック）──────────────────────────────
def collect_averaged_data(ds, metric_key):
    """
    各(misclf_type, rank)の組み合わせについて、
    5回のrepsの平均を計算し、各methodごとに9個の値（3ranks × 3misclf_types）を返す
    """
    json_key = METRIC_INFO[metric_key][0]
    method_values = {method: [] for method in METHODS.keys()}
    
    for mtype in MISCLF_TPS:
        for rank in TGT_RANKS:
            # 各methodについて、この(mtype, rank)での5回のrepsの平均を計算
            for method in METHODS.keys():
                rep_values = []
                for rep in REPS:
                    try:
                        value = metric_value(ds, mtype, rank, rep, method, json_key)
                        rep_values.append(value)
                    except (FileNotFoundError, KeyError) as e:
                        print(f"Warning: Missing data for {ds}/{mtype}/{rank}/{method}/rep{rep}: {e}")
                        rep_values.append(np.nan)
                
                # 5回のrepsの平均を計算
                if rep_values:
                    avg_value = np.nanmean(rep_values)
                    method_values[method].append(avg_value)
                else:
                    method_values[method].append(np.nan)
    
    # numpy arrayに変換
    for method in METHODS.keys():
        method_values[method] = np.array(method_values[method])
    
    return method_values

# ────── Wilcoxon & Cliff's Δ (対応あり) ──────────────────────────
def paired_cliffs_delta(v1: np.ndarray, v2: np.ndarray):
    """対応あり Cliff's Δ  =  (n_pos - n_neg) / N"""
    diff = v1 - v2
    n_pos = np.sum(diff > 0)
    n_neg = np.sum(diff < 0)
    return (n_pos - n_neg) / diff.size if diff.size else 0.0

def make_tag(m1, m2):
    """例: make_tag('reptran', 'random_a') -> 'R vs. Rand_A'"""
    # タグ名変換辞書
    TAG_MAP = {
        "reptran": "R",
        "arachne": "A",
        "random_r": "Rand_R",
        "random_a": "Rand_A",
    }
    return f"{TAG_MAP[m1]} vs. {TAG_MAP[m2]}"

def wilcoxon_block(values):
    """values = {method: np.array(9)}   ->   {OvA_p, OvA_d, …}"""
    out = {}
    p_raw = []
    # 生 p と Δ をまず計算
    for m1, m2 in PAIRS:
        v1, v2 = values[m1], values[m2]
        
        # NaN値をチェック
        valid_mask = ~(np.isnan(v1) | np.isnan(v2))
        if not np.any(valid_mask):
            print(f"Warning: No valid data for comparison {m1} vs {m2}")
            p = 1.0
            d = 0.0
        else:
            v1_clean = v1[valid_mask]
            v2_clean = v2[valid_mask]
            
            if np.allclose(v1_clean, v2_clean):
                p = 1.0
            else:
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=UserWarning)
                    if len(v1_clean) < 6:  # Wilcoxon検定には最低6個のサンプルが推奨
                        print(f"Warning: Small sample size ({len(v1_clean)}) for {m1} vs {m2}")
                    p = wilcoxon(v1_clean, v2_clean, zero_method="wilcox").pvalue
            d = paired_cliffs_delta(v1_clean, v2_clean)
        
        tag = make_tag(m1, m2)
        out[f"{tag}_p_raw"] = p
        out[f"{tag}_d"]     = d
        p_raw.append(p)

    # Holm 補正
    _, p_adj, _, _ = multipletests(p_raw, method="holm")
    for (m1, m2), p_c in zip(PAIRS, p_adj):
        tag = make_tag(m1, m2)
        out[f"{tag}_p"] = p_c
    return out

def star(p):
    return "***" if p <= .001 else "**" if p <= .01 else "*" if p <= .05 else ""

def cell(d, p):
    return f"'{d:+.2f} {star(p)}"      # +0.45 ** のように符号を残す

# ────── メイン処理 ──────────────────────────────────────────────
def main():
    all_results = []
    
    for ds in DATASETS:
        print(f"Processing dataset: {ds}")
        
        for metric_key in METRIC_INFO.keys():
            print(f"  Processing metric: {metric_key}")
            
            # データ収集（新しいロジック）
            values = collect_averaged_data(ds, metric_key)
            
            # デバッグ情報
            print(f"    Sample sizes: {[(method, len(arr[~np.isnan(arr)])) for method, arr in values.items()]}")
            
            # 統計検定
            stats = wilcoxon_block(values)
            
            # 結果をまとめる
            row = {"dataset": ds, "metric": metric_key}
            for m1, m2 in PAIRS:
                tag = make_tag(m1, m2)
                d = stats[f"{tag}_d"]
                p = stats[f"{tag}_p"]
                row[tag] = cell(d, p)
                print(f"    {tag}, {p}")
            
            all_results.append(row)
    
    # 結果をDataFrameに変換して表示
    df = pd.DataFrame(all_results)
    print("\n" + "="*80)
    print("RESULTS (9 samples per comparison: 3 ranks × 3 misclf_types, averaged over 5 reps)")
    print("="*80)
    print(df.to_string(index=False))
    
    # CSVファイルに保存
    output_file = "exp-repair-4-2-3_rq1_stats_results_averaged.csv"
    df.drop(columns=["metric"], axis=1, inplace=True)  # metric列を削除
    df.to_csv(output_file, index=False)
    print(f"\nResults saved to: {output_file}")

In [16]:
main()

Processing dataset: c100
  Processing metric: T
    Sample sizes: [('reptran', 9), ('arachne', 9), ('random_r', 9), ('random_a', 9)]
    R vs. A, 0.0234375
    R vs. Rand_R, 0.0234375
    R vs. Rand_A, 0.328125
    A vs. Rand_R, 0.65234375
    A vs. Rand_A, 0.0234375
    Rand_R vs. Rand_A, 0.0234375
Processing dataset: tiny-imagenet
  Processing metric: T
    Sample sizes: [('reptran', 9), ('arachne', 9), ('random_r', 9), ('random_a', 9)]
    R vs. A, 0.0234375
    R vs. Rand_R, 0.0234375
    R vs. Rand_A, 0.734375
    A vs. Rand_R, 0.71875
    A vs. Rand_A, 0.0234375
    Rand_R vs. Rand_A, 0.0234375

RESULTS (9 samples per comparison: 3 ranks × 3 misclf_types, averaged over 5 reps)
      dataset metric  R vs. A R vs. Rand_R R vs. Rand_A A vs. Rand_R A vs. Rand_A Rand_R vs. Rand_A
         c100      T '-1.00 *     '-1.00 *      '+0.33       '-0.33      '+1.00 *          '+1.00 *
tiny-imagenet      T '-1.00 *     '-1.00 *      '-0.11       '-0.56      '+1.00 *          '+1.00 *

Results