# Bootstrapped Pairwise Differences Visualization

This notebook ingests the game-result CSV files, groups scores by prompt type, computes bootstrapped 95% confidence intervals (CIs) for all pairwise differences in means, exports a summary CSV, and generates CI plots.


In [None]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Fixed pair ordering for same_story (homogeneous) and same_story_bad_apple (robustness).
# Every prompt type is paired, in turn, with each prompt type listed before it,
# which yields all 66 unordered pairs of the 12 names in a stable plot order.
_prompt_sequence = [
    "OldManSons",
    "Turnip",
    "Spoons",
    "Teamwork",
    "Musketeers",
    "Peacemaker",
    "Soup",
    "nsPlumber",
    "Odyssey",
    "nsCarrot",
    "noinstruct",
    "maxreward",
]
fixed_pairs_regular = [
    (later, earlier)
    for position, later in enumerate(_prompt_sequence)
    for earlier in _prompt_sequence[:position]
]
# Independent copy so the two orderings can be edited separately.
fixed_pairs_temp = list(fixed_pairs_regular)

def analyze_data(data,
                 csv_filename="pairwise_confidence_intervals.csv",
                 ci_plot_filename="pairwise_CI_plot.png",
                 n_bootstrap=1000,
                 subtitle="",
                 fixed_pairs=None):
    """
    Computes bootstrapped 95% confidence intervals for pairwise differences
    in means, saves the results to CSV, and creates an error-bar plot.

    Parameters:
        data: dict mapping a category name to a sequence of numeric scores.
        csv_filename: path of the CSV summary to write (columns Category1,
            Category2, Lower_bound, Upper_bound).
        ci_plot_filename: path of the plot image; a PDF with the same stem is
            written next to it unless this path already ends in ".pdf".
        n_bootstrap: number of bootstrap resamples drawn per pair.
        subtitle: plot title. When it contains "different story"
            (case-insensitive), pairs whose category names contain "All" are
            dropped from both the CSV and the plot.
        fixed_pairs: list of (Category1, Category2) tuples to compare, in plot
            order. When None, all pairs of categories in `data` are compared,
            with categories ordered by ascending sample mean.

    Each interval describes (group with the higher sample mean) minus (group
    with the lower sample mean), regardless of the order of the pair, so the
    interval is centred on a non-negative observed difference. Pairs with a
    missing or empty category get NaN bounds and are drawn as a gray dot.
    Resampling uses NumPy's global random state.
    """
    # Bootstrap helper: resample both groups with replacement and return the
    # 2.5th / 97.5th percentiles of mean(data2) - mean(data1).
    def bootstrap_diff(data1, data2):
        diffs = []
        for _ in range(n_bootstrap):
            s1 = np.random.choice(data1, size=len(data1), replace=True)
            s2 = np.random.choice(data2, size=len(data2), replace=True)
            diffs.append(np.mean(s2) - np.mean(s1))
        return np.percentile(diffs, 2.5), np.percentile(diffs, 97.5)

    # Determine which pairs to compare
    if fixed_pairs is None:
        # heterogeneous: sort categories by their sample mean
        means = {cat: np.mean(vals) for cat, vals in data.items()}
        cats = sorted(means, key=means.get)
        pair_comparisons = [(low_cat, high_cat)
                            for i, low_cat in enumerate(cats)
                            for high_cat in cats[i + 1:]]
    else:
        pair_comparisons = fixed_pairs

    # Compute CIs
    results = []
    for a, b in pair_comparisons:
        if a not in data or b not in data:
            # missing data => NaNs
            low, up = np.nan, np.nan
            print(f"Warning: {a} or {b} not found; CI set to NaN.")
        elif len(data[a]) == 0 or len(data[b]) == 0:
            # an empty sample has no mean to bootstrap; report NaN directly
            low, up = np.nan, np.nan
            print(f"Warning: {a} or {b} has no samples; CI set to NaN.")
        elif np.mean(data[a]) < np.mean(data[b]):
            # orient the difference as higher mean minus lower mean
            low, up = bootstrap_diff(data[a], data[b])
        else:
            low, up = bootstrap_diff(data[b], data[a])
        results.append({"Category1": a, "Category2": b, "Lower_bound": low, "Upper_bound": up})

    # For different_story, drop any "All vs ..." comparisons
    if "different story" in subtitle.lower():
        results = [r for r in results if "All" not in r["Category1"] and "All" not in r["Category2"]]

    # Save filtered results; explicit columns keep the header row even when
    # no comparisons remain.
    columns = ["Category1", "Category2", "Lower_bound", "Upper_bound"]
    pd.DataFrame(results, columns=columns).to_csv(csv_filename, index=False)
    print(f"Pairwise confidence intervals saved to '{csv_filename}'.")

    # Plot error bars
    fig, ax = plt.subplots(figsize=(8, 10))
    ax.axvline(0, color='red', linestyle='--')

    y = np.arange(len(results))
    for idx, r in enumerate(results):
        l, u = r["Lower_bound"], r["Upper_bound"]
        if np.isnan(l):
            # no interval available: placeholder marker on the zero line
            ax.plot(0, idx, 'o', color='lightgray')
        else:
            center = (l + u) / 2
            err_low, err_high = center - l, u - center
            # highlight intervals that contain zero (either bound inclusive)
            crosses = l <= 0 <= u
            mcol = 'red' if crosses else 'black'
            ecol = 'red' if crosses else 'gray'
            ax.errorbar(center, idx, xerr=[[err_low], [err_high]],
                        fmt='o', color=mcol, ecolor=ecol, capsize=3)

    labels = [f"{r['Category1']} vs {r['Category2']}" for r in results]
    ax.set_yticks(y)
    ax.set_yticklabels(labels)
    ax.invert_yaxis()  # first comparison at the top
    ax.set_xlabel("Difference in Means (95% CI)")
    ax.set_title(subtitle)
    plt.tight_layout()
    plt.savefig(ci_plot_filename)
    # Derive the PDF name from the file stem so it works for any extension and
    # never rewrites a ".png" that appears elsewhere in the path.
    stem, ext = os.path.splitext(ci_plot_filename)
    if ext.lower() != ".pdf":
        plt.savefig(stem + ".pdf")
    plt.show()
    # Release the figure; otherwise figures pile up when this function is
    # called once per bundle.
    plt.close(fig)


def analyze_collaboration_scores_all(csv_files: list):
    """
    Run the pairwise-CI analysis for one bundle of result CSV files.

    1. Load and concatenate the CSV files.
    2. Pick the score column from the first file's name: 'CumulativePayoff'
       (all rows) for different_story files, otherwise 'CollaborationScore'
       restricted to rows whose Round is 'final'. Scores are coerced to
       numeric and rows without a numeric score are dropped.
    3. Build a data dict of scores keyed by PromptType.
    4. Derive the plot subtitle and the output bundle key from the first
       file's (lower-cased) name.
    5. Select fixed_pairs (None for different_story).
    6. Call analyze_data().

    Parameters:
        csv_files: non-empty list of CSV paths belonging to the same bundle.

    Raises:
        ValueError: if csv_files is empty.
        KeyError: if a required column (e.g. 'Round' for non-different_story
            files) is absent from the combined data.
    """
    if not csv_files:
        raise ValueError("csv_files must contain at least one CSV path.")

    # Load and combine
    df = pd.concat([pd.read_csv(path) for path in csv_files], ignore_index=True)

    # Normalize Round
    has_round = "Round" in df.columns
    if has_round:
        df["Round"] = df["Round"].astype(str).str.lower().str.strip()

    first = os.path.basename(csv_files[0]).lower()
    is_different_story = "different_story" in first
    if is_different_story:
        score_col = "CumulativePayoff"
        used = df.copy()
    else:
        if not has_round:
            raise KeyError("'Round' column is required to select the final-round rows")
        score_col = "CollaborationScore"
        used = df[df["Round"] == "final"].copy()
    # Same cleaning for both score columns: non-numeric entries become NaN
    # and are removed so they cannot turn a whole interval into NaN.
    used[score_col] = pd.to_numeric(used[score_col], errors="coerce")
    used = used.dropna(subset=[score_col])

    # Build data dictionary
    data_dict = {pt: grp[score_col].tolist() for pt, grp in used.groupby("PromptType")}
    print("Data dictionary (PromptType: count):")
    for k, v in data_dict.items():
        print(f"{k}: {len(v)}")

    # Subtitle logic
    if is_different_story:
        subtitle = "Different Story 4 Agents"
    elif "bad_apple" in first:
        subtitle = "Same Story Robust 4 Agents"
    elif "same_story" in first:
        if "ag4" in first:
            subtitle = "Same Story 4 Agents"
        elif "ag16" in first:
            subtitle = "Same Story 16 Agents"
        elif "ag32" in first:
            subtitle = "Same Story 32 Agents"
        else:
            subtitle = "Same Story"
    else:
        subtitle = first

    # Bundle key for filenames: strip the extension and the "game_results_"
    # prefix, then drop the third underscore-separated token for names that
    # start with "same" or "bad" (presumably a per-run identifier; confirm
    # against the file naming scheme).
    prefix = "game_results_"
    stem = os.path.splitext(first)[0]
    suff = stem[len(prefix):] if stem.startswith(prefix) else stem
    parts = suff.split('_')
    if len(parts) >= 3 and parts[0] in ["same", "bad"]:
        bundle_key = "_".join(parts[:2] + parts[3:])
    else:
        bundle_key = suff

    # Choose fixed_pairs: the temp ordering is used only for temp0.6 files
    # whose data has no "Spoons" prompt type.
    if is_different_story:
        fixed_pairs = None
    elif "Spoons" not in data_dict and "temp0.6" in first:
        fixed_pairs = fixed_pairs_temp
    else:
        fixed_pairs = fixed_pairs_regular

    # Output filenames
    csv_out = f"combined_pairwise_confidence_intervals_{bundle_key}.csv"
    ci_out = f"combined_pairwise_CI_plot_{bundle_key}.png"

    analyze_data(data_dict,
                 csv_filename=csv_out,
                 ci_plot_filename=ci_out,
                 n_bootstrap=1000,
                 subtitle=subtitle,
                 fixed_pairs=fixed_pairs)


if __name__ == "__main__":
    # Find and group game_results bundles. glob returns matches in arbitrary
    # order, so sort them to make the processing order (and the "first" file
    # of each bundle) reproducible.
    files = sorted(glob.glob("game_results_*.csv"))
    if not files:
        print("No game_results CSV files found.")
    else:
        prefix = "game_results_"
        bundles = {}
        for fpath in files:
            # Lower-case the name so the grouping key matches the lower-cased
            # bundle key that analyze_collaboration_scores_all uses for its
            # output filenames; otherwise files differing only in case would
            # form separate bundles that overwrite each other's outputs.
            stem = os.path.splitext(os.path.basename(fpath))[0].lower()
            key_suff = stem[len(prefix):] if stem.startswith(prefix) else stem
            toks = key_suff.split('_')
            # For "same_*"/"bad_*" names the third token is dropped so files
            # that differ only in that token land in the same bundle.
            if len(toks) >= 3 and toks[0] in ["same", "bad"]:
                key = "_".join(toks[:2] + toks[3:])
            else:
                key = key_suff
            bundles.setdefault(key, []).append(fpath)

        for key, group in bundles.items():
            print(f"\nProcessing bundle: {key} ({len(group)} files)")
            analyze_collaboration_scores_all(group)
