In [1]:
from collections import defaultdict
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns

In [2]:
# Load data
folder_path = "../results/chickengame/"


def load_results(folder):
    """Recursively load every ``*.json`` file found under ``folder``.

    Directories and files are visited in sorted order, so the returned list
    has the same order on every run and on every filesystem. Files are always
    decoded as UTF-8 instead of the platform's default locale encoding.

    Args:
        folder: Path of the directory tree to scan.

    Returns:
        A list holding the parsed content of each readable JSON file. Files
        that cannot be decoded or parsed are reported on stdout and skipped.
        A missing or empty ``folder`` yields an empty list.
    """
    loaded = []
    for root, dirs, files in os.walk(folder):
        # os.walk descends into `dirs` in list order; sorting in place makes
        # the traversal (and therefore the order of the result) deterministic.
        dirs.sort()
        for filename in sorted(files):
            if not filename.endswith(".json"):
                continue
            file_path = os.path.join(root, filename)
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    entry = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                # A file that is not valid UTF-8 is not valid JSON either;
                # skip it the same way as a syntactically broken file.
                print(f"Skipped invalid JSON: {file_path} ({e})")
                continue
            loaded.append(entry)
    return loaded


data = load_results(folder_path)

print(f"Loaded {len(data)} valid JSON result files from '{folder_path}' and its subfolders.")

Loaded 301 valid JSON result files from '../results/chickengame/' and its subfolders.


In [3]:
def unordered_pair(a, b):
    """Return ``a`` and ``b`` as a sorted 2-tuple so both orderings share one key."""
    first, second = sorted((a, b))
    return (first, second)


def _new_pair_bucket():
    """Create an empty accumulator for one model pair.

    Slots 0 and 1 follow the sorted order of the pair, not the agent order
    stored in the individual result files.
    """
    return {
        "model_0": None,
        "model_1": None,
        "rewards": {0: [], 1: []},
        "cooperation": {0: [], 1: []},
    }


def _slot_summary(bucket, slot):
    """Return (mean reward, cooperation rate in percent) for one slot of a bucket."""
    slot_rewards = bucket["rewards"][slot]
    slot_coop = bucket["cooperation"][slot]
    return sum(slot_rewards) / len(slot_rewards), sum(slot_coop) / len(slot_coop) * 100


aggregates = defaultdict(_new_pair_bucket)

# Aggregate data
for entry in data:
    m0 = entry["model_general_name_0"]
    m1 = entry["model_general_name_1"]
    pair = unordered_pair(m0, m1)

    rewards = [entry["agent_0_reward"], entry["agent_1_reward"]]
    # 1 when the agent chose "swerve" (counted as cooperation), 0 otherwise.
    actions = [
        int(entry["agent_0_answer"]["action"] == "swerve"),
        int(entry["agent_1_answer"]["action"] == "swerve"),
    ]

    # Map each slot of the sorted pair back to the agent index that played it.
    agent_for_slot = (0, 1) if pair == (m0, m1) else (1, 0)

    bucket = aggregates[pair]
    bucket["model_0"], bucket["model_1"] = pair
    for slot, agent in enumerate(agent_for_slot):
        bucket["rewards"][slot].append(rewards[agent])
        bucket["cooperation"][slot].append(actions[agent])

# Convert to row list
rows = []
for (m0, m1), stats in aggregates.items():
    avg_r0, coop_0 = _slot_summary(stats, 0)
    avg_r1, coop_1 = _slot_summary(stats, 1)
    rows.append({
        "model_0": m0,
        "model_1": m1,
        "avg_reward_0": avg_r0,
        "avg_reward_1": avg_r1,
        "cooperation_%_0": coop_0,
        "cooperation_%_1": coop_1,
        "n_matches": len(stats["rewards"][0]),
    })

df_coop_summary = pd.DataFrame(rows).sort_values(["model_0", "model_1"])

In [4]:
# Step 1: expand every summary row into two directed records, one per model,
# so that "model_a" is always the model whose cooperation rate is reported
# and "model_b" is the opponent it faced.
rows_long = [
    {"model_a": actor, "model_b": opponent, "cooperation_percent": pct}
    for _, summary in df_coop_summary.iterrows()
    for actor, opponent, pct in (
        (summary["model_0"], summary["model_1"], summary["cooperation_%_0"]),
        (summary["model_1"], summary["model_0"], summary["cooperation_%_1"]),
    )
]

df_long = pd.DataFrame(rows_long)

# Step 2: pivot to an actor x opponent matrix. A self-play pair contributes
# two records to the same cell; the mean aggregation averages them.
heatmap_df_chicken = df_long.pivot_table(
    values="cooperation_percent",
    index="model_a",
    columns="model_b",
    aggfunc="mean",
)

In [5]:
# Load data
# Collect every parseable *.json file below folder_path into `data`.
folder_path = "../results/staghunt/"
data = []

for root, dirs, files in os.walk(folder_path):
    # os.walk descends into `dirs` in list order; sorting in place (and
    # sorting the file names) makes the order of `data` reproducible.
    dirs.sort()
    for filename in sorted(files):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(root, filename)
        try:
            # Decode explicitly as UTF-8 rather than relying on the platform's
            # default locale encoding.
            with open(file_path, "r", encoding="utf-8") as f:
                entry = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            # Undecodable bytes are treated like any other invalid JSON file.
            print(f"Skipped invalid JSON: {file_path} ({e})")
            continue
        data.append(entry)

print(f"Loaded {len(data)} valid JSON result files from '{folder_path}' and its subfolders.")

Loaded 344 valid JSON result files from '../results/staghunt/' and its subfolders.


In [6]:
def unordered_pair(a, b):
    """Return ``a`` and ``b`` as a sorted 2-tuple so both orderings share one key."""
    first, second = sorted((a, b))
    return (first, second)


def _new_pair_bucket():
    """Create an empty accumulator for one model pair.

    Slots 0 and 1 follow the sorted order of the pair, not the agent order
    stored in the individual result files.
    """
    return {
        "model_0": None,
        "model_1": None,
        "rewards": {0: [], 1: []},
        "cooperation": {0: [], 1: []},
    }


def _slot_summary(bucket, slot):
    """Return (mean reward, cooperation rate in percent) for one slot of a bucket."""
    slot_rewards = bucket["rewards"][slot]
    slot_coop = bucket["cooperation"][slot]
    return sum(slot_rewards) / len(slot_rewards), sum(slot_coop) / len(slot_coop) * 100


aggregates = defaultdict(_new_pair_bucket)

# Aggregate data
for entry in data:
    m0 = entry["model_general_name_0"]
    m1 = entry["model_general_name_1"]
    pair = unordered_pair(m0, m1)

    rewards = [entry["agent_0_reward"], entry["agent_1_reward"]]
    # 1 when the agent chose "stag" (counted as cooperation), 0 otherwise.
    actions = [
        int(entry["agent_0_answer"]["action"] == "stag"),
        int(entry["agent_1_answer"]["action"] == "stag"),
    ]

    # Map each slot of the sorted pair back to the agent index that played it.
    agent_for_slot = (0, 1) if pair == (m0, m1) else (1, 0)

    bucket = aggregates[pair]
    bucket["model_0"], bucket["model_1"] = pair
    for slot, agent in enumerate(agent_for_slot):
        bucket["rewards"][slot].append(rewards[agent])
        bucket["cooperation"][slot].append(actions[agent])

# Convert to row list
rows = []
for (m0, m1), stats in aggregates.items():
    avg_r0, coop_0 = _slot_summary(stats, 0)
    avg_r1, coop_1 = _slot_summary(stats, 1)
    rows.append({
        "model_0": m0,
        "model_1": m1,
        "avg_reward_0": avg_r0,
        "avg_reward_1": avg_r1,
        "cooperation_%_0": coop_0,
        "cooperation_%_1": coop_1,
        "n_matches": len(stats["rewards"][0]),
    })

df_coop_summary = pd.DataFrame(rows).sort_values(["model_0", "model_1"])

In [7]:
# Step 1: expand every summary row into two directed records, one per model,
# so that "model_a" is always the model whose cooperation rate is reported
# and "model_b" is the opponent it faced.
rows_long = [
    {"model_a": actor, "model_b": opponent, "cooperation_percent": pct}
    for _, summary in df_coop_summary.iterrows()
    for actor, opponent, pct in (
        (summary["model_0"], summary["model_1"], summary["cooperation_%_0"]),
        (summary["model_1"], summary["model_0"], summary["cooperation_%_1"]),
    )
]

df_long = pd.DataFrame(rows_long)

# Step 2: pivot to an actor x opponent matrix. A self-play pair contributes
# two records to the same cell; the mean aggregation averages them.
heatmap_df_stag = df_long.pivot_table(
    values="cooperation_percent",
    index="model_a",
    columns="model_b",
    aggfunc="mean",
)

In [8]:
# Load data
# Collect every parseable *.json file below folder_path into `data`.
folder_path = "../results/trustgame/"
data = []

for root, dirs, files in os.walk(folder_path):
    # os.walk descends into `dirs` in list order; sorting in place (and
    # sorting the file names) makes the order of `data` reproducible.
    dirs.sort()
    for filename in sorted(files):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(root, filename)
        try:
            # Decode explicitly as UTF-8 rather than relying on the platform's
            # default locale encoding.
            with open(file_path, "r", encoding="utf-8") as f:
                entry = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            # Undecodable bytes are treated like any other invalid JSON file.
            print(f"Skipped invalid JSON: {file_path} ({e})")
            continue
        data.append(entry)

print(f"Loaded {len(data)} valid JSON result files from '{folder_path}' and its subfolders.")

Loaded 314 valid JSON result files from '../results/trustgame/' and its subfolders.


In [9]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict

# Set constants
base_amount = 100  # not referenced in this cell; kept for cells that may use it
multiplier = 3     # factor applied to the sent amount before computing the returned share

# Create lists to store metrics
give_data = []        # one record per match: amount agent_0 sent to agent_1
return_pct_data = []  # one record per match: share of (sent * multiplier) agent_1 sent back

for entry in data:
    m0 = entry["model_general_name_0"]
    m1 = entry["model_general_name_1"]

    sent = int(entry["agent_0_answer"]["amount"])
    received_back = int(entry["agent_1_answer"]["amount"])

    # When nothing was sent there is nothing to return, so the returned share
    # is undefined. Record it as NaN rather than 0: a 0 would be averaged in as
    # "returned nothing" and bias the pair's mean downward, whereas the mean
    # aggregation in pivot_table skips NaN values.
    if sent > 0:
        sent_back_pct = received_back / (sent * multiplier)
    else:
        sent_back_pct = float("nan")

    give_data.append({
        "sender": m0,
        "receiver": m1,
        "sent_amount": sent
    })

    return_pct_data.append({
        "sender": m1,     # agent_1 is now sender in return
        "receiver": m0,
        "return_pct": sent_back_pct * 100  # percentage
    })

# Convert to DataFrames
df_give = pd.DataFrame(give_data)
df_return = pd.DataFrame(return_pct_data)

# Pivot into heatmap format (rows: acting model, columns: counterpart)
heatmap_give = df_give.pivot_table(index="sender", columns="receiver", values="sent_amount", aggfunc="mean")
heatmap_return = df_return.pivot_table(index="sender", columns="receiver", values="return_pct", aggfunc="mean")

In [20]:
# Output column label -> matrix it is read from (rows: acting model,
# columns: counterpart). The insertion order fixes the column order below.
metric_sources = {
    "Chicken %": heatmap_df_chicken,
    "Stag %": heatmap_df_stag,
    "Trust Sent": heatmap_give,
    "Trust Returned %": heatmap_return,
}

# Every model that appears as a row label in at least one matrix.
all_models = sorted(set().union(*(matrix.index for matrix in metric_sources.values())))


def _lookup(matrix, sender, receiver):
    """Return matrix[sender, receiver], or None when either label is absent."""
    if sender in matrix.index and receiver in matrix.columns:
        return matrix.loc[sender, receiver]
    return None


# One record per ordered (sender, receiver) combination.
rows = [
    {
        "Cooperator": sender,
        "Cooperatee": receiver,
        **{label: _lookup(matrix, sender, receiver) for label, matrix in metric_sources.items()},
    }
    for sender in all_models
    for receiver in all_models
]

df_combined = (
    pd.DataFrame(rows)
    # Optional: round numeric columns
    .round({
        "Chicken %": 1,
        "Stag %": 1,
        "Trust Sent": 0,
        "Trust Returned %": 1,
    })
    # Optional: remove rows where all metrics are missing
    .dropna(subset=list(metric_sources), how="all")
)


In [21]:
# Render the combined table as LaTeX source and print it for copy/paste.
#
# escape=True is required: several column headers contain "%", which LaTeX
# treats as the start of a comment. Emitted unescaped, it silently comments out
# the rest of the header row (including the closing "\\") and breaks the
# table. Escaping also protects any "_" or "&" that may occur in model names.
latex_table = df_combined.to_latex(
    index=False,
    escape=True,
    float_format="%.1f",
    column_format="llcccc",
    longtable=False
)

print(latex_table)

\begin{tabular}{llcccc}
\toprule
Cooperator & Cooperatee & Chicken % & Stag % & Trust Sent & Trust Returned % \\
\midrule
claude & claude & 100.0 & 84.1 & 50.0 & 44.4 \\
claude & deepseek & 100.0 & 78.6 & 42.0 & 50.0 \\
claude & gpt & 100.0 & 91.7 & 50.0 & 50.0 \\
claude & mistral & 100.0 & 82.1 & 50.0 & 52.8 \\
deepseek & claude & 100.0 & 100.0 & 52.0 & 41.7 \\
deepseek & deepseek & 100.0 & 96.7 & 51.0 & 51.5 \\
deepseek & gpt & 100.0 & 94.9 & 50.0 & 51.7 \\
deepseek & mistral & 100.0 & 98.3 & 51.0 & 50.0 \\
gpt & claude & 100.0 & 97.2 & 50.0 & 48.3 \\
gpt & deepseek & 100.0 & 97.4 & 49.0 & 48.9 \\
gpt & gpt & 100.0 & 95.0 & 50.0 & 51.7 \\
gpt & mistral & 100.0 & 97.5 & 51.0 & 49.4 \\
mistral & claude & 83.3 & 100.0 & 53.0 & 50.0 \\
mistral & deepseek & 97.9 & 98.3 & 51.0 & 51.1 \\
mistral & gpt & 96.3 & 92.5 & 55.0 & 50.2 \\
mistral & mistral & 92.9 & 96.7 & 50.0 & 50.0 \\
\bottomrule
\end{tabular}

