# Install Required Dependencies


In [None]:
# Install all required dependencies
pip install -U langchain-community langchain_openai numpy pandas matplotlib seaborn


In [None]:
# Standard library
import os
import sys
import time
import random
import re
import csv
import pickle
import statistics
import glob
import itertools

# Third-party libraries
import requests
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
from typing import List, Dict, Any, TextIO

# Project-specific or custom packages
from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage, AIMessage, BaseMessage

# Load and Configure LLM Agents

In [None]:
# -----------------------------
# Central Configuration
# -----------------------------
# Every setting can be overridden through an environment variable; the second
# argument of os.getenv is the fallback used when the variable is unset.
# "llm_choice" selects the backend built in Agent.__init__: "llama" uses the
# Llama helper class, any other value falls through to ChatOpenAI.
# NOTE(review): the experiment names built in run_same_story_experiment and
# run_different_story_experiment end in a hard-coded "temp0.8", which does not
# track the temperatures configured here (default 0.6 for llama) — confirm.
config: Dict[str, Any] = {
    "llm_choice": os.getenv("LLM_CHOICE", "llama"),
    "llama": {
        "model_name": os.getenv("LLAMA_MODEL_NAME", "meta-llama-3.3-70b-instruct-fp8"),
        "temperature": float(os.getenv("LLAMA_TEMP", 0.6)),
        "max_tokens": int(os.getenv("LLAMA_MAX_TOKENS", 500)),
        # No default: stays None unless LLAMA_API_URL is set.
        "api_url": os.getenv("LLAMA_API_URL")
    },
    "openai": {
        "model_name": os.getenv("OPENAI_MODEL_NAME", "gpt-4o-mini"),
        "temperature": float(os.getenv("OPENAI_TEMP", 1.0)),
        "max_tokens": int(os.getenv("OPENAI_MAX_TOKENS", 10)),
        # No default: stays None unless OPENAI_API_KEY is set.
        "api_key": os.getenv("OPENAI_API_KEY")
    },
}

# -----------------------------
# Llama Helper Class
# -----------------------------
class Llama:
    """
    Thin HTTP client for a chat-completions style endpoint serving a Llama model.

    Provides ``invoke(history)`` returning an ``AIMessage`` -- the same call that
    ``Agent.chat`` makes on its ``llm`` -- so it can stand in for ``ChatOpenAI`` there.
    """

    # LangChain's SystemMessage / HumanMessage / AIMessage carry a `type` attribute
    # ("system" / "human" / "ai") rather than a `role`; this table translates it into
    # the role name placed in the request payload.
    # NOTE(review): assumes the endpoint expects the usual system/user/assistant
    # role names -- confirm against the server.
    _ROLE_BY_TYPE: Dict[str, str] = {
        "system": "system",
        "human": "user",
        "ai": "assistant",
    }

    def __init__(self, model_name: str, temperature: float, max_tokens: int, api_url: str) -> None:
        self.model_name = model_name
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.api_url = api_url

    @classmethod
    def _to_payload_message(cls, msg: Any) -> Dict[str, str]:
        """Convert one history entry (message object or dict) into a {"role", "content"} dict."""
        if isinstance(msg, dict):
            return {"role": msg.get("role", "unknown"), "content": msg.get("content", str(msg))}

        if hasattr(msg, "content"):
            # An explicit `role` attribute wins (ChatMessage-like objects).
            if hasattr(msg, "role"):
                return {"role": msg.role, "content": msg.content}
            # Otherwise derive the role from the message `type`.
            role = cls._ROLE_BY_TYPE.get(getattr(msg, "type", None))
            if role is not None:
                return {"role": role, "content": msg.content}

        # Unrecognised entry: keep the original last-resort representation.
        return {"role": "unknown", "content": str(msg)}

    def invoke(self, history: List[BaseMessage]) -> AIMessage:
        """
        Send the conversation to the API and return the model's reply.

        Args:
            history: Conversation so far, oldest message first.

        Returns:
            AIMessage wrapping ``choices[0].message.content`` of the JSON response.

        Raises:
            RuntimeError: On timeout, HTTP error status, any other request failure,
                or when the response body does not have the expected structure.
        """
        messages: List[Dict[str, str]] = [self._to_payload_message(msg) for msg in history]

        data: Dict[str, Any] = {
            "model": self.model_name,
            "messages": messages,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature
        }

        try:
            response = requests.post(self.api_url, json=data, timeout=30)
            response.raise_for_status()
        except requests.exceptions.Timeout as timeout_err:
            raise RuntimeError("Llama API request timed out.") from timeout_err
        except requests.exceptions.HTTPError as http_err:
            raise RuntimeError(f"Llama API HTTP error: {http_err}") from http_err
        except requests.exceptions.RequestException as err:
            raise RuntimeError(f"Llama API connection error: {err}") from err

        try:
            response_data = response.json()
            content = response_data["choices"][0]["message"]["content"]
        except (ValueError, KeyError, IndexError, TypeError) as parse_err:
            # TypeError covers bodies where an expected container is null or a scalar.
            raise RuntimeError(f"Failed to parse Llama API response: {parse_err}") from parse_err

        return AIMessage(content=content)

def log_records(message: str, records_file: TextIO) -> None:
    """
    Append one line to the records file and flush it right away.

    Flushing after every write keeps the on-disk history current even if the
    run is interrupted.
    """
    line = message + "\n"
    records_file.write(line)
    records_file.flush()

def calculate_payoffs(contributions: List[float], e: float, m: float, na: int) -> List[float]:
    """
    Compute each agent's payoff for one round.

    Every agent keeps what it did not contribute (e - contribution) and receives
    an equal share of the multiplied pool (m * total / na).
    """
    pool: float = sum(contributions)
    per_agent_share: float = m * pool / na
    payoffs: List[float] = []
    for contribution in contributions:
        payoffs.append(e - contribution + per_agent_share)
    return payoffs

# -----------------------------
# Agent Class
# -----------------------------
class Agent:
    """
    LLM-backed player whose backend is chosen by config["llm_choice"].

    The agent keeps its full conversation (system prompt, every prompt it was
    sent, every reply it gave) in `history`, and that whole history is passed
    to the model on each call.
    """
    def __init__(self, name: str, system_message: str, records_file: TextIO) -> None:
        self.name = name
        self.records_file = records_file
        # The system prompt is always the first entry of the conversation.
        self.history: List[BaseMessage] = [SystemMessage(content=system_message)]

        # Record the agent's creation and its system prompt.
        log_records(f"CREATING AGENT: {self.name}", records_file)
        log_records(f"System Prompt for {self.name}: {system_message}", records_file)

        self.llm = self._build_llm()

    @staticmethod
    def _build_llm():
        """Instantiate the backend: Llama when llm_choice is "llama", ChatOpenAI otherwise."""
        if config["llm_choice"] == "llama":
            settings = config["llama"]
            return Llama(
                model_name=settings["model_name"],
                temperature=settings["temperature"],
                max_tokens=settings["max_tokens"],
                api_url=settings["api_url"]
            )

        settings = config["openai"]
        return ChatOpenAI(
            model_name=settings["model_name"],
            temperature=settings["temperature"],
            max_tokens=settings["max_tokens"],
            openai_api_key=settings["api_key"]
        )

    def chat(self, message: str) -> str:
        """
        Send `message` as a human turn and return the model's reply text.

        Both the prompt and the reply are logged and appended to `history`.
        If the backend raises RuntimeError, the error is logged and returned as
        an "[ERROR]: ..." string; the human message stays in `history` in that case.
        """
        log_records(f"{self.name} receives HUMAN message: {message}", self.records_file)
        self.history.append(HumanMessage(content=message))

        try:
            reply = self.llm.invoke(self.history)
        except RuntimeError as err:
            log_records(f"{self.name} ERROR: {err}", self.records_file)
            return f"[ERROR]: {err}"

        # Remember the reply so later rounds see it.
        self.history.append(reply)
        log_records(f"{self.name} responds ASSISTANT: {reply.content}", self.records_file)
        # Brief pause to help avoid rate limits.
        time.sleep(0.01)
        return reply.content

# -----------------------------
# DummyAgent Class
# -----------------------------
class DummyAgent:
    """
    Stand-in player that never calls an LLM and always answers with a
    contribution of 0.
    """
    # Fixed reply, formatted like a valid LLM answer.
    _FIXED_REPLY = "<TOKEN>0</TOKEN>"

    def __init__(self, name: str, system_message: str, records_file: TextIO) -> None:
        self.name = name
        self.records_file = records_file
        self.history: List[BaseMessage] = [SystemMessage(content=system_message)]

        log_records(f"CREATING DUMMY AGENT: {self.name}", records_file)
        log_records(f"(Dummy) System Prompt for {self.name}: {system_message}", records_file)

    def chat(self, message: str) -> str:
        """
        Log the incoming message and the fixed reply, then return the reply.

        Unlike Agent.chat, nothing is appended to `history`.
        """
        log_records(f"{self.name} (dummy) receives HUMAN message: {message}", self.records_file)
        reply = self._FIXED_REPLY
        log_records(f"{self.name} (dummy) responds ASSISTANT: {reply}", self.records_file)
        # Same short pause as the LLM-backed agent.
        time.sleep(0.01)
        return reply

# Define Agent Utility Functions

In [None]:
def load_intermediate_results(exp_name, is_dict=True):
    """
    Load the checkpoint for `exp_name`, or start an empty one.

    Returns:
        tuple: (results, filename) where filename is "results_<exp_name>.pkl"
        and results is the unpickled content if that file exists, otherwise
        an empty dict (is_dict=True) or an empty list (is_dict=False).
    """
    checkpoint_path = f"results_{exp_name}.pkl"

    if not os.path.exists(checkpoint_path):
        empty = {} if is_dict else []
        return empty, checkpoint_path

    # NOTE: unpickling executes arbitrary code; only load checkpoints you produced.
    with open(checkpoint_path, 'rb') as f:
        loaded = pickle.load(f)
    print(f"Loaded intermediate results from {checkpoint_path}.")
    return loaded, checkpoint_path

def save_intermediate_results(intermediate_results, intermediate_results_filename):
    """
    Save intermediate results to a pickle file without risking the old checkpoint.

    The data is first written to "<filename>.tmp" and then moved over the target
    with os.replace, so an interruption during the dump leaves the previous
    checkpoint intact instead of a truncated file.

    Args:
        intermediate_results: Any picklable object.
        intermediate_results_filename (str): Destination path of the checkpoint.
    """
    tmp_path = f"{intermediate_results_filename}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(intermediate_results, f)
        os.replace(tmp_path, intermediate_results_filename)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up after a failed dump.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def get_system_prompt(na, e, m):
    """
    Build the base system prompt for the public goods game.

    The text ends with the literal placeholder "STORY", which callers replace
    with the story content.

    Args:
        na (int): Number of agents.
        e (int): Endowment per round (tokens given to each agent).
        m (float): Multiplier applied to the total contribution.

    Returns:
        str: The formatted system prompt.
    """
    others = na - 1

    rules = (
        f"You are participating in a repeated public goods game with {others} other agents. "
        f"In each round, you receive {e} tokens. You must decide how many tokens (an integer between 0 and {e}) "
        f"to contribute to a public pool. After everyone contributes, the total contribution is multiplied by {m} "
        f"and divided equally among all {na} agents. This means your payoff for a round is calculated as: \n\n"
    )
    payoff_formula = f"    {e} - (your contribution) + ({m} * total contributions / {na})\n\n"
    answer_format = (
        "Please respond **only** with the number of tokens you wish to contribute for this round, enclosed within `<TOKEN>` and `</TOKEN>` tags. "
        "For example: `<TOKEN>5</TOKEN>`. Do not provide any additional text, explanations, or summaries. "
    )
    story_placeholder = "STORY"

    return "".join([rules, payoff_formula, answer_format, story_placeholder])

def compute_and_print_statistics(scores_by_prompt, exp_name):
    """
    Print the mean and sample standard deviation of the scores for each prompt.

    Args:
        scores_by_prompt (dict): Maps a prompt/story label to a list of numbers.
        exp_name (str): Experiment name shown in the header line.

    A label with no scores is reported as such instead of raising
    statistics.StatisticsError; a label with a single score gets SD = 0.
    """
    print(f"\n=== Comparison of Effective Collaboration Scores Over Games for experiment {exp_name} ===")
    for prompt_label, score_list in scores_by_prompt.items():
        if not score_list:
            # statistics.mean rejects empty data (e.g. a story no agent was assigned).
            print(f"{prompt_label}: no scores recorded")
            continue
        mean_val = statistics.mean(score_list)
        # The sample standard deviation needs at least two data points.
        stdev_val = statistics.stdev(score_list) if len(score_list) > 1 else 0
        print(f"{prompt_label}: Mean = {mean_val:.2f}, SD = {stdev_val:.2f}")

def extract_contribution(response_str):
    """
    Pull the contribution out of an agent's reply.

    Only the first occurrence of `<TOKEN>digits</TOKEN>` (digits only, no
    whitespace or sign inside the tags) is accepted.

    Returns:
        int | None: The parsed number, or None when no such tag is present.
    """
    found = re.search(r"<TOKEN>(\d+)</TOKEN>", response_str)
    return int(found.group(1)) if found else None


def get_valid_contribution(agent, round_num, e, max_retries=5):
    """
    Ask the agent for its contribution until it gives a parseable answer.

    The agent is queried at most `max_retries` times; from the second attempt
    on, the prompt includes a reminder about the required `<TOKEN>` format.
    Returns the parsed contribution, or 0 if every attempt was invalid.
    """
    question = f"Round {round_num}: What is your contribution (0-{e})?"
    reminder = " Your previous response was invalid. **Only provide a number inside `<TOKEN>...</TOKEN>`** with no extra text. Example: `<TOKEN>5</TOKEN>`."

    for attempt in range(1, max_retries + 1):
        prompt = question if attempt == 1 else question + reminder

        response_str = agent.chat(prompt).strip()
        print(f"{agent.name} (Story: {agent.story_label}) response (attempt {attempt}): {response_str}")

        contribution = extract_contribution(response_str)
        if contribution is not None:
            return contribution

        print(f"Warning: {agent.name} provided an invalid response. Retrying... ({attempt}/{max_retries})")

    # Every attempt failed: report it and fall back to contributing nothing.
    print(f"Error: {agent.name} failed to provide a valid response after {max_retries} attempts. Defaulting to 0.")
    return 0

def collect_contributions(agents, round_num, e):
    """
    Query every agent once for this round and return the contributions, in
    agent order, each limited to the range 0..e.
    """
    contributions = []
    for agent in agents:
        requested = get_valid_contribution(agent, round_num, e)

        # An agent cannot give more than the `e` tokens it receives each round.
        if requested > e:
            print(f"{agent.name} attempted to contribute {requested} tokens but only has {e}. "
                    f"Reducing contribution to {e}.")
            requested = e

        contributions.append(max(0, requested))

    print(f"Round Contributions: {contributions}")
    return contributions

def calculate_rewards(contributions, agents, e, m, na, total_rewards, round_num):
    """
    Compute the round's payoffs, add them to `total_rewards` (updated in
    place), and send each agent a summary of the round through agent.chat.

    Returns:
        tuple: (payoffs, round_total, total_rewards)
    """
    payoffs = calculate_payoffs(contributions, e, m, na)
    print(f"Round Payoffs: {payoffs}")

    round_total = sum(contributions)

    for idx, agent in enumerate(agents):
        total_rewards[idx] += payoffs[idx]
        summary_lines = [
            f"Round {round_num} Summary:",
            f" - Your contribution: {contributions[idx]}",
            f" - Total contributions: {round_total}",
            f" - Your payoff this round: {payoffs[idx]:.2f}",
            f" - Your cumulative reward: {total_rewards[idx]:.2f}",
        ]
        # The reply to the summary is not used; the call records the round
        # outcome in the agent's conversation.
        agent.chat("\n".join(summary_lines))

    return payoffs, round_total, total_rewards


def log_round_results(csv_writer, agents, contributions, payoffs, total_rewards, game_index, prompt_label, round_num, exp_type):
    """
    Write one CSV row per agent for the given round.

    The second column holds the agent's own story label when exp_type is
    "different_story", and `prompt_label` otherwise. The last column
    (CollaborationScore) is left empty on per-round rows.
    """
    per_agent_label = exp_type == "different_story"

    for idx, agent in enumerate(agents):
        label = agent.story_label if per_agent_label else prompt_label
        row = [
            game_index,
            label,
            round_num,
            agent.name,
            contributions[idx],
            f"{payoffs[idx]:.2f}",
            f"{total_rewards[idx]:.2f}",
            "",
        ]
        csv_writer.writerow(row)
def execute_game_rounds(agents, na, nr, e, m, csv_writer, records_file, game_index, prompt_label, exp_type, num_dummy_agents):
    """
    Executes a full game session consisting of multiple rounds where agents contribute to a shared pool.

    Each round collects the contributions, computes payoffs, feeds the round
    summary back to the agents, and writes one CSV row per agent. After the last
    round a "final" row carrying the collaboration score is written.

    Args:
        agents (list): List of agent objects.
        na (int): Number of agents.
        nr (int): Number of rounds.
        e (int): Endowment per round.
        m (float): Multiplier for contributions.
        csv_writer (csv.writer): CSV writer object.
        records_file (file object): Accepted for signature compatibility; not used here
            (agents hold their own reference to the records file).
        game_index (int): Game identifier.
        prompt_label (str): Label for the prompt or story used.
        exp_type (str): "same_story" or "different_story" (to determine CSV formatting).
        num_dummy_agents: Number of dummy agents in the game
    Returns:
        effective_score (float): Total contributions divided by the maximum the
            non-dummy agents could have contributed; 0.0 when that maximum is
            not positive (no rounds, zero endowment, or only dummy agents).
        total_rewards (list): Cumulative rewards for each agent.
    """
    total_rewards = [0] * na
    total_game_contributions = 0

    print("\n=== Starting a New Game ===")
    for round_num in range(1, nr + 1):
        print(f"\n--- Round {round_num} ---")
        contributions = collect_contributions(agents, round_num, e)
        payoffs, round_total, total_rewards = calculate_rewards(contributions, agents, e, m, na, total_rewards, round_num)
        log_round_results(csv_writer, agents, contributions, payoffs, total_rewards, game_index, prompt_label, round_num, exp_type)
        total_game_contributions += round_total

    # Dummy agents always contribute 0, so they are excluded from the attainable maximum.
    max_possible = (na - num_dummy_agents) * e * nr
    # Guard against division by zero for degenerate configurations.
    effective_score = total_game_contributions / max_possible if max_possible > 0 else 0.0
    print(f"\nEffective Collaboration Score: {effective_score:.2f}")

    csv_writer.writerow([
        game_index,
        prompt_label,
        "final",
        "All",
        "",
        "",
        "",
        f"{effective_score:.2f}"
    ])

    return effective_score, total_rewards

def run_single_game(game_index: int, prompt_label: str, system_prompt_used: str,
                    na: int, nr: int, e: int, m: float, csv_writer, records_file, num_dummy_agents, exp_type) -> float:
    """
    Play one game of `nr` rounds in which every agent shares the same system prompt.

    The first `num_dummy_agents` agents are DummyAgent instances; the rest are
    LLM-backed Agent instances. Returns the game's effective collaboration score.
    """
    # Fresh agents for every game so no conversation history carries over.
    agents = []
    for position in range(1, na + 1):
        agent_cls = DummyAgent if position <= num_dummy_agents else Agent
        player = agent_cls(f"Agent_{position}", system_prompt_used, records_file)
        # All agents share the same label in this experiment type.
        player.story_label = prompt_label
        agents.append(player)

    effective_score, _ = execute_game_rounds(
        agents, na, nr, e, m, csv_writer, records_file, game_index, prompt_label, exp_type, num_dummy_agents
    )
    return effective_score

def run_single_game_random_story(game_index: int, system_prompt_story: str, na: int, nr: int, e: int, m: float,
                                csv_writer, records_file, story_prompts: dict, exp_type) -> (float, list):
    """
    Play one game in which every agent is given an independently, randomly chosen story.

    Returns:
    - effective_score: overall collaboration score (total game contributions divided by maximum possible)
    - agent_results: list of tuples (agent_name, story_label, cumulative_reward) for each agent.
    """
    story_choices = list(story_prompts.items())

    agents = []
    for position in range(1, na + 1):
        label, story_text = random.choice(story_choices)
        # Substitute the story for the "STORY" placeholder of the base prompt.
        player = Agent(f"Agent_{position}", system_prompt_story.replace("STORY", story_text), records_file)
        player.story_label = label
        agents.append(player)

    # No dummy agents in this experiment; the final CSV row is labelled "All".
    effective_score, total_rewards = execute_game_rounds(
        agents, na, nr, e, m, csv_writer, records_file, game_index, "All", exp_type, 0
    )

    agent_results = []
    for idx in range(na):
        player = agents[idx]
        agent_results.append((player.name, player.story_label, total_rewards[idx]))
    return effective_score, agent_results

# Configure and Launch Game Simulations

In [None]:
# Column names written as the first row of every game_results_*.csv file.
# Per-round rows leave CollaborationScore empty; the per-game "final" row
# fills only CollaborationScore (see log_round_results / execute_game_rounds).
CSV_HEADER = ["Game", "PromptType", "Round", "AgentName", "Contribution", "RoundPayoff", "CumulativePayoff", "CollaborationScore"]

def run_same_story_experiment(is_bad_apple, story_index, num_rounds_list, endowment_list, multiplier_list, num_games, num_agents_list, exp_type):
    """
    Run the same-story (or bad-apple) experiment for one story.

    The story is the `story_index`-th file of the alphabetically sorted
    "stories/*.txt"; an index past the end prints a message and exits with
    status 1. One experiment is run for every combination of agent count,
    round count, endowment and multiplier, and its statistics are printed.
    """
    story_files = sorted(glob.glob("stories/*.txt"))
    if story_index >= len(story_files):
        print("Invalid story index. Exiting.")
        sys.exit(1)

    story_path = story_files[story_index]
    story_name = os.path.splitext(os.path.basename(story_path))[0]

    # These two depend only on the experiment flavour, not on the parameter grid.
    name_prefix = 'bad_apple' if is_bad_apple else 'same_story'
    num_dummy_agents = 1 if is_bad_apple else 0

    parameter_grid = itertools.product(num_agents_list, num_rounds_list, endowment_list, multiplier_list)
    for na, nr, e, m in parameter_grid:
        # NOTE(review): the "temp0.8" suffix is a fixed string, not read from config.
        exp_name = f"{name_prefix}_{story_name}_ag{na}_ro{nr}_end{e}_mult{m}_temp0.8"

        print(f"\n\n######################\nRunning experiment: {exp_name}\n######################\n")
        intermediate_results, results_path = load_intermediate_results(exp_name, is_dict=True)

        scores_by_prompt = run_same_story_games(
            exp_name, story_path, story_name, na, nr, e, m, num_games, num_dummy_agents,
            intermediate_results, results_path, exp_type
        )
        compute_and_print_statistics(scores_by_prompt, exp_name)


def run_different_story_experiment(num_rounds_list, endowment_list, multiplier_list, num_games, num_agents_list, exp_type):
    """
    Run the different-story experiment for every parameter combination.

    For each combination of agent count, round count, endowment and multiplier,
    this loads the story prompts and any checkpoint, plays the games, and prints
    per-story reward statistics plus the overall collaboration score.

    Stories with no recorded rewards are left out of the per-story summary, and
    the overall SD is reported as 0 when only one score is available, so that
    neither case raises statistics.StatisticsError.
    """
    for na, nr, e, m in itertools.product(num_agents_list, num_rounds_list, endowment_list, multiplier_list):
        # NOTE(review): the "temp0.8" suffix is a fixed string, not read from config.
        exp_name = f"different_story_ag{na}_ro{nr}_end{e}_mult{m}_temp0.8"
        print(f"\n\n######################\nRunning experiment: {exp_name}\n######################\n")

        story_prompts = load_all_story_prompts()
        system_prompt = get_system_prompt(na, e, m)
        intermediate_results, results_path = load_intermediate_results(exp_name, is_dict=False)

        scores_list, rewards_by_story = run_different_story_games(
            exp_name, na, nr, e, m, num_games, story_prompts, system_prompt,
            intermediate_results, results_path, exp_type
        )

        print(f"\n=== Rewards by Story for experiment {exp_name} ===")
        # A story that no agent drew has an empty list, which has no mean.
        recorded_rewards = {story: rewards for story, rewards in rewards_by_story.items() if rewards}
        compute_and_print_statistics(recorded_rewards, exp_name)
        if scores_list:
            mean_score = statistics.mean(scores_list)
            # The sample standard deviation needs at least two data points.
            sd_score = statistics.stdev(scores_list) if len(scores_list) > 1 else 0
            print(f"\nOverall Effective Collaboration Score: Mean = {mean_score:.2f}, SD = {sd_score:.2f}")


def run_same_story_games(exp_name, story_file, story_name, na, nr, e, m, num_games, num_dummy_agents, intermediate_results, results_path, exp_type):
    """
    Play games with a single story until `num_games` scores exist, resuming from a checkpoint.

    The records and CSV files are opened in write mode, so they are recreated
    on every call. Scores of games already present in `intermediate_results`
    are re-emitted as "final" CSV rows; their per-round rows are not restored.
    After each new game the checkpoint is saved.

    Returns:
        dict: {story_name: [score of every game, checkpointed ones first]}
    """
    with open(story_file, "r", encoding="utf-8") as f:
        story_content = f.read()
    # The two baseline prompts are used verbatim; every other story gets the framing sentence.
    if story_name not in ("maxreward", "noinstruct"):
        story_content = "Your behavior is influenced by the following bedtime story your mother read to you every night: " + story_content
    prompt_text = get_system_prompt(na, e, m).replace("STORY", story_content)

    # `checkpointed` is the list stored inside intermediate_results; `scores` is a working copy.
    checkpointed = intermediate_results.setdefault(story_name, [])
    scores = list(checkpointed)

    records_path = f"records_{exp_name}.txt"
    csv_path = f"game_results_{exp_name}.csv"
    with open(records_path, "w", encoding="utf-8") as records_file, \
         open(csv_path, "w", newline="", encoding="utf-8") as results_file:

        writer = csv.writer(results_file)
        writer.writerow(CSV_HEADER)

        # Re-emit the final row of each game completed in an earlier run.
        for finished_game, finished_score in enumerate(scores, start=1):
            writer.writerow([finished_game, story_name, "final", "All", "", "", "", f"{finished_score:.2f}"])
        print(f"\n=== Running Games with prompt: {story_name} for experiment {exp_name} ===")

        # Continue numbering after the games that were already played.
        first_new_game = len(scores) + 1
        for game_index in range(first_new_game, num_games + 1):
            print(f"\n=== Game {game_index} ({story_name}) for experiment {exp_name} ===")
            score = run_single_game(game_index, story_name, prompt_text, na, nr, e, m, writer, records_file, num_dummy_agents, exp_type)
            scores.append(score)
            checkpointed.append(score)
            save_intermediate_results(intermediate_results, results_path)

    return {story_name: scores}


def run_different_story_games(exp_name, na, nr, e, m, num_games, story_prompts, system_prompt, intermediate_results, results_path, exp_type):
    """
    Play different-story games until `num_games` results exist, resuming from a checkpoint.

    `intermediate_results` is a list of (game_index, score, agent_results) tuples,
    one per completed game, where agent_results holds
    (agent_name, story_label, cumulative_reward) tuples. Games found there are not
    replayed, but their scores and rewards are included in the returned statistics
    and their "final" rows are re-emitted to the CSV, mirroring
    run_same_story_games. Per-round rows of those games cannot be restored because
    the checkpoint only stores per-game results.

    Returns:
        tuple: (scores_list, rewards_by_story) covering checkpointed and new games.
    """
    scores_list = []
    rewards_by_story = {story: [] for story in story_prompts}

    # Both files are opened in write mode, so they are recreated on every call.
    with open(f"records_{exp_name}.txt", "w", encoding="utf-8") as records_file, \
         open(f"game_results_{exp_name}.csv", "w", newline="", encoding="utf-8") as results_file:

        writer = csv.writer(results_file)
        writer.writerow(CSV_HEADER)

        # Restore results of games completed in an earlier run.
        games_done = len(intermediate_results)
        for done_index, done_score, done_agent_results in intermediate_results:
            print(f"Skipping game {done_index} as it has already been run.")
            writer.writerow([done_index, "All", "final", "All", "", "", "", f"{done_score:.2f}"])
            scores_list.append(done_score)
            for _, story_label, reward in done_agent_results:
                # setdefault tolerates labels whose story file has since been removed.
                rewards_by_story.setdefault(story_label, []).append(reward)

        for game_index in range(games_done + 1, num_games + 1):
            print(f"\n=== Game {game_index} for experiment {exp_name} ===")
            score, agent_results = run_single_game_random_story(
                game_index, system_prompt, na, nr, e, m, writer, records_file, story_prompts, exp_type
            )
            # Checkpoint after every game so an interrupted run can resume.
            intermediate_results.append((game_index, score, agent_results))
            save_intermediate_results(intermediate_results, results_path)

            scores_list.append(score)
            for _, story_label, reward in agent_results:
                rewards_by_story[story_label].append(reward)

    return scores_list, rewards_by_story


def load_all_story_prompts():
    """
    Read every "stories/*.txt" file into a dict keyed by file name without extension.

    All stories except the baselines "maxreward" and "noinstruct" are prefixed
    with the bedtime-story framing sentence.
    """
    framing = "Your behavior is influenced by the following bedtime story your mother read to you every night: "
    unframed = ("maxreward", "noinstruct")

    story_prompts = {}
    for path in sorted(glob.glob("stories/*.txt")):
        label, _ = os.path.splitext(os.path.basename(path))
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        story_prompts[label] = text if label in unframed else framing + text
    return story_prompts


# Main Execution Block

In [None]:
# -----------------------------
# Configurable Run Function
# -----------------------------
def run_experiment(exp_type, story_index=0):
    """
    Entry point that runs one experiment type with the fixed settings below.

    - "same_story": 100 games for each of 4, 16 and 32 agents.
    - "bad_apple":  100 games with 16 agents, one of them a dummy.
    - "different_story": 400 games with 16 agents (`story_index` is ignored).

    An unknown `exp_type`, or any Exception raised while running, is reported
    with a printed "[ERROR]" line instead of being raised.
    """
    # -----------------------------
    # Experiment Configurations
    # -----------------------------
    num_rounds_list = [5]
    endowment_list = [10]
    multiplier_list = [1.5]

    try:
        if exp_type == "different_story":
            run_different_story_experiment(
                num_rounds_list, endowment_list, multiplier_list, 400, [16], exp_type
            )
        elif exp_type in ("same_story", "bad_apple"):
            is_bad_apple = exp_type == "bad_apple"
            run_same_story_experiment(
                is_bad_apple=is_bad_apple,
                story_index=story_index,
                num_rounds_list=num_rounds_list,
                endowment_list=endowment_list,
                multiplier_list=multiplier_list,
                num_games=100,
                num_agents_list=[16] if is_bad_apple else [4, 16, 32],
                exp_type=exp_type
            )
        else:
            print(f"[ERROR] Unknown experiment type: {exp_type}")
    except Exception as e:
        print(f"[ERROR] Experiment '{exp_type}' failed: {e}")


# Homogenous Experiment

Runs 100 games per story for agent sizes [4, 16, 32].

(a) Cooperation Among Homogeneous Agents
To run the experiment across all stories:

In [None]:
# Run all 12 stories for same_story
# The 12 is hard-coded: story_index selects a file from the sorted
# "stories/*.txt" list, and run_same_story_experiment exits via sys.exit(1)
# when the index is past the end of that list.
for i in range(12):
    print(f"Running same_story experiment for story {i}")
    run_experiment("same_story", story_index=i)


(b) Robustness Experiment
Same as the homogeneous (same-story) experiment, but with one dummy agent that always contributes 0.

To run the experiment across all stories:

In [None]:
# Run all 12 stories for bad_apple
# The 12 is hard-coded: story_index selects a file from the sorted
# "stories/*.txt" list, and run_same_story_experiment exits via sys.exit(1)
# when the index is past the end of that list.
for i in range(12):
    print(f"Running bad_apple experiment for story {i}")
    run_experiment("bad_apple", story_index=i)

# Heterogenous Experiment

Assigns a random story to each agent and runs 400 games with 16 agents.

In [None]:
# Run different_story experiment once
# story_index is not used for this experiment type; each agent draws its own
# story at random in run_single_game_random_story.
print("Running different_story experiment")
run_experiment("different_story")

# Visualization


The project includes scripts to visualize collaboration and scaling results.

# 1. Distribution Analysis Plots

Generates violin plots for different experiment types:

Collaboration Score for Homogenous and Robustness experiments.
Payoff per Agent for Heterogenous experiment.

Run:

In [None]:
# Define color dictionary for plot consistency
# Keys are PromptType labels (story file names without extension); plot_violin
# falls back to grey ("#888888") for any label missing here.
COLOR_DICT = {
    # Baseline condition (Shades of Blue)
    "maxreward": "#87CEFA",
    "noinstruct": "#4682B4",
    "nsCarrot": "#4169E1",
    "nsPlumber": "#00008B",
    # Meaningful stories (Shades of Purple/Pink)
    "Odyssey": "#FFB3E6",
    "Soup": "#FF99CC",
    "Peacemaker": "#FF66B3",
    "Musketeers": "#FF4D9E",
    "Teamwork": "#F02278",
    "Spoons": "#D81B60",
    "Turnip": "#B83B7D",
    "OldManSons": "#B22272",
}

# Ensure vectorized rendering
# Default save format is PDF; font type 42 selects TrueType font embedding for
# PDF/PS output (per the Matplotlib rcParams documentation).
mpl.rcParams['savefig.format'] = 'pdf'
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

def load_csv_files(pattern):
    """
    Read every CSV file matching the glob `pattern` into one DataFrame.

    Returns the concatenated frame with a fresh 0..n-1 index, or None (after
    printing a notice) when nothing matches.
    """
    matches = glob.glob(pattern)
    if matches:
        frames = [pd.read_csv(path) for path in matches]
        return pd.concat(frames, ignore_index=True)

    print(f"No files found for pattern: {pattern}")
    return None


def preprocess_data(df, metric):
    """
    Reduce a game-results frame to the rows that carry `metric`.

    - metric == "CollaborationScore": keep the per-game rows whose Round is "final".
    - otherwise ("CumulativePayoff"): drop the "final" rows and keep, for every
      (Game, AgentName) pair, the row with the highest numeric Round.

    The metric column is converted to numbers and rows where it is missing or
    not numeric are dropped. Returns None for a None or empty input.
    """
    if df is None or df.empty:
        return None

    if metric == "CollaborationScore":
        selected = df.loc[df["Round"] == "final"].copy()
    else:  # "CumulativePayoff"
        per_round = df.loc[df["Round"] != "final"].copy()
        per_round["Round"] = pd.to_numeric(per_round["Round"], errors="coerce")
        per_round = per_round.dropna(subset=["Round"])
        # Index label of the last round for each agent in each game.
        last_round_idx = per_round.groupby(["Game", "AgentName"])["Round"].idxmax()
        selected = per_round.loc[last_round_idx].copy()

    selected[metric] = pd.to_numeric(selected[metric], errors="coerce")
    return selected.dropna(subset=[metric])


def plot_violin(df, metric, title, output_pdf, plot_mean_line=True, show_legend=False):
    """
    Draw a violin plot of `metric` per PromptType and save it as a PDF.

    Violins are ordered left to right by ascending mean of `metric`, overlaid
    with the individual data points and, optionally, a line through the means.

    Args:
        df (pd.DataFrame | None): Data with a "PromptType" column and a numeric
            `metric` column. None or an empty frame prints a notice and returns.
        metric (str): "CollaborationScore" or "CumulativePayoff".
        title (str): Figure title.
        output_pdf (str): Path of the output file; it is always written in PDF format.
        plot_mean_line (bool): Draw the line connecting the per-prompt means.
        show_legend (bool): Show a legend for the mean line.
    """
    if df is None or df.empty:
        print(f"No data to visualize for {title}")
        return

    # Per-prompt means in ascending order: this fixes the x-axis order and
    # provides the values for the trend line.
    means = df.groupby("PromptType")[metric].mean().sort_values(ascending=True)
    order = means.index.tolist()

    plt.figure(figsize=(12, 7))

    # Define color palette
    palette = {cat: COLOR_DICT.get(cat, "#888888") for cat in order}

    # Violin plot with embedded box plot
    ax = sns.violinplot(
        data=df,
        x="PromptType",
        y=metric,
        hue="PromptType",
        palette=palette,
        inner="box",
        dodge=False,
        order=order,
        bw_adjust=5, # Adjusting KDE bandwidth
        # Each violin gets the same area. `density_norm` is the current name of
        # the deprecated `scale` argument.
        density_norm="area",
    )

    if ax.get_legend():
        ax.get_legend().remove()

    # Overlay scatter points
    sns.stripplot(
        data=df,
        x="PromptType",
        y=metric,
        color="black",
        dodge=False,
        alpha=0.2,
        size=2,
        zorder=2,
        order=order
    )

    # (Optional) Plot mean trend line
    if plot_mean_line:
        x_positions = list(range(len(order)))
        plt.plot(
            x_positions, means.values, marker='o',
            color='black', linestyle='-', linewidth=2,
            markersize=6, alpha=0.5, label="Mean Trend"
        )
        if show_legend:
            plt.legend(["Mean Trend"])

    # Customize plot

    if metric == "CollaborationScore":
        plt.ylim(0, 1.3)  # Set range for Collaboration Score
    elif metric == "CumulativePayoff":
        plt.ylim(0, 120) #Set range for Cumulative Payoff
    plt.xlabel("Story Prompt", fontsize=18, labelpad=15)

    ylabel_text = "Payoff per Agent" if metric == "CumulativePayoff" else "Collaboration Score"

    plt.ylabel(f"{ylabel_text}", fontsize=18, labelpad=15)
    plt.title(title, fontsize=20, weight="bold" , pad=20)
    # Rotate the tick labels once there are too many prompts to fit horizontally.
    plt.xticks(rotation=90 if len(order) > 5 else 0, fontsize=14)
    plt.yticks(fontsize=14)

    sns.despine()
    plt.grid(False)

    # Save the plot as Pdf
    plt.tight_layout()
    plt.savefig(output_pdf, bbox_inches='tight', format='pdf', transparent=False)  # Fully vectorized PDF
    print(f"Figure saved as {output_pdf}")

# Define experiment types and file patterns
# NOTE(review): the experiment runners above name their output
# "game_results_<exp_name>.csv" with exp_name ending in "_temp0.8", which these
# patterns do not match — confirm the files are renamed before plotting.
CATEGORIES = {
    "same_story_4_agents": "game_results_same_story_*_ag4_ro5_end10_mult1.5.csv",
    "same_story_16_agents": "game_results_same_story_*_ag16_ro5_end10_mult1.5.csv",
    "same_story_32_agents": "game_results_same_story_*_ag32_ro5_end10_mult1.5.csv",
    "different_story_16_agents": "game_results_different_story_ag16_ro5_end10_mult1.5.csv",
    "bad_apple_16_agents": "game_results_bad_apple_*_ag16_ro5_end10_mult1.5.csv"
}

for category, pattern in CATEGORIES.items():
    if "different_story" in category:
        # Different story -> Cumulative Payoff, one figure per results file
        csv_files = glob.glob(pattern)
        for csv_file in csv_files:
            # plot_violin always writes PDF data, so the file name must end in
            # ".pdf" (it previously ended in ".jpg").
            output_file = csv_file.replace("game_results", "cumulative_payoffs").replace(".csv", ".pdf")
            df = preprocess_data(pd.read_csv(csv_file), "CumulativePayoff")

            title = "Heterogenous Experiment"

            plot_violin(df, "CumulativePayoff", title, output_file)
    else:
        # Same story & bad apple -> Collaboration Score, all matching files merged
        df = load_csv_files(pattern)
        df = preprocess_data(df, "CollaborationScore")
        if df is not None:
            output_file = f"{category}_collaboration_scores.pdf"

            if "bad_apple" in category:
                title = "Robustness"
            else:
                title = "Homogenous Experiment"

            plot_violin(df, "CollaborationScore", title, output_file)

# 2. Scaling Experiment Visualization

Plots the mean collaboration score across agent sizes to analyze scaling effects in Homogenous Experiment

Run:



In [None]:
# Define agent sizes used in the experiment
agent_sizes = [4, 16, 32]

# Define color gradients for baseline (blue) and meaningful stories (pink)
# process_category hands these out in order, one shade per story, so there must
# be at least as many shades as stories in each group (4 blue, 8 pink here).
BLUE_SHADES = ["#87CEFA", "#4682B4", "#4169E1", "#00008B"]  # Light to Dark Blue
PINK_SHADES = ["#FFB3E6", "#FF99CC", "#FF66B3", "#FF4D9E", "#F02278", "#D81B60", "#B83B7D", "#B22272"]  # Light to Dark Pink

# Define file categories for different temperatures
# Maps a temperature label to {agent count: glob pattern of result files}.
# NOTE(review): these patterns end in "_temp_0.6.csv", whereas the experiment
# runners write files ending in "_temp0.8.csv" — confirm the expected file names.
CATEGORY_GROUPS = {
    "temp_0.6": {
        4: "game_results_same_story_*_ag4_ro5_end10_mult1.5_temp_0.6.csv",
        16: "game_results_same_story_*_ag16_ro5_end10_mult1.5_temp_0.6.csv",
        32: "game_results_same_story_*_ag32_ro5_end10_mult1.5_temp_0.6.csv",
    },
}

def process_category(temp_label: str, CATEGORIES: Dict[int, str], reference_agent_count: int = 4) -> None:
    """Plot the mean collaboration score per story across agent sizes.

    For every ``agent count -> glob pattern`` entry in ``CATEGORIES`` the
    matching CSV files are concatenated and the mean ``CollaborationScore``
    per ``PromptType`` is computed. Each story is then drawn as a dashed line
    connecting its mean score at every entry of the module-level
    ``agent_sizes`` list, and the figure is saved as
    ``scaling_experiment_collab_score_<temp_label>.pdf`` and shown.

    Args:
        temp_label: Label of the temperature condition; used in log messages
            and in the output file name.
        CATEGORIES: Mapping from agent count to the glob pattern of the
            result files for that agent count.
        reference_agent_count: Agent count whose ascending score order fixes
            the story order (and therefore colours and legend order). If no
            data was loaded for it, the first agent count that does have
            data is used instead.

    Returns:
        None. When no pattern matches any file, a message is printed and no
        figure is produced.
    """
    baseline_stories = {"noinstruct", "nsCarrot", "nsPlumber", "maxreward"}
    story_scores = {}

    # Load CSV data and extract mean collaboration scores per story
    for agent_count, pattern in CATEGORIES.items():
        files = glob.glob(pattern)
        if not files:
            print(f"No files found for pattern: {pattern}")
            continue

        df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
        story_means = df.groupby("PromptType")["CollaborationScore"].mean()
        story_scores[agent_count] = story_means.to_dict()

    # Skipped patterns must not crash the run: bail out if nothing was loaded.
    if not story_scores:
        print(f"No data loaded for {temp_label}; skipping plot.")
        return

    # The reference agent count may itself be among the skipped ones.
    if reference_agent_count not in story_scores:
        fallback_count = next(iter(story_scores))
        print(
            f"No data for N={reference_agent_count} in {temp_label}; "
            f"ordering stories by N={fallback_count} instead."
        )
        reference_agent_count = fallback_count

    # Story order = ascending mean score at the reference agent count
    starting_order = sorted(story_scores[reference_agent_count].items(), key=lambda item: item[1])
    starting_stories = [story for story, _ in starting_order]

    print(f"Processing {temp_label}: starting_stories = {starting_stories}")

    # Assign progressively darker shades in story order: blue for baseline
    # stories, pink for the rest. The modulo wraps around the palette so that
    # more stories than shades reuses colours instead of raising IndexError.
    story_colors = {}
    blue_idx, pink_idx = 0, 0
    for story in starting_stories:
        if story in baseline_stories:
            story_colors[story] = BLUE_SHADES[blue_idx % len(BLUE_SHADES)]
            blue_idx += 1
        else:
            story_colors[story] = PINK_SHADES[pink_idx % len(PINK_SHADES)]
            pink_idx += 1

    plt.figure(figsize=(12, 6))

    # Plot each story's progression across agent sizes
    # (x = mean score, y = position of the agent size in agent_sizes)
    for story, color in story_colors.items():
        positions = []

        for y_pos, agent_count in enumerate(agent_sizes):
            scores = story_scores.get(agent_count)
            if scores is None or story not in scores:
                continue

            x_pos = scores[story]
            positions.append((x_pos, y_pos))
            plt.scatter(
                x_pos,
                y_pos,
                s=70,
                facecolors="none",
                edgecolors=color,
                linewidths=1.5,
                label=story if agent_count == reference_agent_count else "",
            )

        # A connecting line only makes sense with at least two points.
        if len(positions) > 1:
            x_vals, y_vals = zip(*positions)
            plt.plot(x_vals, y_vals, linestyle="dashed", color=color, alpha=0.7)

    # Explicit legend handles keep the legend in reference-score order.
    legend_handles = [
        mlines.Line2D(
            [], [], marker="o", linestyle="None", markersize=8, color=story_colors[story], label=story
        )
        for story in starting_stories
    ]

    # Customizing plot
    plt.xlabel("Mean Collaboration Score", fontsize=18, labelpad=15)
    plt.ylabel("Agent Size", fontsize=18, labelpad=15)
    plt.yticks(range(len(agent_sizes)), [f"N = {n}" for n in agent_sizes], fontsize=14, weight="bold")
    plt.title("Scaling Experiment", fontsize=20, weight="bold", pad=20)
    plt.grid(axis="y", linestyle="dotted")

    plt.legend(
        handles=legend_handles, title="Story", bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=12
    )

    plt.tight_layout()

    # Save as PDF
    filename_base = f"scaling_experiment_collab_score_{temp_label}"
    plt.savefig(f"{filename_base}.pdf", bbox_inches="tight", format="pdf")

    print(f"Scaling experiment figures saved as {filename_base}.pdf")

    plt.show()

# Generate one scaling plot per temperature condition: each CATEGORY_GROUPS
# entry maps a temperature label to its {agent count: glob pattern} dict.
for temp_label, category_dict in CATEGORY_GROUPS.items():
    process_category(temp_label, category_dict)