# 1. API Definitions/Setup

In [None]:
!pip install -U "autogen-agentchat" "autogen-ext[openai,azure]"
# install for colab

In [None]:
import os
from openai import OpenAI
import json
import collections

# for agent environment
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.ui import Console
from autogen_ext.models.openai import OpenAIChatCompletionClient
from dotenv import load_dotenv
load_dotenv()

# Core Variables
# Sampling temperature; passed as `temperature` to every client built by get_client().
TEMP = 1
# standalone until 5/14
#models = ["openai/gpt-4o-mini", "anthropic/claude-3.5-haiku", "google/gemini-2.0-flash-exp:free", "deepseek/deepseek-chat-v3-0324:free", "meta-llama/llama-4-scout:free", "mistralai/mixtral-8x7b-instruct"]
# since 5/14
# Model identifiers used to build agent ensembles (indexed positionally further down the notebook).
models = ["openai/gpt-4o-mini", "anthropic/claude-3.5-haiku", "deepseek/deepseek-chat-v3-0324", "meta-llama/llama-4-scout", "mistralai/mixtral-8x7b-instruct"] # skips gemini, potentially a structured-output flag issue
model = models[0]  # default model

# Resolve the OpenRouter API key: try the Colab secret store first and fall back to the
# process environment when google.colab cannot be imported (i.e. a local run).
# NOTE(review): API_KEY stays None if the variable is not set locally; nothing here checks for that.
API_KEY = None
try:
    # Google Colab environment
    from google.colab import userdata
    API_KEY = userdata.get('OPENROUTER_API_KEY')  # Colab secret name
except ImportError:
    # Local environment
    import os
    API_KEY = os.environ.get("OPENROUTER_API_KEY")  # Local environment variable

def get_client(model = model):
    """Return a chat-completion client for ``model`` routed through OpenRouter.

    The client uses the module-level ``API_KEY`` and ``TEMP`` and declares a
    minimal capability profile (no vision, no function calling, no JSON
    output, unknown family).

    Args:
        model: Model identifier; defaults to the module-level ``model`` as it
            was when this function was defined.

    Returns:
        A configured ``OpenAIChatCompletionClient``.
    """
    capabilities = {
        "vision": False,
        "function_calling": False,
        "json_output": False,
        "family": "unknown",
    }
    return OpenAIChatCompletionClient(
        api_key=API_KEY,
        base_url="https://openrouter.ai/api/v1",
        model=model,
        temperature=TEMP,
        model_info=capabilities,
    )


# Default client for the default model.
client = get_client()

# 3. Load questions

In [None]:
# NOTE: please use the forked https://github.com/sinemmy/greatest-good-benchmark (questions have unique identifiers)
import os 
import json

# Currently the question dir and the wisdom dir are expected to be separate folders in your GitHub folder.
# NOTE(review): QUESTION_DIR is not used to build QUESTION_JSON below, which is resolved
# relative to the current working directory.
QUESTION_DIR = './' # CHANGE HERE IF NEEDED


# Making DATA_DIR separate because they also have prompts (originally, but also including inverting the Likert scale)
#QUESTION_DATA_DIR = os.path.abspath(QUESTION_DIR + 'data/') 
#QUESTION_JSON = os.path.abspath('./data/GreatestGoodBenchmark.json')
#INVERTED_JSON = os.path.abspath('./data/GreatestGoodBenchmarkInverted.json')
# Absolute path of the question file loaded by GGB_Statements by default.
QUESTION_JSON = os.path.abspath('./data/OUSquestions.json')

from typing import Literal

class GGB_Statements:
    """In-memory view of a benchmark question file.

    The JSON file must contain a list of objects, each with the keys
    ``statement_id``, ``type`` and ``statement``. After construction:

    * ``json_data`` holds the raw list exactly as loaded.
    * ``questions`` maps each ``statement_id`` (as it appears in the file) to
      ``{'id': int, 'question': str, 'category': str}``.
    """

    def __init__(self, JSONpath=None):
        """Load the question file and build the lookup table.

        Args:
            JSONpath: Path of the JSON file. ``None`` (the default) resolves
                to the module-level ``QUESTION_JSON`` when the instance is
                created, so ``GGB_Statements()`` keeps loading the default
                question file.
        """
        if JSONpath is None:
            JSONpath = QUESTION_JSON
        self.json_data = self._load_json(JSONpath)
        self.questions = self._json_to_dict()

    def _load_json(self, path):
        """Return the parsed JSON content of ``path``."""
        # Explicit encoding so the result does not depend on the platform default.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _json_to_dict(self):
        """Index ``json_data`` by statement id, store it on ``self.questions`` and return it."""
        self.questions = {}
        for entry in self.json_data:
            statement_id = entry['statement_id']
            self.questions[statement_id] = {
                'id': int(statement_id),
                'question': entry['statement'],
                'category': entry['type'],
            }
        return self.questions

    def print_question(self, question_id, printout=False):
        """Return the statement text for ``question_id``; also print it when ``printout`` is true.

        Raises:
            KeyError: If ``question_id`` is not a key of ``questions``.
        """
        qstring = self.questions[question_id]['question']
        if printout:
            print(f'{qstring}')
        return qstring

    def get_questions_by_category(self, category: Literal["IH", "IB"], questions_only = False):
        """Return every question whose category equals ``category``.

        Args:
            category: Category label to match against each entry's ``'category'``.
            questions_only: When true, return only the statement strings;
                otherwise return the full ``{'id', 'question', 'category'}`` dicts.

        Returns:
            A list in the insertion order of ``questions``.
        """
        # Iterate the stored records (not the dict keys) and compare against
        # 'category', which is the key _json_to_dict actually writes.
        matches = [q for q in self.questions.values() if q['category'] == category]
        if questions_only:
            return [q['question'] for q in matches]
        return matches

    # get number of total questions
    def get_total_questions(self):
        """Return the number of entries in the raw JSON list."""
        return len(self.json_data)

    def get_question_by_index(self, index):
        """Return the raw JSON entry at position ``index``.

        Raises:
            IndexError: If ``index`` is negative or past the end of the list.
        """
        if index < 0 or index >= len(self.json_data):
            raise IndexError("Index out of range")
        return self.json_data[index]
    
# GGB Questions
# Loads the default question file (QUESTION_JSON) at this point.
Qs = GGB_Statements()
# GGB Inverted Questions
#InvertQs = GGB_Statements(INVERTED_JSON)

# Quick sanity check that the first raw entry is retrievable (the return value is not stored).
Qs.get_question_by_index(0)

# Look up the statement stored under id '1' without printing it from inside the helper.
sampleQ = Qs.print_question('1', printout=False)
#sampleInvert = InvertQs.print_question('101', printout=False)
print(f'\t Original Question: \n {sampleQ}') #\n \t Inverted Question: \n {sampleInvert}')
# note: the inversions are not perfect quite yet, but it's a start



In [None]:
from typing import Literal
import os
import json

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import LinearSegmentedColormap

from scipy.stats import gaussian_kde

import glob
from math import isnan

from matplotlib.patches import Rectangle

##########
# Load human dataset
DATA_CSV = os.path.abspath('./human_data/ous_filtered.csv')

# Load Human answers
h1 = pd.read_csv(DATA_CSV) #"human_data/ous_filtered.csv")
# Work on a copy so the derived columns below do not touch the raw frame h1.
h2 = h1.copy()
# Row-wise mean of the five IB item columns and of the four IH item columns.
# presumably each row is one respondent — verify against the CSV.
# A missing value in any item makes that row's mean NaN (plain `+` propagates NaN).
h2["IB"] = (h2["IB1"] + h2["IB2"] + h2["IB3"] + h2["IB4"] + h2["IB5"]) / 5
h2["IH"] = (h2["IH1"] + h2["IH2"] + h2["IH3"] + h2["IH4"]) / 4
# human_df is another name for the same DataFrame object as h2 (not a copy).
human_df = h2


# KDE plotting function
# def human_kde(human_df=h2, ax=None, alpha=1, colormap='Greys'):
#     # TODO: Not sure this is correct?
#     # Draw humans as KDE
#     smoothness=20
#     ib_vals = np.linspace(1, 7, smoothness)  # Smoother grid for IB
#     ih_vals = np.linspace(1, 7, smoothness)  # Smoother grid for IH
#     ib_grid, ih_grid = np.mgrid[1:7:(smoothness*1j), 1:7:(smoothness*1j)]
#     positions = np.vstack([ib_grid.ravel(), ih_grid.ravel()])
#     values = np.vstack([human_df['IB'], human_df['IH']])
#     #values = np.vstack([np.random.random(10000) * 3, np.random.random(10000) * 5])
#     kernel = gaussian_kde(values)
#     Z = np.reshape(kernel(positions).T, ib_vals.shape + ih_vals.shape)
#     if ax is None:
#         plt.imshow(np.rot90(Z), cmap=colormap, extent=[1, 7, 1, 7], alpha=alpha)
#     else:
#         ax.imshow(np.rot90(Z), cmap=colormap, extent=[1, 7, 1, 7], alpha=alpha)

# IH goes on the x-axis and IB on the y-axis so the figure is consistent with the paper
def human_kde(human_df=h2, ax=None, alpha=1, colormap='coolwarm'): #'Greys'):
    """Draw a 2-D Gaussian KDE of the human scores as an image.

    The density is estimated from the ``'IH'`` and ``'IB'`` columns of
    ``human_df`` and evaluated on a 20 x 20 grid spanning 1..7 on both axes,
    with IH along x and IB along y.

    Args:
        human_df: Table providing the ``'IH'`` and ``'IB'`` columns.
        ax: Axes to draw on; when ``None`` the image is drawn via ``plt.imshow``.
        alpha: Opacity passed to ``imshow``.
        colormap: Colormap passed to ``imshow``.
    """
    n_points = 20
    step = n_points * 1j  # a complex step makes mgrid produce n_points samples, endpoints included

    # First grid axis is IH, second is IB.
    grid_ih, grid_ib = np.mgrid[1:7:step, 1:7:step]
    sample_points = np.vstack([grid_ih.ravel(), grid_ib.ravel()])
    observations = np.vstack([human_df['IH'], human_df['IB']])

    estimator = gaussian_kde(observations)
    density = np.reshape(estimator(sample_points).T, (n_points, n_points))

    # Rotate so IH runs left-to-right and IB bottom-to-top under imshow's default origin;
    # extent is [left, right, bottom, top].
    draw = plt.imshow if ax is None else ax.imshow
    draw(np.rot90(density), cmap=colormap, extent=[1, 7, 1, 7], alpha=alpha)

In [None]:
# Test figure generation
# Sample 256 colours from the built-in Greys colormap, starting at 0.01 rather than 0.
colors = plt.cm.Greys(np.linspace(0.01,1, 256))
custom_grey = LinearSegmentedColormap.from_list('custom_grey', colors)
# Draw the human KDE with the custom grey map; no axes is passed, so it goes through plt.imshow.
human_kde(human_df=human_df, colormap=custom_grey)

# 4.3 Martin ring testing ground

In [None]:

from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.teams import RoundRobinGroupChat
from autogen_agentchat.ui import Console
import asyncio
import random
import matplotlib.pyplot as plt
import numpy as np
import re
import sys
from collections import defaultdict
from autogen_agentchat.conditions import MaxMessageTermination
import seaborn as sns
from matplotlib.patches import Rectangle
import logging
import statistics
import matplotlib.pyplot as plt
from helpers import get_prompt

def extract_answer_from_response(content):
    """Return the text between the first ``<ANSWER>`` tag and the next ``</ANSWER>`` tag.

    Surrounding whitespace is stripped so that ``"<ANSWER> 5 </ANSWER>"``
    yields ``"5"``.

    Args:
        content: The agent's response. Anything that is not a ``str`` is
            treated as containing no answer.

    Returns:
        The extracted answer, or the sentinel
        ``"No answer found in the agent's response."`` when there is no
        opening tag, or no closing tag after it.
    """
    not_found = "No answer found in the agent's response."
    if not isinstance(content, str):
        return not_found

    open_tag = "<ANSWER>"
    close_tag = "</ANSWER>"

    open_at = content.find(open_tag)
    if open_at == -1:
        return not_found
    answer_start = open_at + len(open_tag)

    # Search for the closing tag only after the opening one, so a stray
    # "</ANSWER>" earlier in the text cannot produce an empty or garbled slice.
    answer_end = content.find(close_tag, answer_start)
    if answer_end == -1:
        return not_found

    return content[answer_start:answer_end].strip()


def clean_data(data_dict, placeholder="Miss"):
    """Return a copy of ``data_dict`` with missing answers replaced by ``placeholder``.

    A value counts as missing when its string form contains the substring
    ``"No"``. All other values are kept unchanged, and the input is not
    modified.
    """
    cleaned = {}
    for key, entries in data_dict.items():
        row = []
        for entry in entries:
            is_missing = "No" in str(entry)
            row.append(placeholder if is_missing else entry)
        cleaned[key] = row
    return cleaned

def process_label(label):
    """Format an agent name as a tick label.

    A leading ``'agent_'`` is removed, then the first remaining underscore
    (if any) is replaced by a line break. Labels with no underscore left are
    returned as they are after the prefix removal.
    """
    prefix = 'agent_'
    trimmed = label[len(prefix):] if label.startswith(prefix) else label

    # Only the first underscore becomes a line break; later ones are kept.
    head, separator, tail = trimmed.partition('_')
    if not separator:
        return trimmed
    return head + '\n' + tail

# plot convergence pattern for one iteration of loops
"""
def plot_polished_answers(model_answers, iteration_index, model_ensemble, agents):
   # Plot answers for a single iteration and return the figure and axes.
    import seaborn as sns
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.patches import Rectangle

    sns.set(style='whitegrid', font_scale=1.2)

    # Enforce consistent model order based on model_ensemble
    #modelslist = [m['model'] for m in model_ensemble]
    modelslist = [m.name for m in agents] # preserves shuffle order and should work for homogeneous as well

    # ✨ Apply line breaks in model names
    #wrapped_models = [model.replace('/', '\n') for model in modelslist]
    wrapped_models = [m.name.replace('/', '\n') for m in agents] # to keep using shuffled list

    # deubg:
    print("Models in agents list:")
    print(modelslist)
    print("Keys in model_answers:")
    print(list(model_answers.keys()))

    missing_keys = [model for model in modelslist if model not in model_answers]
    if missing_keys:
        print(f"❌ These models are missing from model_answers: {missing_keys}")



    max_loops = max(max(len(v) for v in model_answers.values()), 1)
    fig, ax = plt.subplots(figsize=(max_loops * 1.5, len(modelslist) * 1.2))

    answer_colors = {
        '1': '#5e3c99',
        '2': '#1f78b4',
        '3': '#a6cee3',
        '4': '#b2df8a',
        '5': '#fdbf6f',
        '6': '#ff7f00',
        '7': '#e31a1c',
        'No data': 'lightgray',
    }

    for i, model in enumerate(modelslist):
        for j in range(max_loops):
            answer = model_answers[model][j] if j < len(model_answers[model]) else 'No data'
            label = f"{answer}" if answer != "No data" else "No data"
            bg_color = answer_colors.get(answer, 'lightgray')
            rect = Rectangle((j - 0.5, i - 0.5), 1, 1,
                             facecolor=bg_color, linewidth=2, alpha=0.7)
            ax.add_patch(rect)
            ax.text(j, i, label, ha='center', va='center', fontsize=12,
                    color='black' if answer != "No data" else 'dimgray', weight='bold')

    ax.set_xticks(np.arange(max_loops))
    ax.set_xticklabels([f"Loop {i+1}" for i in range(max_loops)],
                       rotation=45, ha='right', fontsize=9)

    ax.set_yticks(np.arange(len(wrapped_models)))
    ax.set_yticklabels(wrapped_models, fontsize=9)  # ✨ Now uses wrapped model names

    ax.set_title(f"Model Responses – Iteration {iteration_index + 1}", fontsize=15, pad=12)
    ax.set_xlim(-0.5, max_loops - 0.5)
    ax.set_ylim(-0.5, len(modelslist) - 0.5)
    ax.invert_yaxis()
    """
def plot_polished_answers(model_answers, iteration_index, model_ensemble, agents, agent_map): 
    """
    Plot the answers of a single iteration as a colored grid and return the figure and axes.

    Each key of ``model_answers`` becomes one row (in the mapping's own order)
    and each list position becomes one "Loop" column. Cells show the answer
    text on a background color chosen per answer value.

    Args:
        model_answers (dict): Maps a row key (agent name) to its list of answers,
            one per loop. Shorter lists are padded with 'Miss' cells.
        iteration_index (int): Zero-based iteration number; shown one-based in the title.
        model_ensemble: Not used for drawing; accepted so existing callers keep working.
        agents: Not used for drawing; accepted so existing callers keep working.
        agent_map: Not used for drawing; accepted so existing callers keep working.

    Returns:
        tuple: ``(fig, ax)`` of the newly created figure.
    """
    import seaborn as sns
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.patches import Rectangle

    sns.set(style='whitegrid', font_scale=1.2)

    agent_names = list(model_answers.keys())  # agent_xxx keys, one row each

    # Determine grid dimensions. Both are clamped to at least 1 so that an empty
    # model_answers produces an empty plot instead of a ValueError from max() /
    # an invalid zero-height figure.
    max_loops = max(max((len(v) for v in model_answers.values()), default=0), 1)
    n_rows = max(len(agent_names), 1)
    fig, ax = plt.subplots(figsize=(max_loops * 1.5, n_rows * 1.2))

    # Background color per answer value; anything else falls back to light gray
    answer_colors = {
        '1': '#5e3c99',
        '2': '#1f78b4',
        '3': '#a6cee3',
        '4': '#b2df8a',
        '5': '#fdbf6f',
        '6': '#ff7f00',
        '7': '#e31a1c',
        'Miss': 'lightgray',
    }

    # One unit square per (loop, agent) cell, centered on integer coordinates
    for i, agent_name in enumerate(agent_names):
        answers = model_answers.get(agent_name, [])
        for j in range(max_loops):
            answer = answers[j] if j < len(answers) else 'Miss'
            label = str(answer)
            color = answer_colors.get(answer, 'lightgray')
            rect = Rectangle((j - 0.5, i - 0.5), 1, 1,
                             facecolor=color, linewidth=2, alpha=0.7)
            ax.add_patch(rect)
            ax.text(j, i, label, ha='center', va='center', fontsize=12,
                    color='black' if answer != "Miss" else 'dimgray', weight='bold')

    # Set axis ticks and labels
    ax.set_xticks(np.arange(max_loops))
    ax.set_xticklabels([f"Loop {i+1}" for i in range(max_loops)],
                       rotation=45, ha='right', fontsize=9)

    # Trailing newline after each processed label, as in the tick labels used so far
    processed_labels = [f"{process_label(agent)}\n" for agent in agent_names]
    ax.set_yticks(np.arange(len(processed_labels)))
    ax.set_yticklabels(processed_labels, fontsize=9)

    ax.set_title(f"Model Responses – Iteration {iteration_index + 1}", fontsize=15, pad=12)
    ax.set_xlim(-0.5, max_loops - 0.5)
    ax.set_ylim(-0.5, n_rows - 0.5)
    ax.invert_yaxis()  # first agent at the top
    sns.despine(ax=ax, left=True, bottom=True)

    #plt.tight_layout() # has some layout issues squishing x bins
    plt.subplots_adjust(left=0.3, bottom=0.2, top=0.9)
    return fig, ax



import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sns
import math




async def run_round_robin_chat(system_message, model_ensemble, task, agents, shuffle=False, secret=False):
    """
    Run one round-robin group chat between freshly created agents and collect their answers.

    One ``AssistantAgent`` is created per requested copy of each model. The
    number of discussion rounds comes from the module-level
    ``N_convergence_loops``. With ``secret=True`` the first round is run
    "silently": every agent answers the task on its own, and the collected
    answers are appended to the task before the group chat starts (which then
    runs one round fewer).

    Args:
        system_message (str): System prompt given to every agent.
        model_ensemble (list): List of dicts, each with a 'model' key (model id)
            and a 'number' key (how many agents to create for that model).
        task (str): The initial task or message to start the chat.
        agents: Ignored — a new agent list is always built here. Kept so
            existing callers keep working.
        shuffle (bool): Whether to shuffle the agent order. Defaults to False.
        secret (bool): Whether to run the silent first round. Defaults to False.

    Returns:
        tuple: ``(model_answers, agent_map, agents)`` where ``model_answers``
        maps each agent name to its list of extracted answers (in message
        order), ``agent_map`` maps each agent name to its model id, and
        ``agents`` is the list of agents in the order they spoke.
    """
    # Always start from fresh agents so nothing carries over between calls
    agents = []
    model_answers = defaultdict(list)  # answers keyed by agent name
    agent_map = {}                     # agent name -> model id

    logging.info(f"system prompt = {system_message}")
    logging.info(f"task= {task}")
    print(f"system prompt = \n {system_message}")
    print(f'task= \n {task}')

    for i, model_data in enumerate(model_ensemble):
        for j in range(model_data['number']):
            model = model_data['model']
            model_text = re.sub(r'\W+', '_', model)  # runs of non-word characters become '_'
            # NOTE(review): i + j is only unique as long as a model id appears once in model_ensemble
            agent_name = f"agent_{model_text}_{i + j}"

            agent = AssistantAgent(
                name=agent_name,
                model_client=get_client(model),  # Use your client defined previously
                system_message=system_message,
            )
            agent_map[agent_name] = model
            agents.append(agent)

    # Shuffle agents if specified
    if shuffle:
        random.shuffle(agents) # for 90 calls (even from same starting list) is likely to produce 63 unique orders, but will only see about 13 double repeats, and ~4 triple repeats
        print(f'\n shuffled agents \n')
    logging.info(f"# of agents:  {len(agents)}")
    print("# of agents: ", len(agents))

    # implementation for secret first round
    N_convergence_loops_loc = N_convergence_loops
    if secret:
        # Silent round: Each agent responds without seeing others' responses
        silent_responses = {}
        for agent in agents:
            result = await agent.run(task=task)  # Each agent responds independently
            for message in result.messages:
                if message.source != "user":  # Exclude user messages
                    silent_responses[message.source] = message.content
                    logging.info(f"First round response from {message.source}: {message.content}")
                    print(f"First round response from {message.source}: {message.content}")
                    # Store first-round answers under the agent name, the same key the
                    # group-chat rounds use below, so each agent has a single answer list.
                    answer = extract_answer_from_response(message.content)
                    model_answers[message.source].append(answer)

        # Share silent responses with all agents at the start of the second round
        shared_responses = "\n".join([f"{name}: {response}" for name, response in silent_responses.items()])
        task_with_responses = f"{task}\n\nResponses from the first round:\n{shared_responses}"
        N_convergence_loops_loc = N_convergence_loops - 1 # since one used already for silent round
        task = task_with_responses # overwrite with messages from silent round

    # Create RoundRobinGroupChat with termination condition:
    # one message per agent per loop, plus one for the initial task message
    team = RoundRobinGroupChat(
        agents,
        termination_condition=MaxMessageTermination((N_convergence_loops_loc * len(agents)) + 1),
    )

    # Run the chat and print the conversation
    result = await Console(team.run_stream(task=task))
    logging.info(f"{result}")
    print(result)

    # Extract answers and group them by agent name (message.source is the unique agent name)
    for message in result.messages:
        if message.source != "user":
            answer = extract_answer_from_response(message.content)
            model_answers[message.source].append(answer)

    return model_answers, agent_map, agents



async def main():
    """
    Run all discussion iterations for the current question, then save and plot the results.

    Reads its configuration from module-level names set by the driver code:
    ``N_iterations_per_question``, ``question_idx``, ``system_message``,
    ``model_ensemble``, ``task``, ``shuffle``, ``secret``, ``question_set``,
    ``data_filepath``, ``plot_folder`` and ``plot_filename``.

    For every iteration a fresh round-robin chat is run and its answers are
    drawn with ``plot_polished_answers``. The last answer of each agent feeds
    the per-iteration mean and standard deviation. Afterwards the function

    * writes ``{"average_mean", "propagated_uncertainty"}`` as JSON to ``data_filepath``,
    * renders all per-iteration figures into one grid image and saves it as
      ``convergenceplots_<plot_filename>`` inside ``plot_folder``.

    Per-iteration figures are closed once they are part of the grid, so only
    the combined figure remains open when ``plt.show()`` is called.
    """
    import io
    from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
    from PIL import Image

    agentmean_values = []  # mean of the final answers, one entry per iteration
    agentstddev_values = []  # std dev of the final answers, one entry per iteration
    figures = []  # per-iteration convergence figures
    agents = []  # replaced by run_round_robin_chat on every iteration

    for it in range(N_iterations_per_question):
        logging.info(f"\n\nDiscussion iteration index for question {question_idx} = {it}\n\n")
        print(f"\n\n Discussion iteration index for question {question_idx} = {it} \n\n")

        # Agents are recreated every iteration, so there should be no carryover.
        # The secret flag is forwarded so the run matches what the output filenames record.
        model_answers, agent_map, agents = await run_round_robin_chat(
            system_message, model_ensemble, agents=agents, task=task, shuffle=shuffle, secret=secret
        )
        logging.info(f"Final answers by model: {model_answers}")
        print("Final answers by model:", model_answers)

        cleaned_answers = clean_data(model_answers)

        # Collect the figure for the combined grid below
        fig_it, _ = plot_polished_answers(cleaned_answers, iteration_index=it, model_ensemble=model_ensemble, agents=agents, agent_map=agent_map)
        figures.append(fig_it)

        # Last answer of each agent; anything that is not a plain digit string is skipped
        final_numeric_answers = []
        for answers in model_answers.values():
            if answers:
                last = answers[-1]
                if str(last).isdigit():
                    final_numeric_answers.append(int(last))

        # NOTE(review): an iteration without any numeric final answer contributes a mean of 0,
        # which pulls the average computed below towards 0 — confirm this is intended.
        if final_numeric_answers:
            iteration_answer_agentmean = statistics.mean(final_numeric_answers)
            iteration_answer_agentstddev = statistics.stdev(final_numeric_answers) if len(final_numeric_answers) > 1 else 0
        else:
            iteration_answer_agentmean = 0
            iteration_answer_agentstddev = 0

        agentmean_values.append(iteration_answer_agentmean)
        agentstddev_values.append(iteration_answer_agentstddev)

    #==========
    # Average of the per-iteration means of the final-round answers for the current question
    average_mean = sum(agentmean_values) / len(agentmean_values)
    # Propagate the per-iteration std devs through the average: (1/n) * sqrt(sum(sd^2))
    n = len(agentstddev_values)
    sum_of_variances = sum(sd**2 for sd in agentstddev_values)
    variance_of_mean = sum_of_variances / (n ** 2)
    propagated_uncertainty = math.sqrt(variance_of_mean)

    result = {
        "average_mean": average_mean,
        "propagated_uncertainty": propagated_uncertainty
    }

    # The content written is JSON, whatever extension data_filepath carries
    with open(data_filepath, "w") as f:
        json.dump(result, f, indent=4)

    print(result)

    #==========
    # Plotting convergence plots for current question :

    question_title = f"{next((key for key, values in question_set.items() if question_idx in values), 'Unknown Category')} Question {question_idx}"

    # Five plots per row; at least two rows (the original 2x5 layout), more when
    # there are more than ten iterations so every figure gets a cell.
    cols = 5
    rows = max(2, math.ceil(len(figures) / cols))
    fig, axes = plt.subplots(rows, cols, figsize=(20, 4 * rows), gridspec_kw={'hspace': 0.25}) # make distance narrower between plot rows
    axes = axes.flatten()

    # Font scale factors
    base_font_scale = 2.0
    extra_boost = 1.25
    font_scale = base_font_scale * extra_boost

    for i, fig_i in enumerate(figures):
        for sub_ax in fig_i.axes:
            # --- Replace title with 'Iteration X' ---
            sub_ax.set_title(f"Iteration {i + 1}", fontsize=12 * font_scale)

            # --- Scale the x-axis label ---
            sub_ax.xaxis.label.set_fontsize(sub_ax.xaxis.label.get_fontsize() * font_scale)

            # --- Force line break in y-axis label --- may not be active
            wrapped_ylabel = sub_ax.get_ylabel().replace('/', '\n')
            sub_ax.set_ylabel(wrapped_ylabel, fontsize=sub_ax.yaxis.label.get_fontsize() * font_scale)

            # --- Scale tick labels and reduce x-tick angle ---
            for label in sub_ax.get_xticklabels():
                label.set_fontsize(label.get_fontsize() * font_scale)
                label.set_rotation(30)

            for label in sub_ax.get_yticklabels():
                label.set_fontsize(label.get_fontsize() * font_scale)

            # --- Scale text annotations (e.g., numbers in boxes) ---
            for text in sub_ax.texts:
                text.set_fontsize(text.get_fontsize() * font_scale)

        # Render the figure to a PNG in memory and read it back as an image array.
        # Kept from the original: creating the Agg canvas for the figure before saving.
        FigureCanvas(fig_i)
        buf = io.BytesIO()
        fig_i.savefig(buf, format='png', bbox_inches='tight', dpi=150)  # crucial for capturing labels
        buf.seek(0)
        image = np.array(Image.open(buf))

        # The figure now lives on as an image; close it so figures do not pile up across questions
        plt.close(fig_i)

        # Place in combined plot grid
        axes[i].imshow(image)
        axes[i].axis('off')

    # Turn off unused axes
    for j in range(len(figures), rows * cols):
        axes[j].axis('off')

    # Global title
    fig.suptitle(f"Agent Ensemble Convergence Plots for {question_title}", fontsize=22, y=0.96) # 1.02 leaves one empty line, 0.9 overlaps with subplot title, 1.0 still almost leaves empty line

    plt.subplots_adjust(top=0.92, bottom=0.1, left=0.08, right=0.92)  # Increase left and top margins
    plot_convfilename = f'convergenceplots_{plot_filename}'
    plot_convfilepath = os.path.join(plot_folder, plot_convfilename)
    fig.savefig(f'{plot_convfilepath}', dpi=300)
    # Report the path that was actually written
    print(f"plot saved to {plot_convfilepath}")
    plt.show()
    

# open item to compare the different exp setups, need to then calculate std dev of mean values over iterations

# Execution
# Both values are read as globals: N_iterations_per_question by main(),
# N_convergence_loops by run_round_robin_chat().
N_iterations_per_question = 10  # default 10 for enough statistics to understand variability of each question
N_convergence_loops = 3  # default 3, for one iteration for one question

group_chat = True # always for roundrobin
# to do: implement looping on reasoning and secret, then invert questions and implement invert

# Stops execution at this point: nothing below this line runs while this call is in place.
sys.exit() # circuit breaker

# Question ids grouped by category; main() uses the group name for the plot title.
question_set = {
    #"IH questions": [1], # debug
    "IH questions": [1, 2, 3, 4],
    "IB questions": [5, 6, 7, 8, 9]
}

# set up logging and plot paths
# Create the 'logs' and 'plots' folders if they don't exist
log_folder = os.path.abspath("./logs/")
plot_folder = os.path.abspath("./plots/")
data_folder = os.path.abspath("./data/")
os.makedirs(log_folder, exist_ok=True)
os.makedirs(plot_folder, exist_ok=True)
# data folder already exists (it is not created here)

# Experiment flags; all three are written into the output filenames further down.
reasoning = True
secret = False
inverted = False # not ready yet- need to write inverted questions for ous manually; does not impact system_message/prompt, only different task including answer scale and question
#secret = True
#reasoning = True

import itertools

# Define the two binary flags
flag_values = [True, False]

# Initial values only: the loop below reassigns both shuffle and homogeneity on every pass.
shuffle = False  # for now keep false to maintain order
homogeneity = False

""" # rationale and secret version:
    model_ensemble = [
            {"model": models[0], "number": 1},  # here only one model per model flavor
            {"model": models[1], "number": 1},
            {"model": models[2], "number": 1}, 
            {"model": models[3], "number": 1}, 
            {"model": models[4], "number": 1},
        ]
"""

# Loop over all combinations of reasoning and secret
#for reasoning, secret in itertools.product(flag_values, repeat=2):
for shuffle, homogeneity in itertools.product(flag_values, repeat=2):

    print(f"homogeneity: {homogeneity}, shuffle: {shuffle}, secret: {secret}")

    N_iterations = 0
    model_ensemble = [] # init empty

    if homogeneity == True and shuffle == True:
        continue # no need to shuffle homogeneous ensemble
    if homogeneity == False: # for both shuffle values, True covered later in shuffle once ensemble defined- shuffle for every iteration without studying specific order, just wash out
        for idx in range(5):
            model_ensemble.append( {"model": models[idx], "number": 1}) # here only one model per model flavor
        N_iterations = 1 # only run once per shuffle value
    if homogeneity == True and shuffle == False:
        N_iterations = 5 # default 5, in each iteration define new homogeneous ensemble

    # geneity and shuffle execution loop    
    for it_idx in range(N_iterations):

        if homogeneity == True and shuffle == False:
            model_ensemble = [] # reset
            model_ensemble.append( {"model": models[it_idx], "number": 5}) # homogeneous

        print(f"model_ensemble: {model_ensemble}")
                    
        system_message = get_prompt() # use default values in helpers get_prompt 

        for question_idx in sum(question_set.values(), []):    
            # debug

            # Generate a logfile name based on the variable values
            filename = f"question_{question_idx}_homogeneity_{homogeneity}_modeltypeidx_{it_idx}_shuffle_{shuffle}_reasoning_{reasoning}_inverted_{inverted}_secret_{secret}"
            print(f"Filename: {filename}")
            log_filename = f"{filename}.log"
            log_filepath = os.path.join(log_folder, log_filename)
            plot_filename = f"{filename}.png"
            plot_filepath = os.path.join(plot_folder, plot_filename)
            data_filename = f"{filename}.csv"
            data_filepath = os.path.join(data_folder, data_filename)

            print(f"{Qs.print_question(str(question_idx), printout=False)}")
            
            # Configure logging
            logging.basicConfig(
                filename=log_filepath,
                level=logging.INFO,
                format="%(asctime)s - %(levelname)s - %(message)s",
            )
            logging.info(f"Starting run for question={question_idx} with homogeneity={homogeneity}, shuffle={shuffle}, reasoning={reasoning}, inverted={inverted}, secret={secret}")
            logging.info(f"Logfile: {log_filepath}")

            #=========
            # execution

            task = "" # updated task creation not including Likert scale instructions
            if inverted == False:
                task = Qs.print_question(str(question_idx), printout=False)
            else: 
                task = InvertQs.print_question(str(question_idx) + 100, printout=False)


            agentmean_values = [] # stores aggregated values for all iterations
            agentstddev_values = []

            await main()


            # Remove all handlers associated with the root logger to not have open files
            for handler in logging.root.handlers[:]:
                logging.root.removeHandler(handler)



# 4.4 Load Data for Post-Processing

In [None]:
#----------
# Load base case switch data for each question and human data
#
# Plots the per-question human Likert statistics next to the model results of
# every (reasoning, secret) flag combination found in ./data and saves the
# figure to ./plots/likert_response_comparison.png.

import json
import os
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# === Load Human Likert Data ===
human_data = pd.read_csv("./human_data/ous_filtered.csv")  # Replace with your actual file path

ih_columns = ['IH1', 'IH2', 'IH3', 'IH4']
ib_columns = ['IB1', 'IB2', 'IB3', 'IB4', 'IB5']
question_columns = ih_columns + ib_columns
human_means = human_data[question_columns].mean()
human_stds = human_data[question_columns].std()
human_sems = human_stds / np.sqrt(len(human_data))
human_x = np.arange(1, len(question_columns) + 1)

# === Load JSON Flag Data ===
# The result files carry a .csv extension but are parsed as JSON below.
data_folder = "./data"
filename_pattern = re.compile(
    r"question_(\d+)_reasoning_(True|False)_inverted_(True|False)_secret_(True|False)\.csv"
)

grouped_data = defaultdict(list)
for filename in os.listdir(data_folder):
    match = filename_pattern.match(filename)
    if not match:
        continue
    if match.group(3) == "True":
        # Inverted runs were previously merged into the same (reasoning, secret)
        # series as the regular runs, giving duplicate x positions and mixed
        # data. Only non-inverted results are plotted here.
        continue
    question_idx = int(match.group(1))
    reasoning = match.group(2) == "True"
    secret = match.group(4) == "True"
    filepath = os.path.join(data_folder, filename)
    with open(filepath, "r") as f:
        data = json.load(f)
    avg = data.get("average_mean")
    sem = data.get("propagated_uncertainty")
    grouped_data[(reasoning, secret)].append((question_idx, avg, sem))

# === Define Color Map ===
# Keyed by (reasoning, secret).
color_map = {
    (True, True): "tab:blue",
    (True, False): "tab:green",
    (False, True): "tab:red",
    (False, False): "tab:orange",
}

# === Plot ===
fig = plt.figure(figsize=(12, 6))  # Capture figure object

# Human data with SD shading and SEM error bars
human_offset = -0.2  # shift left
plt.fill_between(human_x + human_offset, human_means - human_stds, human_means + human_stds,
                 color='lightgray', label='Human ± SD')
plt.errorbar(human_x + human_offset, human_means, yerr=human_sems, fmt='o', color='black',
             capsize=4, label='Human ± SEM')

# JSON data with offsets to avoid overlap
offsets = {
    (True, True): -0.1,
    (True, False): 0.0,
    (False, True): 0.1,
    (False, False): 0.2,
}

# Number of runs behind each stored SEM; used to convert SEM back to SD.
n_runs = 10

for (reasoning, secret), data_points in grouped_data.items():
    data_points.sort(key=lambda dp: dp[0])  # order by question index
    x_vals = np.array([dp[0] for dp in data_points])
    y_vals = np.array([dp[1] for dp in data_points])
    sems = np.array([dp[2] for dp in data_points])
    stds = sems * np.sqrt(n_runs)  # SD = SEM * sqrt(N)

    offset = offsets[(reasoning, secret)]
    x_offset = x_vals + offset

    plt.fill_between(x_offset, y_vals - stds, y_vals + stds,
                     color=color_map[(reasoning, secret)], alpha=0.2)
    plt.errorbar(x_offset, y_vals, yerr=sems, fmt='o', color=color_map[(reasoning, secret)],
                 capsize=4, label=f"reasoning={reasoning}, secret={secret}")

# === Final plot settings ===
plt.xticks(human_x, question_columns)
for tick in plt.gca().get_xticklabels():
    if "IH" in tick.get_text():
        tick.set_color("blue")
    else:
        tick.set_color("red")

# Separator between the IH (1-4) and IB (5-9) questions
plt.axvline(x=4.5, linestyle='--', color='gray', alpha=0.6)
plt.xlabel("Question")
plt.ylabel("Mean Response")
plt.title("Human vs Model Mean Responses with SD and SEM (Offset for Clarity)")
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()

# === Save and show ===
output_dir = "./plots"
os.makedirs(output_dir, exist_ok=True)
fig.savefig(os.path.join(output_dir, "likert_response_comparison.png"), dpi=300)
plt.show()




# 4.4.2 geneity and shuffle plots

In [None]:
import json
import os
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# --- Human Likert reference: per-question mean, SD and SEM ---
human_data = pd.read_csv("./human_data/ous_filtered.csv")  # Update path if needed

ih_columns = ['IH1', 'IH2', 'IH3', 'IH4']
ib_columns = ['IB1', 'IB2', 'IB3', 'IB4', 'IB5']
question_columns = ih_columns + ib_columns
human_means = human_data[question_columns].mean()
human_stds = human_data[question_columns].std()
human_sems = human_stds / np.sqrt(len(human_data))
human_x = np.arange(1, len(question_columns) + 1)

# --- Model results: one JSON payload per matching file in ./data ---
data_folder = "./data"
filename_pattern = re.compile(
    r"question_(\d+)_homogeneity_(True|False)_modeltypeidx_(\d+)_shuffle_(True|False)_reasoning_True_inverted_False_secret_False\.csv"
)

# (homogeneity, shuffle, modeltypeidx) -> [(question_idx, mean, sem), ...]
grouped_data = defaultdict(list)

for filename in os.listdir(data_folder):
    match = filename_pattern.match(filename)
    if not match:
        continue

    q_txt, homog_txt, model_txt, shuffle_txt = match.groups()
    question_idx = int(q_txt)
    homogeneity = homog_txt == "True"
    modeltypeidx = int(model_txt)
    shuffle = shuffle_txt == "True"

    with open(os.path.join(data_folder, filename), "r") as f:
        data = json.load(f)

    grouped_data[(homogeneity, shuffle, modeltypeidx)].append(
        (question_idx, data.get("average_mean"), data.get("propagated_uncertainty"))
    )

# --- Figure with the human baseline ---
fig, ax = plt.subplots(figsize=(14, 6))

human_offset = -0.2
human_positions = human_x + human_offset
ax.fill_between(human_positions, human_means - human_stds, human_means + human_stds,
                color='lightgray', label='Human ± SD')
ax.errorbar(human_positions, human_means, yerr=human_sems, fmt='o', color='black',
            capsize=4, label='Human ± SEM')

# --- Styling tables ---
model_colors = {
    0: '#e41a1c',  # Red
    1: '#377eb8',  # Blue
    2: '#4daf4a',  # Green
    3: '#984ea3',  # Purple
    4: '#ff7f00',  # Orange
}

hetero_colors = {
    False: '#4b4b4b',  # shuffle = False → Dark gray
    True: '#1b9e77',   # shuffle = True → Teal green
}

# Horizontal shifts: tuple keys for the heterogeneous (homogeneity, shuffle)
# series, integer keys for the homogeneous model indices.
offsets = {
    (False, False): 0.1,
    (False, True): 0.2,
    0: -0.1,
    1: -0.05,
    2: 0.0,
    3: 0.05,
    4: 0.1,
}


def _draw_model_series(axis, points, shift, color, fmt, facecolor, label):
    """Draw one series: a shaded ±SD band plus SEM error bars, shifted by *shift*."""
    points.sort()  # in place; tuples order by question index first
    xs = np.array([p[0] for p in points]) + shift
    ys = np.array([p[1] for p in points])
    sem = np.array([p[2] for p in points])
    sd = sem * np.sqrt(10)  # SEM -> SD with N = 10
    axis.fill_between(xs, ys - sd, ys + sd, alpha=0.2, color=color)
    axis.errorbar(xs, ys, yerr=sem, fmt=fmt, capsize=4,
                  color=color, markersize=6, markerfacecolor=facecolor, label=label)


# --- Heterogeneous ensembles (stored under modeltypeidx 0), filled circles ---
for shuffle_val in (False, True):
    hetero_points = grouped_data.get((False, shuffle_val, 0), [])
    if hetero_points:
        hetero_color = hetero_colors[shuffle_val]
        _draw_model_series(ax, hetero_points, offsets[(False, shuffle_val)],
                           hetero_color, 'o', hetero_color,
                           f"Hetero / Shuffle={shuffle_val}")

# --- Homogeneous ensembles, one per model index, hollow diamonds ---
for modeltypeidx in range(5):
    homog_points = grouped_data.get((True, False, modeltypeidx), [])
    if homog_points:
        _draw_model_series(ax, homog_points, offsets[modeltypeidx],
                           model_colors[modeltypeidx], 'D', 'none',
                           f"Model {modeltypeidx} (Homog.)")

# --- Axes, labels and legend ---
ax.set_xticks(human_x)
ax.set_xticklabels(question_columns, rotation=30)
for tick in ax.get_xticklabels():
    if "IH" in tick.get_text():
        tick.set_color("blue")
    else:
        tick.set_color("red")

ax.axvline(x=4.5, linestyle='--', color='gray', alpha=0.6)  # IH | IB divider
ax.set_xlabel("Question")
ax.set_ylabel("Mean Response")
ax.set_title("Human vs Model Responses\n(Heterogeneous vs Homogeneous Ensembles)")
ax.grid(True, linestyle='--', alpha=0.6)
ax.legend(loc="lower right", fontsize=8)

plt.tight_layout()

# --- Persist and display ---
output_dir = "./plots"
os.makedirs(output_dir, exist_ok=True)
fig.savefig(os.path.join(output_dir, "geneity_shuffle_singlequestioncomparison_ous.png"), dpi=300)
plt.show()


# 4.5 Plot per single question plot

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Per-question mean ± error for every flag combination.
# `single_by_question` is not defined in this cell; it must already exist and
# provide the columns 'question_num', 'flag_combo', 'mean', 'std' and 'sem'.

use_std = True  # True: error bars from the 'std' column, False: from 'sem'
error_column = 'std' if use_std else 'sem'

# Shift question numbers by +1 for plotting (do NOT modify original DataFrame)
question_nums = sorted(single_by_question['question_num'].unique())
question_nums_plot = [q + 1 for q in question_nums]  # Shifted for display
flag_combos = sorted(single_by_question['flag_combo'].unique())
num_combos = len(flag_combos)

plt.figure(figsize=(15, 6))

width = 0.25
combo_colors = plt.cm.tab10(np.linspace(0, 1, num_combos))
positions = {}

for i, combo in enumerate(flag_combos):
    # Spread the combos symmetrically around each question's x position
    offset = (i - (num_combos - 1) / 2) * width
    positions[combo] = [q + offset for q in question_nums_plot]

    combo_data = single_by_question[single_by_question['flag_combo'] == combo]

    # Label the first point that is actually drawn. Labelling only idx == 0
    # dropped the combo from the legend whenever it had no data for the
    # first question.
    labelled = False

    for idx, q_num in enumerate(question_nums):
        q_data = combo_data[combo_data['question_num'] == q_num]
        if q_data.empty:
            continue

        mean_val = q_data['mean'].values[0]
        error_val = q_data[error_column].values[0]

        plt.errorbar(
            positions[combo][idx],
            mean_val,
            yerr=error_val,
            fmt='o',
            color=combo_colors[i],
            label="" if labelled else combo,
            capsize=3
        )
        labelled = True

# Update x-axis to show question numbers starting from 1
plt.xticks(question_nums_plot, labels=[str(q) for q in question_nums_plot], rotation=90)

# Color tick labels based on category
ax = plt.gca()
for tick in ax.get_xticklabels():
    question = int(tick.get_text())
    if question <= 4:  # Questions 1–4 are IH
        tick.set_color('blue')
    else:  # 5–9 are IB
        tick.set_color('red')

# Vertical line between IH and IB (after Q4, now at x=4.5)
plt.axvline(x=4.5, color='black', linestyle='--', alpha=0.7)

# Y-axis annotation position: just below the current upper y limit
y_min, y_max = plt.ylim()
y_text = y_max * 0.95

plt.text(2.0, y_text, 'IH Category', color='blue', ha='center', fontsize=12)
plt.text(7.0, y_text, 'IB Category', color='red', ha='center', fontsize=12)

plt.xlabel('Question Number')
plt.ylabel('Mean Value')
plt.title('Mean Values by Question Number and Flag Combination')
plt.legend(title="Flag Settings")
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# 4.6 Plot 2d IH IB data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import os
import json
import re
from collections import defaultdict

# 2D comparison: average IH response (x) against average IB response (y) for
# the human data and for every (reasoning, secret) flag combination.
# `color_map` is not defined in this cell; it comes from the section 4.4 cell,
# which therefore has to be run first.
# `flag_stats` and the human_* statistics computed here are read again by the
# section 4.7 cells.

# === Load Human Likert Data ===
human_data = pd.read_csv("./human_data/ous_filtered.csv")

ih_columns = ['IH1', 'IH2', 'IH3', 'IH4']
ib_columns = ['IB1', 'IB2', 'IB3', 'IB4', 'IB5']

# Compute human stats: first a per-row (axis=1) mean over the IH / IB
# columns, then mean, SD and SEM of those row means.
human_ih_means = human_data[ih_columns].mean(axis=1)
human_ib_means = human_data[ib_columns].mean(axis=1)

human_ih_avg = human_ih_means.mean()
human_ib_avg = human_ib_means.mean()

human_ih_std = human_ih_means.std()
human_ib_std = human_ib_means.std()

n_human = len(human_data)
human_ih_sem = human_ih_std / np.sqrt(n_human)
human_ib_sem = human_ib_std / np.sqrt(n_human)

# === Load JSON Flag Data ===
# The result files carry a .csv extension but are parsed as JSON below.
data_folder = "./data"
filename_pattern = re.compile(
    r"question_(\d+)_reasoning_(True|False)_inverted_(True|False)_secret_(True|False)\.csv"
)

grouped_data = defaultdict(list)
for filename in os.listdir(data_folder):
    match = filename_pattern.match(filename)
    if not match:
        continue
    if match.group(3) == "True":
        # Inverted runs were previously merged into the same (reasoning, secret)
        # group as the regular runs, mixing both into one average. Only
        # non-inverted results are aggregated here.
        continue
    question_idx = int(match.group(1))
    reasoning = match.group(2) == "True"
    secret = match.group(4) == "True"
    filepath = os.path.join(data_folder, filename)
    with open(filepath, "r") as f:
        data = json.load(f)
    avg = data.get("average_mean")
    sem = data.get("propagated_uncertainty")
    grouped_data[(reasoning, secret)].append((question_idx, avg, sem))

# === Compute Flag Combo IH/IB Averages, SEM, STD ===
flag_stats = {}
N_flag = 10  # as assumed in prior calculations

for flag_key, datapoints in grouped_data.items():
    datapoints.sort()
    # Questions 1-4 are IH, questions 5-9 are IB
    ih_vals = [avg for idx, avg, _ in datapoints if idx in range(1, 5)]
    ib_vals = [avg for idx, avg, _ in datapoints if idx in range(5, 10)]
    ih_sems = [sem for idx, _, sem in datapoints if idx in range(1, 5)]
    ib_sems = [sem for idx, _, sem in datapoints if idx in range(5, 10)]

    ih_mean = np.mean(ih_vals)
    ib_mean = np.mean(ib_vals)

    # Per-question SD is recovered as SEM * sqrt(N_flag); the category SD is
    # the root of the mean of the per-question variances.
    ih_std = np.sqrt(np.sum(np.square(np.array(ih_sems) * np.sqrt(N_flag))) / len(ih_sems))
    ib_std = np.sqrt(np.sum(np.square(np.array(ib_sems) * np.sqrt(N_flag))) / len(ib_sems))

    # Category SEM: category SD divided by sqrt(number of questions)
    ih_sem = ih_std / np.sqrt(len(ih_sems))
    ib_sem = ib_std / np.sqrt(len(ib_sems))
    print(f"flag_key: {flag_key}, ih_mean: {ih_mean}, ib_mean: {ib_mean}, ih_std: {ih_std}, ib_std: {ib_std}, ih_sem: {ih_sem}, ib_sem: {ib_sem}")
    flag_stats[flag_key] = {
        'ih_mean': ih_mean,
        'ib_mean': ib_mean,
        'ih_std': ih_std,
        'ib_std': ib_std,
        'ih_sem': ih_sem,
        'ib_sem': ib_sem,
    }

# === Plot 2D Mean IH vs IB with SEM and SD Ellipses ===
fig, ax = plt.subplots(figsize=(8, 8))

# Plot Human Data: SEM as error bars, SD as a translucent ellipse
ax.errorbar(human_ih_avg, human_ib_avg,
            xerr=human_ih_sem, yerr=human_ib_sem,
            fmt='o', color='black', capsize=5, label='Human ± SEM')

ellipse = patches.Ellipse((human_ih_avg, human_ib_avg),
                          width=2*human_ih_std, height=2*human_ib_std,
                          color='gray', alpha=0.2, label='Human ± SD')
ax.add_patch(ellipse)

# Plot Flag Combo Data
for (reasoning, secret), stats in flag_stats.items():
    ih_mean = stats['ih_mean']
    ib_mean = stats['ib_mean']
    ih_std = stats['ih_std']
    ib_std = stats['ib_std']
    ih_sem = stats['ih_sem']
    ib_sem = stats['ib_sem']

    color = color_map[(reasoning, secret)]
    label = f"reasoning={reasoning}, secret={secret}"

    ax.errorbar(ih_mean, ib_mean,
                xerr=ih_sem, yerr=ib_sem,
                fmt='o', color=color, capsize=5, label=f"{label} ± SEM")

    # Ellipse axes are full widths, hence 2 * SD
    ellipse = patches.Ellipse((ih_mean, ib_mean),
                              width=2*ih_std, height=2*ib_std,
                              color=color, alpha=0.2)
    ax.add_patch(ellipse)

# Plot settings
ax.set_xlabel("Average IH Response")
ax.set_ylabel("Average IB Response")
ax.set_title("Mean IH vs IB (Human vs Flag Combos) with SEM and SD")
ax.grid(True, linestyle='--', alpha=0.6)
ax.legend()
ax.set_aspect('equal', 'box')
ax.set_xlim(1, 7)
ax.set_ylim(1, 7)

# Save plot
output_dir = "./plots"
os.makedirs(output_dir, exist_ok=True)
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "ih_vs_ib_mean_scatter.png"), dpi=300)
plt.show()


# 4.6.1 geneity and shuffle outcomes

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import json
import re
from collections import defaultdict
import matplotlib.patches as patches

# 2D comparison: average IH response (x) against average IB response (y) for
# the human data and for the homogeneous / heterogeneous model ensembles.

# Load Human Likert Data
human_data = pd.read_csv("./human_data/ous_filtered.csv")
ih_columns = ['IH1', 'IH2', 'IH3', 'IH4']
ib_columns = ['IB1', 'IB2', 'IB3', 'IB4', 'IB5']

# Compute human stats: per-row (axis=1) IH / IB means, then mean, SD and SEM
human_ih_means = human_data[ih_columns].mean(axis=1)
human_ib_means = human_data[ib_columns].mean(axis=1)
human_ih_avg = human_ih_means.mean()
human_ib_avg = human_ib_means.mean()
human_ih_std = human_ih_means.std()
human_ib_std = human_ib_means.std()
n_human = len(human_data)
human_ih_sem = human_ih_std / np.sqrt(n_human)
human_ib_sem = human_ib_std / np.sqrt(n_human)

# Updated pattern: includes modeltypeidx, homogeneity, shuffle
# NOTE(review): the pattern accepts any reasoning / inverted / secret value,
# so runs that differ only in those flags land in the same group -- confirm
# that this is intended.
data_folder = "./data"
filename_pattern = re.compile(
    r"question_(\d+)_homogeneity_(True|False)_modeltypeidx_(\d+)_shuffle_(True|False)_reasoning_.*?_inverted_.*?_secret_.*?\.csv"
)

grouped_data = defaultdict(list)
for filename in os.listdir(data_folder):
    match = filename_pattern.match(filename)
    if not match:
        continue
    question_idx = int(match.group(1))
    homogeneity = match.group(2) == "True"
    modeltypeidx = int(match.group(3))
    shuffle = match.group(4) == "True"
    filepath = os.path.join(data_folder, filename)
    with open(filepath, "r") as f:
        data = json.load(f)
    avg = data.get("average_mean")
    sem = data.get("propagated_uncertainty")
    # The model index only identifies homogeneous ensembles; heterogeneous
    # runs are grouped under None.
    key = (homogeneity, shuffle, modeltypeidx if homogeneity else None)
    grouped_data[key].append((question_idx, avg, sem))

# Compute group statistics.
# Stored under its own name: this cell used to rebind the notebook-global
# `flag_stats` with 3-tuple keys, while the section 4.7 cells unpack
# `flag_stats` keys as (reasoning, secret) -- so running the notebook top to
# bottom failed there with "too many values to unpack".
geneity_stats = {}
N_flag = 10
for flag_key, datapoints in grouped_data.items():
    datapoints.sort()
    # Questions 1-4 are IH, questions 5-9 are IB
    ih_vals = [avg for idx, avg, _ in datapoints if idx in range(1, 5)]
    ib_vals = [avg for idx, avg, _ in datapoints if idx in range(5, 10)]
    ih_sems = [sem for idx, _, sem in datapoints if idx in range(1, 5)]
    ib_sems = [sem for idx, _, sem in datapoints if idx in range(5, 10)]

    ih_mean = np.mean(ih_vals)
    ib_mean = np.mean(ib_vals)
    # Per-question SD is recovered as SEM * sqrt(N_flag); the category SD is
    # the root of the mean of the per-question variances.
    ih_std = np.sqrt(np.sum(np.square(np.array(ih_sems) * np.sqrt(N_flag))) / len(ih_sems))
    ib_std = np.sqrt(np.sum(np.square(np.array(ib_sems) * np.sqrt(N_flag))) / len(ib_sems))
    ih_sem = ih_std / np.sqrt(len(ih_sems))
    ib_sem = ib_std / np.sqrt(len(ib_sems))

    geneity_stats[flag_key] = {
        'ih_mean': ih_mean,
        'ib_mean': ib_mean,
        'ih_std': ih_std,
        'ib_std': ib_std,
        'ih_sem': ih_sem,
        'ib_sem': ib_sem,
    }

# Plotting
fig, ax = plt.subplots(figsize=(8, 8))

# Human: SEM as error bars, SD as a translucent ellipse
ax.errorbar(human_ih_avg, human_ib_avg, xerr=human_ih_sem, yerr=human_ib_sem,
            fmt='o', color='black', capsize=5, label='Human ± SEM')
ellipse = patches.Ellipse((human_ih_avg, human_ib_avg), width=2*human_ih_std, height=2*human_ib_std,
                          color='gray', alpha=0.2, label='Human ± SD')
ax.add_patch(ellipse)

# Color and marker map
modeltype_colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple']
marker_homo = 'D'  # diamonds
marker_hetero = 'o'  # full circles

# Plot model results
for (homogeneity, shuffle, modeltypeidx), stats in geneity_stats.items():
    ih_mean = stats['ih_mean']
    ib_mean = stats['ib_mean']
    ih_std = stats['ih_std']
    ib_std = stats['ib_std']
    ih_sem = stats['ih_sem']
    ib_sem = stats['ib_sem']

    if homogeneity:
        color = modeltype_colors[modeltypeidx]
        label = f"Homogeneous (model {modeltypeidx})"
        marker = marker_homo
    else:
        label = f"Heterogeneous ({'Shuffled' if shuffle else 'Not Shuffled'})"
        color = 'tab:brown' if shuffle else 'tab:cyan'
        marker = marker_hetero

    ax.errorbar(ih_mean, ib_mean, xerr=ih_sem, yerr=ib_sem,
                fmt=marker, color=color, capsize=5, label=label)
    # Ellipse axes are full widths, hence 2 * SD
    ellipse = patches.Ellipse((ih_mean, ib_mean), width=2*ih_std, height=2*ib_std,
                              color=color, alpha=0.2)
    ax.add_patch(ellipse)

# Final plot settings
ax.set_xlabel("Average IH Response")
ax.set_ylabel("Average IB Response")
ax.set_title("Mean IH vs IB (Human vs Model) with SEM and SD")
ax.grid(True, linestyle='--', alpha=0.6)
ax.legend(loc='lower right', fontsize=8)
ax.set_aspect('equal', 'box')
ax.set_xlim(1, 7)
ax.set_ylim(1, 7)

# Save plot
output_dir = "./plots"
os.makedirs(output_dir, exist_ok=True)
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "geneity_shuffle_ih_vs_ib_mean_scatter.png"), dpi=300)
plt.show()


# 4.7 Plot IH, IB-only

In [None]:
# === 1D Plot: IB Mean Responses Only (Compact Layout) ===
# One column per dataset: the mean as a marker with SEM error bars, and a
# shaded band spanning mean ± SD. Uses human_ib_*, flag_stats and color_map
# from the cells above.
fig, ax = plt.subplots(figsize=(10, 4))

# Rows of (label, color, mean, SD, SEM); the human reference comes first,
# followed by one row per (reasoning, secret) flag combination.
datasets = [("Human", "black", human_ib_avg, human_ib_std, human_ib_sem)]
datasets.extend(
    (
        f"reasoning={reasoning}, secret={secret}",
        color_map[(reasoning, secret)],
        stats['ib_mean'],
        stats['ib_std'],
        stats['ib_sem'],
    )
    for (reasoning, secret), stats in flag_stats.items()
)
bar_width = 0.95
x_positions = np.arange(len(datasets))

# Draw every dataset at its own x slot
for x_pos, (label, color, ib_mean, ib_std, ib_sem) in zip(x_positions, datasets):
    ax.errorbar(x_pos, ib_mean, yerr=ib_sem, fmt='o', color=color, capsize=5, label=f"{label} ± SEM")

    # The human band is gray even though its marker is black
    band_color = "gray" if label == "Human" else color
    band_edges = [x_pos - bar_width / 2, x_pos + bar_width / 2]
    ax.fill_between(band_edges,
                    [ib_mean - ib_std] * 2,
                    [ib_mean + ib_std] * 2,
                    color=band_color, alpha=0.2)

# Axis cosmetics
ax.set_xticks(x_positions)
ax.set_xticklabels([row[0] for row in datasets], rotation=10, ha='right')
ax.set_xlim(-0.5, len(datasets) - 0.5)
ax.set_ylim(1, 7)
ax.set_ylabel("Average IB OUS Response")
ax.set_title("Averaged IB OUS Responses (±SD, ±SEM)")
ax.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()

# Write the figure to disk, then display it
output_dir = "./plots"
os.makedirs(output_dir, exist_ok=True)
plt.savefig(os.path.join(output_dir, "ib_mean_1d_plot_compact.png"), dpi=300)
plt.show()


In [None]:

# === 1D Plot: IH Mean Responses Only (Compact Layout) ===
# One column per dataset: the mean as a marker with SEM error bars, and a
# shaded band spanning mean ± SD. Uses human_ih_*, flag_stats and color_map
# from the cells above.
fig, ax = plt.subplots(figsize=(10, 4))

# Rows of (label, color, mean, SD, SEM); the human reference comes first,
# followed by one row per (reasoning, secret) flag combination.
datasets = [("Human", "black", human_ih_avg, human_ih_std, human_ih_sem)]
datasets.extend(
    (
        f"reasoning={reasoning}, secret={secret}",
        color_map[(reasoning, secret)],
        stats['ih_mean'],
        stats['ih_std'],
        stats['ih_sem'],
    )
    for (reasoning, secret), stats in flag_stats.items()
)

bar_width = 0.95
x_positions = np.arange(len(datasets))

# Draw every dataset at its own x slot
for x_pos, (label, color, ih_mean, ih_std, ih_sem) in zip(x_positions, datasets):
    ax.errorbar(x_pos, ih_mean, yerr=ih_sem, fmt='o', color=color, capsize=5, label=f"{label} ± SEM")

    # The human band is gray even though its marker is black
    band_color = "gray" if label == "Human" else color
    band_edges = [x_pos - bar_width / 2, x_pos + bar_width / 2]
    ax.fill_between(band_edges,
                    [ih_mean - ih_std] * 2,
                    [ih_mean + ih_std] * 2,
                    color=band_color, alpha=0.2)

# Axis cosmetics
ax.set_xticks(x_positions)
ax.set_xticklabels([row[0] for row in datasets], rotation=10, ha='right')
ax.set_xlim(-0.5, len(datasets) - 0.5)
ax.set_ylim(1, 7)
ax.set_ylabel("Average IH OUS Response")
ax.set_title("Averaged IH OUS Responses (±SD, ±SEM)")
ax.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()

# Write the figure to disk, then display it
output_dir = "./plots"
os.makedirs(output_dir, exist_ok=True)
plt.savefig(os.path.join(output_dir, "ih_mean_1d_plot_compact.png"), dpi=300)
plt.show()

# Print the IH mean, SD and SEM of every (reasoning, secret) combination
for (reasoning, secret), stats in flag_stats.items():
    report_lines = (
        f"Reasoning={reasoning}, Secret={secret}",
        f"  Mean:     {stats['ih_mean']}",
        f"  Std Dev:  {stats['ih_std']}",
        f"  SEM:      {stats['ih_sem']}",
        "-" * 40,
    )
    print(*report_lines, sep="\n")
