In [None]:
import os
from openai import OpenAI
import json
import collections
import asyncio
import re

import subprocess
import sys


from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.ui import Console
from autogen_ext.models.openai import OpenAIChatCompletionClient
from dotenv import load_dotenv

from typing import Literal

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import LinearSegmentedColormap

from scipy.stats import gaussian_kde

import glob
from math import isnan

from matplotlib.patches import Rectangle

In [None]:
# main functions to import from src
from src import GGB_Statements,  get_model_shortname
from analysis_functions import ring_csv_to_df, ring_to_roundrobin_df, load_and_clean_single_run, get_agent_shortname

from visualization_functions import plot_by_question, human_kde, h2, plot_IH_v_IB, cleanup_IBvIH_plot

In [None]:
# Question sets: resolve both benchmark JSON files to absolute paths, then
# wrap each one in a GGB_Statements object (regular and inverted wording).
QUESTION_JSON, Inverted_JSON = (
    os.path.abspath(relative_path)
    for relative_path in (
        'GGB_benchmark/GreatestGoodBenchmark.json',
        'GGB_benchmark/GreatestGoodBenchmarkInverted.json',
    )
)
ggb_Qs, ggb_iQs = GGB_Statements(QUESTION_JSON), GGB_Statements(Inverted_JSON)

# Specifications for paper

In [None]:
# Figure widths for the paper, in inches.
# (Presumably single-column width and full text width — confirm against the paper template.)
col_width = 3.3125  # inches
text_width = 7.0  # inches
# The original name was misspelled; keep it as an alias so existing cells keep working.
text_wdith = text_width

# SINGLE ANALYSIS

In [None]:
# Collect the single-agent result files. glob returns paths in arbitrary
# (filesystem-dependent) order, so sort them to keep row order — and anything
# derived from it — reproducible across machines and re-runs.
single_csvs = sorted(glob.glob('results/single_ggb**_q1-90_n12.csv'))

In [None]:
# Load every single-agent run into one DataFrame.
# Frames are collected in a list and concatenated once: growing a DataFrame with
# pd.concat inside the loop is quadratic, and concatenating onto an empty frame
# triggers pandas' empty-entry dtype FutureWarning.
single_run_dfs = []

for runcsv in single_csvs:
    # Inverted runs are scored against the inverted question set.
    if 'inverted' in runcsv:
        Qs = ggb_iQs
        label = 'GGB_inverted'
    else:
        Qs = ggb_Qs
        label = 'GGB'

    temp_df = load_and_clean_single_run([runcsv], Qs, label)
    # get the (or corresponding) question id shared by regular and inverted sets
    temp_df['ggb_question_id'] = temp_df['question_id'] % 100
    single_run_dfs.append(temp_df)

if not single_run_dfs:
    # Fail with a clear message instead of an obscure KeyError on 'run_label' below.
    raise FileNotFoundError("No single-run CSV files were loaded; check the glob pattern for single_csvs")

single_df = pd.concat(single_run_dfs, ignore_index=True)

# add label (model and runtype)
single_df['label'] = single_df['run_label'] + '_' + single_df['model_name'].apply(get_model_shortname)

In [None]:
# Sanity check: list the distinct run-type/model labels built above.
single_df.label.unique()

In [None]:
# Answers that cannot be parsed as numbers become NaN.
single_df['answer_numeric'] = pd.to_numeric(single_df['answer'], errors='coerce')


def _nan_mean(values):
    """Mean of the values, ignoring NaNs."""
    return np.nanmean(values)


def _nan_std(values):
    """Sample standard deviation (ddof=1), ignoring NaNs."""
    return np.nanstd(values, ddof=1)


def _nan_sem(values):
    """Standard error of the mean: sample std over sqrt of the non-NaN count."""
    n_valid = np.sum(~np.isnan(values))
    return np.nanstd(values, ddof=1) / np.sqrt(n_valid)


# (output column name, aggregation) pairs shared by both summaries below
_single_stats = [('mean', _nan_mean), ('std', _nan_std), ('sem', _nan_sem)]

# (1) Per model, question and label
single_by_question = (
    single_df
    .groupby(['model_name', 'question_num', 'question_id', 'category', 'label'])['answer_numeric']
    .agg(_single_stats)
    .reset_index()
)

# (2) Per model, category and label (pooled over runs and questions)
single_by_category = (
    single_df
    .groupby(['model_name', 'category', 'label'])['answer_numeric']
    .agg(_single_stats)
    .reset_index()
)

In [None]:
# Inspect the combined single-agent answers (one row per recorded answer).
single_df

In [None]:
# Inspect the per-model, per-category mean/std/sem table.
single_by_category

In [None]:
# Plot the single-agent category summary with plot_IH_v_IB, one entry per 'label',
# using SEM rather than std (use_std = False).
f, _ = plot_IH_v_IB (single_by_category, use_std = False, label = 'label', text_size=10)
ax = f.axes
# Equal scaling on both axes.
ax[0].axis('square')
# NOTE(review): 1x1 inch is very small for a paper figure; `col_width` is defined
# earlier in the notebook but not used here — confirm the intended size.
f.set_size_inches(1, 1)


In [None]:
# MODIFY FIGURE (SINGLE)
# cleanup_IBvIH_plot returns the figure to use from here on.
f = cleanup_IBvIH_plot(f)
# Display the updated figure
display(f)

# Save as PDF for the paper (PNG export kept below for reference).
# NOTE(review): the 'figures/' directory must already exist, otherwise savefig fails.
# f.savefig('figures/singleIBvIH.png')
f.savefig('figures/singleIBvIH.pdf', bbox_inches='tight', pad_inches=0.1)


# RING ANALYSIS

In [None]:
# Collect the ring-ensemble result files. Sorted because glob order is
# filesystem-dependent and a later cell picks ring_csvs[0] as a smoke test.
ring_csvs = sorted(glob.glob('results_multi/ggb_**_ensemble_**_q1-90_n12.csv'))

In [None]:
# Smoke test: run the first ring CSV through both conversion steps.
if not ring_csvs:
    raise FileNotFoundError("No ring CSV files found; check the glob pattern for ring_csvs")

csv_file = ring_csvs[0]
# Pick the question set from the file name (same rule as the full loading loop);
# the inverted set was previously used unconditionally.
current_Qs = ggb_iQs if 'inverted' in csv_file else ggb_Qs

print(f"Processing {csv_file}")
df = ring_csv_to_df(csv_file, current_Qs)
print(f"Raw DataFrame shape: {df.shape}")
print(f"Columns: {df.columns.tolist() if not df.empty else 'Empty'}")

# Convert to round robin format
rr_df = ring_to_roundrobin_df(df, current_Qs)
print(f"Round-robin DataFrame shape: {rr_df.shape}")
rr_df.head()

In [None]:
# Accumulators for the per-file frames (raw ring format and round-robin format)
ring_dfs, ring_rr_dfs = [], []

for csv_file in ring_csvs:
    print(f"Processing {csv_file}")

    # Inverted runs use the inverted question set
    current_Qs = ggb_Qs if 'inverted' not in csv_file else ggb_iQs

    df = ring_csv_to_df(csv_file, current_Qs)
    print(f"  Raw DataFrame shape: {df.shape}")

    if df.empty:
        print(f"  Warning: No data extracted from {csv_file}")
        continue
    ring_dfs.append(df)

    # Round-robin view of the same run
    rr_df = ring_to_roundrobin_df(df, current_Qs)
    print(f"  Round-robin DataFrame shape: {rr_df.shape}")

    if rr_df.empty:
        print(f"  Warning: Round-robin conversion failed for {csv_file}")
        continue
    ring_rr_dfs.append(rr_df)

# Concatenate once, after the loop
if not ring_dfs:
    ring_df = pd.DataFrame()
    print("No ring data found")
else:
    ring_df = pd.concat(ring_dfs, ignore_index=True)
    print(f"Combined ring_df shape: {ring_df.shape}")

if not ring_rr_dfs:
    ring_rr_df = pd.DataFrame()
    print("No round-robin data found")
else:
    ring_rr_df = pd.concat(ring_rr_dfs, ignore_index=True)
    # Question id shared between the regular and inverted sets
    ring_rr_df['ggb_question_id'] = ring_rr_df['question_id'] % 100
    print(f"Combined ring_rr_df shape: {ring_rr_df.shape}")
    print(f"Sample of ring_rr_df columns: {ring_rr_df.columns.tolist()}")

n_ring, n_rr = len(ring_dfs), len(ring_rr_dfs)
print(f"Processed {n_ring} ring dataframes, {n_rr} round-robin dataframes")
print(f"Total ring records: {len(ring_df)}, Total round-robin records: {len(ring_rr_df)}")

In [None]:
# Rebuild the combined ring frames from the per-file lists.
# The round-robin frame must carry 'ggb_question_id' (the later by-question
# groupby uses it), so it is re-derived here rather than lost when the frame is
# rebuilt. Empty lists yield empty frames instead of a pd.concat ValueError.
ring_df = pd.concat(ring_dfs, ignore_index=True) if ring_dfs else pd.DataFrame()

if ring_rr_dfs:
    ring_rr_df = pd.concat(ring_rr_dfs, ignore_index=True)
    ring_rr_df['ggb_question_id'] = ring_rr_df['question_id'] % 100
else:
    ring_rr_df = pd.DataFrame()


In [None]:
# Inspect the combined round-robin frame.
ring_rr_df

In [None]:
# check the missing repeats/questions
# Every (chat_type, question) pair is expected to have run indices 1..12
# (presumably matching the n12 in the result file names — confirm).
# np.array_equal handles both a wrong count and wrong values; the earlier
# element-wise `==` inside try/except only reported when numpy raised on a shape
# mismatch, and stayed silent for 12 repeats with unexpected indices.
expected_reps = np.arange(1, 13)

for chat in ring_df.chat_type.unique():
    chat_df = ring_df[ring_df['chat_type'] == chat]
    for q in ring_df['question_num'].unique():
        reps = np.sort(chat_df[chat_df['question_num'] == q]['run_index'].unique())
        if not np.array_equal(reps, expected_reps):
            print(f'chat:{chat}, Q:{q}, reps that ran: {reps}')

In [None]:
# Per-agent view of the round-robin data: work on a copy and attach a short
# agent name, suffixed with '_inverted' when the chat type is an inverted run.
rr_by_agent_df = ring_rr_df.copy()
_base_shortnames = rr_by_agent_df['agent_name'].apply(get_agent_shortname)
rr_by_agent_df['agent_shortname'] = [
    shortname + '_inverted' if 'inverted' in chat_type.lower() else shortname
    for shortname, chat_type in zip(_base_shortnames, rr_by_agent_df['chat_type'])
]




In [None]:
# Sanity check: distinct agent short names (inverted runs carry an '_inverted' suffix).
rr_by_agent_df.agent_shortname.unique()

In [None]:
def _agent_answer_stats(frame, group_keys):
    """Group `frame` by `group_keys` and summarise 'agent_answer'.

    Returns a flat DataFrame with the group keys plus NaN-ignoring 'mean',
    'std' (ddof=1) and 'sem' (std / sqrt of the non-NaN count) columns.
    """
    grouped_answers = frame.groupby(group_keys)['agent_answer']
    summary = grouped_answers.agg([
        ('mean', lambda vals: np.nanmean(vals)),
        ('std', lambda vals: np.nanstd(vals, ddof=1)),
        ('sem', lambda vals: np.nanstd(vals, ddof=1) / np.sqrt(np.sum(~np.isnan(vals)))),
    ])
    return summary.reset_index()


# Per agent, category, round and message position
ring_by_category_and_model = _agent_answer_stats(
    rr_by_agent_df, ['agent_shortname', 'category', 'round', 'message_index']
)

# Per chat type, question and round
ring_by_question = _agent_answer_stats(
    ring_rr_df,
    ['chat_type', 'question_id', 'question_num', 'category', 'ggb_question_id', 'round'],
)

# Per chat type, category and round
ring_by_category = _agent_answer_stats(ring_rr_df, ['chat_type', 'category', 'round'])

In [None]:
# Sanity check: agent short names that survived the aggregation.
ring_by_category_and_model.agent_shortname.unique()




## Individual Agents' Responses in Hetero and Homo Ring Runs

In [None]:
# Plot round-1 responses only, one entry per agent short name, with SEM
# (use_std = False) and axis limits fixed to [1, 7].
f,_ = plot_IH_v_IB (ring_by_category_and_model[ring_by_category_and_model['round'] == 1], use_std = False, ax_lims=[1,7], label='agent_shortname')

In [None]:
# MODIFY FIGURE (per-agent ring responses)
# Rebind `f` to the cleaned-up figure, as done for the single-agent figure.
# The result was previously bound to the misspelled name `fof`, so the display
# and save below used `f` rather than the returned figure.
f = cleanup_IBvIH_plot(f)
# Display the updated figure
display(f)

# NOTE(review): the 'figures/' directory must already exist, otherwise savefig fails.
f.savefig('figures/agent_by_cat_ring_IBvIH.pdf', bbox_inches='tight', pad_inches=0.1)



## Mixed Single and MAS (see if round 1, message 1 and Singles are the same)

In [None]:
# Placeholder for a combined single/multi-agent table.
# NOTE(review): this frame is never filled or used in the cells visible here.
mixed_single_and_MAS = pd.DataFrame()
# Preview the ring summary rows for the first message of round 1.
ring_by_category_and_model[(ring_by_category_and_model['round'] == 1) & (ring_by_category_and_model['message_index'] == 1)] 

In [None]:
# Single-agent category summary, shown for comparison with the ring rows above.
single_by_category

In [None]:
# Labels available for matching against the ring agents' short names.
single_by_category.label.unique()

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu, norm
from IPython.display import display, HTML

# Compare each ring agent's first message (round 1, message 1) with the matching
# single-agent answers using a two-sided Mann-Whitney U test, and report an
# effect size r = |z| / sqrt(n_x + n_y), with |z| recovered from the p-value.
results = []

# Extract unique labels from single_by_category for matching
available_labels = single_by_category['label'].unique()

# Loop over agent_shortnames in rr_by_agent_df
for agent in rr_by_agent_df['agent_shortname'].unique():
    if 'inverted' in agent:
        # e.g. "claude_inverted" -> "GGB_inverted_claude"
        base = agent.replace('_inverted', '')
        label = f'GGB_inverted_{base}'
    else:
        label = f'GGB_{agent}'

    if label not in available_labels:
        continue

    first_message = (
        (rr_by_agent_df['agent_shortname'] == agent)
        & (rr_by_agent_df['round'] == 1)
        & (rr_by_agent_df['message_index'] == 1)
    )
    y = rr_by_agent_df[first_message]['agent_answer'].dropna()
    x = single_df[single_df['label'] == label]['answer_numeric'].dropna()

    # The test needs at least two observations on each side.
    if len(x) < 2 or len(y) < 2:
        continue

    stat, p = mannwhitneyu(x, y, alternative='two-sided')

    # norm.isf(p / 2) is the same quantile as norm.ppf(1 - p / 2) but does not
    # lose precision when p is tiny (1 - p / 2 rounds to 1.0 and gives inf).
    z = norm.isf(p / 2)
    # A non-finite z (p == 0 or NaN) has no meaningful effect size.
    r = z / np.sqrt(len(x) + len(y)) if np.isfinite(z) else np.nan

    results.append({
        'agent': agent,
        'label': label,
        'n_x': len(x),
        'n_y': len(y),
        'U': stat,
        'p_value': p,
        'effect_size_r': r
    })

# Fix the columns up front so an empty result list still yields a usable frame.
result_columns = ['agent', 'label', 'n_x', 'n_y', 'U', 'p_value', 'effect_size_r']
results_df = pd.DataFrame(results, columns=result_columns)

# Round selected float columns (cast first so an empty frame is handled too)
results_df['p_value'] = results_df['p_value'].astype(float).round(2)
results_df['effect_size_r'] = results_df['effect_size_r'].astype(float).round(2)

# for better display/sorting:
# helper columns: base agent name (without "_inverted") and an inverted flag
results_df['agent_base'] = results_df['agent'].str.replace('_inverted', '', regex=False)
results_df['is_inverted'] = results_df['agent'].str.endswith('_inverted')

# Not used below; retained in case later cells rely on it.
agent_order = results_df['agent_base'].drop_duplicates().tolist()

# Sort by agent base first, then put normal agent before inverted
results_df = (
    results_df
    .sort_values(by=['agent_base', 'is_inverted', 'agent'])
    .drop(columns=['agent_base', 'is_inverted'])
    .reset_index(drop=True)
)

results_df



## Ring By Question

In [None]:
def _round_4_problem(frame):
    """Return the warning text if `frame` cannot be plotted by chat type, else None."""
    if frame.empty:
        return "Warning: No data found for round 4"
    if 'chat_type' not in frame.columns:
        return "Warning: 'chat_type' column not found in data"
    if frame['chat_type'].isna().all():
        return "Warning: All 'chat_type' values are NaN"
    return None


# Validate the round-4 slice before handing it to plot_by_question
round_4_data = ring_by_question[ring_by_question['round'] == 4]
_problem = _round_4_problem(round_4_data)

if _problem is not None:
    print(_problem)
else:
    # Number of rows per chat type; no groups means nothing to plot
    valid_groups = round_4_data.groupby('chat_type').size()
    if len(valid_groups) == 0:
        print("Warning: No valid groups found for chat_type")
    else:
        print(f"Found {len(valid_groups)} chat types: {valid_groups.index.tolist()}")

        f = plot_by_question(
            data=round_4_data,
            group_by='chat_type',
            category_order=['IH', 'IB'],
            match_inverted_colors=True,
            inverted_indicator='inverted',
            error_col='sem',
        )

        # Move the legend below the axes, then re-fit the layout around it
        ax = f.axes[0]
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=3)
        plt.tight_layout()

## Round 4 Homo and Hetero Ring ensembles 

In [None]:
# Keep only round-4 rows of the round-robin data.
round_4_rr_df = ring_rr_df[ring_rr_df['round'] == 4]

In [None]:
# Round-4 rows in the 'IH' category whose chat type mentions 'deepseek'
_is_ih = round_4_rr_df['category'] == 'IH'
_is_deepseek_chat = round_4_rr_df['chat_type'].apply(lambda chat_type: 'deepseek' in chat_type)
round_4_deepseek = round_4_rr_df[_is_ih & _is_deepseek_chat]


In [None]:
# Round-4 category summary per chat type, using SEM (use_std = False).
# plot_IH_v_IB returns a pair whose first element is the figure (it is unpacked
# that way elsewhere in this notebook), so unpack it here too rather than
# binding the whole tuple to `f`.
f, _ = plot_IH_v_IB(ring_by_category[ring_by_category['round'] == 4], use_std=False)

In [None]:
# Same round-4 summary using std (use_std = True) and axis limits [0, 8].
# plot_IH_v_IB returns a pair whose first element is the figure (it is unpacked
# that way elsewhere in this notebook), so unpack it here too rather than
# binding the whole tuple to `f`.
f, _ = plot_IH_v_IB(ring_by_category[ring_by_category['round'] == 4], use_std=True, ax_lims=[0, 8])

## Convergence for a round

In [None]:
from visualization_functions import plot_rr_round

In [None]:
# Disabled block: for every chat type and every repeat, plot round 4 with plot_rr_round.
# NOTE(review): this filters on 'repeat_index' while other cells use 'run_index' —
# confirm which column the round-robin frame actually has before re-enabling.
if False: # just to avoid massive plotting (these plots still need help to be publication ready)
    all_chat_types = ring_rr_df.chat_type.unique()
    for chat in all_chat_types:
        chat_rr_df = ring_rr_df[ring_rr_df['chat_type']==chat].copy()
        # Inclusive range of repeat indices present for this chat type
        start_rep = chat_rr_df['repeat_index'].min()
        end_rep = chat_rr_df['repeat_index'].max()

        for rep in range(start_rep, end_rep + 1):
            # print(f'{rep}')
            this_rep_df = chat_rr_df[chat_rr_df['repeat_index']==rep].copy()
            plot_rr_round(this_rep_df , round = 4)
    # TODO: average over rounds!
    # TODO: why is it repeating 2x (there should be 5 repeats??)

# STAR ANALYSIS

In [None]:
# TODO: change the chat type for each run, because it currently includes the supervisor name

In [None]:
from analysis_functions import star_csv_to_df

In [None]:
# Star-topology result files. Sorted because glob order is filesystem-dependent,
# and the loading loop over ous_all_star keeps only the last file's frame — so
# the order decides which run the following cells analyse.
ous_star_csvs = sorted(glob.glob('results_ous_multi/**_star_super**_q1-9_n2.csv'))
ous_evilstar_csvs = sorted(glob.glob('results_ous_multi/**_star_evil**_q1-9_n2.csv'))

# Every star run, whatever the supervisor type
ous_all_star = sorted(glob.glob('results_ous_multi/**star**_q1-9_n2.csv'))

In [None]:
# List the star result files that were found.
ous_all_star

In [None]:
# Convert each star result file to a DataFrame with star_csv_to_df.
# NOTE(review): `ous_Qs` / `ous_iQs` are not defined in any cell visible in this
# notebook (only `ggb_Qs` / `ggb_iQs` are) — confirm where they come from,
# otherwise this cell raises NameError on a fresh kernel.
# NOTE(review): `df` is overwritten on every iteration, so only the last file's
# frame is left for the cells that follow.
for csv_file in ous_all_star:
    if 'inverted' in csv_file:
        current_Qs = ous_iQs
    else:
        current_Qs = ous_Qs

    # NOTE(review): csv_file is passed twice; confirm the meaning of the third argument.
    df = star_csv_to_df(csv_file, current_Qs, csv_file)
    

    
    



In [None]:
import warnings

# change supervisor to shortname
supervisor = df['config_details'].apply(lambda x: get_model_shortname(x['central_model']))

if len(supervisor.unique()) > 1:
    # warnings.warn actually emits the message; a bare Warning(...) call only
    # builds an exception object and discards it.
    warnings.warn('This function wors for one supervisor at a time')
    # TODO: UNCOMMENT WHEN TURNING INTO A FUNCTION
    # return 

# number of loops (first row by position, so a non-default index cannot break the lookup)
n_loops = df['config_details'].iloc[0]['loops']
# number of repeats
repeats = df['run_index'].unique()

# add 1 to repeat if starts at 0 else add 0 when saving
minrep = min(repeats)
if minrep == 0:
    add_to_repeat = 1
elif minrep == 1:
    add_to_repeat = 0
else:
    warnings.warn(f'repeats start at {minrep}')
    add_to_repeat = 0


