In [None]:
# 1. Imports and Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import numpy as np
import re
import sys
import glob
from collections import Counter, defaultdict

# --- Paths used throughout the notebook ---
MORAL_BENCH_REPO_DIR = '../MoralBench_AgentEnsembles'  # benchmark checkout; adjust if it lives elsewhere
RESULTS_DIR = 'results'  # single-agent CSV results are read from here
PLOTS_DIR = 'plots'  # figures and the aggregate JSON are written here
AGGREGATE_JSON_FILE = os.path.join(PLOTS_DIR, 'aggregate_results.json')

# The output directory has to exist before any figure or JSON file is saved.
os.makedirs(PLOTS_DIR, exist_ok=True)

# Put the repository (as an absolute path) at the front of sys.path, but only once,
# so re-running this cell does not keep growing the path.
moral_bench_path = os.path.abspath(MORAL_BENCH_REPO_DIR)
if not any(entry == moral_bench_path for entry in sys.path):
    sys.path.insert(0, moral_bench_path)
print(f"Using MoralBench repository at: {moral_bench_path}")

In [None]:
# Lower-case category key -> display label.
# NOTE(review): 'ingroup' is labelled 'Care' while 'harm' keeps 'Harm' and a separate
# 'loyalty' key maps to 'Loyalty' — confirm this labelling is intended before relying on it.
dict_map = dict(
    authority='Authority',
    liberty='Liberty',
    fairness='Fairness',
    harm='Harm',
    loyalty='Loyalty',
    purity='Sanctity',
    ingroup='Care',
)

In [None]:
# 2. Question Handler Definition (Copied for self-containment)
# Note: Ideally, this would be imported from a shared module.
class Question_Handler:
    """Index-based access to the question and answer files of a benchmark checkout.

    Questions are the ``.txt`` files found in ``<repo_dir>/questions/<category>/``.
    Answers are read from ``<repo_dir>/answers/<category>.json`` and looked up by the
    question file's name without extension. Questions are numbered from 1 by walking
    the categories in sorted order and, inside a category, positions 0..count-1.
    """

    def __init__(self, repo_dir):
        """Resolve the directory layout and build the question-number map."""
        root = os.path.abspath(repo_dir)  # absolute, so later chdir calls do not matter
        self.repo_dir = root
        self.questions_dir = os.path.join(root, 'questions')
        self.answers_dir = os.path.join(root, 'answers')
        self.categories = self.list_categories()
        self._build_question_map()

    def _build_question_map(self):
        """Populate ``question_map`` (number -> category/index) and ``total_questions``."""
        slots = [
            {'category': name, 'index': position}
            for name in self.categories
            for position in range(self.get_question_count(name))
        ]
        # Absolute question numbers are 1-based.
        self.question_map = dict(enumerate(slots, start=1))
        self.total_questions = len(slots)

    def get_question_category_and_index(self, question_number):
        """Return the ``{'category', 'index'}`` entry for a number, or None if unknown."""
        return self.question_map.get(question_number)

    def get_question_category(self, question_number):
        """Return the category name for a question number, or None if unknown."""
        slot = self.question_map.get(question_number)
        if not slot:
            return None
        return slot['category']

    def get_question_count(self, category_folder):
        """Count the ``.txt`` files in a category folder (0 if the folder is missing)."""
        folder = os.path.join(self.questions_dir, category_folder)
        if os.path.exists(folder):
            try:
                return sum(1 for entry in os.listdir(folder) if entry.endswith('.txt'))
            except FileNotFoundError:
                # Folder disappeared between the existence check and the listing.
                pass
        return 0

    def list_categories(self):
        """Return the sorted names of the sub-directories of the questions directory."""
        base = self.questions_dir
        if not os.path.exists(base):
            print(f"Warning: Questions directory {self.questions_dir} not found!")
            return []
        try:
            found = [name for name in os.listdir(base) if os.path.isdir(os.path.join(base, name))]
        except FileNotFoundError:
            print(f"Warning: Error listing categories in {self.questions_dir}.")
            return []
        found.sort()
        return found

    def _read_answers(self, category_folder, question_id):
        """Return the answers entry for one question, or None when it cannot be read.

        A readable answers file without an entry for ``question_id`` yields ``{}``.
        """
        answers_path = os.path.join(self.answers_dir, f"{category_folder}.json")
        if not os.path.exists(answers_path):
            return None
        try:
            with open(answers_path, 'r', encoding='utf-8') as handle:
                all_answers = json.load(handle)
            return all_answers.get(question_id, {})
        except json.JSONDecodeError:
            print(f"Warning: Error decoding JSON from {answers_path}")
        except Exception as e:
            print(f"Warning: Error reading answers file {answers_path}: {e}")
        return None

    def load_question_answer(self, category_folder, index):
        """Load the question at ``index`` (in sorted file order) together with its answers.

        Returns:
            A dict with 'question_id', 'question_text' and 'answers', or None when the
            category folder is missing, the index is out of range, or reading fails.
        """
        folder = os.path.join(self.questions_dir, category_folder)
        if not os.path.exists(folder):
            return None

        try:
            txt_files = [name for name in os.listdir(folder) if name.endswith('.txt')]
            txt_files.sort()

            if index >= len(txt_files) or index < 0:
                return None

            chosen = txt_files[index]
            # The file name without its extension doubles as the question id.
            question_id = os.path.splitext(chosen)[0]

            with open(os.path.join(folder, chosen), 'r', encoding='utf-8') as handle:
                question_text = handle.read()

            question_answers = self._read_answers(category_folder, question_id)

            return {
                'question_id': question_id,
                'question_text': question_text,
                'answers': question_answers
            }
        except FileNotFoundError:
            return None
        except Exception as e:
            print(f"Warning: Unexpected error loading question {category_folder}/{index}: {e}")
            return None

    def get_question(self, number):
        """Load question data by absolute (1-based) number; None if the number is unknown."""
        slot = self.get_question_category_and_index(number)
        if not slot:
            return None
        return self.load_question_answer(slot['category'], slot['index'])

    def get_total_question_count(self):
        """Return the number of questions found across all categories."""
        return self.total_questions

# --- Initialize Question Handler ---
# Qs stays None when construction fails; later cells test for that before using it.
Qs = None
try:
    Qs = Question_Handler(MORAL_BENCH_REPO_DIR)
    n_questions = Qs.get_total_question_count()
    n_categories = len(Qs.categories)
    print(f"Question Handler initialized. Found {n_questions} questions in {n_categories} categories.")
except Exception as init_error:
    print(f"Error initializing Question_Handler: {init_error}")
    Qs = None

In [None]:
# Smoke-test the Question_Handler by displaying one question. Guarded because the
# initialization cell sets Qs to None on failure, which would otherwise raise here.
Qs.get_question(31) if Qs is not None else None

In [None]:
# 3. Helper Functions for Data Loading and Plotting

def extract_filename_info(filename):
    """Parse model name, question range and run count from a result file name.

    Expected base name: ``single_<model>_q<start>-<end>_n<runs>.csv``, for example
    ``single_openai_gpt-4o-mini_q1-88_n10.csv``. The whole base name must match.

    Args:
        filename: Path or base name of a single-agent result CSV.

    Returns:
        ``(model_name, (q_start, q_end), num_runs)`` on success, otherwise
        ``(None, None, None)`` after printing a warning.
    """
    # fullmatch plus an escaped dot: only a real ".csv" suffix is accepted.
    match = re.fullmatch(r"single_(.*?)_q(\d+)-(\d+)_n(\d+)\.csv", os.path.basename(filename))
    if match is None:
        print(f"Warning: Could not parse filename: {filename}")
        return None, None, None

    raw_model, q_start, q_end, num_runs = match.groups()

    # File names use "_" where the model id has "/".
    model_name = raw_model.replace("_", "/")
    # Re-insert the ":" after "-instruct" only when a suffix follows and no colon is
    # already there; otherwise the result would be "-instruct::free" or end in ":".
    # NOTE(review): presumably the colon was stripped when the file was written — confirm.
    model_name = re.sub(r"-instruct(?=[^:])", "-instruct:", model_name)

    # Quick fix for model names that were altered: these families get ":free" appended.
    for family in ('google/gemini', 'deepseek/deepseek', 'meta-llama/llama'):
        if family in model_name and ':free' not in model_name:
            model_name += ':free'

    return model_name, (int(q_start), int(q_end)), int(num_runs)

def load_all_csv_data(results_dir):
    """Load every ``single_*.csv`` file in ``results_dir`` into one DataFrame.

    Files are read in sorted path order so the row order does not depend on the
    file system. Each file's ``model_name`` column is set from the file name
    (overwriting any value stored in the CSV). Files whose name cannot be parsed
    or whose content cannot be read are skipped with a printed message.

    Args:
        results_dir: Directory containing the single-agent result CSVs.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when nothing was loaded.
    """
    csv_paths = sorted(glob.glob(os.path.join(results_dir, "single_*.csv")))
    if not csv_paths:
        print(f"Warning: No 'single_*.csv' files found in {results_dir}")
        return pd.DataFrame()

    frames = []
    for path in csv_paths:
        # Only the model name is needed; question range and run count are ignored.
        model_name, _q_range, _n_runs = extract_filename_info(path)
        if not model_name:
            continue
        try:
            frame = pd.read_csv(path)
            # The file name is treated as the authoritative source of the model name.
            frame['model_name'] = model_name
            frames.append(frame)
        except Exception as e:
            print(f"Error reading or processing {path}: {e}")

    if not frames:
        print("Warning: No data loaded from CSV files.")
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

def clean_data(df):
    """Normalize confidence, answer and category columns of single-agent results.

    The frame is modified in place and also returned, so pass a copy to keep the
    raw data. Columns written:

    - ``confidence``: integer; values that are missing or not numeric become -1.
    - ``answer_raw``: the original ``answer`` values.
    - ``answer``: first character of the trimmed, upper-cased answer when it is a
      letter A-Z, otherwise ``'Invalid'``. Missing answers are ``'Invalid'``.
    - ``category``: looked up from ``question_num`` through the global ``Qs``;
      ``'Unknown Category'`` when there is no match or ``Qs`` is unavailable.

    Args:
        df: DataFrame with ``confidence``, ``answer`` and ``question_num`` columns.

    Returns:
        The same DataFrame object (returned untouched when it is empty).
    """
    if df.empty:
        return df

    def first_letter(value):
        # Missing values must be caught before str(): str(nan) is 'nan', whose first
        # letter would otherwise be counted as the valid answer 'N'.
        if pd.isna(value):
            return 'Invalid'
        text = str(value).strip().upper()
        return text[0] if text and 'A' <= text[0] <= 'Z' else 'Invalid'

    # -1 marks a missing/invalid confidence; aggregates filter it out later.
    df['confidence'] = pd.to_numeric(df['confidence'], errors='coerce').fillna(-1).astype(int)

    # Keep the original text, then reduce variations like 'A.' or ' a' to one letter.
    df['answer_raw'] = df['answer']
    df['answer'] = df['answer_raw'].apply(first_letter)

    # Add category using Question_Handler
    if Qs:
        df['category'] = df['question_num'].apply(Qs.get_question_category)
        df['category'] = df['category'].fillna('Unknown Category')
    else:
        df['category'] = 'Unknown Category'
        print("Warning: Question_Handler not available, cannot determine categories.")

    return df

def calculate_aggregates(df_group):
    """Summarize answers and confidences for one slice of cleaned results.

    Args:
        df_group: DataFrame slice (for example one question or one category) with
            ``answer`` and ``confidence`` columns; -1 marks an invalid confidence.

    Returns:
        Dict with the number of rows, the answer counts, the counts of valid
        confidence values (sorted by value), their mean and median (None when no
        valid confidence exists) and the number of valid confidence rows.
    """
    summary = {
        'total_responses': 0,
        'answer_distribution': {},
        'confidence_distribution': {},
        'confidence_mean': None,
        'confidence_median': None,
        'valid_confidence_responses': 0
    }
    if df_group.empty:
        return summary

    summary['total_responses'] = len(df_group)
    summary['answer_distribution'] = df_group['answer'].value_counts().to_dict()

    # Statistics only consider rows whose confidence is not the -1 placeholder.
    confidences = df_group['confidence']
    usable = confidences[confidences != -1]
    summary['confidence_distribution'] = usable.value_counts().sort_index().to_dict()
    if not usable.empty:
        summary['confidence_mean'] = usable.mean()
        summary['confidence_median'] = usable.median()
    summary['valid_confidence_responses'] = len(usable)
    return summary

In [None]:
# 4. Load and Process Data
df_raw = load_all_csv_data(RESULTS_DIR)
# clean_data writes into the frame it is given, so it receives a copy and df_raw
# keeps the rows exactly as they were loaded.
df = clean_data(df_raw.copy())

if not df.empty:
    print(f"Loaded and cleaned {len(df)} records.")
    # One line per column of interest, listing its distinct values.
    for label, column in (
        ("Unique Models", 'model_name'),
        ("Unique Categories", 'category'),
        ("Answer Values", 'answer'),
        ("Confidence Values", 'confidence'),
    ):
        print(f"\n{label}:", df[column].unique())
    # Display first few rows and info
    print("\nDataFrame Head:")
    print(df.head())
    print("\nDataFrame Info:")
    df.info()
else:
    print("No data loaded or processed. Exiting.")

In [None]:
# 5. Generate Aggregates and Summary Plots

def convert_numpy(obj):
    """Recursively convert numpy/pandas values into plain Python values for json.dump.

    Defined at the top level of this cell, outside the data check below, so the
    function exists even when the single-agent aggregation is skipped; the
    multi-agent cell calls it as well.

    Args:
        obj: Any value. Dicts (keys and values) and lists are walked recursively.

    Returns:
        The same structure with numpy integers/floats/bools/arrays replaced by
        int/float/bool/list and missing values (None, NaN, pd.NA) replaced by None.
        Anything else is returned unchanged.
    """
    # Containers are handled before the pd.isna() test: pd.isna() applied to a list
    # returns an array, and using that array as a condition raises ValueError.
    if isinstance(obj, dict):
        return {convert_numpy(k): convert_numpy(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy(i) for i in obj]
    if isinstance(obj, np.ndarray):
        return convert_numpy(obj.tolist())
    if isinstance(obj, (np.bool_, bool)):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        # NaN is not valid JSON; write it as null like every other missing value.
        return None if np.isnan(obj) else float(obj)
    missing = pd.isna(obj)
    # pd.isna() returns an array-like for array-like input; only a scalar True counts.
    if isinstance(missing, (bool, np.bool_)) and missing:
        return None
    return obj


# model name -> {'questions': {question number: stats}, 'categories': {category: stats}}
all_aggregate_data = defaultdict(lambda: {'questions': {}, 'categories': {}})

if df.empty or Qs is None:
    print("Skipping aggregate generation due to missing data or Question_Handler.")
else:
    models = sorted(df['model_name'].unique()) # Sort models for consistent plot order
    categories = sorted([c for c in df['category'].unique() if c != 'Unknown Category']) # Sort categories
    question_numbers = sorted(df['question_num'].unique())

    # A question's id does not depend on the model, and every lookup reads files from
    # disk, so resolve each id once instead of once per model.
    question_ids = {}
    for q_num in question_numbers:
        q_id_info = Qs.get_question(q_num)
        question_ids[q_num] = q_id_info['question_id'] if q_id_info else f'UnknownID_Q{q_num}'

    print("Calculating aggregates...")
    for model in models:
        model_df = df[df['model_name'] == model]

        # Per-Question Aggregates
        for q_num in question_numbers:
            q_df = model_df[model_df['question_num'] == q_num]
            if q_df.empty:
                continue  # no data for this model and question
            # int(): q_num may be a numpy integer; a plain int keeps the JSON keys clean.
            all_aggregate_data[model]['questions'][int(q_num)] = {
                'question_id': question_ids[q_num],
                'category': q_df['category'].iloc[0],
                **calculate_aggregates(q_df)
            }

        # Per-Category Aggregates
        for category in categories:
            cat_df = model_df[model_df['category'] == category]
            if cat_df.empty:
                continue  # no data for this model and category
            all_aggregate_data[model]['categories'][category] = calculate_aggregates(cat_df)
    print("Aggregation complete.")

    # --- Generate Summary Plots ---
    print("\nGenerating summary plots...")

    # Plot 1: Mean Confidence per Question per Model (one line per model)
    plt.figure(figsize=(15, 8))
    for model in models:
        per_question = all_aggregate_data[model]['questions']
        # Keep only questions that have a mean, i.e. at least one valid confidence.
        points = [
            (q, per_question[q]['confidence_mean'])
            for q in sorted(per_question)
            if per_question[q]['confidence_mean'] is not None
        ]
        if points:
            valid_q_nums, mean_confs = zip(*points)
            # Shorten model names for the legend: drop the provider prefix and ':free'.
            short_model_name = model.split('/')[-1].replace(':free', '')
            plt.plot(valid_q_nums, mean_confs, marker='o', linestyle='-', markersize=4, label=short_model_name)

    plt.title('Mean Confidence per Question', fontsize=16)
    plt.xlabel('Question Number', fontsize=12)
    plt.ylabel('Mean Confidence (0-5)', fontsize=12)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout(rect=[0, 0, 0.85, 1]) # Leave room on the right for the legend
    plot1_filename = os.path.join(PLOTS_DIR, 'summary_confidence_per_question.png')
    plt.savefig(plot1_filename)
    print(f"Plot saved to {plot1_filename}")
    plt.close()

    # Plot 2: Mean Confidence per Category per Model (grouped bars)
    # category -> {model: mean confidence}; missing combinations become NaN in the frame.
    category_data = {}
    for model in models:
        for category in categories:
            cat_stats = all_aggregate_data[model]['categories'].get(category)
            if cat_stats is not None and cat_stats['confidence_mean'] is not None:
                category_data.setdefault(category, {})[model] = cat_stats['confidence_mean']

    plot_df = pd.DataFrame(category_data).T # Transpose to have categories as index, models as columns
    plot_df.index.name = 'Category'
    plot_df.columns.name = 'Model'

    if not plot_df.empty:
        plot_df.plot(kind='bar', figsize=(15, 8), width=0.8)
        plt.title('Mean Confidence per Category', fontsize=16)
        plt.xlabel('Category', fontsize=12)
        plt.ylabel('Mean Confidence (0-5)', fontsize=12)
        plt.xticks(rotation=45, ha='right', fontsize=10)
        plt.yticks(fontsize=10)
        plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
        plt.grid(True, axis='y', linestyle='--', alpha=0.6)
        plt.tight_layout(rect=[0, 0, 0.85, 1]) # Leave room on the right for the legend
        plot2_filename = os.path.join(PLOTS_DIR, 'summary_confidence_per_category.png')
        plt.savefig(plot2_filename)
        print(f"Plot saved to {plot2_filename}")
        plt.close()
    else:
        print("Skipping category plot: No aggregated category data found.")

    # Save Aggregate JSON
    print(f"\nSaving aggregate data to {AGGREGATE_JSON_FILE}...")
    try:
        # Convert the whole structure up front so json.dump only sees plain Python types.
        serializable_data = convert_numpy(all_aggregate_data)

        with open(AGGREGATE_JSON_FILE, 'w', encoding='utf-8') as f:
            json.dump(serializable_data, f, indent=4)
        print("Aggregate data saved successfully.")
    except TypeError as e:
        print(f"Error saving JSON: {e}. Check for non-serializable types.")
    except Exception as e:
        print(f"An unexpected error occurred while saving JSON: {e}")

print("\n--- Aggregation and Plotting Complete ---")

In [None]:
# 6. Generate Multi-Agent Aggregates and Plots

import glob
import hashlib

MULTI_PLOTS_DIR = 'plots/multiagent'  # per-configuration figures and JSON go below this folder
MULTI_RESULTS_DIR = 'results_multi'  # multi-agent CSV results are read from here

# The output folder has to exist before anything is saved into it.
os.makedirs(MULTI_PLOTS_DIR, exist_ok=True)

def create_config_hash(config_str):
    """Return the first 8 hex characters of the MD5 digest of ``config_str`` (UTF-8).

    The value is used only as a short identifier for a configuration string.
    """
    digest = hashlib.md5()
    digest.update(config_str.encode('utf-8'))
    return digest.hexdigest()[:8]

def load_all_multi_agent_csv_data(results_dir):
    """Load every CSV in ``results_dir`` and tag its rows with a configuration hash.

    Files are read in sorted path order so row order, and therefore the order in
    which configurations are later processed, does not depend on the file system.
    For each file the ``config_details`` value of the FIRST row is hashed and
    written to every row as ``config_hash``; the same string is stored as
    ``config_str``. Files that are empty, lack ``config_details`` or fail to
    load are skipped with a printed message.
    NOTE(review): this assumes all rows of a file share one configuration — confirm.

    Args:
        results_dir: Directory containing the multi-agent result CSVs.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when nothing was loaded.
    """
    csv_paths = sorted(glob.glob(os.path.join(results_dir, "*.csv")))
    if not csv_paths:
        print(f"Warning: No CSV files found in {results_dir}")
        return pd.DataFrame()

    frames = []
    for path in csv_paths:
        try:
            frame = pd.read_csv(path)
            if 'config_details' not in frame.columns or frame.empty:
                print(f"Warning: Skipping {path} - missing 'config_details' column or empty file.")
                continue
            config_str = frame['config_details'].iloc[0]
            frame['config_hash'] = create_config_hash(config_str)
            frame['config_str'] = config_str # Keep original string for reference
            frames.append(frame)
        except Exception as e:
            print(f"Error reading or processing {path}: {e}")

    if not frames:
        print("Warning: No multi-agent data loaded.")
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

def clean_multi_agent_data(df):
    """Normalize the multi-agent results: ids, confidence, answer and category.

    The frame is modified in place and also returned, so pass a copy to keep the
    raw data. Columns written:

    - ``question_num``, ``run_index``, ``message_index`` (when present): integer,
      with -1 for values that are missing or not numeric.
    - ``confidence``: integer taken from ``extracted_confidence``; -1 when invalid.
    - ``answer_raw``: the original ``extracted_answer`` values.
    - ``answer``: first character of the trimmed, upper-cased answer when it is a
      letter A-Z, otherwise ``'Invalid'``. Missing answers are ``'Invalid'``.
    - ``category``: looked up from ``question_num`` through the global ``Qs``;
      ``'Unknown Category'`` when there is no match, no ``question_num`` column,
      or ``Qs`` is unavailable.

    Args:
        df: DataFrame with ``extracted_confidence`` and ``extracted_answer`` columns.

    Returns:
        The same DataFrame object (returned untouched when it is empty).
    """
    if df.empty:
        return df

    def first_letter(value):
        # Missing values must be caught before str(): str(nan) is 'nan', whose first
        # letter would otherwise be counted as the valid answer 'N'.
        if pd.isna(value):
            return 'Invalid'
        text = str(value).strip().upper()
        return text[0] if text and 'A' <= text[0] <= 'Z' else 'Invalid'

    # Coerce the id columns first so the category lookup below sees integers even
    # when the CSV delivered the question numbers as text.
    for col in ['question_num', 'run_index', 'message_index']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(-1).astype(int)

    # -1 marks a missing/invalid confidence; aggregates filter it out later.
    df['confidence'] = pd.to_numeric(df['extracted_confidence'], errors='coerce')
    df['confidence'] = df['confidence'].fillna(-1).astype(int)

    # Keep the original text, then reduce the answer to a single letter.
    df['answer_raw'] = df['extracted_answer']
    df['answer'] = df['answer_raw'].apply(first_letter)

    # Add category using Question_Handler if question_num exists
    if Qs and 'question_num' in df.columns:
        df['category'] = df['question_num'].apply(Qs.get_question_category)
        df['category'] = df['category'].fillna('Unknown Category')
    else:
        df['category'] = 'Unknown Category'
        if 'question_num' not in df.columns:
            print("Warning: 'question_num' column missing, cannot determine categories.")
        else:
            print("Warning: Question_Handler not available, cannot determine categories.")

    return df

def calculate_final_aggregates(df_group):
    """Summarize the final message of every agent in every run.

    The final message is the row with the highest ``message_index`` within each
    (``question_num``, ``run_index``, ``agent_name``) group. ``question_num`` is
    part of the key whenever the column is present, so that slices spanning
    several questions (a whole configuration or a category) keep one final
    message per question rather than one per run across all questions.

    Args:
        df_group: Cleaned multi-agent rows with ``message_index``, ``agent_name``,
            ``run_index``, ``answer`` and ``confidence`` columns; -1 marks an
            invalid confidence.

    Returns:
        Dict with the number of final messages, their answer counts, the counts
        of valid confidence values (sorted by value), their mean and median (None
        when no valid confidence exists) and the number of valid confidence rows.
        All counts are zero/empty when the input is empty or lacks a required
        column.
    """
    result = {
        'total_final_responses': 0,
        'final_answer_distribution': {},
        'final_confidence_distribution': {},
        'final_confidence_mean': None,
        'final_confidence_median': None,
        'valid_final_confidence_responses': 0
    }

    required = ('message_index', 'agent_name', 'run_index')
    if df_group.empty or any(col not in df_group.columns for col in required):
        return result

    group_keys = ['run_index', 'agent_name']
    if 'question_num' in df_group.columns:
        group_keys.insert(0, 'question_num')

    # idxmax gives, per group, the row label of the last message; dropna() discards
    # groups for which no label could be determined.
    last_labels = df_group.groupby(group_keys)['message_index'].idxmax().dropna()
    final_messages = df_group.loc[last_labels]
    if final_messages.empty:
        return result

    # Statistics only consider rows whose confidence is not the -1 placeholder.
    valid_conf = final_messages.loc[final_messages['confidence'] != -1, 'confidence']

    result['total_final_responses'] = len(final_messages)
    result['final_answer_distribution'] = final_messages['answer'].value_counts().to_dict()
    result['final_confidence_distribution'] = valid_conf.value_counts().sort_index().to_dict()
    if not valid_conf.empty:
        result['final_confidence_mean'] = valid_conf.mean()
        result['final_confidence_median'] = valid_conf.median()
    result['valid_final_confidence_responses'] = len(valid_conf)
    return result

# --- Load and Process Multi-Agent Data ---
# Flat notebook script: loads every multi-agent CSV, cleans the rows, then for each
# configuration hash builds aggregates, saves up to three figures and writes one JSON file
# into a per-configuration folder below MULTI_PLOTS_DIR.
print("\n--- Processing Multi-Agent Results ---")
df_multi_raw = load_all_multi_agent_csv_data(MULTI_RESULTS_DIR)
# clean_multi_agent_data writes columns into the frame it receives, so it is given a copy.
df_multi = clean_multi_agent_data(df_multi_raw.copy())

if df_multi.empty:
    print("No multi-agent data loaded or processed. Skipping multi-agent plots.")
else:
    print(f"Loaded and cleaned {len(df_multi)} multi-agent records.")
    print("\nUnique Config Hashes:", df_multi['config_hash'].unique())
    print("\nDataFrame Head (Multi-Agent):")
    print(df_multi.head())

    # --- Aggregate and Plot per Configuration ---
    # config hash -> {'questions': {...}, 'categories': {...}, 'overall': {...}};
    # 'config_str' and 'chat_type' entries are added per configuration below.
    all_multi_aggregate_data = defaultdict(lambda: {'questions': {}, 'categories': {}, 'overall': {}})

    # One pass per configuration; rows are selected by the hash assigned at load time.
    for config_hash in df_multi['config_hash'].unique():
        config_df = df_multi[df_multi['config_hash'] == config_hash]
        config_str = config_df['config_str'].iloc[0] # Get the config string
        # Falls back to 'unknown' when the data has no chat_type column.
        chat_type = config_df['chat_type'].iloc[0] if 'chat_type' in config_df.columns else 'unknown'
        print(f"\nProcessing Config Hash: {config_hash} (Type: {chat_type})")

        # Create output directory for this config
        config_plot_dir = os.path.join(MULTI_PLOTS_DIR, config_hash)
        os.makedirs(config_plot_dir, exist_ok=True)

        # Overall Aggregates (using final messages)
        # Computed over all rows of this configuration by calculate_final_aggregates.
        overall_agg = calculate_final_aggregates(config_df)
        all_multi_aggregate_data[config_hash]['overall'] = overall_agg
        all_multi_aggregate_data[config_hash]['config_str'] = config_str # Store config string
        all_multi_aggregate_data[config_hash]['chat_type'] = chat_type

        # Per-Question Aggregates (using final messages)
        if 'question_num' in config_df.columns:
            question_numbers = sorted(config_df['question_num'].unique())
            for q_num in question_numbers:
                # -1 is the placeholder clean_multi_agent_data writes for unparseable numbers.
                if q_num == -1: continue # Skip invalid question numbers
                q_df = config_df[config_df['question_num'] == q_num]
                if not q_df.empty:
                    q_category = q_df['category'].iloc[0]
                    # Qs is None when the Question_Handler could not be initialized.
                    q_id_info = Qs.get_question(q_num) if Qs else None
                    q_id = q_id_info['question_id'] if q_id_info else f'UnknownID_Q{q_num}'
                    q_agg = calculate_final_aggregates(q_df)
                    # Plain int so the key is not a numpy integer.
                    q_num_int = int(q_num)
                    all_multi_aggregate_data[config_hash]['questions'][q_num_int] = {
                        'question_id': q_id,
                        'category': q_category,
                        **q_agg
                    }

        # Per-Category Aggregates (using final messages)
        if 'category' in config_df.columns:
            # Rows whose category could not be determined are left out.
            categories = sorted([c for c in config_df['category'].unique() if c != 'Unknown Category'])
            for category in categories:
                 cat_df = config_df[config_df['category'] == category]
                 if not cat_df.empty:
                     cat_agg = calculate_final_aggregates(cat_df)
                     all_multi_aggregate_data[config_hash]['categories'][category] = cat_agg

        # --- Generate Plots for this Config ---
        # Each figure is saved into config_plot_dir and closed right after saving.
        print(f"Generating plots for Config Hash: {config_hash}...")

        # Plot 1: Final Answer Distribution (Overall)
        if overall_agg['total_final_responses'] > 0:
            plt.figure(figsize=(10, 6))
            ans_dist = overall_agg['final_answer_distribution']
            plt.bar(ans_dist.keys(), ans_dist.values())
            plt.title(f'Overall Final Answer Distribution\nConfig Hash: {config_hash} (Type: {chat_type})', fontsize=14)
            plt.xlabel('Final Answer', fontsize=12)
            plt.ylabel('Count', fontsize=12)
            plt.xticks(fontsize=10)
            plt.yticks(fontsize=10)
            plt.grid(True, axis='y', linestyle='--', alpha=0.6)
            plot1_filename = os.path.join(config_plot_dir, 'overall_final_answer_dist.png')
            plt.tight_layout()
            plt.savefig(plot1_filename)
            print(f"  Plot saved: {plot1_filename}")
            plt.close()
        else:
            print("  Skipping overall final answer plot (no final responses).")

        # Plot 2: Final Confidence Distribution (Overall)
        if overall_agg['valid_final_confidence_responses'] > 0:
            plt.figure(figsize=(10, 6))
            conf_dist = overall_agg['final_confidence_distribution']
            # Ensure keys are sorted for confidence plot
            conf_keys = sorted(conf_dist.keys())
            conf_values = [conf_dist[k] for k in conf_keys]
            plt.bar([str(k) for k in conf_keys], conf_values) # Use string keys for discrete confidence levels
            plt.title(f'Overall Final Confidence Distribution\nConfig Hash: {config_hash} (Type: {chat_type})', fontsize=14)
            plt.xlabel('Final Confidence (0-5)', fontsize=12)
            plt.ylabel('Count', fontsize=12)
            plt.xticks(fontsize=10)
            plt.yticks(fontsize=10)
            plt.grid(True, axis='y', linestyle='--', alpha=0.6)
            plot2_filename = os.path.join(config_plot_dir, 'overall_final_confidence_dist.png')
            plt.tight_layout()
            plt.savefig(plot2_filename)
            print(f"  Plot saved: {plot2_filename}")
            plt.close()
        else:
            print("  Skipping overall final confidence plot (no valid final confidence responses).")

        # Plot 3: Mean Final Confidence per Question (if questions exist)
        if all_multi_aggregate_data[config_hash]['questions']:
            plt.figure(figsize=(15, 8))
            q_data = all_multi_aggregate_data[config_hash]['questions']
            q_nums = sorted(q_data.keys())
            # Both lists apply the same "mean is not None" filter, so they stay aligned.
            mean_confs = [q_data[q]['final_confidence_mean'] for q in q_nums if q_data[q]['final_confidence_mean'] is not None]
            valid_q_nums = [q for q in q_nums if q_data[q]['final_confidence_mean'] is not None]
            if valid_q_nums:
                 plt.plot(valid_q_nums, mean_confs, marker='o', linestyle='-', markersize=4, label=f'Config: {config_hash}')
                 plt.title(f'Mean Final Confidence per Question\nConfig Hash: {config_hash} (Type: {chat_type})', fontsize=16)
                 plt.xlabel('Question Number', fontsize=12)
                 plt.ylabel('Mean Final Confidence (0-5)', fontsize=12)
                 plt.xticks(fontsize=10)
                 plt.yticks(fontsize=10)
                 plt.legend(fontsize=10)
                 plt.grid(True, linestyle='--', alpha=0.6)
                 plt.tight_layout()
                 plot3_filename = os.path.join(config_plot_dir, 'mean_final_confidence_per_question.png')
                 plt.savefig(plot3_filename)
                 print(f"  Plot saved: {plot3_filename}")
                 plt.close()
            else:
                 # NOTE(review): the figure created above is not closed on this path.
                 print("  Skipping mean final confidence per question plot (no valid data).")
        else:
            print("  Skipping mean final confidence per question plot (no question data).")

        # --- Save Aggregated JSON for this Config ---
        agg_json_filename = os.path.join(config_plot_dir, 'aggregate_results.json')
        print(f"Saving aggregate data for Config Hash {config_hash} to {agg_json_filename}...")
        try:
            # Use the same numpy conversion function defined earlier
            # (convert_numpy comes from the single-agent aggregation cell; if that
            # definition has not been executed, the NameError handler below reports it).
            serializable_data = convert_numpy(all_multi_aggregate_data[config_hash])
            with open(agg_json_filename, 'w', encoding='utf-8') as f:
                json.dump(serializable_data, f, indent=4)
            print(f"  Aggregate data saved successfully.")
        except NameError:
             print("Error: convert_numpy function not found. Make sure the single-agent cell defining it has been run.")
        except Exception as e:
            print(f"  Error saving JSON for config {config_hash}: {e}")

print("\n--- Multi-Agent Aggregation and Plotting Complete ---")