In [None]:
# 1. Imports and Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import numpy as np
import re
import sys
import glob
from collections import Counter, defaultdict
import ast  # For safely evaluating string representations of lists/dicts

# --- Directory layout -------------------------------------------------------
# RESULTS_DIR and RESULTS_DIR_SINGLE both point at the single-agent CSV
# results; both names are kept because other cells may reference either one.
RESULTS_DIR = 'results'
RESULTS_DIR_SINGLE = 'results'
RESULTS_DIR_MULTI = 'results_multi'  # Multi-agent CSV results
PLOT_DIR = 'plots'                   # Output directory for generated figures

# Create the plots directory once (a no-op when it already exists).
os.makedirs(PLOT_DIR, exist_ok=True)

# Datasets (question categories) to include in the analysis.
# These should match the category names returned by Question_Handler.
INCLUDED_DATASETS = ['MFQ_30', '6_concepts']

# Put the MoralBench repository on sys.path so that its modules can be imported.
MORAL_BENCH_REPO_DIR = '../MoralBench_AgentEnsembles'  # Adjust if needed
moral_bench_path = os.path.abspath(MORAL_BENCH_REPO_DIR)
if moral_bench_path not in sys.path:
    sys.path.insert(0, moral_bench_path)
print(f"Using MoralBench repository at: {moral_bench_path}")

In [None]:
# Question-ID prefix (lower case) -> display name of the moral foundation.
# Covers the five standard MFT foundations plus Liberty.
dict_map = {
    'authority': 'Authority',
    'fairness': 'Fairness',
    'harm': 'Harm',         # Care/Harm
    'ingroup': 'Loyalty',   # Loyalty/Betrayal
    'purity': 'Sanctity',   # Sanctity/Degradation
    'liberty': 'Liberty',
}

# Order in which the moral categories are laid out on plots.
PLOT_CATEGORIES = [
    'Harm',
    'Fairness',
    'Loyalty',
    'Authority',
    'Sanctity',
    'Liberty',
]

In [None]:
# 2. Question Handler Definition (Copied for self-containment)
# Note: Ideally, this would be imported from a shared module.
class Question_Handler():
    """Read-only accessor for a MoralBench-style repository on disk.

    Expected layout under ``repo_dir``::

        questions/<category>/<question_id>.txt   # question text
        answers/<category>.json                  # {question_id: {answer: score}}

    On construction the handler lists the category folders (sorted by name)
    and assigns every question an absolute, 1-based number by walking the
    categories in that order.
    """

    def __init__(self, repo_dir):
        """Index the repository rooted at ``repo_dir`` (made absolute)."""
        self.repo_dir = os.path.abspath(repo_dir)  # Use absolute path
        self.questions_dir = os.path.join(self.repo_dir, 'questions')
        self.answers_dir = os.path.join(self.repo_dir, 'answers')
        self.categories = self.list_categories()
        self._build_question_map()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _build_question_map(self):
        """Builds a map from question number to (category, index)."""
        self.question_map = {}
        current_question_num = 1
        for category in self.categories:
            count = self.get_question_count(category)
            for i in range(count):
                self.question_map[current_question_num] = {'category': category, 'index': i}
                current_question_num += 1
        self.total_questions = current_question_num - 1

    @staticmethod
    def _sorted_question_files(questions_path):
        """Return the sorted ``.txt`` file names found directly in ``questions_path``."""
        return sorted(f for f in os.listdir(questions_path) if f.endswith('.txt'))

    def _load_answers(self, category_folder, question_id):
        """Return the answer->score mapping for one question.

        Returns ``None`` when ``answers/<category_folder>.json`` is missing or
        cannot be read/parsed (a warning is printed in the latter case), and
        ``{}`` when the file is readable but has no entry for ``question_id``.
        """
        answers_path = os.path.join(self.answers_dir, f"{category_folder}.json")
        if not os.path.exists(answers_path):
            return None
        try:
            with open(answers_path, 'r', encoding='utf-8') as f:
                all_answers = json.load(f)
            return all_answers.get(question_id, {})
        except json.JSONDecodeError:
            print(f"Warning: Error decoding JSON from {answers_path}")
        except Exception as e:
            print(f"Warning: Error reading answers file {answers_path}: {e}")
        return None

    def _load_question_record(self, category_folder, questions_path, question_file):
        """Read one question file and pair it with its answers.

        Returns a dict with keys ``question_id`` (file name without
        extension), ``question_text`` and ``answers``.
        """
        question_id = os.path.splitext(question_file)[0]
        with open(os.path.join(questions_path, question_file), 'r', encoding='utf-8') as f:
            question_text = f.read()
        return {
            'question_id': question_id,
            'question_text': question_text,
            'answers': self._load_answers(category_folder, question_id),
        }

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def get_question_category_and_index(self, question_number):
        """Gets the ``{'category', 'index'}`` mapping for a question number, or None."""
        return self.question_map.get(question_number)

    def get_question_category(self, question_number):
        """Gets the category for a given question number, or None if unknown."""
        mapping = self.question_map.get(question_number)
        return mapping['category'] if mapping else None

    def get_question_count(self, category_folder):
        """
        Get the number of ``.txt`` questions in a specific category folder.

        Returns 0 when the folder does not exist or is not a directory.
        """
        questions_path = os.path.join(self.questions_dir, category_folder)
        if not os.path.isdir(questions_path):
            return 0
        try:
            return len(self._sorted_question_files(questions_path))
        except FileNotFoundError:
            # The folder disappeared between the check and the listing.
            return 0

    def list_categories(self):
        """
        List all available question categories (sub-directories of
        ``questions/``), sorted by name. Returns [] with a warning when the
        questions directory is missing.
        """
        if not os.path.isdir(self.questions_dir):
            print(f"Warning: Questions directory {self.questions_dir} not found!")
            return []
        try:
            return sorted(
                d for d in os.listdir(self.questions_dir)
                if os.path.isdir(os.path.join(self.questions_dir, d))
            )
        except FileNotFoundError:
            print(f"Warning: Error listing categories in {self.questions_dir}.")
            return []

    def load_question_answer(self, category_folder, index):
        """
        Load a question and its possible answers using an index.

        ``index`` is the 0-based position in the sorted list of ``.txt`` files
        of ``category_folder``. Returns the question record dict, or None when
        the folder is missing, the index is out of range, or loading fails.
        """
        questions_path = os.path.join(self.questions_dir, category_folder)
        if not os.path.isdir(questions_path):
            return None

        try:
            question_files = self._sorted_question_files(questions_path)
            if index < 0 or index >= len(question_files):
                return None
            return self._load_question_record(
                category_folder, questions_path, question_files[index]
            )
        except FileNotFoundError:
            return None
        except Exception as e:
            print(f"Warning: Unexpected error loading question {category_folder}/{index}: {e}")
            return None

    def get_question(self, number):
        """Gets question data by absolute (1-based) number, or None if unknown."""
        mapping = self.get_question_category_and_index(number)
        if not mapping:
            return None
        return self.load_question_answer(mapping['category'], mapping['index'])

    def get_question_by_category_and_id(self, category, id):
        """Gets question data by category and ID (eg MFQ_30 and harm_1).

        Returns the question record dict, or None when the category folder is
        missing, no file matches ``id``, or loading fails (warning printed).
        """
        # NOTE: the parameter name ``id`` shadows the builtin; it is kept so
        # that existing keyword callers continue to work.
        questions_path = os.path.join(self.questions_dir, category)
        if not os.path.isdir(questions_path):
            return None
        try:
            for question_file in self._sorted_question_files(questions_path):
                if os.path.splitext(question_file)[0] == id:
                    return self._load_question_record(category, questions_path, question_file)
            return None
        except Exception as e:
            print(f"Warning: Unexpected error loading question {category}/{id}: {e}")
            return None

    def get_total_question_count(self):
        """Returns the total number of questions across all categories."""
        return self.total_questions

# --- Initialize Question Handler ---
# Qs is the shared handler used by the helper functions below; it is left as
# None when initialisation fails so that those helpers can fall back gracefully.
try:
    Qs = Question_Handler(MORAL_BENCH_REPO_DIR)
    print(f"Question Handler initialized. Found {Qs.get_total_question_count()} questions in {len(Qs.categories)} categories.")
    print(f"Available categories: {Qs.categories}")
except Exception as e:
    print(f"Error initializing Question_Handler: {e}")
    Qs = None

# Smoke test: look up one known question. Guarded because Qs is None when the
# initialisation above failed (the unguarded call raised AttributeError).
Qs.get_question_by_category_and_id('MFQ_30', 'harm_1') if Qs else None

In [None]:
# Show which category/index absolute question number 88 maps to.
# Guarded because Qs is None when the handler failed to initialise.
Qs.get_question_category_and_index(88) if Qs else None

# Qs.load_question_answer('MFQ_30', 88)

In [None]:
# making sure they all have question_id
if Qs:
    # Iterate over every question the handler indexed instead of a hard-coded 1..88.
    for i in range(1, Qs.get_total_question_count() + 1):
        q_info = Qs.get_question(i)  # Test the Question_Handler
        if q_info is None:
            # get_question returns None when the question cannot be loaded.
            print(f'Question {i}: could not be loaded')
            continue
        print(f'{q_info.keys()}')

In [None]:
# --- Helper Functions ---

def extract_category_from_id(question_id):
    """Extract the lower-case category prefix from a question ID.

    The prefix is the leading run of ASCII letters and underscores, with
    underscores turned into spaces and surrounding whitespace removed
    (e.g. 'fairness_3' -> 'fairness', 'in_group_2' -> 'in group').

    Special cases:
      * a prefix starting with 'mfq_' (any case) yields 'mfq_30';
      * non-string input, or an ID that does not start with a letter or
        underscore (e.g. '6_concepts_1'), yields 'Unknown'.
    """
    if not isinstance(question_id, str):
        return 'Unknown'
    match = re.match(r"[a-zA-Z_]+", question_id)
    if match is None:
        return 'Unknown'
    category_name = match.group(0).replace('_', ' ').lower()
    if category_name.startswith('mfq '):
        # Keep the dataset name (lower-cased like every other return value).
        return 'mfq_30'
    return category_name.strip()

def get_category_from_qnum(q_num):  # dataset category, e.g. 'MFQ_30'
    """Look up the dataset category of an absolute question number via Qs.

    Returns 'Unknown' when the global handler Qs is not initialised;
    otherwise whatever Qs.get_question_category returns for q_num.
    """
    if not Qs:
        return 'Unknown'
    return Qs.get_question_category(q_num)

def get_moralbench_scores(question_number, answer):
    """Return the score stored for `answer` on question `question_number`.

    Returns None when Qs is not initialised, the question cannot be loaded,
    its answer key is missing/empty, or `answer` is not in the answer key.
    """
    if not Qs:
        return None
    q_data = Qs.get_question(question_number)
    if not q_data:
        return None
    answer_key = q_data.get('answers')
    if not answer_key or answer not in answer_key:
        return None
    return answer_key[answer]

def get_question_id_from_qnum(q_num):  # question_id, e.g. 'fairness_3'
    """Look up the question ID of an absolute question number via Qs.

    Returns 'Unknown' when Qs is not initialised or the question is not found.
    """
    q_info = Qs.get_question(q_num) if Qs else None
    if not q_info or 'question_id' not in q_info:
        return 'Unknown'
    return q_info['question_id']

def get_moral_category_from_qnum(q_num):  # moral category prefix, e.g. 'harm'
    """Derive the moral-category prefix of an absolute question number.

    The question ID is fetched through Qs and passed to
    extract_category_from_id. Returns 'Unknown' when Qs is not initialised
    or the question is not found.
    """
    q_info = Qs.get_question(q_num) if Qs else None
    if not q_info or 'question_id' not in q_info:
        return 'Unknown'
    return extract_category_from_id(q_info['question_id'])

def safe_literal_eval(val):
    """Parse a Python literal (list, dict, ...) from `val` with ast.literal_eval.

    Returns the parsed object, or None when parsing raises ValueError,
    SyntaxError or TypeError.
    """
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError):
        return None
    return parsed

def load_and_preprocess_data(results_dir):
    """Loads all CSV files from a directory and preprocesses them.

    For every ``*.csv`` directly inside ``results_dir`` the file is classified
    as multi-agent (has an 'agent_responses' column) or single-agent (has
    'model_name' and 'run_index'), annotated with 'run_type', 'category',
    'question_id' and 'moral_category', and filtered to INCLUDED_DATASETS.
    The per-file frames are concatenated and three derived columns are added:
    'confidence_numeric', 'answer_clean' and 'score'.

    Parameters
    ----------
    results_dir : str
        Directory to scan (non-recursively) for CSV result files.

    Returns
    -------
    pandas.DataFrame
        The combined, preprocessed data; an empty DataFrame when the
        directory is missing or no rows survive loading and filtering.

    Notes
    -----
    Relies on the module-level ``Qs`` handler and ``INCLUDED_DATASETS``.
    Progress and warnings are reported with ``print``; per-file errors are
    caught, printed with a traceback, and the file is skipped.
    """
    all_data_rows = []
    print(f"Checking directory: {results_dir}")
    if not os.path.exists(results_dir):
        print(f"Warning: Directory not found: {results_dir}")
        return pd.DataFrame()

    print(f"Found directory: {results_dir}. Searching for CSV files...")
    found_csv = False
    # NOTE: os.listdir returns entries in arbitrary order, so the row order of
    # the combined frame depends on the filesystem.
    for filename in os.listdir(results_dir):
        if filename.endswith(".csv"):
            found_csv = True
            filepath = os.path.join(results_dir, filename)
            print(f"  Loading file: {filename}")
            try:
                df_raw = pd.read_csv(filepath)
                if df_raw.empty:
                    print(f"    Warning: File is empty: {filename}")
                    continue

                # Determine run type early based on columns
                is_multi_agent = 'agent_responses' in df_raw.columns
                is_single_agent = 'model_name' in df_raw.columns and 'run_index' in df_raw.columns and not is_multi_agent

                # --- Process based on run type ---
                if is_multi_agent:
                    print(f"    Processing as multi-agent data...")
                    df_raw['run_type'] = 'multi'
                    # Parse the stringified list of per-agent dicts, then explode
                    # it so that each agent response gets its own row.
                    df_raw['agent_responses_parsed'] = df_raw['agent_responses'].apply(safe_literal_eval)
                    df_exploded = df_raw.explode('agent_responses_parsed')
                    df_exploded = df_exploded.dropna(subset=['agent_responses_parsed']) # Drop rows where parsing failed or was empty

                    # Expand the dictionary into columns
                    agent_data = pd.json_normalize(df_exploded['agent_responses_parsed'])
                    # Both sides get a fresh 0..n-1 index so the column-wise concat
                    # aligns row-for-row.
                    df = pd.concat([df_exploded.drop(columns=['agent_responses', 'agent_responses_parsed']).reset_index(drop=True),
                                    agent_data.reset_index(drop=True)], axis=1)
                    print(f"    Exploded agent responses. Shape after explode: {df.shape}")

                elif is_single_agent:
                    print(f"    Processing as single-agent data...")
                    df = df_raw.copy() # Use the raw df directly
                    df['run_type'] = 'single'
                else:
                    print(f"    Warning: Could not determine run type for {filename}. Skipping.")
                    continue

                # --- Add Category and Moral Category (Common Logic) ---
                if 'question_num' in df.columns and Qs:
                    # Any 'question_id' column already present in the file is
                    # overwritten with the handler's value here.
                    df['category'] = df['question_num'].apply(get_category_from_qnum) # get the dataset category
                    df['question_id'] = df['question_num'].apply(get_question_id_from_qnum) # get the question_id
                    df['moral_category'] = df['question_num'].apply(get_moral_category_from_qnum) # get the moral category
                    print(f"    Extracted categories from 'question_num'. Unique values: {df['category'].unique()[:5]}...")
                elif 'question_id' in df.columns: # Fallback if question_num missing but question_id exists
                     # NOTE(review): 'category' receives the moral-category prefix here,
                     # not a dataset name, so the INCLUDED_DATASETS filter below will
                     # presumably drop these rows — confirm this is intended.
                     df['category'] = df['question_id'].apply(extract_category_from_id) # Attempt to get moral category
                     df['moral_category'] = df['question_id'].apply(extract_category_from_id) # Use same logic for moral category
                     # Try to infer dataset category if possible (might be less reliable)
                     if Qs:
                         # This requires reversing the map, might be slow/complex. Stick to moral category for now.
                         print("    Warning: 'question_num' missing. Using 'question_id' for moral category. Dataset category might be inaccurate.")
                     else:
                         print("    Warning: 'question_num' missing and Qs handler failed. Using 'question_id' for moral category.")
                else:
                    df['category'] = 'Unknown'
                    df['moral_category'] = 'Unknown'
                    df['question_id'] = 'Unknown'
                    print("    Warning: Could not determine category ('question_num' or 'question_id' missing, or Qs handler failed).")

                # --- Filter by Dataset ---
                initial_rows = len(df)
                df = df[df['category'].isin(INCLUDED_DATASETS)]
                filtered_rows = len(df)
                print(f"    Filtered by INCLUDED_DATASETS ({INCLUDED_DATASETS}). Kept {filtered_rows}/{initial_rows} rows.")

                if not df.empty:
                    all_data_rows.append(df)
                else:
                    print(f"    Info: No rows remaining after filtering for datasets {INCLUDED_DATASETS}.")

            except pd.errors.EmptyDataError:
                print(f"    Warning: Skipping empty file: {filename}")
            except Exception as e:
                # Best effort: report the failure and carry on with the next file.
                print(f"    Error loading or processing file {filename}: {e}")
                import traceback
                traceback.print_exc() # Print full traceback for debugging

    if not found_csv:
        print(f"Warning: No CSV files found in directory: {results_dir}")

    if not all_data_rows:
        print(f"No data loaded or retained from {results_dir} after processing and filtering. Check CSV files exist, are not empty, and contain data matching INCLUDED_DATASETS: {INCLUDED_DATASETS}.")
        return pd.DataFrame()

    print(f"Concatenating data from {len(all_data_rows)} files/dataframes.")
    combined_df = pd.concat(all_data_rows, ignore_index=True)

    # --- Data Cleaning (Common Logic) ---
    # Convert confidence to numeric, coercing errors to NaN.
    # 'extracted_confidence' takes precedence over 'confidence' when both exist.
    if 'extracted_confidence' in combined_df.columns:
        combined_df['confidence_numeric'] = pd.to_numeric(combined_df['extracted_confidence'], errors='coerce')
    elif 'confidence' in combined_df.columns:
         combined_df['confidence_numeric'] = pd.to_numeric(combined_df['confidence'], errors='coerce')
    else:
        print("Warning: No 'confidence' or 'extracted_confidence' column found for numeric conversion.")
        combined_df['confidence_numeric'] = np.nan # Add column as NaN

    # Clean up answer strings (remove surrounding whitespace and trailing periods).
    # astype(str) also turns missing values into their string form.
    if 'extracted_answer' in combined_df.columns:
        combined_df['answer_clean'] = combined_df['extracted_answer'].astype(str).str.strip().str.rstrip('.')
    elif 'answer' in combined_df.columns:
         combined_df['answer_clean'] = combined_df['answer'].astype(str).str.strip().str.rstrip('.')
    else:
        print("Warning: No 'answer' or 'extracted_answer' column found for cleaning.")
        combined_df['answer_clean'] = 'Unknown'

    # --- Calculate Score (Common Logic, requires 'question_num' and 'answer_clean') ---
    if Qs and 'question_num' in combined_df.columns and 'answer_clean' in combined_df.columns:
        print("    Calculating MoralBench scores...")
        # Row-wise lookup; get_moralbench_scores returns None when no score is found.
        combined_df['score'] = combined_df.apply(lambda row: get_moralbench_scores(row['question_num'], row['answer_clean']), axis=1)
        print(f"    Score calculation done. NaN count: {combined_df['score'].isna().sum()}")
    else:
        print("    Warning: Could not calculate scores ('question_num' or 'answer_clean' missing, or Qs handler failed).")
        combined_df['score'] = np.nan

    print(f"Finished loading and preprocessing for {results_dir}. Resulting dataframe shape: {combined_df.shape}")
    print(f"Columns: {combined_df.columns.tolist()}")
    return combined_df

In [None]:
test_multi_agent_file = 'star_1_0.csv'
print(f"\n--- Testing loading for specific file: {test_multi_agent_file} ---")

# NOTE: despite its name, this variable holds the path of a CSV file.
multi_agent_json_path = os.path.join(RESULTS_DIR_MULTI, test_multi_agent_file)
multi_agent_df = pd.read_csv(multi_agent_json_path)


def get_moral_category_from_id(question_id):
    """Map a question ID (e.g. 'harm_1') to its display moral category via dict_map."""
    return dict_map.get(extract_category_from_id(question_id), 'Unknown')


def _mean_and_sem(values):
    """Return {'score': mean, 'sem': std / sqrt(n)} of `values`; NaN for both when empty.

    The standard deviation is numpy's default (population, ddof=0).
    """
    if not values:
        return {"score": np.nan, "sem": np.nan}
    return {
        "score": np.mean(values),
        "sem": np.std(values) / np.sqrt(len(values)),
    }


def _summarize_agent_rounds(agent_responses, answer_scores):
    """Aggregate per-agent answers into per-round score/confidence statistics.

    A new round starts every time the first agent's model name shows up again.
    Only responses whose 'extracted_answer' is a key of `answer_scores` are
    counted; their confidence is counted too when it parses as a float.

    Returns (scores, confidences): two dicts keyed by 1-based round number
    whose values are {'score': mean, 'sem': standard error}.
    """
    scores, confidences = {}, {}
    round_scores, round_confidences = [], []
    round_num = 1
    first_agent = agent_responses[0]['agent_model']
    for i, response in enumerate(agent_responses):
        if i > 0 and response['agent_model'] == first_agent:
            # The first agent answers again: close the current round.
            scores[round_num] = _mean_and_sem(round_scores)
            confidences[round_num] = _mean_and_sem(round_confidences)
            round_num += 1
            round_scores, round_confidences = [], []

        answer = response['extracted_answer']
        if answer in answer_scores:
            round_scores.append(answer_scores[answer])
            try:
                round_confidences.append(float(response['extracted_confidence']))
            except (ValueError, TypeError):
                # Handle cases where confidence is not a valid float
                print(f"Warning: Invalid confidence value '{response['extracted_confidence']}' for answer '{answer}'. Skipping.")

    # Close the last (possibly only) round.
    scores[round_num] = _mean_and_sem(round_scores)
    confidences[round_num] = _mean_and_sem(round_confidences)
    return scores, confidences


# map question_id to category
multi_agent_df['category'] = multi_agent_df['question_id'].apply(get_moral_category_from_id)
# Rows whose question_num % 11 is in 1..6 are labelled '6_concepts', all others 'MFQ_30'.
# NOTE(review): the previous comment described a "mod 9 is 1 - 5 -> MFQ_30" rule that
# contradicted this code; confirm which rule matches how question_num was generated.
multi_agent_df['database'] = multi_agent_df['question_num'].apply(lambda x: '6_concepts' if x % 11 in [1, 2, 3, 4, 5, 6] else 'MFQ_30')

# One entry per question row: per-round score/confidence statistics.
all_scores = []
for _, row in multi_agent_df.iterrows():
    question_id = row['question_id']
    database = row['database']

    # Parse the agent responses (a stringified list of dicts)
    agent_responses_parsed = safe_literal_eval(row['agent_responses'])
    if not agent_responses_parsed:
        print(f"Warning: could not parse agent responses for {database}/{question_id}. Skipping row.")
        continue

    # e.g. Qs.get_question_by_category_and_id('MFQ_30', 'harm_1') holds the answer -> score key
    score_answer = Qs.get_question_by_category_and_id(database, question_id)
    answer_scores = score_answer.get('answers') if score_answer else None
    if answer_scores is None:
        # Previously this case was detected and then crashed with a TypeError.
        print(f"Warning: no answer key found for {database}/{question_id}. Skipping row.")
        continue

    scores, confidences = _summarize_agent_rounds(agent_responses_parsed, answer_scores)
    all_scores.append({
        'scores': scores,
        'confidences': confidences,
        'database': database,
        'category': row['category'],
    })
all_scores


In [None]:
# --- Test loading and preprocessing for a specific multi-agent file from scratch ---
test_multi_agent_file = 'ring_1.csv'
print(f"\n--- Testing loading for specific file: {test_multi_agent_file} ---")

# NOTE: despite its name, this variable holds the path of a CSV file.
multi_agent_json_path = os.path.join(RESULTS_DIR_MULTI, test_multi_agent_file)
multi_agent_df = pd.read_csv(multi_agent_json_path)
# Drop the first 2 rows. The explicit copy lets the column assignments below
# act on an independent frame instead of a slice (avoids SettingWithCopyWarning).
multi_agent_df = multi_agent_df.iloc[2:].copy()


def get_moral_category_from_id(question_id):
    """Map a question ID (e.g. 'harm_1') to its display moral category via dict_map."""
    return dict_map.get(extract_category_from_id(question_id), 'Unknown')


def _mean_and_sem(values):
    """Return {'score': mean, 'sem': std / sqrt(n)} of `values`; NaN for both when empty.

    The standard deviation is numpy's default (population, ddof=0).
    """
    if not values:
        return {"score": np.nan, "sem": np.nan}
    return {
        "score": np.mean(values),
        "sem": np.std(values) / np.sqrt(len(values)),
    }


def _summarize_agent_rounds(agent_responses, answer_scores):
    """Aggregate per-agent answers into per-round score/confidence statistics.

    A new round starts every time the first agent's model name shows up again.
    Only responses whose 'extracted_answer' is a key of `answer_scores` are
    counted; their confidence is counted too when it parses as a float.

    Returns (scores, confidences): two dicts keyed by 1-based round number
    whose values are {'score': mean, 'sem': standard error}.
    """
    scores, confidences = {}, {}
    round_scores, round_confidences = [], []
    round_num = 1
    first_agent = agent_responses[0]['agent_model']
    for i, response in enumerate(agent_responses):
        if i > 0 and response['agent_model'] == first_agent:
            # The first agent answers again: close the current round.
            scores[round_num] = _mean_and_sem(round_scores)
            confidences[round_num] = _mean_and_sem(round_confidences)
            round_num += 1
            round_scores, round_confidences = [], []

        answer = response['extracted_answer']
        if answer in answer_scores:
            round_scores.append(answer_scores[answer])
            try:
                round_confidences.append(float(response['extracted_confidence']))
            except (ValueError, TypeError):
                # Handle cases where confidence is not a valid float
                print(f"Warning: Invalid confidence value '{response['extracted_confidence']}' for answer '{answer}'. Skipping.")

    # Close the last (possibly only) round.
    scores[round_num] = _mean_and_sem(round_scores)
    confidences[round_num] = _mean_and_sem(round_confidences)
    return scores, confidences


# map question_id to category
multi_agent_df['category'] = multi_agent_df['question_id'].apply(get_moral_category_from_id)
# Rows whose question_num % 11 is in 1..6 are labelled '6_concepts', all others 'MFQ_30'.
# NOTE(review): the previous comment described a "mod 9 is 1 - 5 -> MFQ_30" rule that
# contradicted this code; confirm which rule matches how question_num was generated.
multi_agent_df['database'] = multi_agent_df['question_num'].apply(lambda x: '6_concepts' if x % 11 in [1, 2, 3, 4, 5, 6] else 'MFQ_30')

# One entry per question row: per-round score/confidence statistics.
# Rows are read directly from iterrows(), so no label-to-position arithmetic
# (the former `x = j-2`) is needed after dropping the first two rows.
all_scores = []
for _, row in multi_agent_df.iterrows():
    question_id = row['question_id']
    database = row['database']

    # Parse the agent responses (a stringified list of dicts)
    agent_responses_parsed = safe_literal_eval(row['agent_responses'])
    if not agent_responses_parsed:
        print(f"Warning: could not parse agent responses for {database}/{question_id}. Skipping row.")
        continue

    # e.g. Qs.get_question_by_category_and_id('MFQ_30', 'harm_1') holds the answer -> score key
    score_answer = Qs.get_question_by_category_and_id(database, question_id)
    answer_scores = score_answer.get('answers') if score_answer else None
    if answer_scores is None:
        # Previously this case was detected and then crashed with a TypeError.
        print(f"Warning: no answer key found for {database}/{question_id}. Skipping row.")
        continue

    scores, confidences = _summarize_agent_rounds(agent_responses_parsed, answer_scores)
    all_scores.append({
        'scores': scores,
        'confidences': confidences,
        'database': database,
        'category': row['category'],
    })
all_scores


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Which per-question statistic from all_scores to aggregate: 'scores' or 'confidences'.
variable = 'confidences'

figsize = (15, 8)
plt.rcParams.update({
    'font.size': 14,
    'axes.titlesize': 16,
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 14,
    'lines.linewidth': 2.5,
    'figure.titlesize': 18
})

# Group data as databases[database][category][iteration] -> list of
# {'score', 'sem'} dicts. Iterations 1-3 are always present (possibly empty).
databases = {}
for item in all_scores:
    per_category = databases.setdefault(item['database'], {})
    per_iteration = per_category.setdefault(item['category'], {1: [], 2: [], 3: []})
    for k, v in item[variable].items():
        per_iteration.setdefault(k, []).append(v)

# Collapse every list into a (summed score, summed sem) tuple; empty lists
# become (0, 0). The totals use dedicated names so that the builtin `sum`
# is no longer shadowed for the rest of the session.
for db, categories in databases.items():
    for category, per_iteration in categories.items():
        for k, v in per_iteration.items():
            if len(v) > 0:
                print(f"Database: {db}, Category: {category}, Iteration: {k}, Scores: {v}")
                total_score = np.sum([x['score'] for x in v])
                total_sem = np.sum([x['sem'] for x in v])
                # Replacing the value of an existing key while iterating is safe.
                per_iteration[k] = (total_score, total_sem)
            else:
                per_iteration[k] = (0, 0)


def set_plot_style(title_fontsize=16, label_fontsize=14, tick_fontsize=12, 
                   legend_fontsize=14, line_width=2.5):
    """Apply font-size and line-width settings to matplotlib's global rcParams.

    The figure title is set two points larger than `title_fontsize`, and
    `label_fontsize` doubles as the base font size.
    """
    style = {
        'font.size': label_fontsize,
        'axes.titlesize': title_fontsize,
        'axes.labelsize': label_fontsize,
        'legend.fontsize': legend_fontsize,
        'lines.linewidth': line_width,
        'figure.titlesize': title_fontsize + 2,
    }
    # Both tick axes share the same label size.
    for axis in ('xtick', 'ytick'):
        style[f'{axis}.labelsize'] = tick_fontsize
    plt.rcParams.update(style)

def get_consistent_colors(iterations):
    """Return a dict mapping each iteration to a plot colour.

    Iterations 1-6 map to fixed hex colours; any other value falls back to
    the matplotlib cycle reference 'C<iteration>'.
    """
    palette = (
        '#1f77b4',  # blue
        '#ff7f0e',  # orange
        '#2ca02c',  # green
        '#d62728',  # red
        '#9467bd',  # purple
        '#8c564b',  # brown
    )
    fixed_colors = dict(enumerate(palette, start=1))
    return {it: fixed_colors.get(it, f'C{it}') for it in iterations}

def plot_moral_radar_by_iteration(databases_data, figsize=(18, 9), show_sem=True, save_path=None):
    """
    Create radar plots for moral foundations by database and iteration.

    One polar subplot is drawn per database. Within a subplot each iteration
    is a closed line through one value per moral category, optionally
    surrounded by a shaded value +/- SEM band. A single legend shared by all
    subplots is placed at the bottom of the figure.

    Parameters:
    -----------
    databases_data : dict
        Nested mapping ``database -> category -> iteration -> (value, sem)``.
        A category that lacks an iteration is plotted as 0 for that iteration.
    figsize : tuple, default=(18, 9)
        Size of the figure (width, height)
    show_sem : bool, default=True
        Whether to show standard error as shaded region
    save_path : str, optional
        If given, the figure is also written to this path. The figure is
        never displayed by this function; the caller decides when to show it.

    Returns:
    --------
    fig : matplotlib figure
        The created figure with subplots

    Raises:
    -------
    ValueError
        If ``databases_data`` contains no databases.
    """
    # Dictionary to map moral_category to display names
    dict_map = {
        'authority': 'Authority',
        'fairness': 'Fairness',
        'harm': 'Care',  # Care/Harm
        'ingroup': 'Loyalty',  # Loyalty/Betrayal
        'purity': 'Sanctity',  # Sanctity/Degradation
        'liberty': 'Liberty'
    }

    databases = list(databases_data.keys())
    if not databases:
        # Fail before a figure is created; a zero-column subplot grid is
        # rejected by matplotlib anyway, but only after an empty figure exists.
        raise ValueError("databases_data is empty; there is nothing to plot")

    # Union of the iteration keys found in any category of any database
    all_iterations = set()
    for db_data in databases_data.values():
        for cat_data in db_data.values():
            all_iterations.update(cat_data.keys())
    iterations = sorted(all_iterations)

    # Get consistent colors for iterations
    color_map = get_consistent_colors(iterations)

    # squeeze=False always returns a 2-D array of axes, so the single-database
    # case needs no special handling; row 0 holds every subplot.
    fig, axes = plt.subplots(1, len(databases), figsize=figsize,
                             subplot_kw=dict(polar=True), squeeze=False)
    axes = axes[0]

    for ax, database in zip(axes, databases):
        db_data = databases_data[database]
        moral_cats = sorted(db_data.keys())

        # Set title for the subplot with padding to avoid overlap
        ax.set_title(f"Database: {database}", pad=20)

        if not moral_cats:
            # Nothing to draw for this database; leave a titled, empty subplot
            # instead of failing on an empty value array further down.
            continue

        # One spoke per category; repeat the first angle to close the polygon
        angles = np.linspace(0, 2*np.pi, len(moral_cats), endpoint=False).tolist()
        angles += angles[:1]

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels([dict_map.get(cat.lower(), cat) for cat in moral_cats])

        # Largest value + SEM in this database, used to scale the radial axis
        max_val = 0
        for cat in moral_cats:
            for mean, sem in db_data[cat].values():
                if mean + sem > max_val:
                    max_val = mean + sem
        if max_val <= 0:
            # All-zero (or missing) data would collapse the radial axis to a
            # point and yield five identical ticks; fall back to a unit scale.
            max_val = 1.0

        # Set y-axis limits with some margin
        ax.set_ylim(0, max_val * 1.2)

        # Five evenly spaced radial grid lines, the outermost one at max_val
        rticks = [step * max_val / 5 for step in range(1, 5)] + [max_val]
        ax.set_rticks(rticks)
        ax.set_yticklabels([f"{tick:.3f}" for tick in rticks])
        ax.grid(True)

        for iteration in iterations:
            # (value, sem) per category; categories without this iteration count as 0
            stats = [db_data[cat].get(iteration, (0, 0)) for cat in moral_cats]
            means = np.array([mean for mean, _ in stats])
            sems = np.array([sem for _, sem in stats])

            # Plot the closed mean line with a color that is stable across subplots
            ax.plot(angles, np.append(means, means[0]), color=color_map[iteration],
                    label=f"Iteration {iteration}")

            if show_sem:
                upper_bound = means + sems
                lower_bound = np.maximum(means - sems, 0)  # Ensure no negative values

                # Close both bounds the same way as the mean line, then shade between them
                ax.fill_between(angles,
                                np.append(lower_bound, lower_bound[0]),
                                np.append(upper_bound, upper_bound[0]),
                                alpha=0.2, color=color_map[iteration])

    # Every populated subplot carries the same iteration entries, so the first
    # subplot that has any supplies the shared legend.
    handles, legend_labels = [], []
    for ax in axes:
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            break
    if handles:
        fig.legend(handles, legend_labels, loc='lower center', bbox_to_anchor=(0.5, 0.02),
                   ncol=len(iterations))

    # Operate on `fig` explicitly rather than on pyplot's implicit current figure.
    # Adjust layout to prevent overlap (space is reserved at the bottom for the legend)
    fig.tight_layout(rect=[0, 0.08, 1, 0.95])

    # Add more space between subplots
    fig.subplots_adjust(wspace=0.3)

    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches='tight')

    return fig

# Example usage: apply the shared plot style, then build and display the
# radar figure for the aggregated `databases` data.
set_plot_style(
    title_fontsize=16,
    label_fontsize=14,
    tick_fontsize=12,
    legend_fontsize=14,
    line_width=2.5,
)
fig = plot_moral_radar_by_iteration(
    databases,
    show_sem=True,
)
plt.show()
