# Analysis of Graded Agent Responses

This notebook loads classification results generated by `grade.py` (JSONL files) from multiple result directories and visualizes various aspects of the data, including the distribution of rationale categories, off-topic responses, and performance metrics by agent model.

The notebook automatically detects and processes:
- Single-agent results from the `results/` directory
- Multi-agent results from the `results_multi/` directory
- Multi-agent star topology results from the `results_multi_star/` directory
- Any other `results_*` directories found

It then:
- Generates separate visualizations for each dataset type
- Creates aggregate comparisons across all types

In [5]:
import pandas as pd
import json
import os
import logging
import numpy as np
import glob
from pathlib import Path

# --- Configuration ---
# Directory that is scanned for `results*` sub-directories.
BASE_DIR = "/Users/ram/Github/wisdom_agents/"

# --- Logging Setup ---
# logging.basicConfig() does nothing while the root logger already has
# handlers, so detach any that are attached before configuring it.
for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)
logging.basicConfig(
    format='%(asctime)s - NOTEBOOK - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# --- Function to discover result directories ---
def discover_result_directories(base_dir):
    """Automatically discover all results directories.

    Scans the immediate children of ``base_dir`` for directories whose name
    starts with ``results`` and assigns each one a data-type label:

    * ``results`` -> ``single_agent``
    * ``results_multi`` -> ``multi_agent``
    * ``results_multi_star`` -> ``multi_agent_star``
    * anything else -> the directory name with every ``results_`` removed
      (and any remaining ``results`` replaced by ``mixed``)

    Args:
        base_dir: Path of the directory to scan.

    Returns:
        dict mapping data-type label to the directory's path, inserted in
        sorted directory-name order. Empty when ``base_dir`` cannot be
        listed (a warning is logged instead of raising).
    """
    known_types = {
        'results': 'single_agent',
        'results_multi': 'multi_agent',
        'results_multi_star': 'multi_agent_star',
    }
    result_dirs = {}

    try:
        # os.listdir() returns entries in arbitrary order; sort them so the
        # discovery order is reproducible from run to run.
        entries = sorted(os.listdir(base_dir))
    except OSError as exc:
        logging.warning(f"Cannot list base directory {base_dir}: {exc}")
        return result_dirs

    for item in entries:
        item_path = os.path.join(base_dir, item)
        if not (item.startswith('results') and os.path.isdir(item_path)):
            continue

        data_type = known_types.get(item)
        if data_type is None:
            # For any other results* directory, derive the type from its name
            data_type = item.replace('results_', '').replace('results', 'mixed')

        if data_type in result_dirs:
            # Two directory names can map to one label (e.g. `results` and
            # `results_single_agent`); keep the first and report the clash
            # rather than silently overwriting it.
            logging.warning(
                f"Skipping results directory {item}: type {data_type} "
                f"already mapped to {result_dirs[data_type]}"
            )
            continue

        result_dirs[data_type] = item_path
        logging.info(f"Found results directory: {item} -> {data_type}")

    return result_dirs

# --- Function to Load Classification Results ---
def load_classification_files(directory, file_pattern="*_classification.jsonl"):
    """Load all classification JSONL files from a directory.

    Despite the name, this only collects paths; the files are not read here.

    Args:
        directory: Directory to search (not recursive).
        file_pattern: Glob pattern, relative to ``directory``, that selects
            the files.

    Returns:
        Sorted list of matching paths. Empty when nothing matches or when
        ``directory`` does not exist (a warning is logged in that case).
    """
    if not os.path.exists(directory):
        logging.warning(f"Directory {directory} does not exist")
        return []

    pattern = os.path.join(directory, file_pattern)
    # glob.glob() gives no ordering guarantee; sort so that the order in
    # which files are later concatenated is reproducible.
    classification_files = sorted(glob.glob(pattern))
    logging.info(f"Found {len(classification_files)} classification files in {directory}")
    return classification_files

def load_and_label_data(files, data_type):
    """Read each JSONL file in ``files`` and stack the records into one frame.

    Every successfully loaded, non-empty file contributes its rows together
    with two extra columns: ``data_type`` (the label passed in) and
    ``source_file`` (the file's base name). Files that are empty or fail to
    load are logged and skipped.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when no file
        yielded any rows.
    """
    frames = []
    for path in files:
        try:
            frame = pd.read_json(path, lines=True)
            if frame.empty:
                logging.warning(f"Empty file: {path}")
                continue
            file_name = os.path.basename(path)
            frame['data_type'] = data_type
            frame['source_file'] = file_name
            frames.append(frame)
            logging.info(f"Loaded {len(frame)} records from {file_name}")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            logging.error(f"Error loading {path}: {e}")

    if not frames:
        logging.warning(f"No data loaded for {data_type}")
        return pd.DataFrame()

    combined_df = pd.concat(frames, ignore_index=True)
    logging.info(f"Combined {len(combined_df)} total records for {data_type}")
    return combined_df

# --- Discover and Load Data from All Result Directories ---
result_directories = discover_result_directories(BASE_DIR)
datasets = {}

print("=== DISCOVERED RESULT DIRECTORIES ===")
for data_type, directory in result_directories.items():
    print(f"{data_type}: {directory}")

print("\n=== LOADING DATA ===")
for data_type, directory in result_directories.items():
    files = load_classification_files(directory)
    if not files:
        # Record an empty frame so the type still appears in the summary below.
        datasets[data_type] = pd.DataFrame()
        print(f"{data_type}: No files found")
        continue
    datasets[data_type] = load_and_label_data(files, data_type)
    record_count = len(datasets[data_type])
    print(f"{data_type}: {record_count} records from {len(files)} files")

# --- Combine All Data ---
df_all = pd.DataFrame()
non_empty_datasets = {
    name: frame for name, frame in datasets.items() if not frame.empty
}

if not non_empty_datasets:
    logging.warning("No classification data found in any directory")
    print("No classification data found in any directory")
else:
    df_all = pd.concat(non_empty_datasets.values(), ignore_index=True)
    logging.info(f"Combined dataset: {len(df_all)} total records")

    print("\n=== DATA LOADING SUMMARY ===")
    for data_type, df in datasets.items():
        print(f"{data_type}: {len(df)} records")
    print(f"Total combined records: {len(df_all)}")

    if not df_all.empty:
        preview_columns = ['data_type', 'source_file', 'question_id']
        print("\nData type distribution:")
        print(df_all['data_type'].value_counts())
        print("\nSample of combined data:")
        print(df_all[preview_columns].head(10))

2025-05-22 16:51:07,474 - NOTEBOOK - INFO - Found results directory: results_ous_multi -> ous_multi
2025-05-22 16:51:07,478 - NOTEBOOK - INFO - Found results directory: results_multi_star -> multi_agent_star
2025-05-22 16:51:07,480 - NOTEBOOK - INFO - Found results directory: results_ous -> ous
2025-05-22 16:51:07,481 - NOTEBOOK - INFO - Found results directory: results -> single_agent
2025-05-22 16:51:07,478 - NOTEBOOK - INFO - Found results directory: results_multi_star -> multi_agent_star
2025-05-22 16:51:07,480 - NOTEBOOK - INFO - Found results directory: results_ous -> ous
2025-05-22 16:51:07,481 - NOTEBOOK - INFO - Found results directory: results -> single_agent
2025-05-22 16:51:07,482 - NOTEBOOK - INFO - Found results directory: results_multi -> multi_agent
2025-05-22 16:51:07,488 - NOTEBOOK - INFO - Found 0 classification files in /Users/ram/Github/wisdom_agents/results_ous_multi
2025-05-22 16:51:07,494 - NOTEBOOK - INFO - Found 0 classification files in /Users/ram/Github/wisd

=== DISCOVERED RESULT DIRECTORIES ===
ous_multi: /Users/ram/Github/wisdom_agents/results_ous_multi
multi_agent_star: /Users/ram/Github/wisdom_agents/results_multi_star
ous: /Users/ram/Github/wisdom_agents/results_ous
single_agent: /Users/ram/Github/wisdom_agents/results
multi_agent: /Users/ram/Github/wisdom_agents/results_multi

=== LOADING DATA ===
ous_multi: No files found
multi_agent_star: No files found
ous: No files found
single_agent: 900 records from 1 files


2025-05-22 16:51:07,776 - NOTEBOOK - INFO - Loaded 25690 records from ggb_qwen-2.5-7b-instruct_ring_ensemble_260486c5_q1-90_n12_classification.jsonl
2025-05-22 16:51:07,777 - NOTEBOOK - INFO - Combined 25690 total records for multi_agent
2025-05-22 16:51:07,777 - NOTEBOOK - INFO - Combined 25690 total records for multi_agent
2025-05-22 16:51:07,822 - NOTEBOOK - INFO - Combined dataset: 26590 total records
2025-05-22 16:51:07,822 - NOTEBOOK - INFO - Combined dataset: 26590 total records


multi_agent: 25690 records from 1 files

=== DATA LOADING SUMMARY ===
ous_multi: 0 records
multi_agent_star: 0 records
ous: 0 records
single_agent: 900 records
multi_agent: 25690 records
Total combined records: 26590

Data type distribution:
data_type
multi_agent     25690
single_agent      900
Name: count, dtype: int64

Sample of combined data:
      data_type                                        source_file  \
0  single_agent  single_anthropic_claude-3.5-haiku_q1-90_n10_cl...   
1  single_agent  single_anthropic_claude-3.5-haiku_q1-90_n10_cl...   
2  single_agent  single_anthropic_claude-3.5-haiku_q1-90_n10_cl...   
3  single_agent  single_anthropic_claude-3.5-haiku_q1-90_n10_cl...   
4  single_agent  single_anthropic_claude-3.5-haiku_q1-90_n10_cl...   
5  single_agent  single_anthropic_claude-3.5-haiku_q1-90_n10_cl...   
6  single_agent  single_anthropic_claude-3.5-haiku_q1-90_n10_cl...   
7  single_agent  single_anthropic_claude-3.5-haiku_q1-90_n10_cl...   
8  single_agent  singl

In [6]:
# --- Data Preparation Function ---
def prepare_analysis_data(df, data_label=""):
    """Prepare data for analysis by separating errors from valid classifications.

    A record is an *error* when its ``error_type`` is set, and *valid* when
    ``error_type`` is unset and ``selected_categories`` is set. If either
    column is absent it is added (all missing) on a copy, so the caller's
    frame is never modified.

    Args:
        df: DataFrame of classification records.
        data_label: Prefix used in log messages.

    Returns:
        Tuple ``(df_valid, df_exploded, df_errors)``:

        * ``df_valid`` - valid records, with ``selected_categories`` coerced
          to lists plus ``extracted_answer_numeric`` and
          ``extracted_confidence_numeric`` columns (NaN where the source
          column is absent or not numeric).
        * ``df_exploded`` - ``df_valid`` with one row per selected category;
          a record with an empty list contributes a single NaN row. It is
          built before the numeric columns are added, so it does not carry
          them. Empty DataFrame when there are no valid records.
        * ``df_errors`` - the error records.

        Three empty DataFrames are returned when ``df`` itself is empty.
    """
    if df.empty:
        logging.warning(f"Empty DataFrame provided for {data_label}")
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # Add any missing required column on a copy so the input stays untouched.
    missing_columns = [
        column for column in ('error_type', 'selected_categories')
        if column not in df.columns
    ]
    if missing_columns:
        df = df.copy()
        if 'error_type' in missing_columns:
            df['error_type'] = pd.NA
            logging.info(f"{data_label}: Added missing 'error_type' column with NaN values")
        if 'selected_categories' in missing_columns:
            # Without this column nothing can count as a valid classification;
            # adding it as all-missing avoids a KeyError below.
            df['selected_categories'] = pd.NA
            logging.warning(
                f"{data_label}: Missing 'selected_categories' column; "
                f"no records can be treated as valid"
            )

    # Separate error records from valid classifications
    df_errors = df[df['error_type'].notna()].copy()
    df_valid = df[df['selected_categories'].notna() & df['error_type'].isna()].copy()

    if not df_valid.empty:
        # Ensure selected_categories is a list. Anything that is not already
        # a list becomes []; pd.isna() is deliberately not called here because
        # it returns an array for sequence values, which cannot be used as a
        # condition.
        df_valid['selected_categories'] = df_valid['selected_categories'].apply(
            lambda x: x if isinstance(x, list) else []
        )

        # Create exploded categories DataFrame
        df_exploded = df_valid.explode('selected_categories')

        # Convert numeric columns; unparseable values become NaN
        numeric_columns = {
            'extracted_answer': 'extracted_answer_numeric',
            'extracted_confidence': 'extracted_confidence_numeric',
        }
        for source_column, numeric_column in numeric_columns.items():
            if source_column in df_valid.columns:
                df_valid[numeric_column] = pd.to_numeric(df_valid[source_column], errors='coerce')
            else:
                df_valid[numeric_column] = np.nan
    else:
        df_exploded = pd.DataFrame()

    logging.info(f"{data_label}: {len(df_valid)} valid records, {len(df_errors)} error records, {len(df_exploded)} exploded category records")
    return df_valid, df_exploded, df_errors

# --- Prepare Data for Each Dataset Type ---
prepared_datasets = {}

for data_type, df in datasets.items():
    if df.empty:
        continue
    analysis_df, exploded_df, errors_df = prepare_analysis_data(
        df, data_type.replace('_', '-').title()
    )
    prepared_datasets[data_type] = dict(
        analysis=analysis_df,
        exploded=exploded_df,
        errors=errors_df,
    )

# Also prepare combined dataset
if not df_all.empty:
    df_all_analysis, df_all_exploded, df_all_errors = prepare_analysis_data(df_all, "Combined")
    prepared_datasets['combined'] = dict(
        analysis=df_all_analysis,
        exploded=df_all_exploded,
        errors=df_all_errors,
    )

# --- Summary Statistics ---
print("=== DATA PREPARATION SUMMARY ===")

for data_type, data_dict in prepared_datasets.items():
    analysis_df = data_dict['analysis']
    exploded_df = data_dict['exploded']
    errors_df = data_dict['errors']

    heading = data_type.replace('_', '-').title()
    print(f"\n{heading}:")
    print(f"  Valid classifications: {len(analysis_df)}")
    print(f"  Exploded categories: {len(exploded_df)}")
    print(f"  Processing errors: {len(errors_df)}")

    # The per-column details are only reported when there are valid records.
    if analysis_df.empty:
        continue

    print(f"  Unique questions: {analysis_df['question_id'].nunique()}")
    if 'agent_model' in analysis_df.columns:
        print(f"  Unique models: {analysis_df['agent_model'].nunique()}")
    if not errors_df.empty:
        print(f"  Error types: {errors_df['error_type'].nunique()}")

2025-05-22 16:51:07,849 - NOTEBOOK - INFO - Single-Agent: Added missing 'error_type' column with NaN values
2025-05-22 16:51:07,866 - NOTEBOOK - INFO - Single-Agent: 900 valid records, 0 error records, 2512 exploded category records
2025-05-22 16:51:07,866 - NOTEBOOK - INFO - Single-Agent: 900 valid records, 0 error records, 2512 exploded category records
2025-05-22 16:51:08,006 - NOTEBOOK - INFO - Multi-Agent: 25680 valid records, 10 error records, 48932 exploded category records
2025-05-22 16:51:08,006 - NOTEBOOK - INFO - Multi-Agent: 25680 valid records, 10 error records, 48932 exploded category records
2025-05-22 16:51:08,136 - NOTEBOOK - INFO - Combined: 26580 valid records, 10 error records, 51444 exploded category records
2025-05-22 16:51:08,136 - NOTEBOOK - INFO - Combined: 26580 valid records, 10 error records, 51444 exploded category records


=== DATA PREPARATION SUMMARY ===

Single-Agent:
  Valid classifications: 900
  Exploded categories: 2512
  Processing errors: 0
  Unique questions: 90
  Unique models: 1

Multi-Agent:
  Valid classifications: 25680
  Exploded categories: 48932
  Processing errors: 10
  Unique questions: 90
  Unique models: 1
  Error types: 1

Combined:
  Valid classifications: 26580
  Exploded categories: 51444
  Processing errors: 10
  Unique questions: 90
  Unique models: 2
  Error types: 1


## Visualization of Graded Rationale Classifications

The following plots visualize the distribution of classified rationale categories, off-topic responses, answer scores, and other metrics. Visualizations are generated for each discovered dataset type:

1. **Single-agent results** - Individual model responses (from `results/`)
2. **Multi-agent results** - Group conversation responses (from `results_multi/`)
3. **Multi-agent star results** - Star topology conversations (from `results_multi_star/`)
4. **Other discovered datasets** - Any additional `results_*` directories
5. **Combined analysis** - Aggregate view across all types

In [7]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

# Set theme for seaborn plots ("whitegrid" style).
# NOTE(review): the figures built in this cell are all created with plotly;
# confirm whether any seaborn/matplotlib plot still relies on this theme.
sns.set_theme(style="whitegrid")

# --- Visualization Functions ---
def plot_category_distribution(df_exploded, title_prefix=""):
    """Build a bar chart of how often each rationale category was selected.

    Args:
        df_exploded: Frame with one row per selected category, in the
            ``selected_categories`` column.
        title_prefix: Text placed in front of the chart title and used in
            the "no data" messages.

    Returns:
        The plotly figure, or None (after printing a message) when the frame
        is empty, lacks the column, or holds only missing categories.
    """
    has_column = (not df_exploded.empty) and 'selected_categories' in df_exploded.columns
    if not has_column:
        print(f"No data available for category distribution: {title_prefix}")
        return None

    categories = df_exploded['selected_categories'].dropna()
    if categories.empty:
        print(f"No valid categories found: {title_prefix}")
        return None

    category_counts = categories.value_counts().reset_index()
    category_counts.columns = ['category', 'count']

    chart_title = f'{title_prefix} Distribution of Selected Rationale Categories'
    axis_labels = {'category': 'Rationale Category', 'count': 'Frequency'}
    fig = px.bar(
        category_counts,
        x='category',
        y='count',
        title=chart_title,
        labels=axis_labels,
        height=600,
    )
    # Slant the tick labels: category names are long.
    fig.update_layout(xaxis_tickangle=-45)
    return fig

def plot_off_topic_distribution(df_analysis, title_prefix=""):
    """Build a donut chart of the ``is_response_off_topic`` values.

    Missing values are counted as their own slice.

    Returns:
        The plotly figure, or None (after printing a message) when the frame
        is empty or has no ``is_response_off_topic`` column.
    """
    if df_analysis.empty or 'is_response_off_topic' not in df_analysis.columns:
        print(f"No off-topic data available: {title_prefix}")
        return None

    flag_counts = df_analysis['is_response_off_topic'].value_counts(dropna=False)
    off_topic_counts = flag_counts.reset_index()
    off_topic_counts.columns = ['is_off_topic', 'count']

    return px.pie(
        off_topic_counts,
        names='is_off_topic',
        values='count',
        title=f'{title_prefix} Distribution of Off-Topic Responses',
        hole=0.3,
    )

def plot_answer_distribution(df_analysis, title_prefix=""):
    """Build a histogram of the numeric answer scores.

    Reads the ``extracted_answer_numeric`` column and ignores missing values.

    Returns:
        The plotly figure, or None (after printing a message) when the frame
        is empty, lacks the column, or contains no non-missing score.
    """
    score_column = 'extracted_answer_numeric'

    if df_analysis.empty or score_column not in df_analysis.columns:
        print(f"No answer score data available: {title_prefix}")
        return None

    has_score = df_analysis[score_column].notna()
    valid_answers = df_analysis[has_score]
    if valid_answers.empty:
        print(f"No valid answer scores found: {title_prefix}")
        return None

    return px.histogram(
        valid_answers,
        x=score_column,
        title=f'{title_prefix} Distribution of Answer Scores',
        labels={'extracted_answer_numeric': 'Answer Score'},
        nbins=7,
    )

def plot_categories_by_model(df_exploded, title_prefix=""):
    """Build a grouped bar chart of category frequency per agent model.

    Args:
        df_exploded: Frame with one row per selected category; needs the
            ``agent_model`` and ``selected_categories`` columns.
        title_prefix: Text placed in front of the chart title and used in
            the "no data" messages.

    Returns:
        The plotly figure, or None (after printing a message) when the
        required data is not available.
    """
    required_columns = ('agent_model', 'selected_categories')
    columns_present = all(name in df_exploded.columns for name in required_columns)
    if df_exploded.empty or not columns_present:
        print(f"No model/category data available: {title_prefix}")
        return None

    has_category = df_exploded['selected_categories'].notna()
    valid_data = df_exploded[has_category]
    if valid_data.empty:
        print(f"No valid model/category data found: {title_prefix}")
        return None

    grouped = valid_data.groupby(['agent_model', 'selected_categories'])
    categories_by_model = grouped.size().reset_index(name='count')
    if categories_by_model.empty:
        print(f"No aggregated model/category data: {title_prefix}")
        return None

    fig = px.bar(
        categories_by_model,
        x='selected_categories',
        y='count',
        color='agent_model',
        barmode='group',
        title=f'{title_prefix} Rationale Categories by Agent Model',
        labels={'selected_categories': 'Rationale Category', 'count': 'Frequency'},
        height=700,
    )
    fig.update_layout(xaxis_tickangle=-45)
    return fig

# --- Generate Visualizations for Each Dataset ---
for data_type, data_dict in prepared_datasets.items():
    analysis_df, exploded_df = data_dict['analysis'], data_dict['exploded']

    if analysis_df.empty:
        print(f"\n=== {data_type.replace('_', ' ').upper()} VISUALIZATIONS ===")
        print(f"No data available for {data_type} visualizations")
        continue

    dataset_name = data_type.replace('_', '-').title()
    print(f"\n=== {dataset_name.upper()} VISUALIZATIONS ===")
    title_prefix = f"{dataset_name} -"

    # Plots 1-4, in display order, each paired with the frame it reads:
    # category distribution, off-topic distribution, answer scores,
    # categories by model.
    standard_plots = (
        (plot_category_distribution, exploded_df),
        (plot_off_topic_distribution, analysis_df),
        (plot_answer_distribution, analysis_df),
        (plot_categories_by_model, exploded_df),
    )
    for make_plot, frame in standard_plots:
        fig = make_plot(frame, title_prefix)
        if fig:
            fig.show()

    # 5. Average Answer by Question ID
    has_scores_per_question = (
        'extracted_answer_numeric' in analysis_df.columns
        and 'question_id' in analysis_df.columns
        and analysis_df['extracted_answer_numeric'].notna().any()
    )
    if not has_scores_per_question:
        continue

    scores_by_question = analysis_df.groupby('question_id')['extracted_answer_numeric']
    avg_answer_by_qid = scores_by_question.mean().reset_index()
    if not avg_answer_by_qid.empty:
        fig = px.bar(
            avg_answer_by_qid,
            x='question_id',
            y='extracted_answer_numeric',
            title=f'{dataset_name} - Average Answer Score by Question ID',
            labels={'question_id': 'Question ID', 'extracted_answer_numeric': 'Average Answer Score'},
        )
        # Treat question IDs as labels, not as a numeric axis.
        fig.update_layout(xaxis_type='category')
        fig.show()


=== SINGLE-AGENT VISUALIZATIONS ===



=== MULTI-AGENT VISUALIZATIONS ===



=== COMBINED VISUALIZATIONS ===


In [8]:
# --- Comparative Analysis Between All Dataset Types ---
# Compares the individually prepared datasets side by side: top categories,
# answer scores, off-topic rates, and per-model average scores.
print("\n=== COMPARATIVE ANALYSIS ===")

# Filter out combined dataset for comparison (we'll show it separately)
comparison_datasets = {k: v for k, v in prepared_datasets.items() if k != 'combined'}

if len(comparison_datasets) >= 2:
    print(f"Comparing {len(comparison_datasets)} dataset types")

    # 1. Side-by-side Category Comparison
    valid_exploded_datasets = {k: v['exploded'] for k, v in comparison_datasets.items()
                              if not v['exploded'].empty and 'selected_categories' in v['exploded'].columns}

    if len(valid_exploded_datasets) >= 2:
        print("\n1. Category Distribution Comparison")

        # Create subplots for category comparison
        n_datasets = len(valid_exploded_datasets)
        cols = min(3, n_datasets)  # Max 3 columns
        rows = (n_datasets + cols - 1) // cols  # Ceiling division: rows needed

        fig = make_subplots(
            rows=rows, cols=cols,
            subplot_titles=[k.replace('_', '-').title() for k in valid_exploded_datasets.keys()],
            specs=[[{"type": "bar"} for _ in range(cols)] for _ in range(rows)]
        )

        for i, (data_type, exploded_df) in enumerate(valid_exploded_datasets.items()):
            # Fill the grid row by row; plotly subplot indices are 1-based.
            row = (i // cols) + 1
            col = (i % cols) + 1

            category_counts = exploded_df['selected_categories'].value_counts().head(10)

            fig.add_trace(
                go.Bar(x=category_counts.index, y=category_counts.values,
                      name=data_type.replace('_', '-').title()),
                row=row, col=col
            )

        fig.update_layout(
            title_text="Top 10 Rationale Categories by Dataset Type",
            height=400 * rows,
            showlegend=False
        )
        fig.update_xaxes(tickangle=-45)
        fig.show()

    # 2. Answer Score Comparison
    valid_analysis_datasets = {k: v['analysis'] for k, v in comparison_datasets.items()
                              if not v['analysis'].empty and 'extracted_answer_numeric' in v['analysis'].columns}

    if len(valid_analysis_datasets) >= 2:
        print("\n2. Answer Score Distribution Comparison")

        # Build one small frame per dataset in a single vectorized step
        # instead of appending one dict per score in a Python loop.
        score_frames = []
        for data_type, analysis_df in valid_analysis_datasets.items():
            scores = analysis_df['extracted_answer_numeric'].dropna()
            if scores.empty:
                continue
            score_frames.append(pd.DataFrame({
                'score': scores.to_numpy(),
                'dataset_type': data_type.replace('_', '-').title(),
            }))

        if score_frames:
            comparison_df = pd.concat(score_frames, ignore_index=True)

            fig = px.box(comparison_df, x='dataset_type', y='score',
                        title='Answer Score Distribution by Dataset Type',
                        labels={'dataset_type': 'Dataset Type', 'score': 'Answer Score'})
            fig.show()

    # 3. Off-topic Response Comparison
    print("\n3. Off-topic Response Rate Comparison")

    off_topic_comparison_data = []
    for data_type, data_dict in comparison_datasets.items():
        analysis_df = data_dict['analysis']
        if not analysis_df.empty and 'is_response_off_topic' in analysis_df.columns:
            # Percentage of valid records flagged off-topic.
            off_topic_rate = (analysis_df['is_response_off_topic'].sum() / len(analysis_df)) * 100
            off_topic_comparison_data.append({
                'dataset_type': data_type.replace('_', '-').title(),
                'off_topic_rate': off_topic_rate,
                'total_responses': len(analysis_df)
            })

    if off_topic_comparison_data:
        off_topic_df = pd.DataFrame(off_topic_comparison_data)

        fig = px.bar(off_topic_df, x='dataset_type', y='off_topic_rate',
                    title='Off-Topic Response Rates by Dataset Type (%)',
                    labels={'dataset_type': 'Dataset Type', 'off_topic_rate': 'Off-Topic Rate (%)'},
                    text='total_responses')
        fig.update_traces(texttemplate='n=%{text}', textposition="outside")
        fig.show()

    # 4. Model Performance Comparison (if applicable)
    print("\n4. Model Performance Comparison")

    model_performance_data = []
    for data_type, data_dict in comparison_datasets.items():
        analysis_df = data_dict['analysis']
        if (not analysis_df.empty and 'agent_model' in analysis_df.columns and
            'extracted_answer_numeric' in analysis_df.columns):

            model_perf = analysis_df.groupby('agent_model')['extracted_answer_numeric'].agg(['mean', 'count']).reset_index()
            model_perf['dataset_type'] = data_type.replace('_', '-').title()
            model_performance_data.append(model_perf)

    if model_performance_data:
        combined_model_perf = pd.concat(model_performance_data, ignore_index=True)

        # Filter to models that appear in multiple datasets: each dataset
        # contributes at most one row per model, so a row count above 1
        # means the model occurs in more than one dataset.
        model_counts = combined_model_perf['agent_model'].value_counts()
        common_models = model_counts[model_counts > 1].index

        if len(common_models) > 0:
            filtered_perf = combined_model_perf[combined_model_perf['agent_model'].isin(common_models)]

            fig = px.bar(filtered_perf, x='agent_model', y='mean',
                        color='dataset_type', barmode='group',
                        title='Average Answer Score by Model and Dataset Type',
                        labels={'agent_model': 'Agent Model', 'mean': 'Average Answer Score'},
                        text='count')
            fig.update_traces(texttemplate='n=%{text}', textposition="outside")
            fig.update_layout(xaxis_tickangle=-45)
            fig.show()

else:
    print("Not enough datasets for comparative analysis (need at least 2 non-empty datasets)")

print("\n=== ANALYSIS COMPLETE ===")


=== COMPARATIVE ANALYSIS ===
Comparing 2 dataset types

1. Category Distribution Comparison



2. Answer Score Distribution Comparison



3. Off-topic Response Rate Comparison



4. Model Performance Comparison

=== ANALYSIS COMPLETE ===


In [9]:
# --- Summary Statistics Table ---
# Banner printed at the top of this cell's output.
print("=== SUMMARY STATISTICS ===")

def generate_summary_stats(df_analysis, df_exploded, label):
    """Collect one row of headline statistics for a dataset.

    Args:
        df_analysis: Valid classification records.
        df_exploded: The same records with one row per selected category.
        label: Display name stored under the ``Dataset`` key.

    Returns:
        dict keyed by column title, or None when ``df_analysis`` is empty.
        A statistic whose source column is absent is reported as ``'N/A'``.
    """
    if df_analysis.empty:
        return None

    not_available = 'N/A'
    total_responses = len(df_analysis)
    analysis_columns = df_analysis.columns

    unique_questions = not_available
    if 'question_id' in analysis_columns:
        unique_questions = df_analysis['question_id'].nunique()

    unique_models = not_available
    if 'agent_model' in analysis_columns:
        unique_models = df_analysis['agent_model'].nunique()

    off_topic_rate = not_available
    if 'is_response_off_topic' in analysis_columns:
        off_topic_total = df_analysis['is_response_off_topic'].sum()
        off_topic_rate = round(off_topic_total / total_responses * 100, 2)

    avg_answer = not_available
    if 'extracted_answer_numeric' in analysis_columns:
        answers = df_analysis['extracted_answer_numeric']
        if answers.notna().any():
            avg_answer = round(answers.mean(), 2)

    most_common_category = not_available
    categories_used = not_available
    if not df_exploded.empty and 'selected_categories' in df_exploded.columns:
        categories = df_exploded['selected_categories']
        categories_used = categories.nunique()
        # mode() of an all-missing column is empty, so only take it when at
        # least one category is present.
        if not categories.isna().all():
            most_common_category = categories.mode().iloc[0]

    return {
        'Dataset': label,
        'Total Responses': total_responses,
        'Unique Questions': unique_questions,
        'Unique Models': unique_models,
        'Off-Topic Rate (%)': off_topic_rate,
        'Avg Answer Score': avg_answer,
        'Most Common Category': most_common_category,
        'Total Categories Used': categories_used,
    }

# Generate stats for each dataset
summary_data = []
for data_type, data_dict in prepared_datasets.items():
    display_name = data_type.replace('_', '-').title()
    stats = generate_summary_stats(data_dict['analysis'], data_dict['exploded'], display_name)
    if not stats:
        # No valid records for this dataset, so there is no row to add.
        continue
    summary_data.append(stats)

if summary_data:
    summary_df = pd.DataFrame(summary_data)
    print("\nSummary Statistics Table:")
    print(summary_df.to_string(index=False))

    # Display as a nice table using plotly
    table_header = dict(
        values=list(summary_df.columns),
        fill_color='paleturquoise',
        align='left',
    )
    table_cells = dict(
        values=[summary_df[col] for col in summary_df.columns],
        fill_color='lavender',
        align='left',
    )
    fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])
    fig.update_layout(title="Summary Statistics: Multi-Dataset Analysis")
    fig.show()

# --- Error Analysis ---
all_errors = []
for data_type, data_dict in prepared_datasets.items():
    errors_df = data_dict['errors']
    if errors_df.empty:
        continue
    # assign() returns a new frame, so the prepared data stays unmodified.
    errors_df = errors_df.assign(source_dataset=data_type.replace('_', '-').title())
    all_errors.append(errors_df)

if all_errors:
    combined_errors = pd.concat(all_errors, ignore_index=True)

    print("\n=== ERROR ANALYSIS ===")
    grouped_errors = combined_errors.groupby(['source_dataset', 'error_type'])
    error_by_type = grouped_errors.size().reset_index(name='count')

    if not error_by_type.empty:
        fig = px.bar(
            error_by_type,
            x='error_type',
            y='count',
            color='source_dataset',
            title='Processing Errors by Type and Dataset',
            labels={'error_type': 'Error Type', 'count': 'Count'},
            barmode='group',
        )
        fig.update_layout(xaxis_tickangle=-45)
        fig.show()

        print("Error summary by dataset:")
        error_pivot = error_by_type.pivot(
            index='error_type', columns='source_dataset', values='count'
        ).fillna(0)
        print(error_pivot)

print("\nNotebook execution complete!")

=== SUMMARY STATISTICS ===

Summary Statistics Table:
     Dataset  Total Responses  Unique Questions  Unique Models  Off-Topic Rate (%)  Avg Answer Score           Most Common Category  Total Categories Used
Single-Agent              900                90              1                0.44              3.43 HARM_AVOIDANCE_NON_MALEFICENCE                     15
 Multi-Agent            25680                90              1                0.04              4.39 HARM_AVOIDANCE_NON_MALEFICENCE                     22
    Combined            26580                90              2                0.05              4.35 HARM_AVOIDANCE_NON_MALEFICENCE                     22



=== ERROR ANALYSIS ===


Error summary by dataset:
source_dataset                       Combined  Multi-Agent
error_type                                                
ClassificationCallFailed_RetryError        10           10

Notebook execution complete!
