# Neuro-Symbolic Reasoning: Data Analysis for ICML Submission

This notebook provides comprehensive data analysis for neuro-symbolic reasoning experiments,
following the methodology established by LOGIC-LM (Pan et al., 2023) and adapted for ICML standards.

## Table of Contents
1. [Setup & Data Loading](#1-setup--data-loading)
2. [Main Results Table](#2-main-results-table)
3. [Execution Analysis (Exe_Rate & Exe_Acc)](#3-execution-analysis-exe_rate--exe_acc)
4. [Performance by Reasoning Depth](#4-performance-by-reasoning-depth)
5. [Self-Refinement Analysis](#5-self-refinement-analysis)
6. [Per-Rule/Axiom Breakdown](#6-per-ruleaxiom-breakdown)
7. [Error Analysis](#7-error-analysis)
8. [Time & Efficiency Analysis](#8-time--efficiency-analysis)
9. [Statistical Significance Tests](#9-statistical-significance-tests)
10. [Publication-Ready Figures](#10-publication-ready-figures)
11. [Export Results](#11-export-results)

---
## 1. Setup & Data Loading

In [None]:
# Core imports
import json
import glob
import os
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any

# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

# Statistical tests
from scipy import stats
from scipy.stats import ttest_rel, wilcoxon, bootstrap

# Set style for publication-quality figures
# The rcParams update below is global: it affects every figure created after
# this cell has run, including the default figure size and the DPI used by savefig.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
plt.rcParams.update({
    'font.size': 12,
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 11,
    'figure.figsize': (10, 6),
    'figure.dpi': 150,
    'savefig.dpi': 300,
    'savefig.bbox': 'tight'
})

# Suppress warnings for cleaner output
# NOTE(review): filterwarnings('ignore') silences *every* warning for the rest
# of the session, including any emitted by the statistical tests further down.
# Consider narrowing the filter before relying on those results.
import warnings
warnings.filterwarnings('ignore')

print("Setup complete!")

In [None]:
# Experiment configuration - adjust the paths below to match your setup
CONFIG = {
    # Directory containing result JSON files
    'results_dir': '../code/baseline_logiclm_plus/',
    # Where to save figures and tables
    'output_dir': './outputs/',
    # Name for this analysis run
    'experiment_name': 'neuro_symbolic_icml_2026',

    # Human-readable method labels, keyed by short method id
    'method_names': {
        'standard': 'Standard Prompting',
        'cot': 'Chain-of-Thought',
        'logiclm': 'LOGIC-LM',
        'ours': 'Our Method',
        'ours_no_refine': 'Ours (w/o Refinement)',
    },

    # Plot colour per method id (same keys as 'method_names')
    'colors': {
        'standard': '#1f77b4',
        'cot': '#ff7f0e',
        'logiclm': '#2ca02c',
        'ours': '#d62728',
        'ours_no_refine': '#9467bd',
    },

    # Dataset identifiers
    'datasets': ['propositional_logic', 'first_order_logic', 'nm_logic'],
}

# Make sure the output tree (root, figures, tables) exists before anything is saved
for _output_path in (
    CONFIG['output_dir'],
    f"{CONFIG['output_dir']}/figures",
    f"{CONFIG['output_dir']}/tables",
):
    os.makedirs(_output_path, exist_ok=True)

print(f"Output directory: {CONFIG['output_dir']}")

In [None]:
def load_experiment_results(json_path: str) -> Dict[str, Any]:
    """
    Load experiment results from a JSON file.

    The file is decoded as UTF-8 explicitly so that non-ASCII content
    (e.g. logical symbols inside formulations or error messages) loads the
    same way regardless of the platform's default encoding.

    Args:
        json_path: Path to the JSON results file

    Returns:
        The parsed JSON document. Downstream code reads the 'metadata',
        'metrics' and 'results' keys from it.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _example_to_record(example: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one per-example result dict into a single table row."""
    # `or {}` guards against the key being present with a null value, in which
    # case dict.get(key, {}) would return None and the chained .get() would fail.
    time_breakdown = example.get('time_breakdown') or {}
    final_formulation = example.get('final_formulation') or {}

    return {
        # Identifiers
        'example_id': example.get('example_id'),
        'rule_type': example.get('rule_type'),
        'axiom': example.get('axiom'),

        # Ground truth and predictions
        'ground_truth': example.get('ground_truth'),
        'predicted_answer': example.get('answer'),
        'converted_answer': example.get('converted_answer'),
        'is_correct': example.get('is_correct', False),

        # Execution metrics
        'formalization_success': example.get('formalization_success', False),
        'execution_success': example.get('execution_success', False),

        # Refinement metrics
        'num_refinement_iterations': example.get('num_refinement_iterations', 0),
        'num_backtracks': example.get('num_backtracks', 0),
        'total_llm_calls': example.get('total_llm_calls', 0),

        # Timing
        'total_time': example.get('total_time', 0),
        'formalization_time': time_breakdown.get('formalization', 0),
        'refinement_time': time_breakdown.get('refinement', 0),
        'solving_time': time_breakdown.get('solving', 0),

        # Error info
        'error': example.get('error'),
        'formalization_error': final_formulation.get('formalization_error'),
    }


def results_to_dataframe(results: Dict[str, Any]) -> pd.DataFrame:
    """
    Convert experiment results to a pandas DataFrame for analysis.

    Args:
        results: Dictionary from load_experiment_results()

    Returns:
        DataFrame with one row per example. The column set is fixed, so a
        results file with no examples still yields a frame with the expected
        (empty) columns. ``df.attrs['metadata']`` and
        ``df.attrs['aggregate_metrics']`` carry the file-level 'metadata' and
        'metrics' entries (empty dicts when absent).
    """
    records = [_example_to_record(example) for example in (results.get('results') or [])]

    # Derive the column list from the record builder itself so the two cannot drift.
    columns = list(_example_to_record({}))
    df = pd.DataFrame(records, columns=columns)

    # Add metadata as attributes
    df.attrs['metadata'] = results.get('metadata', {})
    df.attrs['aggregate_metrics'] = results.get('metrics', {})

    return df


def load_all_experiments(results_dir: str, pattern: str = '*.json') -> Dict[str, pd.DataFrame]:
    """
    Load all experiment result files from a directory.

    Files are processed in sorted path order so that the resulting dictionary
    (and every table/plot built by iterating it) has a deterministic order;
    glob itself returns matches in arbitrary filesystem order.

    Loading is best-effort: a file that fails to load or convert is reported
    on stdout and skipped, so one bad file does not abort the whole run.

    Args:
        results_dir: Directory containing JSON result files
        pattern: Glob pattern for matching files

    Returns:
        Dictionary mapping filenames (basename only) to DataFrames
    """
    experiments = {}
    json_files = sorted(glob.glob(os.path.join(results_dir, pattern)))

    for filepath in json_files:
        filename = os.path.basename(filepath)
        try:
            results = load_experiment_results(filepath)
            df = results_to_dataframe(results)
        except Exception as e:  # deliberate: report and continue with the remaining files
            print(f"Error loading {filename}: {e}")
        else:
            experiments[filename] = df
            print(f"Loaded: {filename} ({len(df)} examples)")

    return experiments

In [None]:
# Load experiments
# Option 1: Load all JSON files from directory
# experiments = load_all_experiments(CONFIG['results_dir'])

# Option 2: Load an explicit list of result files
experiment_files = [
    '../code/baseline_logiclm_plus/results_with_refinement.json',
    # Add more experiment files here as they become available
    # '../code/baseline_logiclm_plus/results_standard.json',
    # '../code/baseline_logiclm_plus/results_cot.json',
]

# Maps result-file basename -> per-example DataFrame
experiments = {}
for filepath in experiment_files:
    # Report missing files and move on instead of failing the whole cell
    if not os.path.exists(filepath):
        print(f"File not found: {filepath}")
        continue

    filename = os.path.basename(filepath)
    results = load_experiment_results(filepath)
    experiments[filename] = results_to_dataframe(results)
    print(f"Loaded: {filename}")

# Display summary
print(f"\nTotal experiments loaded: {len(experiments)}")

In [None]:
# Peek at the first loaded experiment to confirm the expected columns are present
if experiments:
    sample_df = next(iter(experiments.values()))
    print("Sample DataFrame info:")
    print(sample_df.info())
    print("\nFirst few rows:")
    display(sample_df.head())

---
## 2. Main Results Table

Following LOGIC-LM Table 2 format: Accuracy across datasets and methods

In [None]:
def compute_accuracy_metrics(df: pd.DataFrame) -> Dict[str, float]:
    """
    Compute accuracy metrics from a results DataFrame.

    Args:
        df: Per-example results; must contain an 'is_correct' column.
            'rule_type' and 'axiom' columns are optional.

    Returns:
        Dictionary with:
          - 'overall_accuracy' (percent), 'total_examples', 'correct_count'
          - 'accuracy_<rule_type>' (percent) and 'count_<rule_type>' for each
            non-null rule type
          - 'accuracy_<axiom>' (percent) for each non-null axiom

        Rows whose rule_type/axiom is missing are excluded from the
        per-category breakdown (they still count toward the overall numbers);
        previously they produced meaningless 'accuracy_None' = NaN entries.
    """
    metrics = {
        'overall_accuracy': df['is_correct'].mean() * 100,
        'total_examples': len(df),
        'correct_count': df['is_correct'].sum()
    }

    # Per rule_type accuracy
    if 'rule_type' in df.columns:
        for rule_type in df['rule_type'].dropna().unique():
            mask = df['rule_type'] == rule_type
            metrics[f'accuracy_{rule_type}'] = df.loc[mask, 'is_correct'].mean() * 100
            metrics[f'count_{rule_type}'] = mask.sum()

    # Per axiom accuracy
    # NOTE(review): axiom keys share the 'accuracy_' prefix with rule types, so an
    # axiom named like a rule type would overwrite that entry - confirm the two
    # namespaces are disjoint in the data.
    if 'axiom' in df.columns:
        for axiom in df['axiom'].dropna().unique():
            mask = df['axiom'] == axiom
            metrics[f'accuracy_{axiom}'] = df.loc[mask, 'is_correct'].mean() * 100

    return metrics


def create_main_results_table(experiments: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Build the headline results table (LOGIC-LM Table 2 style).

    Args:
        experiments: Mapping of experiment name to its per-example DataFrame

    Returns:
        DataFrame with one row per experiment: name, model and logic type
        (taken from the frame's attached metadata), example count, overall
        accuracy formatted to two decimals, and number of correct examples.
    """
    def _summarise(exp_name: str, frame: pd.DataFrame) -> Dict[str, Any]:
        # One table row for a single experiment
        acc = compute_accuracy_metrics(frame)
        meta = frame.attrs.get('metadata', {})
        return {
            'Experiment': exp_name,
            'Model': meta.get('model', 'Unknown'),
            'Logic Type': meta.get('logic_type', 'Mixed'),
            'N': acc['total_examples'],
            'Accuracy (%)': f"{acc['overall_accuracy']:.2f}",
            'Correct': acc['correct_count'],
        }

    table_rows = [_summarise(exp_name, frame) for exp_name, frame in experiments.items()]
    return pd.DataFrame(table_rows)

In [None]:
# Generate main results table
# (one row per entry in `experiments`; see create_main_results_table)
main_results = create_main_results_table(experiments)
print("Main Results Table:")
display(main_results)

# Save to CSV
# index=False keeps the pandas row index out of the exported file
main_results.to_csv(f"{CONFIG['output_dir']}/tables/main_results.csv", index=False)
print(f"\nSaved to {CONFIG['output_dir']}/tables/main_results.csv")

In [None]:
def create_comparison_table_template() -> pd.DataFrame:
    """
    Return an empty scaffold for the full comparison table (LOGIC-LM Table 2 style).

    The frame has a 'Dataset' column listing the benchmarks and one column per
    (method, model) pair, every cell initialised to None so values can be
    filled in as experiments are completed.
    """
    datasets = [
        'PrOntoQA', 'ProofWriter', 'FOLIO', 'LogicalDeduction',
        'AR-LSAT', 'LogicBench-Prop', 'LogicBench-FOL', 'LogicBench-NM',
    ]
    result_columns = [
        'Standard (GPT-3.5)',
        'CoT (GPT-3.5)',
        'Ours (GPT-3.5)',
        'Standard (GPT-4)',
        'CoT (GPT-4)',
        'Ours (GPT-4)',
    ]

    # Placeholder cells - one None per dataset in every result column
    template = {'Dataset': datasets}
    for column in result_columns:
        template[column] = [None] * len(datasets)

    return pd.DataFrame(template)

# Display template
# (all result cells are None placeholders until real numbers are filled in)
print("Comparison Table Template (fill as experiments complete):")
display(create_comparison_table_template())

---
## 3. Execution Analysis (Exe_Rate & Exe_Acc)

Following LOGIC-LM Table 3: Analyze symbolic formulation quality

In [None]:
def compute_execution_metrics(df: pd.DataFrame) -> Dict[str, float]:
    """
    Compute execution rate and execution accuracy metrics.

    Exe_Rate: % of examples whose 'execution_success' flag is True
    Exe_Acc: Accuracy only on those executable examples (semantic correctness)

    Note that only 'execution_success' is consulted; 'formalization_success'
    is not part of the mask.

    Args:
        df: Per-example results with 'execution_success' and 'is_correct' columns.

    Returns:
        Dictionary with 'exe_rate' and 'exe_acc' (percent), 'total_examples',
        'executable_count' and 'correct_among_executable'. An empty frame
        yields all zeros, and 'exe_acc' is 0.0 when nothing was executable.
    """
    total = len(df)

    # Nothing to measure: report zeros instead of NaN / a missing-column error
    if total == 0:
        return {
            'exe_rate': 0.0,
            'exe_acc': 0.0,
            'total_examples': 0,
            'executable_count': 0,
            'correct_among_executable': 0
        }

    # Explicit comparison so that missing/None flags count as "not executable"
    executable_mask = df['execution_success'] == True
    executable_count = int(executable_mask.sum())
    exe_rate = executable_mask.mean() * 100

    # Execution accuracy: accuracy among executable examples
    if executable_count > 0:
        executable_correct = df.loc[executable_mask, 'is_correct']
        exe_acc = executable_correct.mean() * 100
        correct_among_executable = int(executable_correct.sum())
    else:
        exe_acc = 0.0
        correct_among_executable = 0

    return {
        'exe_rate': exe_rate,
        'exe_acc': exe_acc,
        'total_examples': total,
        'executable_count': executable_count,
        'correct_among_executable': correct_among_executable
    }


def create_execution_analysis_table(experiments: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Build the execution analysis table (LOGIC-LM Table 3 style).

    Args:
        experiments: Mapping of experiment name to its per-example DataFrame

    Returns:
        DataFrame with one row per experiment: name, model (from the frame's
        attached metadata), Exe_Rate and Exe_Acc formatted to one decimal,
        and an "executable/total" count string.
    """
    def _summarise(exp_name: str, frame: pd.DataFrame) -> Dict[str, Any]:
        # One table row for a single experiment
        exe = compute_execution_metrics(frame)
        meta = frame.attrs.get('metadata', {})
        return {
            'Experiment': exp_name,
            'Model': meta.get('model', 'Unknown'),
            'Exe_Rate (%)': f"{exe['exe_rate']:.1f}",
            'Exe_Acc (%)': f"{exe['exe_acc']:.1f}",
            'Executable': f"{exe['executable_count']}/{exe['total_examples']}",
        }

    table_rows = [_summarise(exp_name, frame) for exp_name, frame in experiments.items()]
    return pd.DataFrame(table_rows)

In [None]:
# Generate execution analysis table
# (Exe_Rate / Exe_Acc per experiment; see create_execution_analysis_table)
exe_table = create_execution_analysis_table(experiments)
print("Execution Analysis (Table 3 style):")
display(exe_table)

# Save
# index=False keeps the pandas row index out of the exported file
exe_table.to_csv(f"{CONFIG['output_dir']}/tables/execution_analysis.csv", index=False)

In [None]:
def plot_execution_metrics(experiments: Dict[str, pd.DataFrame], save_path: Optional[str] = None):
    """
    Visualize execution rate vs execution accuracy.

    Draws two panels: a grouped bar chart of Exe_Rate and Exe_Acc per
    experiment, and a scatter plot of Exe_Rate against Exe_Acc with a dashed
    y=x reference line.

    Args:
        experiments: Mapping of experiment name to its per-example DataFrame
        save_path: If given, the figure is written there before being shown
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    exp_names = []
    exe_rates = []
    exe_accs = []

    for exp_name, df in experiments.items():
        metrics = compute_execution_metrics(df)
        exp_names.append(exp_name.replace('.json', ''))
        exe_rates.append(metrics['exe_rate'])
        exe_accs.append(metrics['exe_acc'])

    x = np.arange(len(exp_names))
    width = 0.35

    # Bar chart
    ax1 = axes[0]
    bars1 = ax1.bar(x - width/2, exe_rates, width, label='Exe_Rate', color='#2ecc71')
    bars2 = ax1.bar(x + width/2, exe_accs, width, label='Exe_Acc', color='#3498db')

    ax1.set_xlabel('Experiment')
    ax1.set_ylabel('Percentage (%)')
    ax1.set_title('Execution Rate vs Execution Accuracy')
    ax1.set_xticks(x)
    ax1.set_xticklabels(exp_names, rotation=45, ha='right')
    ax1.legend()
    ax1.set_ylim(0, 105)

    # Add value labels above every bar of both series
    for bar in (*bars1, *bars2):
        height = bar.get_height()
        ax1.annotate(f'{height:.1f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3), textcoords="offset points",
                    ha='center', va='bottom', fontsize=9)

    # Scatter plot: Exe_Rate vs Exe_Acc relationship
    ax2 = axes[1]
    ax2.scatter(exe_rates, exe_accs, s=100, alpha=0.7, c='#e74c3c')

    for name, rate, acc in zip(exp_names, exe_rates, exe_accs):
        ax2.annotate(name, (rate, acc),
                    xytext=(5, 5), textcoords='offset points', fontsize=9)

    ax2.set_xlabel('Execution Rate (%)')
    ax2.set_ylabel('Execution Accuracy (%)')
    ax2.set_title('Exe_Rate vs Exe_Acc Relationship')
    ax2.set_xlim(0, 105)
    ax2.set_ylim(0, 105)

    # Add diagonal reference line; the legend call makes its 'y=x' label visible
    ax2.plot([0, 100], [0, 100], 'k--', alpha=0.3, label='y=x')
    ax2.legend(loc='lower right')

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path)
        print(f"Saved to {save_path}")

    plt.show()

# Generate plot
# (also written to the figures folder via the save_path argument)
plot_execution_metrics(experiments, f"{CONFIG['output_dir']}/figures/execution_metrics.png")

---
## 4. Performance by Reasoning Depth

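Following LOGIC-LM Figure 3: Show robustness as complexity increases. The loaded DataFrame has no explicit reasoning-depth column, so the cells below group by `axiom` (or by refinement iterations) as a stand-in; pass a real depth column via `depth_column` once one is available.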

In [None]:
def analyze_by_reasoning_depth(df: pd.DataFrame, depth_column: str = 'num_refinement_iterations') -> pd.DataFrame:
    """
    Summarise correctness per value of a depth/complexity column.

    Point ``depth_column`` at whichever column encodes difficulty in your
    data (reasoning_depth, num_hops, problem_complexity, ...).

    Args:
        df: Per-example results with an 'is_correct' column
        depth_column: Column to group by

    Returns:
        One row per distinct value of ``depth_column`` with 'accuracy'
        (percent), 'count' and 'correct'. An empty DataFrame is returned,
        after printing the available columns, when the column is missing.
    """
    available = df.columns
    if depth_column not in available:
        print(f"Column '{depth_column}' not found. Available columns: {available.tolist()}")
        return pd.DataFrame()

    aggregations = {
        'accuracy': ('is_correct', 'mean'),
        'count': ('is_correct', 'count'),
        'correct': ('is_correct', 'sum'),
    }
    summary = df.groupby(depth_column).agg(**aggregations).reset_index()

    # Express the mean correctness as a percentage
    summary['accuracy'] = summary['accuracy'] * 100

    return summary


def plot_accuracy_by_depth(experiments: Dict[str, pd.DataFrame], 
                           depth_column: str = 'axiom',
                           save_path: Optional[str] = None):
    """
    Plot accuracy curves by reasoning depth (Figure 3 style).

    All experiments share one x-axis built from the union of their
    ``depth_column`` values, so a point at a given tick always refers to the
    same category. Experiments lacking a category simply have no point there.
    (Previously each experiment was plotted at positions 0..k-1 of its own
    categories and the tick labels came from whichever experiment was drawn
    last, misaligning curves whenever the category sets differed.)

    Args:
        experiments: Mapping of experiment name to its per-example DataFrame
        depth_column: Column to group by; experiments without it are skipped
        save_path: If given, the figure is written there before being shown
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    colors = plt.cm.tab10(np.linspace(0, 1, len(experiments)))

    # Per-experiment accuracy (%) indexed by category, plus the shared category axis
    curves = []
    categories = pd.Index([])
    for (exp_name, df), color in zip(experiments.items(), colors):
        if depth_column not in df.columns:
            continue
        depth_data = df.groupby(depth_column)['is_correct'].mean() * 100
        curves.append((exp_name.replace('.json', ''), depth_data, color))
        categories = categories.union(depth_data.index)

    positions = np.arange(len(categories))
    for label, depth_data, color in curves:
        # Reindex onto the shared axis; missing categories become NaN (no marker)
        aligned = depth_data.reindex(categories)
        ax.plot(positions, aligned.values,
               marker='o', linewidth=2, markersize=8,
               label=label, color=color)

    # Set x-tick labels once, from the shared category axis
    ax.set_xticks(positions)
    ax.set_xticklabels(categories, rotation=45, ha='right')

    ax.set_xlabel(depth_column.replace('_', ' ').title())
    ax.set_ylabel('Accuracy (%)')
    ax.set_title(f'Accuracy by {depth_column.replace("_", " ").title()}')
    if curves:
        ax.legend(loc='best')
    ax.set_ylim(0, 105)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path)
        print(f"Saved to {save_path}")

    plt.show()

In [None]:
# Analyze by axiom/rule type
# For every loaded experiment, show accuracy (%) and example count per group.
for exp_name, df in experiments.items():
    print(f"\n{exp_name}:")
    
    # By axiom
    if 'axiom' in df.columns:
        axiom_analysis = df.groupby('axiom').agg(
            accuracy=('is_correct', lambda x: x.mean() * 100),  # mean correctness as a percentage
            count=('is_correct', 'count')
        ).round(2)
        print("\nAccuracy by Axiom:")
        display(axiom_analysis)
    
    # By rule_type
    if 'rule_type' in df.columns:
        rule_analysis = df.groupby('rule_type').agg(
            accuracy=('is_correct', lambda x: x.mean() * 100),  # mean correctness as a percentage
            count=('is_correct', 'count')
        ).round(2)
        print("\nAccuracy by Rule Type:")
        display(rule_analysis)

In [None]:
# Plot accuracy by axiom
# ('axiom' is passed as the grouping column of plot_accuracy_by_depth)
plot_accuracy_by_depth(experiments, 'axiom', f"{CONFIG['output_dir']}/figures/accuracy_by_axiom.png")

---
## 5. Self-Refinement Analysis

Following LOGIC-LM Figure 4: Impact of refinement rounds

In [None]:
def analyze_refinement_impact(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise performance per number of self-refinement iterations.

    Args:
        df: Per-example results

    Returns:
        One row per distinct 'num_refinement_iterations' value with accuracy
        and execution rate (both in percent), example count, mean total time
        and mean number of LLM calls. An empty DataFrame is returned (after a
        printed notice) when the iterations column is absent.
    """
    group_key = 'num_refinement_iterations'
    if group_key not in df.columns:
        print("No refinement data available")
        return pd.DataFrame()

    def _percent(flags):
        # Share of truthy flags within the group, as a percentage
        return flags.mean() * 100

    per_round = df.groupby(group_key).agg(
        accuracy=('is_correct', _percent),
        exe_rate=('execution_success', _percent),
        count=('is_correct', 'count'),
        avg_time=('total_time', 'mean'),
        avg_llm_calls=('total_llm_calls', 'mean'),
    )

    return per_round.reset_index()


def plot_refinement_analysis(experiments: Dict[str, pd.DataFrame], save_path: Optional[str] = None):
    """
    Plot the effect of refinement rounds (LOGIC-LM Figure 4 style).

    Left panel: accuracy per refinement-iteration count.
    Right panel: execution rate per refinement-iteration count.
    One line per experiment; experiments without refinement data are skipped.

    Args:
        experiments: Mapping of experiment name to its per-example DataFrame
        save_path: If given, the figure is written there before being shown
    """
    fig, (acc_ax, exe_ax) = plt.subplots(1, 2, figsize=(14, 5))

    for exp_name, df in experiments.items():
        summary = analyze_refinement_impact(df)
        if summary.empty:
            continue

        series_label = exp_name.replace('.json', '')
        rounds = summary['num_refinement_iterations']

        # Accuracy by refinement rounds
        acc_ax.plot(rounds, summary['accuracy'],
                    marker='o', linewidth=2, markersize=8, label=series_label)
        # Execution rate by refinement rounds
        exe_ax.plot(rounds, summary['exe_rate'],
                    marker='s', linewidth=2, markersize=8, label=series_label)

    # Both panels share the same decoration apart from y-label and title
    panel_text = (
        (acc_ax, 'Accuracy (%)', 'Accuracy vs Refinement Rounds'),
        (exe_ax, 'Execution Rate (%)', 'Exe_Rate vs Refinement Rounds'),
    )
    for panel, y_label, title in panel_text:
        panel.set_xlabel('Refinement Iterations')
        panel.set_ylabel(y_label)
        panel.set_title(title)
        panel.legend()
        panel.grid(True, alpha=0.3)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path)
        print(f"Saved to {save_path}")

    plt.show()

In [None]:
# Refinement analysis
# Show the per-iteration summary table for each experiment that has refinement data.
for exp_name, df in experiments.items():
    print(f"\n{exp_name} - Refinement Analysis:")
    ref_data = analyze_refinement_impact(df)
    if not ref_data.empty:
        display(ref_data)

# Plot
# (also written to the figures folder via the save_path argument)
plot_refinement_analysis(experiments, f"{CONFIG['output_dir']}/figures/refinement_analysis.png")

---
## 6. Per-Rule/Axiom Breakdown

Detailed performance analysis by logical rule type

In [None]:
def create_per_axiom_heatmap(experiments: Dict[str, pd.DataFrame], save_path: Optional[str] = None):
    """
    Draw a heatmap of per-axiom accuracy (rows) across experiments (columns).

    Args:
        experiments: Mapping of experiment name to its per-example DataFrame
        save_path: If given, the figure is written there before being shown

    Returns:
        The axiom x experiment accuracy table (percent) that was plotted, or
        None when no experiment has an 'axiom' column.
    """
    # Accuracy (%) per axiom, keyed by experiment name without the .json suffix
    axiom_data = {
        exp_name.replace('.json', ''): df.groupby('axiom')['is_correct'].mean() * 100
        for exp_name, df in experiments.items()
        if 'axiom' in df.columns
    }

    if len(axiom_data) == 0:
        print("No axiom data available")
        return

    # Series are aligned on axiom name when combined into one table
    heatmap_df = pd.DataFrame(axiom_data)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(
        heatmap_df,
        annot=True,
        fmt='.1f',
        cmap='RdYlGn',
        vmin=0,
        vmax=100,
        ax=ax,
        cbar_kws={'label': 'Accuracy (%)'},
    )

    ax.set_title('Accuracy by Axiom Across Experiments')
    ax.set_xlabel('Experiment')
    ax.set_ylabel('Axiom')

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path)
        print(f"Saved to {save_path}")

    plt.show()

    return heatmap_df

In [None]:
# Create heatmap
# (returns None when no experiment has an 'axiom' column, hence the check below)
heatmap_df = create_per_axiom_heatmap(experiments, f"{CONFIG['output_dir']}/figures/axiom_heatmap.png")

if heatmap_df is not None:
    print("\nAxiom Accuracy Table:")
    display(heatmap_df.round(2))
    # The index (axiom names) is kept in the CSV since it identifies each row
    heatmap_df.to_csv(f"{CONFIG['output_dir']}/tables/axiom_accuracy.csv")

---
## 7. Error Analysis

Categorize and analyze failure modes

In [None]:
def categorize_errors(df: pd.DataFrame) -> pd.DataFrame:
    """
    Categorize errors into types for analysis.

    Categories, checked in this order (first match wins):
    1. Correct - 'is_correct' is truthy
    2. Formalization Error - couldn't generate valid logical form
    3. Execution Error - valid form but solver failed
    4. Semantic Error - executed but wrong answer

    Args:
        df: Per-example results with 'is_correct', 'formalization_success'
            and 'execution_success' columns.

    Returns:
        A copy of ``df`` with an added 'error_category' column; the input
        frame is not modified. An empty input yields an empty frame that
        still carries the new column.
    """
    df = df.copy()

    # Row-wise apply() returns a DataFrame (not a Series) for an empty frame,
    # which cannot be assigned to a single column - handle that case directly.
    if df.empty:
        df['error_category'] = pd.Series(dtype=object)
        return df

    # astype(bool) mirrors Python truthiness (None/0/'' -> False)
    correct = df['is_correct'].astype(bool).to_numpy()
    formalized = df['formalization_success'].astype(bool).to_numpy()
    executed = df['execution_success'].astype(bool).to_numpy()

    # np.select picks the first condition that holds, matching the if/elif order
    df['error_category'] = np.select(
        [correct, ~formalized, ~executed],
        ['Correct', 'Formalization Error', 'Execution Error'],
        default='Semantic Error',
    )

    return df


def plot_error_distribution(experiments: Dict[str, pd.DataFrame], save_path: Optional[str] = None):
    """
    Plot error distribution across experiments.

    Draws one pie chart per experiment showing the share of each error
    category (see categorize_errors). Non-'Correct' slices are slightly
    exploded for emphasis.

    Args:
        experiments: Mapping of experiment name to its per-example DataFrame
        save_path: If given, the figure is written there before being shown

    Returns:
        None. With no experiments a notice is printed and nothing is drawn
        (a figure with zero columns cannot be created).
    """
    if not experiments:
        print("No experiments to plot")
        return

    # squeeze=False always returns a 2-D axes array, so one experiment needs
    # no special-casing.
    fig, axes_grid = plt.subplots(1, len(experiments),
                                  figsize=(6*len(experiments), 5), squeeze=False)
    axes = axes_grid[0]

    colors = {'Correct': '#2ecc71', 'Formalization Error': '#e74c3c', 
              'Execution Error': '#f39c12', 'Semantic Error': '#9b59b6'}

    for ax, (exp_name, df) in zip(axes, experiments.items()):
        df_cat = categorize_errors(df)
        error_counts = df_cat['error_category'].value_counts()

        ax.pie(
            error_counts.values, 
            labels=error_counts.index,
            autopct='%1.1f%%',
            # Grey fallback for any category without an assigned colour
            colors=[colors.get(cat, '#95a5a6') for cat in error_counts.index],
            explode=[0.05 if cat != 'Correct' else 0 for cat in error_counts.index]
        )

        ax.set_title(exp_name.replace('.json', ''))

    plt.suptitle('Error Distribution by Category', fontsize=14, y=1.02)
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path)
        print(f"Saved to {save_path}")

    plt.show()

In [None]:
# Error analysis
# For each experiment, tabulate how many examples fall into each error category.
for exp_name, df in experiments.items():
    df_cat = categorize_errors(df)
    
    print(f"\n{exp_name} - Error Distribution:")
    error_dist = df_cat['error_category'].value_counts()
    # normalize=True turns the counts into fractions; scaled to percent here
    error_pct = df_cat['error_category'].value_counts(normalize=True) * 100
    
    # Both Series are indexed by category name, so they align row by row
    error_summary = pd.DataFrame({
        'Count': error_dist,
        'Percentage': error_pct.round(2)
    })
    display(error_summary)

# Plot
# (also written to the figures folder via the save_path argument)
plot_error_distribution(experiments, f"{CONFIG['output_dir']}/figures/error_distribution.png")

In [None]:
def show_error_examples(df: pd.DataFrame, n_examples: int = 3):
    """
    Show example errors for qualitative analysis.

    For each failure category that occurs, prints a header with the total
    count followed by up to ``n_examples`` examples (id, axiom, ground truth,
    converted prediction and, when present, the error message truncated to
    200 characters).

    Args:
        df: Per-example results
        n_examples: Maximum number of examples printed per category
    """
    df_cat = categorize_errors(df)

    for category in ['Formalization Error', 'Execution Error', 'Semantic Error']:
        errors = df_cat[df_cat['error_category'] == category]

        if len(errors) == 0:
            continue

        print(f"\n{'='*60}")
        print(f"Category: {category} ({len(errors)} total)")
        print('='*60)

        for _, row in errors.head(n_examples).iterrows():
            print(f"\nExample ID: {row['example_id']}")
            print(f"Axiom: {row['axiom']}")
            print(f"Ground Truth: {row['ground_truth']}")
            print(f"Predicted: {row['converted_answer']}")

            error_value = row['error']
            # A missing error can surface as NaN, which is truthy - treat it as absent
            is_nan = isinstance(error_value, float) and error_value != error_value
            if error_value and not is_nan:
                # Stringify once so that both the length check and the slice
                # work for non-string error payloads as well
                error_text = str(error_value)
                if len(error_text) > 200:
                    print(f"Error: {error_text[:200]}...")
                else:
                    print(f"Error: {error_text}")
            print('-'*40)

# Show error examples
# (uses only the first loaded experiment, two examples per error category)
if experiments:
    sample_df = list(experiments.values())[0]
    show_error_examples(sample_df, n_examples=2)

---
## 8. Time & Efficiency Analysis

In [None]:
def analyze_efficiency(df: pd.DataFrame) -> Dict[str, float]:
    """
    Summarise timing and LLM-call cost over all examples in ``df``.

    Args:
        df: Per-example results with the timing and 'total_llm_calls' columns

    Returns:
        Dictionary with the mean and standard deviation of total time, the
        mean of each timing component (formalization, refinement, solving),
        and the mean and sum of LLM calls.
    """
    total_time = df['total_time']
    llm_calls = df['total_llm_calls']

    summary = {}
    summary['avg_total_time'] = total_time.mean()
    summary['std_total_time'] = total_time.std()

    # Mean time per pipeline stage
    for stage in ('formalization', 'refinement', 'solving'):
        summary[f'avg_{stage}_time'] = df[f'{stage}_time'].mean()

    summary['avg_llm_calls'] = llm_calls.mean()
    summary['total_llm_calls'] = llm_calls.sum()
    return summary


def plot_time_breakdown(experiments: Dict[str, pd.DataFrame], save_path: Optional[str] = None):
    """
    Plot where the processing time goes.

    Left panel: stacked bars of mean formalization, refinement and solving
    time per experiment. Right panel: overlaid histograms of per-example
    total time.

    Args:
        experiments: Mapping of experiment name to its per-example DataFrame
        save_path: If given, the figure is written there before being shown
    """
    fig, (breakdown_ax, hist_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Mean time per component, one entry per experiment
    labels = [name.replace('.json', '') for name in experiments]
    frames = list(experiments.values())
    formalization = np.array([frame['formalization_time'].mean() for frame in frames])
    refinement = np.array([frame['refinement_time'].mean() for frame in frames])
    solving = np.array([frame['solving_time'].mean() for frame in frames])

    positions = np.arange(len(labels))
    bar_width = 0.5

    # Each layer starts where the previous ones end
    breakdown_ax.bar(positions, formalization, bar_width,
                     label='Formalization', color='#3498db')
    breakdown_ax.bar(positions, refinement, bar_width, bottom=formalization,
                     label='Refinement', color='#e74c3c')
    breakdown_ax.bar(positions, solving, bar_width, bottom=formalization + refinement,
                     label='Solving', color='#2ecc71')

    breakdown_ax.set_xlabel('Experiment')
    breakdown_ax.set_ylabel('Average Time (seconds)')
    breakdown_ax.set_title('Time Breakdown by Component')
    breakdown_ax.set_xticks(positions)
    breakdown_ax.set_xticklabels(labels, rotation=45, ha='right')
    breakdown_ax.legend()

    # Distribution of total time
    for label, frame in zip(labels, frames):
        hist_ax.hist(frame['total_time'], bins=20, alpha=0.5,
                     label=label, edgecolor='black')

    hist_ax.set_xlabel('Total Time (seconds)')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Distribution of Processing Time')
    hist_ax.legend()

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path)
        print(f"Saved to {save_path}")

    plt.show()

In [None]:
# Efficiency analysis
# Collect one metrics dict per experiment, tagged with the experiment name.
efficiency_table = []

for exp_name, df in experiments.items():
    metrics = analyze_efficiency(df)
    metrics['Experiment'] = exp_name
    efficiency_table.append(metrics)

efficiency_df = pd.DataFrame(efficiency_table)
print("Efficiency Metrics:")
display(efficiency_df.round(3))

# Save
# (the unrounded values are written; rounding above is for display only)
efficiency_df.to_csv(f"{CONFIG['output_dir']}/tables/efficiency_metrics.csv", index=False)

# Plot
plot_time_breakdown(experiments, f"{CONFIG['output_dir']}/figures/time_breakdown.png")

---
## 9. Statistical Significance Tests

In [None]:
def compute_confidence_interval(data: np.ndarray, confidence: float = 0.95,
                                n_bootstrap: int = 10000,
                                seed: Optional[int] = None) -> Tuple[float, float]:
    """
    Compute a percentile-bootstrap confidence interval for accuracy.

    The mean of ``data`` is re-estimated on ``n_bootstrap`` resamples drawn
    with replacement, and the interval is read off the percentiles of those
    means. Resampling is vectorised in batches rather than one Python-level
    iteration per resample.

    Args:
        data: 1-D array of per-example outcomes (e.g. booleans or 0/1)
        confidence: Coverage of the interval, e.g. 0.95 for a 95% CI
        n_bootstrap: Number of bootstrap resamples
        seed: Seed for the random generator; pass a fixed value to make the
            reported interval reproducible. None draws fresh entropy.

    Returns:
        (lower, upper) bounds scaled by 100, i.e. in percent when ``data``
        holds 0/1 outcomes. Both are NaN when ``data`` is empty.
    """
    values = np.asarray(data, dtype=float)
    n = values.size

    # No observations: the interval is undefined rather than an error
    if n == 0:
        return float('nan'), float('nan')

    rng = np.random.default_rng(seed)

    # Cap each batch at roughly two million drawn indices to bound peak memory
    rows_per_batch = max(1, min(n_bootstrap, 2_000_000 // n))
    bootstrap_means = np.empty(n_bootstrap)
    for start in range(0, n_bootstrap, rows_per_batch):
        stop = min(start + rows_per_batch, n_bootstrap)
        # Each row of `indices` is one resample of size n, drawn with replacement
        indices = rng.integers(0, n, size=(stop - start, n))
        bootstrap_means[start:stop] = values[indices].mean(axis=1)

    lower, upper = np.percentile(
        bootstrap_means,
        [(1 - confidence) / 2 * 100, (1 + confidence) / 2 * 100],
    )

    return float(lower * 100), float(upper * 100)


def paired_significance_test(df1: pd.DataFrame, df2: pd.DataFrame, 
                             metric: str = 'is_correct') -> Dict[str, float]:
    """
    Compare two experiments on the examples they have in common.

    Rows are paired on 'example_id'; the metric values of each pair are then
    fed to a paired t-test and a Wilcoxon signed-rank test.

    Args:
        df1: Results of the first experiment
        df2: Results of the second experiment
        metric: Column to compare (cast to float)

    Returns:
        ``{'error': 'No matching examples'}`` when no example_id is shared.
        Otherwise a dictionary with the number of pairs, the mean difference
        (first minus second, times 100), both test statistics and p-values
        (NaN for Wilcoxon if it raises ValueError), and significance flags at
        0.05 and 0.01 based on the t-test p-value.
    """
    # Inner join keeps only example_ids present in both experiments
    left = df1[['example_id', metric]]
    right = df2[['example_id', metric]]
    paired = left.merge(right, on='example_id', suffixes=('_1', '_2'))

    n_paired = len(paired)
    if n_paired == 0:
        return {'error': 'No matching examples'}

    first = paired[f'{metric}_1'].astype(float).values
    second = paired[f'{metric}_2'].astype(float).values

    # Parametric comparison
    t_stat, t_pval = ttest_rel(first, second)

    # Non-parametric comparison; fall back to NaN when the test rejects the input
    try:
        w_stat, w_pval = wilcoxon(first, second)
    except ValueError:
        w_stat = np.nan
        w_pval = np.nan

    outcome = {}
    outcome['n_paired'] = n_paired
    outcome['mean_diff'] = (first - second).mean() * 100
    outcome['t_statistic'] = t_stat
    outcome['t_pvalue'] = t_pval
    outcome['wilcoxon_statistic'] = w_stat
    outcome['wilcoxon_pvalue'] = w_pval
    outcome['significant_005'] = t_pval < 0.05
    outcome['significant_001'] = t_pval < 0.01
    return outcome

In [None]:
# Confidence intervals
# Bootstrap CI of the overall accuracy for every loaded experiment.
# NOTE(review): the bootstrap draws random resamples, so the reported bounds can
# differ slightly between runs unless the random state is fixed - confirm
# whether reproducible numbers are required for the paper.
print("95% Confidence Intervals for Accuracy:\n")

ci_table = []
for exp_name, df in experiments.items():
    accuracy = df['is_correct'].mean() * 100
    ci_lower, ci_upper = compute_confidence_interval(df['is_correct'].values)
    
    # Values are pre-formatted as strings with two decimals for display/export
    ci_table.append({
        'Experiment': exp_name,
        'Accuracy (%)': f"{accuracy:.2f}",
        '95% CI Lower': f"{ci_lower:.2f}",
        '95% CI Upper': f"{ci_upper:.2f}",
        'CI Width': f"{ci_upper - ci_lower:.2f}"
    })

ci_df = pd.DataFrame(ci_table)
display(ci_df)
ci_df.to_csv(f"{CONFIG['output_dir']}/tables/confidence_intervals.csv", index=False)

In [None]:
# Paired significance tests (if multiple experiments)
# Every unordered pair of experiments is compared once.
if len(experiments) >= 2:
    exp_list = list(experiments.items())
    
    print("\nPaired Significance Tests:\n")
    
    for i in range(len(exp_list)):
        for j in range(i+1, len(exp_list)):
            name1, df1 = exp_list[i]
            name2, df2 = exp_list[j]
            
            results = paired_significance_test(df1, df2)
            
            print(f"{name1} vs {name2}:")
            
            # The test returns only an 'error' entry when the two experiments
            # share no example_id; the numeric format specs below would raise
            # on a missing value, so report the reason and move on.
            if 'error' in results:
                print(f"  Skipped: {results['error']}")
                print()
                continue
            
            print(f"  N pairs: {results['n_paired']}")
            print(f"  Mean difference: {results['mean_diff']:.2f}%")
            print(f"  t-test p-value: {results['t_pvalue']:.4f}")
            print(f"  Wilcoxon p-value: {results['wilcoxon_pvalue']:.4f}")
            print(f"  Significant (p<0.05): {results['significant_005']}")
            print()

---
## 10. Publication-Ready Figures

In [None]:
def create_main_comparison_figure(experiments: Dict[str, pd.DataFrame], 
                                   save_path: Optional[str] = None):
    """
    Create the main 2x2 comparison figure for the paper.

    Panels:
        (a) overall accuracy per experiment,
        (b) Exe_Rate / Exe_Acc per experiment,
        (c) accuracy by axiom for the FIRST experiment only,
        (d) error-category distribution for the FIRST experiment only.

    Args:
        experiments: Mapping of experiment name to its results DataFrame.
            Each frame must have an 'is_correct' column; an 'axiom' column
            is optional and only used for panel (c).
        save_path: If given, the figure is written to this path (parent
            directories are created when missing).

    Returns:
        None. The figure is shown, and saved when `save_path` is set.
        With no experiments, a notice is printed and nothing is drawn.
    """
    if not experiments:
        # Panels (c)/(d) need at least one experiment; bail out instead of
        # failing on the first-element lookup below.
        print("No experiments to plot; skipping main comparison figure.")
        return

    frames = list(experiments.values())
    exp_names = [name.replace('.json', '').replace('_', ' ') for name in experiments.keys()]
    # Numeric positions keep one bar per experiment even if two cleaned
    # names collide, and give set_xticklabels fixed tick locations.
    x = np.arange(len(frames))

    fig = plt.figure(figsize=(16, 10))
    gs = fig.add_gridspec(2, 2, hspace=0.3, wspace=0.25)

    # 1. Main accuracy bar chart
    ax1 = fig.add_subplot(gs[0, 0])

    accuracies = [df['is_correct'].mean() * 100 for df in frames]
    colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(frames)))
    bars = ax1.bar(x, accuracies, color=colors, edgecolor='black', linewidth=1.2)

    ax1.set_ylabel('Accuracy (%)', fontweight='bold')
    ax1.set_title('(a) Overall Accuracy', fontweight='bold')
    ax1.set_ylim(0, 100)
    ax1.set_xticks(x)
    ax1.set_xticklabels(exp_names, rotation=45, ha='right')

    for bar, acc in zip(bars, accuracies):
        ax1.annotate(f'{acc:.1f}%', 
                    xy=(bar.get_x() + bar.get_width()/2, bar.get_height()),
                    xytext=(0, 5), textcoords='offset points',
                    ha='center', fontsize=11, fontweight='bold')

    # 2. Execution metrics (computed once per experiment)
    ax2 = fig.add_subplot(gs[0, 1])

    width = 0.35
    exe_metrics = [compute_execution_metrics(df) for df in frames]
    exe_rates = [m['exe_rate'] for m in exe_metrics]
    exe_accs = [m['exe_acc'] for m in exe_metrics]

    ax2.bar(x - width/2, exe_rates, width, label='Exe_Rate', color='#27ae60', edgecolor='black')
    ax2.bar(x + width/2, exe_accs, width, label='Exe_Acc', color='#2980b9', edgecolor='black')

    ax2.set_ylabel('Percentage (%)', fontweight='bold')
    ax2.set_title('(b) Execution Analysis', fontweight='bold')
    ax2.set_xticks(x)
    ax2.set_xticklabels(exp_names, rotation=45, ha='right')
    ax2.legend(loc='lower right')
    ax2.set_ylim(0, 105)

    # 3. Per-axiom performance (first experiment only)
    ax3 = fig.add_subplot(gs[1, 0])
    ax3.set_title('(c) Accuracy by Axiom', fontweight='bold')

    sample_df = frames[0]
    if 'axiom' in sample_df.columns:
        axiom_acc = sample_df.groupby('axiom')['is_correct'].mean() * 100
        axiom_acc = axiom_acc.sort_values(ascending=True)

        # Red-to-green colormap keyed on accuracy in [0, 1]
        colors_axiom = plt.cm.RdYlGn(axiom_acc.values / 100)
        ax3.barh(axiom_acc.index, axiom_acc.values, color=colors_axiom, edgecolor='black')

        ax3.set_xlabel('Accuracy (%)', fontweight='bold')
        ax3.set_xlim(0, 100)
    else:
        # Make the missing breakdown explicit rather than leaving a blank panel
        ax3.text(0.5, 0.5, 'No axiom data available',
                 ha='center', va='center', transform=ax3.transAxes)
        ax3.set_xticks([])
        ax3.set_yticks([])

    # 4. Error distribution (first experiment only)
    ax4 = fig.add_subplot(gs[1, 1])

    df_cat = categorize_errors(sample_df)
    error_counts = df_cat['error_category'].value_counts()

    colors_pie = {'Correct': '#27ae60', 'Formalization Error': '#e74c3c', 
                  'Execution Error': '#f39c12', 'Semantic Error': '#9b59b6'}

    ax4.pie(
        error_counts.values,
        labels=error_counts.index,
        autopct='%1.1f%%',
        # Categories without a dedicated colour fall back to grey
        colors=[colors_pie.get(cat, '#95a5a6') for cat in error_counts.index],
        explode=[0.03] * len(error_counts),
        shadow=True
    )
    ax4.set_title('(d) Error Distribution', fontweight='bold')

    plt.tight_layout()

    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Saved to {save_path}")

    plt.show()

In [None]:
# Render the four-panel summary figure and store it with the other figures
create_main_comparison_figure(
    experiments,
    save_path=f"{CONFIG['output_dir']}/figures/main_comparison.png",
)

---
## 11. Export Results

In [None]:
def export_latex_table(df: pd.DataFrame, caption: str, label: str) -> str:
    """
    Render a DataFrame as a complete LaTeX `table` float.

    The tabular body comes from `df.to_latex` (no index column, no
    escaping of cell contents) and is wrapped in a top-placed, centred
    table environment carrying the given caption and label.

    Args:
        df: Table to render.
        caption: Text placed inside `\\caption{...}`.
        label: Reference key placed inside `\\label{...}`.

    Returns:
        The LaTeX source as a single string, ending with `\\end{table}`.
    """
    tabular = df.to_latex(index=False, escape=False)

    preamble = [
        "\\begin{table}[t]",
        "\\centering",
        f"\\caption{{{caption}}}",
        f"\\label{{{label}}}",
    ]

    # The tabular text supplies its own trailing newline before \end{table}
    return "\n".join(preamble) + "\n" + f"{tabular}\\end{{table}}"

In [None]:
# Write the main results table as LaTeX and echo the first part of it
if not main_results.empty:
    latex_main = export_latex_table(
        df=main_results,
        caption="Main results comparing methods across datasets.",
        label="tab:main_results",
    )

    Path(f"{CONFIG['output_dir']}/tables/main_results.tex").write_text(latex_main)

    print("LaTeX table saved to tables/main_results.tex")
    print("\nPreview:")
    # Only the first 500 characters are shown
    print(f"{latex_main[:500]}...")

In [None]:
# Generate comprehensive summary report
def generate_summary_report(experiments: Dict[str, pd.DataFrame]) -> str:
    """
    Build a plain-text summary covering every experiment.

    The report opens with a timestamped banner, then gives one section per
    experiment: metadata read from `df.attrs['metadata']`, overall
    accuracy, execution metrics, efficiency metrics, and (when the frame
    has an 'axiom' column) accuracy per axiom.

    Args:
        experiments: Mapping of experiment name to its results DataFrame.

    Returns:
        The report as a single newline-joined string.
    """
    banner_rule = "=" * 60
    section_rule = '=' * 40
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    lines = [
        banner_rule,
        "NEURO-SYMBOLIC REASONING EXPERIMENT SUMMARY",
        f"Generated: {generated_at}",
        banner_rule,
    ]

    for exp_name, df in experiments.items():
        # Section header
        lines += [f"\n\n{section_rule}", f"Experiment: {exp_name}", section_rule]

        # Metadata
        meta = df.attrs.get('metadata', {})
        lines += [
            f"\nModel: {meta.get('model', 'Unknown')}",
            f"Logic Type: {meta.get('logic_type', 'Unknown')}",
            f"Total Examples: {len(df)}",
        ]

        # Accuracy
        overall = df['is_correct'].mean() * 100
        lines.append(f"\nOverall Accuracy: {overall:.2f}%")

        # Execution metrics
        exe = compute_execution_metrics(df)
        lines += [
            f"Execution Rate: {exe['exe_rate']:.1f}%",
            f"Execution Accuracy: {exe['exe_acc']:.1f}%",
        ]

        # Efficiency
        eff = analyze_efficiency(df)
        lines += [
            f"\nAvg Time per Query: {eff['avg_total_time']:.2f}s",
            f"Avg LLM Calls: {eff['avg_llm_calls']:.1f}",
        ]

        # Per-axiom breakdown is optional
        if 'axiom' not in df.columns:
            continue
        lines.append("\nPer-Axiom Accuracy:")
        per_axiom = df.groupby('axiom')['is_correct'].mean() * 100
        lines.extend(f"  {axiom}: {acc:.1f}%" for axiom, acc in per_axiom.items())

    return '\n'.join(lines)

# Build the text report, echo it, and persist it next to the other outputs
report = generate_summary_report(experiments)
print(report)

Path(f"{CONFIG['output_dir']}/summary_report.txt").write_text(report)

print(f"\nReport saved to {CONFIG['output_dir']}/summary_report.txt")

In [None]:
# Final summary of all outputs: print the output directory as an indented tree
print("\n" + "="*60)
print("ANALYSIS COMPLETE - OUTPUT FILES")
print("="*60)

import os

# Normalise once so a trailing separator in the configured path (e.g.
# 'outputs/') neither yields an empty root name nor flattens the nesting.
output_root = os.path.normpath(CONFIG['output_dir'])
for root, dirs, files in os.walk(output_root):
    dirs.sort()  # in-place sort makes os.walk descend in a stable order
    rel = os.path.relpath(root, output_root)
    # The root itself is depth 0; 'a' is depth 1, 'a/b' is depth 2, ...
    level = 0 if rel == os.curdir else rel.count(os.sep) + 1
    indent = ' ' * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 2 * (level + 1)
    for file in sorted(files):
        print(f"{subindent}{file}")