# Imports and Setup

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import json
from matplotlib.lines import Line2D

# Helper Functions

In [2]:
def load_json(file_path):
    """Read the JSON document stored at ``file_path`` and return the parsed object.

    The file is opened explicitly as UTF-8 so that parsing does not depend on
    the platform's default text encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the document (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def process_raw_data(llm_path, scenario, agent_types):
    """Load one scenario's results for every agent type into a long-form DataFrame.

    For each agent type the file ``<llm_path>/<agent_type>/<scenario>`` is read
    if it exists; agent types without such a file are skipped silently.

    For ``'assignment_test.json'`` the top-level ``'claim_A'`` and ``'claim_B'``
    entries are flattened separately and their rows carry an extra ``'Claim'``
    column. Every other scenario is flattened as a single metric mapping.

    Args:
        llm_path: Directory holding one sub-directory per agent type.
        scenario: File name of the scenario result (e.g. ``'baseline.json'``).
        agent_types: Iterable of agent-type directory names.

    Returns:
        A DataFrame with columns ``Agent``, ``Metric``, ``Value``, ``Index``
        (plus ``Claim`` for the assignment test). When no file was found the
        frame is empty but still has the four base columns, so callers can
        filter on ``df['Metric']`` / ``df['Agent']`` without a ``KeyError``.
    """
    all_data = []
    for agent_type in agent_types:
        file_path = os.path.join(llm_path, agent_type, scenario)
        if not os.path.exists(file_path):
            continue  # best-effort: not every agent has every scenario

        data = load_json(file_path)
        if scenario == 'assignment_test.json':
            for claim in ('claim_A', 'claim_B'):
                all_data.extend(process_claim_data(data[claim], agent_type, claim))
        else:
            all_data.extend(process_claim_data(data, agent_type))

    if not all_data:
        # pd.DataFrame([]) would have no columns at all; keep the schema.
        return pd.DataFrame(columns=['Agent', 'Metric', 'Value', 'Index'])
    return pd.DataFrame(all_data)

def process_claim_data(claim_data, agent_type, claim=None):
    """Flatten a ``{metric: values}`` mapping into a list of row dicts.

    Each row has the keys ``Agent``, ``Metric``, ``Value`` and ``Index``; a
    ``Claim`` key is appended when ``claim`` is truthy.

    * A non-list value yields a single row with ``Index`` 0.
    * A list value yields one row per element, ``Index`` being its position.
    * The ``'bertscore'`` list holds mappings; each ``submetric: value`` pair
      becomes its own row named ``bertscore_<submetric>``, indexed by the
      position of the mapping it came from.
    """
    def build_row(metric_name, value, position):
        record = {
            'Agent': agent_type,
            'Metric': metric_name,
            'Value': value,
            'Index': position,
        }
        if claim:
            record['Claim'] = claim
        return record

    rows = []
    for metric, values in claim_data.items():
        if not isinstance(values, list):
            # Scalar metric: a single observation.
            rows.append(build_row(metric, values, 0))
        elif metric == 'bertscore':
            # Each entry is a mapping of sub-metric name to score.
            rows.extend(
                build_row(f'bertscore_{submetric}', score, position)
                for position, entry in enumerate(values)
                for submetric, score in entry.items()
            )
        else:
            rows.extend(
                build_row(metric, value, position)
                for position, value in enumerate(values)
            )
    return rows

def create_subplots(nrows=4, ncols=3, figsize=(20, 24)):
    """Create a figure holding an ``nrows`` x ``ncols`` grid of axes.

    ``squeeze=False`` is passed to ``plt.subplots`` so the axes are returned
    as a 2-D array and can be indexed as ``axes[row, col]``.

    Returns:
        The ``(fig, axes)`` pair produced by ``plt.subplots``.
    """
    return plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)

def plot_in_grid(fig, axes, data, plot_function, title, nrows=4, ncols=3):
    """Draw one subplot per entry of ``data`` on a grid and drop unused axes.

    Entries fill every row except the last in row-major order. Entries that
    spill into the last row are placed in the middle column first and then in
    the remaining columns from left to right (for three columns: middle,
    left, right), so a single leftover plot ends up centred.

    Every axis that received no plot is removed from the figure, whatever the
    number of entries.

    Args:
        fig: Figure owning ``axes``.
        axes: 2-D array of axes indexable as ``axes[row, col]`` with at least
            ``nrows`` x ``ncols`` cells.
        data: Mapping of metric name to the data handed to ``plot_function``.
        plot_function: Callable invoked as ``plot_function(ax, metric, metric_data)``.
        title: Figure-level title.
        nrows: Number of grid rows.
        ncols: Number of grid columns.

    Raises:
        ValueError: If ``data`` has more entries than the grid has cells
            (previously the surplus was silently drawn on top of the last axis).
    """
    capacity = nrows * ncols
    if len(data) > capacity:
        raise ValueError(
            f"Cannot place {len(data)} plots on a {nrows}x{ncols} grid "
            f"({capacity} cells)."
        )

    fig.suptitle(title, fontsize=16, y=0.95)

    full_rows_capacity = (nrows - 1) * ncols
    middle = ncols // 2
    # Column order used for the last row: centre first, then left to right.
    last_row_columns = [middle] + [col for col in range(ncols) if col != middle]

    used = set()
    for i, (metric, metric_data) in enumerate(data.items()):
        if i < full_rows_capacity:
            row, col = divmod(i, ncols)
        else:
            row, col = nrows - 1, last_row_columns[i - full_rows_capacity]
        used.add((row, col))
        plot_function(axes[row, col], metric, metric_data)

    # Remove every cell that stayed empty, in any row.
    for row in range(nrows):
        for col in range(ncols):
            if (row, col) not in used:
                fig.delaxes(axes[row, col])

    # Operate on ``fig`` itself rather than pyplot's implicit current figure.
    fig.tight_layout()
    fig.subplots_adjust(top=0.92)  # Leave room for the figure title

def create_density_plots(base_path, llms, scenarios, agent_types, metrics):
    """Save a grid of per-metric density (KDE) plots for each LLM and scenario.

    For every LLM / scenario pair the raw results are loaded with
    ``process_raw_data`` and one subplot per entry of ``metrics`` is drawn,
    with one density curve per agent type. Each figure is written to
    ``./graphs/density_plots/<LLM_NAME>/<scenario>_density_plots.png``.

    Pairs for which no data could be loaded are reported and skipped instead
    of failing with a ``KeyError`` on the empty frame.

    Args:
        base_path: Root directory of the results.
        llms: LLM paths relative to ``base_path``; the part after the last
            ``/`` (upper-cased) names the output folder and the title.
        scenarios: Scenario file names (``*.json``).
        agent_types: Agent types to draw; also defines the colour mapping.
        metrics: Metric names, one subplot each.
    """
    color_palette = sns.color_palette("husl", n_colors=len(agent_types))
    agent_colors = dict(zip(agent_types, color_palette))

    def density_plot(ax, metric, metric_data):
        for agent in agent_types:
            agent_data = metric_data[metric_data['Agent'] == agent]
            # Translucent filled density first, then a solid outline on top.
            sns.kdeplot(data=agent_data, x='Value', ax=ax, fill=True, alpha=0.2,
                        color=agent_colors[agent], label=agent)
            sns.kdeplot(data=agent_data, x='Value', ax=ax, color=agent_colors[agent],
                        linewidth=2, label='_nolegend_')
        ax.set_title(f"{metric}")
        ax.set_xlabel("Value")
        ax.set_ylabel("Density")
        # Explicit handles so every agent is listed once, as a plain line.
        legend_elements = [Line2D([0], [0], color=agent_colors[agent], lw=2, label=agent)
                           for agent in agent_types]
        ax.legend(handles=legend_elements, title="Agent Type", loc="upper right")

    for llm in llms:
        llm_path = os.path.join(base_path, llm)
        llm_name = llm.split('/')[-1].upper()

        model_folder = os.path.join("./graphs/density_plots", llm_name)
        os.makedirs(model_folder, exist_ok=True)

        for scenario in scenarios:
            scenario_name = scenario.replace('.json', '')
            df = process_raw_data(llm_path, scenario, agent_types)
            if df.empty:
                print(f"No data for {llm_name} / {scenario_name}; skipping density plots.")
                continue

            fig, axes = create_subplots()
            title = f"{llm_name} - {scenario_name.replace('_', ' ').title()}"

            data = {metric: df[df['Metric'] == metric] for metric in metrics}
            plot_in_grid(fig, axes, data, density_plot, title)

            # Save and close this figure explicitly, not pyplot's current one.
            fig.savefig(os.path.join(model_folder, f"{scenario_name}_density_plots.png"),
                        dpi=300, bbox_inches='tight')
            plt.close(fig)

def create_boxplots(base_path, llms, scenarios, agent_types, metrics):
    """Save a grid of per-metric boxplots for each LLM and scenario.

    For every LLM / scenario pair the raw results are loaded with
    ``process_raw_data`` and one subplot per entry of ``metrics`` is drawn,
    with one box per agent type. Each figure is written to
    ``./graphs/boxplots/<LLM_NAME>/<scenario>_boxplots.png``.

    Pairs for which no data could be loaded are reported and skipped instead
    of failing with a ``KeyError`` on the empty frame.

    Args:
        base_path: Root directory of the results.
        llms: LLM paths relative to ``base_path``; the part after the last
            ``/`` (upper-cased) names the output folder and the title.
        scenarios: Scenario file names (``*.json``).
        agent_types: Agent types whose result files are loaded.
        metrics: Metric names, one subplot each.
    """
    def box_plot(ax, metric, metric_data):
        sns.boxplot(data=metric_data, x='Agent', y='Value', ax=ax)
        ax.set_title(f"{metric}")
        ax.set_xlabel("Agent Type")
        ax.set_ylabel("Value")
        ax.tick_params(axis='x', rotation=45)

    for llm in llms:
        llm_path = os.path.join(base_path, llm)
        llm_name = llm.split('/')[-1].upper()

        model_folder = os.path.join("./graphs/boxplots", llm_name)
        os.makedirs(model_folder, exist_ok=True)

        for scenario in scenarios:
            scenario_name = scenario.replace('.json', '')
            df = process_raw_data(llm_path, scenario, agent_types)
            if df.empty:
                print(f"No data for {llm_name} / {scenario_name}; skipping boxplots.")
                continue

            fig, axes = create_subplots()
            title = f"{llm_name} - {scenario_name.replace('_', ' ').title()} - Boxplots"

            data = {metric: df[df['Metric'] == metric] for metric in metrics}
            plot_in_grid(fig, axes, data, box_plot, title)

            # Save and close this figure explicitly, not pyplot's current one.
            fig.savefig(os.path.join(model_folder, f"{scenario_name}_boxplots.png"),
                        dpi=300, bbox_inches='tight')
            plt.close(fig)

def create_heatmap(base_path, llms, scenarios, agent_types, metrics):
    """Save an agent-by-metric heatmap of mean values for each LLM and scenario.

    The raw results are pivoted to one row per agent and one column per
    metric, each cell holding the mean ``Value``. Each figure is written to
    ``./graphs/heatmaps/<LLM_NAME>/<scenario>_heatmap.png``.

    Pairs for which no data could be loaded are reported and skipped instead
    of failing while pivoting an empty frame.

    Args:
        base_path: Root directory of the results.
        llms: LLM paths relative to ``base_path``; the part after the last
            ``/`` (upper-cased) names the output folder and the title.
        scenarios: Scenario file names (``*.json``).
        agent_types: Agent types whose result files are loaded.
        metrics: Not used by this function; every metric present in the
            loaded data is shown. Kept so all plotting helpers share one
            signature.
    """
    for llm in llms:
        llm_path = os.path.join(base_path, llm)
        llm_name = llm.split('/')[-1].upper()

        model_folder = os.path.join("./graphs/heatmaps", llm_name)
        os.makedirs(model_folder, exist_ok=True)

        for scenario in scenarios:
            scenario_name = scenario.replace('.json', '')
            df = process_raw_data(llm_path, scenario, agent_types)
            if df.empty:
                print(f"No data for {llm_name} / {scenario_name}; skipping heatmap.")
                continue

            pivot_df = df.pivot_table(values='Value', index='Agent', columns='Metric', aggfunc='mean')

            # Use explicit figure/axes objects rather than pyplot's current ones.
            fig, ax = plt.subplots(figsize=(15, 10))
            sns.heatmap(pivot_df, annot=True, cmap="YlGnBu", fmt=".2f", ax=ax)
            ax.set_title(f"{llm_name} - {scenario_name.replace('_', ' ').title()} - Heatmap")
            fig.tight_layout()
            fig.savefig(os.path.join(model_folder, f"{scenario_name}_heatmap.png"),
                        dpi=300, bbox_inches='tight')
            plt.close(fig)


def create_radar_plots(base_path, llms, scenarios, agent_types, metrics):
    """Save a radar (polar) plot of mean metric values for each LLM and scenario.

    One closed polygon is drawn per agent type; its vertices are the mean
    ``Value`` of each entry of ``metrics``, spaced evenly around the circle.
    Each figure is written to
    ``./graphs/radar_plots/<LLM_NAME>/<scenario>_radar_plot.png``.

    Pairs for which no data could be loaded are reported and skipped instead
    of failing with a ``KeyError`` on the empty frame.

    Args:
        base_path: Root directory of the results.
        llms: LLM paths relative to ``base_path``; the part after the last
            ``/`` (upper-cased) names the output folder and the title.
        scenarios: Scenario file names (``*.json``).
        agent_types: Agent types, one polygon each.
        metrics: Metric names, one spoke each.
    """
    # The spoke angles depend only on ``metrics``: compute them once, outside
    # the agent loop, so they also exist when ``agent_types`` is empty.
    angles = [n / float(len(metrics)) * 2 * np.pi for n in range(len(metrics))]
    closed_angles = angles + angles[:1]  # Repeat the first angle to close the polygon

    for llm in llms:
        llm_path = os.path.join(base_path, llm)
        llm_name = llm.split('/')[-1].upper()

        model_folder = os.path.join("./graphs/radar_plots", llm_name)
        os.makedirs(model_folder, exist_ok=True)

        for scenario in scenarios:
            scenario_name = scenario.replace('.json', '')
            df = process_raw_data(llm_path, scenario, agent_types)
            if df.empty:
                print(f"No data for {llm_name} / {scenario_name}; skipping radar plot.")
                continue

            fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

            for agent in agent_types:
                agent_data = df[df['Agent'] == agent]
                values = [agent_data[agent_data['Metric'] == metric]['Value'].mean() for metric in metrics]
                values += values[:1]  # Repeat the first value to close the polygon

                ax.plot(closed_angles, values, linewidth=2, linestyle='solid', label=agent)
                ax.fill(closed_angles, values, alpha=0.1)

            ax.set_xticks(angles)
            ax.set_xticklabels(metrics)
            ax.set_title(f"{llm_name} - {scenario_name.replace('_', ' ').title()} - Radar Plot")
            ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))

            # Lay out, save and close this figure explicitly.
            fig.tight_layout()
            fig.savefig(os.path.join(model_folder, f"{scenario_name}_radar_plot.png"),
                        dpi=300, bbox_inches='tight')
            plt.close(fig)

def create_correlation_heatmaps(base_path, llms, scenarios, agent_types, metrics):
    """Save a metric-to-metric correlation heatmap for each LLM and scenario.

    The raw results are first reduced to the mean ``Value`` per agent and
    metric; the correlation is then computed between metric columns of that
    table, i.e. across agent types (one observation per agent). Each figure
    is written to
    ``./graphs/correlation_heatmaps/<LLM_NAME>/<scenario>_correlation_heatmap.png``.

    Pairs for which no data could be loaded are reported and skipped instead
    of failing while pivoting an empty frame.

    Args:
        base_path: Root directory of the results.
        llms: LLM paths relative to ``base_path``; the part after the last
            ``/`` (upper-cased) names the output folder and the title.
        scenarios: Scenario file names (``*.json``).
        agent_types: Agent types whose result files are loaded.
        metrics: Not used by this function; every metric present in the
            loaded data is included. Kept so all plotting helpers share one
            signature.
    """
    for llm in llms:
        llm_path = os.path.join(base_path, llm)
        llm_name = llm.split('/')[-1].upper()

        model_folder = os.path.join("./graphs/correlation_heatmaps", llm_name)
        os.makedirs(model_folder, exist_ok=True)

        for scenario in scenarios:
            scenario_name = scenario.replace('.json', '')
            df = process_raw_data(llm_path, scenario, agent_types)
            if df.empty:
                print(f"No data for {llm_name} / {scenario_name}; skipping correlation heatmap.")
                continue

            pivot_df = df.pivot_table(values='Value', index='Agent', columns='Metric', aggfunc='mean')
            correlation_matrix = pivot_df.corr()

            # Use explicit figure/axes objects rather than pyplot's current ones.
            fig, ax = plt.subplots(figsize=(12, 10))
            sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1, center=0, ax=ax)
            ax.set_title(f"{llm_name} - {scenario_name.replace('_', ' ').title()} - Correlation Heatmap")
            fig.tight_layout()
            fig.savefig(os.path.join(model_folder, f"{scenario_name}_correlation_heatmap.png"),
                        dpi=300, bbox_inches='tight')
            plt.close(fig)

def create_performance_comparison(base_path, llms, scenarios, agent_types, metrics):
    """Save, per scenario, a grid of bar charts comparing all LLMs.

    For each scenario the results of every LLM are loaded, tagged with an
    ``LLM`` column (the upper-cased part of the LLM path after the last
    ``/``) and combined. One subplot per entry of ``metrics`` shows the
    values grouped by LLM with one bar per agent type; a single figure-level
    legend replaces the per-subplot legends. Each figure is written to
    ``./graphs/performance_comparison/<scenario>_performance_comparison.png``.

    LLMs without data for a scenario are left out; a scenario for which no
    LLM has data is reported and skipped.

    Args:
        base_path: Root directory of the results.
        llms: LLM paths relative to ``base_path``.
        scenarios: Scenario file names (``*.json``).
        agent_types: Agent types whose result files are loaded.
        metrics: Metric names, one subplot each.
    """
    # The subplot callback does not depend on the scenario: define it once.
    def performance_plot(ax, metric, metric_data):
        sns.barplot(data=metric_data, x='LLM', y='Value', hue='Agent', ax=ax)
        ax.set_title(f"{metric}")
        ax.set_xlabel("LLM")
        ax.set_ylabel("Value")
        ax.tick_params(axis='x', rotation=45)
        ax.legend().remove()  # Remove the legend from individual subplots

    output_folder = "./graphs/performance_comparison"

    for scenario in scenarios:
        scenario_name = scenario.replace('.json', '')

        all_data = []
        for llm in llms:
            llm_path = os.path.join(base_path, llm)
            df = process_raw_data(llm_path, scenario, agent_types)
            if df.empty:
                continue
            df['LLM'] = llm.split('/')[-1].upper()
            all_data.append(df)

        if not all_data:
            print(f"No data for {scenario_name}; skipping performance comparison.")
            continue

        # Each per-LLM frame starts its index at 0; renumber so the combined
        # frame has unique index labels for plotting.
        combined_df = pd.concat(all_data, ignore_index=True)

        fig, axes = create_subplots()
        title = f"Performance Comparison - {scenario_name.replace('_', ' ').title()}"

        data = {metric: combined_df[combined_df['Metric'] == metric] for metric in metrics}
        plot_in_grid(fig, axes, data, performance_plot, title)

        # Create a single legend for the entire figure
        handles, labels = axes[0, 0].get_legend_handles_labels()
        fig.legend(handles, labels, title="Agent Type", loc="center left", bbox_to_anchor=(0.01, 0.5))

        # Adjust the layout to make room for the legend
        fig.tight_layout()
        fig.subplots_adjust(left=0.15, top=0.92)

        os.makedirs(output_folder, exist_ok=True)
        fig.savefig(os.path.join(output_folder, f"{scenario_name}_performance_comparison.png"),
                    dpi=300, bbox_inches='tight')
        plt.close(fig)


# Generate Graphs

In [3]:
# Inputs shared by every plotting call below.

# Root directory of the result files, relative to the working directory.
base_path = '../results'

# LLM result folders as '<provider>/<model>' paths under base_path.
llms = [
    'OpenAI/gpt-4o',
    'OpenAI/gpt-4o-mini',
    'OpenAI/gpt3.5-turbo',
    'Anthropic/claude3_haiku',
]

# Scenario result files looked up inside each agent-type folder.
scenarios = [
    f'{name}.json'
    for name in (
        'baseline',
        'missing_evidence',
        'wrong_evidence',
        'mixed',
        'selection_test',
        'assignment_test',
        'full_data_noid',
    )
]

# Agent-type folder names.
agent_types = ['base', 'exp', 'coh', 'base_exp', 'base_exp_coh']

# Metrics to plot: the three bertscore sub-metrics, then the remaining metrics.
metrics = [f'bertscore_{part}' for part in ('F1', 'Precision', 'Recall')] + [
    'fluency',
    'coherence',
    'explanation_accuracy',
    'explanation_completeness',
    'claim_accuracy_score',
    'claim_support',
    'fact_verification',
]

In [4]:
# Write one density-plot grid per LLM and scenario under ./graphs/density_plots/.
create_density_plots(base_path, llms, scenarios, agent_types, metrics)

In [5]:
# Write one boxplot grid per LLM and scenario under ./graphs/boxplots/.
create_boxplots(base_path, llms, scenarios, agent_types, metrics)

In [6]:
# Write one agent-by-metric mean heatmap per LLM and scenario under ./graphs/heatmaps/.
create_heatmap(base_path, llms, scenarios, agent_types, metrics)

In [7]:
# Write one radar plot of mean metric values per LLM and scenario under ./graphs/radar_plots/.
create_radar_plots(base_path, llms, scenarios, agent_types, metrics)

In [8]:
# Write one metric-correlation heatmap per LLM and scenario under ./graphs/correlation_heatmaps/.
create_correlation_heatmaps(base_path, llms, scenarios, agent_types, metrics)

In [9]:
# Write one cross-LLM bar-chart grid per scenario under ./graphs/performance_comparison/.
create_performance_comparison(base_path, llms, scenarios, agent_types, metrics)