# Imports and Setup

In [1]:
import os
import json
import pandas as pd
import numpy as np

# Helper Functions

In [8]:
def load_json(file_path):
    """Read a JSON document from disk and return the parsed object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def calculate_averages(data):
    """Reduce each entry of a results mapping to a single number.

    Handling per value type:
      * list whose first item is a dict: one mean per key of that first item,
        stored under ``"<key>_<subkey>"``;
      * any other non-empty list: its mean, stored under ``key``;
      * int or float (bool included, as it subclasses int): copied unchanged;
      * empty lists and every other type: ignored.

    Args:
        data: Mapping of metric name to a list of values, a list of dicts,
            or a scalar.

    Returns:
        dict mapping output metric names to their averaged (or copied) value.

    Raises:
        KeyError: If a later dict in a list lacks a key present in the first.
    """
    averages = {}
    for key, value in data.items():
        if isinstance(value, list):
            if not value:
                # Nothing to average: skipping avoids an IndexError on
                # value[0] and a NaN mean for the empty sequence.
                continue
            if isinstance(value[0], dict):
                # The first item defines which sub-metrics are averaged.
                for subkey in value[0]:
                    averages[f"{key}_{subkey}"] = np.mean([item[subkey] for item in value])
            else:
                averages[key] = np.mean(value)
        elif isinstance(value, (int, float)):
            averages[key] = value
    return averages

def process_scenario(llm_path, scenario, agent_types):
    """Collect the averaged metrics of every agent for one scenario file.

    For each agent type the file ``<llm_path>/<agent_type>/<scenario>`` is
    loaded if it exists; agents without such a file are skipped.

    ``assignment_test.json`` is handled specially: its ``claim_A`` and
    ``claim_B`` sections are averaged separately and then combined per metric.

    Args:
        llm_path: Directory holding one sub-directory per agent type.
        scenario: File name of the scenario results (e.g. ``'baseline.json'``).
        agent_types: Iterable of agent sub-directory names to look up.

    Returns:
        pandas.DataFrame with one row per agent found: the metric columns plus
        an ``'Agent'`` column. Empty when no file was found.
    """
    results = []
    for agent_type in agent_types:
        file_path = os.path.join(llm_path, agent_type, scenario)
        if not os.path.exists(file_path):
            continue

        data = load_json(file_path)
        if scenario == 'assignment_test.json':
            avg_a = calculate_averages(data['claim_A'])
            avg_b = calculate_averages(data['claim_B'])
            avg = {}
            for k in set(avg_a) | set(avg_b):
                # Average only over the claims that actually report this
                # metric; treating a missing one as 0 would halve the value.
                present = [part[k] for part in (avg_a, avg_b) if k in part]
                avg[k] = sum(present) / len(present)
        else:
            avg = calculate_averages(data)

        avg['Agent'] = agent_type
        results.append(avg)
    return pd.DataFrame(results)

def format_table(df, title):
    styled_df = df.style.format("{:.4f}", subset=[col for col in df.columns if col != 'Agent'])
    styled_df = styled_df.set_caption(title)
    styled_df = styled_df.set_table_styles([
        {'selector': 'caption', 'props': [('font-size', '1.2em'), ('font-weight', 'bold')]},
        {'selector': 'th', 'props': [('font-weight', 'bold'), ('text-align', 'center')]},
        {'selector': 'td', 'props': [('text-align', 'center')]},
        {'selector': '', 'props': [('border', '1px solid black'), ('border-collapse', 'collapse')]},
        {'selector': 'th, td', 'props': [('padding', '5px')]},
    ])
    return styled_df

# Display labels for the agent columns; insertion order is the column order.
_AGENT_LABELS = {
    'base': 'Base',
    'exp': 'Exp',
    'coh': 'Coh',
    'base_exp': 'Base+EXP',
    'base_exp_coh': 'Base+EXP+COH',
}

# Display labels for the metric rows; insertion order is the row order.
_METRIC_LABELS = {
    'bertscore_F1': 'BERTScore (F1)',
    'bertscore_Precision': 'BERTScore (Precision)',
    'bertscore_Recall': 'BERTScore (Recall)',
    'fluency': 'Fluency',
    'coherence': 'Coherence',
    'explanation_accuracy': 'Explanation Accuracy',
    'explanation_completeness': 'Explanation Completeness',
    'claim_accuracy_score': 'Claim Accuracy',
    'claim_support': 'Claim Support',
    'fact_verification': 'Fact Verification',
}


def _build_metric_table(df):
    """Turn the per-agent rows of ``df`` into a metric-by-agent table."""
    # Long format first (one row per agent/metric pair), then pivot so that
    # metrics become rows and agents become columns.
    melted = df.melt(id_vars=['Agent'], var_name='Metric', value_name='Value')
    table = melted.pivot(index='Metric', columns='Agent', values='Value')
    table = table.rename(columns=_AGENT_LABELS, index=_METRIC_LABELS)

    # Known agents in canonical order. Agents whose result file was missing
    # are simply absent (no KeyError); unrecognised agents go last.
    known_cols = [c for c in _AGENT_LABELS.values() if c in table.columns]
    extra_cols = [c for c in table.columns if c not in known_cols]
    table = table[known_cols + extra_cols]

    # Known metrics in canonical order (absent ones become NaN rows, so every
    # table has the same leading layout); unrecognised metrics are appended
    # rather than silently dropped.
    known_rows = list(_METRIC_LABELS.values())
    extra_rows = [m for m in table.index if m not in known_rows]
    return table.reindex(known_rows + extra_rows)


def process_llm(base_path, llm, scenarios, agent_types):
    """Print and display one metric-by-agent table per scenario for an LLM.

    Relies on ``display`` being available in the namespace, as it is inside
    a Jupyter/IPython session.

    Args:
        base_path: Root results directory.
        llm: Path of the model's directory relative to ``base_path``
            (e.g. ``'OpenAI/gpt-4o'``); its last ``/`` component, upper-cased,
            is used as the model name in headings.
        scenarios: Iterable of scenario file names to tabulate.
        agent_types: Iterable of agent sub-directory names to include.

    Returns:
        None. Output is produced via ``print`` and ``display``.
    """
    llm_path = os.path.join(base_path, llm)
    llm_name = llm.split('/')[-1].upper()

    print(f"Results for {llm_name}")
    print("=" * 50)

    for scenario in scenarios:
        df = process_scenario(llm_path, scenario, agent_types)
        if df.empty:
            print(f"No data found for {llm_name} - {scenario}")
            continue

        df_pivot = _build_metric_table(df)

        title = f"{llm_name} - {scenario.replace('.json', '').replace('_', ' ').title()}"
        styled_df = format_table(df_pivot, title)
        display(styled_df)
        print("\n")

# Generate Tables

In [9]:
# Results root, plus the agent variants and scenario files to tabulate
base_path = '../results'
agent_types = ['base', 'exp', 'coh', 'base_exp', 'base_exp_coh']
scenarios = [
    'baseline.json',
    'missing_evidence.json',
    'wrong_evidence.json',
    'mixed.json',
    'selection_test.json',
    'assignment_test.json',
    'full_data_noid.json',
]

# GPT-4o Tables

In [10]:
# Tabulate every scenario for GPT-4o; results are read from
# <base_path>/OpenAI/gpt-4o/<agent_type>/<scenario>.
print("Processing GPT-4o")
process_llm(base_path, 'OpenAI/gpt-4o', scenarios, agent_types)

Processing GPT-4o
Results for GPT-4O


Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore (F1),0.5899,0.5893,0.595,0.5877,0.5899
BERTScore (Precision),0.574,0.5836,0.5836,0.5775,0.5737
BERTScore (Recall),0.6093,0.5984,0.6099,0.6015,0.6101
Fluency,0.8077,0.7955,0.7979,0.7885,0.8126
Coherence,0.9005,0.733,0.8655,0.9503,0.8948
Explanation Accuracy,0.8356,0.8223,0.8387,0.8391,0.8326
Explanation Completeness,0.6669,0.6488,0.6656,0.6653,0.6724
Claim Accuracy,1.7955,1.8075,1.9917,2.0633,1.7745
Claim Support,1.7955,1.8075,1.9917,2.0633,1.7745
Fact Verification,1.0707,1.0268,1.0518,1.0133,1.014






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore (F1),0.5946,0.5886,0.5946,0.5878,0.5917
BERTScore (Precision),0.5792,0.5767,0.5861,0.5722,0.5807
BERTScore (Recall),0.6131,0.6037,0.6059,0.6076,0.6072
Fluency,0.7983,0.8066,0.7996,0.8189,0.774
Coherence,0.8515,0.8253,0.7953,0.7868,0.8089
Explanation Accuracy,0.8313,0.8308,0.8324,0.8373,0.8405
Explanation Completeness,0.6622,0.6634,0.6602,0.6671,0.6771
Claim Accuracy,1.8774,1.714,1.8164,1.7415,2.0106
Claim Support,1.8774,1.714,1.8164,1.7415,2.0106
Fact Verification,1.0035,1.0278,0.9918,1.0439,1.0425






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore (F1),0.5933,0.5872,0.592,0.5904,0.5948
BERTScore (Precision),0.5847,0.5769,0.5797,0.582,0.5775
BERTScore (Recall),0.606,0.6008,0.6077,0.602,0.6153
Fluency,0.7825,0.8039,0.7961,0.8116,0.8072
Coherence,0.9073,0.7132,0.759,0.8973,0.7814
Explanation Accuracy,0.8319,0.8401,0.8276,0.829,0.8416
Explanation Completeness,0.6588,0.6696,0.6631,0.6614,0.6695
Claim Accuracy,1.7982,1.6856,2.146,1.855,1.6562
Claim Support,1.7982,1.6856,2.146,1.855,1.6562
Fact Verification,0.9942,1.0248,1.0348,1.0135,1.0174






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore (F1),0.5903,0.5866,0.5949,0.5813,0.5894
BERTScore (Precision),0.5727,0.573,0.5872,0.5684,0.5759
BERTScore (Recall),0.6113,0.6036,0.6057,0.5967,0.6072
Fluency,0.8042,0.7971,0.789,0.7988,0.812
Coherence,0.7362,0.7257,0.8873,0.7138,0.8148
Explanation Accuracy,0.8308,0.8347,0.8406,0.8184,0.8304
Explanation Completeness,0.6645,0.6714,0.6668,0.6544,0.6754
Claim Accuracy,1.7643,2.0178,1.99,1.7381,1.9203
Claim Support,1.7643,2.0178,1.99,1.7381,1.9203
Fact Verification,1.0103,1.052,1.0848,1.0093,1.0207






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore (F1),0.6019,0.5826,0.5864,0.5789,0.5921
BERTScore (Precision),0.5985,0.5779,0.591,0.5775,0.5912
BERTScore (Recall),0.6087,0.5894,0.5839,0.5822,0.5947
Fluency,0.7336,0.642,0.687,0.6736,0.6177
Coherence,0.9258,0.8155,0.8635,0.8117,0.9306
Explanation Accuracy,0.833,0.8096,0.7925,0.7913,0.8041
Explanation Completeness,0.6473,0.6235,0.6275,0.6206,0.6172
Claim Accuracy,1.8905,1.7875,1.6589,1.4952,1.7969
Claim Support,1.8905,1.7875,1.6589,1.4952,1.7969
Fact Verification,0.9516,0.9359,0.9254,0.9415,0.9583






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore (F1),0.6231,0.6035,0.6135,0.595,0.6099
BERTScore (Precision),0.6288,0.6145,0.6272,0.6074,0.6155
BERTScore (Recall),0.6195,0.5947,0.6022,0.5863,0.6058
Fluency,0.7833,0.7888,0.801,0.7562,0.7951
Coherence,0.9543,0.9099,0.8588,0.932,0.8357
Explanation Accuracy,0.8579,0.8495,0.8515,0.8244,0.8527
Explanation Completeness,0.6152,0.6041,0.605,0.5928,0.6123
Claim Accuracy,2.1974,2.1209,2.2451,2.0689,2.1627
Claim Support,2.1974,2.1209,2.2451,2.0689,2.1627
Fact Verification,1.0498,0.9904,0.987,0.9805,0.9604






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore (F1),0.615,0.611,0.6307,0.602,0.61
BERTScore (Precision),0.534,0.5254,0.5438,0.5207,0.5278
BERTScore (Recall),0.7302,0.7349,0.7553,0.7185,0.7263
Fluency,0.7899,0.7631,0.7787,0.7873,0.77
Coherence,0.9045,0.7179,0.7417,0.8967,0.9177
Explanation Accuracy,0.8065,0.7981,0.8217,0.8048,0.8144
Explanation Completeness,0.7628,0.7593,0.7955,0.7746,0.7833
Claim Accuracy,0.6528,0.5373,0.6371,0.9701,0.7327
Claim Support,0.6528,0.5373,0.6371,0.9701,0.7327
Fact Verification,1.0806,1.2344,1.2667,0.965,1.2528






# GPT-4o-mini Tables

In [5]:
# Tabulate every scenario for GPT-4o-mini; results are read from
# <base_path>/OpenAI/gpt-4o-mini/<agent_type>/<scenario>.
# NOTE(review): this cell's execution count is lower than the helper cell's,
# and the saved output uses metric labels the current helpers do not produce;
# it looks stale -- re-run to confirm.
print("Processing GPT-4o-mini")
process_llm(base_path, 'OpenAI/gpt-4o-mini', scenarios, agent_types)

Processing GPT-4o-mini
Results for GPT-4O-MINI


Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.5874,0.579,0.5865,0.5772,0.5843
BERTScore Precision,0.5771,0.5794,0.5865,0.5758,0.5828
BERTScore Recall,0.6013,0.5806,0.5888,0.5814,0.5887
Claim Accuracy Score,2.0171,1.8286,1.6119,1.8616,2.0065
Claim Support,2.0171,1.8286,1.6119,1.8616,2.0065
Coherence,0.8369,0.9129,0.8692,0.9572,0.8945
Explanation Accuracy,0.828,0.8183,0.8212,0.8299,0.8359
Explanation Completeness,0.6428,0.6388,0.6399,0.6411,0.6437
Fact Verification,1.0116,0.9648,0.9769,0.9564,0.9536
Fluency,0.8043,0.7308,0.7796,0.7306,0.7704






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.5857,0.583,0.5875,0.578,0.5852
BERTScore Precision,0.5774,0.5815,0.5864,0.5734,0.587
BERTScore Recall,0.5971,0.5877,0.5913,0.5855,0.5862
Claim Accuracy Score,1.964,1.6375,1.6291,1.8742,1.9158
Claim Support,1.964,1.6375,1.6291,1.8742,1.9158
Coherence,0.8609,0.9147,0.8015,0.9445,0.8993
Explanation Accuracy,0.8269,0.82,0.8276,0.8235,0.832
Explanation Completeness,0.6583,0.6426,0.6415,0.6463,0.6516
Fact Verification,1.0111,0.9825,0.9957,0.9836,1.0146
Fluency,0.7905,0.7344,0.7899,0.7845,0.7561






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.5853,0.5779,0.5864,0.5825,0.5844
BERTScore Precision,0.5775,0.5789,0.5892,0.582,0.585
BERTScore Recall,0.5965,0.5787,0.5866,0.5855,0.5864
Claim Accuracy Score,2.0892,1.7562,1.5331,1.9979,1.7848
Claim Support,2.0892,1.7562,1.5331,1.9979,1.7848
Coherence,0.8988,0.889,0.915,0.8574,0.9289
Explanation Accuracy,0.8171,0.818,0.823,0.8255,0.8211
Explanation Completeness,0.6436,0.6377,0.6525,0.6444,0.634
Fact Verification,1.0169,0.9651,0.9684,0.9669,0.9587
Fluency,0.7828,0.7652,0.773,0.7551,0.7709






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.5841,0.5803,0.5866,0.5734,0.5899
BERTScore Precision,0.5752,0.5856,0.5903,0.5708,0.5908
BERTScore Recall,0.5962,0.5778,0.5857,0.578,0.591
Claim Accuracy Score,2.0385,1.6233,1.4283,1.6386,1.676
Claim Support,2.0385,1.6233,1.4283,1.6386,1.676
Coherence,0.9437,0.9087,0.9002,0.8626,0.8814
Explanation Accuracy,0.8262,0.82,0.8253,0.8141,0.8351
Explanation Completeness,0.6444,0.6397,0.6486,0.636,0.644
Fact Verification,1.023,0.9646,0.9978,0.9802,0.9877
Fluency,0.8032,0.733,0.7627,0.7602,0.7713






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.5919,0.5784,0.575,0.5708,0.5846
BERTScore Precision,0.5842,0.5761,0.5752,0.5654,0.5803
BERTScore Recall,0.6024,0.5833,0.5773,0.5788,0.5919
Claim Accuracy Score,1.6221,1.5669,1.5957,1.6005,1.2304
Claim Support,1.6221,1.5669,1.5957,1.6005,1.2304
Coherence,0.896,0.8722,0.7344,0.8189,0.8376
Explanation Accuracy,0.8207,0.783,0.7849,0.7603,0.7648
Explanation Completeness,0.6336,0.6187,0.5973,0.6029,0.6027
Fact Verification,0.9582,0.9162,0.9006,0.9059,0.9305
Fluency,0.7405,0.6013,0.595,0.605,0.6424






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.6133,0.5912,0.5801,0.5888,0.5477
BERTScore Precision,0.6182,0.6023,0.6022,0.6005,0.5662
BERTScore Recall,0.6108,0.582,0.5616,0.5792,0.5327
Claim Accuracy Score,2.0953,1.9363,1.856,2.0712,1.6396
Claim Support,2.0953,1.9363,1.856,2.0712,1.6396
Coherence,0.8143,0.9259,0.8692,0.9173,0.8356
Explanation Accuracy,0.8532,0.8415,0.774,0.8331,0.6154
Explanation Completeness,0.6,0.5919,0.5534,0.5864,0.4369
Fact Verification,1.0758,0.9875,0.94,0.9568,0.9315
Fluency,0.765,0.783,0.7791,0.7675,0.764






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.626,0.6081,0.6346,0.6063,0.6117
BERTScore Precision,0.5423,0.5218,0.5491,0.5163,0.524
BERTScore Recall,0.7456,0.7362,0.7557,0.7381,0.7399
Claim Accuracy Score,0.7453,0.7999,0.6668,0.9056,0.9281
Claim Support,0.7453,0.7999,0.6668,0.9056,0.9281
Coherence,1.0079,0.9378,0.972,0.878,0.914
Explanation Accuracy,0.8091,0.7777,0.8063,0.7858,0.7569
Explanation Completeness,0.769,0.7803,0.7769,0.781,0.7631
Fact Verification,0.938,1.1104,1.1477,0.9332,0.992
Fluency,0.7505,0.7575,0.7587,0.7805,0.7654






# GPT 3.5 Turbo Tables

In [6]:
# Tabulate every scenario for GPT-3.5-turbo; results are read from
# <base_path>/OpenAI/gpt3.5-turbo/<agent_type>/<scenario>.
# NOTE(review): this cell's execution count is lower than the helper cell's,
# and the saved output uses metric labels the current helpers do not produce;
# it looks stale -- re-run to confirm.
print("Processing GPT-3.5-turbo")
process_llm(base_path, 'OpenAI/gpt3.5-turbo', scenarios, agent_types)

Processing GPT-3.5-turbo
Results for GPT3.5-TURBO


Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.5097,0.5409,0.4091,0.5112,0.55
BERTScore Precision,0.5158,0.5499,0.4191,0.521,0.5589
BERTScore Recall,0.5066,0.5344,0.4012,0.5038,0.5438
Claim Accuracy Score,1.6526,1.5675,1.179,1.6814,1.6293
Claim Support,1.6526,1.5675,1.179,1.6814,1.6293
Coherence,0.811,0.6757,0.6489,0.5018,0.8753
Explanation Accuracy,0.6946,0.7477,0.5919,0.7141,0.7887
Explanation Completeness,0.5461,0.5993,0.4455,0.5561,0.6077
Fact Verification,0.8975,0.8762,0.6495,0.8519,0.9216
Fluency,0.6242,0.7169,0.5228,0.677,0.7071






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.4755,0.5371,0.473,0.4205,0.4844
BERTScore Precision,0.4797,0.5436,0.4747,0.4358,0.4912
BERTScore Recall,0.4742,0.5328,0.4728,0.4094,0.4794
Claim Accuracy Score,1.4387,1.7139,1.6012,1.0484,1.6781
Claim Support,1.4387,1.7139,1.6012,1.0484,1.6781
Coherence,0.7212,0.7569,0.6088,0.6222,0.7737
Explanation Accuracy,0.6446,0.7459,0.6546,0.5675,0.6862
Explanation Completeness,0.523,0.5848,0.5348,0.4427,0.5207
Fact Verification,0.9303,0.9,0.7603,0.6759,0.7981
Fluency,0.6272,0.6991,0.6167,0.5069,0.6301






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.5636,0.5518,0.4529,0.5215,0.5202
BERTScore Precision,0.5651,0.5623,0.4583,0.5294,0.5266
BERTScore Recall,0.5645,0.5438,0.4491,0.5165,0.5162
Claim Accuracy Score,1.8456,1.2875,1.5307,1.686,1.6544
Claim Support,1.8456,1.2875,1.5307,1.686,1.6544
Coherence,0.7736,0.7759,0.7623,0.8007,0.8235
Explanation Accuracy,0.7877,0.7732,0.6372,0.7244,0.7334
Explanation Completeness,0.6285,0.5969,0.4813,0.5705,0.5678
Fact Verification,1.009,0.9365,0.7563,0.8497,0.8466
Fluency,0.7707,0.7016,0.5602,0.6515,0.6477






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.5253,0.5728,0.457,0.5117,0.5506
BERTScore Precision,0.5245,0.584,0.467,0.521,0.5586
BERTScore Recall,0.5282,0.5639,0.4508,0.5042,0.5446
Claim Accuracy Score,1.9089,2.0669,1.2746,1.7211,1.8268
Claim Support,1.9089,2.0669,1.2746,1.7211,1.8268
Coherence,0.833,0.9683,0.7027,0.6972,0.7952
Explanation Accuracy,0.7242,0.7997,0.6284,0.7083,0.77
Explanation Completeness,0.5849,0.6172,0.4992,0.5594,0.5931
Fact Verification,0.9295,0.9214,0.7712,0.8236,0.9827
Fluency,0.7113,0.7118,0.5394,0.6353,0.7265






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.6109,0.1391,0.3584,0.2482,0.2401
BERTScore Precision,0.6144,0.1445,0.3729,0.2632,0.2511
BERTScore Recall,0.6093,0.1344,0.3477,0.2363,0.2311
Claim Accuracy Score,1.8791,0.2977,0.721,0.6,0.5355
Claim Support,1.8791,0.2977,0.721,0.6,0.5355
Coherence,0.833,0.2485,0.6556,0.4173,0.4235
Explanation Accuracy,0.8509,0.15,0.3973,0.2928,0.2077
Explanation Completeness,0.6418,0.1129,0.3177,0.2275,0.1693
Fact Verification,1.0686,0.2412,0.6453,0.4539,0.463
Fluency,0.7543,0.1654,0.4896,0.2684,0.3291






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.6236,0.4474,0.517,0.3977,0.5012
BERTScore Precision,0.6521,0.4785,0.5462,0.4246,0.5245
BERTScore Recall,0.6,0.4231,0.494,0.3772,0.4829
Claim Accuracy Score,1.8868,1.2335,1.5065,1.031,1.4629
Claim Support,1.8868,1.2335,1.5065,1.031,1.4629
Coherence,0.9855,0.7655,0.8141,0.5955,0.7577
Explanation Accuracy,0.8421,0.5994,0.6893,0.514,0.6128
Explanation Completeness,0.5911,0.4313,0.4826,0.3555,0.4485
Fact Verification,1.0494,0.7524,0.8726,0.6744,0.8618
Fluency,0.7016,0.3693,0.5147,0.4433,0.5971






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.4935,0.5067,0.4236,0.3938,0.4179
BERTScore Precision,0.4363,0.4392,0.3742,0.3484,0.3775
BERTScore Recall,0.5713,0.6043,0.4929,0.4565,0.4775
Claim Accuracy Score,0.2991,1.276,1.1869,0.6931,1.2834
Claim Support,0.2991,1.276,1.1869,0.6931,1.2834
Coherence,0.62,0.7873,0.7765,0.5049,0.7234
Explanation Accuracy,0.6499,0.633,0.372,0.4896,0.38
Explanation Completeness,0.6156,0.6475,0.4147,0.4894,0.3959
Fact Verification,0.8399,0.9463,0.8599,0.6145,0.6772
Fluency,0.6717,0.6451,0.6323,0.4621,0.5777






# Claude 3 Haiku Tables

In [7]:
# Tabulate every scenario for Claude 3 Haiku; results are read from
# <base_path>/Anthropic/claude3_haiku/<agent_type>/<scenario>.
# NOTE(review): this cell's execution count is lower than the helper cell's,
# and the saved output uses metric labels the current helpers do not produce;
# it looks stale -- re-run to confirm.
print("Processing Claude3_haiku")
process_llm(base_path, 'Anthropic/claude3_haiku', scenarios, agent_types)

Processing Claude3_haiku
Results for CLAUDE3_HAIKU


Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.5969,0.5638,0.5736,0.5292,0.5655
BERTScore Precision,0.5952,0.5764,0.5887,0.5346,0.5727
BERTScore Recall,0.6021,0.5555,0.5631,0.5261,0.5609
Claim Accuracy Score,1.7335,1.2807,1.4933,1.1934,1.732
Claim Support,1.7335,1.2807,1.4933,1.1934,1.732
Coherence,0.9122,0.9207,0.7578,0.7427,0.8602
Explanation Accuracy,0.8295,0.7955,0.785,0.7046,0.7976
Explanation Completeness,0.6675,0.6152,0.6149,0.5565,0.6214
Fact Verification,1.0112,0.9382,0.8985,0.913,0.9315
Fluency,0.848,0.7215,0.7225,0.7064,0.7836






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.5985,0.5401,0.5581,0.5442,0.5638
BERTScore Precision,0.5928,0.5543,0.5773,0.549,0.5696
BERTScore Recall,0.6077,0.5346,0.544,0.5416,0.5612
Claim Accuracy Score,1.4123,1.6939,1.7215,1.5063,1.5319
Claim Support,1.4123,1.6939,1.7215,1.5063,1.5319
Coherence,0.8866,0.7442,0.8702,0.8694,0.8231
Explanation Accuracy,0.8264,0.7411,0.7566,0.7605,0.7954
Explanation Completeness,0.6624,0.5925,0.6038,0.595,0.618
Fact Verification,1.087,0.9137,1.0012,0.9288,0.9733
Fluency,0.8685,0.7155,0.6572,0.7488,0.8098






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.5968,0.5416,0.5717,0.5228,0.5253
BERTScore Precision,0.5977,0.5467,0.5783,0.5367,0.5272
BERTScore Recall,0.6004,0.5402,0.5683,0.5131,0.5268
Claim Accuracy Score,1.5069,1.5192,1.8444,1.1814,1.1976
Claim Support,1.5069,1.5192,1.8444,1.1814,1.1976
Coherence,0.8815,0.8087,0.9014,0.805,0.7512
Explanation Accuracy,0.8345,0.7669,0.7775,0.7448,0.7372
Explanation Completeness,0.6674,0.5899,0.6047,0.5685,0.5799
Fact Verification,0.9829,0.8846,0.9312,0.8703,0.8827
Fluency,0.832,0.7198,0.7712,0.6765,0.7618






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.6004,0.5352,0.5797,0.4825,0.5369
BERTScore Precision,0.6001,0.5467,0.5897,0.4925,0.5419
BERTScore Recall,0.6045,0.5276,0.5734,0.4762,0.5357
Claim Accuracy Score,1.6753,1.4484,1.8056,1.1069,1.2472
Claim Support,1.6753,1.4484,1.8056,1.1069,1.2472
Coherence,0.8523,0.6442,0.8706,0.7986,0.8119
Explanation Accuracy,0.827,0.762,0.7782,0.6896,0.7385
Explanation Completeness,0.6681,0.5758,0.6185,0.5183,0.5855
Fact Verification,1.0942,0.9228,0.9652,0.8084,0.9364
Fluency,0.8654,0.7286,0.8151,0.641,0.7192






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.6103,0.2089,0.3078,0.2531,0.2052
BERTScore Precision,0.5922,0.2183,0.3346,0.2641,0.2182
BERTScore Recall,0.6325,0.2007,0.2859,0.2438,0.1943
Claim Accuracy Score,1.7769,0.3686,0.4469,0.4312,0.3966
Claim Support,1.7769,0.3686,0.4469,0.4312,0.3966
Coherence,0.8833,0.4828,0.6613,0.4071,0.343
Explanation Accuracy,0.817,0.0965,0.0426,0.1191,0.0515
Explanation Completeness,0.6625,0.0918,0.0766,0.1135,0.0674
Fact Verification,0.9318,0.4068,0.6729,0.5204,0.4053
Fluency,0.8459,0.357,0.5307,0.4529,0.3802






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.6249,0.5011,0.4843,0.3879,0.4849
BERTScore Precision,0.6288,0.5247,0.512,0.4087,0.5035
BERTScore Recall,0.6232,0.482,0.4616,0.3715,0.4699
Claim Accuracy Score,1.4435,1.1899,1.3424,0.9711,1.0539
Claim Support,1.4435,1.1899,1.3424,0.9711,1.0539
Coherence,0.9025,0.8695,0.8295,0.5926,0.8143
Explanation Accuracy,0.8579,0.5879,0.5985,0.5097,0.5982
Explanation Completeness,0.609,0.4394,0.4443,0.3798,0.4433
Fact Verification,1.0607,0.8431,0.8174,0.6521,0.8536
Fluency,0.8551,0.6551,0.6071,0.4667,0.635






Agent,Base,Exp,Coh,Base+EXP,Base+EXP+COH
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BERTScore F1,0.5933,0.4998,0.4322,0.473,0.4829
BERTScore Precision,0.5109,0.4356,0.3801,0.4144,0.4196
BERTScore Recall,0.711,0.5998,0.5086,0.5585,0.5778
Claim Accuracy Score,0.4158,0.9025,0.709,0.8753,0.9197
Claim Support,0.4158,0.9025,0.709,0.8753,0.9197
Coherence,0.7059,0.844,0.7346,0.7301,0.7939
Explanation Accuracy,0.8096,0.569,0.4271,0.601,0.587
Explanation Completeness,0.7732,0.5481,0.4239,0.5712,0.5643
Fact Verification,1.0436,0.9212,0.8871,0.8287,0.8781
Fluency,0.8561,0.7205,0.6415,0.6584,0.6798




