In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [6]:
# Config
INPUT_FILE = "benchmark_scores.csv"
OUTPUT_DIR = "benchmark_eda_charts"

# Generates and saves bar charts and box plots to compare model scores.
def create_comparison_charts(df):
    print("\n--- Generating comparison charts ---")
    
    # Create output directory if it doesn't exist
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        
    # Identify the base model names from the columns
    model_name_map = {
        col: col.replace('Answer_', '').replace('_', '/') 
        for col in df.columns if col.startswith('Answer_') and not any(k in col for k in ['_F1', '_CosineSim', '_BERTScore'])
    }
    
    if not model_name_map:
        print("Error: No model answer columns found. Cannot generate charts.")
        return

    # Reshape data for easier plotting
    # We want to "melt" the dataframe from a wide format to a long format.
    id_vars = ['Question', 'Answer']
    value_vars = [f"{prefix}_{metric}" for prefix in model_name_map.keys() for metric in ['F1', 'CosineSim', 'BERTScore']]
    
    long_df = pd.melt(df, id_vars=id_vars, value_vars=value_vars, var_name='MetricType', value_name='Score')
    
    # Split 'MetricType' into 'Model' and 'Metric'
    long_df[['Model', 'Metric']] = long_df['MetricType'].str.rsplit('_', n=1, expand=True)
    long_df['Model'] = long_df['Model'].map({k.replace('/', '_'): v for k, v in model_name_map.items()}) # Clean up model names

    # Calculate and print best/worst performers
    summary_lines = ["--- Performance Summary ---"]
    metrics_to_analyze = ['F1', 'CosineSim', 'BERTScore']
    for metric in metrics_to_analyze:
        metric_data = long_df[long_df['Metric'] == metric]
        avg_scores = metric_data.groupby('Model')['Score'].mean()
        
        best_model = avg_scores.idxmax()
        worst_model = avg_scores.idxmin()
        
        summary_lines.append(f"Metric: {metric}")
        summary_lines.append(f"  - Best Performer: {best_model} (Avg Score: {avg_scores.max():.4f})")
        summary_lines.append(f"  - Worst Performer: {worst_model} (Avg Score: {avg_scores.min():.4f})")
    
    summary_text = "\n".join(summary_lines)
    print(f"\n{summary_text}\n")

    # Save the summary to a text file 
    summary_filepath = os.path.join(OUTPUT_DIR, "performance_summary.txt")
    try:
        with open(summary_filepath, 'w', encoding='utf-8') as f:
            f.write(summary_text)
        print(f"Performance summary saved to '{summary_filepath}'")
    except Exception as e:
        print(f"Error saving performance summary: {e}")
    
    # Generate charts
    metrics_to_plot = ['F1', 'CosineSim', 'BERTScore']
    
    for metric in metrics_to_plot:
        metric_data = long_df[long_df['Metric'] == metric]
        
        # Bar chart for avg scores
        plt.style.use('seaborn-v0_8-whitegrid')
        plt.figure(figsize=(12, 7))
        sns.barplot(x='Score', y='Model', data=metric_data, estimator='mean', errorbar=None, palette='viridis', orient='h')
        plt.title(f'Average {metric} Across Models', fontsize=16, weight='bold')
        plt.xlabel(f'Average Score', fontsize=12)
        plt.ylabel('Model', fontsize=12)
        plt.xlim(0, 1) # Scores are between 0 and 1
        plt.tight_layout()
        chart_path = os.path.join(OUTPUT_DIR, f"1_avg_{metric.lower()}_comparison.png")
        plt.savefig(chart_path, dpi=300)
        plt.close()
        print(f"  - Saved average score chart to '{chart_path}'")
        
        # box Plot for score distribution
        plt.figure(figsize=(12, 7))
        sns.boxplot(x='Score', y='Model', data=metric_data, palette='plasma', orient='h')
        plt.title(f'Distribution of {metric} Scores Across Models', fontsize=16, weight='bold')
        plt.xlabel('Score', fontsize=12)
        plt.ylabel('Model', fontsize=12)
        plt.xlim(0, 1)
        plt.tight_layout()
        chart_path = os.path.join(OUTPUT_DIR, f"2_dist_{metric.lower()}_comparison.png")
        plt.savefig(chart_path, dpi=300)
        plt.close()
        print(f"  - Saved score distribution chart to '{chart_path}'")

# Main function to run the EDA on benchmark results.
def main():
    print("--- Starting Benchmark Results EDA Script ---")
    
    if not os.path.exists(INPUT_FILE):
        print(f"FATAL ERROR: Input file not found at '{INPUT_FILE}'")
        return
        
    try:
        df = pd.read_csv(INPUT_FILE)
        create_comparison_charts(df)
        print("\n--- EDA Script Finished ---")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()

--- Starting Benchmark Results EDA Script ---

--- Generating comparison charts ---

--- Performance Summary ---
Metric: F1
  - Best Performer: meta-llama/llama-3.1-70b-instruct (Avg Score: 0.3389)
  - Worst Performer: mistralai/mistral-7b-instruct-v0.3 (Avg Score: 0.2444)
Metric: CosineSim
  - Best Performer: qwen/qwen-2.5-72b-instruct (Avg Score: 0.7792)
  - Worst Performer: meta-llama/llama-3.1-8b-instruct (Avg Score: 0.7171)
Metric: BERTScore
  - Best Performer: qwen/qwen-2.5-72b-instruct (Avg Score: 0.8904)
  - Worst Performer: meta-llama/llama-3.1-8b-instruct (Avg Score: 0.8656)

Performance summary saved to 'benchmark_eda_charts/performance_summary.txt'



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Score', y='Model', data=metric_data, estimator='mean', errorbar=None, palette='viridis', orient='h')


  - Saved average score chart to 'benchmark_eda_charts/1_avg_f1_comparison.png'
  - Saved score distribution chart to 'benchmark_eda_charts/2_dist_f1_comparison.png'



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Score', y='Model', data=metric_data, palette='plasma', orient='h')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Score', y='Model', data=metric_data, estimator='mean', errorbar=None, palette='viridis', orient='h')


  - Saved average score chart to 'benchmark_eda_charts/1_avg_cosinesim_comparison.png'
  - Saved score distribution chart to 'benchmark_eda_charts/2_dist_cosinesim_comparison.png'



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Score', y='Model', data=metric_data, palette='plasma', orient='h')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Score', y='Model', data=metric_data, estimator='mean', errorbar=None, palette='viridis', orient='h')


  - Saved average score chart to 'benchmark_eda_charts/1_avg_bertscore_comparison.png'
  - Saved score distribution chart to 'benchmark_eda_charts/2_dist_bertscore_comparison.png'

--- EDA Script Finished ---



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Score', y='Model', data=metric_data, palette='plasma', orient='h')
