<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [12]</a>'.</span>

# SODA Curation Benchmarking Report

## Overview
This report analyzes the performance of different AI strategies on the SODA Curation tasks. The strategies include variations of GPT and Claude models with different parameters.

## Tasks Performance Analysis

### Score Distributions
The following plots show the distribution of scores across different strategies for each task. The step histograms reveal:

- Number of scores falling in each of 20 equal-width bins between 0 and 1
- Relative performance between strategies
- Potential bimodal patterns or clusters

### Box Plots 
Box plots complement the histograms by showing:

- Median performance
- Score spread (IQR)
- Outliers
- Direct strategy comparisons

## Detailed Results

The performance plots are produced by the code cells below; each figure is also saved as a PDF in the `figures/` directory.

## Key Findings

- Task-specific performance patterns
- Strategy comparisons
- Areas for improvement
- Notable success cases

## Technical Details
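- Score metrics: BLEU-1, ROUGE scores. The plots below use BLEU-1, except for `panel_source_assignment` and `extract_data_sources`, which are plotted with their exact-match accuracy columns.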
- Data collection period: TODO — not yet filled in (template placeholder `{date}`)
- Number of test cases per task

In [None]:
# Import required libraries
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
import os
from pathlib import Path
import matplotlib.pyplot as plt


In [None]:
# Color palette used to style the plots: hex color strings keyed by
# "<tier>_<color name>". Insertion order matters because the plotting code
# takes colors from this mapping in order.
PALETTE = dict(
    # Primary colors
    primary_yellow="#F2B80E",
    primary_black="#222222",
    # Secondary colors
    secondary_red="#C53B2E",
    secondary_orange="#E1812B",
    secondary_green="#94AF1F",
    # Tertiary colors
    tertiary_blue="#449EAF",
    tertiary_purple="#3D2D81",
    tertiary_teal="#43806E",
    tertiary_magenta="#8E2D64",
)


def ensure_figures_dir(path="./figures"):
    """Create the figures directory if it doesn't exist and return it.

    Args:
        path: Directory to create, as a string or ``Path``. Defaults to
            ``"./figures"``, the location used by the plotting cells.

    Returns:
        The directory as a ``pathlib.Path``.
    """
    target = Path(path)
    # parents=True lets callers pass nested output locations;
    # exist_ok=True makes re-running the cell a no-op.
    target.mkdir(parents=True, exist_ok=True)
    return target


# Create figures directory (single source of truth: the helper above).
figures_dir = ensure_figures_dir()


In [None]:
import pandas as pd

# Load the benchmark results from the JSON file next to the notebook.
df = pd.read_json("results.json")

score = "bleu1"

# Values to exclude, keyed by column name; None means "exclude nothing".
ignore = dict(
    task=None,
    strategy=["regex"],
    msid=None,
    figure_label=None,
    run=None,
)

# Drop every row whose value in a column appears on that column's exclusion list.
for column, excluded in ignore.items():
    if excluded is None:
        continue
    df = df.loc[~df[column].isin(excluded)]

# `run` is numeric in the file but is used as a label, so store it as a category.
df = df.assign(run=df["run"].astype("category"))

In [None]:
def create_performance_plots(df, save_dir='figures'):
    """Create individual performance plots per task.

    For every task in ``df`` two figures are drawn, shown and saved as PDF:
    a step histogram of the scores (one line per strategy) and a box plot
    (one box per strategy).

    Args:
        df: DataFrame with ``task`` and ``strategy`` columns plus the score
            column used for each task (``bleu1``, or the exact-accuracy
            column for ``panel_source_assignment`` / ``extract_data_sources``).
        save_dir: Directory the PDFs are written to; created if missing.

    Returns:
        A short status message naming the output directory.

    Raises:
        KeyError: If a required column is absent from ``df``.
    """
    # Make sure the output directory exists so savefig cannot fail on it.
    os.makedirs(save_dir, exist_ok=True)

    tasks = df['task'].unique()
    strategies = list(df['strategy'].unique())

    # Cycle through the palette so that strategies beyond the palette size
    # are still drawn (slicing + zip would silently drop them).
    palette = list(PALETTE.values())
    colors = [palette[i % len(palette)] for i in range(len(strategies))]

    bins = np.linspace(0, 1, 21)  # 20 bins of equal width from 0 to 1
    tick_positions = list(range(1, len(strategies) + 1))  # default boxplot positions
    tick_labels = [str(s) for s in strategies]

    for task in tasks:
        task_data = df[df['task'] == task]

        # Select appropriate score column
        if task == 'panel_source_assignment':
            score_col = 'panel_source_manuscript_accuracy_exact'
        elif task == 'extract_data_sources':
            score_col = 'data_source_accuracy_exact'
        else:
            score_col = 'bleu1'

        # Missing scores carry no information for either plot, and NaN
        # values would turn the box statistics into NaN, so drop them once.
        scores_by_strategy = [
            task_data.loc[task_data['strategy'] == s, score_col].dropna()
            for s in strategies
        ]

        # Histogram plot
        fig, ax = plt.subplots(figsize=(12, 6))
        for strategy, color, scores in zip(strategies, colors, scores_by_strategy):
            ax.hist(scores, bins=bins, histtype='step', label=strategy,
                    color=color, linewidth=2, density=False)

        ax.set_title(f'Score Distribution for {task}')
        ax.set_xlabel('Score')
        ax.set_ylabel('Count')
        ax.legend()
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        fig.savefig(os.path.join(save_dir, f'{task}_histogram.pdf'))
        plt.show()
        plt.close(fig)

        # Box plot with filled boxes
        fig, ax = plt.subplots(figsize=(12, 6))
        bp = ax.boxplot(scores_by_strategy, patch_artist=True)

        for box, color in zip(bp['boxes'], colors):
            box.set_facecolor(color)
            box.set_alpha(0.6)
            box.set_edgecolor('black')

        # Tick labels are set explicitly instead of through boxplot's
        # `labels=` keyword, which newer Matplotlib releases deprecate.
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(tick_labels, rotation=45)
        ax.set_title(f'Score Distribution by Strategy for {task}')
        ax.set_ylabel('Score')
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        fig.savefig(os.path.join(save_dir, f'{task}_boxplot.pdf'))
        plt.show()
        plt.close(fig)

    return f"Plots saved in {save_dir} directory"

In [None]:
# Draw, show and save the per-task histogram and box plots under ./figures.
create_performance_plots(df, save_dir='figures')

In [None]:
def create_manuscript_performance_plots(df, save_dir='figures'):
    """Create performance plots showing aggregate scores per manuscript.

    For every task in ``df`` one box plot is drawn, shown and saved as PDF,
    with one box per manuscript id found anywhere in ``df`` (so all tasks
    share the same x axis).

    Args:
        df: DataFrame with ``task`` and ``msid`` columns plus the score
            column used for each task (``bleu1``, or the exact-accuracy
            column for ``panel_source_assignment`` / ``extract_data_sources``).
        save_dir: Directory the PDFs are written to; created if missing.

    Returns:
        A short status message naming the output directory.

    Raises:
        KeyError: If a required column is absent from ``df``.
    """
    # Make sure the output directory exists so savefig cannot fail on it.
    os.makedirs(save_dir, exist_ok=True)

    manuscripts = list(df['msid'].unique())
    tasks = df['task'].unique()

    # Cycle through the palette: there are usually more manuscripts than
    # palette entries, and a plain zip would leave the extra boxes unstyled.
    palette = list(PALETTE.values())
    colors = [palette[i % len(palette)] for i in range(len(manuscripts))]

    tick_positions = list(range(1, len(manuscripts) + 1))  # default boxplot positions
    tick_labels = [str(m) for m in manuscripts]

    for task in tasks:
        task_data = df[df['task'] == task]
        score_col = ('panel_source_manuscript_accuracy_exact' if task == 'panel_source_assignment'
                     else 'data_source_accuracy_exact' if task == 'extract_data_sources'
                     else 'bleu1')

        # NaN scores would turn the box statistics into NaN, so drop them.
        scores_by_manuscript = [
            task_data.loc[task_data['msid'] == m, score_col].dropna()
            for m in manuscripts
        ]

        # Box plot per manuscript
        fig, ax = plt.subplots(figsize=(12, 6))
        bp = ax.boxplot(scores_by_manuscript, patch_artist=True)

        for box, color in zip(bp['boxes'], colors):
            box.set_facecolor(color)
            box.set_alpha(0.6)
            box.set_edgecolor('black')

        # Tick labels are set explicitly instead of through boxplot's
        # `labels=` keyword, which newer Matplotlib releases deprecate.
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(tick_labels, rotation=45)
        ax.set_title(f'Score Distribution by Manuscript for {task}')
        ax.set_ylabel('Score')
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        fig.savefig(os.path.join(save_dir, f'{task}_manuscript_boxplot.pdf'))
        plt.show()
        plt.close(fig)

    return f"Plots saved in {save_dir} directory"

In [None]:
# Draw, show and save the per-manuscript box plots (one figure per task) under ./figures.
create_manuscript_performance_plots(df, save_dir='figures')

In [None]:
def show_task_errors_html(df, task_name, threshold=0.8, n_examples=3):
    """Display examples of errors for a given task in HTML format.

    Rows of ``task_name`` whose score is below ``threshold`` are sorted from
    worst to best; the first ``n_examples`` are rendered with the expected
    and actual values side by side, followed by a line-based diff table.

    Args:
        df: DataFrame with ``task``, ``strategy``, ``msid``, ``expected`` and
            ``actual`` columns plus the score column used for the task
            (``bleu1``, or the exact-accuracy column for
            ``panel_source_assignment`` / ``extract_data_sources``).
        task_name: Value of the ``task`` column to inspect.
        threshold: Rows scoring strictly below this value count as errors.
        n_examples: Maximum number of examples to render.

    Returns:
        An ``IPython.display.HTML`` object: the rendered examples, or a short
        message when no row falls below the threshold.

    Raises:
        KeyError: If a required column is absent from ``df``.
    """
    from IPython.display import HTML
    import difflib
    import html
    import json

    def _as_text(value):
        """Turn a cell value into text that can be escaped and diffed."""
        if isinstance(value, str):
            return value
        # Missing values (None / NaN) are shown as empty text.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        # Structured values are pretty-printed over several lines so that
        # the line-based diff below stays readable.
        if isinstance(value, (list, tuple, dict)):
            return json.dumps(value, indent=2, sort_keys=True,
                              default=str, ensure_ascii=False)
        return str(value)

    task_data = df[df['task'] == task_name]
    score_col = ('panel_source_manuscript_accuracy_exact' if task_name == 'panel_source_assignment'
                 else 'data_source_accuracy_exact' if task_name == 'extract_data_sources'
                 else 'bleu1')

    errors = task_data[task_data[score_col] < threshold].sort_values(score_col)

    if len(errors) == 0:
        # f-string: the message must show the actual threshold value.
        return HTML(f"<p>No examples found below threshold {threshold}</p>")

    html_output = """
    <style>
        .error-container { margin: 20px 0; padding: 15px; border: 1px solid #ddd; }
        .error-header { font-weight: bold; margin-bottom: 10px; }
        .comparison-container { display: flex; margin: 10px 0; }
        .text-column { flex: 1; padding: 10px; border: 1px solid #eee; margin: 0 5px; }
        .diff-view { margin-top: 10px; padding: 10px; background-color: #f8f8f8; }
        .diff-add { background-color: #e6ffe6; }
        .diff-del { background-color: #ffe6e6; }
    </style>
    """

    differ = difflib.HtmlDiff()

    for _, row in errors.head(n_examples).iterrows():
        expected_text = _as_text(row['expected'])
        actual_text = _as_text(row['actual'])

        # Everything taken from the data is escaped so that markup inside a
        # caption or model output cannot break or inject into the report.
        html_output += f"""
        <div class="error-container">
            <div class="error-header">
                Score: {row[score_col]:.3f} | Strategy: {html.escape(str(row['strategy']))} | Manuscript: {html.escape(str(row['msid']))}
            </div>
            <div class="comparison-container">
                <div class="text-column">
                    <strong>Expected:</strong><br>
                    {html.escape(expected_text)}
                </div>
                <div class="text-column">
                    <strong>Actual:</strong><br>
                    {html.escape(actual_text)}
                </div>
            </div>
            <div class="diff-view">
                <strong>Differences:</strong><br>
        """

        # Generate diff (HtmlDiff escapes the lines it is given).
        diff_html = differ.make_table(expected_text.splitlines(),
                                      actual_text.splitlines(),
                                      'Expected', 'Actual',
                                      context=True)
        # Close the diff-view and error-container divs opened above.
        html_output += diff_html + "</div></div>"

    return HTML(html_output)

In [None]:
# Show the single lowest-scoring figure-caption example with a score below 0.8.
show_task_errors_html(df, "extract_figure_caption", threshold=0.8, n_examples=1)


In [None]:
# Show the single lowest-scoring figure-title example with a score below 0.8.
show_task_errors_html(df, "extract_figure_title", threshold=0.8, n_examples=1)


In [None]:
# Show the single lowest-scoring data-availability-section example with a score below 0.8.
show_task_errors_html(df, "extract_data_availability_section", threshold=0.8, n_examples=1)


In [None]:
# Show the single lowest-scoring panel-source-assignment example with a score below 0.8
# (this task is scored with the exact-accuracy column rather than BLEU-1).
show_task_errors_html(df, "panel_source_assignment", threshold=0.8, n_examples=1)

In [None]:
# Show the single lowest-scoring data-sources example with a score below 0.8
# (this task is scored with the exact-accuracy column rather than BLEU-1).
show_task_errors_html(df, "extract_data_sources", threshold=0.8, n_examples=1)