# Benchmark of SODA-curation

This notebook will show the results of the benchmark of SODA-curation.

## 1. Preliminary functions and imports

In [None]:
import pandas as pd
import ast
import difflib
from IPython.display import display, HTML
import re
from bs4 import BeautifulSoup
import html
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Download the tokenizer data needed by nltk.word_tokenize if not already present.
# Recent NLTK releases load the "punkt_tab" resource instead of the pickled
# "punkt" models, so both are checked and only the missing ones are fetched.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource)



In [None]:
def _panel_labels_text(value):
    """Return the panel labels in *value* joined into one string, or None.

    Strings are first parsed as Python literals (e.g. "['A', 'B']"); a string
    that is not a valid literal is returned unchanged. Values that cannot be
    joined (NaN, numbers, lists with non-string items) yield None.
    """
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: compare the raw string as-is.
            return value
    try:
        return "".join(value)
    except TypeError:
        return None


def score_panel_seq(task, actual, expected, score):
    """
    Adjust the score of a 'panel_sequence' row for the single-panel special case.

    A figure with the single panel "A" and a figure with no panel labels at all
    are treated as equivalent, so such a pair gets a score of 1 regardless of
    which side is empty.

    Parameters:
    - task: Task name of the row
    - actual: Predicted panel labels (list or its string representation)
    - expected: Reference panel labels (list or its string representation)
    - score: Existing score of the row

    Returns 1 for the special case above; otherwise the unchanged score
    (always unchanged for other tasks and for values that cannot be compared).
    """
    if task != "panel_sequence":
        return score

    # One side is exactly "A" and the other is empty, in either direction.
    if {_panel_labels_text(expected), _panel_labels_text(actual)} == {"A", ""}:
        return 1

    return score

In [None]:
def normalize_text(text, task):
    """
    Normalize text by removing HTML tags and converting entities.

    Parameters:
    - text: Raw cell value (any type; converted with str() for text tasks)
    - task: Task name of the row

    Returns the value unchanged for the list-based tasks ('panel_sequence',
    'panel_source_assignment'). For every other task returns a plain string with
    entities decoded, tags stripped and whitespace collapsed; missing values
    (None or NaN) become "".
    """
    if task in ('panel_sequence', 'panel_source_assignment'):
        return text

    # pandas represents empty CSV cells as float NaN; without this check they
    # would be turned into the literal text "nan". NaN is the only float that
    # is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Convert HTML entities to their corresponding characters
    text = html.unescape(str(text))

    # Remove HTML tags using BeautifulSoup
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Standardize whitespace
    return re.sub(r'\s+', ' ', text).strip()


In [None]:
def calculate_bleu_score(reference, candidate):
    """
    Return the smoothed sentence-level BLEU score of candidate against reference.

    Both texts are lower-cased and tokenized with nltk.word_tokenize; 1- to
    4-gram precisions are weighted equally. Returns 0.0 when either text is
    empty/missing or when the BLEU computation itself raises.
    """
    if not (reference and candidate):
        return 0.0

    # Tokenize reference first, then candidate
    ref_tokens, cand_tokens = (
        nltk.word_tokenize(text.lower()) for text in (reference, candidate)
    )

    # Smoothing keeps the score non-zero when some n-gram order has no match
    smoothing = SmoothingFunction().method1

    try:
        return sentence_bleu(
            [ref_tokens],
            cand_tokens,
            weights=(0.25, 0.25, 0.25, 0.25),
            smoothing_function=smoothing,
        )
    except Exception:
        return 0.0


In [None]:
def recalculate_text_score(row):
    """
    Return the score to store for one benchmark row.

    Rows of text-based tasks are re-scored with BLEU (expected text as the
    reference, actual text as the candidate); rows of any other task keep
    their existing 'score' value.
    """
    bleu_scored_tasks = (
        'locate_figure_captions',
        'extract_data_availability',
        'figure_title',
        'figure_caption',
    )

    # Non-text tasks: nothing to recalculate
    if row['task'] not in bleu_scored_tasks:
        return row['score']

    return calculate_bleu_score(row['expected'], row['actual'])


In [None]:
# Function to plot enhanced charts for each task
def plot_enhanced_task_chart(df, task_name):
    """
    Plot the score distribution of one task per model/parameter combination.

    Draws a boxplot with the individual scores overlaid, and annotates the mean
    (dashed red line plus label) and the median (green label) of every group.

    Parameters:
    - df: DataFrame with 'task', 'model', 'temperature', 'top_p' and 'score' columns
    - task_name: Task to analyze

    Prints a message and returns without plotting when the task has no rows.
    """
    # Filter data for the specific task
    task_data = df[df["task"] == task_name].copy()

    if task_data.empty:
        print(f"No data found for task: {task_name}")
        return

    # Create parameter group column
    task_data['param_group'] = task_data.apply(
        lambda row: f"{row['model']}\ntemp={row['temperature']}\ntop_p={row['top_p']}",
        axis=1
    )

    # Fix the category order once so the x position used for the annotations
    # below is guaranteed to match the position of the corresponding box.
    group_order = list(task_data['param_group'].unique())

    # Create figure with appropriate size
    plt.figure(figsize=(12, 8))

    # Create boxplot
    sns.boxplot(data=task_data, x='param_group', y='score',
                order=group_order, color='lightblue')

    # Add individual data points using swarmplot
    sns.swarmplot(data=task_data, x='param_group', y='score',
                  order=group_order, color='black', alpha=0.5, size=4)

    # Calculate and annotate mean and median for each group
    stats = task_data.groupby('param_group')['score'].agg(['mean', 'median'])
    for i, group in enumerate(group_order):
        mean_score = stats.loc[group, 'mean']
        median_score = stats.loc[group, 'median']

        # Annotate mean with a dashed red line and a label just above it
        plt.hlines(y=mean_score, xmin=i-0.4, xmax=i+0.4, colors='red', linestyles='dashed', linewidth=2)
        plt.text(i, mean_score + 0.01, f'Mean: {mean_score:.2f}', ha='center', va='bottom', color='red')

        # Annotate median with a green label just below it
        plt.text(i, median_score - 0.03, f'Median: {median_score:.2f}', ha='center', va='top', color='green')

    # Customize plot
    plt.title(f'Score Distribution for {task_name}', fontsize=15)
    plt.xlabel('Model Parameters', fontsize=12)
    plt.ylabel('Score', fontsize=12)
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3, axis='y')

    # Show plot
    plt.tight_layout()
    plt.show()


In [None]:
def plot_models_by_msid(df, task_name, models=None, temperatures=None):
    """
    Plot multiple models' results by paper MSID.

    Draws one box per model/temperature combination for every paper, overlays
    the individual scores, annotates each paper with its overall mean and
    sample count, and finally displays a summary table per model/temperature.

    Parameters:
    - df: DataFrame containing the benchmark data
    - task_name: Task to analyze
    - models: List of models to include (default: all models in the data)
    - temperatures: List of temperatures to include (default: all temperatures)

    Prints a message and returns without plotting when no rows match.
    """
    # Set default values if none provided
    if models is None:
        models = df['model'].unique()
    if temperatures is None:
        temperatures = df['temperature'].unique()

    # Filter data for the specified task, models, and temperatures
    selected = (
        (df["task"] == task_name)
        & df["model"].isin(models)
        & df["temperature"].isin(temperatures)
    )
    filtered_data = df[selected].copy()

    if filtered_data.empty:
        print(f"No data found for task: {task_name} with the specified models and temperatures")
        return

    # Create a model+temperature column for better visualization
    filtered_data['model_temp'] = filtered_data.apply(
        lambda row: f"{row['model']} (t={row['temperature']})", axis=1
    )

    # Fix the category orders once: both plots and the annotation loop below
    # then agree on which x position and colour belongs to which group.
    msids = list(filtered_data['msid'].dropna().unique())
    model_temps = list(filtered_data['model_temp'].unique())

    # Create figure with appropriate size
    plt.figure(figsize=(16, 10))

    # Create boxplot grouped by msid, with different colors for each model
    ax = sns.boxplot(
        data=filtered_data,
        x='msid',
        y='score',
        hue='model_temp',
        order=msids,
        hue_order=model_temps,
        palette='Set2'
    )

    # Add individual data points (swarmplot would be too cluttered, using stripplot instead)
    sns.stripplot(
        data=filtered_data,
        x='msid',
        y='score',
        hue='model_temp',
        order=msids,
        hue_order=model_temps,
        dodge=True,
        alpha=0.5,
        size=4,
        palette='dark:black'
    )

    # Annotate every paper with its overall mean and its number of samples
    for i, msid in enumerate(msids):
        paper_scores = filtered_data.loc[filtered_data['msid'] == msid, 'score']
        mean_score = paper_scores.mean()

        # Overall mean as a dashed black line with a label just above it
        plt.hlines(y=mean_score, xmin=i-0.5, xmax=i+0.5,
                   colors='black', linestyles='dashed', linewidth=2)
        plt.text(i, mean_score + 0.02, f'Mean: {mean_score:.2f}',
                 ha='center', va='bottom', color='black', fontweight='bold')

        # Sample count just below the zero line
        plt.text(i, -0.05, f'n={len(paper_scores)}', ha='center', va='top', color='blue')

    # Customize plot
    plt.title(f'Model Comparison by Paper MSID for {task_name}', fontsize=15)
    plt.xlabel('Paper MSID', fontsize=12)
    plt.ylabel('Score', fontsize=12)
    plt.xticks(rotation=90)
    plt.grid(True, alpha=0.3, axis='y')

    # Both plots register the same hue labels, which would list every
    # model/temperature twice; keep only the first handle for each label.
    handles, labels = ax.get_legend_handles_labels()
    first_handle_by_label = {}
    for handle, label in zip(handles, labels):
        first_handle_by_label.setdefault(label, handle)
    ax.legend(
        list(first_handle_by_label.values()),
        list(first_handle_by_label.keys()),
        title='Model (temperature)',
        bbox_to_anchor=(1.05, 1),
        loc='upper left'
    )

    # Show plot
    plt.tight_layout()
    plt.show()

    # Print summary statistics for reference
    print("\nSummary Statistics by Model for", task_name)
    summary = filtered_data.groupby('model_temp')['score'].agg(['mean', 'median', 'std', 'count']).round(3)
    display(summary)


# Define tasks
tasks = ['locate_figure_captions', 'extract_data_availability', 'figure_title',
         'figure_caption', 'panel_sequence', 'panel_source_assignment']

# Example usage - plot all models for each task.
# This cell sits with the function definitions, before the data is loaded in
# section 2, so the loop only runs when `df` already exists (e.g. when the cell
# is re-executed after loading). Running it unguarded raises NameError.
if 'df' in globals():
    for task in tasks:
        plot_models_by_msid(df, task)
else:
    print("Skipping example plots: `df` is not loaded yet (see section 2).")

# Or you can specify which models and temperatures to include
# models_to_compare = ['gpt-4o', 'claude-3-opus']
# temperatures_to_compare = [0.0, 0.5]
# plot_models_by_msid(df, 'figure_caption', models=models_to_compare, temperatures=temperatures_to_compare)

## 2. Load the data

In [None]:
# NOTE(review): `metrics_csv_path` is not defined in the visible part of this
# notebook; it has to be set before this cell runs.
df = pd.read_csv(metrics_csv_path)

# Drop the excluded manuscript. .copy() makes the result an independent frame,
# so the column assignments below do not raise pandas' SettingWithCopyWarning.
df = df[~df.msid.isin(["EMBOR-2024-59101V1-T"])].copy()

# Strip HTML and normalise whitespace in the text columns
# (list-based tasks are passed through untouched by normalize_text)
df["actual"] = df.apply(lambda row: normalize_text(row["actual"], row["task"]), axis=1)
df["expected"] = df.apply(lambda row: normalize_text(row["expected"], row["task"]), axis=1)

# Treat "single panel A" and "no panels" as equivalent for panel_sequence rows
df["score"] = df.apply(
    lambda row: score_panel_seq(
        row["task"], row["actual"], row["expected"], row["score"]
    ),
    axis=1
)

# Apply the score recalculation (BLEU for the text-based tasks)
df["score"] = df.apply(recalculate_text_score, axis=1)

## 3. Benchmark results per model and parameters

In [None]:
# Render one score-distribution chart per benchmark task
tasks = [
    'locate_figure_captions',
    'extract_data_availability',
    'figure_title',
    'figure_caption',
    'panel_sequence',
    'panel_source_assignment',
]

for task in tasks:
    plot_enhanced_task_chart(df, task)

## 4. Benchmark results per paper, model, and parameters

In [None]:
# Restrict the per-paper comparison to a subset of models and temperatures
models_to_compare = ['gpt-4o', 'gpt-4o-mini']
temperatures_to_compare = [0.5, 1.0]
for task in tasks:
    # Pass the loop variable: the literal string 'task' matches no rows, so
    # every iteration only reported "No data found".
    plot_models_by_msid(df, task, models=models_to_compare, temperatures=temperatures_to_compare)

## 5. Checking explicit errors

In [None]:
def create_html_text_diff(expected, actual):
    """
    Create HTML with color-coded text differences.
    - Regular text: identical text
    - Red text: text in expected but not in actual (deletions)
    - Green text: text in actual but not in expected (insertions)

    Both inputs are converted with str() (None becomes ""), compared character
    by character, and HTML-escaped so that characters such as '<' or '&' in the
    compared texts are shown literally instead of being parsed as markup.

    Returns the HTML fragment as a single string.
    """
    # Convert to string if needed
    expected = "" if expected is None else str(expected)
    actual = "" if actual is None else str(actual)

    # Get diff operations
    matcher = difflib.SequenceMatcher(None, expected, actual)

    # Named `parts` rather than `html` so the html module stays reachable here
    parts = ['<div style="font-family: monospace; white-space: pre-wrap;">']

    for op, i1, i2, j1, j2 in matcher.get_opcodes():
        expected_chunk = html.escape(expected[i1:i2])
        actual_chunk = html.escape(actual[j1:j2])

        if op == 'equal':
            parts.append(f'<span>{expected_chunk}</span>')
            continue
        # A 'replace' is rendered as a deletion followed by an insertion
        if op in ('delete', 'replace'):
            parts.append(f'<span style="color: red;">{expected_chunk}</span>')
        if op in ('insert', 'replace'):
            parts.append(f'<span style="color: green;">{actual_chunk}</span>')

    parts.append('</div>')
    return ''.join(parts)

In [None]:
def _render_list_items(items, other):
    """Return one <span> per element of items: green if it is also in other, else red."""
    spans = []
    for item in items:
        color = 'green' if item in other else 'red'
        # Escape the text so items containing '<' or '&' are shown literally
        spans.append(
            f'<span style="color: {color}; margin-right: 5px; padding: 2px;">'
            f'{html.escape(str(item))}</span>'
        )
    return ''.join(spans)


def create_html_list_diff(expected_list, actual_list):
    """
    Create HTML with color-coded list differences.
    - Green text: elements in both lists
    - Red text: elements in only one list

    Parameters:
    - expected_list: Reference elements
    - actual_list: Predicted elements

    Returns the HTML fragment as a single string, showing the expected list
    first and the actual list second.
    """
    return ''.join([
        '<div style="font-family: monospace;">',
        '<div><strong>Expected list:</strong></div><div>',
        _render_list_items(expected_list, actual_list),
        '</div>',
        '<div><strong>Actual list:</strong></div><div>',
        _render_list_items(actual_list, expected_list),
        '</div>',
        '</div>',
    ])

In [None]:
def visualize_errors(df, task, score_threshold=0.8, n=5, seed=None):
    """
    Visualize errors for a specific task.

    Samples rows of the task whose score is strictly below the threshold and
    prints them; text tasks get a character-level diff, list tasks get a
    color-coded list comparison.

    Parameters:
    - df: DataFrame containing the benchmark data
    - task: Task name to analyze
    - score_threshold: Scores below this value count as errors (default: 0.8)
    - n: Number of samples to display (default: 5)
    - seed: Random seed for reproducibility (default: None)
    """
    # Filter data for the specific task
    task_data = df[df["task"] == task].copy()

    # Filter for errors (below score threshold)
    errors = task_data[task_data["score"] < score_threshold]

    if errors.empty:
        print(f"No errors found for task '{task}' with score threshold {score_threshold}")
        return

    # Sample n random errors (or less if fewer are available).
    # The seed is handed to pandas directly: DataFrame.sample draws from its
    # own random state, so seeding Python's `random` module has no effect on it.
    n = min(n, len(errors))
    error_samples = errors.sample(n, random_state=seed)

    print(f"Displaying {n} error samples for task '{task}' (score < {score_threshold})")
    print(f"Total errors: {len(errors)} out of {len(task_data)} samples")
    print("-" * 80)

    # Text-based tasks
    text_tasks = ['locate_figure_captions', 'extract_data_availability', 'figure_title', 'figure_caption']
    # List-based tasks
    list_tasks = ['panel_sequence', 'panel_source_assignment']

    # Display errors based on task type
    if task in text_tasks:
        for i, (_, row) in enumerate(error_samples.iterrows()):
            print(f"Sample {i+1} (Paper: {row['msid']}, Score: {row['score']:.2f})")

            # Display input, actual and expected text
            print("\nInput text:")
            print(row['input'])
            print("\nActual text:")
            print(row['actual'])
            print("\nExpected text:")
            print(row['expected'])

            # Display color-coded difference
            print("\nDifference:")
            display(HTML(create_html_text_diff(row['expected'], row['actual'])))
            print("-" * 80)

    elif task in list_tasks:
        for i, (_, row) in enumerate(error_samples.iterrows()):
            print(f"Sample {i+1} (Paper: {row['msid']}, Score: {row['score']:.2f})")

            try:
                # Convert string representations of lists to actual lists.
                # For panel_sequence, the format is usually something like "['A', 'B', 'C']"
                # For panel_source_assignment, it might be more complex
                # NOTE: ast.literal_eval replaces the former eval(): these cells
                # come from a CSV with model output and must not be executed as
                # code. Values that are not plain Python literals now fall
                # through to the raw printout below.
                actual, expected = row['actual'], row['expected']
                actual_list = ast.literal_eval(actual) if isinstance(actual, str) else actual
                expected_list = ast.literal_eval(expected) if isinstance(expected, str) else expected

                # Display color-coded lists
                display(HTML(create_html_list_diff(expected_list, actual_list)))

            except Exception as e:
                print(f"Error parsing lists: {e}")
                print("Input: ", row['input'])
                print("Raw actual:", row['actual'])
                print("Raw expected:", row['expected'])

            print("-" * 80)
    else:
        print(f"Task '{task}' not recognized or not supported")

