In [None]:
# Notebook configuration.
# results_file: JSON file with the evaluation results that gets loaded below.
results_file = "data/eval/results.json"
# score: name of the results column used as the metric in tables and plots.
score = "bleu1"
# ignore: per-column values to drop before analysis (None keeps everything).
ignore = dict(
    task=None,
    strategy=["regex"],
    msid=None,
    figure_label=None,
    run=None,
)

In [None]:
import pandas as pd

df = pd.read_json(results_file)

# Drop every row whose value in a configured column is on that column's ignore list
for column, excluded in ignore.items():
    if excluded is None:
        continue
    df = df.loc[~df[column].isin(excluded)]

# `run` is stored as a number but identifies a run, so treat it as a category
df = df.assign(run=df["run"].astype("category"))

# Number of distinct values per dimension (n_strategies sizes the facet plots)
n_tasks = df["task"].nunique()
n_strategies = df["strategy"].nunique()
n_msids = df["msid"].nunique()
n_figures = df["figure_label"].nunique()
n_runs = df["run"].nunique()

# Sanity check: the configured score must lie within [0, 1]
assert df[score].min() >= 0, "Score should be non-negative"
assert df[score].max() <= 1, "Score should be at most 1"

# One frame per task
legends = df.loc[df["task"].eq("extract_figure_legends")]
labels = df.loc[df["task"].eq("extract_figures")]
titles = df.loc[df["task"].eq("extract_figure_title")]
captions = df.loc[df["task"].eq("extract_figure_caption")]

## Top-line stats

In [None]:
# Descriptive statistics of the configured score column for every (task, strategy) pair
df.groupby(["task", "strategy"])[[score]].describe()

## Graphs

In [None]:
import plotly.express as px

def add_threshold(df, fig, axis="y"):
    """Draw the score threshold on ``fig`` as a dashed green reference line.

    The threshold is taken from the first row of the ``<score>_threshold``
    column of ``df`` (``score`` is the notebook-level metric name) and rounded
    to two decimals.

    Args:
        df: Results frame containing the ``<score>_threshold`` column.
        fig: Plotly figure to annotate; it is modified in place.
        axis: ``"y"`` adds a horizontal line at the threshold, any other
            value adds a vertical line.
    """
    threshold = df[f"{score}_threshold"].iloc[0].round(2)
    line_style = {
        "line_dash": "dash",
        "line_color": "green",
        "opacity": 0.5,
        "annotation_text": f"threshold: {threshold}",
        "annotation_position": "top left",
    }
    if axis != "y":
        fig.add_vline(x=threshold, **line_style)
        return
    fig.add_hline(y=threshold, **line_style)

def hist_score(df, interval_width=0.01, **kwargs):
    """Build a histogram of the score, faceted by task (columns) and strategy (rows).

    Args:
        df: Results frame with ``task``, ``strategy`` and the score column.
        interval_width: Width of each histogram bin; the x range is padded by
            one bin on each side of [0, 1].
        **kwargs: Extra keyword arguments forwarded to ``px.histogram``.

    Returns:
        The Plotly figure, with the score threshold drawn as a vertical line.
    """
    lower = 0 - interval_width
    upper = 1 + interval_width
    # Largest per-(task, strategy) row count, plus one, as the top of the y axis
    tallest = df.groupby(["task", "strategy"]).count().max().max() + 1

    fig = px.histogram(
        df,
        x=score,
        range_x=[lower, upper],
        range_y=[-1, tallest],
        facet_col="task",
        facet_row="strategy",
        width=900,
        height=100 + 250 * n_strategies,
        **kwargs,
    )
    # Force identical bins on every trace
    fig.update_traces(xbins={"start": lower, "end": upper, "size": interval_width})
    fig.update_layout(bargap=0.2)
    add_threshold(df, fig, axis="x")
    return fig

def scatter_score(df, **kwargs):
    """Scatter the score per manuscript, coloured by run and faceted by task/strategy.

    Args:
        df: Results frame with ``msid``, ``task``, ``strategy``, ``run`` and
            the score column.
        **kwargs: Extra keyword arguments forwarded to ``px.scatter``.

    Returns:
        The Plotly figure, with the score threshold drawn as a horizontal line.
    """
    plot_options = {
        "x": "msid",
        "y": score,
        "facet_col": "task",
        "facet_row": "strategy",
        "color": "run",
        "height": 100 + 250 * n_strategies,
        "width": 900,
    }
    fig = px.scatter(df, **plot_options, **kwargs)
    fig.update_traces(marker_size=10)
    # Place the points of different traces side by side within each x position
    fig.update_layout(scattermode="group", scattergap=0.9)
    add_threshold(df, fig)
    return fig

def plot_std(df, groupby, **kwargs):
    """Scatter the standard deviation of the score within each group.

    Args:
        df: Results frame containing the score column and the ``groupby`` columns.
        groupby: Column names to group by before taking the standard deviation.
            The plot uses ``msid``, ``task`` and ``strategy`` from the grouped
            result.
        **kwargs: Extra keyword arguments forwarded to ``px.scatter``.

    Returns:
        The Plotly figure with ``<score>_std`` on the y axis.
    """
    std_column = f"{score}_std"
    per_group_std = df.groupby(groupby)[score].std()
    df_std = per_group_std.reset_index().sort_values(groupby)
    df_std = df_std.rename(columns={score: std_column})

    return px.scatter(
        df_std,
        x="msid",
        y=std_column,
        facet_col="task",
        facet_row="strategy",
        height=100 + 250 * n_strategies,
        width=900,
        **kwargs,
    )

In [None]:
# Histogram, per-manuscript scatter and per-manuscript spread for the
# legends and labels frames; empty frames are skipped.
for task_df in (legends, labels):
    if not task_df.empty:
        hist_score(task_df).show()
        scatter_score(task_df).show()
        plot_std(task_df, groupby=["task", "strategy", "msid"]).show()

In [None]:
# Same plots for the titles and captions frames, additionally distinguishing
# figure labels by marker symbol (and colour in the spread plot).
for task_df in (titles, captions):
    if not task_df.empty:
        hist_score(task_df).show()
        scatter_score(task_df, symbol="figure_label").show()
        std_fig = plot_std(
            task_df,
            groupby=["task", "strategy", "msid", "figure_label"],
            color="figure_label",
            symbol="figure_label",
        )
        std_fig.show()

## Run details

In [None]:
import difflib
import html
from IPython.display import display, HTML

def inline_diff(a, b):
    """Return an HTML string showing how ``a`` was edited into ``b``.

    Unchanged text is emitted as is, text only in ``a`` is wrapped in
    ``<del>``, text only in ``b`` is wrapped in ``<ins>``, and a replacement
    is rendered as a deletion followed by an insertion. All text is
    HTML-escaped.

    Args:
        a: Original string.
        b: Modified string.

    Returns:
        The concatenated HTML fragments, in opcode order.
    """
    def _render(op, a_lo, a_hi, b_lo, b_hi):
        removed = html.escape(a[a_lo:a_hi])
        added = html.escape(b[b_lo:b_hi])
        if op == 'equal':
            return removed
        if op == 'delete':
            return f'<del>{removed}</del>'
        if op == 'insert':
            return f'<ins>{added}</ins>'
        if op == 'replace':
            # a replacement is shown as delete + insert
            return f'<del>{removed}</del><ins>{added}</ins>'
        assert False, "Unknown tag %r"%op

    fragments = []
    for opcode in difflib.SequenceMatcher(None, a, b).get_opcodes():
        fragments.append(_render(*opcode))
    return ''.join(fragments)

def display_diff_(row):
    """Render an expected-vs-actual comparison for one evaluation result.

    Displays, as HTML in the notebook output: an inline diff of the
    ``expected`` and ``actual`` values, a two-column table with both values
    side by side, and the ``input`` of that result.

    Args:
        row: DataFrame with ``expected``, ``actual`` and ``input`` columns
            (e.g. the result of ``DataFrame.sample()``). Only its first row
            is shown.

    Returns:
        None. The HTML is sent to the notebook output via ``display``.
    """
    # CSS only supports /* ... */ comments. A `//` "comment" is parsed as part
    # of the next selector and invalidates the rule that follows it.
    diff_css = """
    <style>
    ins {background-color: #588a41;}  /* green */
    del {background-color: #ffaaaa;}  /* light red */
    repl {background-color: #bb99ff;} /* light purple */
    table tr > * { width: 50%; }
    table tr > td { vertical-align: top; text-align: left; }
    </style>
    """

    def _section(title, content):
        return f"<h4>{title}</h4><p>{content}</p>"

    # Pull the scalar values out of the first row. Interpolating a Series
    # directly would render its repr (index, dtype, truncation) instead of the
    # text. Values are coerced to str so non-string payloads do not crash
    # html.escape or the diff.
    expected = str(row["expected"].iloc[0])
    actual = str(row["actual"].iloc[0])
    input_value = str(row["input"].iloc[0])

    comparison = f"""
        <table>
            <thead>
                <tr>
                    <th>Expected</th>
                    <th>Actual</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>{html.escape(expected)}</td>
                    <td>{html.escape(actual)}</td>
                </tr>
            </tbody>
        </table>
    """
    diff = _section("Diff", inline_diff(expected, actual) + comparison)
    # Escape the input so markup characters in it are shown literally
    input_text = _section("Input", html.escape(input_value))

    display(HTML(diff_css + diff + input_text))


In [None]:
# Distinct task names present in the filtered results
df["task"].unique()

In [None]:
# Rows with a BLEU-1 score below 0.95, split by task, used as "bad" examples below
low_bleu1 = df["bleu1"] < 0.95
df_bad_figure_legends = df[(df["task"] == "extract_figure_legends") & low_bleu1]
df_bad_labels = df[(df["task"] == "extract_figures") & low_bleu1]
df_bad_titles = df[(df["task"] == "extract_figure_title") & low_bleu1]
df_bad_caption = df[(df["task"] == "extract_figure_caption") & low_bleu1]


## Examples of bad detection of figure section

In [None]:
# Show one randomly sampled low-scoring result, if there is any
if len(df_bad_figure_legends) > 0:
    display_diff_(df_bad_figure_legends.sample())
else:
    print("No bad figure legends")

## Examples of bad detection of figure label

In [None]:
# Show one randomly sampled low-scoring result, if there is any
if len(df_bad_labels) > 0:
    display_diff_(df_bad_labels.sample())
else:
    print("No bad figure labels")

## Examples of bad figure titles

In [None]:
# Show one randomly sampled low-scoring result, if there is any
if len(df_bad_titles) > 0:
    display_diff_(df_bad_titles.sample())
else:
    print("No bad figure titles")

## Examples of bad figure captions

In [None]:
# Show one randomly sampled low-scoring result, if there is any
if len(df_bad_caption) > 0:
    display_diff_(df_bad_caption.sample())
else:
    print("No bad figure captions")

# Individual visualizations that give a better intuition of the results

In [None]:
# Column names available in the results frame
df.columns

In [None]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats

# Classify each strategy into a model family from its name
# (case-insensitive substring match; "gpt" is checked before "claude").
df['model_type'] = df['strategy'].apply(
    lambda x: 'GPT' if 'gpt' in x.lower() else 'Claude' if 'claude' in x.lower() else 'Other'
)

# Consistent colours for the two model families
colors = {'GPT': '#2E91E5', 'Claude': '#E15F99'}  # Blue for GPT, Pink for Claude

# Uniform bins of width 0.025, i.e. 40 bins across [0, 1]
bin_size = 0.025
# Explicit bin edges from 0 up to (but excluding) 1.1. Not used by the plots
# below, which pass start/end/size to plotly directly.
bins = list(np.arange(0, 1.1, bin_size))

# Score distribution per task, one subplot row per task
tasks = df['task'].unique()
fig_dist = make_subplots(
    rows=len(tasks),
    cols=1,
    subplot_titles=[task.replace('_', ' ').title() for task in tasks],
    # make_subplots rejects a spacing above 1 / (rows - 1); cap it so the
    # figure can still be built when there are many rows.
    vertical_spacing=min(0.1, 0.5 / max(len(tasks) - 1, 1)),
)

for i, task in enumerate(tasks, 1):
    task_data = df[df['model_type'].isin(['GPT', 'Claude']) & (df['task'] == task)]

    for model in ['GPT', 'Claude']:
        model_data = task_data[task_data['model_type'] == model]['bleu1']

        # Same bins for every trace so the overlaid histograms are comparable.
        # The bins end past 1.0 so that perfect scores fall inside a bin.
        fig_dist.add_trace(
            go.Histogram(
                x=model_data,
                name=model,
                histfunc='count',
                xbins=dict(
                    start=0,
                    end=1.1,
                    size=bin_size
                ),
                hovertemplate="Score Range: %{x}<br>Count: %{y}<extra></extra>",
                cumulative_enabled=False,
                showlegend=i==1,  # Only show legend for first subplot
                opacity=0.7,
                marker_color=colors[model],
                marker_line_width=1,
                marker_line_color='white',
                histnorm='probability'
            ),
            row=i,
            col=1
        )

fig_dist.update_layout(
    title='Score Distribution by Task: GPT vs Claude',
    width=900,
    height=300*len(tasks),
    barmode='overlay',
    bargap=0.1
)

# Without a row/col selector these updates apply to the axes of every subplot
fig_dist.update_yaxes(title_text="Probability")
fig_dist.update_xaxes(
    title_text="BLEU-1 Score",
    range=[0, 1.02],
    dtick=0.1,  # Add gridlines every 0.1
)

fig_dist.show()

# Mean / std / count of the BLEU-1 score per task and model family.
# Restricted to GPT and Claude because `colors` only covers those two.
model_stats = (
    df[df['model_type'].isin(['GPT', 'Claude'])]
    .groupby(['task', 'model_type'])['bleu1']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)

# Bar plot with error bars - using the same colors
fig_bar = px.bar(
    model_stats,
    x='task',
    y='mean',
    error_y='std',
    color='model_type',
    barmode='group',
    title='Average Performance by Task: GPT vs Claude',
    labels={
        'task': 'Task',
        'mean': 'Average BLEU-1 Score',
        'model_type': 'Model Type'
    },
    color_discrete_map=colors
)

fig_bar.update_layout(
    yaxis_range=[0, 1],
    width=900,
    height=500,
    xaxis_tickangle=-45
)
fig_bar.show()

# Print summary statistics and t-tests
print("\nSummary Statistics by Task:")
print(model_stats.round(3))

print("\nt-test results by task:")
for task in tasks:
    task_data = df[df['task'] == task]
    gpt_scores = task_data[task_data['model_type'] == 'GPT']['bleu1']
    claude_scores = task_data[task_data['model_type'] == 'Claude']['bleu1']
    print(f"\nTask: {task}")
    # A two-sample test needs data on both sides
    if gpt_scores.empty or claude_scores.empty:
        print("Skipped: no scores for GPT or Claude")
        continue
    t_stat, p_value = stats.ttest_ind(gpt_scores, claude_scores)
    print(f"t-statistic: {t_stat:.3f}")
    print(f"p-value: {p_value:.3e}")

In [None]:
def _strategies_matching(keyword):
    """Distinct strategy names containing ``keyword`` (case-insensitive)."""
    matches = df['strategy'].str.contains(keyword, case=False)
    return df[matches]['strategy'].unique()


def _task_strategy_stats(strategies):
    """Mean, std and count of bleu1 per (task, strategy) for the given strategies."""
    subset = df[df['strategy'].isin(strategies)]
    summary = subset.groupby(['task', 'strategy'])['bleu1'].agg(['mean', 'std', 'count'])
    return summary.reset_index()


def _variation_bar(stats_df, title, legend_title, palette):
    """Grouped bar chart of the mean score per task, one bar per strategy."""
    fig = px.bar(
        stats_df,
        x='task',
        y='mean',
        error_y='std',
        color='strategy',
        barmode='group',
        title=title,
        labels={
            'task': 'Task',
            'mean': 'Average BLEU-1 Score',
            'strategy': 'Strategy'
        },
        color_discrete_sequence=palette,
    )
    fig.update_layout(
        yaxis_range=[0, 1.2],
        width=900,
        height=500,
        xaxis_tickangle=-45,
        showlegend=True,
        legend_title_text=legend_title,
    )
    return fig


# All strategy names belonging to each model family
gpt_strategies = _strategies_matching('gpt')
claude_strategies = _strategies_matching('claude')

# Per-task statistics for every variation of each family
gpt_stats = _task_strategy_stats(gpt_strategies)
claude_stats = _task_strategy_stats(claude_strategies)

# One chart per family, each with its own colour palette
fig_gpt = _variation_bar(
    gpt_stats,
    'Performance by Task: GPT Variations',
    'GPT Variations',
    px.colors.qualitative.Set1,
)
fig_gpt.show()

fig_claude = _variation_bar(
    claude_stats,
    'Performance by Task: Claude Variations',
    'Claude Variations',
    px.colors.qualitative.Set2,
)
fig_claude.show()

# Numeric summary of the same statistics
print("\nSummary Statistics for GPT Variations:")
print(gpt_stats.round(3))
print("\nSummary Statistics for Claude Variations:")
print(claude_stats.round(3))

In [None]:
# Imported in this cell because the subplot/histogram code below needs them
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Score distributions per manuscript, one subplot row per manuscript
manuscripts = df['msid'].unique()
fig_dist = make_subplots(
    rows=len(manuscripts),
    cols=1,
    subplot_titles=[f"Manuscript {msid}" for msid in manuscripts],
    # make_subplots rejects a spacing above 1 / (rows - 1), which a fixed 0.1
    # exceeds once there are more than 11 manuscripts; cap it accordingly.
    vertical_spacing=min(0.1, 0.5 / max(len(manuscripts) - 1, 1)),
)

# Consistent colours and bins
colors = {'GPT': '#2E91E5', 'Claude': '#E15F99'}
bin_size = 0.05
# Explicit bin edges from 0 up to (but excluding) 1.05. Not used by the plots
# below, which pass start/end/size to plotly directly.
bins = list(np.arange(0, 1.05, bin_size))

for i, msid in enumerate(manuscripts, 1):
    manuscript_data = df[df['model_type'].isin(['GPT', 'Claude']) & (df['msid'] == msid)]

    for model in ['GPT', 'Claude']:
        model_data = manuscript_data[manuscript_data['model_type'] == model]['bleu1']

        fig_dist.add_trace(
            go.Histogram(
                x=model_data,
                name=model,
                histfunc='count',
                xbins=dict(
                    start=0,
                    # End one bin past 1.0 so a perfect score falls inside the
                    # last bin instead of on the outer edge of the binning
                    # (matches the per-task histogram, whose bins also end past 1.0).
                    end=1 + bin_size,
                    size=bin_size
                ),
                hovertemplate="Score Range: %{x}<br>Count: %{y}<extra></extra>",
                cumulative_enabled=False,
                showlegend=i==1,  # Only show legend for first subplot
                opacity=0.7,
                marker_color=colors[model],
                marker_line_width=1,
                marker_line_color='white',
                histnorm='probability'
            ),
            row=i,
            col=1
        )

fig_dist.update_layout(
    title='Score Distribution by Manuscript: GPT vs Claude',
    width=900,
    height=300*len(manuscripts),
    barmode='overlay',
    bargap=0.1
)

# Without a row/col selector these updates apply to the axes of every subplot
fig_dist.update_yaxes(title_text="Probability")
fig_dist.update_xaxes(
    title_text="BLEU-1 Score",
    range=[0, 1.1],
    dtick=0.1,
)

fig_dist.show()

# GPT variations per manuscript: mean / std / count of BLEU-1 per (msid, strategy)
gpt_manuscript_stats = (
    df[df['strategy'].isin(gpt_strategies)]
    .groupby(['msid', 'strategy'])['bleu1']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)

fig_gpt_manuscript = px.bar(
    gpt_manuscript_stats,
    x='msid',
    y='mean',
    error_y='std',
    color='strategy',
    barmode='group',
    title='Performance by Manuscript: GPT Variations',
    labels={
        'msid': 'Manuscript ID',
        'mean': 'Average BLEU-1 Score',
        'strategy': 'Strategy'
    },
    color_discrete_sequence=px.colors.qualitative.Set1
)

fig_gpt_manuscript.update_layout(
    yaxis_range=[0, 1.1],
    width=900,
    height=500,
    xaxis_tickangle=-45,
    showlegend=True,
    legend_title_text='GPT Variations'
)
fig_gpt_manuscript.show()

# Claude variations per manuscript, same statistics
claude_manuscript_stats = (
    df[df['strategy'].isin(claude_strategies)]
    .groupby(['msid', 'strategy'])['bleu1']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)

fig_claude_manuscript = px.bar(
    claude_manuscript_stats,
    x='msid',
    y='mean',
    error_y='std',
    color='strategy',
    barmode='group',
    title='Performance by Manuscript: Claude Variations',
    labels={
        'msid': 'Manuscript ID',
        'mean': 'Average BLEU-1 Score',
        'strategy': 'Strategy'
    },
    color_discrete_sequence=px.colors.qualitative.Set2
)

fig_claude_manuscript.update_layout(
    yaxis_range=[0, 1.1],
    width=900,
    height=500,
    xaxis_tickangle=-45,
    showlegend=True,
    legend_title_text='Claude Variations'
)
fig_claude_manuscript.show()

# Performance per manuscript and task, GPT vs Claude
manuscript_task_stats = (
    df[df['model_type'].isin(['GPT', 'Claude'])]
    .groupby(['msid', 'task', 'model_type'])['bleu1']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)

fig_task_manuscript = px.bar(
    manuscript_task_stats,
    x='msid',
    y='mean',
    error_y='std',
    color='model_type',
    facet_row='task',
    barmode='group',
    title='Performance by Manuscript and Task: GPT vs Claude',
    labels={
        'msid': 'Manuscript ID',
        'mean': 'Average BLEU-1 Score',
        'model_type': 'Model Type'
    },
    color_discrete_map=colors
)

fig_task_manuscript.update_layout(
    yaxis_range=[0, 1.1],
    width=900,
    height=200*len(df['task'].unique()),
    showlegend=True
)
fig_task_manuscript.show()

# Flag manuscripts whose mean score is more than one overall standard
# deviation below the overall mean. The cutoff does not depend on the
# manuscript, so it is computed once.
low_cutoff = df['bleu1'].mean() - df['bleu1'].std()

print("\nIdentifying potentially problematic manuscripts:")
for msid in manuscripts:
    manuscript_data = df[df['msid'] == msid]
    mean_score = manuscript_data['bleu1'].mean()
    std_score = manuscript_data['bleu1'].std()

    if mean_score < low_cutoff:
        print(f"\nManuscript {msid} shows lower than average performance:")
        print(f"Mean score: {mean_score:.3f}")
        print(f"Std deviation: {std_score:.3f}")

        # Show task-specific performance for this manuscript
        task_performance = manuscript_data.groupby('task')['bleu1'].agg(['mean', 'std'])
        print("\nTask-specific performance:")
        print(task_performance.round(3))