In [None]:
# Path of the JSON file holding the evaluation results; it is loaded with
# pd.read_json further down in this notebook.
results_file = "data/eval/results.json"
# Name of the score column used by every plot, assertion and filter in this
# notebook. A companion column named f"{score}_threshold" is read by
# add_threshold to draw the threshold line.
score = "bleu1"

In [None]:
import pandas as pd

# Load every evaluation record from the configured results file.
df = pd.read_json(results_file)

# Restrict to the rows of the most recent round (largest timestamp) and treat
# the numeric run index as a categorical label rather than a quantity.
latest_round = df["timestamp"].max()
df = (
    df.loc[df["timestamp"] == latest_round]
    .assign(run=lambda frame: frame["run"].astype("category"))
)

# Number of distinct values per dimension; some of these are used further
# down to size the figures.
n_tasks = df["task"].nunique()
n_strategies = df["strategy"].nunique()
n_msids = df["msid"].nunique()
n_figures = df["figure_label"].nunique()
n_runs = df["run"].nunique()

# Sanity check: the selected score must lie within [0, 1].
assert df[score].min() >= 0, "Score should be non-negative"
assert df[score].max() <= 1, "Score should be at most 1"

# One frame per evaluation task.
legends = df.loc[df["task"] == "extract_figure_legends"]
captions = df.loc[df["task"] == "extract_figure_caption"]

In [None]:
import plotly.express as px

def add_threshold(df, fig, axis="y"):
    """Mark the score threshold on ``fig`` with a dashed green reference line.

    The threshold is taken from the first row of the ``f"{score}_threshold"``
    column of ``df`` (``score`` being the notebook-level setting) and rounded
    to two decimals; the rounded value is also shown in the annotation.

    Args:
        df: DataFrame containing the threshold column.
        fig: Plotly figure to annotate via ``add_hline`` / ``add_vline``.
        axis: ``"y"`` draws a horizontal line; any other value draws a
            vertical line.
    """
    threshold = df[f"{score}_threshold"].iloc[0].round(2)
    line_style = {
        "line_dash": "dash",
        "line_color": "green",
        "opacity": 0.5,
        "annotation_text": f"threshold: {threshold}",
        "annotation_position": "top left",
    }
    if axis != "y":
        fig.add_vline(x=threshold, **line_style)
        return
    fig.add_hline(y=threshold, **line_style)

def violin(df, **kwargs):
    """Violin plot (with inner box) of the score per strategy.

    Args:
        df: DataFrame with a ``strategy`` column, the ``score`` column and the
            threshold column read by ``add_threshold``.
        **kwargs: Forwarded unchanged to ``px.violin``.

    Returns:
        The Plotly figure, with the threshold line added.
    """
    figure = px.violin(df, x="strategy", y=score, box=True, **kwargs)
    add_threshold(df, figure)
    return figure

def box(df, **kwargs):
    """Box plot of the score per strategy, with every data point drawn.

    Args:
        df: DataFrame with a ``strategy`` column, the ``score`` column and the
            threshold column read by ``add_threshold``.
        **kwargs: Forwarded unchanged to ``px.box``.

    Returns:
        The Plotly figure, with the threshold line added.
    """
    figure = px.box(df, x="strategy", y=score, points="all", **kwargs)
    add_threshold(df, figure)
    return figure

def histogram(df, interval_width=0.01, **kwargs):
    """Histogram of the score values with the threshold as a vertical line.

    Args:
        df: DataFrame with the ``score`` column and the threshold column read
            by ``add_threshold``.
        interval_width: Intended bin width; ``int(1 / interval_width)`` is
            passed to Plotly as ``nbins``.
        **kwargs: Forwarded unchanged to ``px.histogram``.

    Returns:
        The Plotly figure.
    """
    bin_count = int(1 / interval_width)
    figure = px.histogram(df, x=score, nbins=bin_count, **kwargs)
    figure.update_layout(bargap=0.2)
    add_threshold(df, figure, axis="x")
    return figure

def scatter(df, **kwargs):
    """Grouped scatter plot of the score per strategy.

    Args:
        df: DataFrame with a ``strategy`` column, the ``score`` column and the
            threshold column read by ``add_threshold``.
        **kwargs: Forwarded unchanged to ``px.scatter``.

    Returns:
        The Plotly figure, with enlarged markers, grouped scatter mode and the
        threshold line added.
    """
    figure = px.scatter(df, x="strategy", y=score, **kwargs)
    figure.update_traces(marker_size=10)
    figure.update_layout(scattermode="group", scattergap=0.9)
    add_threshold(df, figure)
    return figure

In [None]:
def get_std(df, groupby):
    """Standard deviation of the score within each group.

    Args:
        df: DataFrame containing the ``score`` column and the grouping columns.
        groupby: Column name or list of column names to group by.

    Returns:
        A DataFrame with the grouping columns restored as regular columns and
        the per-group standard deviation stored under the ``score`` name.
    """
    per_group_std = df.groupby(groupby)[score].std()
    return per_group_std.reset_index()

def plot_std(df, groupby, **kwargs):
    """Scatter plot of per-group score standard deviations.

    The deviations come from ``get_std``, are sorted by the grouping columns
    and stored in a column named ``f"{score}_std"``, which is plotted on the x
    axis with a marginal histogram.

    Args:
        df: DataFrame containing the ``score`` column and the grouping columns.
        groupby: Column name or list of column names to group by.
        **kwargs: Forwarded unchanged to ``px.scatter``.

    Returns:
        The Plotly figure.
    """
    std_column = f"{score}_std"
    spread = (
        get_std(df, groupby)
        .sort_values(groupby)
        .rename(columns={score: std_column})
    )
    return px.scatter(spread, x=std_column, marginal_x="histogram", **kwargs)

## Top-line stats

In [None]:
# Summary statistics of the selected score for every (task, strategy) pair.
(
    df
    .groupby(["task", "strategy"])[[score]]
    .describe()
)

## Graphs

### Extracting the figure legends section from manuscripts

In [None]:
# Score distribution for the legends task, one facet per strategy.
histogram(
    legends,
    facet_col="strategy",
    facet_col_wrap=2,
    width=2 * 600,
    title="Extracting the figure legends section from the full manuscript text",
)

In [None]:
# Scores for the legends task, one facet per manuscript, coloured by run.
# The facets wrap after 4 columns, so the grid needs ceil(n_msids / 4) rows
# (written as negated floor division). The previous int(1 + n_msids / 4)
# reserved an extra empty row whenever n_msids was a multiple of 4. The
# max(1, ...) keeps a positive height even if no manuscripts are present.
scatter(
    legends,
    facet_col="msid",
    facet_col_wrap=4,
    color="run",
    title="Extracting the figure legends section from the full manuscript text",
    height=max(1, -(-n_msids // 4)) * 400,
)

In [None]:
# Spread of the score between runs for the legends task: one point per
# (strategy, msid) group, faceted by strategy. The figure width grows with
# the number of strategies.
plot_std(
    legends,
    ["strategy", "msid"],
    y="msid",
    facet_col="strategy",
    width=400 + 300 * n_strategies,
    title=f"Standard deviation of {score} scores between runs of extracting figure legends from the same manuscript text",
)

### Extracting individual figure captions from figure legends sections

In [None]:
# Score distribution for the captions task, one facet per strategy.
histogram(
    captions,
    facet_col="strategy",
    title="Extracting individual figure captions from figure legends sections",
)

In [None]:
# Scores for the captions task, one facet per manuscript; the marker symbol
# encodes the figure label and the colour encodes the run.
scatter(
    captions,
    facet_col="msid",
    facet_col_wrap=3,
    symbol="figure_label",
    color="run",
    title="Extracting individual figure captions from figure legends sections",
    height=1600,
)

In [None]:
# Spread of the score between runs for the captions task: one point per
# (strategy, msid, figure_label) group, faceted by strategy, with the marker
# symbol encoding the figure label. Height grows with the number of
# manuscripts and width with the number of strategies.
plot_std(
    captions,
    ["strategy", "msid", "figure_label"],
    y="msid",
    facet_col="strategy",
    facet_col_spacing=0.1,
    symbol="figure_label",
    height=200 + 50 * n_msids,
    width=400 + 400 * n_strategies,
    title=f"Standard deviation of {score} scores between runs of extracting individual figure captions from the same figure legends section",
)

## Run details

In [None]:
import difflib
import html
from IPython.display import display, HTML

def inline_diff(a, b):
    """Render a character-level diff of ``a`` against ``b`` as an HTML fragment.

    The two strings are aligned with ``difflib.SequenceMatcher``. Text only in
    ``a`` is wrapped in ``<del>``, text only in ``b`` in ``<ins>``, and a
    replacement is emitted as a ``<del>`` immediately followed by an
    ``<ins>``. All text is HTML-escaped before being wrapped.

    Args:
        a: The reference string.
        b: The string compared against ``a``.

    Returns:
        The concatenated HTML for every opcode, in order.
    """
    def render(tag, a_start, a_end, b_start, b_end):
        removed = html.escape(a[a_start:a_end])
        added = html.escape(b[b_start:b_end])
        if tag == 'equal':
            return removed
        if tag == 'delete':
            return f'<del>{removed}</del>'
        if tag == 'insert':
            return f'<ins>{added}</ins>'
        if tag == 'replace':
            # A replacement is shown as the removed text followed by the new text.
            return f'<del>{removed}</del><ins>{added}</ins>'
        assert False, "Unknown tag %r"%tag

    opcodes = difflib.SequenceMatcher(None, a, b).get_opcodes()
    return ''.join(render(*opcode) for opcode in opcodes)

def display_diff(row):
    """Render one evaluation run as rich HTML in the notebook output.

    The output consists of a heading identifying the run, the BLEU / ROUGE
    scores, an inline diff of the expected versus actual text, a side-by-side
    table of both texts, and the input text.

    Args:
        row: A record exposing the attributes ``task``, ``strategy``,
            ``msid``, ``figure_label``, ``run``, ``bleu1`` to ``bleu4``,
            ``rouge1``, ``rouge2``, ``rougeL``, ``expected``, ``actual`` and
            ``input`` (for example a tuple from ``DataFrame.itertuples``).
    """
    # CSS only supports /* ... */ comments. A trailing "// ..." is parsed as
    # part of the next rule's selector and silently invalidates that rule,
    # which previously disabled the <del> highlighting and the column widths.
    diff_css = """
    <style>
    ins {background-color: #aaffaa;}  /* light green */
    del {background-color: #ffaaaa;}  /* light red */
    repl {background-color: #bb99ff;} /* light purple */
    table tr > * { width: 50%; }
    table tr > td { vertical-align: top; text-align: left; }
    </style>
    """

    def section(title, content):
        # `content` is inserted verbatim and must already be valid HTML.
        return f"<h4>{title}</h4><p>{content}</p>"

    # The figure label is only part of the name for per-figure tasks.
    if row.task == "extract_figure_legends":
        run_name = f"{row.task} - {row.strategy} - {row.msid} - {row.run}"
    else:
        run_name = f"{row.task} - {row.strategy} - {row.msid} - {row.figure_label} - {row.run}"
    # Escape data-derived text so characters such as "<" or "&" are shown
    # literally instead of being interpreted as markup.
    header = f"<h3>{html.escape(run_name)}</h3>"

    scores = section("Scores", f"bleu1: {row.bleu1:.2f}, bleu2: {row.bleu2:.2f}, bleu3: {row.bleu3:.2f}, bleu4: {row.bleu4:.2f}, rouge1: {row.rouge1:.2f}, rouge2: {row.rouge2:.2f}, rougeL: {row.rougeL:.2f}")

    comparison = f"""
<table>
    <thead>
        <tr>
            <th>Expected</th>
            <th>Actual</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>{html.escape(row.expected)}</td>
            <td>{html.escape(row.actual)}</td>
        </tr>
    </tbody>
</table>"""
    diff = section("Diff", inline_diff(row.expected, row.actual) + comparison)
    # The input text is escaped like `expected` and `actual`; str() keeps
    # non-string values renderable, as the previous f-string interpolation did.
    input_text = section("Input", html.escape(str(row.input)))

    display(HTML(diff_css + header + scores + diff + input_text))

def display_all(df):
    """Print how many runs are shown, then render each row with ``display_diff``.

    Args:
        df: DataFrame whose rows are iterated with ``itertuples`` and passed
            one by one to ``display_diff``.
    """
    print(f"Showing {len(df)} runs")
    for record in df.itertuples():
        display_diff(record)

def show_runs(
    df,
    tasks=None,
    strategies=None,
    msids=None,
    figure_labels=None,
    runs=None,
    score_range=None,
):
    """Filter the runs in ``df`` and display the ones that remain.

    Each filter is skipped when its argument is ``None``.

    Args:
        df: DataFrame of evaluation runs.
        tasks: Values to keep in the ``task`` column.
        strategies: Values to keep in the ``strategy`` column.
        msids: Values to keep in the ``msid`` column.
        figure_labels: Values to keep in the ``figure_label`` column.
        runs: Values to keep in the ``run`` column.
        score_range: ``(min_score, max_score)`` pair; rows are kept when
            ``min_score <= score < max_score`` for the notebook-level
            ``score`` column (the upper bound is exclusive).

    Returns:
        Whatever ``display_all`` returns for the filtered frame.
    """
    membership_filters = (
        ("task", tasks),
        ("strategy", strategies),
        ("msid", msids),
        ("figure_label", figure_labels),
        ("run", runs),
    )
    for column, allowed in membership_filters:
        if allowed is None:
            continue
        df = df[df[column].isin(allowed)]

    if score_range is not None:
        min_score, max_score = score_range
        in_range = (df[score] >= min_score) & (df[score] < max_score)
        df = df[in_range]

    return display_all(df)

In [None]:
# Show every run of the caption-extraction task for one strategy, one
# manuscript and one figure label.
show_runs(
    df,
    tasks=["extract_figure_caption"],
    strategies=["gpt-4o t=0 t_p=0"],
    msids=["EMBOJ-2023-114687"],
    figure_labels=["Figure 1"],
)

In [None]:
# Show the legends-extraction runs of one strategy whose score falls in
# [0.0, 0.9); the upper bound is exclusive in show_runs.
show_runs(
    df,
    tasks=["extract_figure_legends"],
    strategies=["openai"],
    score_range=(0., 0.9),
)