# rsbids Benchmarking

Benchmark indexing and querying across rsbids, pybids, ancpbids, and bids2table. The benchmarking itself is done with `pytest-benchmark`. This notebook is for visualizing the results.

All benchmarks are performed with the default settings of `pytest-benchmark`, with 5 rounds per test.

This notebook expects to find test results within the `.benchmarks` folder, with files labelled according to the pytest marks. See the project readme for more details.

In [None]:
import json
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
from pathlib import Path

import matplotlib.font_manager as fm
# Register every file found in the bundled fonts directory with matplotlib's
# font manager (presumably the fonts the custom style sheet refers to), then
# activate that style sheet and render figures at 300 dpi.
for font_path in Path("styles/fonts").iterdir():
    fm.fontManager.addfont(str(font_path))
plt.style.use("styles/pitayasmoothie-dark.mplstyle")
plt.rcParams["figure.dpi"] = 300


In [None]:
# Preview the raw statistics table for the "query" benchmark, in milliseconds.
# NOTE(review): `get_data` is defined in the cell *below* this one, so on a
# fresh kernel ("Restart & Run All") this cell raises NameError. Run it after
# the definition cell, or move it below the definition.
get_data("query", unit="ms")

In [None]:
def get_data(task, unit="s"):
    """Load the benchmark statistics recorded for one task.

    Reads the first file under ``.benchmarks/`` (searched recursively) whose
    name matches ``000[1234]_{task}.json``, flattens its ``"benchmarks"`` list
    with ``pd.json_normalize`` and keeps only the ``stats.*`` columns plus
    three added columns:

    * ``group`` -- tool name (``pybids``, ``rsbids``, ``bids2table`` or
      ``ancp``) extracted from each benchmark's ``name``; NaN when none of
      them occurs in the name.
    * ``task`` -- the ``task`` argument, repeated on every row.
    * ``unit`` -- the ``unit`` argument, repeated on every row.

    Parameters
    ----------
    task : str
        Suffix of the results file to load (e.g. ``"indexing"``, ``"query"``).
    unit : {"s", "ms", "us", "ns"}, default "s"
        Unit for the timing columns. Stored values are taken to be seconds
        and are multiplied by the matching factor.

    Returns
    -------
    pandas.DataFrame
        One row per benchmark entry in the file.

    Raises
    ------
    ValueError
        If ``unit`` is not one of the supported units (previously such a unit
        was accepted and the data was silently left in seconds).
    FileNotFoundError
        If no results file for ``task`` exists under ``.benchmarks/``.
    """
    # Multipliers that convert seconds into the requested unit.
    scale_by_unit = {"s": 1, "ms": 1_000, "us": 1_000_000, "ns": 1_000_000_000}
    if unit not in scale_by_unit:
        raise ValueError(
            f"Unsupported unit {unit!r}; expected one of {sorted(scale_by_unit)}"
        )
    # Only these statistics are rescaled; other stats columns (e.g. counts)
    # are left untouched.
    time_stats = ("min", "max", "mean", "stddev", "median", "iqr", "q1", "q3")

    pattern = f"**/000[1234]_{task}.json"
    # NOTE: if several runs match, the first one yielded by glob() is used.
    result_file = next(Path(".benchmarks/").glob(pattern), None)
    if result_file is None:
        raise FileNotFoundError(
            f"No benchmark results matching {pattern!r} found under '.benchmarks/'"
        )

    df = pd.json_normalize(json.loads(result_file.read_text())["benchmarks"]).assign(
        # expand=False returns a Series, which is what a single column needs.
        group=lambda df: df["name"].str.extract(
            "(pybids|rsbids|bids2table|ancp)", expand=False
        ),
    )
    df = df.loc[:, df.columns.str.startswith("stats") | (df.columns == "group")].assign(
        task=task, unit=unit
    )

    scale = scale_by_unit[unit]
    if scale != 1:
        for stat in time_stats:
            col = f"stats.{stat}"
            if col in df.columns:
                df[col] = df[col] * scale
    return df


# Fixed tool -> colour lookup: the tools are paired, in the order listed, with
# the first four colours of the active matplotlib property cycle, so that a
# given tool gets the same colour wherever this mapping is used.
cmap = {
    tool: color
    for tool, color in zip(
        ("pybids", "rsbids", "bids2table", "ancp"),
        plt.rcParams["axes.prop_cycle"][:4].by_key()["color"],
    )
}

## Create the Results Table

In [None]:
# Print a Markdown table of the results: one row per tool, one column per
# task (with its unit), each cell formatted as "mean ± stddev".
print(
    pd.concat(
        [
            get_data("indexing"),
            get_data("query", unit="ms"),
            get_data("large_query", unit="ms"),
            get_data("metadata"),
        ],
        ignore_index=True,
    )
    .assign(
        # The centre value is the *mean* of the rounds. The previous version
        # read "stats.max" here while still labelling the value as the mean.
        Time=lambda df: (
            df["stats.mean"].map("{:0.3f}".format)
            + " ± "
            + df["stats.stddev"].map("{:0.3f}".format)
        ),
        # Row labels: Markdown link to each tool plus the benchmarked version.
        Tool=lambda df: df["group"].map(
            {
                "rsbids": "[`rsbids`](https://github.com/pvandyken/rsbids) (v0.0.1a1)",
                "pybids": "[`pybids`](https://github.com/bids-standard/pybids) (v0.16.3)",
                "ancp": "[`ancpbids-bids`](https://github.com/ANCPLabOldenburg/ancp-bids) (v0.2.2)",
                "bids2table": "[`bids2table`](https://github.com/cmi-dair/bids2table) (v0.1.0a0)",
            }
        ),
        # Column headers: human-readable task name followed by its unit,
        # e.g. "Query (ms)".
        task=lambda df: (
            df["task"].map(
                {
                    "indexing": "Indexing",
                    "query": "Query",
                    "large_query": "Large Query",
                    "metadata": "Indexing w/ Metadata",
                }
            )
            + " ("
            + df["unit"]
            + ")"
        ),
    )
    .pivot(index="Tool", columns="task", values="Time")
    .to_markdown()
)

## Create the figure

In [None]:
def plot(df, ax, x_lim=None, unit="s"):
    """Draw a horizontal bar chart of mean benchmark time per tool.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``stats.mean`` column and a ``group`` column whose
        values are keys of the module-level ``cmap``. The caller's frame is
        not modified; a sorted copy is used.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    x_lim : number, optional
        Upper limit for the x axis (lower limit is 0). Ignored when falsy.
    unit : str, default "s"
        Unit shown in the x-axis label. When ``"ms"``, ``stats.mean`` is
        multiplied by 1000 before plotting, so the input must be in seconds.

    Returns
    -------
    matplotlib.axes.Axes
        The axes returned by ``sns.barplot``.
    """
    # Sorting returns a new frame, fastest tool first.
    ordered = df.sort_values("stats.mean")
    if unit == "ms":
        ordered["stats.mean"] = ordered["stats.mean"] * 1000

    # Make seaborn adopt the currently active matplotlib rc settings.
    sns.set_style(rc=plt.rcParams)

    bar_colors = ordered["group"].map(cmap)
    ax = sns.barplot(
        ordered, x="stats.mean", y="group", ax=ax, palette=bar_colors
    )
    ax.set_ylabel("Tool")
    ax.set_xlabel(f"Time ({unit})")

    # Print each bar's value (two decimals) just past the end of the bar.
    value_labels = ordered["stats.mean"].map("{:0.2f}".format)
    ax.bar_label(ax.containers[0], value_labels, padding=3)

    if x_lim:
        ax.set_xlim([0, x_lim])
    return ax


def _label_clipped_bar(ax, df, x, fmt, tool="ancp", y=3.05):
    """Write ``tool``'s mean time in ms as bold black text at data point (x, y).

    ``df`` is expected to hold ``stats.mean`` in seconds. The default
    ``y=3.05`` targets the fourth bar; ``plot`` sorts rows by ascending mean,
    so this presumably relies on ``tool`` being the slowest of four tools.
    """
    mean_ms = df.set_index("group").loc[tool]["stats.mean"] * 1000
    ax.text(x, y, fmt.format(mean_ms), weight="bold", color="black")


# 2x2 summary figure: indexing benchmarks on the top row, query benchmarks on
# the bottom row.
fig = plt.figure(figsize=(10, 6), layout="constrained")
axs = fig.subplots(2, 2)

ax = plot(get_data("indexing"), axs[0, 0])
ax.set_title("Indexing 177k files (no metadata)", weight="bold")

ax = plot(get_data("metadata"), axs[0, 1])
ax.set_title("Indexing 177k files (with metadata)", weight="bold")

# The query panels cap the x axis at 1000 ms; the "ancp" value is written as
# text with an arrow, presumably because its bar runs past that limit. Each
# results file is loaded once and reused for both the bars and the annotation.
query_df = get_data("query")
ax = plot(query_df, axs[1, 0], x_lim=1000, unit="ms")
ax.set_title("Small query (single subject)", weight="bold")
_label_clipped_bar(ax, query_df, x=850, fmt="{:0.0f} →")

large_query_df = get_data("large_query")
ax = plot(large_query_df, axs[1, 1], x_lim=1000, unit="ms")
# Thousands separator in the format makes this label wider, hence the smaller x.
_label_clipped_bar(ax, large_query_df, x=820, fmt="{:,.0f} →")
ax.set_title("Large query (14 subject, 1 run, 1 suffix)", weight="bold")