# Analysis Notebook for BenchExec Experiments

This notebook pre-processes the results obtained from BenchExec and plots them using seaborn.

In [None]:
import glob
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# sys.dont_write_bytecode = True  # prevent creation of .pyc files

# Folder holding the BenchExec result CSVs read by this notebook; the generated
# plot (coverage_plot.png) and stats table (coverage_stats.csv) are written here too.
CSV_FOLDER = "stats/"

## 1. Load data from CSV

Collect all CSV result files under `CSV_FOLDER` folder.

In [None]:
# Collect the per-run result files. The list is sorted because glob returns
# matches in arbitrary (filesystem-dependent) order, which would otherwise make
# the row order of the concatenated frame differ from machine to machine.
print(f"CSV files found in folder {CSV_FOLDER}:")
files = sorted(glob.glob(os.path.join(CSV_FOLDER, "results.*.csv")))

# optional filter
# files = [f for f in files if "PALA" not in f]

files

In [None]:
# Fail early with a readable message: on an empty list pd.concat raises an
# opaque "No objects to concatenate" ValueError.
if not files:
    raise FileNotFoundError(f"No results.*.csv files found in {CSV_FOLDER!r}")

# read every result file and stack them into one frame with a fresh 0..n-1 index
dfs = [pd.read_csv(f, delimiter=",", skiprows=0) for f in files]
df = pd.concat(dfs, ignore_index=True)

df

Add a Boolean `solved` column:

In [None]:
# Flag each run as solved (status == 1) or not, as the 4th column of the frame.
# DataFrame.insert raises ValueError when the column already exists, so drop any
# previous "solved" column first; this keeps the cell safe to re-run.
if "solved" in df.columns:
    df = df.drop(columns="solved")
df.insert(3, "solved", df["status"].eq(1))

df

In [None]:
# distinct solver/run identifiers present in the loaded results
solver_names = df["solver"].unique()
print("Solvers/run found:", solver_names)

Finally, optionally save all results into a single CSV file (uncomment the line below to do so).

In [None]:
# df.to_csv(os.path.join(CSV_FOLDER, "results_all.csv"), index=False)

## 2. Analysis example for set benchmark (e.g., AIJ, IJCAI, SOCS, etc.)

First select subset of interest (set and solver runs):

In [None]:
# Solver runs and domains to analyse. By default everything found in the data is
# kept; replace either value with a subset to restrict the rest of the notebook.
SOLVERS = df["solver"].unique()
DOMAINS = df["domain"].unique()

print("Solvers selected:", SOLVERS)
print("Domains selected:", DOMAINS)

# Keep only the rows for the chosen solvers and domains. The selection is no
# longer overwritten with the full frame afterwards, so edits to SOLVERS/DOMAINS
# take effect. .copy() makes df_sel independent of df.
df_sel = df.loc[df["solver"].isin(SOLVERS) & df["domain"].isin(DOMAINS)].copy()

df_sel.head()

Let's do a quick scatter plot for each domain, showing the CPU time of every solver run.

In [None]:
# one scatter panel per domain: solver run on the x-axis, time on the y-axis
g = sns.relplot(
    data=df_sel,
    kind="scatter",
    x="solver",
    y="time",
    col="domain",
    col_wrap=6,
    height=3.5,
    aspect=1,
    legend=True,
)

# panel title is just the domain name
g.set_titles(template="{col_name}", y=1)

# rotate x-labels: reuse the tick labels of the last panel in the grid (it must have the labels!)
last_panel_labels = g.axes[-1].get_xticklabels()
g.set_xticklabels(labels=last_panel_labels, rotation=90)

## 3. Time-coverage charts for ALL full domains in selected set benchmark

Count how MANY instances per full domain:

In [None]:
# grouping key: one group per domain
selection_index = ["domain"]

# number of rows each solver/run has within each domain (e.g., how many PRP runs in Barman),
# flattened into a frame with columns: domain, solver, count
count_df = (
    df_sel.groupby(by=selection_index)["solver"]
    .value_counts()
    .reset_index(name="count")
)

count_df


Next calculate coverage for each solver run in each full domain:

In [None]:
def compute_coverage(df: pd.DataFrame) -> pd.DataFrame:
    """Return the percentage of solved rows for every (solver, domain) pair.

    Args:
        df: frame with at least the columns ``solver``, ``domain`` and a
            Boolean ``solved``.

    Returns:
        A frame with columns ``solver``, ``domain``, ``solved`` (always True)
        and ``percent`` (0-100, rounded to whole numbers), one row per
        (solver, domain) pair. Pairs with no solved row are reported as 0.
    """
    group_keys = ["solver", "domain"]

    # share (in %) of True and of False within each (solver, domain) group
    shares = (
        df.groupby(by=group_keys)["solved"]
        .value_counts(normalize=True)
        .mul(100)
        .rename("percent")
        .reset_index()
    )

    # A group that is 100% unsolved has no True row at all; rewrite its single
    # False/100 row as True/0 so the group still shows up in the result.
    never_solved = shares["percent"].eq(100) & ~shares["solved"]
    shares.loc[never_solved, ["solved", "percent"]] = [True, 0]

    # keep only the "solved" share of each group
    return shares[shares["solved"]].round(0)

# coverage per (solver, domain), extended with the run counts from count_df
# (no explicit key is given, so the merge joins on the columns both frames share)
coverage_df = compute_coverage(df_sel).merge(count_df)
coverage_df

# SOME FILTERS
# coverage_df.query("not solved and percent == 100")
# coverage_df.query("solved and percent == 0")

Let's check the coverage in a particular full domain:

In [None]:
# peek at the coverage rows of one domain picked at random
sampled_domain = random.choice(coverage_df["domain"].unique())
coverage_df.loc[coverage_df["domain"] == sampled_domain]

Some useful links to make nice charts:

* [Changing plot style and color](https://s3.amazonaws.com/assets.datacamp.com/production/course_15192/slides/chapter4.pdf).
* [Advanced Seaborn: Demystifying the Complex Plots!](https://levelup.gitconnected.com/advanced-seaborn-demystifying-the-complex-plots-537582977c8c#5965 )

OK, this is the main code for drawing the combined time-coverage charts across a full benchmark set (e.g., AIJ), as done with the R script.

For each full domain (e.g., Barman-EIGHT50), draw a plot showing scatter time performance across instances per solver/run AND coverage bars superimposed. These are Nitin's great graphs, originally done in R for ECAI'23.

In each subplot, the title shows the full domain with the number of instances run (e.g., "Barman-EIGHT50 (20)": 20 instances run for Barman-EIGHT50 full domain).

In [None]:
# Builds the combined figure: one subplot per domain with a scatter of the time of
# every solved run (x = time, y = solver) and, on a second x-axis, a bar per solver
# showing its coverage percentage taken from coverage_df.
#
# https://seaborn.pydata.org/tutorial/aesthetics.html
# https://seaborn.pydata.org/generated/seaborn.set_theme.html
sns.set_theme()

# fig, ax = plt.subplots()

# FIRST graph for cputime per solver
# sns.set_style("darkgrid", {"axes.facecolor": ".9"})
sns.set_style("darkgrid")

# box for the title of each subplot (bbox1) and for the coverage labels on the bars (bbox2)
# https://matplotlib.org/stable/api/_as_gen/matplotlib.patches.FancyBboxPatch.html#matplotlib.patches.FancyBboxPatch
bbox1 = dict(boxstyle="square", fc="lightblue", fill=True, color='r')
bbox2 = dict(boxstyle="round", fc="0.9", fill=True, color='r')


## FIRST, we do a scatter plot for each full domain showing cputime for each solved instance across each solver/run
# https://seaborn.pydata.org/generated/seaborn.relplot.html#seaborn.relplot
# NOTE: only solved rows are passed in, so a domain without any solved run contributes no data here.
g = sns.relplot(data=df_sel.query('solved'), kind="scatter", x='time', y='solver', col="domain", col_wrap=6, height=4, aspect=1.2)
g.set_axis_labels("time", "solver")
# Let's configure the title of each subplot
#   most options are passed to text: https://matplotlib.org/stable/api/text_api.html
#   the title is exactly the domain name; the loop below relies on that to look up the domain
g.set_titles(col_template='{col_name}', fontweight="demibold",  ha='center', va='center', bbox=bbox1)

# set the whole figure title
g.figure.suptitle(
    f"Coverage Results", ha="left", va="top", fontsize="xx-large", y=1
)  # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.suptitle.html


print("Finished building scattered plot of cputime. Next overlapping coverage bars...")

# get all the axes (subplots) of the FacetGrid
axes = g.axes.flatten()

## SECOND, super-impose the COVERAGE data in each subplot in the grid as done in https://stackoverflow.com/a/67612124
#   we also rename the title of each subplot to include no of instances run
#   we iterate on each axis and plot a barplot and add annotations/styles to it
# NOTE(review): this style switch happens after the grid was created; presumably it only
#   affects the twin axes created below - TODO confirm
sns.set_style("ticks")  # just ticks, no grid
for ax in axes:
    # full_domian = ax.get_title().split(' = ')[1]
    # the subplot title set above is the domain name
    domain = ax.get_title()

    # redo title of subfigure to include number of instances between parenthesis, e.g., BARMAN-EIGHT50 (20)
    #   the count is taken from the first coverage row of the domain; if solvers have
    #   different counts in the same domain only the first distinct value is shown
    no_instances = coverage_df.loc[coverage_df.domain == ax.get_title()]['count'].unique()[0]
    ax.set_title(f"{domain} ({no_instances})", fontweight="demibold",  ha='center', va='center', bbox=bbox1)

    # add bar of coverage % for each run/solver
    ax2 = ax.twiny()    # twin axes sharing the y-axis, with its own x-axis: https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.twiny.html
    # near-zero bar width with a thick edge and a fully transparent face: each bar is drawn as a line
    # NOTE(review): the bars are placed by solver on the shared y-axis; verify that the solver
    #   order used by barplot matches the order of the scatter plot - TODO confirm
    g2 = sns.barplot(data=coverage_df[coverage_df.domain.eq(domain)], x="percent", y="solver", width=0.0001, linewidth=2.5, edgecolor=".5", facecolor=(0, 0, 0, 0), ax=ax2)
    g2.set_xlabel('coverage', x=0, ha="left")
    g2.set_xlim([0, 100])

    # add box with % of coverage at the end of the barline
    if len(ax2.containers) > 0 :
        ax2.bar_label(ax2.containers[0], label_type='edge', padding=-5, fontweight='normal', rotation="horizontal", bbox=bbox2)


# axes[0].legend().remove()
# g.set_axis_labels(x_var=None, y_var=None, clear_inner=True)
sns.despine(left=True, bottom=True) # no spines at all

# Save it later, not here.
# plt.savefig(os.path.join(CSV_FOLDER, f"{SET}_plot.png"))

plt.tight_layout()  # at the end adjust so everything fits tight but well
plt.show()

Save graph in a PNG file:

In [None]:
# write the combined time-coverage figure next to the input CSVs
plot_path = os.path.join(CSV_FOLDER, "coverage_plot.png")
g.savefig(plot_path)

## 4. Coverage Analysis

We now generate **coverage** tables, as they often appear in papers. Basically, for each benchmark set, domain, APP type sub-domain, and solver-run, we compute:

- **Coverage:** % of solved instances solved by the solver-run; and
- **Stat metrics:** mean on time, memory usage, and policy size.

In [None]:
# df = pd.read_csv(os.path.join(CSV_FOLDER,"results_all.csv"))
# continue with the selection made in Section 2 (alternatively, reload the saved CSV above)
df = df_sel

n_rows_cols = df.shape
print(n_rows_cols)
df.head()

Calculate the coverage ratio (fraction of solved instances, between 0 and 1) per domain and solver run.

In [None]:
df_grouped = df.groupby(["domain", "solver"])

# Coverage = number of True values / number of rows, i.e. the mean of the Boolean
# "solved" column. Only that column is aggregated: summing the whole group would
# also try to add up every other column (including text columns), which is
# wasteful and can fail on non-numeric data.
# The result is a fraction in [0, 1], indexed by (domain, solver).
df_coverage = df_grouped[["solved"]].mean()
df_coverage

Calculate the mean metrics across the solved instances (currently only CPU time; memory and policy size are not included).

In [None]:
# grouping keys plus the metric(s) to average
columns = ["domain", "solver", "time"]

# restrict to solved runs, keeping only the columns of interest
df_solved = df.loc[df["solved"].eq(True), columns]

# mean of each metric per (domain, solver)
df_solved_grouped = df_solved.groupby(["domain", "solver"])
df_metrics = df_solved_grouped.mean()
df_metrics

Put together **Coverage** and **Metrics** tables.

In [None]:
# short column names used in the final table
column_names = {
    "solved": "cov",
    "time": "time"
    # "memory_mb": "mem",
    # "policy_size": "size",
}

# Attach the mean metrics to the coverage table. A left join keeps every
# (domain, solver) pair of the coverage table: pairs with no solved instance have
# no metrics row, and an inner join would silently drop them from the table
# instead of reporting them with coverage 0 (their metrics are NaN here).
df_stats = (
    df_coverage.join(df_metrics, how="left")
    .rename(columns=column_names)
    .reset_index()
)
# df_stats.insert(0, "set", df_stats.pop("set"))

df_stats

In [None]:
# one row per domain; one (metric, solver) column per combination
wide_stats = df_stats.pivot(
    index=["domain"],
    columns="solver",
    values=["cov", "time"],
)

# unfold the multi-index into columns (create integer index)
wide_stats = wide_stats.reset_index()

# collapse the two column levels into flat names such as "cov_<solver>";
# the "domain" column has an empty second level, hence the rstrip
flat_names = []
for levels in wide_stats.columns.values:
    flat_names.append("_".join(levels).rstrip("_"))
wide_stats.columns = flat_names

# flat index, but multi-column: 1. coverage / time / policy size and 2. each solver/run
df_stats_pivot = wide_stats.round(2)

df_stats_pivot

Save it to a CSV file that can be used in the paper.

In [None]:
# write the flattened coverage/metrics table next to the input CSVs (no index column)
stats_path = os.path.join(CSV_FOLDER, "coverage_stats.csv")
df_stats_pivot.to_csv(stats_path, index=False)