## Imports

In [13]:
import multiprocessing
import re
from pathlib import Path
from typing import Any, Callable, Dict, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from natsort import natsorted

## Global information about each suite

In [14]:
# Map every run directory (./out/<version>/<run>) to a metadata dict that the
# following cells extend in place. Only directories are considered, so a stray
# file under ./out neither aborts the scan nor shows up as a bogus run.
runs = {
    run: {"version": version.name}
    for version in Path("./out").iterdir()
    if version.is_dir()
    for run in version.iterdir()
    if run.is_dir()
}


def add_entry_to_run(
    new_key: str, f: Callable[[Path, Dict[str, Any]], Any]
) -> None:
    """
    Compute a new metadata entry for every run and store it in place.

    Parameters:
    new_key (str): key under which the computed value is stored in each run's dict
    f (Callable[[Path, Dict[str, Any]], Any]): function taking the run's path and its
        current metadata dict and returning the value to store

    The module-level `runs` mapping is mutated; nothing is returned.
    """
    # `runs` is only read (its values are mutated), so no `global` statement is needed.
    for run_path, run_info in runs.items():
        run_info[new_key] = f(run_path, run_info)


# Store each run's own directory name under "name".
add_entry_to_run(new_key="name", f=lambda run_path, _info: run_path.name)

### Time

In [15]:
def get_time(p: Path, d: Dict[str, Any]) -> Optional[str]:
    """
    Read the `--max-time` value of a run from the first `klee/info` file found below it.

    Parameters:
    p (Path): directory of the run; searched recursively for `klee/info`
    d (Dict[str, Any]): metadata of the run (unused, required by `add_entry_to_run`)

    Returns:
    Optional[str]: the text following `--max-time=`, or None when no info file
        exists (a message is printed) or the option does not appear in the file
    """
    # next(..., None) instead of .__next__(): a run without any klee/info file
    # must be reported, not abort the whole cell with StopIteration.
    file_path = next(p.rglob("klee/info"), None)
    if file_path is None:
        print(f"Error for {p}")
        return None
    # NOTE(review): only the first match is read, presumably every util of a run
    # was started with the same --max-time; confirm.
    res = re.search(r"--max-time=(\w*)", file_path.read_text())
    return res.group(1) if res else None

# Attach the time value returned by get_time to every run under "time".
add_entry_to_run(new_key="time", f=get_time)

## Per util information

### Setup

In [16]:
# Give every run an empty table with one column per sub-directory of the run.
add_entry_to_run(
    new_key="df",
    f=lambda run_path, _info: pd.DataFrame(
        columns=[entry.name for entry in run_path.iterdir() if entry.is_dir()]
    ),
)

def add_entry_for_utils(key: str, f: Callable[[Path], Any]) -> None:
    """
    Compute one value per util for every run and store the values as a row.

    Parameters:
    key (str): label of the row that receives the values in each run's dataframe
    f (Callable[[Path], Any]): function taking the path to the subfolder for the util
        and returning the appropriate value

    Raises:
    Exception: if a column of a run's dataframe has no matching subfolder in that run
    """

    def _fill_row(run_path: Path, run_info):
        table = run_info["df"]
        row = {}
        for util_name in table.columns:
            util_dir = run_path / util_name
            if util_dir.exists():
                row[util_name] = f(util_dir)
            else:
                raise Exception(f"Path \"{util_name}\" does not exist")
        # Assigning to a new label appends the row to the run's table in place.
        table.loc[key] = row
        return table

    add_entry_to_run("df", _fill_row)

### Number of errors according to KLEE

In [17]:
def read_num_errors(util_path: Path) -> Optional[str]:
    """
    Count the `*.err` files KLEE left for one util.

    Parameters:
    util_path (Path): subfolder of the util inside a run

    Returns:
    Optional[str]: number of `*.err` files directly inside `<util_path>/klee`, as a
        string, or None (after printing a message) when that folder does not exist
    """
    klee_dir = util_path / "klee"
    # is_dir() rather than exists(): a regular file named "klee" is not a usable
    # output folder and must not be reported as "0" errors.
    if not klee_dir.is_dir():
        print(f"Error for {util_path}")
        return None
    # Count lazily; the matching paths themselves are not needed.
    return str(sum(1 for _ in klee_dir.glob("*.err")))

# Add a "num_errors" row (one value per util) to every run's table.
add_entry_for_utils(key="num_errors", f=read_num_errors)

Error for out/coreutils-8.25/1h-3/hostname
Error for out/coreutils-8.25/1h-3/setuidgid
Error for out/coreutils-8.25/1h-2/hostname
Error for out/coreutils-8.25/1h-2/setuidgid
Error for out/coreutils-8.25/1h/hostname
Error for out/coreutils-8.25/1h/setuidgid
Error for out/coreutils-9.4/1h/hostname
Error for out/coreutils-9.4/1h/setuidgid


### Coverage according to KLEE

In [18]:
def read_klee_csv(csv_name: str) -> Callable[[Path], Optional[str]]:
    """
    Build a reader for one column of a util's `klee-stats.csv`.

    Parameters:
    csv_name (str): name of the CSV column to read

    Returns:
    Callable[[Path], Optional[str]]: function taking the util's subfolder and
        returning the first row's value of that column as a string, or None
        (after printing a message) when the file is missing or empty, has no
        data row, or lacks the column
    """

    def f(util_path: Path) -> Optional[str]:
        file_path = util_path / "klee-stats.csv"
        if file_path.exists() and file_path.stat().st_size > 0:
            try:
                df = pd.read_csv(file_path)
            except pd.errors.EmptyDataError:
                # Non-empty on disk but nothing parseable (e.g. only blank lines).
                df = pd.DataFrame()
            # A header-only file or an unexpected header would otherwise raise
            # KeyError and abort the whole cell.
            if csv_name in df.columns and len(df) > 0:
                # .iloc[0]: first row by position, independent of the index labels.
                return str(df[csv_name].iloc[0])
        print(f"Error for {csv_name} — {util_path}")
        return None

    return f

# One row per KLEE coverage column: row label -> column name in klee-stats.csv.
for row_label, csv_column in (("klee_ICov", "ICov(%)"), ("klee_BCov", "BCov(%)")):
    add_entry_for_utils(row_label, read_klee_csv(csv_column))

Error for ICov(%) — out/coreutils-8.25/1h-3/hostname
Error for ICov(%) — out/coreutils-8.25/1h-3/setuidgid
Error for ICov(%) — out/coreutils-8.25/1h-2/hostname
Error for ICov(%) — out/coreutils-8.25/1h-2/setuidgid
Error for ICov(%) — out/coreutils-8.25/1h/hostname
Error for ICov(%) — out/coreutils-8.25/1h/setuidgid
Error for ICov(%) — out/coreutils-9.4/1h/hostname
Error for ICov(%) — out/coreutils-9.4/1h/setuidgid
Error for BCov(%) — out/coreutils-8.25/1h-3/hostname
Error for BCov(%) — out/coreutils-8.25/1h-3/setuidgid
Error for BCov(%) — out/coreutils-8.25/1h-2/hostname
Error for BCov(%) — out/coreutils-8.25/1h-2/setuidgid
Error for BCov(%) — out/coreutils-8.25/1h/hostname
Error for BCov(%) — out/coreutils-8.25/1h/setuidgid
Error for BCov(%) — out/coreutils-9.4/1h/hostname
Error for BCov(%) — out/coreutils-9.4/1h/setuidgid


### Coverage according to `gcov`

In [19]:
def read_gcov_cov(util_path: Path) -> Optional[str]:
    """
    Read the line-coverage percentage from a util's `cov.txt`.

    Parameters:
    util_path (Path): subfolder of the util inside a run

    Returns:
    Optional[str]: percentage (without the `%` sign) of the first
        `File '../src/<name>.c'` / `File '../../src/<name>.c'` entry that is
        followed by a `Lines executed:` line, or None (after printing a
        message) when the file is missing or contains no such entry
    """
    file_path = util_path / "cov.txt"
    if not file_path.exists():
        print(f"Error for {util_path}")
        return None
    # \d{1,3}\.\d\d: a percentage below 10 has a single digit before the decimal
    # point, and the separator must be a literal dot rather than any character.
    res = re.search(
        r"File '(\.\./)?\.\./src/(\w+)\.c'\nLines executed:(\d{1,3}\.\d\d)% of \d+",
        file_path.read_text(),
    )
    if res is None:
        print(f"No gcov line coverage found in {file_path}")
        return None
    return res.group(3)

# Add a "gcov_cov" row (one value per util) to every run's table.
add_entry_for_utils(key="gcov_cov", f=read_gcov_cov)

Error for out/coreutils-8.25/1h-3/hostname
Error for out/coreutils-8.25/1h-3/setuidgid
Error for out/coreutils-8.25/1h-2/hostname
Error for out/coreutils-8.25/1h-2/setuidgid
Error for out/coreutils-8.25/1h/hostname
Error for out/coreutils-8.25/1h/setuidgid
Error for out/coreutils-6.10/6h-3/uniq
Error for out/coreutils-6.10/6h-3/who
Error for out/coreutils-6.10/6h-3/tsort
Error for out/coreutils-9.4/1h/hostname
Error for out/coreutils-9.4/1h/setuidgid
Error for out/coreutils-9.4/1h/cksum
Error for out/coreutils-9.4/1h/wc


## Plots
### Massaging `df`s together

In [20]:
# Turn each run's table into long form (one row per row-label/util pair) and
# tag the rows with the run, its time value and its version.
dfs = []
for run_path, run_info in runs.items():
    long_df = (
        run_info["df"]
        .reset_index(names="key")
        .melt(id_vars="key", var_name="util")
    )
    long_df["run"] = run_path.name
    long_df["time"] = run_info["time"]
    long_df["version"] = run_info["version"]
    dfs.append(long_df)

# Stack all runs, make the values numeric and keep only rows that have a value.
combined_df = pd.concat(dfs)
combined_df["value"] = combined_df["value"].astype(np.float64)
combined_df = combined_df.dropna(subset=["value"]).reset_index(drop=True)
print(combined_df.sample(20))

             key      util  value      run     time         version
2562   klee_ICov      echo  36.64     1h-3    60min  coreutils-6.10
2167  num_errors     chmod   0.00     6h-2   360min  coreutils-6.10
4585   klee_ICov      kill  45.07  10min-3    10min  coreutils-6.10
1911   klee_BCov       who  22.45      24h  1440min  coreutils-6.10
268     gcov_cov     tsort  96.24     1h-3    60min  coreutils-8.25
1677    gcov_cov     pinky  55.67     6h-3   360min  coreutils-6.10
4948    gcov_cov       cut  83.08       1h    60min  coreutils-6.10
1415   klee_ICov    chroot  38.62     6h-3   360min  coreutils-6.10
3016   klee_ICov    uptime  42.09       6h   360min  coreutils-6.10
1598   klee_BCov        nl  26.39     6h-3   360min  coreutils-6.10
620     gcov_cov      stty  79.61     1h-2    60min  coreutils-8.25
4446  num_errors    md5sum   1.00    10min    10min  coreutils-6.10
204    klee_BCov     cksum  27.54     1h-3    60min  coreutils-8.25
1921  num_errors        mv   0.00      24h  1440

### Plots by coverage

In [21]:
# Grid of ECDF panels: one row per row-label ("key"), one column per version,
# one curve per run, coloured by the run's time value.
coverage_df = combined_df
versions = natsorted(coverage_df["version"].unique())
keys = natsorted(coverage_df["key"].unique())
time_categories = natsorted(coverage_df["time"].unique())

# squeeze=False keeps `axes` two-dimensional even with a single key or a single
# version, so axes[row, col] below is always valid.
fig, axes = plt.subplots(
    nrows=len(keys),
    ncols=len(versions),
    figsize=(10 * len(versions), 5 * len(keys)),
    dpi=300,
    squeeze=False,
)
fig.suptitle("Empirical Cumulative Distribution Function (ECDF)", fontsize=20, y=0.99)
# One colour per time value, shared by all panels.
color_map = dict(zip(time_categories, sns.color_palette(n_colors=len(time_categories))))

for version_i, version in enumerate(versions):
    version_df = coverage_df[coverage_df["version"] == version]
    for key_i, key in enumerate(keys):
        key_df = version_df[version_df["key"] == key]
        ax = axes[key_i, version_i]
        ax.set_title(f"{key} — {version}")
        for time in natsorted(key_df["time"].unique()):
            time_df = key_df[key_df["time"] == time]
            for run_i, run in enumerate(time_df["run"].unique()):
                sns.ecdfplot(
                    y="value",
                    data=time_df[time_df["run"] == run],
                    ax=ax,
                    color=color_map[time],
                    # Runs with the same time value share a colour; only the
                    # first one gets a legend entry.
                    label=time if run_i == 0 else "_nolegend_",
                    stat="count",
                )
        ax.legend(title="Time")
fig.tight_layout()
# The output folder may not exist yet when the notebook runs on a fresh checkout.
Path("plots").mkdir(parents=True, exist_ok=True)
fig.savefig("plots/ecdf.png")
plt.close(fig)

### Gains by time

In [22]:
# Version whose runs are compared across time values.
GAINS_VERSION = "coreutils-6.10"

# Average the runs that share the same (key, time, util) before comparing times.
gains_df = combined_df[combined_df["version"] == GAINS_VERSION].drop(
    columns=["run", "version"]
)
gains_df = gains_df.groupby(["key", "time", "util"], as_index=False).mean()
keys = natsorted(gains_df["key"].unique())

# squeeze=False keeps the result an array even with a single key; take the only
# column so `axes[i]` is always one Axes.
fig, axes_grid = plt.subplots(
    nrows=len(keys), ncols=1, figsize=(5, 3 * len(keys)), dpi=300, squeeze=False
)
axes = axes_grid[:, 0]
for i, key in enumerate(keys):
    df_keys = gains_df[gains_df["key"] == key]
    df_keys = df_keys.drop(columns="key")
    # Order the time values naturally so that diff() compares each one with the
    # next smaller one of the same util.
    order = natsorted(df_keys["time"].unique())
    df_keys["time"] = pd.Categorical(df_keys["time"], categories=order, ordered=True)
    df_keys = df_keys.sort_values(["util", "time"])
    # NOTE: when a util has no value for some time, its difference spans a wider
    # interval but is still attributed to the later time.
    df_keys["difference"] = df_keys.groupby("util", as_index=False)["value"].diff()
    df_keys = df_keys.drop(columns=["util", "value"])
    df_keys = df_keys.groupby(["time"], as_index=False, observed=True).mean()
    # Relabel each bar as "<previous time> - <time>".
    df_keys["time"] = (
        df_keys["time"].shift(1).astype(str) + " - " + df_keys["time"].astype(str)
    )
    # The smallest time has no predecessor, so its difference is NaN and is dropped.
    df_keys = df_keys.dropna()
    sns.barplot(data=df_keys, x="time", y="difference", ax=axes[i])
    axes[i].set_ylabel(f"average {key} gained")
fig.tight_layout()
# The output folder may not exist yet when the notebook runs on a fresh checkout.
Path("plots").mkdir(parents=True, exist_ok=True)
fig.savefig("plots/gains_by_time.png")
plt.close(fig)

### Plots by util

In [23]:
def paint_util(args):
    """
    Draw a strip plot of one util's values over time and save it as a PNG.

    Parameters:
    args (tuple): `(key, util, key_df)` packed into a single argument; `key` labels
        the y axis and selects the output folder, `util` selects the rows of
        `key_df` to plot and names the output file

    The figure is written to `plots/<key>/<util>.png` and closed afterwards.
    """
    key, util, key_df = args
    selected = (
        key_df[key_df["util"] == util]
        .drop(columns="util")
        .sort_values(by="version")
    )
    time_order = natsorted(selected["time"].unique())

    fig, ax = plt.subplots(1, 1, figsize=(8, 6), dpi=300)
    sns.stripplot(
        data=selected,
        ax=ax,
        x="time",
        y="value",
        hue="version",
        order=time_order,
    )
    ax.set_title(f"{util}")
    ax.set_ylabel(key)
    fig.tight_layout()
    fig.savefig(f"plots/{key}/{util}.png")
    plt.close(fig)


# One worker pool serves all keys instead of a new pool per key.
with multiprocessing.Pool() as pool:
    for key in natsorted(combined_df["key"].unique()):
        key_df = combined_df[combined_df["key"] == key].drop(columns="key")
        # parents=True also creates plots/ itself when it does not exist yet.
        Path(f"plots/{key}").mkdir(parents=True, exist_ok=True)

        utils = natsorted(key_df["util"].unique())

        # Send each task only the rows of its own util rather than the whole
        # per-key table; paint_util filters by util again, which is then a no-op.
        pool.map(
            paint_util,
            [(key, util, key_df[key_df["util"] == util]) for util in utils],
        )

        print(f"Done with {key}")


Done with gcov_cov
Done with klee_BCov
Done with klee_ICov
Done with num_errors
