## Imports

In [61]:
from typing import Callable, Any, Dict
import re

from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from natsort import natsorted

## Global information about each suite

In [62]:
runs  = {e: {} for e in Path("./out").iterdir()}

def add_entry_to_run(new_key: str, f: Callable[[Path, Dict[str, Any]], Any]) -> Dict[Path, Dict[str, Any]]:
    global runs
    for key, value in runs.items():
        value.update({new_key: f(key, value)})

add_entry_to_run("name", lambda p, d: p.name)

### Time

In [63]:
def get_time(p:Path, d):
    file_path = p.rglob("klee/info").__next__()
    if file_path.exists():
        with open(file_path, "r") as file:
            res = re.search(r"--max-time=(\w*)", file.read())
            if res:
                return res.group(1)
    return None

add_entry_to_run("time", get_time)

## Per util information

### Setup

In [64]:
add_entry_to_run("df", lambda p, d: pd.DataFrame(columns=[e.name for e in p.iterdir() if e.is_dir()]))

def add_entry_for_utils(key: str, f: Callable[[Path], Any]) -> None:
    """
    Add entry for all utils

    Paramenters:
    key (str): key to add the new value at in the dataframe
    f (Callable[[Path], Any]): function taking the path to the subfolder for the util and returning the appropriate value
    """

    def adder(p: Path, d):
        df = d["df"]
        res = {}
        for util in df.columns:
            path = p / util
            if not path.exists():
                raise Exception(f"Path \"{util}\" does not exist")
            res[util] = f(path)
        df.loc[key] = res
        return df
    add_entry_to_run("df", adder)

### Number of errors according to KLEE

In [65]:
def read_num_errors(util_path: Path) -> str:
    file_path = util_path / "klee"
    if file_path.exists():
        return str(len(list(file_path.glob("*.err"))))
    return None

add_entry_for_utils("num_errors", read_num_errors)

### Coverage according to KLEE

In [66]:
def read_klee_csv(csv_name: str) -> Callable[[Path], str]:
    def f(util_path: Path) -> str:
        file_path = util_path / "klee-stats.csv"
        if file_path.exists():
            df = pd.read_csv(file_path)
            return str(df[csv_name][0])
        return None
    return f

add_entry_for_utils("klee_ICov", read_klee_csv("ICov(%)"))
add_entry_for_utils("klee_BCov", read_klee_csv("BCov(%)"))

### Coverage according to `gcov`

In [67]:
def read_gcov_cov(util_path: Path) -> str:
    file_path = util_path / "cov.txt"
    if file_path.exists():
        with open(file_path, "r") as file:
            res = re.search(r"File '\.\./\.\./src/(\w+)\.c'\nLines executed:(\d?\d\d.\d\d)% of \d+", file.read())
            if res:
                return res.group(2)
    return None

add_entry_for_utils("gcov_cov", read_gcov_cov)

## Plots
### Massaging `df`s together

In [68]:
dfs = []
for k, v in runs.items():
    df = v["df"]
    df = df.reset_index(names="key")
    df = df.melt(id_vars="key", var_name="util")
    # .melt(id_vars="")
    df["DataFrame"] = str(k)
    df["time"] = v["time"]
    dfs.append(df)

combined_df = pd.concat(dfs)
combined_df['value'] = combined_df['value'].astype(np.float64)
combined_df = combined_df.dropna(subset=['value'])
combined_df = combined_df.reset_index(drop=True)
print(combined_df.sample(20))

             key      util   value                   DataFrame     time
2019   klee_BCov    csplit   19.16        out/coreutils-6.10-4    60min
2108  num_errors       pwd    0.00          out/coreutils-6.10    60min
1161    gcov_cov        df   72.40    out/coreutils-6.10-10min    10min
2815    gcov_cov  unexpand   91.76      out/coreutils-6.10-24h  1440min
477     gcov_cov       sum   90.36        out/coreutils-6.10-3    60min
2063    gcov_cov     rmdir   81.03        out/coreutils-6.10-4    60min
3040  num_errors      head    2.00      out/coreutils-6.10-24h  1440min
1421   klee_ICov      join   42.32        out/coreutils-6.10-2    60min
1631    gcov_cov        du   83.51        out/coreutils-6.10-2    60min
3365    gcov_cov        od   84.53  out/coreutils-6.10-10min-2    10min
547    klee_ICov      comm   40.50        out/coreutils-6.10-3    60min
2332  num_errors     nohup    0.00          out/coreutils-6.10    60min
2245   klee_ICov   dirname   37.01          out/coreutils-6.10  

### Plots by coverage

In [69]:
coverage_df = combined_df.drop(columns="DataFrame")
keys = np.sort(coverage_df["key"].unique())
fig, axes = plt.subplots(nrows=len(keys), ncols=1, figsize=(10, 5*len(keys)), dpi=300)
fig.suptitle(f"Empirical Cumulative Distribution Function (ECDF)", fontsize=20, y=0.99)
for time in natsorted(coverage_df["time"].unique()):
    filtered_by_time_df = coverage_df[coverage_df["time"] == time]
    filtered_by_time_df = filtered_by_time_df.drop(columns="time")
    filtered_by_time_df = filtered_by_time_df.groupby(["util", "key"]).mean()
    filtered_by_time_df = filtered_by_time_df.reset_index()
    for key_i, key in enumerate(np.sort(keys)):
        filtered_by_key_df = filtered_by_time_df[filtered_by_time_df['key'] == key]
        axes[key_i].set_title(key)
        sns.ecdfplot(y="value", data=filtered_by_key_df, ax=axes[key_i], label=time)
        axes[key_i].legend()
plt.tight_layout()
plt.savefig(f"plots/by-time.png")
plt.close()

### Gains by time

In [70]:
df = combined_df.drop(columns=["DataFrame"])
df = df.groupby(["key", "time", "util"]).mean()
df = df.reset_index()
keys = np.sort(df["key"].unique())
fig, axes = plt.subplots(nrows=len(keys), ncols=1, figsize=(5, 3*len(keys)), dpi=300)
for i, key in enumerate(keys):
    df_keys = df[df["key"] == key]
    df_keys = df_keys.drop(columns="key")
    order = natsorted(df_keys["time"].unique())
    df_keys['time'] = pd.Categorical(df_keys['time'], categories=order, ordered=True)
    df_keys = df_keys.sort_values(['util', 'time'])
    df_keys['difference'] = df_keys.groupby('util')['value'].diff()
    df_keys = df_keys.reset_index()
    df_keys = df_keys.drop(columns=["util", "value"])
    df_keys = df_keys.groupby(["time"]).mean()
    df_keys = df_keys.reset_index()
    df_keys['time'] = df_keys['time'].shift(1).astype(str) + ' - ' + df_keys['time'].astype(str)
    df_keys = df_keys.dropna()
    sns.barplot(data=df_keys, x="time", y="difference", ax=axes[i])
    axes[i].set_ylabel(f"average {key} gained")
plt.tight_layout()
plt.savefig(f"plots/gains_by_time.png")
plt.close()


  df_keys = df_keys.groupby(["time"]).mean()
  df_keys = df_keys.groupby(["time"]).mean()
  df_keys = df_keys.groupby(["time"]).mean()
  df_keys = df_keys.groupby(["time"]).mean()


### Plots by util

In [71]:
keys = np.sort(combined_df["key"].unique())
for key in keys:
    filtered_by_key_df = combined_df[combined_df["key"] == key]
    utils = np.sort(filtered_by_key_df["util"].unique())
    fig, axes = plt.subplots(
            nrows=int(np.ceil(len(utils)/5)),
            ncols=5,
            figsize=(25, len(utils)),
            dpi=300
        )
    for i_util, util in enumerate(utils):
        filtered_by_util_df = filtered_by_key_df[filtered_by_key_df['util'] == util]
        ax = axes[i_util//5, i_util%5]
        ax.set_title(f"{util}")
        ax.set_ylabel(key)
        order = natsorted(filtered_by_util_df["time"].unique())
        sns.boxplot(x="time", y="value", data=filtered_by_util_df, ax=ax, order=order)
    plt.tight_layout()
    plt.savefig(f"plots/by-util/{key}.png")
    plt.close()
    print(f"Done with {key}")

Done with gcov_cov
Done with klee_BCov
Done with klee_ICov
Done with num_errors
