In [None]:
import pandas as pd
import glob
import json
import dotted # https://pypi.org/project/dotted-notation/
import re
import matplotlib.pyplot as plt
import json

from pathlib import Path
import seaborn as sns

In [None]:
import lib.datasciencetoolbelt as dstools
from lib.resultstorage import ResultStorage

In [None]:
#%matplotlib qt
%matplotlib inline

#  Dependencies

In [None]:
import itertools

def filter_by_index_value(df, level, filter):
    """Select the rows of `df` whose index value at `level` satisfies `filter`.

    `filter` is applied to the index values at level `level` (via `Index.map`)
    and the result is used as a boolean row mask. A new frame is returned.
    """
    level_values = df.index.get_level_values(level)
    keep_mask = level_values.map(filter)
    return df[keep_mask]

def remove_index_dimension(df, level, value):
    """Slice a multi-indexed DataFrame at one index level and drop that level.

    Keeps only the rows whose index has `value` at `level`. Every remaining
    row then carries the same value at that level, so the level is removed
    from the index and the result has one index dimension less.

    Raises AssertionError if no row matches `value` (the set of remaining
    level values is then empty instead of `{value}`).
    """
    matches = df.index.get_level_values(level) == value
    selected = df[matches]
    remaining_values = set(selected.index.get_level_values(level))
    assert remaining_values == {value}
    selected.index = selected.index.droplevel(level)
    return selected

def _test_remove_index_dimension():
    """Smoke-test `remove_index_dimension` on a 2x2 multi-indexed frame.

    The input and both reductions are displayed for visual inspection, and
    the expected index and contents are asserted so that a regression fails
    the cell instead of going unnoticed.
    """
    records = [
        {"favnum": num, "favletter": letter, "id": row_id}
        for row_id, (num, letter) in enumerate(itertools.product([23, 42], ["a", "b"]))
    ]
    d = pd.DataFrame(records).set_index(["favnum", "favletter"])
    display(d)

    # Fixing favnum=23 leaves the favletter level with rows a, b (ids 0, 1).
    by_num = remove_index_dimension(d, "favnum", 23)
    display(by_num)
    assert by_num.index.name == "favletter"
    assert list(by_num.index) == ["a", "b"]
    assert list(by_num["id"]) == [0, 1]

    # Fixing favletter="b" leaves the favnum level with rows 23, 42 (ids 1, 3).
    by_letter = remove_index_dimension(d, "favletter", "b")
    display(by_letter)
    assert by_letter.index.name == "favnum"
    assert list(by_letter.index) == [23, 42]
    assert list(by_letter["id"]) == [1, 3]

    # The input frame must be left untouched.
    assert d.index.nlevels == 2
    assert len(d) == 4


_test_remove_index_dimension()

In [None]:
def level_values_sorted_unique(df, level):
    """Return the distinct values at index level `level` of `df` as an ascending list."""
    distinct_values = set(df.index.get_level_values(level))
    return sorted(distinct_values)

class AttrDict(dict):
    """A dict whose entries are also reachable as attributes.

    The instance `__dict__` is bound to the dict itself, so `obj.key` and
    `obj["key"]` read and write the same entry.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Alias attribute storage to the mapping itself.
        self.__dict__ = self
        
class FactorizedDataFrameItem(AttrDict):
    """One facet produced by `FactorizedDataFrame.iter_factorized`.

    `title` reads the entries `fdf` (the owning FactorizedDataFrame),
    `rv` (row factor value) and `cv` (column factor value).
    """

    @property
    def title(self):
        """Facet title: `row=value` and/or `col=value` joined by `|`; empty if neither factor is set."""
        parts = []
        if self.fdf.row:
            parts.append(f"{self.fdf.row}={self.rv}")
        if self.fdf.col:
            parts.append(f"{self.fdf.col}={self.cv}")
        return "|".join(parts)
            
        
class FactorizedDataFrame:
    """Splits a multi-indexed DataFrame into one sub-frame per (row, col) factor value.

    `row` and `col` name index levels of `data`. Either may be falsy (e.g.
    None), in which case the data is not split along that direction and the
    corresponding value list is `[None]`.
    """

    def __init__(self, data, row, col):
        self.data = data
        self.col = col
        self.row = row

        # Sorted distinct values per factor level; [None] is the placeholder for "no factor".
        self.col_values = [None] if not self.col else level_values_sorted_unique(self.data, self.col)
        self.row_values = [None] if not self.row else level_values_sorted_unique(self.data, self.row)

    def iter_factorized(self):
        """Yield one FactorizedDataFrameItem per (col value, row value) combination.

        Each item carries:
          fdf          this FactorizedDataFrame
          d            the sub-frame, with the row/col factor levels removed from its index
          ri, rv       row position and row factor value (rv is None without a row factor)
          ci, cv       column position and column factor value (cv is None without a column factor)
          is_last_row  True for the last row position
          is_last_col  True for the last column position

        Iteration is column-major: all rows of the first column, then the next column.
        """
        last_ri = len(self.row_values) - 1
        last_ci = len(self.col_values) - 1

        for ci, c in enumerate(self.col_values):
            for ri, r in enumerate(self.row_values):
                d = self.data.copy()
                # Decide by whether a factor LEVEL is configured, not by the
                # truthiness of the factor VALUE: values such as 0, "" or
                # False are legitimate level values and must still be
                # filtered on and dropped from the index.
                if self.col:
                    d = remove_index_dimension(d, self.col, c)
                if self.row:
                    d = remove_index_dimension(d, self.row, r)

                yield FactorizedDataFrameItem({
                    "fdf": self,
                    "d": d,
                    "ri": ri,
                    "rv": r,
                    "ci": ci,
                    "cv": c,
                    "is_last_row": ri == last_ri,
                    "is_last_col": ci == last_ci,
                })
                

def factorplot(data=None, row=None, col=None, plot=None, subplots_kw={}):
    """Factorize the MultiIndex'ed DataFrame `data`, then invoke `plot` for each FactorizedDataFrameItem.

    Parameters:
      data         multi-indexed DataFrame to split
      row, col     names of the index levels that span the subplot grid rows
                   and columns; a falsy value means "do not split in that direction"
      plot         callback `plot(item, ax, legend)`; `legend` is True only for
                   the facet in the last row and last column
      subplots_kw  extra keyword arguments for `plt.subplots`. They override the
                   default `gridspec_kw` wholesale (no deep merge); `squeeze` is
                   always forced to False. The default dict is never mutated.

    Returns None; the figure is created through pyplot.
    """
    fdf = FactorizedDataFrame(data, row, col)

    subplots_kw = {
        "gridspec_kw": {'hspace': 1},
        **subplots_kw,
        "squeeze": False,  # axes should always be two-dimensional
    }

    _, axes = plt.subplots(len(fdf.row_values), len(fdf.col_values), **subplots_kw)

    for f in fdf.iter_factorized():
        ax = axes[f.ri, f.ci]
        ax.set_title(f.title)
        # Only the bottom-right facet gets a legend.
        legend = f.is_last_row and f.is_last_col
        plot(f, ax, legend)
        if legend:
            # Place the legend on this facet's axes explicitly instead of on
            # pyplot's "current axes", which a `plot` callback may have changed.
            ax.legend(loc='lower left', bbox_to_anchor=(1, 0.5))

# End Dependencies

In [None]:
# Configure the project-local helper module before any plotting happens.
# NOTE(review): presumably "seaborn_context" selects the seaborn plotting
# context and "savefig" makes figures get written to the given directory;
# confirm against lib.datasciencetoolbelt.
dstools.setup({
    "seaborn_context": "talk",
    "savefig": {
        "enable": True,
        "dir": Path("./postprocess_results"),
    }
})
# Handle used below to iterate over the stored benchmark results in ./results.
result_storage = ResultStorage(Path("./results"))

In [None]:
# Identifying variables of a benchmark result. Each entry is
# (dotted path into the result JSON, short name, type constructor applied to the value).
# The short names become the index levels of the result DataFrame.
id_vars__dottedpath_and_shortname_and_type = [
#     ('result.identity', "benchmark", str),  
    ("storage_stack.identity", "storage_stack", str),
    ("result.fio_config.numjobs", "numjobs", int),
]
id_vars = [shortname for _path, shortname, _type in id_vars__dottedpath_and_shortname_and_type]

def extract_id_var_values(output_json):
    """Extract the identifying variables of one benchmark result.

    For every (dotted path, short name, type) entry of
    `id_vars__dottedpath_and_shortname_and_type`, the value at the dotted
    path in `output_json` is looked up and converted with the type.

    Returns a dict mapping short name -> converted value.

    Raises Exception if a path is missing, a short name occurs twice, or the
    conversion raises ValueError.
    """
    id_var_values = {}
    for dp, sn, ty in id_vars__dottedpath_and_shortname_and_type:
        v = dotted.get(output_json, dp)
        # Only an absent path counts as "not found". Legitimate falsy values
        # such as 0 or "" must be accepted, so test for None rather than truthiness.
        if v is None:
            raise Exception(f"{output_json['file']}: dotted path {dp} not found")
        if sn in id_var_values:
            raise Exception(f"duplicate shortname {sn}")
        try:
            id_var_values[sn] = ty(v)
        except ValueError as e:
            raise Exception(f"cannot parse v={v!r}") from e
    return id_var_values


def get_fio_write_metrics(output_json):
    """Extract fio's write IOPS and latency statistics from a result document.

    `output_json` must contain `fio_jsonplus.jobs` with exactly one job; the
    metrics are read from that job's "write" section.

    Returns a dict with the keys `w_iops_mean`, `w_iops_stddev`,
    `w_lat_mean` and `w_lat_stddev`. The latency values come from
    `lat_ns.mean` / `lat_ns.stddev` (presumably nanoseconds, going by the key
    name; confirm against the fio output format).

    Raises AssertionError if there is not exactly one job.
    """
    jobs = dotted.get(output_json, "fio_jsonplus.jobs")
    assert len(jobs) == 1, f"expected exactly one fio job, got {len(jobs)}"
    jw = jobs[0]["write"]
    return {
        "w_iops_mean": jw["iops_mean"],
        "w_iops_stddev": jw["iops_stddev"],
        "w_lat_mean": dotted.get(jw, "lat_ns.mean"),
        "w_lat_stddev": dotted.get(jw, "lat_ns.stddev"),
    }


def to_row_dict(output_json):
    """Flatten one benchmark result document into a row dict for the result DataFrame.

    The row consists of, in this order:
      - the id variables (see `extract_id_var_values`)
      - "fio_metrics": the dict returned by `get_fio_write_metrics` for `output_json['result']`
      - every entry of `result.latency_analysis`, with the leading '@' stripped from its key

    On a key collision the latency-analysis entry wins, because it is merged last.

    If anything fails, the whole document is printed as JSON to help
    debugging and the original exception is re-raised.
    """
    try:
        latencies = {}
        for k, v in dotted.get(output_json, "result.latency_analysis").items():
            # startswith() also rejects an empty key via the assertion
            # instead of failing with an unrelated IndexError.
            assert k.startswith('@')
            k = k[1:]  # strip leading @
            assert k not in latencies
            latencies[k] = v

        return {
            **extract_id_var_values(output_json),
            "fio_metrics": get_fio_write_metrics(output_json['result']),
            **latencies,
        }
    except Exception:
        # Catch Exception rather than everything, so an interrupt is not
        # answered with a full JSON dump.
        print(json.dumps(output_json))
        raise

# One flattened row dict per stored result of this benchmark.
rows = list(map(to_row_dict, result_storage.iter_results("comparison_zil_overhead_lwb_vs_pmem__v2")))


In [None]:
# Result table: one row per benchmark result, indexed and sorted by the id variables.
df = (
    pd.DataFrame.from_dict(rows)
    .set_index(id_vars)
    .sort_index()
)
display(df)
# display(df / 1_000_000)
# compute zfs write breakdown

In [None]:
# List the available data columns (the id variables live in the index, not here).
df.columns

In [None]:
# Build the per-component latency breakdown from the raw measurements.
tmp = df.copy()

# fio's own metrics are expanded separately into df_fio; drop them here.
del tmp['fio_metrics']

# Set the write count aside and remove it from the working frame.
# NOTE(review): write_count is not used again in the visible cells.
write_count = tmp['zfs_write_count']
del tmp['zfs_write_count']

# Derived components:
#   async           = zfs_write - zil_commit - zfs_log_write
#   zil_persistence = zil_commit - zil_fill_commit_list
# NOTE(review): this presumably treats the measured values as nested time
# spans, so the subtraction yields the remainder; confirm against the tracing setup.
tmp['async'] = tmp.zfs_write - tmp.zil_commit - tmp.zfs_log_write
tmp['zil_persistence'] = tmp.zil_commit - tmp.zil_fill_commit_list

# Only these four components are used by the breakdown plots.
data = tmp[["async", "zfs_log_write", "zil_fill_commit_list", "zil_persistence"]]
df_latbreakdown = data
df_latbreakdown

In [None]:
# Expand the per-row dict of fio metrics into one column per metric
# (w_iops_mean, w_iops_stddev, w_lat_mean, w_lat_stddev), keeping df's index.
tmp = df.copy()
df_fio = tmp['fio_metrics'].apply(pd.Series)
df_fio

# Relative Latency Breakdown

In [None]:
# Normalize each row by its own total so the components become shares that sum to 1.
data = df_latbreakdown.copy()
total = data.sum(axis=1)
# display(data)
# display(total)
data = data.div(total, axis=0)
# display(data)

def plot(f, ax, legend):
    """Draw facet `f` as a stacked bar chart of relative shares on `ax`.

    The y range is fixed to (0, 1.1). On every facet except the last row the
    x-axis label and tick labels are cleared.
    """
    f.d.plot.bar(stacked=True, legend=legend, ylim=(0, 1.1), ax=ax)
    if f.is_last_row:
        return
    ax.set_xlabel("")
    ax.set_xticklabels([])
        
    
# One subplot row per storage stack. With storage_stack removed from the
# index, the remaining index level (numjobs) ends up on each facet's x axis.
# The tighter hspace replaces factorplot's default gridspec_kw.
factorplot(data, row='storage_stack', col=None, plot=plot, subplots_kw={
    "figsize": (10, 10),
    "gridspec_kw": {'hspace': 0.2},
})

# data.loc["zfs-lwb-rs_0", ].plot.area(**kwargs)
# data.loc["zfs-pmem-rs_0-byp_0-nc_3", ].plot.area(**kwargs)

# Absolute Latency Normalized By IOPS

In [None]:
# Divide every latency component by fio's mean write IOPS of the same
# (storage_stack, numjobs) row; axis=0 aligns the divisor on the row index.
data = df_latbreakdown.div(df_fio.w_iops_mean, axis=0)

def plot(f, ax, legend):
    """Draw facet `f` as a stacked bar chart of absolute values on `ax`.

    The y range is fixed to (0, 80_000). On every facet except the last row
    the x-axis label and tick labels are cleared.
    """
    f.d.plot.bar(stacked=True, legend=legend, ylim=(0, 80_000), ax=ax)
    if f.is_last_row:
        return
    ax.set_xlabel("")
    ax.set_xticklabels([])
        
    
# One subplot row per storage stack; the remaining index level (numjobs) is on the x axis.
factorplot(data, row='storage_stack', col=None, plot=plot, subplots_kw={
    "figsize": (10, 10),
    "gridspec_kw": {'hspace': 0.2},
})

# fio-perceived IOPS and Latency For Validation

In [None]:
#ax = df_fio.w_lat_mean.unstack("storage_stack").plot.bar(figsize=(10,5), subplots=True, yticks=range(0, 100_000, 20_000))
# fio's mean write latency: unstacking turns each storage stack into its own
# column, so the plot has one line per stack over the remaining index level (numjobs).
ax = df_fio.w_lat_mean.unstack("storage_stack").plot(figsize=(10,5))
# Anchor the legend at the right edge of the axes.
ax.legend(bbox_to_anchor=(1,0.5))

In [None]:
#ax = df_fio.w_iops_mean.unstack("storage_stack").plot.bar(figsize=(10,5), subplots=True)
# fio's mean write IOPS: one line per storage stack over the remaining index level (numjobs).
ax = df_fio.w_iops_mean.unstack("storage_stack").plot(figsize=(10,5))
# Anchor the legend at the right edge of the axes.
ax.legend(bbox_to_anchor=(1,0.5))

# Delta Between fio-Reported Latency And Measured Latency

In [None]:
# Same IOPS-normalized breakdown as in the absolute-latency section; summing
# the components per row gives the total measured value for each
# (storage_stack, numjobs) combination.
data = df_latbreakdown.div(df_fio.w_iops_mean, axis=0)
total_measured = data.sum(axis=1)
total_measured

In [None]:
# fio's mean write latency per row, shown for comparison with total_measured.
df_fio.w_lat_mean

In [None]:
# Difference between fio's mean write latency and the summed measured
# components, as grouped bars: one bar per storage stack for each numjobs value.
(df_fio.w_lat_mean - total_measured).unstack("storage_stack").plot.bar(figsize=(10,5))

In [None]:
# The statement below is disabled: `plot` is the facet callback function and
# has no `.bar` attribute, so executing it raised AttributeError and aborted
# this cell on "Run All". It is an incomplete fragment with no DataFrame to plot.
# plot.bar(figsize=(10,5), subplots=True, yticks=range(0, 100_000, 20_000))


def plot(f, ax, legend):
    """Draw facet `f` as a stacked bar chart of absolute values on `ax`.

    The y range is fixed to (0, 80_000). On every facet except the last row
    the x-axis label and tick labels are cleared.
    """
    f.d.plot.bar(ax=ax, ylim=(0, 80_000), legend=legend, stacked=True)
    if not f.is_last_row:
        ax.set_xlabel("")
        ax.set_xticklabels([])


# One subplot row per storage stack for the IOPS-normalized breakdown in `data`.
factorplot(data, row='storage_stack', col=None, plot=plot, subplots_kw={
    "figsize": (10, 10),
    "gridspec_kw": {'hspace': 0.2},
})