In [None]:
import pandas as pd
import glob
import json
import dotted # https://pypi.org/project/dotted-notation/
import re
import matplotlib.pyplot as plt
import numpy as np

from pathlib import Path
import seaborn as sns
import lib.datasciencetoolbelt as dstools
from lib.resultstorage import ResultStorage

In [None]:
# Toolbelt options: seaborn context "talk", and the savefig section enabled
# with ./postprocess_results as its directory (the exact semantics are defined
# by lib.datasciencetoolbelt).
dstools.setup(dict(
    seaborn_context="talk",
    savefig=dict(
        enable=True,
        dir=Path("./postprocess_results"),
    ),
))
# Handle on the stored benchmark results below ./results.
result_storage = ResultStorage(Path("./results"))

#%matplotlib qt
%matplotlib inline


In [None]:
# Prefix handed to result_storage.iter_results() to select the benchmark runs
# processed by this notebook.
result_storage_prefix = "ncommmitters_scalability_i60pc62__v1"

# Identifying variables of a benchmark run, one triple per variable:
#   (dotted path into the result JSON, short name, type used to parse the value)
# Commented-out entries are currently not used as id vars.
id_vars__dottedpath_and_shortname_and_type = [
    # ("zfs_setup.module_args.zfs.zfs_zil_itxg_bypass", "itxg_bypass", str),
    # ("zfs_setup.module_args.zfs.zvol_request_sync", "zvol_request_sync", str), # technically not in the v6 set, but that's just because we limited the scope of the benchmark for time reasons
    ("zfs_setup.module_args.zfs.zfs_zil_pmem_prb_ncommitters", "ncommitters", int),
    # ("fio_config.fsync_every", "fsync_every", int),
    ("fio_config.numjobs", "numjobs", int),
]

# Short names only, in declaration order.
id_vars = [
    shortname
    for _dottedpath, shortname, _type in id_vars__dottedpath_and_shortname_and_type
]

def extract_id_var_values(output_json):
    """Extract the identifying variables of one benchmark result.

    For every ``(dotted path, short name, type)`` triple in
    ``id_vars__dottedpath_and_shortname_and_type`` the value stored at the
    dotted path is looked up in ``output_json`` and converted with the
    triple's type.

    Args:
        output_json: Parsed result of one benchmark run. Its ``"file"`` entry
            is used in the "not found" error message.

    Returns:
        A dict mapping each short name to its converted value.

    Raises:
        Exception: If a dotted path yields no value, if a short name occurs
            more than once, or if a value cannot be converted (the original
            ``ValueError`` is chained as the cause).
    """
    d = output_json
    id_var_values = {}
    for dp, sn, ty in id_vars__dottedpath_and_shortname_and_type:
        v = dotted.get(d, dp)
        # Only None means "nothing at this path". A plain truthiness test
        # would also reject legitimate falsy settings such as 0, False or "".
        if v is None:
            raise Exception(f"{d['file']}: dotted path {dp} not found")
        if sn in id_var_values:
            raise Exception(f"duplicate shortname {sn}")
        try:
            id_var_values[sn] = ty(v)
        except ValueError as e:
            raise Exception(f"cannot parse v={v!r}") from e
    return id_var_values

def get_fio_write_metrics(output_json):
    """Return the ``write`` section of the single fio job in a result.

    The job list is read from ``fio_jsonplus.jobs``; the function asserts
    that it holds exactly one job.
    """
    fio_jobs = dotted.get(output_json, "fio_jsonplus.jobs")
    assert len(fio_jobs) == 1
    return fio_jobs[0]["write"]

def to_fio_results_dict(output_json):
    """Build one flat row of fio write statistics for a benchmark result.

    The row starts with the id vars and is followed by the mean and standard
    deviation of the write IOPS (``iops_mean`` / ``iops_stddev``) and of the
    write latency (``lat_ns.mean`` / ``lat_ns.stddev``).
    """
    write_metrics = get_fio_write_metrics(output_json)
    row = dict(extract_id_var_values(output_json))
    row["w_iops_mean"] = write_metrics["iops_mean"]
    row["w_iops_stddev"] = write_metrics["iops_stddev"]
    row["w_lat_mean"] = dotted.get(write_metrics, "lat_ns.mean")
    row["w_lat_stddev"] = dotted.get(write_metrics, "lat_ns.stddev")
    return row

def to_kstat_results_dict(output_json):
    """Build one flat row of kstat counters for a benchmark result.

    The row holds, in this order: the id vars, every entry of the four stats
    sections (later sections override earlier ones on key clashes), and a
    handful of short aliases for individual counters.
    """
    result = output_json
    row = dict(extract_id_var_values(result))

    # Flatten the stats sections into the row, in the original merge order.
    for section in (
        "zvol_stats",
        "itxg_bypass_stats",
        "zil_pmem_stats",
        "zil_pmem_ringbuf_stats",
    ):
        row.update(result[section])

    row["bio_total"] = result["zvol_stats"]["submit_bio__zvol_write(with_taskq_if_enabled)"]

    # Short alias -> dotted path of the counter it refers to.
    aliases = (
        ("taskq_delay", 'zvol_stats.zvol_write__taskq_qdelay'),
        ("assign_aquire", 'itxg_bypass_stats.assign__aquisition_total'),
        ("assign_vtable", 'itxg_bypass_stats.assign__vtable'),
        ("assign_total", 'itxg_bypass_stats.assign__total'),
        ("commit_total", 'itxg_bypass_stats.commit__total'),
        ("commit_aquire", 'itxg_bypass_stats.commit__aquire'),
    )
    for alias, dottedpath in aliases:
        row[alias] = dotted.get(result, dottedpath)
    return row

def to_cpu_dict(output_json):
    """Build one flat row of CPU time components for a benchmark result.

    The row starts with the id vars; every entry of ``cpu_time.allcpu`` is
    added under its name prefixed with ``cpu_``.
    """
    row = dict(extract_id_var_values(output_json))
    allcpu = dotted.get(output_json, "cpu_time.allcpu")
    for component, value in allcpu.items():
        row[f"cpu_{component}"] = value
    return row

In [None]:
# One row of fio write statistics per stored result.
rows = [
    to_fio_results_dict(result)
    for result in result_storage.iter_results(result_storage_prefix)
]
# verify_integrity=True raises if two results share the same id-var combination.
df = pd.DataFrame(rows).set_index(id_vars, verify_integrity=True)
df

In [None]:
# df

In [None]:
# One row of CPU time components per stored result.
rows = [
    to_cpu_dict(result)
    for result in result_storage.iter_results(result_storage_prefix)
]
# verify_integrity=True raises if two results share the same id-var combination.
df_cpu = pd.DataFrame(rows).set_index(id_vars, verify_integrity=True)
# df_cpu

In [None]:
# Total CPU time per run = sum of the raw cpu_* components. The two derived
# columns written below are excluded from the sum, so re-running this cell
# does not fold them into the total and silently inflate it.
cpu_total = (
    df_cpu
    .drop(columns=['cpu_not_idle', 'cpu_utilization'], errors='ignore')
    .sum(axis=1)
)
df_cpu['cpu_not_idle'] = cpu_total - df_cpu.cpu_idle
# second socket was disabled => half of total cpu time is idle time, so the
# usable CPU time is the other half (cpu_total - cpu_total/2 == cpu_total/2)
df_cpu['cpu_utilization'] = df_cpu.cpu_not_idle / (cpu_total / 2)
# df_cpu

In [None]:
# df_cpu.unstack('ncommitters')['cpu_utilization'].plot(figsize=(15,10))

In [None]:
# seems plausible, join with df
# df is rebound to the joined frame, so on a re-run of this cell it already
# carries the CPU columns and a plain join would fail on the overlapping
# names. Dropping any such stale columns first keeps the cell re-runnable.
df = df.drop(columns=df_cpu.columns, errors='ignore').join(df_cpu)
df

# Committer Slot Histogram

In [None]:
# One row of kstat counters per stored result.
rows = [
    to_kstat_results_dict(result)
    for result in result_storage.iter_results(result_storage_prefix)
]
# verify_integrity=True raises if two results share the same id-var combination.
df_kstat = pd.DataFrame(rows).set_index(id_vars, verify_integrity=True)
df_kstat.columns

In [None]:
# Histogram buckets are the kstat columns whose name starts with this prefix.
bucketprefix = "prb_write__committerslothist_b_"
buckets = [col for col in df_kstat.columns if col.startswith(bucketprefix)]
# Strip the prefix so that only the bucket label remains as column name.
rename = {col: col[len(bucketprefix):] for col in buckets}
df_cslot = (
    df_kstat[buckets]
    .copy()
    .rename(columns=rename)
    .rename_axis(columns='bucket')
)
df_cslot

In [None]:
# The 'other' bucket must be zero in every run before it is discarded.
assert df_cslot['other'].eq(0).all()
# Discard it.
df_cslot = df_cslot.drop(columns='other')

In [None]:
# Wide -> long: one row per (id vars..., bucket) holding that bucket's count.
tmp = (
    df_cslot
    .stack()
    .rename('count')
    .reset_index()
    .set_index(id_vars + ["bucket"])
)
df_cslot = tmp

In [None]:
# Convert the 'bucket' index level to int64.
tmp = (
    df_cslot
    .reset_index()
    .astype({'bucket': 'int64'})
    .set_index(id_vars + ["bucket"])
)
df_cslot = tmp

In [None]:
# Add a 'weight' column: bucket number + 1.
tmp = (
    df_cslot
    .reset_index()
    .assign(weight=lambda frame: frame['bucket'] + 1)
    .set_index(id_vars + ["bucket"])
)
df_cslot = tmp

## Committer Slot Distribution


In [None]:
# Counts in wide form: one row per id-var combination, one column per bucket.
df_cslot.loc[:, 'count'].unstack(level='bucket')

In [None]:
# Counts in wide form (one column per bucket), keeping only those bucket
# columns that are non-zero in at least one run.
# https://stackoverflow.com/questions/21164910/how-do-i-delete-a-column-that-contains-only-zeros-in-pandas
tmp = (
    df_cslot['count']
    .unstack('bucket')
    .loc[:, lambda counts: counts.ne(0).any(axis=0)]
)

In [None]:
# Normalize every row by its own total, turning counts into per-run shares.
tmp = tmp.divide(tmp.sum(axis='columns'), axis='index')

In [None]:
# Distinct ncommitters settings in ascending order, as plain Python ints.
ncommitters_values = sorted(tmp.index.get_level_values('ncommitters').unique().tolist())
print(ncommitters_values)
# One stacked area chart per ncommitters setting. The charts have no legend
# and look alike, so each one is titled with the setting it shows.
for i in ncommitters_values:
    tmp.query('ncommitters == @i').plot.area(
        figsize=(15,1.5),
        legend=False,
        title=f"ncommitters = {i}",
    )

## Average Committer Slot

In [None]:
# Total number of histogram samples per id-var combination.
countsum = (
    df_cslot['count']
    .unstack(level='bucket')
    .sum(axis='columns')
)
countsum

In [None]:
# Sum over all buckets of count * weight, per id-var combination.
weightedcount = (
    df_cslot['count']
    .mul(df_cslot['weight'])
    .unstack(level='bucket')
    .sum(axis='columns')
)
weightedcount

In [None]:
# Weighted mean of the bucket weights, plotted with one line per ncommitters value.
(
    (weightedcount / countsum)
    .to_frame(name='avg_committer_slot')
    .unstack('ncommitters')
    .plot(figsize=(20,15))
)

# CPU Time Per IOP

In [None]:
# Keep only the runs whose ncommitters value is one of 1..16. df is rebound,
# so every later cell that reads df sees the filtered frame.
df = df.query('ncommitters in [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]')


In [None]:
# Mean write IOPS, indexed by numjobs, one line per ncommitters value.
# To thin out the plot, filter df first, e.g. with
#   df.query('numjobs in [1,4,8,16] and ncommitters in [1,2,4,8,16]')
data = df.loc[:, ["w_iops_mean"]].unstack("ncommitters")
data.plot(figsize=(15,12))

In [None]:
# CPU utilization, indexed by numjobs, one line per ncommitters value.
# To thin out the plot, filter df first, e.g. with
#   df.query('numjobs in [1,4,8,16] and ncommitters in [1,2,4,8,16]')
data = df.loc[:, ["cpu_utilization"]].unstack("ncommitters")
data.plot(figsize=(15,12))

In [None]:
# Non-idle CPU time divided by the mean write IOPS, indexed by numjobs, one
# line per ncommitters value. query() resolves 'ncommitters' against the index
# level directly, so no reset_index()/set_index() round trip is needed.
# NOTE(review): w_iops_mean is a rate, so this ratio is "CPU time per IOP"
# only up to the run duration - confirm that all runs have the same runtime.
data = (
    df
    .query('ncommitters in [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]')
    .assign(cpu_per_iop=lambda frame: frame.cpu_not_idle / frame.w_iops_mean)
    .loc[:, ["cpu_per_iop"]]
    .unstack("ncommitters")
)
data.plot(figsize=(15,10))