In [None]:
pip install pandas matplotlib scipy numpy seaborn scikit-learn

In [None]:
import copy
import pandas
import warnings
import requests
import scipy
from scipy import stats
import subprocess
import numpy as np
from matplotlib import pyplot as plt
import json
import pprint
import seaborn
import os
import sys

sys.path.insert(0, "../src")
from perf_tools.analysis import make_differential_frame, get_data, get_summary_statistics
from perf_tools.analysis import check_are_close, make_latency_plot, plot_latency_stats

In [None]:
class ContinuousRWWorkload:
    """Lazy loader and plotting helper for one ContinuousRW task execution.

    Metric files are looked up at
    ``<workdir>/<patch_id>/<variant>/<task_name>/<execution>/<metric>.json``
    and parsed with ``get_data`` the first time they are requested; the parsed
    object is cached on the instance afterwards.

    The objects returned by ``get_data`` are used here through their
    ``diff_data``, ``fixed_data`` and ``raw_data`` attributes; ``diff_data`` is
    sliced and plotted like a pandas DataFrame (assumed — confirm against
    ``perf_tools.analysis``).
    """

    # Figure size used by every plot unless the caller passes its own.
    DEFAULT_FIGSIZE = (20, 20)

    def __init__(self, workdir, patch_id, variant, execution, task_name):
        """Record where the execution's metric files live; nothing is loaded yet.

        Args:
            workdir: Root directory of the dataset.
            patch_id: Patch identifier (a path component).
            variant: Build variant name (a path component).
            execution: Execution number; converted with ``str`` for the path.
            task_name: Task name (a path component).
        """
        self.workdir = workdir
        self.patch_id = patch_id
        self.variant = variant
        self.execution = execution
        self.task_name = task_name
        # Caches, filled on first access by the matching get_*_data method.
        self.insert_data = None
        self.find_data = None
        self.update_data = None
        self.crud_data = None
        self.overall_throughput_data = None

    def json_path(self, metric):
        """Return the path of the ``<metric>.json`` file for this execution."""
        return os.path.join(self.workdir, self.patch_id, self.variant,
            self.task_name, str(self.execution), metric + ".json")

    def _load_metric(self, attr, metric):
        """Return the cache stored in ``self.<attr>``, loading ``metric`` on first use."""
        cached = getattr(self, attr)
        if cached is None:
            cached = get_data(self.json_path(metric))
            setattr(self, attr, cached)
        return cached

    def get_insert_data(self):
        """Return the (cached) data parsed from ``ContinuousRW.insert.json``."""
        return self._load_metric("insert_data", "ContinuousRW.insert")

    def get_find_data(self):
        """Return the (cached) data parsed from ``ContinuousRW.find.json``."""
        return self._load_metric("find_data", "ContinuousRW.find")

    def get_update_data(self):
        """Return the (cached) data parsed from ``ContinuousRW.update.json``."""
        return self._load_metric("update_data", "ContinuousRW.update")

    def get_crud_data(self):
        """Return the (cached) data parsed from ``ContinuousRW.Crud.json``."""
        return self._load_metric("crud_data", "ContinuousRW.Crud")

    def get_overall_throughput_data(self):
        """Return a frame of cumulative throughput over inserts, updates and finds.

        The ``ts`` and ``d(ops)`` columns of the three operations are merged
        and sorted by ``ts``; three columns are then added:

        * ``duration``   - seconds elapsed since the earliest ``ts``
        * ``total_ops``  - running sum of ``d(ops)``
        * ``throughput`` - ``total_ops / duration``

        The result is computed once and cached on the instance.
        """
        if self.overall_throughput_data is None:
            columns = ["ts", "d(ops)"]
            frames = [
                self.get_insert_data().diff_data[columns],
                self.get_update_data().diff_data[columns],
                self.get_find_data().diff_data[columns],
            ]
            all_ops = (
                pandas.concat(frames, ignore_index=True)
                .sort_values("ts")
                .reset_index(drop=True)
            )
            elapsed = all_ops["ts"] - all_ops["ts"].iloc[0]
            if elapsed.dtype.kind == "m":
                # Timedelta result (datetime-like ts): convert directly so the
                # answer does not depend on the platform's default int width
                # or on the timedelta resolution.
                all_ops["duration"] = elapsed.dt.total_seconds()
            else:
                # Numeric ts: treated as nanoseconds, as the original
                # division by 1e9 did.
                all_ops["duration"] = elapsed.astype("int64") / 1_000_000_000
            all_ops["total_ops"] = all_ops["d(ops)"].cumsum()
            # NOTE: duration is 0 for the earliest sample(s), so their
            # throughput is a division by zero (inf/NaN), not a finite rate.
            all_ops["throughput"] = all_ops["total_ops"] / all_ops["duration"]
            self.overall_throughput_data = all_ops
        return self.overall_throughput_data

    def _plot_line_or_scatter(self, df, x, y, line=False, start=None, end=None, **kwargs):
        """Plot column ``y`` against ``x`` for the row slice ``df[start:end]``.

        Draws a line plot when ``line`` is true and a scatter plot otherwise.
        Extra keyword arguments go to the pandas plotting call; ``figsize``
        defaults to ``DEFAULT_FIGSIZE`` but may be overridden. Returns whatever
        the pandas plotting call returns.
        """
        kwargs.setdefault("figsize", self.DEFAULT_FIGSIZE)
        window = df[start:end]
        plotter = window.plot if line else window.plot.scatter
        return plotter(x=x, y=y, **kwargs)

    def _plot_operation(self, data, operation, x, y, line, start, end, kwargs):
        """Plot ``data.diff_data`` with a default title naming ``operation``."""
        # setdefault keeps the generated title while letting callers pass their own.
        kwargs.setdefault("title", f"{self.variant}-{self.task_name} {operation} {y}")
        return self._plot_line_or_scatter(data.diff_data, x, y, line, start, end, **kwargs)

    def plot_insert_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot ``y`` against ``x`` for the insert data; see ``_plot_line_or_scatter``."""
        return self._plot_operation(self.get_insert_data(), "inserts", x, y, line, start, end, kwargs)

    def plot_find_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot ``y`` against ``x`` for the find data; see ``_plot_line_or_scatter``."""
        return self._plot_operation(self.get_find_data(), "finds", x, y, line, start, end, kwargs)

    def plot_update_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot ``y`` against ``x`` for the update data; see ``_plot_line_or_scatter``."""
        return self._plot_operation(self.get_update_data(), "updates", x, y, line, start, end, kwargs)

    def plot_crud_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot ``y`` against ``x`` for the crud data; see ``_plot_line_or_scatter``."""
        return self._plot_operation(self.get_crud_data(), "crud", x, y, line, start, end, kwargs)

    def plot_rw_data(self, x, y, start=None, end=None, noupdate=False):
        """Draw insert, find and (unless ``noupdate``) update data on one figure.

        Each operation is a line of ``y`` against ``x`` over the row slice
        ``[start:end]``. The figure is shown immediately; nothing is returned.
        """
        insert_df = self.get_insert_data().diff_data[start:end]
        find_df = self.get_find_data().diff_data[start:end]
        # Only name the operations that are actually drawn.
        operations = "inserts, finds" if noupdate else "inserts, updates, finds"
        plt.figure(figsize=self.DEFAULT_FIGSIZE)
        plt.ylabel(y)
        plt.xlabel(x)
        plt.title(f"{self.variant}-{self.task_name} {operations} {y}")
        if not noupdate:
            update_df = self.get_update_data().diff_data[start:end]
            plt.plot(update_df[x], update_df[y], alpha=0.8, label=f"update {y}")
        plt.plot(find_df[x], find_df[y], alpha=0.8, label=f"find {y}")
        plt.plot(insert_df[x], insert_df[y], alpha=0.8, label=f"insert {y}")
        plt.legend()
        plt.show()

    def plot_throughput_data(self, x, start=None, end=None):
        """Line-plot the overall ``throughput`` column against ``x`` for rows ``[start:end]``."""
        tpdf = self.get_overall_throughput_data()
        title = f"{self.variant}-{self.task_name} overall throughput"
        return tpdf[start:end].plot(x=x, y="throughput", ylabel="ops per sec", title=title,
                                    figsize=self.DEFAULT_FIGSIZE)

    @staticmethod
    def _summarize(data):
        """Run ``get_summary_statistics`` on the three views of a loaded metric."""
        return get_summary_statistics(data.diff_data, data.fixed_data, data.raw_data)

    def get_insert_summary_statistics(self):
        """Return ``get_summary_statistics`` for the insert data."""
        return self._summarize(self.get_insert_data())

    def get_find_summary_statistics(self):
        """Return ``get_summary_statistics`` for the find data."""
        return self._summarize(self.get_find_data())

    def get_update_summary_statistics(self):
        """Return ``get_summary_statistics`` for the update data."""
        return self._summarize(self.get_update_data())

    def get_crud_summary_statistics(self):
        """Return ``get_summary_statistics`` for the crud data."""
        return self._summarize(self.get_crud_data())

    def print_all_summary_statistics(self, noupdate=False):
        """Pretty-print insert and find summaries, plus update unless ``noupdate``."""
        pp = pprint.PrettyPrinter()
        print("INSERT SUMMARY STATS:")
        pp.pprint(self.get_insert_summary_statistics())
        print("FIND SUMMARY STATS:")
        pp.pprint(self.get_find_summary_statistics())
        if not noupdate:
            print("UPDATE SUMMARY STATS:")
            pp.pprint(self.get_update_summary_statistics())

class ContinuousRWWorkloadWithCompact(ContinuousRWWorkload):
    """ContinuousRW execution that additionally exposes compaction metrics.

    On top of the base class this lazily loads ``Compactor.compact`` and the
    ``ContinuousRWCompactInProgress`` find/update metric files, and can overlay
    the in-progress data (in orange) on the regular find/update plots.
    """

    def __init__(self, workdir, patch_id, variant, execution, task_name):
        """Initialise the base workload and the (empty) compaction caches."""
        super().__init__(workdir, patch_id, variant, execution, task_name)
        # Filled on first access by the get_* methods below.
        self.compact_data = None
        self.compacting_find_data = None
        self.compacting_update_data = None

    def get_compact_data(self):
        """Return the (cached) data parsed from ``Compactor.compact.json``."""
        if self.compact_data is not None:
            return self.compact_data
        self.compact_data = get_data(self.json_path("Compactor.compact"))
        return self.compact_data

    def get_compacting_find_data(self):
        """Return the (cached) data from ``ContinuousRWCompactInProgress.find.json``."""
        if self.compacting_find_data is not None:
            return self.compacting_find_data
        self.compacting_find_data = get_data(self.json_path("ContinuousRWCompactInProgress.find"))
        return self.compacting_find_data

    def get_compacting_update_data(self):
        """Return the (cached) data from ``ContinuousRWCompactInProgress.update.json``."""
        if self.compacting_update_data is not None:
            return self.compacting_update_data
        self.compacting_update_data = get_data(self.json_path("ContinuousRWCompactInProgress.update"))
        return self.compacting_update_data

    def plot_compact_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot ``y`` against ``x`` for the compact data (line or scatter)."""
        heading = f"{self.variant}-{self.task_name} compacts {y}"
        frame = self.get_compact_data().diff_data
        return self._plot_line_or_scatter(frame, x, y, line, start, end, title=heading, **kwargs)

    def plot_combined_find_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot the find data, then overlay the compact-in-progress finds in orange."""
        base_axes = self.plot_find_data(x, y, line, start, end, **kwargs)
        overlay = self.get_compacting_find_data().diff_data
        # Draw onto the axes returned by the first plot so both series share a figure.
        return self._plot_line_or_scatter(
            overlay, x, y, line, start, end, ax=base_axes, color="orange", **kwargs
        )

    def plot_combined_update_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot the update data, then overlay the compact-in-progress updates in orange."""
        base_axes = self.plot_update_data(x, y, line, start, end, **kwargs)
        overlay = self.get_compacting_update_data().diff_data
        # Draw onto the axes returned by the first plot so both series share a figure.
        return self._plot_line_or_scatter(
            overlay, x, y, line, start, end, ax=base_axes, color="orange", **kwargs
        )


In [None]:
def dump_dataframe(df, outfile="./temp_dataframe.out", cols=None):
    """Write selected columns of ``df`` to a text file via ``DataFrame.to_string``.

    Args:
        df: Frame to dump; it must contain every column named in ``cols``.
        outfile: Destination path, overwritten if it exists. Defaults to
            ``./temp_dataframe.out``.
        cols: Column names to write, in order. Defaults to the throughput and
            latency columns listed in the body.

    Raises:
        KeyError: If ``df`` lacks one of the requested columns. The output
            file is not touched in that case.
    """
    if cols is None:
        cols = ["total_ops", "ts", "actor_id", "throughput", "duration", "pure_latency(ms)", "overhead_latency(ms)", "total_latency(ms)"]
    # Render before opening the file so a failed column lookup cannot leave a
    # truncated, empty dump behind.
    rendered = df[list(cols)].to_string()
    with open(outfile, "wt", encoding="utf-8") as ostream:
        ostream.write(rendered)

In [None]:

# Build-variant directory names, keyed by a short topology label.
VARIANTS = {
    "replset": "linux-3-node-replSet-qebench",
    "sharded": "linux-shard-lite-qebench",
}
# Dataset roots for the runs without and with compaction.
NOCOMPACT_WORKDIR = "../datasets/genny/continuousrw_nocompact"
COMPACT_WORKDIR = "../datasets/genny/continuousrw_compact"

# Baseline (no compaction): one patch, execution 0, one workload per topology.
nocompact_patchid = "634405351e2d171a12b2a0a8"
nocompact_taskname = "genny_qebench_continuousrw_nocompact"
nocompact_replset_wld, nocompact_sharded_wld = (
    ContinuousRWWorkload(NOCOMPACT_WORKDIR, nocompact_patchid, VARIANTS[topology], 0, nocompact_taskname)
    for topology in ("replset", "sharded")
)

# Compaction runs: task name -> (patch id, list of execution numbers).
# Only the first execution number in each list is loaded below.
compact_replset_executions = {
    "genny_qebench_continuousrw_compact_1024": ("634451da57e85a772174286f", [0]),
    "genny_qebench_continuousrw_compact_128": ("634451da57e85a772174286f", [0]),
    "genny_qebench_continuousrw_compact_256": ("634451da57e85a772174286f", [0]),
    "genny_qebench_continuousrw_compact_512": ("634451da57e85a772174286f", [0]),
}
compact_replset_wlds = {
    task: ContinuousRWWorkloadWithCompact(
        COMPACT_WORKDIR, patch_id, VARIANTS["replset"], executions[0], task
    )
    for task, (patch_id, executions) in compact_replset_executions.items()
}

compact_sharded_executions = {
    "genny_qebench_continuousrw_compact_1024": ("6347058fe3c331787727d7c3", [0]),
    "genny_qebench_continuousrw_compact_128": ("6347058fe3c331787727d7c3", [1]),
    "genny_qebench_continuousrw_compact_256": ("6347058fe3c331787727d7c3", [0]),
    "genny_qebench_continuousrw_compact_512": ("6347058fe3c331787727d7c3", [0]),
}
compact_sharded_wlds = {
    task: ContinuousRWWorkloadWithCompact(
        COMPACT_WORKDIR, patch_id, VARIANTS["sharded"], executions[0], task
    )
    for task, (patch_id, executions) in compact_sharded_executions.items()
}


In [None]:
# Column names shared by the report cells: `row` is passed as the x argument
# and `col_latency` as the y argument of the plot calls.
row = "total_ops"
col_latency = "pure_latency(ms)"

# Module-level pretty-printer for ad-hoc inspection.
pp = pprint.PrettyPrinter()

In [None]:
# Replica-set run without compaction: combined plot first, then one plot per
# operation, overall throughput, and finally the summary statistics.
nocompact_replset_wld.plot_rw_data(row, col_latency)
for _render in (
    nocompact_replset_wld.plot_insert_data,
    nocompact_replset_wld.plot_find_data,
    nocompact_replset_wld.plot_update_data,
):
    _render(row, col_latency)
nocompact_replset_wld.plot_throughput_data(row)
nocompact_replset_wld.print_all_summary_statistics()

In [None]:
# Sharded run without compaction: combined plot first, then one plot per
# operation, overall throughput, and finally the summary statistics.
nocompact_sharded_wld.plot_rw_data(row, col_latency)
for _render in (
    nocompact_sharded_wld.plot_insert_data,
    nocompact_sharded_wld.plot_find_data,
    nocompact_sharded_wld.plot_update_data,
):
    _render(row, col_latency)
nocompact_sharded_wld.plot_throughput_data(row)
nocompact_sharded_wld.print_all_summary_statistics()

In [None]:
# Numeric task-name suffixes to report for the replica-set compaction runs.
# Deliberately not named `filter`: a module-level `filter` would shadow the
# builtin for every later cell in the kernel session.
replset_sizes_to_plot = [128, 256, 512, 1024]
for task, wld in compact_replset_wlds.items():
    # Task names end in "_<number>"; skip tasks whose number is not selected.
    if int(task.rsplit("_", 1)[-1]) not in replset_sizes_to_plot:
        continue
    wld.plot_rw_data(row, col_latency)
    wld.plot_insert_data(row, col_latency)
    # Find/update plots use "ts" on the x axis and overlay the
    # compact-in-progress series on the regular one.
    wld.plot_combined_find_data("ts", col_latency, True)
    wld.plot_combined_update_data("ts", col_latency, True)
    wld.plot_compact_data(row, col_latency, False)
    wld.print_all_summary_statistics()

In [None]:
# Numeric task-name suffixes to report for the sharded compaction runs.
# Deliberately not named `filter`: a module-level `filter` would shadow the
# builtin for every later cell in the kernel session.
sharded_sizes_to_plot = [128, 256]
for task, wld in compact_sharded_wlds.items():
    # Task names end in "_<number>"; skip tasks whose number is not selected.
    if int(task.rsplit("_", 1)[-1]) not in sharded_sizes_to_plot:
        continue
    wld.plot_rw_data(row, col_latency)
    wld.plot_insert_data(row, col_latency)
    # Find/update plots use "ts" on the x axis and overlay the
    # compact-in-progress series on the regular one.
    wld.plot_combined_find_data("ts", col_latency, True)
    wld.plot_combined_update_data("ts", col_latency, True)
    wld.plot_compact_data(row, col_latency, False)
    wld.print_all_summary_statistics()