In [None]:
#pip install pandas matplotlib scipy numpy seaborn natsort

In [None]:
import copy
import pandas
import warnings
import requests
import subprocess
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
import json
import os
import sys

sys.path.insert(0, "../src")
from perf_tools.analysis import make_differential_frame, get_data, get_summary_statistics
from perf_tools.analysis import check_are_close, make_latency_plot, plot_latency_stats

In [None]:
# Build-variant names keyed by a short topology label.
VARIANTS = {
    "replset": "linux-1-node-replSet-fle.2022-11",
    "sharded": "linux-shard-lite-qebench",
}

# Root directory of the downloaded results; DataSetCache.json_path() joins the
# patch id, variant, task name and execution number underneath it.
WORKDIR = "../datasets/genny2/perf1"

# Identifier of the patch whose results are analysed (second path component).
patch_id = "639aa0d661837d305582a2f7"

# NOTE(review): presumably the execution number of the task run to analyse - confirm.
exec_idx = 0

In [None]:

# Experiment matrix for the query benchmarks.
#
# Every entry is crossed over its "contentionFactors" and "threadCounts" when the
# data sets are registered, and the 1-based position of a query inside "queries"
# becomes its query number in the data set names.
EXPERIMENTS = [
    # Experiment Set q.1: Query unencrypted fields on unencrypted collection
    dict(
        name="es1",
        coll="pbl",
        encryptedFieldCount=0,
        threadCounts=[1, 4, 8, 16],
        # contentionFactors=[1, 4, 8, 16],
        contentionFactors=[1],
        queries=[
            dict(field="fixed_10", value="fixed_hf"),
            dict(field="fixed_10", value="uar"),
            dict(field="uar_[1,10]", value="uar"),
            dict(field="uar_[1,10]", value="uar_alllow"),
        ],
    ),
    # Experiment Set q.2: Query unencrypted fields on partially encrypted collection
    dict(
        name="es2",
        coll="pbl",
        encryptedFieldCount=5,
        threadCounts=[1, 4, 8, 16],
        contentionFactors=[1, 4, 8, 16],
        queries=[
            dict(field="fixed_10", value="fixed_hf"),
            dict(field="fixed_10", value="uar"),
            dict(field="uar_[6,10]", value="uar"),
            dict(field="uar_[6,10]", value="uar_alllow"),
        ],
    ),
    # Experiment Set q.3: Query encrypted fields on partially encrypted collection
    dict(
        name="es3",
        coll="pbl",
        encryptedFieldCount=5,
        threadCounts=[1, 4, 8, 16],
        contentionFactors=[1, 4, 8, 16],
        queries=[
            dict(field="fixed_1", value="fixed_hf"),
            dict(field="fixed_1", value="uar"),
            dict(field="uar_[1,5]", value="uar"),
            dict(field="uar_[1,5]", value="uar_alllow"),
        ],
    ),
    # Experiment Set q.4: Query encrypted fields on fully encrypted collection
    dict(
        name="es4",
        coll="pbl",
        encryptedFieldCount=10,
        threadCounts=[1, 4, 8, 16],
        contentionFactors=[1, 4, 8, 16],
        queries=[
            dict(field="fixed_1", value="fixed_hf"),
            dict(field="fixed_1", value="uar"),
            dict(field="uar_[1,10]", value="uar"),
            dict(field="uar_[1,10]", value="uar_alllow"),
        ],
    ),
    # Disabled:
    # Experiment Set q.5: Check the impact of BSON limit on queries on both encrypted and unencrypted fields
    # dict(
    #     name="es5",
    #     coll="blimit",
    #     encryptedFieldCount=5,
    #     threadCounts=[1, 4, 8, 16],
    #     contentionFactors=[1, 4, 8, 16],
    #     queries=[
    #         dict(field="fixed_1", value="fixed_hf"),
    #         dict(field="fixed_10", value="fixed_hf"),
    #     ],
    # ),
]

In [None]:
class DataSetCache:
    """Descriptor of one benchmark data set with lazily loaded contents.

    The constructor only records where the data lives and how it should be
    labelled; nothing is read from disk until ``get_data()`` is first called.

    Args:
        workdir: root directory of the downloaded results.
        patch_id: patch identifier (directory below ``workdir``).
        variant: build-variant name (directory below the patch).
        execution: execution number; converted with ``str()`` when building paths.
        task_name: task name (directory below the variant).
        experiment: experiment label, kept for reporting.
        cf: contention factor, kept for reporting.
        tc: thread count, kept for reporting.
        query_num: query number (or a label such as ``"load"``), kept for reporting.
        query: human-readable query description, kept for reporting.
        phase_name: phase suffix appended to ``"InsertActor."`` to form the
            metric file name.
    """

    def __init__(self, workdir, patch_id, variant, execution, task_name, experiment, cf, tc, query_num, query, phase_name):
        # Location of the metric files.
        self.workdir = workdir
        self.patch_id = patch_id
        self.variant = variant
        self.execution = execution
        self.task_name = task_name

        # Labels that are only reported, never used to locate data.
        self.experiment = experiment
        self.cf = cf
        self.tc = tc
        self.query_num = query_num
        self.phase_name = phase_name
        self.query = query

        # Filled in by the first get_data() call.
        self.data = None

    def json_path(self, metric):
        """Return ``<workdir>/<patch_id>/<variant>/<task_name>/<execution>/<metric>.json``."""
        return os.path.join(self.workdir, self.patch_id, self.variant,
            self.task_name, str(self.execution), metric + ".json")

    def get_data(self):
        """Load the ``InsertActor.<phase_name>`` metric on first use and cache it.

        Returns:
            Whatever the module-level ``get_data`` loader returns for the metric
            file; the same object is returned on every later call.
        """
        if self.data is None:
            # Inside a method body the bare name resolves to the module-level
            # get_data() imported from perf_tools.analysis, not to this method.
            self.data = get_data(self.json_path("InsertActor." + self.phase_name))
        return self.data

# Registry of every data set in the experiment matrix, keyed by
# "<task name>.load" for the load phase and "<task name>.q<N>.reads" for query N,
# where the task name is "query_<experiment>_<cf>_<tc>".
DATASETS = {}

for ex in EXPERIMENTS:
    for cf in ex["contentionFactors"]:
        for tc in ex["threadCounts"]:
            experiment = ex['name']
            testName = f"query_{experiment}_{cf}_{tc}"

            # Arguments shared by the load entry and all query entries of this task.
            # The execution comes from the exec_idx config constant instead of a
            # repeated literal, so changing the constant is enough to switch runs.
            common = (WORKDIR, patch_id, VARIANTS["replset"], str(exec_idx), testName, experiment, cf, tc)

            DATASETS[testName + ".load"] = DataSetCache(*common, "load", "load", "load.inserts")

            # Query numbers are 1-based positions in the experiment's query list.
            for qi, query in enumerate(ex["queries"], start=1):
                DATASETS[f"{testName}.q{qi}.reads"] = DataSetCache(
                    *common, qi, f"{query['field']}: {query['value']}", f"q{qi}.reads")


In [None]:
def get_load_datacache(experiment_name, cf, tc):
    """Return the registered load-phase data set for one configuration.

    The registry key is ``query_<experiment_name>_<cf>_<tc>.load``; a
    ``KeyError`` is raised when no such data set was registered.
    """
    return DATASETS["query_{}_{}_{}.load".format(experiment_name, cf, tc)]

def get_query_datacache(experiment_name, cf, tc, query_number):
    """Return the registered read data set of one query for one configuration.

    The registry key is ``query_<experiment_name>_<cf>_<tc>.q<query_number>.reads``;
    a ``KeyError`` is raised when no such data set was registered.
    """
    task_name = "query_{}_{}_{}".format(experiment_name, cf, tc)
    return DATASETS["{}.q{}.reads".format(task_name, query_number)]

In [None]:
# Get summary statistics for all data sets
# WARNING: may take several minutes
def get_summary_statistics_df(ds_cache):
    """Summarise one data set as a single-row DataFrame.

    Args:
        ds_cache: object providing ``get_data()`` and the label attributes
            ``task_name``, ``phase_name``, ``experiment``, ``query``,
            ``query_num``, ``cf`` and ``tc`` (a ``DataSetCache`` in this
            notebook). ``get_data()`` must return an object with the frames
            ``diff_data`` (columns ``ts``, ``pure_latency``,
            ``pure_latency(ms)``), ``fixed_data`` (columns ``d(ops)``,
            ``d(size)``, ``d(n)``, ``d(err)``, ``d(t_overhead)``) and
            ``raw_data`` (column ``gauges.workers``).

    Returns:
        pandas.DataFrame with index ``[0]``: the labels of the data set followed
        by latency averages and percentiles, throughput rates and totals.
    """
    loaded = ds_cache.get_data()
    diff_data = loaded.diff_data
    fixed_data = loaded.fixed_data
    raw_data = loaded.raw_data

    latency = diff_data["pure_latency"]
    quantiles = stats.mstats.mquantiles(latency.values, prob=[0.5,0.8,0.9,0.95,0.99], alphap=1/3, betap=1/3)

    # First/last sample are taken by position: a label lookup such as ts[0]
    # raises KeyError as soon as the frame's index does not run from 0 to n-1.
    timestamps = diff_data["ts"]
    duration = (timestamps.iloc[-1] - timestamps.iloc[0]).total_seconds()

    ops = fixed_data["d(ops)"].sum()
    size = fixed_data["d(size)"].sum()
    docs = fixed_data["d(n)"].sum()
    errs = fixed_data["d(err)"].sum()
    overhead = fixed_data["d(t_overhead)"].sum()

    workers = raw_data["gauges.workers"]

    df = pandas.DataFrame(index = [0], data = {
        'Task' : ds_cache.task_name + "." + ds_cache.phase_name,
        'TaskName' : ds_cache.task_name,
        'Experiment' : ds_cache.experiment,
        'Phase' : ds_cache.phase_name,
        'Query' : ds_cache.query,
        'QueryNumber' : ds_cache.query_num,
        'ContentionFactor': ds_cache.cf,
        'ThreadCount': ds_cache.tc,
        'AverageLatency': latency.mean(),
        'AverageLatencyMillis': diff_data["pure_latency(ms)"].mean(),
        'AverageSize': size / ops,
        'OperationThroughput': ops / duration,
        'DocumentThroughput': docs / duration,
        'SizeThroughput': size / duration,
        'ErrorRate': errs / duration,
        'Latency50thPercentile': quantiles[0],
        'Latency80thPercentile': quantiles[1],
        'Latency90thPercentile': quantiles[2],
        'Latency95thPercentile': quantiles[3],
        'Latency99thPercentile': quantiles[4],
        'WorkersMin': workers.min(),
        'WorkersMax': workers.max(),
        # Only the latency column is reduced; reducing the whole frame would
        # also touch columns whose extremes are never reported.
        'LatencyMax': latency.max(),
        'LatencyMin': latency.min(),
        # duration is in seconds; scaled by 1e9 (nanoseconds).
        'DurationTotal': duration * 1e9,
        'ErrorsTotal': errs,
        'OperationsTotal': ops,
        'DocumentsTotal': docs,
        'SizeTotal': size,
        'OverheadTotal': overhead
    })

    return df

# Summarise every registered data set. Each call loads that data set on first
# use, so this is the slow step of the notebook. The one-row frames are gathered
# first and concatenated once instead of re-copying a growing frame per data set.
summary_frames = [get_summary_statistics_df(ds_cache) for ds_cache in DATASETS.values()]
df = pandas.concat(summary_frames, ignore_index=True)

df.to_csv("/tmp/summary.csv")
df


In [None]:
# Graph latency for load for a given experiment
#
# The three values below select which registered load data set is plotted.
experiment = "es2"
cf = 1
tc = 8
e_datacache = get_load_datacache(experiment, cf, tc)
df_w = e_datacache.get_data().diff_data

# Graph scatter plot of latency
df_w.plot(figsize=(20,20), x="total_ops", y= "pure_latency(ms)", kind="scatter", title=f"Load latency: {experiment} cf_{cf} thread_{tc}", ylabel="pure_latency(ms)", grid=0.4)

# Graph latency with trend lines
# NOTE(review): `start` and `end` are not assigned anywhere in the visible
# notebook, so this call raises NameError on a fresh kernel unless they are
# defined elsewhere - define them in the config cell or drop the arguments
# after checking the signature of plot_latency_stats.
plot_latency_stats(df_w, "total_ops", title=f"{experiment} cf_{cf} thread_{tc} insert stats", regr="log", start=start, end=end)

In [None]:
# Graph latency for one query of a given experiment
#
# The four values below select which registered query data set is plotted.
experiment = "es2"
cf = 1
tc = 8
query_number = 1
e_datacache = get_query_datacache(experiment, cf, tc, query_number)
df_w = e_datacache.get_data().diff_data

# Graph scatter plot of latency. The titles name the query and the read phase so
# the figures cannot be confused with the load-phase plots.
df_w.plot(figsize=(20,20), x="total_ops", y= "pure_latency(ms)", kind="scatter", title=f"Query latency: {experiment} cf_{cf} thread_{tc} q{query_number}", ylabel="pure_latency(ms)", grid=0.4)

# Graph latency with trend lines
# NOTE(review): `start` and `end` are not assigned anywhere in the visible
# notebook, so this call raises NameError on a fresh kernel unless they are
# defined elsewhere - define them in the config cell or drop the arguments
# after checking the signature of plot_latency_stats.
plot_latency_stats(df_w, "total_ops", title=f"{experiment} cf_{cf} thread_{tc} q{query_number} read stats", regr="log", start=start, end=end)

In [None]:
# Graph throughput for given load experiment pivoted on thread count
experiment = "es1"
cf = 1
threadCounts = [1,4,8,16]

# One throughput series per thread count; the label of each series is printed
# once its data has been loaded.
columns = [f"cf_{cf}_thread_{tc}_throughput" for tc in threadCounts]
column_data = []
for tc, name in zip(threadCounts, columns):
    column_data.append(get_load_datacache(experiment, cf, tc).get_data().diff_data["throughput"])
    print(name)

# Series are placed side by side and relabelled; the plot starts at row 100.
df = pandas.concat(column_data, axis="columns")
df.columns = columns
df[100:].plot(figsize=(20,20), title=f"Load throughput {experiment}", ylabel="throughput (ops/sec)", grid=0.4)

    

In [None]:
# Graph throughput for given query experiment pivoted on thread count and contention factor
def plot_throughput_set(experiment, query_number, contentionFactors, threadCounts):
    """Plot read throughput of one query for every (contention factor, thread count) pair.

    Args:
        experiment: experiment name as used by ``get_query_datacache``.
        query_number: number of the query whose read data is plotted.
        contentionFactors: contention factors to include (outer loop).
        threadCounts: thread counts to include for each contention factor.

    Each pair contributes one line labelled ``cf_<cf>_thread_<tc>_throughput``.
    Rows before position 100 are left out of the figure.
    """
    labelled_series = [
        (
            f"cf_{cf}_thread_{tc}_throughput",
            get_query_datacache(experiment, cf, tc, query_number).get_data().diff_data["throughput"],
        )
        for cf in contentionFactors
        for tc in threadCounts
    ]

    throughput = pandas.concat([series for _, series in labelled_series], axis="columns")
    throughput.columns = [label for label, _ in labelled_series]
    throughput[100:].plot(figsize=(20,20), title=f"Read throughput {experiment} Query {query_number}", ylabel="throughput (ops/sec)", grid=0.4)

# Plot query 1 of experiment "es2" for contention factors and thread counts 1, 4, 8 and 16.
plot_throughput_set("es2", 1, [1,4,8,16], [1,4,8,16])
    

In [None]:
# Graph load throughput for one experiment, one figure per contention factor
experiment = "es2"
contentionFactors = [1]
threadCounts = [1,4,8,16]

# Data set handles of every plotted configuration, keyed by (cf, tc).
es_workloads = {}
for cf in contentionFactors:
    columns = []
    column_data = []
    for tc in threadCounts:
        # NOTE(review): this cell previously built an `ES1Workload`, a name that
        # is neither defined nor imported in the visible notebook. The registered
        # load data set uses the same "query_<experiment>_<cf>_<tc>" task name and
        # is assumed to be the intended replacement - confirm.
        e_w = get_load_datacache(experiment, cf, tc)
        es_workloads[(cf, tc)] = e_w
        df_w = e_w.get_data().diff_data

        name = f"cf_{cf}_thread_{tc}_throughput"
        column_data.append(df_w["throughput"])
        columns.append(name)
        print(name)

    df = pandas.concat(column_data, axis="columns")
    df.columns = columns
    # info() prints its report itself and returns None, so it is not wrapped in print().
    df.info()
    df.plot(figsize=(20,20), title="Load throughput", ylabel="throughput (ops/sec)", grid=0.4)
    