In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pylab as plt
import matplotlib.dates as mdates
import matplotlib.cm as cm
import seaborn as sns
import json
from io import StringIO
import importlib

In [None]:
import molten_data_common_lib
importlib.reload(molten_data_common_lib)
from molten_data_common_lib import glob_file_list , load_json_from_file, merge_dicts, plot_groups, get_varying_column_names, filter_dataframe

In [None]:
# Glob patterns of the benchmark result files to load.
src_files = ['../p3_test_driver/logs/workload-*.json']
filenames = glob_file_list(src_files)
print('Loading records from %d files...' % len(filenames))
# One loaded record per matched file, in the order returned by glob_file_list.
raw_results = list(map(load_json_from_file, filenames))

In [None]:
# Tabulate the loaded records and preview the first rows.
raw_df = pd.DataFrame(data=raw_results)
raw_df.head(n=5)

In [None]:
def clean_result(result):
    """Flatten one raw benchmark record into a single row of analysis columns.

    The record's ``workload`` field is parsed as JSON text and merged into the
    record with ``merge_dicts``; the ``driver`` entry is reduced to its
    ``name``. Every key containing ``Quantiles`` becomes a ``pd.Series`` whose
    index holds the mapping's values and whose data holds the mapping's keys
    converted to float. Every list value becomes a ``pd.Series`` and gains a
    ``<key>Mean`` companion entry. Finally, alias columns with explicit units
    in their names are added.

    Args:
        result: One raw record (a row of the raw DataFrame when used with
            ``DataFrame.apply(..., axis=1)``). A shallow copy is taken first.

    Returns:
        pd.Series: The flattened record.

    Raises:
        KeyError: If a field read below is missing from the record.
    """
    r = result.copy()
    # json.loads parses the text directly; wrapping it in StringIO for json.load
    # was an unnecessary round-trip.
    workload = json.loads(r['workload'])
    r = merge_dicts(r, workload)
    # NOTE(review): after the merge above, r['workload'] is presumably the nested
    # 'workload' mapping from the parsed document rather than the original text --
    # confirm against merge_dicts.
    r = merge_dicts(r, r['workload'])
    del r['workload']
    r['driverName'] = r['driver']['name']
    del r['driver']
    # Iterate over a snapshot of the keys because the loop adds '<key>Mean' entries.
    for k in list(r.keys()):
        if 'Quantiles' in k:
            # Index = mapping values, data = mapping keys as floats, so the series
            # can be plotted directly as a cumulative curve.
            r[k] = pd.Series(data=[float(q) for q in r[k].keys()], index=list(r[k].values()))
        elif isinstance(r[k], list):
            r[k] = pd.Series(r[k])
            r['%sMean' % k] = r[k].mean()
    # Falls back to 0 when the record has no 'numWorkers' entry.
    r['numWorkloadWorkers'] = int(r.get('numWorkers', 0))
    r['throttleEventsPerSec'] = r['producerRate']
    r['publishRateEventsPerSecMean'] = r['publishRateMean']
    # events/s * messageSize * 1e-6; presumably messageSize is in bytes -- TODO confirm.
    r['publishRateMBPerSecMean'] = r['publishRateMean'] * r['messageSize'] * 1e-6
    r['publishLatencyMsAvg'] = r['aggregatedPublishLatencyAvg']
    r['publishLatencyMs99Pct'] = r['aggregatedPublishLatency99pct']
    r['endToEndLatencyMsAvg'] = r['aggregatedEndToEndLatencyAvg']
    r['endToEndLatencyMs99Pct'] = r['aggregatedEndToEndLatency99pct']
    return pd.Series(r)
# r = clean_result(raw_results[0])
# pd.DataFrame(r)

In [None]:
# Clean every raw record row by row, then order the runs by their 'utc_begin' value.
clean_df = (
    raw_df
    .apply(clean_result, axis=1)
    .sort_values(by=['utc_begin'])
)
# clean_df.tail()

In [None]:
# Show every column available after cleaning (handy when choosing fields for the column lists below).
clean_df.columns

In [None]:
# Columns displayed by the drop_duplicates() summary of the filtered runs.
info_cols = [
    'numWorkers', 'topics', 'partitionsPerTopic',
    'producersPerTopic', 'subscriptionsPerTopic', 'consumerPerSubscription',
    'testDurationMinutes', 'keyDistributor', 'git_commit',
]

In [None]:
# Columns used for the CSV export and for the filtered-run table.
cols = [
    # workload parameters
    'messageSize', 'numWorkloadWorkers', 'producersPerTopic', 'partitionsPerTopic',
    'testDurationMinutes', 'subscriptionsPerTopic',
    # alias columns added by clean_result
    'throttleEventsPerSec', 'publishRateEventsPerSecMean', 'publishRateMBPerSecMean',
    'publishLatencyMsAvg', 'publishLatencyMs99Pct',
    'endToEndLatencyMsAvg', 'endToEndLatencyMs99Pct',
    # run identification
    'utc_begin', 'test_uuid',
]

In [None]:
# Export the summary columns of every run to a CSV file in the working directory.
clean_df.loc[:, cols].to_csv('openmessaging-benchmark-results.csv')

In [None]:
#df = clean_df[cols]
#df = df.sort_values(['messageSize','numWorkloadWorkers','producersPerTopic','throttleEventsPerSec','utc_begin'])
#df

In [None]:
# messageSize is kept as its own variable because the plot title reuses it.
messageSize = 10000
# Column -> required value; passed to filter_dataframe as keyword arguments.
filter_criteria = dict(
    messageSize=messageSize,
    numWorkloadWorkers=2,
    partitionsPerTopic=16,
    testDurationMinutes=15,
)
filt_df = filter_dataframe(clean_df, **filter_criteria)
filt_df.loc[:, cols]

In [None]:
# Source column -> legend label, in plotting order (publish first, then end-to-end).
latency_labels = {
    'aggregatedPublishLatency50pct': 'Publish Latency p50',
    'aggregatedPublishLatency95pct': 'Publish Latency p95',
    'aggregatedPublishLatency99pct': 'Publish Latency p99',
    'aggregatedEndToEndLatency50pct': 'E2E Latency p50',
    'aggregatedEndToEndLatency95pct': 'E2E Latency p95',
    'aggregatedEndToEndLatency99pct': 'E2E Latency p99',
}
# Index the filtered runs by measured publish rate, sorted ascending, and keep
# only the latency percentile columns under their display names.
plot_df = (
    filt_df
    .set_index(['publishRateEventsPerSecMean'])
    .sort_index()
    .loc[:, list(latency_labels)]
    .rename(columns=latency_labels)
)
plot_df.index.name = 'Publish Throughput (events/s)'
plot_df

In [None]:
# Latency percentiles versus publish throughput on log-log axes.
title = 'Message Size %d' % (messageSize)
ax = plot_df.plot(
    logx=True,
    logy=True,
    figsize=(10,8),
    grid=True,
    title=title,
    # One style per plot_df column, in column order: blue 'x' markers for the first
    # three series, red '+' markers for the last three; ':' / '-.' / '-' line styles
    # within each group.
    style=['x:b','x-.b','x-b','+:r','+-.r','+-r'])
ax.set_ylabel('Latency (ms)')
# Plain numeric tick labels instead of the default log-axis notation.
ax.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))
ax.yaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))
# grid() takes a bool; the string 'on' only worked because it is truthy.
ax.grid(True, which='both', axis='both')

In [None]:
# Distinct combinations of the info columns among the filtered runs, transposed
# so that each column name becomes a row.
filt_df.loc[:, info_cols].drop_duplicates().transpose()

In [None]:
# plot_groups(
#     filt_df, 
#     x_col='publishRateEventsPerSecMean',
#     y_col='publishLatencyMs99Pct',
#     group_by_columns=['partitionsPerTopic', 'messageSize'],
#     semilogx=True,
# #     ylim=[0,100],
# );

# Analyze Latency Distribution

In [None]:
# Pick a single test run by its UUID; `t` is the first matching cleaned record.
df = clean_df.loc[clean_df['test_uuid'] == 'a073135e-b3ed-4b4b-8fc2-d449f427af23']
t = df.iloc[0]
t

In [None]:
# Publish-latency quantile series of the selected run, scaled by 1/100
# (presumably percent -> fraction; confirm against the result file format).
cdf = t['aggregatedPublishLatencyQuantiles'] / 100
cdf

In [None]:
# Cumulative curve on a log-scaled x axis; the trailing ';' hides the Axes repr.
cdf.plot(grid=True, logx=True);

In [None]:
# Numerical derivative of the cumulative curve with respect to its index values,
# kept on the same index so it can be plotted alongside cdf.
cdf_x = cdf.index.values
pdf = pd.Series(data=np.gradient(cdf, cdf_x), index=cdf.index)
pdf

In [None]:
# Density curve on a log-scaled x axis restricted to [3, 2000]; y starts at 0
# with an automatic upper limit.
pdf.plot(grid=True, logx=True, logy=False, xlim=[3, 2000], ylim=[0, None]);

In [None]:
# Overlay cdf (left y axis) and pdf (right y axis) on a shared, log-scaled x axis.
fig0, ax0 = plt.subplots()
# twinx() already supplies the right-hand y axis, so pdf is drawn on ax1 directly.
# Passing secondary_y=True as well would make pandas create a further twin axis
# on top of ax1.
ax1 = ax0.twinx()
cdf.plot(ax=ax0, logx=True, ylim=[0,1])
pdf.plot(ylim=[0,None], ax=ax1)
plt.show()
plt.close()

In [None]:
# pdf values on the x axis against pdf's index on a log-scaled y axis.
fig0, ax0 = plt.subplots()
ax0.plot(pdf.values, pdf.index.values)
# grid() takes a bool; the string 'on' only worked because it is truthy.
ax0.grid(True, which='both', axis='y')
# set_yscale only switches the axis scale. semilogy() is a plotting call, so
# semilogy(True) would additionally try to draw `True` as data.
ax0.set_yscale('log')