In [None]:
from bokeh.io import output_notebook, show
from bokeh.layouts import column
from bokeh.models import Legend, LinearAxis, Range1d, Title
from bokeh.palettes import Spectral8
from bokeh.plotting import figure

import datetime as dt

from ipywidgets import interact

# increase default cell width
# NOTE: `display` and `HTML` are imported from the public `IPython.display`
# module; importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import os

import pandas as pd

import sys

# for bokeh in notebooks: configure Bokeh so that the plots passed to show()
# are rendered inline in the notebook's output cells
output_notebook()

In [None]:
# read all CSV files of each benchmark into one DataFrame per benchmark
benchmarks = [
#    'terasort-hadoop-1T-16' # ,
#    'terasort-flink-1T-16' #,
#    'terasort-flink-640G-16-2'
#    'peakpicking-flink-205G-16'
    'terasort-hadoop-1T-16'
]

# maps benchmark name -> DataFrame holding the rows of all of its CSV files
all_data = dict()
for benchmark in benchmarks:
    csv_directory = "../resources/data/{}".format(benchmark)
    print("Reading data from {}".format(csv_directory))

    # Read every file into its own frame and concatenate once at the end.
    # Growing a DataFrame file by file copies all previously read rows on every
    # step, and DataFrame.append no longer exists in pandas >= 2.0.
    # The directory listing is sorted so the row order is reproducible.
    frames = []
    for file_name in sorted(os.listdir(csv_directory)):
        csv_path = os.path.join(csv_directory, file_name)
        if file_name.endswith('csv') and os.path.isfile(csv_path):
            frames.append(pd.read_csv(csv_path))

    if frames:
        # cells that are missing (e.g. a column absent from some files) become 0
        all_data[benchmark] = pd.concat(frames).fillna(0)
    else:
        # no CSV files found: keep an empty frame, as before
        all_data[benchmark] = pd.DataFrame()
print("Done reading all input")

In [None]:
# aggregate over all pids of desired hosts the metrics per time, key, source and category
# desired hosts: nodes 00 through 15 of both cumu01 and cumu02
hostnames = [
    'cumu{:02d}-{:02d}'.format(cluster, node)
    for cluster in (1, 2)
    for node in range(16)
]

# per benchmark: keep only the rows of the selected hosts, discard the hostname
# and pid columns, and sum the remaining columns per
# (timeBin, key, source, category) combination
group_columns = ['timeBin', 'key', 'source', 'category']
grouped_data = dict()
for benchmark in benchmarks:
    benchmark_data = all_data[benchmark]
    on_selected_hosts = benchmark_data['hostname'].isin(hostnames)
    grouped_data[benchmark] = (
        benchmark_data[on_selected_hosts]
        .drop(['hostname', 'pid'], axis=1)
        .groupby(group_columns, as_index=False)
        .sum()
    )

In [None]:
# regroup each benchmark's aggregated rows by (key, source, category), so that
# the rows of one such tuple can be fetched as a DataFrame via get_group()
regrouped_data = {
    benchmark: grouped_data[benchmark].groupby(['key', 'source', 'category'])
    for benchmark in benchmarks
}

In [None]:
# select framework to examine
current_data = regrouped_data['terasort-hadoop-1T-16']

# data is recorded in bytes; per-bin values are plotted in MiB, cumulative ones in GiB
BYTES_PER_MIB = 1048576
BYTES_PER_GIB = 1073741824

# padding added beyond the extreme values of each axis, as a fraction of its extent
RANGE_MARGIN = 0.05

# one figure for each key, same for legends

# depict total data per time bin, cumulative data
data_figures = dict()
data_legends = dict()

# depict average data per time bin
avg_data_figures = dict()
avg_data_legends = dict()

# depict number of operations per time bin, cumulative number of operations
count_figures = dict()
count_legends = dict()

# we will need at most eight colors per plot
colors = {
    ('jvm', 'read') : Spectral8[0],
    ('jvm', 'write') : Spectral8[1],
    ('jvm', 'other') : Spectral8[2],
    ('jvm', 'zip') : Spectral8[3],
    ('sfs', 'read') : Spectral8[4],
    ('sfs', 'write') : Spectral8[5],
    ('sfs', 'other') : Spectral8[6],
    ('sfs', 'zip') : Spectral8[7]
}

# remember max left and right y values for each plot
max_data = dict()
max_cum_data = dict()
max_avg_data = dict()
max_count = dict()
max_cum_count = dict()

# remember x range (in timeBin units, i.e. milliseconds) so all plots have the same extent
min_x = float('inf')
max_x = 0


def _to_datetime(time_bin):
    """Convert a timeBin value (milliseconds) to a naive local datetime."""
    return dt.datetime.fromtimestamp(time_bin / 1000)


def _get_or_create_figure(figures, key, kind):
    """Return the figure stored under `key` in `figures`, creating it on first use.

    The figure is only constructed when none exists yet for `key`, so no
    throwaway figure is built for keys that already have one.
    `kind` is the suffix of the title, e.g. "data" or "count".
    """
    if key not in figures:
        # NOTE(review): plot_width and toolbar_sticky are keyword arguments of
        # older Bokeh releases -- confirm against the installed version.
        figures[key] = figure(
            plot_width=1300,
            title="{}: {}".format(key, kind),
            title_location='above',
            toolbar_location='left',
            toolbar_sticky=False,
            x_axis_type='datetime'
        )
    return figures[key]


def _padded_range(maximum):
    """Return a Range1d covering 0..maximum, padded by RANGE_MARGIN on both ends."""
    margin = RANGE_MARGIN * maximum
    return Range1d(start=-margin, end=maximum + margin)


def _time_range(first_bin, last_bin):
    """Return a padded Range1d spanning the given timeBin values.

    The bounds are converted with the same function as the plotted x values;
    mixing raw epoch milliseconds with naive local datetimes would shift the
    range against the data by the local UTC offset.
    """
    margin = (last_bin - first_bin) * RANGE_MARGIN
    return Range1d(start=_to_datetime(first_bin - margin), end=_to_datetime(last_bin + margin))


def _finish_figure(f, first_bin, last_bin, legend_items, y_max, y_label, extra_axis=None):
    """Apply ranges, axis labels and the legend to figure `f` and return it.

    `extra_axis`, if given, is a (range name, maximum, label) tuple describing
    the secondary y axis on the right that the cumulative lines refer to.
    """
    f.x_range = _time_range(first_bin, last_bin)
    f.y_range = _padded_range(y_max)

    if extra_axis is not None:
        range_name, extra_max, _ = extra_axis
        f.add_layout(LinearAxis(y_range_name=range_name), 'right')
        f.extra_y_ranges = { range_name : _padded_range(extra_max) }

    f.title.align = 'center'
    f.xaxis[0].axis_label = "Time"
    f.yaxis[0].axis_label = y_label
    if extra_axis is not None:
        f.yaxis[1].axis_label = extra_axis[2]

    f.add_layout(Legend(items=legend_items, location=(0, -30)), 'right')
    return f


# loop over all unique groups
for group in current_data.groups:
    # key is e.g. hadoop, spark, yarn, ...
    # source is jvm or sfs
    # category is read, write, other or zip
    key, source, category = group

    # skip group if we do not want to plot it
    if (source, category) not in colors:
        continue

    # get this group's data
    group_data = current_data.get_group(group)

    # look up color and legend label for the source/category combination
    color = colors[(source, category)]
    label = "{}: {}".format(source, category)

    # timeBin is in milliseconds, convert once for all lines of this group
    times = [_to_datetime(s) for s in group_data['timeBin']]

    # remember x range; every plotted group counts, including the 'other'
    # category that only shows up in the count figures
    min_x = min(min_x, group_data['timeBin'].min())
    max_x = max(max_x, group_data['timeBin'].max())

    # start with data related plots, so skip other
    if category != 'other':
        data_figure = _get_or_create_figure(data_figures, key, "data")

        # remember maximum data for plotting later
        max_data[key] = max(max_data.get(key, 0), group_data['data'].max())

        # get cumulative sums on data for this group, and remember the maximum
        cum_data = group_data['data'].cumsum()
        max_cum_data[key] = max(max_cum_data.get(key, 0), cum_data.max())

        # data is in bytes, plot it in mebibytes
        data_line = data_figure.line(
            x=times,
            y=[d / BYTES_PER_MIB for d in group_data['data']],
            color=color
        )

        # plot cumulative data for the same source/category, but on the other y-range
        cum_data_line = data_figure.line(
            x=times,
            y=[d / BYTES_PER_GIB for d in cum_data],
            color=color,
            y_range_name='cum_data_range'
        )

        # save the legend for these lines, simple concatenation of source and category
        data_legends.setdefault(key, []).append((label, [data_line, cum_data_line]))

        # now for average data operation size
        avg_data_figure = _get_or_create_figure(avg_data_figures, key, "avg. data")

        # NOTE(review): bins whose count is 0 yield NaN/inf here -- confirm
        # whether such rows can occur in the input
        avg_data = group_data['data'] / group_data['count']

        # remember max average data
        max_avg_data[key] = max(max_avg_data.get(key, 0), avg_data.max())

        avg_data_line = avg_data_figure.line(
            x=times,
            y=[d / BYTES_PER_MIB for d in avg_data],
            color=color
        )

        avg_data_legends.setdefault(key, []).append((label, [avg_data_line]))

    # now for the number of operations
    count_figure = _get_or_create_figure(count_figures, key, "count")

    # remember max count for scaling
    max_count[key] = max(max_count.get(key, 0), group_data['count'].max())

    # get cumulative count of operations for this group
    cum_count = group_data['count'].cumsum()
    max_cum_count[key] = max(max_cum_count.get(key, 0), cum_count.max())

    count_line = count_figure.line(
        x=times,
        y=group_data['count'],
        color=color
    )

    cum_count_line = count_figure.line(
        x=times,
        y=cum_count,
        color=color,
        y_range_name='cum_count_range'
    )

    count_legends.setdefault(key, []).append((label, [count_line, cum_count_line]))

# now generate all figures and their legends
plots = []
for key, f in data_figures.items():
    plots.append(_finish_figure(
        f, min_x, max_x, data_legends.get(key),
        max_data[key] / BYTES_PER_MIB, "Data (MiB)",
        extra_axis=('cum_data_range', max_cum_data[key] / BYTES_PER_GIB, "Cum. Data (GiB)")
    ))

for key, f in avg_data_figures.items():
    plots.append(_finish_figure(
        f, min_x, max_x, avg_data_legends.get(key),
        max_avg_data[key] / BYTES_PER_MIB, "Data (MiB)"
    ))

for key, f in count_figures.items():
    plots.append(_finish_figure(
        f, min_x, max_x, count_legends.get(key),
        max_count[key], "Count",
        extra_axis=('cum_count_range', max_cum_count[key], "Cum. Count")
    ))

# show all plots in a column for proper alignment of x-axis
show(column(plots))