In [2]:
# Imports
import re
import pathlib
import itertools
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Define some useful functions

def MiB_to_Gb(val):
    """Convert a quantity expressed in mebibytes (MiB) into gigabits (Gb).

    A MiB is counted as 1024**2 bytes of 8 bits each, and a Gb as 10**9 bits.
    """
    bits = val * 8 * 1024**2
    return bits / 10**9

def Gb_to_MiB(val):
    """Convert a quantity expressed in gigabits (Gb) into mebibytes (MiB).

    A Gb is counted as 10**9 bits, and a MiB as 1024**2 bytes of 8 bits each.
    """
    bits = val * 10**9
    return bits / (8 * 1024**2)

def load_raw_data(results_file, check_error=True):
    """Read a raw, tab-separated `warp` results file and tidy it for analysis.

    The last row of the file is treated as carrying the benchmark command in
    its `idx` field; that value is split off and returned separately, and the
    `idx` column is then discarded.

    Parameters
    ----------
    results_file
        Anything `pandas.read_csv` accepts as its first argument.
    check_error : bool, default True
        When true, the `error` column must be entirely null; it is then
        dropped. When false, the column is left in place unchecked.

    Returns
    -------
    tuple
        `(records, warp_command)` where `records` is the cleaned DataFrame and
        `warp_command` is the `idx` value of the file's last row.

    Raises
    ------
    Exception
        If `check_error` is true and any entry in the `error` column is set.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S'

    def _to_whole_seconds(stamp):
        # Only the text before the first '.' is parsed, so anything from the
        # fractional seconds onwards is discarded.
        whole = stamp.split('.')[0]
        return dt.datetime.strptime(whole, timestamp_format)

    table = pd.read_csv(results_file, sep="\t")

    # The final row holds the benchmark command rather than a measurement
    last_row = table.iloc[-1]
    warp_command = last_row.idx
    records = table.iloc[:-1].drop(['idx'], axis=1)

    if check_error:
        if not records.error.isna().all():
            raise Exception("Errors found in df.error column")
        records = records.drop('error', axis=1)

    # A column that is 1 everywhere carries no information
    if records.n_objects.eq(1).all():
        records = records.drop('n_objects', axis=1)

    # Turn the timestamp strings into datetime objects
    for col in ('start', 'end'):
        records[col] = records[col].map(_to_whole_seconds)

    return records, warp_command

def load_summary_data(filename):
    """Load the summary csv data from `warp analyze ...` into a pandas dataframe.

    The file is read as tab-separated text. Rows whose `op` field is the
    literal string "op" are discarded (presumably header lines repeated inside
    the file -- confirm against the `warp` output), and the `mb_per_sec` and
    `objs_per_sec` columns are converted to numeric dtypes.

    Parameters
    ----------
    filename
        Anything `pandas.read_csv` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The filtered frame; the original row labels are kept.
    """
    df = pd.read_csv(filename, sep="\t")
    # Take an explicit copy of the filtered rows: assigning columns on the
    # result of a boolean selection otherwise triggers pandas'
    # SettingWithCopyWarning on versions without copy-on-write.
    df = df[df["op"] != "op"].copy()
    for col in ['mb_per_sec', 'objs_per_sec']:
        df[col] = pd.to_numeric(df[col])
    return df

def parse_text_summary(filename):
    """Parse the text summary from `warp analyze ...` into a dict of throughput values.

    The file is scanned line by line:

    * a line starting with "Operation: " opens a new section named after the
      first run of two or more upper-case letters on that line (e.g. "PUT");
    * a line starting with " * Throughput: " or "* Average:" contributes every
      "<decimal number> <unit>/s" token it contains to the current section;
    * a line starting with "Cluster Total: " contributes its tokens to a
      section named "Total".

    Metric lines that appear before any usable "Operation: " header cannot be
    attributed to an operation and are ignored. After parsing, every "obj/s"
    entry is replaced by a "kIOPS" entry holding the value divided by 1000.

    Parameters
    ----------
    filename : str or path-like
        Path of the text summary to read.

    Returns
    -------
    dict
        Maps section name to a dict of `{unit: value}`; a section may be empty
        if no metric line followed its header.
    """
    # Compile the reusable regexes once
    op_regex = re.compile(r"[A-Z][A-Z]+")
    score_regex = re.compile(r"[0-9]+\.[0-9]+\ [a-zA-Z]+/s")

    def _extract_scores(line):
        """Return {unit: value} for each '<number> <unit>/s' token in `line`."""
        found = {}
        for score in score_regex.finditer(line):
            val, unit = score[0].split(" ")
            found[unit] = float(val)
        return found

    scores = {}
    current_op = None
    with open(filename, "r") as file:
        # Iterate lazily instead of buffering the whole file with readlines()
        for line in file:
            if line.startswith("Operation: "):
                op_match = op_regex.search(line)
                # A header with no recognisable op name must not leave the
                # previous op "current", or its metrics would be misattributed.
                current_op = op_match[0] if op_match else None
                if current_op is not None:
                    scores[current_op] = {}
            elif line.startswith((" * Throughput: ", "* Average:")):
                # Without a current op there is nowhere to file these values
                if current_op is not None:
                    scores[current_op].update(_extract_scores(line))
            elif line.startswith("Cluster Total: "):
                scores["Total"] = _extract_scores(line)

    # Convert obj/s to kIOPS
    for op_scores in scores.values():
        if 'obj/s' in op_scores:
            op_scores['kIOPS'] = op_scores.pop('obj/s') / 1000

    return scores

# Try the parser on a single summary file; as the last expression of the cell,
# the returned dict is what gets displayed below.
parse_text_summary("benchmark-runs/s3-summary.pure-get-1-1-250-100MiB.txt")

{'PUT': {'MiB/s': 250.55, 'kIOPS': 0.0025099999999999996},
 'GET': {'MiB/s': 462.8, 'kIOPS': 0.00463}}

In [3]:
# Load one raw results file: `df` receives the cleaned per-operation records and
# `cmd` the value that load_raw_data split off from the file's last row.
df, cmd = load_raw_data('benchmark-runs/s3-full.quobyte-put-5-100-0-1GiB-5m.csv.zst')

In [4]:
# Display the loaded records
df

Unnamed: 0,thread,op,client_id,bytes,endpoint,file,start,first_byte,end,duration_ns
0,98.0,PUT,4OZfcF,1.073742e+09,http://s3.sds.jc.rl.ac.uk,1jiB)PZj/1.2dKNRLk9xRq(1wPa.rnd,2023-03-13 21:57:46,,2023-03-13 22:00:35,1.692982e+11
1,151.0,PUT,iXL7Cn,1.073742e+09,http://s3.sds.jc.rl.ac.uk,byQ3Bv3W/1.UCi3AjtOF8(ZMvvk.rnd,2023-03-13 21:57:46,,2023-03-13 22:01:18,2.126052e+11
2,450.0,PUT,mg868B,1.073742e+09,http://s3.sds.jc.rl.ac.uk,U)q))a9D/1.V4MrkqTsVwR8Zr41.rnd,2023-03-13 21:57:46,,2023-03-13 22:00:33,1.673767e+11
3,94.0,PUT,4OZfcF,1.073742e+09,http://s3.sds.jc.rl.ac.uk,MYtF9(qc/1.MyiCkFhUBrICjCJt.rnd,2023-03-13 21:57:46,,2023-03-13 22:00:10,1.443484e+11
4,95.0,PUT,4OZfcF,1.073742e+09,http://s3.sds.jc.rl.ac.uk,7AXk6wis/1.rh5ItUllWksYOB3C.rnd,2023-03-13 21:57:46,,2023-03-13 21:59:59,1.332830e+11
...,...,...,...,...,...,...,...,...,...,...
1066,475.0,PUT,mg868B,1.073742e+09,http://s3.sds.jc.rl.ac.uk,xxrtKPgu/3.usVMrHyGzXCdj7yX.rnd,2023-03-13 22:02:44,,2023-03-13 22:06:51,2.478105e+11
1067,456.0,PUT,mg868B,1.073742e+09,http://s3.sds.jc.rl.ac.uk,NGpqsCVo/2.MHf4r8Iu(fLeYzG0.rnd,2023-03-13 22:02:44,,2023-03-13 22:08:56,3.723893e+11
1068,124.0,PUT,iXL7Cn,1.073742e+09,http://s3.sds.jc.rl.ac.uk,txFuJJhh/3.EEJHOba606josV3Z.rnd,2023-03-13 22:02:45,,2023-03-13 22:07:31,2.855254e+11
1069,109.0,PUT,iXL7Cn,1.073742e+09,http://s3.sds.jc.rl.ac.uk,Mcbi06tR/3.0o7Wn4N4Gw4VbH5s.rnd,2023-03-13 22:02:45,,2023-03-13 22:11:13,5.079506e+11


In [50]:
def load_scores(base_file_name, **kwargs):
    """Parse one text summary per combination of the given parameter values.

    Each keyword argument names a parameter and supplies an iterable of values
    for it. For every element of the Cartesian product of those values (in
    keyword order) the file `<base_file_name>-<v1>-<v2>-....txt` is parsed with
    `parse_text_summary`.

    Returns a dict mapping a label of the form `<v1>-<name1>--<v2>-<name2>--`
    to the parsed scores of the corresponding file.
    """
    param_names = tuple(kwargs)
    results = {}
    for combo in itertools.product(*kwargs.values()):
        # File name gets the bare values; the label also records the names
        suffix = ''.join(f'-{value}' for value in combo)
        label = ''.join(
            f'{value}-{name}--' for name, value in zip(param_names, combo)
        )
        results[label] = parse_text_summary(base_file_name + suffix + '.txt')

    return results


def plot_grouped_bars(scores, unit, ax=None, op_colours=None):
    """Draw a grouped bar chart of one unit across a set of benchmark runs.

    Parameters
    ----------
    scores : dict
        Nested mapping `{group: {series: {unit: value}}}`, e.g. the result of
        `load_scores`. Each group becomes one cluster of bars on the x axis.
    unit : str
        Which unit to plot (a key of the innermost dicts). It is also used as
        the y-axis label, with "_per_sec" rewritten to "/s".
    ax : Axes-like, optional
        Axes to draw on. A new 8x4 figure is created when omitted.
    op_colours : dict, optional
        Maps series name to bar colour; when given it must contain every
        plotted series.

    Returns
    -------
    The axes that were drawn on.

    Notes
    -----
    The series to plot are those of the *first* group that have a value for
    `unit`. If a later group lacks that series or unit (for example an
    operation whose dict is empty), NaN is used for that bar instead of
    raising KeyError.
    """
    groups = list(scores.keys())
    keys = [k for k, v in scores[groups[0]].items() if unit in v]
    # Missing values become NaN so one incomplete run does not abort the plot
    data = {
        k: [scores[g].get(k, {}).get(unit, np.nan) for g in groups]
        for k in keys
    }

    x = np.arange(len(groups))  # the label locations
    width = 1 / (1 + len(keys))  # the width of the bars
    # Slot of the first bar, chosen so each cluster is centred on x + width
    first_slot = 1.5 - len(keys) / 2

    if ax is None:
        _fig, ax = plt.subplots(figsize=(8, 4), constrained_layout=True)
    ax.grid(axis="y", lw=0.5, zorder=0)

    for slot, (k, values) in enumerate(data.items()):
        ax.bar(
            x + width * (first_slot + slot),
            values,
            width,
            ec="k",
            label=k,
            zorder=3,
            color=(op_colours[k] if op_colours is not None else None)
        )

    # Axis label and one tick per group, labelled with the group name
    ax.set_ylabel(unit.replace("_per_sec", "/s"))
    ax.set_xticks(x + width, groups)

    return ax


def plot_throughput_vs_filesize(scores, op):
    """Unfinished placeholder for a throughput-versus-file-size plot.

    Currently the function only upper-cases `op`; `scores` is not used, nothing
    is drawn and the return value is None.
    """
    # TODO: the plotting itself has not been written yet
    op_name = op.upper()


# Operation whose summary files are loaded; it is part of the file name prefix.
OP = 'get'
# One summary file is parsed per combination of the parameter values below.
scores = load_scores(f'benchmark-runs/s3-summary.pure-{OP}', nodes=[1, 2], threads=[1, 20, 50], files=[250], size=['100MiB'])
# Pass the OP constant defined above: the visible cells define no top-level
# lowercase `op`, so the previous call raised NameError on a fresh kernel.
plot_throughput_vs_filesize(scores, OP)

{'1-nodes--1-threads--250-files--100MiB-size--': {'PUT': {'MiB/s': 250.55,
   'kIOPS': 0.0025099999999999996},
  'GET': {'MiB/s': 462.8, 'kIOPS': 0.00463}},
 '1-nodes--20-threads--250-files--100MiB-size--': {'PUT': {'MiB/s': 2934.49,
   'kIOPS': 0.02934},
  'GET': {'MiB/s': 4147.72, 'kIOPS': 0.041479999999999996}},
 '1-nodes--50-threads--250-files--100MiB-size--': {'PUT': {},
  'GET': {'MiB/s': 5221.14, 'kIOPS': 0.05221}},
 '2-nodes--1-threads--250-files--100MiB-size--': {'PUT': {'MiB/s': 494.75,
   'kIOPS': 0.00495},
  'GET': {'MiB/s': 870.79, 'kIOPS': 0.00871}},
 '2-nodes--20-threads--250-files--100MiB-size--': {'PUT': {'MiB/s': 4347.68,
   'kIOPS': 0.04348},
  'GET': {'MiB/s': 7839.52, 'kIOPS': 0.07840000000000001}},
 '2-nodes--50-threads--250-files--100MiB-size--': {'PUT': {'MiB/s': 5332.8,
   'kIOPS': 0.053329999999999995},
  'GET': {'MiB/s': 8165.57, 'kIOPS': 0.08166}}}

In [None]:
def plot_obj_size_comparison(scores, op, unit, ax):

    
    