1. bench.sh creates a file per marshaller (parallel_n), operation (dump/dumpload) and mode (memory/disk)
2. create a directory per experiment setup and move the files there (see next two cells)
3. then load the data per experiment and create the plot (subsequent cells)

In [None]:
%%bash
# Create one target directory per (mode, operation) combination in a single call.
mkdir -p data/{disk,memory}/{dumpload,dump}

In [None]:
%%bash
# Sort the benchmark output files into the directory matching their mode and
# operation. "dumpload" files are moved before the plain "dump_" files of the
# same mode, mirroring the directory layout data/<mode>/<operation>.
for mode in disk memory; do
    mv *dumpload*"${mode}"*.txt "data/${mode}/dumpload"
    mv *dump_*"${mode}"*.txt "data/${mode}/dump"
done

In [None]:
import numpy as np
import os
import re

def _natural_key(name):
    """Return a sort key that orders embedded integers numerically.

    With plain string sorting "..._16_..." would come before "..._2_...".
    ``re.split`` with a capturing group alternates text and digit tokens, so
    keys of different names always compare str-with-str and int-with-int.
    """
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r"(\d+)", name)]


def _parse_timings(row, split_load):
    """Extract the recorded timings from one data row of a result file.

    Args:
        row: One line of the file; the timings are the second comma-separated field.
        split_load: True if the field holds "overall;marshall" pairs separated
            by spaces, so the load (unmarshall) time can be derived as
            ``overall - marshall``.

    Returns:
        ``(marshall, unmarshall)`` lists of floats; ``unmarshall`` is ``None``
        when the row does not allow separating dump and load times.
    """
    recorded = row.split(',')[1].strip()
    if not split_load:
        # Plain whitespace-separated list of timings.
        return [float(x) for x in recorded.split()], None

    tokens = re.split(';| ', recorded)
    overall = [float(x) for x in tokens[0::2]]
    marshall = [float(x) for x in tokens[1::2]]
    unmarshall = [total - dump for total, dump in zip(overall, marshall)]
    return marshall, unmarshall


path = "data/disk/dumpload/"
# path = "data/disk/dump/"
# path = "data/memory/dump/"
# if we use dumpload operation in memory, the emitted times contain the time for loading
# path = "data/memory/dumpload/"

# os.listdir returns names in arbitrary order, but every file contributes one
# column to the result lists, so a deterministic order is required.
# NOTE(review): assumes the core count is the only number that differs between
# file names within one directory -- confirm against the names bench.sh emits.
files = sorted(os.listdir(path), key=_natural_key)

# One inner list per data row of a file: 24GB, 48GB, 96GB
marshall_avg = [[], [], []]
marshall_std = [[], [], []]
unmarshall_avg = [[], [], []]
unmarshall_std = [[], [], []]

for filename in files:
    if not filename.endswith(".txt"):
        continue

    # only for dumpload operation on disk, we can separate easily between dump and load times
    split_load = "load" in filename and "disk" in filename

    with open(path + filename, 'r') as fh:
        rows = fh.read().strip().split("\n")

    # rows[0] is the headline; the remaining rows map onto the inner lists above.
    for idx, row in enumerate(rows[1:]):
        marshall_, unmarshall_ = _parse_timings(row, split_load)

        # Averages and standard deviations are truncated to whole numbers.
        marshall_avg[idx].append(int(np.mean(marshall_)))
        marshall_std[idx].append(int(np.std(marshall_)))

        # Only record unmarshalling numbers when they were actually derived for
        # this row; otherwise the lists stay empty.
        if unmarshall_ is not None:
            unmarshall_avg[idx].append(int(np.mean(unmarshall_)))
            unmarshall_std[idx].append(int(np.std(unmarshall_)))

In [None]:
# Turn the per-row result lists into 2-D arrays: one row per inner list
# (24GB, 48GB, 96GB), one column per processed result file.
marshalling, unmarshalling = (
    np.array(series) for series in (marshall_avg, unmarshall_avg)
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches
        

# Define the data
data_sizes = ["24 GB", "48 GB", "96 GB"]  # in GB
num_cpus = [1, 2, 4, 8, 16]

# Define new positions for each bar group
bar_positions = np.arange(len(data_sizes))
bar_positions=np.array([0,0.7,1.4])
# Adjust bar width to accommodate multiple CPU counts
bar_width = 0.12

# Create a bar plot with stacked bars for each CPU count in the same chart
fig, ax = plt.subplots(figsize=(12, 4), dpi=400)

colors=["#e8f2a1", "#afd095", "#729fcf", "#8e86ae", "#a1467e"]


for i, cpu_count in enumerate(num_cpus):
    bottom = np.zeros(len(data_sizes))
    
    # Plot suboperation1
    ax.bar(bar_positions + i * bar_width, marshalling[:, i], bar_width, color=colors[i], label=f'nCores {cpu_count}', bottom=bottom)

    bottom += marshalling[:, i]
    
    # Plot suboperation2 with hatching
    if len(unmarshalling[0] > 0):
        ax.bar(bar_positions + i * bar_width, unmarshalling[:, i], bar_width, color=colors[i], bottom=bottom, hatch='///')
        ax.bar(bar_positions + i * bar_width, [0.7,0.7,0.7], bar_width, color="black", bottom=bottom)

        ix=0
        for x in (bar_positions + i * bar_width):
            plt.text(x, marshalling[:, i][ix]+unmarshalling[:, i][ix]+28, "$S$: " + str(marshalling[:, i][ix]+unmarshalling[:, i][ix]), ha = 'center', fontsize=9)
            plt.text(x, marshalling[:, i][ix]+unmarshalling[:, i][ix]+16, "$M$: " + str(marshalling[:, i][ix]), ha = 'center', fontsize=9)
            plt.text(x, marshalling[:, i][ix]+unmarshalling[:, i][ix]+4, "$U$: " + str(unmarshalling[:, i][ix]), ha = 'center', fontsize=9)
            ix+=1
    else:
        ix=0
        for x in (bar_positions + i * bar_width):
            plt.text(x, marshalling[:, i][ix]+28, "$S$: " + str(marshalling[:, i][ix]), ha = 'center', fontsize=9)
            plt.text(x, marshalling[:, i][ix]+16, "$M$: " + str(marshalling[:, i][ix]), ha = 'center', fontsize=9)
            ix+=1

props = dict(boxstyle='round', facecolor='white', edgecolor="lightgrey")
# place a text box in upper left in axes coords
if len(unmarshalling[0] > 0):
    ax.text(0.5, 179, "$S$: Sum\n$M$: Marshalling\n$U$: Unmarshalling", bbox=props)
else:
    ax.text(0.5, 179, "$S$: Sum\n$M$: Marshalling", bbox=props)
    
ax.set_ylim([0,237])
# Set labels and title
ax.set_xlabel('Data size (in 1500 chunks, equally distributed)')
ax.set_ylabel('Time in seconds')

# Set x-ticks to be in the middle of each group
ax.set_xticks(bar_positions + (len(num_cpus) * bar_width) / 2)
ax.set_xticklabels(data_sizes)

handles, labels = ax.get_legend_handles_labels()

patch = mpatches.Patch(color='white', label='')
# manually define a new patch 
patch1 = mpatches.Patch(color='lightgrey', label='Marshalling')
# handles is a list, so append manual patch
handles.append(patch)
handles.append(patch1)

if len(unmarshalling[0] > 0):
    patch2 = mpatches.Rectangle((0, 0), 1, 1, hatch="///", fill=False, label="Unmarshalling", color="lightgrey")
    handles.append(patch2)

# plot the legend
plt.legend(handles=handles, loc='upper left', bbox_to_anchor=(0.05,0.95))

plt.tight_layout()
plt.show()
fig.savefig('marshallingunmarshalling.png')