In [2]:
#load the filtered trace CSV, list the workflow runs it contains and define the experiment factors
import sys
import os
import chart_studio.plotly as py
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from ipywidgets import interact, interactive, fixed, interact_manual
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go

# Experiment factors. wfs, sizes, clusters and aligners are offered as dropdown
# choices by the @interact plots further down.
wfs = ["RS1", "RS2"]
sizes = ["small", "medium", "large"]
run_numbers = ["1", "2", "3"]
clusters = ["redwood", "kubernetes-cluster"]
aligners = ["STAR", "kallisto", "HISAT2", "Salmon"]

dfs = pd.read_csv("result-filtered-columns.csv")

# One row per (infrastructure, version, aligner, dataset size, run number)
# combination present in the trace table; count() holds the non-null cells per column.
_run_keys = ["infrastructure", "wf_version", "aligner", "dataset_size", "run_number"]
x = dfs.groupby(_run_keys).count()
x.reset_index(inplace=True)
x[_run_keys].to_csv("workflows_run.csv")
print(x[(x.infrastructure=="redwood")])

# Keep only the named trace columns; this drops the "Unnamed: 0.N" index
# columns that the CSV carries along.
dfs = dfs[[
    'task_id', 'hash', 'native_id', 'process', 'tag', 'name', 'status', 'exit',
    'module', 'container', 'cpus', 'time', 'disk', 'memory', 'attempt',
    'submit', 'start', 'complete', 'duration', 'realtime', 'queue',
    '%cpu', '%mem', 'rss', 'vmem', 'peak_rss', 'peak_vmem',
    'rchar', 'wchar', 'syscr', 'syscw', 'read_bytes', 'write_bytes',
    'vol_ctxt', 'inv_ctxt', 'env', 'workdir',
    'wf_version', 'dataset_size', 'run_number', 'infrastructure', 'aligner', 'hostname',
]]

# The original line was `print(x.co)`, which raises AttributeError because the
# frame has no such attribute; listing the columns is presumably what was meant.
print(x.columns)

    infrastructure wf_version   aligner dataset_size  run_number  \
54         redwood        RS1    HISAT2        large           1   
55         redwood        RS1    HISAT2        large           2   
56         redwood        RS1    HISAT2        large           3   
57         redwood        RS1    HISAT2       medium           1   
58         redwood        RS1    HISAT2       medium           2   
..             ...        ...       ...          ...         ...   
122        redwood        RS2  kallisto       medium           2   
123        redwood        RS2  kallisto       medium           3   
124        redwood        RS2  kallisto        small           1   
125        redwood        RS2  kallisto        small           2   
126        redwood        RS2  kallisto        small           3   

     Unnamed: 0.97  Unnamed: 0.96  Unnamed: 0.95  Unnamed: 0.94  \
54               8              8              8              8   
55               8              8              8 

In [25]:
#transform data from B/KB/MB/GB/TB strings to MB; from str to float
categories = ["read_bytes", "write_bytes", "rss", "vmem","rchar","wchar"]


def _to_megabytes(value):
    """Convert one trace cell to a float number of megabytes.

    Values that already parse as a number are returned unchanged (no unit
    conversion is applied to them). Strings of the form "<number> <unit>" are
    scaled with decimal factors, as the original cell did for KB/MB/GB.
    Anything else becomes NaN instead of silently reusing the previous row's
    value, which is what the old `else: pass` branch did.

    NOTE(review): the factors are 1000-based; confirm whether the tool that
    wrote the trace uses 1024-based units.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        pass  # not a plain number: try "<number> <unit>" below

    parts = str(value).split(" ")
    try:
        amount = float(parts[0])
    except ValueError:
        return float("nan")

    unit = parts[-1]
    if unit == "MB":
        return amount
    if unit == "GB":
        return amount * 1000
    if unit == "KB":
        return amount / 1000
    if unit == "B":
        return amount / 1000000
    if unit == "TB":
        return amount * 1000000
    # Unknown unit: mark as missing rather than guessing.
    return float("nan")


for el in categories:
    dfs[el] = [_to_megabytes(value) for value in dfs[el].tolist()]

In [26]:
#transform duration from string to seconds
def _duration_to_seconds(value):
    """Convert a duration such as "1h 2m 3.5s" to a number of seconds.

    The string is split on spaces and every token is scaled by its unit
    suffix. Besides the s/m/h suffixes handled before, "ms" and "d" tokens are
    understood: a "ms" token used to be taken for seconds and crashed in
    float(), and a "d" token was silently ignored.
    NOTE(review): "ms"/"d" presumably occur in the trace's duration format -
    confirm against the tool that produced the CSV.

    Non-string input (an already converted number when the cell is re-run, or
    NaN for a missing value) is passed through as a float, which keeps the
    cell idempotent.
    """
    if not isinstance(value, str):
        return float("nan") if value is None else float(value)

    seconds = 0
    for token in value.split(" "):
        # "ms" must be tested before "s" and "m".
        if token.endswith("ms"):
            seconds += float(token[:-2]) / 1000
        elif token.endswith("s"):
            seconds += float(token[:-1])
        elif token.endswith("m"):
            seconds += (60*float(token[:-1]))
        elif token.endswith("h"):
            seconds += (3600*float(token[:-1]))
        elif token.endswith("d"):
            seconds += (86400*float(token[:-1]))
    return seconds


dfs["duration"] = [_duration_to_seconds(value) for value in dfs["duration"].tolist()]


In [27]:
#for each workflow run, get duration of wf
# A run is identified by version, infrastructure, aligner, dataset size and run number.
_wf_run_keys = ["wf_version","infrastructure","aligner","dataset_size","run_number"]
df_tmp = dfs[_wf_run_keys + ["start","complete"]]

# Earliest task start and latest task completion per run.
time_per_wf = df_tmp.groupby(_wf_run_keys).agg({"start": "min","complete": "max"})

_stamp_format = "%Y-%m-%d %H:%M:%S.%f"
start = time_per_wf["start"].tolist()
end = time_per_wf["complete"].tolist()
start_stamps = [datetime.strptime(stamp, _stamp_format) for stamp in start]
end_stamps = [datetime.strptime(stamp, _stamp_format) for stamp in end]

# Wall-clock span of each run, expressed in minutes.
time_d = [
    (finished - begun).total_seconds()/60
    for begun, finished in zip(start_stamps, end_stamps)
]
time_per_wf["wf_duration"] = time_d
time_per_wf = time_per_wf.reset_index()


In [28]:
#show per wf-version, dataset and aligner: read/written bytes, rss, vmem per task
categories = ["read_bytes", "write_bytes", "vmem", "rss", "duration", "rchar", "wchar","syscr","syscw"]

@interact
def plot_features(generate_plotly=False, workflow_version = wfs, dataset = sizes, category = categories, infrastructure = clusters, aligner = aligners) :
    """Strip-plot one metric per process for a single workflow configuration.

    The list defaults are turned into dropdowns by ``interact`` and the boolean
    into a checkbox. The rows of ``dfs`` matching the chosen dataset size,
    workflow version, infrastructure and aligner are drawn with seaborn; when
    ``generate_plotly`` is ticked, the same selection is also written to a
    plotly HTML file in the working directory.
    """
    metric = str(category)

    # One boolean mask per selected factor, combined into a single row filter.
    same_dataset = dfs.dataset_size == str(dataset)
    same_version = dfs.wf_version == str(workflow_version)
    same_infra = dfs.infrastructure == str(infrastructure)
    same_aligner = dfs.aligner == str(aligner)
    selection = dfs.loc[same_dataset & same_version & same_infra & same_aligner]

    sb.stripplot(data=selection, y="process", x=metric)
    heading = "\n".join([
        aligner,
        infrastructure,
        " workflow: " + workflow_version + ", dataset: " + str(dataset),
    ])
    plt.title(heading)
    plt.show()

    if generate_plotly != True:
        return

    axis_labels = {"process": "Process name", metric: label(category)}
    plot_title = metric + " for " + ", ".join([workflow_version, aligner, infrastructure, dataset])
    fig = px.strip(
        data_frame=selection,
        y="process",
        x=metric,
        color="process",
        template="plotly_dark",
        color_discrete_sequence=px.colors.sequential.Plasma_r,
        labels=axis_labels,
        title=plot_title,
    )
    fig.update_traces({'marker':{'size': 15}})

    # The file name encodes the full selection so exports do not overwrite each other.
    path = "./single_category-" + "_".join(
        [workflow_version, dataset, aligner, infrastructure, category]
    ) + ".html"
    fig.write_html(path)
    
    
def label(category):
    """Return the human-readable axis label for a metric column name.

    Args:
        category: column name such as "read_bytes", "rss" or "syscw".

    Returns:
        The display label for known metrics. Unknown names are returned
        unchanged so the axis still gets a meaningful title; the previous
        if/elif chain returned None for them, and also for "syscw", because
        its branch tested "syscr" a second time.
    """
    axis_labels = {
        "read_bytes": "Bytes read (MB)",
        "vmem": "Virtual memory (MB)",
        "rss": "Resident set size (MB)",
        "write_bytes": "Bytes written (MB)",
        "duration": "Process duration (seconds)",
        "wchar": "Written characters (MB)",
        "rchar": "Read characters (MB)",
        "syscr": "syscr",
        "syscw": "syscw",
    }
    return axis_labels.get(category, category)

interactive(children=(Checkbox(value=False, description='generate_plotly'), Dropdown(description='workflow_ver…

In [7]:

#show per wf-version, dataset and aligner: read/written bytes, rss, vmem per task
categories = ["read_bytes", "write_bytes", "vmem", "rss", "duration"]
plt.figure()

@interact
def version_comparison(generate_plotly = False, dataset = sizes, category = categories, infrastructure = clusters, aligner = aligners) :
    """Compare one metric per process between the workflow versions.

    Rows of ``dfs`` matching the chosen dataset size, infrastructure and
    aligner are drawn as a seaborn strip plot with one column per
    ``wf_version``. When ``generate_plotly`` is ticked, a faceted plotly
    version of the same selection is written to an HTML file.
    """
    sb.set(style="ticks")
    metric = str(category)

    # Combine the three factor masks into a single row filter.
    same_dataset = dfs.dataset_size == str(dataset)
    same_infra = dfs.infrastructure == str(infrastructure)
    same_aligner = dfs.aligner == str(aligner)
    selection = dfs.loc[same_dataset & same_infra & same_aligner]

    sb.catplot(data=selection, kind="strip", y="process", x=metric, col="wf_version")
    plt.show()

    if generate_plotly != True:
        return

    axis_labels = {"process": "Process name", metric: label(category)}
    plot_title = metric + ": comparing RS1 and RS2 for " + ", ".join([aligner, infrastructure, dataset])
    fig = px.strip(
        data_frame=selection,
        y="process",
        x=metric,
        facet_col="wf_version",
        color="process",
        template="plotly_dark",
        color_discrete_sequence=px.colors.sequential.Plasma_r,
        labels=axis_labels,
        title=plot_title,
    )
    fig.update_traces({'marker':{'size': 15}})

    # The file name encodes the selection so exports do not overwrite each other.
    path = "./comparison-" + "_".join([dataset, aligner, infrastructure, category]) + ".html"
    fig.write_html(path)


<Figure size 640x480 with 0 Axes>

interactive(children=(Checkbox(value=False, description='generate_plotly'), Dropdown(description='dataset', op…

In [8]:
#show runtimes & average runtimes, compared for RS1 and RS2
plt.figure()
@interact
def plot_overall_runtime(generate_plotly = False, dataset = sizes, infrastructure = clusters, aligner = aligners) :
    """Plot per-run and mean workflow runtime (minutes) for RS1 and RS2.

    Rows of ``time_per_wf`` matching the chosen dataset size, infrastructure
    and aligner are drawn as points, with the per-version mean as a bar behind
    them. When ``generate_plotly`` is ticked, a plotly strip plot with a white
    line at each version's mean is written to an HTML file.
    """
    sb.set(style="ticks")
    # Fixed category order: the seaborn overlay and the plotly mean lines both
    # rely on RS1 being the left category and RS2 the right one.
    version_order = ["RS1", "RS2"]

    df_runtime = time_per_wf[(time_per_wf.dataset_size == str(dataset)) \
              & (time_per_wf.infrastructure == str(infrastructure)) \
              & (time_per_wf.aligner == str(aligner))]
    sb.stripplot(data=df_runtime, x="wf_version", y="wf_duration", color="k", order=version_order)
    # numeric_only=True: the frame still holds string columns (infrastructure,
    # start, complete), on which mean() raises TypeError in pandas >= 2.0.
    df_runtime_grouped = df_runtime.groupby(["wf_version","dataset_size","aligner"]).mean(numeric_only=True)
    df_runtime_grouped.reset_index(inplace=True)
    sb.barplot(data = df_runtime_grouped, x="wf_version",y="wf_duration", color="0.7", order=version_order)
    plt.title("Average workflow runtime for RS1 and RS2")
    plt.show()

    if generate_plotly:
        fig = px.strip(data_frame = df_runtime, x = "wf_version", y = "wf_duration",
                       template="plotly_dark",
                       color_discrete_sequence = px.colors.sequential.Jet_r,
                       category_orders={"wf_version": version_order},
                       labels={"wf_version": "Workflow version",
                               "wf_duration": "Workflow duration in minutes"},
                       title="comparing the overall workflow runtime for RS1 and RS2 for " + aligner + ", " \
                       + infrastructure + ", " + dataset)
        fig.update_traces({'marker':{'size': 15}})

        # Mean runtime per version. A version without runs for this selection
        # is simply absent, instead of raising IndexError on `.tolist()[0]`.
        mean_by_version = dict(zip(df_runtime_grouped["wf_version"].tolist(),
                                   df_runtime_grouped["wf_duration"].tolist()))
        # Paper-relative x extents of the mean line for each version.
        line_spans = {"RS1": (.2, .3), "RS2": (.72, .82)}
        mean_lines = [
            dict(type='line', x0=x0, y0=mean_by_version[version],
                 x1=x1, y1=mean_by_version[version],
                 xref='paper', yref='y',
                 line_width=3, line_color='white')
            for version, (x0, x1) in line_spans.items()
            if version in mean_by_version
        ]
        fig.update_layout(shapes=mean_lines)

        path = "./overall-runtime-" + dataset + "_" \
        + aligner + "_" + infrastructure + ".html"
        fig.write_html(path)


<Figure size 640x480 with 0 Axes>

interactive(children=(Checkbox(value=False, description='generate_plotly'), Dropdown(description='dataset', op…

In [9]:
#show alignment
plt.figure()
@interact
def plot_runtime_alignment(generate_plotly = False, dataset = sizes, infrastructure = clusters, aligner = aligners) :
    """Plot the duration (seconds) of the alignment process for RS1 and RS2.

    Rows of ``dfs`` matching the chosen dataset size, infrastructure and
    aligner are restricted to the four alignment processes and drawn as
    points, with the per-version mean as a bar behind them. When
    ``generate_plotly`` is ticked, a plotly strip plot with a white line at
    each version's mean is written to an HTML file.
    """
    sb.set(style="ticks")
    # Fixed category order: rows of `dfs` are in file order, so without it the
    # strip points, the bars and the plotly mean lines could land on different
    # versions.
    version_order = ["RS1", "RS2"]
    alignment_processes = ["rnaseq:kallisto_map", "STAR_ALIGN", "HISAT2_ALIGN", "SALMON_ALIGN_QUANT"]

    tmp = dfs[(dfs.dataset_size == str(dataset)) \
              & (dfs.infrastructure == str(infrastructure)) \
              & (dfs.aligner == str(aligner)) \
              & dfs.process.isin(alignment_processes)]
    sb.stripplot(data = tmp, y = "duration", x = "wf_version", color="k", order=version_order)
    # numeric_only=True: the trace frame holds many string columns, on which
    # mean() raises TypeError in pandas >= 2.0.
    df_grouped = tmp.groupby(["wf_version","dataset_size","aligner"]).mean(numeric_only=True)
    df_grouped.reset_index(inplace=True)
    sb.barplot(data = df_grouped, x="wf_version",y="duration", color="0.7", order=version_order)
    # Spelling fixed in the user-facing title ("selceted" -> "selected").
    plt.title("Compared duration (seconds) of the alignment process of the selected dataset, aligner and infra")
    plt.show()

    if generate_plotly:
        fig = px.strip(data_frame = tmp, x = "wf_version", y = "duration",
                       template="plotly_dark",
                       color_discrete_sequence = px.colors.sequential.Jet_r,
                       category_orders={"wf_version": version_order},
                       # Spelling fixed in the axis label ("sceonds" -> "seconds").
                       labels={"wf_version": "Workflow version",
                               "duration": "Process duration in seconds"},
                       title="comparing the runtime of the single alignment processes for RS1 and RS2 for " + aligner + ", " \
                       + infrastructure + ", " + dataset)
        fig.update_traces({'marker':{'size': 15}})

        # Mean duration per version. A version without matching tasks is
        # simply absent, instead of raising IndexError on `.tolist()[0]`.
        mean_by_version = dict(zip(df_grouped["wf_version"].tolist(),
                                   df_grouped["duration"].tolist()))
        # Paper-relative x extents of the mean line for each version.
        line_spans = {"RS1": (.2, .3), "RS2": (.7, .8)}
        mean_lines = [
            dict(type='line', x0=x0, y0=mean_by_version[version],
                 x1=x1, y1=mean_by_version[version],
                 xref='paper', yref='y',
                 line_width=3, line_color='white')
            for version, (x0, x1) in line_spans.items()
            if version in mean_by_version
        ]
        fig.update_layout(shapes=mean_lines)

        path = "./alignment-runtime-" + dataset + "_" \
        + aligner + "_" + infrastructure + ".html"
        fig.write_html(path)


<Figure size 640x480 with 0 Axes>

interactive(children=(Checkbox(value=False, description='generate_plotly'), Dropdown(description='dataset', op…