In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
from radical.entk import Profiler
import radical.analytics as ra
import radical.utils as ru
import radical.pilot as rp

%matplotlib



Using matplotlib backend: TkAgg


In [38]:
# Pipeline counts for which profiles are available. The loaders below look for
# each value N in the directory './null-ws-<N*8>cores/'.

pipelines_list = [8, 16]

In [39]:
# Generate task uids

def get_task_uids(num_pipelines, stages_per_pipeline=7, tasks_per_stage=1):
    """Return the list of EnTK task uids for a run with ``num_pipelines`` pipelines.

    The uids are 'radical.entk.task.0000', 'radical.entk.task.0001', ... with
    one entry per task, i.e.
    ``num_pipelines * stages_per_pipeline * tasks_per_stage`` entries in total.

    :param num_pipelines: number of pipelines in the run
    :param stages_per_pipeline: stages in each pipeline (default 7)
    :param tasks_per_stage: tasks in each stage (default 1)
    :return: list of uid strings, in increasing numeric order
    """
    num_tasks = num_pipelines * stages_per_pipeline * tasks_per_stage
    return ['radical.entk.task.%04d' % t for t in range(num_tasks)]

In [54]:
# Get information from all the entk profiles

def get_entk_info(pipelines):
    """Return the EnTK-level duration of all tasks of one run.

    Profiles are read from './null-ws-<pipelines*8>cores/'.

    :param pipelines: number of pipelines of the run; selects the profile
                      directory and the set of task uids to look at
    :return: the value reported by ``Profiler.duration`` for the run's task
             uids between the 'SCHEDULING' and 'DONE' states (presumably
             seconds -- the plot labels this axis 'Time (seconds)'; confirm)
    """
    profiler = Profiler(src='./null-ws-{0}cores/'.format(pipelines*8))
    task_uids = get_task_uids(pipelines)

    # Only the task-level span is reported. AppManager / resource-manager
    # setup and teardown are not part of the returned value; if they are ever
    # needed they can be queried per component via ``events=[start, end]``:
    #   'radical.entk.appmanager.0000':
    #       ['create amgr obj', 'init rreq submission']
    #       ['start termination', 'termination done']
    #   'radical.entk.resource_manager.0000':
    #       ['create rmgr obj', 'rmgr obj created']
    #       ['creating rreq', 'rreq submitted']
    #       ['canceling resource allocation', 'resource allocation cancelled']
    return profiler.duration(task_uids, states=['SCHEDULING', 'DONE'])

In [55]:
# Get information from all the rp profile files and json file
# returns 0,0 if no rp files are found

def get_rp_info(pipelines):
    """Return ``(exec_dur, rp_dur)`` for the RP session of one run.

    The session is located through the first '*.json' file found in
    './null-ws-<pipelines*8>cores/'; the session id is that file's name
    without the '.json' suffix.

    :param pipelines: number of pipelines of the run; selects the directory
    :return: tuple ``(exec_dur, rp_dur)`` where
             ``exec_dur`` is the unit duration from AGENT_EXECUTING to
             AGENT_STAGING_OUTPUT_PENDING and ``rp_dur`` the unit duration
             from UMGR_SCHEDULING_PENDING to DONE.
             ``(0, 0)`` is returned when no json file is found or when the
             session cannot be analysed (the failure is printed).
    """
    src = './null-ws-{0}cores/'.format(pipelines*8)

    json_files = glob.glob(src + '*.json')
    print(json_files)

    if not json_files:
        # No RP session was recorded for this run.
        return 0, 0

    try:
        json_file = json_files[0]
        # The parsed content is not used; the call is kept so an unreadable
        # file still fails here and ends up in the handler below.
        ru.read_json(json_file)
        sid = os.path.basename(json_file)[:-5]   # drop the '.json' suffix

        session = ra.Session(sid, 'radical.pilot', src=src)
        units = session.filter(etype='unit', inplace=False)

        exec_dur = units.duration([rp.AGENT_EXECUTING, rp.AGENT_STAGING_OUTPUT_PENDING])
        rp_dur = units.duration([rp.UMGR_SCHEDULING_PENDING, rp.DONE])
    except Exception as exc:
        # Best effort: keep the notebook running, but do not hide the reason.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        print('get_rp_info: RP analysis failed for %s: %r' % (src, exc))
        return 0, 0

    return exec_dur, rp_dur


In [60]:
# Build one row per run: [EnTK overhead, RP overhead], indexed by the number
# of pipelines. Rows are collected first and the frame is created once, so the
# columns get a numeric dtype instead of being enlarged row by row from an
# empty frame.
# Note: when get_rp_info() falls back to (0, 0), the whole EnTK duration is
# shown as 'EnTK overhead' for that run.
rows = []

for pipelines in pipelines_list:

    entk_dur = get_entk_info(pipelines)
    exec_dur, rp_dur = get_rp_info(pipelines)

    print('%s %s %s' % (entk_dur, rp_dur, exec_dur))
    rows.append([entk_dur - rp_dur, rp_dur])

df = pd.DataFrame(rows, index=pipelines_list, columns=['EnTK overhead', 'RP overhead'])

['./null-ws-64cores/rp.session.two.jdakka.017393.0001.json']
148.697399855 112.780900002 102.019399881
['./null-ws-128cores/rp.session.two.jdakka.017395.0000.json']
136.606400013 133.335100174 126.931900024


In [61]:
# Bar chart of the per-run overheads, saved as a PDF next to the notebook.
plot_title = (
    'Time taken by EnTK and RP to execute a workflow consisting of \n'
    ' X Pipelines, 7 Stages per Pipeline and 1 Task per Stage on NCSA.BW '
    '(Task executable = "sleep 0", number of trials per data point = 1)'
)

ax = df.plot(kind='bar', title=plot_title)
ax.set(xlabel='Number of Pipelines', ylabel='Time (seconds)')

# Resize the figure that holds the bar chart before writing it out.
fig = ax.get_figure()
fig.set_size_inches(18.5, 10.5)
fig.savefig('null_worload_devel.pdf', dpi=100)