In [1]:
import pandas as pd
import glob
import os
%matplotlib inline

# Display options used when inspecting the profile frames: 14 digits of
# precision and room for wide/tall tables. ('display.width' was previously
# set twice with the same value; one call is enough.)
pd.set_option('display.precision', 14)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 15)
pd.set_option('display.width', 1000)

# Ensemble sizes to analyse; each value selects one './ensemble-<n>-core-<n>'
# directory in the driver cell further down.
tasks = [16, 32, 64, 128, 256]
# One total overhead per entry of `tasks`, filled in by the driver cell.
overheads = []

In [2]:
def get_appman_dataframe(path):
    """Load the app-manager profile found under `path` into one DataFrame.

    The file '<path>/entk.appmanager.0000.prof' is read in two passes:

    * pass 1 takes the header line plus the first 15 data rows, discards the
      row labelled 0 and keeps only the columns at positions 0 and 4, which
      are renamed to 'time' and 'event';
    * pass 2 skips the first 17 lines of the file and reads the remainder
      with ten explicit column names, keeping 'time', 'iteration', 'stage',
      'task' and 'event'.

    Both parts are combined with an outer merge on the columns they share.

    Parameters
    ----------
    path : str
        Directory that contains the app-manager profile.

    Returns
    -------
    pandas.DataFrame
        Outer merge of the two parts.

    Raises
    ------
    IndexError
        If no file matches the profile name under `path`.
    """
    prof_file = glob.glob('%s/entk.appmanager.0000.prof' % path)[0]

    # Pass 1: leading section of the profile (assumed to be the six-column
    # layout, since exactly two columns must remain after the positional drop).
    head = pd.read_csv(prof_file, sep=',', nrows=15).drop(0)
    head_events = head.drop(head.columns[[1, 2, 3, 5]], axis=1)
    head_events.columns = ['time', 'event']

    # Pass 2: everything after line 17, parsed with the ten-column layout.
    wide_names = ['time', 'name', 'uid', 'state', 'pattern',
                  'iteration', 'stage', 'task', 'event', 'msg']
    body = pd.read_csv(prof_file, sep=',', skiprows=17,
                       usecols=wide_names, names=wide_names,
                       skipinitialspace=True)
    body_events = body.drop(body.columns[[1, 2, 3, 4, 9]], axis=1)

    return head_events.merge(body_events, how='outer')

In [3]:
def get_resourcehandle_dataframe(path):
    """Load the resource-handle profile found under `path`.

    The first file matching '<path>/entk.resource*.prof' is read as CSV, the
    rows labelled 0 and 8 are discarded, and only the columns at positions 0
    and 4 are kept, renamed to 'time' and 'event'. The original row labels
    are preserved, so the result has gaps at 0 and 8.

    Parameters
    ----------
    path : str
        Directory that contains the resource-handle profile.

    Returns
    -------
    pandas.DataFrame
        Two-column frame ('time', 'event').

    Raises
    ------
    IndexError
        If no file matches the pattern under `path`.
    KeyError
        If the file has no row labelled 0 or 8.
    """
    matches = glob.glob('%s/entk.resource*.prof' % path)
    raw = pd.read_csv(matches[0])

    # Rows 0 and 8 are skipped by label; the reason is not visible in this
    # notebook (presumably non-event bookkeeping rows -- TODO confirm).
    kept_rows = raw.drop(0).drop(8)

    unwanted = kept_rows.columns[[1, 2, 3, 5]]
    trimmed = kept_rows.drop(unwanted, axis=1)
    trimmed.columns = ['time', 'event']
    return trimmed

In [4]:
def get_execplugin_dataframe(path):
    """Load the execution-plugin profile found under `path`.

    The first file matching '<path>/entk.exec*.prof' is read as CSV and the
    row labelled 0 is discarded. Rows whose event is 'Adding workload',
    'Instantiated' or 'Adding manager' are removed, and the remaining
    '#time' / 'event' values are returned as a frame with columns
    'time' and 'event' and a fresh 0..n-1 index.

    The result is built with one vectorised selection instead of appending
    one row at a time, which gives the same values without the per-row cost.

    Parameters
    ----------
    path : str
        Directory that contains the execution-plugin profile.

    Returns
    -------
    pandas.DataFrame
        Two-column frame ('time', 'event') indexed 0..n-1.

    Raises
    ------
    IndexError
        If no file matches the pattern under `path`.
    """
    f = glob.glob('%s/entk.exec*.prof' % path)
    df = pd.read_csv(f[0])
    df = df.drop(0)
    df1 = df.drop(df.columns[[1, 2, 3, 5]], axis=1)

    # Events that are filtered out before the remaining rows are paired up
    # by the overhead computation.
    excluded_events = ['Adding workload', 'Instantiated', 'Adding manager']
    keep = ~df1.event.isin(excluded_events)

    df2 = df1.loc[keep, ['#time', 'event']].reset_index(drop=True)
    df2.columns = ['time', 'event']
    return df2

In [5]:
def _event_time(rows, event):
    """Return the 'time' of the first row in `rows` whose event equals `event`.

    Raises IndexError when no row carries that event.
    """
    first_label = rows[rows.event == event].index.tolist()[0]
    return rows['time'][first_label]


def _resource_handle_overhead(df_rh):
    """Sum the allocate, run and deallocate intervals of the resource handle.

    The intervals are taken between fixed row labels of `df_rh['time']`.
    """
    times = df_rh['time']

    # Cost of allocate (two separate intervals)
    allocate = (times[5] - times[2]) + (times[9] - times[6])
    # Cost of run
    run = times[11] - times[10]
    # Cost of deallocation
    deallocate = times[15] - times[13]

    return allocate + run + deallocate


def _app_manager_setup_overhead(df_am):
    """Sum the kernel-registration and record-creation intervals.

    Kernel registration is five consecutive (start, end) pairs at row labels
    1..10; record creation is the pair at labels 11 and 12.
    """
    times = df_am['time']

    # Cost of registering kernels. Kept as a single left-to-right expression
    # so floating-point rounding matches the previous results exactly.
    register_kernels = times[2] - times[1] + \
                       times[4] - times[3] + \
                       times[6] - times[5] + \
                       times[8] - times[7] + \
                       times[10] - times[9]

    # Cost of creating record
    create_record = times[12] - times[11]

    return register_kernels + create_record


def _eop_overhead(df_am, max_stage=5, max_task=16):
    """Accumulate the per-task 'maintaining eop' terms of the app manager.

    For every (stage, task) pair the rows tagged 'stage: <s>' / 'task: <t>'
    are selected and event-time differences are added up:

    * 'creating' - 'starting': for task 1 of stage 1, and for every task of
      the later stages;
    * 'recorded' - 'recording': for every task of stage 1 only;
    * 'determining next stage' - 'determined next stage': for every task.

    NOTE(review): the last term subtracts the 'determined' time from the
    'determining' time, the opposite orientation to 'recorded' - 'recording'.
    Verify the intended sign before using this value.

    Raises IndexError if an expected event is missing for some pair.
    """
    total = 0.0

    for stage in range(1, max_stage + 1):
        first_stage = (stage == 1)

        for task in range(1, max_task + 1):
            rows = df_am[(df_am.stage == 'stage: %s' % stage) &
                         (df_am.task == 'task: %s' % task)]

            if not first_stage or task == 1:
                total += _event_time(rows, 'creating') - \
                         _event_time(rows, 'starting')

            if first_stage:
                total += _event_time(rows, 'recorded') - \
                         _event_time(rows, 'recording')

            total += _event_time(rows, 'determining next stage') - \
                     _event_time(rows, 'determined next stage')

    return total


def _exec_plugin_overhead(df_ep):
    """Sum time[2k+1] - time[2k] over consecutive row pairs of `df_ep`.

    Floor division is used for the number of pairs, so a trailing unpaired
    row is ignored. With true division (Python 3) an odd row count would
    make the loop read a label that does not exist.
    """
    times = df_ep['time']
    n_pairs = int(df_ep.time.count()) // 2

    total = 0.0
    for pair in range(n_pairs):
        total += times[2 * pair + 1] - times[2 * pair]
    return total


def compute_overheads(df_am, df_rh, df_ep):
    """Return the total overhead derived from the three profile frames.

    Parameters
    ----------
    df_am : pandas.DataFrame
        App-manager events with 'time', 'event', 'stage' and 'task' columns
        and integer row labels (labels 1..12 are read directly).
    df_rh : pandas.DataFrame
        Resource-handle events with a 'time' column and integer row labels
        (labels 2, 5, 6, 9, 10, 11, 13 and 15 are read directly).
    df_ep : pandas.DataFrame
        Execution-plugin events with a 'time' column indexed 0..n-1; rows
        are consumed as consecutive (start, end) pairs.

    Returns
    -------
    float
        resource-handle overhead + app-manager overhead (kernel registration
        and record creation) + execution-plugin overhead.

    Raises
    ------
    KeyError
        If one of the fixed row labels is missing.
    IndexError
        If an expected per-task event is missing from `df_am`.
    """
    # Resource Handle
    overhead_rh = _resource_handle_overhead(df_rh)

    # App Manager
    overhead_am = _app_manager_setup_overhead(df_am)

    # Cost of maintaining eop.
    # NOTE(review): this value is computed but is NOT part of the returned
    # total, exactly as before, so previously reported numbers stay valid.
    # It also only covers tasks 1..16 regardless of the ensemble size.
    # Confirm whether it should be added to `overhead_am`.
    overhead_am_3 = _eop_overhead(df_am)

    # Exec plugins: cost of creating + submitting units
    overhead_ep = _exec_plugin_overhead(df_ep)

    return overhead_rh + overhead_am + overhead_ep

In [6]:
def get_overheads(path):
    """Compute the total overhead for the run whose profiles live in `path`.

    Loads the app-manager, resource-handle and execution-plugin profiles
    from `path` and passes the three frames to `compute_overheads`.

    Parameters
    ----------
    path : str
        Directory that contains the three '.prof' files of one run.

    Returns
    -------
    The value returned by `compute_overheads` for that run.
    """
    appman_events = get_appman_dataframe(path)
    resource_events = get_resourcehandle_dataframe(path)
    execplugin_events = get_execplugin_dataframe(path)

    return compute_overheads(appman_events, resource_events, execplugin_events)

In [7]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every result to `overheads`.
overheads = []
for task in tasks:
    # Each ensemble size has its own profile directory.
    path = './ensemble-%s-core-%s' % (task, task)
    overheads.append(get_overheads(path))

In [8]:
# Function-call form prints the same text on Python 2 and is valid on Python 3.
print(overheads)

[36.7955002784729, 54.969303846359253, 70.818997144699097, 118.07860064506531, 250.48350405693054]
