In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os

%matplotlib 

pd.set_option('display.precision',14)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 15)
pd.set_option('display.width', 1000)
pd.set_option('display.width', 1000)

Using matplotlib backend: TkAgg


In [2]:
def get_resourcehandle_dataframe(path):
    
    f = glob.glob('%s/entk.resource*.prof'%path)
    df = pd.read_csv(f[0])
    df = df.drop(0)
    df = df.drop(8)
    df1 = df.drop(df.columns[[1,2,3,5]], axis=1)
    df1.columns = ['time', 'event']
    
    return df1

In [3]:
def get_execplugin_dataframe(path):

    f = glob.glob('%s/entk.exec*.prof'%path)
    df = pd.read_csv(f[0])
    df = df.drop(0)
    df1 = df.drop(df.columns[[1,2,3,5]], axis=1)  
    df1 = df1[df1.event != 'Adding workload']
    df1 = df1[df1.event != 'Instantiated']
    df1 = df1[df1.event != 'Adding manager']
    
    df2 = pd.DataFrame(columns=['time','event'])
    cnt=0
    for row in df1.iterrows():
        df2.loc[cnt] = [row[1]['#time'], row[1]['event']]
        cnt+=1
      
    #print df2
    return df2

In [4]:
def get_appman_dataframe(path):
    
    f = glob.glob('%s/entk.appmanager.0000.prof'%path)[0]
    
    df = pd.read_csv(f, sep=',', nrows=9,skiprows=1,header=0)
    df = df.drop(0)
    df1 = df.drop(df.columns[[1,2,3,5]],axis=1)
    df1.columns = ['time', 'event']
    #print df1
    
    df = pd.read_csv(f, sep=',', skiprows=13,
                    usecols=['time', 'name', 'uid', 'state', 'pattern', 'iteration', 'stage', 'task', 'event', 'msg'],
                    names=['time', 'name', 'uid', 'state', 'pattern', 'iteration', 'stage', 'task', 'event', 'msg'],
                    skipinitialspace=True)
    df2 = df.drop(df.columns[[1,2,3,4,9]],axis=1)
    
    df3 = df1.merge(df2, how='outer')
    
    #print df3      
    return df3

In [9]:
def compute_overheads(df_am, df_rh, df_ep, tasks, stages):
    
    #-------------------------------------------------------------------------
    # Resource Handle
    #-------------------------------------------------------------------------
    # Cost of allocate
    overhead_rh_1 = df_rh['time'][5] - df_rh['time'][2]
    overhead_rh_2 = df_rh['time'][9] - df_rh['time'][6]
    
    # Cost of run
    overhead_rh_3 = df_rh['time'][11] - df_rh['time'][10]
    
    # Cost of deallocation
    overhead_rh_4 = df_rh['time'][15] - df_rh['time'][13]
    
    # Overhead in resource handle
    overhead_rh = overhead_rh_1 + overhead_rh_2 + overhead_rh_3 + overhead_rh_4
    
    #-------------------------------------------------------------------------
    
    #-------------------------------------------------------------------------
    # App Manager
    #-------------------------------------------------------------------------
    
    # Cost of registering kernels
    overhead_am_1 = df_am['time'][2] - df_am['time'][1] + \
                    df_am['time'][4] - df_am['time'][3] + \
                    df_am['time'][6] - df_am['time'][5] + \
                    df_am['time'][8] - df_am['time'][7] + \
                    df_am['time'][10] - df_am['time'][9]
    
    # Cost of creating record
    overhead_am_2 = df_am['time'][12] - df_am['time'][11]
    
    # Cost of maintaining eop
    iteration = 1
    max_stage = stages
    max_task=tasks
    
    
    
    overhead_am_3 = 0.0

    for stage in range(1, max_stage+1):
        
        if stage == 1:
                        
            for task in range(1, max_task+1):
                temp_df = df_am[(df_am.stage == 'stage: %s'%stage)&(df_am.task == 'task: %s'%task)]
                
                if task == 1:
                    #print temp_df
                    
                    overhead_am_3 += temp_df['time'][temp_df[temp_df.event == 'creating'].index.tolist()[0]] - \
                    temp_df['time'][temp_df[temp_df.event == 'starting'].index.tolist()[0]]
                    
                overhead_am_3 += temp_df['time'][temp_df[temp_df.event == 'recorded'].index.tolist()[0]] - \
                temp_df['time'][temp_df[temp_df.event == 'recording'].index.tolist()[0]]
                    
                overhead_am_3 += temp_df['time'][temp_df[temp_df.event == 'determining next stage'].index.tolist()[0]] - \
                temp_df['time'][temp_df[temp_df.event == 'determined next stage'].index.tolist()[0]]                
                
        else:
            
            for task in range(1, max_task+1):
                temp_df = df_am[(df_am.stage == 'stage: %s'%stage)&(df_am.task == 'task: %s'%task)]
                
                #if task == 1:
                #    print temp_df
                #print 'stage:',stage,' task:',task
                #print 'creating:',temp_df['time'][temp_df[temp_df.event == 'creating'].index.tolist()[0]]
                #print 'starting:',temp_df['time'][temp_df[temp_df.event == 'starting'].index.tolist()[0]]
                
                overhead_am_3 += temp_df['time'][temp_df[temp_df.event == 'creating'].index.tolist()[0]] - \
                temp_df['time'][temp_df[temp_df.event == 'starting'].index.tolist()[0]] 
                    
                #print temp_df['time'][temp_df[temp_df.event == 'recorded'].index.tolist()[0]]
                
                #overhead_am_3 += temp_df['time'][temp_df[temp_df.event == 'recorded'].index.tolist()[0]] - \
                #temp_df['time'][temp_df[temp_df.event == 'recording'].index.tolist()[0]]
                    
                overhead_am_3 += temp_df['time'][temp_df[temp_df.event == 'determining next stage'].index.tolist()[0]] - \
                temp_df['time'][temp_df[temp_df.event == 'determined next stage'].index.tolist()[0]]
                
    # Overhead in app manager
    overhead_am = overhead_am_1 + overhead_am_2  
    
    #-------------------------------------------------------------------------
    
    #-------------------------------------------------------------------------
    # Exec plugins
    #-------------------------------------------------------------------------
    
    # Cost of creating + submitting units
    overhead_ep = 0.0
    cnt=0
    while cnt < df_ep.time.count()/2:
        diff  = df_ep['time'][2*cnt+1] - df_ep['time'][2*cnt]
        overhead_ep += diff
        cnt+=1
    #-------------------------------------------------------------------------
    
    #print overhead_rh, overhead_am, overhead_ep
    return overhead_rh + overhead_am + overhead_ep
    #print df_am

In [10]:
def get_overheads(tasks, path, stage):
    
    df1 = get_appman_dataframe(path)    
    df2 = get_resourcehandle_dataframe(path)
    df3 = get_execplugin_dataframe(path)
   
    overhead = compute_overheads(df1, df2, df3, tasks, stage)
    
    return overhead

In [18]:
def get_exectime():
    
    df = pd.read_csv('./multi-stage-profiles.csv',skipinitialspace=True)
    return df

In [16]:
task = 128
stages = [2,3,4,5]
overheads = []

for stage in stages:
    path = './stages_%s'%(stage)
    overheads.append(get_overheads(task, path, stage))

In [17]:
print overheads

[159.72899770736694, 196.68650197982788, 210.84520173072815, 218.0603015422821]


In [19]:
timing_df = get_exectime()
timing_df['overhead'] = overheads
timing_df.columns = ['Tasks','Cores','Execution time','EnTK overhead']
plot_df = pd.DataFrame(columns=['Execution time','EnTK overhead'])
for row in timing_df.iterrows():
    task = int(row[1]['Tasks'])
    core = int(row[1]['Cores'])
    et = row[1]['Execution time']
    ov = row[1]['EnTK overhead']
    plot_df.loc['%s,%s'%(task,core)] = [et, ov]
print timing_df

   Tasks  Cores       Execution time       EnTK overhead
0    128    256  1287.37560009999993  159.72899770736694
1    128    256  1961.75740027000006  196.68650197982788
2    128    256  2611.84870004999993  210.84520173072815
3    128    256  3384.55259991000003  218.06030154228210


In [10]:
plt.figure()
FONTSIZE=26
ax = plot_df.plot(kind='bar', y=['EnTK overhead','Execution time'], fontsize=FONTSIZE,
                   title='EnTK execution time and overhead with constant tasks-to-cores ratio')
ax.set_xlabel('Tasks,Cores (cores per task=2)', fontsize=FONTSIZE)
ax.set_ylabel('Time (seconds)', fontsize=FONTSIZE)
ax.set_title(ax.get_title(),fontsize=FONTSIZE+4)
ax.set_ylim(0,1800)
plt.legend(loc=1,prop={'size':FONTSIZE})
plt.xticks(rotation=0)

(array([0, 1, 2, 3, 4]), <a list of 5 Text xticklabel objects>)