In [47]:
import pandas as pd
from glob import glob
import os

from IPython.display import display
from __future__ import unicode_literals

# Do not wrap wide DataFrame reprs across several blocks of columns.
pd.set_option('expand_frame_repr', False)
# Truncate displayed frames that have more than 50 rows.
pd.set_option('display.max_rows', 50)

# IPython magic: enable matplotlib integration. No backend is named, so the
# environment's default GUI backend is selected.
%matplotlib

Using matplotlib backend: Qt5Agg


In [31]:
def extract_time(sid, expected_done=112):
    """Derive the per-unit duration ``Tx`` from a session profile.

    Reads ``./<sid>/<sid>.prof`` as a header-less CSV, keeps only rows whose
    ``uid`` contains ``'unit'``, cleans up duplicated state rows and returns,
    for every unit, the timestamps of the remaining states together with
    ``Tx = StagingOutput - Executing``.

    Parameters
    ----------
    sid : str
        Session ID; used as both the directory name and the file stem of the
        profile that is read.
    expected_done : int or None, optional
        Number of rows in state ``'Done'`` expected once redundant ones have
        been removed. A mismatch only prints an error line; processing
        continues. ``None`` disables the check. Defaults to 112, the value
        that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``uid``, with one column per surviving state (numeric
        timestamps; unparsable values become NaN) plus the column ``Tx``.

    Raises
    ------
    KeyError
        If no ``Executing`` or no ``StagingOutput`` row survives the clean-up.
    ValueError
        Raised by ``DataFrame.pivot`` if a unit still has several rows for
        the same state after the clean-up.
    IndexError
        If a value of the profile's second column contains no ``':'``.
    """
    # NOTE: the profile's second column is labelled 'sid' here; the code only
    # relies on it being a ':'-separated string.
    profile = pd.read_csv('./%s/%s.prof' % (sid, sid),
                          header=None,
                          names=['tstamp', 'sid', 'uid', 'state', 'event', 'msg'],
                          usecols=['tstamp', 'sid', 'uid', 'state'])

    # Keep only unit profiles
    profile = profile.dropna(subset=['uid'])
    profile = profile[profile.uid.str.contains('unit')]

    # Eliminate redundant 'Done' rows: only those whose 'sid' mentions
    # 'OutputFileTransfer' are kept. Rows with any missing value are dropped
    # as well.
    from_transfer = profile.sid.str.contains('OutputFileTransfer', na=False)
    redundant_done = (profile.state == 'Done') & ~from_transfer
    profile = profile[~redundant_done].dropna()

    n_done = int((profile.state == 'Done').sum())
    if expected_done is not None and n_done != expected_done:
        print('ERROR: expected %d rows in state Done, found %d'
              % (expected_done, n_done))

    # Purge useless info from 'sid': keep only the part after the first ':'
    profile = profile.assign(
        sid=profile.sid.apply(lambda name: name.split(':')[1]))
    profile = profile.reset_index(drop=True)

    # Keep only execution-related states
    txs = profile[profile.state.isin(['Executing',
                                      'StagingOutput',
                                      'AgentStagingOutputPending'])].copy()

    # Profiles are a mess in 0.42 :(
    # - Duplicates of state StagingOutput
    # - Sparse presence of state AgentStagingOutputPending
    #
    # 1. Units with two or more StagingOutput rows lose the ones whose 'sid'
    #    contains 'Thread'.
    is_staging = txs.state == 'StagingOutput'
    n_staging = is_staging.astype(int).groupby(txs.uid).transform('sum')
    txs = txs[~(is_staging & (n_staging >= 2) &
                txs.sid.str.contains('Thread'))]

    # 2. Units that (still) have a StagingOutput row lose their
    #    AgentStagingOutputPending rows. The counts are recomputed because
    #    step 1 may have removed every StagingOutput row of a unit.
    is_staging = txs.state == 'StagingOutput'
    has_staging = is_staging.astype(int).groupby(txs.uid).transform('sum') > 0
    txs = txs[~((txs.state == 'AgentStagingOutputPending') & has_staging)]

    # We are done with sid
    txs = txs.drop('sid', axis=1)

    # Calculate $T_x$
    txs = txs.assign(tstamp=pd.to_numeric(txs.tstamp, errors='coerce'))
    txs = txs.pivot(index='uid', columns='state', values='tstamp')
    txs['Tx'] = txs['StagingOutput'] - txs['Executing']

    return txs

In [40]:
def extract_namd_time(sid, exec_profile=None):
    """Collect the duration reported at the end of each unit's STDOUT.

    The execution profile (a CSV file) lists the units. Its first two lines
    are skipped. For each remaining line whose second field, split on
    ``'_'``, has one of the integers 4, 5 or 6 as its second piece, the file
    ``<sid>-pilot.0000/<uid>/STDOUT`` is opened and a number is parsed from
    its last line: the text between ``'~'`` and the next ``','``, minus its
    final character. That number divided by 8 is stored for the unit.

    Parameters
    ----------
    sid : str
        Session ID; prefix of the pilot sandbox directory and, by default,
        suffix of the execution profile file name.
    exec_profile : str or None, optional
        Path of the execution profile. Defaults to
        ``./execution_profile_<sid>.csv``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``uid`` (sorted), with the single column
        ``'NAMD Duration'``.

    Raises
    ------
    IOError / OSError
        If the execution profile or a unit's STDOUT cannot be opened.
    IndexError, ValueError
        If a profile line or the last STDOUT line does not have the expected
        layout.
    """
    if exec_profile is None:
        exec_profile = './execution_profile_%s.csv' % sid

    # NOTE(review): presumably the stage numbers of the units that ran NAMD --
    # confirm against the workflow definition.
    namd_stages = (4, 5, 6)
    # NOTE(review): meaning of this divisor is not visible here (cores per
    # unit?) -- confirm before changing.
    divisor = 8

    with open(exec_profile, 'r') as exec_prof:
        # The first two lines are skipped, as in the original implementation.
        rows = exec_prof.readlines()[2:]

    durations = {}
    for line in rows:
        if not line.strip():
            # Tolerate blank lines (e.g. at the end of the file).
            continue

        fields = [field.strip() for field in line.split(',')]
        if int(fields[1].split('_')[1].strip()) not in namd_stages:
            continue

        uid = fields[0]
        with open('%s-pilot.0000/%s/STDOUT' % (sid, uid), 'r') as stdout:
            last_line = stdout.readlines()[-1]

        # Text between '~' and the next ',', minus its final character
        # (presumably a unit suffix such as 's').
        namd_dur = float(
            last_line.split('~')[1].strip().split(',')[0].strip()[:-1])
        durations[uid] = namd_dur / divisor

    # Build the frame once instead of growing it row by row.
    df = pd.DataFrame({'NAMD Duration': pd.Series(durations, dtype=float)})
    df.index.name = 'uid'

    return df.sort_index()

In [52]:
# Both extractors read files named after the same session, so the ID is kept
# in a single place.
SESSION_ID = 'rp.session.two.jdakka.017398.0008'

rp_prof = extract_time(SESSION_ID)
namd_prof = extract_namd_time(SESSION_ID)

# Join on the shared 'uid' index. merge() defaults to an inner join, so units
# missing from either frame are dropped.
comb_prof = rp_prof.merge(namd_prof, left_index=True, right_index=True)

# Relabel by name rather than by position, so a change in the set or order of
# columns cannot silently attach the wrong label.
comb_prof = comb_prof.rename(columns={'Tx': 'Tx from RP',
                                      'NAMD Duration': 'Tx from NAMD'})
# The pivot in extract_time names the column index 'state'; clear it so the
# frame displays as it did with the former list assignment.
comb_prof.columns.name = None

In [54]:
# Plot both Tx columns against the frame's index (the unit IDs).
ax = comb_prof.plot(kind='line', y=['Tx from RP', 'Tx from NAMD'])

ax.set_xlabel('uids')
ax.set_ylabel('Time (seconds)')
# The trailing ';' keeps the Text object returned by set_title out of the
# cell output.
ax.set_title('Tx for each unit obtained from RP profiles and NAMD logs');

<matplotlib.text.Text at 0x7f4d315a6150>