In [69]:
import plotly.graph_objects as go
import pandas as pd

# reading input log file
import sys
from csv import excel

# Directory that holds the input trace files; SankeyData.plot() also writes
# its HTML output here.
path='../tmp_outputs'
# Number of tasks iterated over by sd_load_file (task indices 0 .. task_cnt-1).
task_cnt = 8

# Earlier Excel-based inputs, kept for reference (read via stat_to_df):
# fsim = f'{path}/hm-sim.f100.n{task_cnt}.r100.log.xlsx'
# fagg = f'{path}/hm-agg.f100.n{task_cnt}.r100.log.xlsx'

# CSV traces loaded with pd.read_csv further down: one for the simulation
# stage (fsim) and one for the aggregation stage (fagg).
fsim=f'{path}/prov-sim.csv'
fagg=f'{path}/prov-agg.csv'

In [70]:
def humansize(nbytes):
    """Format a byte count as a short decimal-unit string such as '1.86 MB'.

    The value is divided by 1000 per unit step (B, KB, MB, GB, TB, PB) until
    it drops below 1000 or the largest unit is reached. The number is rendered
    with at most two decimals; trailing zeros and a dangling decimal point are
    removed.
    """
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    largest = units[-1]
    unit = units[0]
    for unit in units:
        # Stop on the first unit that fits, or when no larger unit exists.
        if unit == largest or not nbytes >= 1000:
            break
        nbytes /= 1000.
    number = ('%.2f' % nbytes).rstrip('0').rstrip('.')
    return f'{number} {unit}'

In [71]:
class SankeyData:
    """Accumulates file/task I/O statistics and renders them as a Plotly Sankey diagram.

    Nodes are tasks and files; links are read/write transfers between them.
    Typical use: ``load_stat`` once per (task, file, I/O type) combination,
    then ``build_links`` and ``plot``.

    Per-instance state (created in ``__init__`` so that instances never share it):
        df_stat: list of dicts with keys 'df', 'filename', 'type', 'size', 'task_name'.
        nodes:   DataFrame with at least the columns 'label' and 'color'; the row
                 index is the node id used as link source/target.
        links:   DataFrame with columns 'source', 'target', 'label', 'value'
                 once ``build_links`` has run.
    """

    # Node colour per node category.
    color_map = {"task": "red",
                 "file": "blue",
                 "none": "grey",
                }
    block_size = 1024 # default block_size
    output_suffix = "hermesvfd-example"

    def __init__(self):
        self._init_state()

    def _init_state(self):
        """Create empty per-instance containers (shared by __init__ and reset)."""
        # From Tazer
        self.df_stat = []
        # Sankey in Plotly, node - vertex, link - edge
        self.nodes = pd.DataFrame(columns=['label', 'color'])
        self.links = pd.DataFrame(columns=['source', 'target', 'value', 'label'])

    def load_stat(self, df_stat, params):
        """Register one I/O record and make sure its task and file nodes exist.

        Args:
            df_stat: DataFrame with the rows belonging to this record.
            params: dict with the keys 'filename', 'type', 'size' and
                'task_name'. build_links accepts "read" or "write" as 'type'
                and uses 'size' as the link value.
        """
        self.df_stat.append({ "df": df_stat,
                              "filename": params['filename'],
                              "type": params['type'],
                              "size": params['size'],
                              "task_name": params['task_name']
                            })

        # add a node entry (task)
        self.set_taskname(params['task_name'])

        # add a node entry (input/output file)
        # one filename entry is allowed
        if len(self.nodes[self.nodes.label == params['filename']]) == 0:
            self.add_node({'label': params['filename'],
                           'color': self.color_map['file']})
            # Disabled placeholder nodes (kept for reference):
            # self.add_node({'label': 'no read',
            #           'color': self.color_map['none']})
            # self.add_node({'label': 'no write',
            #           'color': self.color_map['none']})

    def set_taskname(self, name):
        """Add a task node labelled `name` unless a node with that label exists."""
        if len(self.nodes[self.nodes.label == name]) == 0:
            self.add_node({'label': name,
                           'color': self.color_map['task']})

    def set_output_suffix(self,suffix):
        """Set the suffix used in the HTML file name written by plot()."""
        self.output_suffix = suffix
        print(f"Output Name : sankey-diagram-plotly-{self.output_suffix}.html")

    def get_node(self, idx):
        """Return the node row at integer position `idx`."""
        # .iloc selects a row; plain [] on a DataFrame would look up a column.
        return self.nodes.iloc[idx]

    def add_node(self, node_dict):
        """Append one node; `node_dict` maps column names to values."""
        # label, color, customdata, x, y
        row = pd.DataFrame([node_dict])
        self.nodes = pd.concat([self.nodes, row], ignore_index=True)

    def get_link(self, name):
        """Return the column `name` of the links table."""
        return self.links[name]

    def set_links(self):
        pass

    def set_block_size(self,size):
        """Set the block size used by build_links when no_rw=True."""
        self.block_size =  size

    def _node_index(self, label):
        """Return the integer node id (row index) of the node labelled `label`.

        Raises:
            KeyError: if no node carries that label.
        """
        matches = self.nodes.index[self.nodes.label == label]
        if len(matches) == 0:
            raise KeyError(f"no node labelled {label!r}")
        return int(matches[0])

    def build_links(self, key='block_idx', no_rw=False):
        """Rebuild `self.links` from every record added through load_stat.

        A "read" record yields a file -> task link and a "write" record a
        task -> file link; the link value is the record's 'size'.

        Args:
            key: unused; accepted for backward compatibility with callers.
            no_rw: when True, an additional unlabelled link is emitted per record.
                NOTE(review): these extra links point at node ids fidx + 1 /
                fidx + 2, which presumably referred to the 'no read' /
                'no write' placeholder nodes that load_stat no longer creates —
                confirm before enabling.

        Raises:
            ValueError: if a record's type is neither "read" nor "write".
            KeyError: if a record's file or task has no node.
        """
        links = []
        for v in self.df_stat:
            # input (r)
            # cnt = v['df'][key].nunique()
            cnt = v['size'] # candice added for link size
            io_type = v['type']
            fidx = self._node_index(v['filename'])
            tidx = self._node_index(v['task_name'])

            t2n = None
            if io_type == "read":
                t2f = {'source': fidx,
                       'target': tidx,
                       'label': f'read (~ {humansize(cnt)})',
                       'value': cnt}
                if no_rw:
                    t2n = {'source': fidx + 1,
                           'target': tidx,
                           'value': (v['size'] / self.block_size) - cnt}
            elif io_type == "write":
                t2f = {'source': tidx,
                       'target': fidx,
                       'label': f'write (~ {humansize(cnt)})',
                       'value': cnt}
                if no_rw:
                    t2n = {'source': tidx,
                           'target': fidx + 2,
                           'value': (v['size'] / self.block_size) - cnt}
            else:
                # Without this guard the previous record's link would be
                # appended a second time (or an unbound name would be hit).
                raise ValueError(f"unknown I/O type {io_type!r}; expected 'read' or 'write'")

            links.append(t2f)
            if no_rw:
                links.append(t2n)

        # Explicit columns keep the table well-formed even when there are no records.
        self.links = pd.DataFrame(links, columns=['source', 'target', 'label', 'value'])

    def plot(self):
        """Show the Sankey figure and save it as HTML.

        The file is written to
        ``{path}/sankey-diagram-plotly-{output_suffix}.html`` where `path` is
        the notebook-level output directory.
        """
        node_spec = self.nodes[['label','color']].to_dict('list')
        link_spec = self.links.to_dict('list')
        fig = go.Figure(data=[go.Sankey(
            node = node_spec,
            link = link_spec)])
        fig.show()
        fig.write_html(f"{path}/sankey-diagram-plotly-{self.output_suffix}.html")

    def reset(self):
        """Discard all records, nodes and links so the instance can be reused."""
        self._init_state()

In [72]:
# Columns requested from the stat spreadsheet (passed to read_excel as usecols).
read_columns=['type','Time_Start(ns)','Access_Size','Filename','Filesize','Num_Datasets','Dataset_Name',
'Dataset_Offset','Dataset_N_Dimension','Dataset_N_Points','Dataset_Dimension']

# loading hermesvfd stat files into pandas dataframe
def stat_to_df(fname):
    """Load a hermesvfd stat spreadsheet and normalise its columns.

    Args:
        fname: path of the Excel file to read.

    Returns:
        DataFrame with an added 'access_idx' column (a copy of the row index),
        string-typed 'file_name'/'operation', missing 'file_name'/'dset_name'
        values replaced by "", int64 'Filesize'/'access_size', and the
        "molecular_dynamics_runs/stage0000/" prefix removed from 'file_name'.

    NOTE(review): `read_columns` does not list 'file_name', 'operation',
    'dset_name' or 'access_size', yet those columns are accessed below. Since
    `usecols` restricts the loaded columns to `read_columns`, these look-ups
    appear to fail with KeyError — confirm the intended schema before relying
    on this function.
    """
    df = pd.read_excel(fname, usecols=read_columns)
    # The original called df.fillna("") here and discarded the result, which
    # had no effect; the columns that need it are filled explicitly below.

    df['access_idx'] = df.index # just name for placeholder

    df['file_name'] = df['file_name'].astype('string')
    df['operation'] = df['operation'].astype('string')
    df['file_name'] = df['file_name'].fillna("")
    df['dset_name'] = df['dset_name'].fillna("")

    df['Filesize'] = df['Filesize'].astype('int64')
    df['access_size'] = df['access_size'].astype('int64')

    # Assign the result back: an in-place replace on a selected column does
    # not reliably update the frame (and never does under Copy-on-Write).
    df['file_name'] = df['file_name'].replace("molecular_dynamics_runs/stage0000/", "", regex=True)

    return df


In [73]:
dsets = ["contact_map", "point_cloud"]
# I/O kinds to extract; each is matched as a prefix of the 'operation' column.
io_type = ["read", "write"]
# File name of the aggregated result and of each task's simulation result.
agg_result = "aggregate.h5"
sim_result = "residue_100.h5"

def sd_load_file(dfsim, dfagg, sd, task_cnt):
    """Feed per-file read/write totals from both traces into `sd`.

    For every I/O kind in `io_type` and every task index below `task_cnt`,
    the rows whose 'file_name' starts with ``task<NNNN>/<sim_result>`` are
    selected from the simulation trace and from the aggregation trace, and
    their summed 'access_size' is registered through ``sd.load_stat`` (task
    names 'sim_emulator.py' and 'aggregate.py' respectively). Afterwards the
    aggregation-trace rows whose 'file_name' contains `agg_result` are
    registered under 'aggregate.py'.

    Args:
        dfsim: simulation trace; needs the columns 'operation', 'file_name'
            and 'access_size'.
        dfagg: aggregation trace with the same columns.
        sd: object exposing ``load_stat(df, params)``.
        task_cnt: number of tasks to scan.
    """
    def rows_for(df, io, name_mask):
        # na=False keeps the mask strictly boolean when cells are missing.
        return df[df['operation'].str.startswith(io, na=False) & name_mask]

    for io in io_type:
        for i in range(task_cnt):
            # Zero-pad to four digits so indices >= 10 still name the right
            # directory (task0010, not task00010).
            taskname = f"task{i:04d}/{sim_result}"
            file_label = f'residue_100_output.h5 ({i})'

            # read in sim stats, then agg stats
            for trace, task_name in ((dfsim, 'sim_emulator.py'),
                                     (dfagg, 'aggregate.py')):
                # Literal prefix test: '.' in the file name is not a wildcard.
                selected = rows_for(
                    trace, io,
                    trace['file_name'].str.startswith(taskname, na=False))
                sd.load_stat(selected, {'filename': file_label,
                                        'type': io,
                                        'size': selected['access_size'].sum(),
                                        'task_name': task_name})

        selected = rows_for(
            dfagg, io,
            dfagg['file_name'].str.contains(agg_result, regex=False, na=False))
        sd.load_stat(selected, {'filename': f'{agg_result}',
                                'type': io,
                                'size': selected['access_size'].sum(),
                                'task_name': 'aggregate.py'})

In [74]:

# Earlier Excel-based loading path, kept for reference:
# dfsim = stat_to_df(fsim)
# dfagg = stat_to_df(fagg)

# Load the simulation and aggregation traces from CSV. sd_load_file reads the
# columns 'operation', 'file_name' and 'access_size' from both frames.
dfsim = pd.read_csv(fsim)
dfagg = pd.read_csv(fagg)

# print(dfagg.columns)


In [75]:
# Build the Sankey model from the two traces and render it.
sd = SankeyData()

# sd_load_dset(dfsim,dfagg,sd,task_cnt)
sd_load_file(dfsim,dfagg,sd,task_cnt)

# block_size is only read by build_links when no_rw=True, so it does not
# influence the links built below.
sd.set_block_size(65536)
# `key` is accepted by build_links but not used in its body.
sd.build_links(key='access_idx')

# The suffix becomes part of the HTML file name written by plot().
sd.set_output_suffix("vol-file")
sd.plot()

Output Name : sankey-diagram-plotly-vol-file.html


In [32]:
# Inspect the node table; the row index is the node id that links use as
# source/target.
# NOTE(review): the saved output of this cell carries an older execution count
# than the cells above, so it may not reflect the current code — re-run to refresh.
sd.nodes

Unnamed: 0,label,color
0,sim_emulator.py,red
1,residue_100_output.h5 (0),blue
2,aggregate.py,red
3,residue_100_output.h5 (1),blue
4,residue_100_output.h5 (2),blue
5,residue_100_output.h5 (3),blue
6,residue_100_output.h5 (4),blue
7,residue_100_output.h5 (5),blue
8,residue_100_output.h5 (6),blue
9,residue_100_output.h5 (7),blue


In [33]:
# Inspect the link table produced by build_links (source/target are node ids).
# NOTE(review): the saved output of this cell carries an older execution count
# than the cells above and its label text differs from what build_links
# currently formats, so it is likely stale — re-run to refresh.
sd.links

Unnamed: 0,source,target,label,value
0,1,0,read (approx 1.86 MB),1861900
1,1,2,read (approx 0 B),0
2,3,0,read (approx 1.86 MB),1859616
3,3,2,read (approx 0 B),0
4,4,0,read (approx 1.86 MB),1861844
5,4,2,read (approx 0 B),0
6,5,0,read (approx 1.86 MB),1860216
7,5,2,read (approx 0 B),0
8,6,0,read (approx 1.86 MB),1861208
9,6,2,read (approx 0 B),0
