In [8]:
import plotly.graph_objects as go
import pandas as pd
import networkx as nx
import math
from scipy.stats import rankdata


# Sankey related codes
# Base RGB components per edge operation; every operation owns its own dict.
EDGE_COLOR_RGBA = dict(
    none=dict(r=180, g=180, b=180),  # grey for open/close/meta
    read_only=dict(r=150, g=190, b=220),
    write_only=dict(r=150, g=190, b=220),
    # TODO: currently showing same as write
    read_write=dict(r=150, g=190, b=220),
    # Alternative values kept for reference:
    #   'read_write' : {'r':220, 'g':220, 'b':220},
    #   'write' : {'r':140, 'g':210, 'b':220},
)

# color names : https://www.w3schools.com/colors/colors_names.asp
# Fill color (CSS color name) for each node 'type' value.
COLOR_MAP = dict(
    task="Tomato",     # red
    dataset="Wheat",   # yellow
    file="SteelBlue",  # blue
    none="grey",
)

In [77]:


# Alpha channel used by edge_color_scale for links whose operation is not a key
# of EDGE_COLOR_RGBA (the grey fallback color).
OPACITY = 0.8

# data and load related methods
def humansize(nbytes):
    """Format a byte count with a decimal (1000-based) unit suffix.

    The value is divided by 1000 until it drops below 1000 or the largest
    unit ('PB') is reached, then printed with at most two decimals (trailing
    zeros and a dangling decimal point are stripped).

    Returns:
        A string such as '1.5 KB'. A zero input returns the integer 0 rather
        than a formatted string.
    """
    if nbytes == 0:
        return 0

    units = ('B', 'KB', 'MB', 'GB', 'TB', 'PB')
    last = len(units) - 1
    amount = nbytes
    rank = 0
    while amount >= 1000 and rank < last:
        amount = amount / 1000.
        rank += 1

    digits = ('%.2f' % amount).rstrip('0').rstrip('.')
    return f"{digits} {units[rank]}"
    
def humanbw(nbytes):
    """Format a bytes-per-second rate with a decimal (1000-based) unit suffix.

    The value is divided by 1000 until it drops below 1000 or the largest
    unit ('PB/s') is reached, then printed with at most two decimals
    (trailing zeros and a dangling decimal point are stripped).

    Returns:
        A string such as '2.5 MB/s'. A zero input returns the integer 0
        rather than a formatted string.
    """
    if nbytes == 0:
        return 0

    units = ('B/s', 'KB/s', 'MB/s', 'GB/s', 'TB/s', 'PB/s')
    last = len(units) - 1
    rate = nbytes
    rank = 0
    while rate >= 1000 and rank < last:
        rate = rate / 1000.
        rank += 1

    digits = ('%.2f' % rate).rstrip('0').rstrip('.')
    return f"{digits} {units[rank]}"

def _rescale_positions(values, offset):
    """Map each value to offset + (value - min) / (max - min).

    When every value is identical the span is zero; all entries then get
    `offset` instead of triggering a division by zero.
    """
    lowest = min(values.values())
    span = max(values.values()) - lowest
    if span == 0:
        return {key: offset for key in values}
    return {key: offset + (val - lowest) / span for key, val in values.items()}


def get_xy_position(G):
    """Return normalized x and y coordinates for every node that has a 'pos'.

    Args:
        G: graph whose nodes carry a 'pos' attribute indexable as
            pos[0] (x) and pos[1] (y).

    Returns:
        (x_normalized, y_normalized): two dicts keyed by node name.
        x is rescaled to [0, 1]; y is rescaled to [0.1, 1.1].
        If all nodes share the same x (or y), that axis is set to its lower
        bound (0.0 for x, 0.1 for y). If no node has a 'pos' attribute, two
        empty dicts are returned.
    """
    pos_dict = nx.get_node_attributes(G, 'pos')
    if not pos_dict:
        # Nothing to lay out; min()/max() below would fail on empty input.
        return {}, {}

    x_dict = {node: pos[0] for node, pos in pos_dict.items()}
    y_dict = {node: pos[1] for node, pos in pos_dict.items()}

    # x spans 0..1 (equivalent to shifting to start at 0, then dividing by the max).
    x_normalized = _rescale_positions(x_dict, 0.0)

    # y spans 0.1..1.1.
    # NOTE(review): the previous comment said "between 0 and 0.9", which the
    # formula never produced; the computed values are kept unchanged here.
    # Confirm the intended range.
    y_normalized = _rescale_positions(y_dict, 0.1)

    return x_normalized, y_normalized

def get_nodes_for_sankey(G, rm_tags=[], label_on=True, thickness=50):
    """Build the plotly Sankey `node` dict plus a node-name lookup table.

    Args:
        G: graph whose nodes carry a 'type' attribute (a key of COLOR_MAP)
            and the 'pos' attribute read by get_xy_position.
        rm_tags: currently unused.
        label_on: when False the 'label' list is left empty.
        thickness: stored as-is under the 'thickness' key.

    Returns:
        (node_dict_for_sankey, node_dict_ref) where node_dict_ref maps each
        node name to {'idx': position in G.nodes order, 'type': node type}.
    """
    x_pos, y_pos = get_xy_position(G)

    node_dict_ref = {}
    labels, colors, xs, ys = [], [], [], []

    for position, (name, data) in enumerate(G.nodes(data=True)):
        kind = data['type']
        if name in node_dict_ref:
            print(name, "==duplicate==")
        node_dict_ref[name] = {'idx': position, 'type': kind}

        # One entry per node, in the same order as the indices stored above.
        if label_on:
            labels.append(name)
        colors.append(COLOR_MAP[kind])
        xs.append(x_pos[name])
        ys.append(y_pos[name])

    node_dict_for_sankey = {
        'label': labels,
        'color': colors,
        'x': xs,
        'y': ys,
        'thickness': thickness,
        'line': {'width': 0},
    }
    return node_dict_for_sankey, node_dict_ref


def edge_color_scale(attr_bw, attr_op, bw, op):
    """Return an 'rgba(...)' string for one edge, shaded by its bandwidth rank.

    Args:
        attr_bw: dict mapping edge -> bandwidth.
        attr_op: dict mapping edge -> operation name.
        bw: bandwidth of the edge being colored.
        op: operation name of the edge being colored.

    Returns:
        If `op` is not a key of EDGE_COLOR_RGBA: the 'none' base color with
        alpha OPACITY. Otherwise: the base color of `op`, darkened and made
        more opaque according to the dense rank of `bw` among the bandwidths
        of all edges that share the same operation.
    """
    if op not in EDGE_COLOR_RGBA:
        fallback = EDGE_COLOR_RGBA['none']
        return f"rgba({fallback['r']}, {fallback['g']}, {fallback['b']}, {OPACITY})"

    base = EDGE_COLOR_RGBA[op]
    r, g, b = base['r'], base['g'], base['b']
    shade_span = 100

    # Bandwidths of every edge performing the same operation, ascending.
    bw_list = sorted(attr_bw[edge] for edge, edge_op in attr_op.items() if edge_op == op)

    # Dense ranks scaled so that the highest bandwidth has rank 1.0.
    dense_ranks = rankdata(bw_list, method='dense')
    scaled_ranks = [float(rank) / max(dense_ranks) for rank in dense_ranks]

    my_rank = scaled_ranks[bw_list.index(bw)]

    darken = my_rank * shade_span / 1.5
    alpha = (my_rank / max(scaled_ranks)) * 0.5 + 0.2
    return f"rgba({r - darken}, {g - darken}, {b - darken}, {alpha})"

def get_links_for_sankey(G, node_dict_ref, 
                         edge_attr=['access_cnt','access_size','operation','bandwidth'], 
                         rm_tags=[],val_sqrt=True):
    """Build the plotly Sankey `link` dict from the edges of G.

    Args:
        G: graph whose edges carry the attributes named in `edge_attr`.
        node_dict_ref: mapping node name -> {'idx': ...} as returned by
            get_nodes_for_sankey; supplies the source/target indices.
        edge_attr: attribute names in the order
            [count, size, operation, bandwidth]. Only the size, operation
            and bandwidth names (indices 1-3) are read.
        rm_tags: currently unused.
        val_sqrt: currently unused; link values are always the raw size.

    Returns:
        Dict with 'source', 'target', 'value' and 'color' lists holding one
        entry per edge, plus a 'label' list that is left empty.

    Side effects:
        Prints the bandwidth range over all edges. For a graph without edges
        a placeholder line is printed instead of failing on min()/max() of
        an empty sequence.
    """
    link_dict_for_sankey = {'source':[], 'target':[], 'value':[], 'label': [], 'color': [] }

    attr_size = nx.get_edge_attributes(G, edge_attr[1])
    attr_op = nx.get_edge_attributes(G, edge_attr[2])
    attr_bw = nx.get_edge_attributes(G, edge_attr[3])

    for u, v in G.edges():
        link_dict_for_sankey['source'].append(node_dict_ref[u]['idx'])
        link_dict_for_sankey['target'].append(node_dict_ref[v]['idx'])

        # Link width is the accessed size.
        link_dict_for_sankey['value'].append(attr_size[(u, v)])

        # Link color depends on the edge's bandwidth rank within its operation.
        # edge_color_scale re-ranks all edges on every call.
        op = attr_op[(u, v)]
        bw = attr_bw[(u, v)]
        link_dict_for_sankey['color'].append(edge_color_scale(attr_bw, attr_op, bw, op))

    if attr_bw:
        print(f"bandwidth range: {humanbw(min(attr_bw.values()))} ~ {humanbw(max(attr_bw.values()))}")
    else:
        # No edges: min()/max() would raise ValueError on the empty dict.
        print("bandwidth range: n/a (graph has no edges)")

    return link_dict_for_sankey

def selected_graph(node_name, G):
    """Return the subgraph reachable from `node_name`, keeping edge 'value'.

    Walks G level by level starting at `node_name`, copying every edge
    reported by G.edges(node) together with its 'value' attribute into a new
    DiGraph. Each node is expanded only once, so the walk terminates on
    graphs that contain cycles and does not re-traverse shared descendants.

    Note: not called anywhere in the visible part of this notebook.

    Args:
        node_name: node to start from.
        G: source graph; every traversed edge must have a 'value' attribute
            (a missing one raises KeyError).

    Returns:
        A new nx.DiGraph with the traversed edges. It is empty when
        `node_name` has no edges.
    """
    selected_G = nx.DiGraph()
    visited = {node_name}
    search_nodes = [node_name]
    while search_nodes:
        next_set = []
        for n in search_nodes:
            for edge in G.edges(n):
                selected_G.add_edges_from([edge], value=G.edges[edge]['value'])
            for neighbor in G.neighbors(n):
                # Queue each node once; without this a cycle loops forever.
                if neighbor not in visited:
                    visited.add(neighbor)
                    next_set.append(neighbor)
        search_nodes = next_set
    return selected_G

In [114]:
# Logo graph: one file feeding two datasets, which both feed one task.

# Per-node attributes: 'type' picks the node color, 'pos' is the (x, y) layout hint.
nodes_attrs = {
    "file1": {"type": "file", "pos": (0,3)},
    "dset1": {"type": "dataset", "pos": (0.5,0)},
    "dset2": {"type": "dataset", "pos": (0.7,2.14)},
    "task": {"type": "task", "pos": (1,0.338)},
}

# Per-edge statistics read by get_links_for_sankey.
edge_attr = {
    ("file1", "dset1"): {'access_cnt': 3, 'access_size': 3, 'operation': 'none', 'bandwidth': 3},
    ("file1", "dset2"): {'access_cnt': 1, 'access_size': 1, 'operation': 'none', 'bandwidth': 2},
    ("dset1", "task"): {'access_cnt': 3, 'access_size': 3, 'operation': 'none', 'bandwidth': 3},
    ("dset2", "task"): {'access_cnt': 1, 'access_size': 1, 'operation': 'none', 'bandwidth': 2},
}

# Build the 4-node graph in one pass; insertion order fixes the Sankey node indices.
G_logo = nx.DiGraph()
G_logo.add_nodes_from(nodes_attrs.items())
G_logo.add_edges_from(
    # 'value' equals 'access_size' for every edge of this logo graph.
    (src, dst, {'value': stats['access_size'], **stats})
    for (src, dst), stats in edge_attr.items()
)

nodes, nodes_dict = get_nodes_for_sankey(G_logo, label_on=False)
vfd_links = get_links_for_sankey(G_logo, nodes_dict, val_sqrt=False)

fig = go.Figure(go.Sankey(node=nodes, link=vfd_links, orientation='h'))
fig.update_layout(
    autosize=True,
    width=500,
    height=500,
    margin=dict(l=100, r=140, b=100, t=200, pad=10),
    boxmode="group",
    font=dict(size=24),
    paper_bgcolor='rgba(0,0,0,0)',
    # plot_bgcolor='rgba(0,0,0,0)',
)
fig.show()

# Optional HTML export (disabled; stat_path, SELECT_STAGE_END and test_name are not defined in this cell):
# save_html_path = f"{stat_path}/vfd-{(SELECT_STAGE_END+1)}s-{test_name}-sankey-labeled.html"
# fig.write_html(save_html_path)
# print(f"Sankey saved to {save_html_path}")

bandwidth range: 2 B/s ~ 3 B/s
