In [None]:
### Inputs: traces_dict, node_details_dict and trace_details_dict
# Node details dict= nid: [nis, type]
### Config file: DB split and SLtype split
### Outputs: updated_node_details

In [1]:
import pickle
import yaml
import random

import networkx as nx
import numpy as np

ModuleNotFoundError: No module named 'numpy'

In [7]:
def pkl_to_dict(file_path):
    with open(file_path, 'rb') as pkl_file:
        T_prime = pickle.load(pkl_file)
    return T_prime

def read_yaml(file):
    with open(file, 'r') as f:
        data = yaml.safe_load(f)
    return data

def build_digraph_from_tracesdict(traces_dict):

    full_graph_edge_list = []
    for edge_list in traces_dict.values():
        full_graph_edge_list.extend(edge_list)
    G = nx.DiGraph()
    G.add_edges_from(full_graph_edge_list)

    return G


In [8]:
# Read in configs
config = read_yaml('enrichment_config.yaml')
databases = config['Databases']
sl_types = config['SLTypeSplit']

db_split_arr = [[db_name, info['percentage']] for db_name, info in databases.items()]# [[DB1, 30%],...]
sl_type_split = [[sl_type, info['percentage']] for sl_type, info in sl_types.items()]# [[Relay, 30%],...]

# Node details dict= nid: [nis, SF, DB_name] (or) [nis, SL, SL_type]
node_dets = pkl_to_dict('node_details_data.pkl')
sf_arr = [nid for nid, n_info in node_dets.items() if n_info[1] == "db"]
sl_arr = [nid for nid, n_info in node_dets.items() if n_info[1] != "db"]

sf_count = len(sf_arr)
sl_count = len(sl_arr)
total_nodes = sf_count + sl_count

def percent_to_count(arr, count):
    for idx, i in enumerate(arr):
        name = i[0]
        arr[idx] = [name,int(count * (i[1])/100)]
    return arr

db_split_arr = percent_to_count(db_split_arr, sf_count)
sl_type_split = percent_to_count(sl_type_split, sl_count)

print(len(sf_arr))
print(len(sl_arr))
print(db_split_arr)
print(sl_type_split)

def assign_nodes_to_types(arr, sfsl_arr):
    # Assign nodes to db and sl types
    for i in arr:
        name = i[0]
        for _ in range(i[1]):
            nid = sfsl_arr.pop(random.randint(0, len(sfsl_arr) - 1))
            node_dets[nid].append(name)
    return node_dets
node_dets = assign_nodes_to_types(db_split_arr, sf_arr)
node_dets = assign_nodes_to_types(sl_type_split, sl_arr)

2330
1652
[['MongoDB', 699], ['Redis', 699], ['Postgres', 932]]
[['Relay', 495], ['High', 330], ['Low', 826]]


In [7]:
check = []
for i in node_dets.values():
    if len(i) != 3:
        check.append(i)
print(len(check))
print(len(node_dets))

1
3982


In [2]:
'''
Object id Enrichment
'''

class Wl_config:
    def __init__(self, record_count, record_size_dist,\
                 data_access_pattern, rw_ratio, async_sync_ratio):
        self.record_count = record_count
        self.record_size_dist = record_size_dist
        self.data_access_pattern = data_access_pattern
        self.rw_ratio = rw_ratio
        self.async_sync_ratio = async_sync_ratio

wl1 = Wl_config(100, 'uniform', 'zipfian', 0.5, 0.1) # to be read from config file

def gen_sfnode_dataops(sf_node, G_agg, wl_config, node_dets):
    '''
    For a given sf node, generate indegree num of data ops
    Return: ops_dict= Key: op_id, Value: op_packet
    op_packet = {'op_type': op_type, 'op_obj_id': op_obj_id,\
                 'op_obj_size':op_obj_size,'db': sf_node_db}
    '''
    obj_count = wl_config.record_count
    obj_size_dist = wl_config.record_size_dist
    data_acc_pattern = wl_config.data_access_pattern
    rwr = wl_config.rw_ratio
    sf_node_db = node_dets[sf_node][2]
    total_ops = G_agg.in_degree(sf_node)# gen indeg num of data ops
    w_prob = rwr / (1 + rwr) # rw probability
      
    obj_ids_list = np.arange(1, obj_count + 1)
    if obj_size_dist == 'lognormal':
        obj_sizes = np.random.lognormal(mean=np.log(obj_count), \
                                        sigma=np.log(obj_count), \
                                        size=obj_count)
        obj_sizes_dict = dict(zip(obj_ids_list, obj_sizes))# key: obj_id, value: obj_size
    elif obj_size_dist == 'uniform':
        obj_sizes = np.random.uniform(low=1, high=obj_count, size=obj_count)
        obj_sizes_dict = dict(zip(obj_ids_list, obj_sizes))
    else:
        raise ValueError('Invalid record size distribution')
    
    if data_acc_pattern == 'zipfian':
        alpha = 1
        probabilities = np.random.zipf(alpha, len(obj_ids_list))
        probabilities /= probabilities.sum()
    elif data_acc_pattern == 'uniform':
        probabilities = np.ones(len(obj_ids_list)) / len(obj_ids_list)
    else:
        raise ValueError('Invalid data access pattern')
    
    ops_dict = {}   # key: op_id, value: op_packet
    for op_id in obj_ids_list:
        op_type = 'write' if random.random() < w_prob else 'read'
        op_obj_id = np.random.choice(obj_ids_list, len(obj_ids_list),\
                                      p=probabilities)# Select by data access pattern
        op_obj_size = obj_sizes_dict[op_obj_id]
        operation = {'op_type': op_type, 'op_obj_id': op_obj_id,\
                      'op_obj_size':op_obj_size,'db': sf_node_db} # op_packet
        ops_dict[op_id] = operation
    
    return ops_dict


# convert edges_list to node_calls_dict format
def convert_to_node_calls_dict(edges_list):
    node_calls_dict = {}
    for edge in edges_list:
        if edge[0] not in node_calls_dict:
            node_calls_dict[edge[0]] = []
        node_calls_dict[edge[0]].append(edge[1])
    return node_calls_dict

traces_dict = pkl_to_dict('traces/500nodes_4500traces.pkl')
G_agg = build_digraph_from_tracesdict(traces_dict)
for tid in traces_dict:
    t_node_calls_dict = convert_to_node_calls_dict(traces_dict[tid])
    traces_dict[tid] = t_node_calls_dict

sf_nodes = []
# Node details dict= nid: [nis, SF, DB_name] (or) [nis, SL, SL_type]
overall_data_ops = {}
for node in node_dets:
    if node_dets[node][1] == 'db':
        overall_data_ops[node] = gen_sfnode_dataops(node, G_agg, wl1, node_dets)
        pass
        




'\nObject id Enrichment\n'