In [9]:
import glob
import re
import pandas as pd
import numpy as np
import ast

In [10]:
# Never truncate the column list when a wide DataFrame is displayed.
pd.options.display.max_columns = None
# Show floats in fixed-point notation with five decimals.
pd.options.display.float_format = lambda value: '%.5f' % value

# Statistical Functions

In [11]:
def get_c_nodes_stats(file_path):
    """Summarise the power and memory of the IIoT and fog computing nodes.

    Reads ``<file_path>ComputingNodes.csv``, splits its rows into two groups
    by the ``id`` column (ids containing "iiot" and ids containing "f") and
    runs ``describe()`` on each group.

    Args:
        file_path: Path prefix placed directly in front of
            ``ComputingNodes.csv``.

    Returns:
        A DataFrame holding the ``power`` and ``memory`` statistics of both
        groups side by side, in columns prefixed ``iiot_power_``,
        ``iiot_memory_``, ``fog_power_`` and ``fog_memory_``. The ``count``
        statistic is dropped.
    """
    nodes = pd.read_csv("{}ComputingNodes.csv".format(file_path))

    def _describe_group(id_pattern):
        # After the transpose every described column becomes a row whose
        # name is stored in the 'index' column.
        group = nodes[nodes['id'].str.contains(id_pattern)]
        return group.describe().T.reset_index()

    def _metric_row(summary, metric, prefix):
        # Keep only the row of `metric`, namespace its columns and discard
        # the bookkeeping columns.
        row = summary[summary['index'] == metric].add_prefix(prefix)
        row = row.drop(columns=[prefix + 'index', prefix + 'count'])
        return row.reset_index(drop=True)

    iiot_summary = _describe_group("iiot")
    # NOTE(review): "f" matches every id that contains the letter f; this
    # presumably relies on only fog node ids doing so - confirm.
    fog_summary = _describe_group("f")

    pieces = [
        _metric_row(iiot_summary, 'power', 'iiot_power_'),
        _metric_row(iiot_summary, 'memory', 'iiot_memory_'),
        _metric_row(fog_summary, 'power', 'fog_power_'),
        _metric_row(fog_summary, 'memory', 'fog_memory_'),
    ]
    return pd.concat(pieces, axis=1)

In [12]:
def get_switches_stats(file_path):
    """Return the ``count`` statistic of the scenario's switch table.

    Reads ``<file_path>Switches.csv`` and runs ``describe()`` on it.

    Args:
        file_path: Path prefix placed directly in front of ``Switches.csv``.

    Returns:
        A DataFrame with the single column ``switches_count``, one row per
        column that ``describe()`` reports on.
    """
    switches_csv = "{}Switches.csv".format(file_path)
    summary = pd.read_csv(switches_csv).describe().T
    summary = summary.reset_index().add_prefix('switches_')
    return summary.loc[:, ['switches_count']]

In [13]:
def get_links_stats(file_path):
    """Summarise latency and capacity of the IIoT-switch and switch-fog links.

    Reads ``<file_path>Links.csv`` and runs ``describe()`` on two subsets of
    its rows:

    * ``IS``: ``source`` contains "iiot" and ``destination`` contains "s";
    * ``SF``: ``source`` contains "s" and ``destination`` contains "f".

    Args:
        file_path: Path prefix placed directly in front of ``Links.csv``.

    Returns:
        A DataFrame holding the ``latency`` and ``capacity`` statistics of
        both subsets side by side, in columns prefixed ``link_IS_latency_``,
        ``link_IS_capacity_``, ``link_SF_latency_`` and ``link_SF_capacity_``.
        Unlike the node statistics, the ``count`` column is kept.
    """
    links = pd.read_csv("{}Links.csv".format(file_path))

    def _describe_links(source_pattern, destination_pattern):
        # After the transpose every described column becomes a row whose
        # name is stored in the 'index' column.
        from_source = links['source'].str.contains(source_pattern)
        to_destination = links['destination'].str.contains(destination_pattern)
        return links[from_source & to_destination].describe().T.reset_index()

    def _metric_row(summary, metric, prefix):
        # Keep only the row of `metric` and namespace its columns.
        row = summary[summary['index'] == metric].add_prefix(prefix)
        return row.drop(columns=[prefix + 'index']).reset_index(drop=True)

    iiot_switch_summary = _describe_links("iiot", "s")
    switch_fog_summary = _describe_links("s", "f")

    pieces = [
        _metric_row(iiot_switch_summary, 'latency', 'link_IS_latency_'),
        _metric_row(iiot_switch_summary, 'capacity', 'link_IS_capacity_'),
        _metric_row(switch_fog_summary, 'latency', 'link_SF_latency_'),
        _metric_row(switch_fog_summary, 'capacity', 'link_SF_capacity_'),
    ]
    return pd.concat(pieces, axis=1)

In [14]:
def get_workflows_stats(file_path):
    """Summarise the workflows of a scenario, their steps and their starters.

    Reads ``<file_path>Microservices.csv``, ``<file_path>Workflows.csv`` and
    ``<file_path>ComputingNodes.csv``. Every workflow's ``chain`` is expanded
    to one row per element, joined to the microservice with the matching
    ``id`` and then to the computing node whose ``id`` equals the workflow's
    ``starter``. ``describe()`` is run on the joined table.

    Args:
        file_path: Path prefix placed directly in front of the three CSV
            file names.

    Returns:
        A DataFrame made of, side by side:

        * ``workflow_count``: number of distinct workflow ids;
        * ``workwflow_steps``: length of the chain of the workflow at
          label 0;
        * the statistics (without ``count``) of ``cycles``, ``input``,
          ``output`` and the step memory, prefixed ``workflows_steps_*``;
        * the statistics (without ``count``) of the starter node's power and
          memory, prefixed ``workflows_starter_*``.
    """
    services = pd.read_csv("{}Microservices.csv".format(file_path))
    flows = pd.read_csv("{}Workflows.csv".format(file_path))
    nodes = pd.read_csv("{}ComputingNodes.csv".format(file_path))

    # 'chain' is read as text; parse each cell back into a Python literal so
    # that it can be measured and exploded.
    flows['chain'] = flows['chain'].apply(ast.literal_eval)

    # Must be computed before the explode, while there is still one row per
    # workflow. The 'workwflow_steps' spelling is a typo, kept unchanged
    # because it is the name of an output column.
    counts = pd.DataFrame({
        'workflow_count': [flows['id'].nunique()],
        'workwflow_steps': [len(flows['chain'][0])],
    })

    steps = flows.explode('chain')
    joined = steps.merge(services, how="inner", left_on=["chain"], right_on=["id"])
    joined = joined.merge(nodes, how="inner", left_on=["starter"], right_on=["id"])
    # 'memory' exists on both sides of the second merge, so pandas suffixes
    # it: _x comes from the step side, _y from the computing node side.
    joined = joined.rename(columns={
        'memory_x': 'steps_memory',
        'memory_y': 'starter_memory',
        'power': 'starter_power',
    })

    summary = joined.describe().T.reset_index()

    def _metric_row(metric, prefix):
        # Keep only the row of `metric`, namespace its columns and discard
        # the bookkeeping columns.
        row = summary[summary['index'] == metric].add_prefix(prefix)
        row = row.drop(columns=[prefix + 'index', prefix + 'count'])
        return row.reset_index(drop=True)

    pieces = [
        counts,
        _metric_row('cycles', 'workflows_steps_cycles_'),
        _metric_row('input', 'workflows_steps_inputs_'),
        _metric_row('output', 'workflows_steps_outputs_'),
        _metric_row('steps_memory', 'workflows_steps_memory_'),
        _metric_row('starter_power', 'workflows_starter_power_'),
        _metric_row('starter_memory', 'workflows_starter_memory_'),
    ]
    return pd.concat(pieces, axis=1)


In [15]:
def get_response_stats(file_path):
    """Return the mean response time recorded for a scenario.

    The results file name is built from ``<file_path>_resp_time.csv`` with
    every occurrence of "Scenarios" in the path replaced by "Results".

    Args:
        file_path: Scenario path prefix.

    Returns:
        A one-row DataFrame whose single column ``avg_response_time`` is the
        mean of the file's ``Response time`` column.
    """
    scenario_csv = "{}_resp_time.csv".format(file_path)
    results_csv = scenario_csv.replace("Scenarios", "Results")
    timings = pd.read_csv(results_csv)
    mean_response = timings["Response time"].mean()
    return pd.DataFrame({'avg_response_time': [mean_response]})

# Scenarios

In [16]:
# Build one flat feature table per dataset split: every scenario contributes
# one block of rows made of the parameters encoded in its file name, the
# statistics of its CSV files and its average response time.
for dataset_type in ['TrainingData', 'TestData']:
    path = "{}/CSVDataset/Scenarios/*Workflows.csv".format(dataset_type)
    # A scenario is identified by the path prefix shared by its CSV files.
    # Sorted so that the row order of the output does not depend on the
    # order in which the file system happens to list the directory.
    scenarios = sorted(w.replace('Workflows.csv', '') for w in glob.glob(path))
    scenario_frames = []

    for scenario in scenarios:
        re_scenario = re.search(r'.*MEC(.*)iiot(.*)fog(.*)controllers(.*)wfpd(.*)len(.*)pw(.*)hw', scenario)
        if re_scenario is None:
            # Fail with the offending path instead of an AttributeError on None.
            raise ValueError("Scenario name does not match the expected pattern: {!r}".format(scenario))

        # Each group is the text immediately preceding the named marker.
        iiot_nodes_count = pd.DataFrame({'iiot_nodes_count': [re_scenario.group(1)]})
        fog_nodes_count = pd.DataFrame({'fog_nodes_count': [re_scenario.group(2)]})
        sdn_controllers = pd.DataFrame({'sdn_controllers': [re_scenario.group(3)]})
        workflows_per_device = pd.DataFrame({'workflows_per_device': [re_scenario.group(4)]})
        workflows_length = pd.DataFrame({'workflows_length': [re_scenario.group(5)]})
        # NOTE(review): group(6), the text before "pw", is captured but never
        # used - confirm that leaving it out of the features is intentional.
        hardware = pd.DataFrame({'hardware': [re_scenario.group(7)]})

        c_nodes = get_c_nodes_stats(scenario)
        switches = get_switches_stats(scenario)
        links = get_links_stats(scenario)
        workflows = get_workflows_stats(scenario)

        response_time = get_response_stats(scenario)

        scenario_df = pd.concat([iiot_nodes_count, fog_nodes_count, sdn_controllers, workflows_per_device, workflows_length, hardware, c_nodes, switches, links, workflows, response_time], axis=1)
        scenario_frames.append(scenario_df)

    # DataFrame.append was removed in pandas 2.0 and copied the whole table on
    # every iteration; concatenate all scenarios once instead. pd.concat
    # rejects an empty list, so a split without scenarios still produces an
    # empty frame, and therefore an empty CSV, as before.
    if scenario_frames:
        result_df = pd.concat(scenario_frames, ignore_index=True)
    else:
        result_df = pd.DataFrame()

    result_df.to_csv("{}/pre_processed.csv".format(dataset_type), index=False)