In [None]:
import sys 
import os
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd
import seaborn as sns
from utils.mlflow_query import MlflowHelper
import numpy as np
from pathlib import Path
from src.features import knowledge
import json
from matplotlib import pyplot as plt

In [None]:
# Connect to the MLflow store that holds the experiment runs analysed below.
# NOTE(review): the local mlruns directory and the tracking URI are
# machine-specific (absolute home path, localhost port) - adjust them before
# running this notebook on another machine.
mlflow_helper = MlflowHelper(pkl_file=None, 
    local_mlflow_dir_prefix='/home/i40/almasrirz/Domain-Guided-Monitoring/mlruns/', 
    experiment_id="831333387297563441",
    tracking_uri="http://localhost:5200")
# Presumably populates `mlflow_helper.run_df`, which later cells filter on - verify in utils.mlflow_query.
mlflow_helper.query_runs()

In [None]:
def plot_best_metric_bar(
    metric_df,
    metric_names,
    x_feature_name='data_tags_model_type',
    x_order=['simple', 'causal_heuristic', 'causal_score'],
    save_fig=False,
    row_feature_name=None,
    hue_feature_name=None,
    col_feature_name=None,
    titles=None,
    palette=None,
    dodge=True,
    col_order=None,
    height=5,
    aspect=1,  # width = height * aspect
):
    """Draw one seaborn box-plot grid per metric.

    Args:
        metric_df: Data passed to ``sns.catplot`` as ``data``.
        metric_names: Iterable of column names; each one is plotted on the
            y axis of its own figure.
        x_feature_name: Column used for the x axis.
        x_order: Order of the x categories (never mutated here).
        save_fig: When true, each grid is saved as ``bar_<metric>.png`` in
            the current working directory.
        row_feature_name / hue_feature_name / col_feature_name: Optional
            columns used for facet rows, hue and facet columns.
        titles: Forwarded as the first positional argument of
            ``FacetGrid.set_titles``.
        palette, dodge, col_order, height, aspect: Forwarded unchanged to
            ``sns.catplot``.

    Returns:
        None. The figures are created (and optionally saved) as a side effect.
    """
    # Everything except the y column is identical for every metric.
    shared_kwargs = dict(
        data=metric_df,
        x=x_feature_name,
        order=x_order,
        row=row_feature_name,
        hue=hue_feature_name,
        col=col_feature_name,
        col_order=col_order,
        kind="box",
        sharey='row',
        palette=palette,
        dodge=dodge,
        height=height,
        aspect=aspect,
    )

    for metric in metric_names:
        grid = sns.catplot(y=metric, **shared_kwargs)
        grid.set_titles(titles)
        grid.set_axis_labels('', metric)

        # Show the x tick labels on every facet, not only on the bottom row.
        for facet_ax in grid.axes.flatten():
            facet_ax.tick_params(labelbottom=True)

        if not save_fig:
            continue
        grid.savefig("bar_{}.png".format(metric))

In [None]:
def count_attributes(df: pd.DataFrame, attr='data_tags_model_type'):
    """Count the rows of ``df`` per distinct value of column ``attr``.

    Returns a Series indexed by the values of ``attr`` and sorted by count,
    largest first.
    """
    per_value = df.groupby([attr])[attr]
    occurrences = per_value.count()
    return occurrences.sort_values(ascending=False)

In [None]:
def categorize_run_df(run_df, experimented_algos = ["causal_Fast-IAMB-jt", "causal_Fast-IAMB-smc-cor", "causal_MMPC-cor"]):
    """Return a copy of ``run_df`` with ``data_tags_model_type`` refined.

    The model-type tag is extended in three steps:

    1. For every algorithm in ``experimented_algos``, rows whose bin size and
       bin overlap match one of the known configurations are renamed to
       ``<algo>+<size label>-<overlap label>``.
    2. Rows with a fine drain threshold of ``'0.77'`` get ``_more_templates``.
    3. Rows get ``_with_ts`` / ``_without_ts`` depending on whether
       ``remove_dates_from_payload`` is ``'False'`` / ``'True'``.

    All comparisons are against string values; ``run_df`` is not modified.
    """
    size_col = 'data_params_TimeSeriesTransformerConfigbin_size'
    overlap_col = 'data_params_TimeSeriesTransformerConfigbin_overlap'
    drain_col = 'data_params_HuaweiPreprocessorConfigfine_drain_log_st'
    dates_col = 'data_params_HuaweiPreprocessorConfigremove_dates_from_payload'
    type_col = 'data_tags_model_type'

    # label -> (accepted bin_size spellings, accepted bin_overlap spellings)
    small = ['00:00:05']
    medium = ['0:01:00', '00:00:60']
    large = ['00:10:00']
    no_overlap = ['00:00:00']
    bin_configs = {
        'small-none': (small, no_overlap),
        'medium-none': (medium, no_overlap),
        'large-none': (large, no_overlap),
        'small-small': (small, ['00:00:01']),
        'medium-small': (medium, ['00:00:12']),
        'large-small': (large, ['00:02:00']),
        'small-medium': (small, ['00:00:03']),
        'medium-medium': (medium, ['00:00:30']),
        'large-medium': (large, ['00:05:00']),
        'small-large': (small, ['00:00:04']),
        'medium-large': (medium, ['00:00:48', '00:00:50']),
        'large-large': (large, ['00:08:00']),
    }

    matrix_df = run_df.copy()

    def add_suffix(row_mask, suffix):
        # Append `suffix` to the model type of every selected row.
        matrix_df.loc[row_mask, type_col] = matrix_df.loc[row_mask, type_col].apply(
            lambda model_type: model_type + suffix
        )

    # Categorize by bin configuration. A renamed row no longer equals `algo`,
    # so it cannot be relabelled by a later configuration.
    for algo in experimented_algos:
        for label, (bin_sizes, bin_overlap) in bin_configs.items():
            in_bin = matrix_df[size_col].isin(bin_sizes) & matrix_df[overlap_col].isin(bin_overlap)
            is_algo = matrix_df[type_col] == algo
            matrix_df.loc[in_bin & is_algo, type_col] = algo + '+' + label

    # Categorize by templates
    add_suffix(matrix_df[drain_col] == '0.77', "_more_templates")

    # Categorize by timestamps
    add_suffix(matrix_df[dates_col] == 'False', '_with_ts')
    add_suffix(matrix_df[dates_col] == 'True', '_without_ts')

    return matrix_df

# Results - Comparison

## Drain algorithm change

In [None]:
def read_knowledge_dataframes(knowledge_type, dataset='huawei', size='2k'):
    knowledge_path = Path(f'/home/i40/almasrirz/Domain-Guided-Monitoring/data/{size}/{dataset}/knowledge')
    pathlist = knowledge_path.glob('**/*.csv')
    dataframes = {}
    for path in pathlist:
        knowledge_name = path.name.split('_knowledge')[0]
        dataframes[knowledge_name] = pd.read_csv(path)
    return dataframes

In [None]:
def fetch_statistics(knowledge_type, drain_settings, ts=None, dataset='huawei', size='2k'):
    """Build a table of graph statistics for every stored knowledge DataFrame.

    For each knowledge CSV returned by ``read_knowledge_dataframes`` the
    matching ``x_vocab*.json`` metadata file is loaded, the knowledge object is
    rebuilt and its vocabulary sizes and edge count are recorded.

    Args:
        knowledge_type: Forwarded to ``read_knowledge_dataframes``.
        drain_settings: Drain threshold strings (e.g. ``['0.7', '0.75']``)
            that may occur in a knowledge name.
        ts: Unused; kept for backward compatibility.
        dataset: Dataset directory name.
        size: Size directory name.

    Returns:
        DataFrame with one row per knowledge name and the columns ``Model``,
        ``V_G`` (size of the extended vocabulary), ``V_in`` (size of the
        vocabulary), ``V_hidden`` (their difference) and ``E_G`` (number of
        distinct ``(index, connection)`` pairs).
    """
    vocab_prefix = f'/home/i40/almasrirz/Domain-Guided-Monitoring/data/{size}/{dataset}/x_vocab'
    statistics_records = []
    dataframes = read_knowledge_dataframes(knowledge_type, dataset=dataset, size=size)
    for name, df in dataframes.items():
        metadata_path = vocab_prefix

        # Use the LONGEST drain setting contained in the name. Taking the first
        # substring hit would map e.g. '..._0.75_...' to '0.7' and load the
        # vocabulary of a different drain run.
        matching_settings = [setting for setting in drain_settings if setting in name]
        if matching_settings:
            metadata_path += f'_{max(matching_settings, key=len)}'
        if 'with_ts' in name:
            metadata_path += '_with_ts'
        if 'without_ts' in name:
            metadata_path += '_without_ts'

        with open(metadata_path + '.json', 'r') as file:
            metadata = json.load(file)

        # The knowledge class is chosen from the prefix of the name; anything
        # that is neither causal nor hierarchy is treated as description knowledge.
        if name.startswith('causal'):
            k = knowledge.CausalityKnowledge(knowledge.KnowledgeConfig())
            k.build_causality_from_df(df, metadata)
        elif name.startswith('hierarchy'):
            k = knowledge.HierarchyKnowledge(knowledge.KnowledgeConfig())
            k.build_hierarchy_from_df(df, metadata)
        else:
            k = knowledge.DescriptionKnowledge(knowledge.KnowledgeConfig())
            k.build_knowledge_from_df(df, metadata)

        v_in = len(k.vocab)
        v_g = len(k.extended_vocab)
        v_hidden = v_g - v_in

        # A set de-duplicates repeated connections of the same index.
        edges = set()
        for i in range(len(k.vocab)):
            for connection in k.get_connections_for_idx(i):
                edges.add((i, connection))

        statistics_records.append({
            'Model': name,
            'V_G': v_g,
            'V_in': v_in,
            'V_hidden': v_hidden,
            'E_G': len(edges)
        })

    return pd.DataFrame.from_records(statistics_records)

In [None]:
def classify_sp_df(df, drain_settings, dataset='Huawei'):
    """Tag each run's model type with its fine drain threshold.

    For every value in ``drain_settings``, rows whose
    ``data_params_<dataset>PreprocessorConfigfine_drain_log_st`` equals that
    value get ``_<value>`` appended to ``data_tags_model_type``.

    The suffix is only appended when it is not already present, so calling the
    function on its own output (e.g. re-running a notebook cell of the form
    ``df = classify_sp_df(df, ...)``) does not produce ``simple_0.4_0.4``.

    Args:
        df: Run DataFrame; it is copied, never modified.
        drain_settings: Iterable of threshold strings to tag.
        dataset: Dataset prefix used in the preprocessor column name.

    Returns:
        A new DataFrame with the refined ``data_tags_model_type`` column.
    """
    drain_log_st = f"data_params_{dataset}PreprocessorConfigfine_drain_log_st"
    model_col = 'data_tags_model_type'
    ret_df = df.copy()
    for drain in drain_settings:
        suffix = f'_{drain}'
        rows = ret_df[drain_log_st] == drain
        # `suffix` is bound as a default argument so the lambda does not depend
        # on the loop variable.
        ret_df.loc[rows, model_col] = ret_df.loc[rows, model_col].apply(
            lambda x, suffix=suffix: x if x.endswith(suffix) else x + suffix
        )
    return ret_df

## 2k

### Huawei

In [None]:
check = ['0.4', '0.5', '0.65', '0.7', '0.75', '0.77', '0.8']
huawei_df = mlflow_helper.run_df[
    (mlflow_helper.run_df['data_tags_sequence_type'] == 'huawei_logs')
    & (mlflow_helper.run_df['data_tags_model_type'].isin(['simple', 'causal_Fast-IAMB-jt']))
    & (mlflow_helper.run_df['data_params_HuaweiPreprocessorConfigaggregated_log_file'] == 'data/2k/huawei/huawei_logs.csv')
    & (mlflow_helper.run_df["info_status"] == "FINISHED")
    & (mlflow_helper.run_df["data_params_HuaweiPreprocessorConfigfine_drain_log_st"].astype(str).fillna("").isin(check))
    & (mlflow_helper.run_df['data_params_ExperimentConfigonly_generate_knowledge'] == 'False')
]
huawei_df = mlflow_helper.load_best_metrics_for_ids(set(huawei_df["info_run_id"]))
huawei_with_ts_df = huawei_df[huawei_df['data_params_HuaweiPreprocessorConfigremove_dates_from_payload'] == 'False']
huawei_without_ts_df = huawei_df[huawei_df['data_params_HuaweiPreprocessorConfigremove_dates_from_payload'] == 'True']

In [None]:
huawei_df = classify_sp_df(huawei_df, check)
huawei_with_ts_df = huawei_df[huawei_df['data_params_HuaweiPreprocessorConfigremove_dates_from_payload'] == 'False']
huawei_without_ts_df = huawei_df[huawei_df['data_params_HuaweiPreprocessorConfigremove_dates_from_payload'] == 'True']

In [None]:
huawei_with_ts_df.groupby(['data_tags_model_type']).size()

In [None]:
huawei_without_ts_df.groupby(['data_tags_model_type']).size()

In [None]:
stats_df = fetch_statistics('causal_Fast-IAMB-jt', check)
# Separate into with_ts and without_ts. 'without_ts' does not contain the
# literal substring 'with_ts', so the two selections are disjoint.
# .copy() gives each subset its own data: the following cells assign to the
# 'Model' column, which would otherwise trigger SettingWithCopyWarning.
huawei_stats_with_ts_df = stats_df[stats_df['Model'].str.contains('with_ts', regex=False)].copy()
huawei_stats_without_ts_df = stats_df[stats_df['Model'].str.contains('without_ts', regex=False)].copy()

In [None]:
# Remove modifiers
huawei_stats_with_ts_df['Model'] = huawei_stats_with_ts_df['Model'].apply(lambda x: x.replace('_with_ts', '').replace('_st', ''))
huawei_stats_with_ts_df

In [None]:
# Add simple
to_append = []
for x in huawei_stats_with_ts_df['Model'].tolist():
    s = x.split('_')[2]
    name = f'simple_{s}'
    v_in = huawei_stats_with_ts_df[huawei_stats_with_ts_df['Model'] == x]['V_in'].tolist()[0]
    to_append.append({
        'Model': name,
        'V_G': v_in,
        'V_in': v_in,
        'V_hidden': 0,
        'E_G': 0
    })
huawei_stats_with_ts_df = pd.concat([huawei_stats_with_ts_df, pd.DataFrame.from_records(to_append)]).reset_index(drop=True)
huawei_stats_with_ts_df

In [None]:
huawei_ts_final_df = pd.merge(huawei_with_ts_df, huawei_stats_with_ts_df, how='left', left_on='data_tags_model_type', right_on='Model')

In [None]:
# Remove modifiers
huawei_stats_without_ts_df['Model'] = huawei_stats_without_ts_df['Model'].apply(lambda x: x.replace('_without_ts', '').replace('_st', ''))
huawei_stats_without_ts_df

In [None]:
# Add simple
to_append = []
for x in huawei_stats_without_ts_df['Model'].tolist():
    s = x.split('_')[2]
    name = f'simple_{s}'
    v_in = huawei_stats_without_ts_df[huawei_stats_without_ts_df['Model'] == x]['V_in'].tolist()[0]
    to_append.append({
        'Model': name,
        'V_G': v_in,
        'V_in': v_in,
        'V_hidden': 0,
        'E_G': 0
    })
huawei_stats_without_ts_df = pd.concat([huawei_stats_without_ts_df, pd.DataFrame.from_records(to_append)]).reset_index(drop=True)
huawei_stats_without_ts_df

In [None]:
huawei_no_ts_final_df = pd.merge(huawei_without_ts_df, huawei_stats_without_ts_df, how='left', left_on='data_tags_model_type', right_on='Model')

In [None]:
huawei_ts_final_df.groupby(['data_tags_model_type']).size()

In [None]:
huawei_no_ts_final_df.groupby(['data_tags_model_type']).size()

### Huawei With Timestamps

In [None]:
test_df = huawei_ts_final_df.copy()
test_df['algorithm_type'] = test_df['data_tags_model_type'].apply(lambda x: x.split('_')[0])

In [None]:
test_order = sorted(list(set(test_df['data_params_HuaweiPreprocessorConfigfine_drain_log_st'])))
test_order

In [None]:
colors = {'causal': 'C0', 'simple': 'C1'}
hue_order = ['causal', 'simple']

In [None]:
test_df['data_params_HuaweiPreprocessorConfigaggregated_log_file']

In [None]:
# Remove a single outlier: among the 'simple' runs with drain threshold '0.7',
# find the index label of the run with the lowest best validation top-5
# accuracy and drop that row.
# NOTE(review): the drop is in place, so re-running this cell without first
# re-creating test_df removes the next-lowest run as well.
idx = test_df[(test_df['data_tags_model_type'].str.startswith('simple')) & (test_df['data_params_HuaweiPreprocessorConfigfine_drain_log_st'] == '0.7')]['val_top_5_categorical_accuracy_history_best'].idxmin()
test_df.drop(idx, inplace=True)

In [None]:
# Accuracy per drain threshold (with timestamps), split by algorithm family.
plt.figure(figsize=(15,8))
g = sns.boxplot(data=test_df,
                x='data_params_HuaweiPreprocessorConfigfine_drain_log_st',
                y='val_top_5_categorical_accuracy_history_best',
                hue='algorithm_type',
                order=test_order,
                palette=colors,
                hue_order=hue_order,)
g.set_ylabel('Top 5 Categorical Accuracy')
g.set_xlabel('Granularity Levels')
# Number the thresholds by their position in test_order. Deriving the labels
# from test_order keeps them in sync when the number of thresholds changes
# (a hardcoded 7-element list fails for any other count).
g.set_xticklabels([str(level) for level in range(1, len(test_order) + 1)])
g.legend(title='Domain Knowledge Type')
sns.move_legend(g, 'upper left', bbox_to_anchor=(1, 1))
g

In [None]:
test_df = huawei_no_ts_final_df.copy()
test_df['algorithm_type'] = test_df['data_tags_model_type'].apply(lambda x: x.split('_')[0])

In [None]:
test_order = sorted(list(set(test_df['data_params_HuaweiPreprocessorConfigfine_drain_log_st'])))
test_order

In [None]:
# Accuracy per drain threshold (without timestamps), split by algorithm family.
plt.figure(figsize=(15,8))
g = sns.boxplot(data=test_df,
                x='data_params_HuaweiPreprocessorConfigfine_drain_log_st',
                y='val_top_5_categorical_accuracy_history_best',
                hue='algorithm_type',
                order=test_order,
                palette=colors,
                hue_order=hue_order)
# Same axis labelling as the with-timestamps box plot; without it the y axis
# shows the raw column name.
g.set_ylabel('Top 5 Categorical Accuracy')
g.set_xlabel('Granularity Levels')
# Number the thresholds by their position in test_order instead of assuming
# there are exactly seven of them.
g.set_xticklabels([str(level) for level in range(1, len(test_order) + 1)])
g.legend(title='Domain Knowledge Type')
sns.move_legend(g, 'upper left', bbox_to_anchor=(1, 1))
g

#### Full data

In [None]:
# Plot of all data points => No clear trend to see when looking at total number
# of nodes
g = sns.relplot(data=huawei_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_G', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_G vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Plot of all data points => No clear trend to see when looking at input nodes
g = sns.relplot(data=huawei_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Plot of all data points => No clear trend to see when looking at hidden nodes
g = sns.relplot(data=huawei_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_hidden', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Full data

In [None]:
# Plot of all data points => No clear trend to see when looking at total number
# of nodes
g = sns.relplot(data=huawei_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_G', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_G vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Plot of all data points => No clear trend to see when looking at input nodes
g = sns.relplot(data=huawei_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Plot of all data points => No clear trend to see when looking at hidden nodes
g = sns.relplot(data=huawei_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_hidden', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Simple vs Causal regardless of Drain

In [None]:
simple_causal_df = huawei_ts_final_df.copy()
simple_causal_df['algorithm_type'] = simple_causal_df['data_tags_model_type'].apply(lambda x: x.split('_')[0])
g = sns.relplot(data=simple_causal_df, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='algorithm_type', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Hidden-node count vs accuracy by algorithm family. simple_causal_df is built
# from huawei_ts_final_df, so the title must name Huawei with timestamps
# (it previously said HDFS).
g = sns.relplot(data=simple_causal_df, x='val_top_5_categorical_accuracy_history_best', y='V_hidden', hue='algorithm_type', height=6, aspect=2)
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Simple vs other simples 

In [None]:
# Note that for simple V_G = V_in
simple_huawei = huawei_ts_final_df[huawei_ts_final_df['data_tags_model_type'].str.startswith('simple')]
g = sns.relplot(data=simple_huawei, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_in (V_G) vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Causal vs other causal

In [None]:
causal_huawei = huawei_ts_final_df[huawei_ts_final_df['data_tags_model_type'].str.startswith('causal')]
g = sns.relplot(data=causal_huawei, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Hidden-node count vs accuracy for the causal runs only. causal_huawei is
# taken from huawei_ts_final_df, so the title must name Huawei with timestamps
# (it previously said HDFS).
g = sns.relplot(data=causal_huawei, x='val_top_5_categorical_accuracy_history_best', y='V_hidden', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Compare on basis of drain settings

In [None]:
g = sns.relplot(data=huawei_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_G', hue='data_params_HuaweiPreprocessorConfigfine_drain_log_st', height=6, aspect=2)
g.fig.suptitle('V_G vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
g = sns.relplot(data=huawei_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_params_HuaweiPreprocessorConfigfine_drain_log_st', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

### Huawei Without Timestamps

#### Full data

In [None]:
# Plot of all data points => No clear trend to see when looking at total number
# of nodes
g = sns.relplot(data=huawei_no_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_G', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_G vs Val Top 5 Accuracy For Huawei without timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Plot of all data points => No clear trend to see when looking at input nodes
g = sns.relplot(data=huawei_no_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei without timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Plot of all data points => No clear trend to see when looking at hidden nodes
g = sns.relplot(data=huawei_no_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_hidden', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For Huawei without timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Simple vs Causal regardless of Drain

In [None]:
simple_causal_df = huawei_no_ts_final_df.copy()
simple_causal_df['algorithm_type'] = simple_causal_df['data_tags_model_type'].apply(lambda x: x.split('_')[0])
g = sns.relplot(data=simple_causal_df, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='algorithm_type', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei without timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
g = sns.relplot(data=simple_causal_df, x='val_top_5_categorical_accuracy_history_best', y='V_hidden', hue='algorithm_type', height=6, aspect=2)
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For Huawei without timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Simple vs other simples 

In [None]:
# Note that for simple V_G = V_in
simple_huawei = huawei_no_ts_final_df[huawei_no_ts_final_df['data_tags_model_type'].str.startswith('simple')]
g = sns.relplot(data=simple_huawei, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_in (V_G) vs Val Top 5 Accuracy For Huawei without timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Causal vs other causal

In [None]:
causal_huawei = huawei_no_ts_final_df[huawei_no_ts_final_df['data_tags_model_type'].str.startswith('causal')]
g = sns.relplot(data=causal_huawei, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei without timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
g = sns.relplot(data=causal_huawei, x='val_top_5_categorical_accuracy_history_best', y='V_hidden', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For Huawei without timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Compare on basis of drain settings

In [None]:
g = sns.relplot(data=huawei_no_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_G', hue='data_params_HuaweiPreprocessorConfigfine_drain_log_st', height=6, aspect=2)
g.fig.suptitle('V_G vs Val Top 5 Accuracy For Huawei without timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
g = sns.relplot(data=huawei_no_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_params_HuaweiPreprocessorConfigfine_drain_log_st', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei without timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

### BGL

In [None]:
bgl_df = mlflow_helper.run_df[
    (mlflow_helper.run_df['data_tags_sequence_type'] == 'bgl_logs')
    & (mlflow_helper.run_df['data_tags_model_type'].isin(['simple', 'causal_Fast-IAMB-jt']))
    & (mlflow_helper.run_df['data_params_BGLPreprocessorConfigaggregated_log_file'] == 'data/2k/bgl/BGL_2k.csv')
    & (mlflow_helper.run_df["info_status"] == "FINISHED")
    & (mlflow_helper.run_df["data_params_BGLPreprocessorConfigfine_drain_log_st"].astype(str).fillna("").isin(check))
    & (mlflow_helper.run_df['data_params_ExperimentConfigonly_generate_knowledge'] == 'False')
]
bgl_df = mlflow_helper.load_best_metrics_for_ids(set(bgl_df["info_run_id"]))

In [None]:
bgl_df = classify_sp_df(bgl_df, check, dataset='BGL')
bgl_df.groupby(['data_tags_model_type']).size()

In [None]:
bgl_stats_df = fetch_statistics('causal_Fast-IAMB-jt', check, dataset='bgl')
# Remove modifiers, as the Huawei and HDFS cells do, so that 'Model' can match
# 'data_tags_model_type' (e.g. 'causal_Fast-IAMB-jt_0.4') in the later merge.
# The replace is a no-op for names without the '_st' modifier.
bgl_stats_df['Model'] = bgl_stats_df['Model'].apply(lambda x: x.replace('_st', ''))
bgl_stats_df

In [None]:
# Add simple
to_append = []
for x in bgl_stats_df['Model'].tolist():
    s = x.split('_')[2]
    name = f'simple_{s}'
    v_in = bgl_stats_df[bgl_stats_df['Model'] == x]['V_in'].tolist()[0]
    to_append.append({
        'Model': name,
        'V_G': v_in,
        'V_in': v_in,
        'V_hidden': 0,
        'E_G': 0
    })
bgl_stats_df = pd.concat([bgl_stats_df, pd.DataFrame.from_records(to_append)]).reset_index(drop=True)
bgl_stats_df

In [None]:
bgl_final_df = pd.merge(bgl_df, bgl_stats_df, how='left', left_on='data_tags_model_type', right_on='Model')

In [None]:
bgl_final_df.groupby(['data_tags_model_type']).size()

### HDFS

In [None]:
hdfs_df = mlflow_helper.run_df[
    (mlflow_helper.run_df['data_tags_sequence_type'] == 'hdfs_logs')
    & (mlflow_helper.run_df['data_tags_model_type'].isin(['simple', 'causal_Fast-IAMB-jt']))
    & (mlflow_helper.run_df['data_params_HDFSPreprocessorConfigaggregated_log_file'] == 'data/2k/hdfs/HDFS_2k.csv')
    & (mlflow_helper.run_df["info_status"] == "FINISHED")
    & (mlflow_helper.run_df["data_params_HDFSPreprocessorConfigfine_drain_log_st"].astype(str).fillna("").isin(check))
    & (mlflow_helper.run_df['data_params_ExperimentConfigonly_generate_knowledge'] == 'False')
]
hdfs_df = mlflow_helper.load_best_metrics_for_ids(set(hdfs_df["info_run_id"]))

In [None]:
hdfs_df = classify_sp_df(hdfs_df, check, dataset='HDFS')
hdfs_df.groupby(['data_tags_model_type']).size()

In [None]:
hdfs_stats_df = fetch_statistics('causal_Fast-IAMB-jt', check, dataset='hdfs')
# Remove modifiers
hdfs_stats_df['Model'] = hdfs_stats_df['Model'].apply(lambda x: x.replace('_st', ''))

In [None]:
# Add simple
to_append = []
for x in hdfs_stats_df['Model'].tolist():
    s = x.split('_')[2]
    name = f'simple_{s}'
    v_in = hdfs_stats_df[hdfs_stats_df['Model'] == x]['V_in'].tolist()[0]
    to_append.append({
        'Model': name,
        'V_G': v_in,
        'V_in': v_in,
        'V_hidden': 0,
        'E_G': 0
    })
hdfs_stats_df = pd.concat([hdfs_stats_df, pd.DataFrame.from_records(to_append)]).reset_index(drop=True)
hdfs_stats_df

In [None]:
hdfs_final_df = pd.merge(hdfs_df, hdfs_stats_df, how='left', left_on='data_tags_model_type', right_on='Model')

In [None]:
hdfs_final_df.groupby(['data_tags_model_type']).size()

#### Full data

In [None]:
# Plot of all data points => No clear trend to see when looking at total number
# of nodes
g = sns.relplot(data=hdfs_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_G', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_G vs Val Top 5 Accuracy For HDFS')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Plot of all data points => No clear trend to see when looking at input nodes
g = sns.relplot(data=hdfs_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For HDFS')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Plot of all data points => No clear trend to see when looking at hidden nodes
g = sns.relplot(data=hdfs_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_hidden', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For HDFS')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Simple vs Causal regardless of Drain

In [None]:
simple_causal_df = hdfs_final_df.copy()
simple_causal_df['algorithm_type'] = simple_causal_df['data_tags_model_type'].apply(lambda x: x.split('_')[0])
g = sns.relplot(data=simple_causal_df, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='algorithm_type', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For HDFS')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
g = sns.relplot(data=simple_causal_df, x='val_top_5_categorical_accuracy_history_best', y='V_hidden', hue='algorithm_type', height=6, aspect=2)
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For HDFS')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Simple vs other simples 

In [None]:
# Note that for simple V_G = V_in
simple_hdfs = hdfs_final_df[hdfs_final_df['data_tags_model_type'].str.startswith('simple')]
g = sns.relplot(data=simple_hdfs, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_in (V_G) vs Val Top 5 Accuracy For HDFS')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Causal vs other causal

In [None]:
causal_hdfs = hdfs_final_df[hdfs_final_df['data_tags_model_type'].str.startswith('causal')]
g = sns.relplot(data=causal_hdfs, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For HDFS')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
causal_hdfs = hdfs_final_df[hdfs_final_df['data_tags_model_type'].str.startswith('causal')]
g = sns.relplot(data=causal_hdfs, x='val_top_5_categorical_accuracy_history_best', y='V_hidden', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For HDFS')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Compare on basis of drain settings

In [None]:
g = sns.relplot(data=hdfs_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_G', hue='data_params_HDFSPreprocessorConfigfine_drain_log_st', height=6, aspect=2)
g.fig.suptitle('V_G vs Val Top 5 Accuracy For HDFS')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
g = sns.relplot(data=hdfs_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_params_HDFSPreprocessorConfigfine_drain_log_st', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For HDFS')
g.set_xlabels('Val Top 5 Accuracy')
g

## 10k

### Huawei

In [None]:
check = ['0.4', '0.5', '0.65', '0.7', '0.75', '0.77', '0.8']
huawei_df = mlflow_helper.run_df[
    (mlflow_helper.run_df['data_tags_sequence_type'] == 'huawei_logs')
    & (mlflow_helper.run_df['data_tags_model_type'].isin(['simple', 'causal_Fast-IAMB-jt']))
    & (mlflow_helper.run_df['data_params_HuaweiPreprocessorConfigaggregated_log_file'] == 'data/10k/huawei/huawei_logs.csv')
    & (mlflow_helper.run_df["info_status"] == "FINISHED")
    & (mlflow_helper.run_df["data_params_HuaweiPreprocessorConfigfine_drain_log_st"].astype(str).fillna("").isin(check))
    & (mlflow_helper.run_df['data_params_ExperimentConfigonly_generate_knowledge'] == 'False')
]
huawei_df = mlflow_helper.load_best_metrics_for_ids(set(huawei_df["info_run_id"]))
huawei_with_ts_df = huawei_df[huawei_df['data_params_HuaweiPreprocessorConfigremove_dates_from_payload'] == 'False']
huawei_without_ts_df = huawei_df[huawei_df['data_params_HuaweiPreprocessorConfigremove_dates_from_payload'] == 'True']

In [None]:
huawei_df = classify_sp_df(huawei_df, check)
huawei_with_ts_df = huawei_df[huawei_df['data_params_HuaweiPreprocessorConfigremove_dates_from_payload'] == 'False']
huawei_without_ts_df = huawei_df[huawei_df['data_params_HuaweiPreprocessorConfigremove_dates_from_payload'] == 'True']

In [None]:
huawei_with_ts_df.groupby(['data_tags_model_type']).size()

In [None]:
huawei_without_ts_df.groupby(['data_tags_model_type']).size()

In [None]:
stats_df = fetch_statistics('causal_Fast-IAMB-jt', check, size='10k')
# Separate into with_ts and without_ts. 'without_ts' does not contain the
# literal substring 'with_ts', so the two selections are disjoint.
# .copy() gives each subset its own data: the following cells assign to the
# 'Model' column, which would otherwise trigger SettingWithCopyWarning.
huawei_stats_with_ts_df = stats_df[stats_df['Model'].str.contains('with_ts', regex=False)].copy()
huawei_stats_without_ts_df = stats_df[stats_df['Model'].str.contains('without_ts', regex=False)].copy()

In [None]:
# Remove modifiers
huawei_stats_with_ts_df['Model'] = huawei_stats_with_ts_df['Model'].apply(lambda x: x.replace('_with_ts', '').replace('_st', ''))
huawei_stats_with_ts_df

In [None]:
# Add simple
to_append = []
for x in huawei_stats_with_ts_df['Model'].tolist():
    s = x.split('_')[2]
    name = f'simple_{s}'
    v_in = huawei_stats_with_ts_df[huawei_stats_with_ts_df['Model'] == x]['V_in'].tolist()[0]
    to_append.append({
        'Model': name,
        'V_G': v_in,
        'V_in': v_in,
        'V_hidden': 0,
        'E_G': 0
    })
huawei_stats_with_ts_df = pd.concat([huawei_stats_with_ts_df, pd.DataFrame.from_records(to_append)]).reset_index(drop=True)
huawei_stats_with_ts_df

In [None]:
huawei_ts_final_df = pd.merge(huawei_with_ts_df, huawei_stats_with_ts_df, how='left', left_on='data_tags_model_type', right_on='Model')

In [None]:
# Remove modifiers
huawei_stats_without_ts_df['Model'] = huawei_stats_without_ts_df['Model'].apply(lambda x: x.replace('_without_ts', '').replace('_st', ''))
huawei_stats_without_ts_df

In [None]:
# Add simple
to_append = []
for x in huawei_stats_without_ts_df['Model'].tolist():
    s = x.split('_')[2]
    name = f'simple_{s}'
    v_in = huawei_stats_without_ts_df[huawei_stats_without_ts_df['Model'] == x]['V_in'].tolist()[0]
    to_append.append({
        'Model': name,
        'V_G': v_in,
        'V_in': v_in,
        'V_hidden': 0,
        'E_G': 0
    })
huawei_stats_without_ts_df = pd.concat([huawei_stats_without_ts_df, pd.DataFrame.from_records(to_append)]).reset_index(drop=True)
huawei_stats_without_ts_df

In [None]:
huawei_no_ts_final_df = pd.merge(huawei_without_ts_df, huawei_stats_without_ts_df, how='left', left_on='data_tags_model_type', right_on='Model')

In [None]:
huawei_ts_final_df.groupby(['data_tags_model_type']).size()

In [None]:
huawei_no_ts_final_df.groupby(['data_tags_model_type']).size()

### Huawei With Timestamps

#### Full data

In [None]:
# Plot of all data points => No clear trend to see when looking at total number
# of nodes
g = sns.relplot(data=huawei_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_G', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_G vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Plot of all data points => No clear trend to see when looking at input nodes
g = sns.relplot(data=huawei_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_in', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Plot of all data points => No clear trend to see when looking at hidden nodes
g = sns.relplot(data=huawei_ts_final_df, x='val_top_5_categorical_accuracy_history_best', y='V_hidden', hue='data_tags_model_type', height=6, aspect=2)
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For Huawei with timestamps')
g.set_xlabels('Val Top 5 Accuracy')
g

#### Simple vs Causal regardless of Drain

In [None]:
# Collapse each model type to its leading token (text before the first "_")
# so the runs can be coloured by algorithm family only.
simple_causal_df = huawei_ts_final_df.assign(
    algorithm_type=huawei_ts_final_df['data_tags_model_type'].map(
        lambda model_type: model_type.split('_')[0]
    )
)
g = sns.relplot(
    data=simple_causal_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='algorithm_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei with timestamps')
g

In [None]:
# Hidden nodes against accuracy, coloured by algorithm family.
g = sns.relplot(
    data=simple_causal_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_hidden',
    hue='algorithm_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For Huawei with timestamps')
g

#### Simple vs other simples 

In [None]:
# Note that for simple V_G = V_in
# NOTE(review): despite its name, `simple_hdfs` holds the Huawei
# with-timestamps runs in this cell (it is filtered from huawei_ts_final_df).
simple_hdfs = huawei_ts_final_df.loc[
    huawei_ts_final_df['data_tags_model_type'].str.startswith('simple')
]
g = sns.relplot(
    data=simple_hdfs,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in (V_G) vs Val Top 5 Accuracy For Huawei with timestamps')
g

#### Causal vs other causal

In [None]:
# Keep only the model types whose name starts with "causal".
causal_huawei = huawei_ts_final_df.loc[
    huawei_ts_final_df['data_tags_model_type'].str.startswith('causal')
]
g = sns.relplot(
    data=causal_huawei,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei with timestamps')
g

In [None]:
# Hidden nodes against accuracy for the causal model types only.
g = sns.relplot(
    data=causal_huawei,
    x='val_top_5_categorical_accuracy_history_best', y='V_hidden',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For Huawei with timestamps')
g

#### Compare on basis of drain settings

In [None]:
# Total nodes against accuracy, coloured by the Drain `fine_drain_log_st` parameter.
g = sns.relplot(
    data=huawei_ts_final_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_G',
    hue='data_params_HuaweiPreprocessorConfigfine_drain_log_st', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_G vs Val Top 5 Accuracy For Huawei with timestamps')
g

In [None]:
# Input nodes against accuracy, coloured by the Drain `fine_drain_log_st` parameter.
g = sns.relplot(
    data=huawei_ts_final_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='data_params_HuaweiPreprocessorConfigfine_drain_log_st', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei with timestamps')
g

### Huawei Without Timestamps

#### Full data

In [None]:
# All data points: no clear trend is visible when looking at the total
# number of nodes (V_G).
g = sns.relplot(
    data=huawei_no_ts_final_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_G',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_G vs Val Top 5 Accuracy For Huawei without timestamps')
g

In [None]:
# All data points: no clear trend is visible when looking at the input nodes (V_in).
g = sns.relplot(
    data=huawei_no_ts_final_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei without timestamps')
g

In [None]:
# All data points: no clear trend is visible when looking at the hidden nodes (V_hidden).
g = sns.relplot(
    data=huawei_no_ts_final_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_hidden',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For Huawei without timestamps')
g

#### Simple vs Causal regardless of Drain

In [None]:
# Collapse each model type to its leading token (text before the first "_")
# so the runs can be coloured by algorithm family only.
simple_causal_df = huawei_no_ts_final_df.assign(
    algorithm_type=huawei_no_ts_final_df['data_tags_model_type'].map(
        lambda model_type: model_type.split('_')[0]
    )
)
g = sns.relplot(
    data=simple_causal_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='algorithm_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei without timestamps')
g

In [None]:
# Hidden nodes against accuracy, coloured by algorithm family.
g = sns.relplot(
    data=simple_causal_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_hidden',
    hue='algorithm_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For Huawei without timestamps')
g

#### Simple vs other simples 

In [None]:
# Note that for simple V_G = V_in
simple_huawei = huawei_no_ts_final_df.loc[
    huawei_no_ts_final_df['data_tags_model_type'].str.startswith('simple')
]
g = sns.relplot(
    data=simple_huawei,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in (V_G) vs Val Top 5 Accuracy For Huawei without timestamps')
g

#### Causal vs other causal

In [None]:
# Keep only the model types whose name starts with "causal".
causal_huawei = huawei_no_ts_final_df.loc[
    huawei_no_ts_final_df['data_tags_model_type'].str.startswith('causal')
]
g = sns.relplot(
    data=causal_huawei,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei without timestamps')
g

In [None]:
# Hidden nodes against accuracy for the causal model types only.
g = sns.relplot(
    data=causal_huawei,
    x='val_top_5_categorical_accuracy_history_best', y='V_hidden',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For Huawei without timestamps')
g

#### Compare on basis of drain settings

In [None]:
# Total nodes against accuracy, coloured by the Drain `fine_drain_log_st` parameter.
g = sns.relplot(
    data=huawei_no_ts_final_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_G',
    hue='data_params_HuaweiPreprocessorConfigfine_drain_log_st', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_G vs Val Top 5 Accuracy For Huawei without timestamps')
g

In [None]:
# Input nodes against accuracy, coloured by the Drain `fine_drain_log_st` parameter.
g = sns.relplot(
    data=huawei_no_ts_final_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='data_params_HuaweiPreprocessorConfigfine_drain_log_st', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in vs Val Top 5 Accuracy For Huawei without timestamps')
g

### BGL

In [None]:
# FINISHED BGL runs on the 10k log file, restricted to the 'simple' and
# 'causal_Fast-IAMB-jt' model types, the Drain thresholds listed in `check`,
# and runs that did more than only generate knowledge.
bgl_df = mlflow_helper.run_df.loc[
    lambda runs: (runs['data_tags_sequence_type'] == 'bgl_logs')
    & (runs['data_tags_model_type'].isin(['simple', 'causal_Fast-IAMB-jt']))
    & (runs['data_params_BGLPreprocessorConfigaggregated_log_file'] == 'data/10k/bgl/BGL_10k.csv')
    & (runs["info_status"] == "FINISHED")
    & (runs["data_params_BGLPreprocessorConfigfine_drain_log_st"].astype(str).fillna("").isin(check))
    & (runs['data_params_ExperimentConfigonly_generate_knowledge'] == 'False')
]
# NOTE(review): the metric-loading step below is disabled, so bgl_df is just
# the filtered slice of run_df.
# bgl_df = mlflow_helper.load_best_metrics_for_ids(set(bgl_df["info_run_id"]))
bgl_df

In [None]:
# Relabel the BGL runs via classify_sp_df (defined elsewhere in this notebook),
# then show the row count per resulting model type.
bgl_df = classify_sp_df(bgl_df, check, dataset='BGL')
bgl_df.groupby(['data_tags_model_type']).size()

In [None]:
# NOTE(review): this call passes (knowledge name, `check`, dataset=...). A
# one-argument `fetch_statistics(knowledge_type)` is defined further down in
# this notebook; once that later cell has run, re-running this cell would
# fail with a TypeError.
bgl_stats_df = fetch_statistics('causal_Fast-IAMB-jt', check, dataset='bgl')
bgl_stats_df

In [None]:
# Add simple
# For every model already in the statistics table, append a matching
# "simple_<token>" row: same number of input nodes, no hidden nodes, no edges.
to_append = []
for model_name in bgl_stats_df['Model'].tolist():
    # <token> is the third underscore-separated piece of the model name.
    token = model_name.split('_')[2]
    # V_in of the first row carrying this model name.
    input_nodes = bgl_stats_df.loc[
        bgl_stats_df['Model'] == model_name, 'V_in'
    ].tolist()[0]
    to_append.append(dict(
        Model=f'simple_{token}',
        V_G=input_nodes,
        V_in=input_nodes,
        V_hidden=0,
        E_G=0,
    ))
bgl_stats_df = pd.concat(
    [bgl_stats_df, pd.DataFrame.from_records(to_append)],
    ignore_index=True,
)
bgl_stats_df

In [None]:
# Attach the graph statistics to every BGL run (left join on the model type).
bgl_final_df = bgl_df.merge(
    bgl_stats_df,
    how='left',
    left_on='data_tags_model_type',
    right_on='Model',
)

In [None]:
# Row count per `data_tags_model_type` after the merge.
bgl_final_df.groupby(['data_tags_model_type']).size()

### HDFS

In [None]:
# FINISHED HDFS runs on the 10k log file, restricted to the 'simple' and
# 'causal_Fast-IAMB-jt' model types, the Drain thresholds listed in `check`,
# and runs that did more than only generate knowledge.
hdfs_df = mlflow_helper.run_df.loc[
    lambda runs: (runs['data_tags_sequence_type'] == 'hdfs_logs')
    & (runs['data_tags_model_type'].isin(['simple', 'causal_Fast-IAMB-jt']))
    & (runs['data_params_HDFSPreprocessorConfigaggregated_log_file'] == 'data/10k/hdfs/HDFS_10k.csv')
    & (runs["info_status"] == "FINISHED")
    & (runs["data_params_HDFSPreprocessorConfigfine_drain_log_st"].astype(str).fillna("").isin(check))
    & (runs['data_params_ExperimentConfigonly_generate_knowledge'] == 'False')
]
# Replace the selection by the best-metric frame for exactly those run ids.
hdfs_df = mlflow_helper.load_best_metrics_for_ids(set(hdfs_df["info_run_id"]))

In [None]:
# Relabel the HDFS runs via classify_sp_df (defined elsewhere in this notebook),
# then show the row count per resulting model type.
hdfs_df = classify_sp_df(hdfs_df, check, dataset='HDFS')
hdfs_df.groupby(['data_tags_model_type']).size()

In [None]:
# NOTE(review): this multi-argument call relies on a `fetch_statistics`
# definition other than the one-argument version defined further down.
hdfs_stats_df = fetch_statistics('causal_Fast-IAMB-jt', check, dataset='hdfs', size='10k')
# Remove modifiers: drop every occurrence of the substring '_st' from the model names.
hdfs_stats_df['Model'] = hdfs_stats_df['Model'].map(
    lambda model_name: model_name.replace('_st', '')
)

In [None]:
# Add simple
# For every model already in the statistics table, append a matching
# "simple_<token>" row: same number of input nodes, no hidden nodes, no edges.
to_append = []
for model_name in hdfs_stats_df['Model'].tolist():
    # <token> is the third underscore-separated piece of the model name.
    token = model_name.split('_')[2]
    # V_in of the first row carrying this model name.
    input_nodes = hdfs_stats_df.loc[
        hdfs_stats_df['Model'] == model_name, 'V_in'
    ].tolist()[0]
    to_append.append(dict(
        Model=f'simple_{token}',
        V_G=input_nodes,
        V_in=input_nodes,
        V_hidden=0,
        E_G=0,
    ))
hdfs_stats_df = pd.concat(
    [hdfs_stats_df, pd.DataFrame.from_records(to_append)],
    ignore_index=True,
)
hdfs_stats_df

In [None]:
# Attach the graph statistics to every HDFS run (left join on the model type).
hdfs_final_df = hdfs_df.merge(
    hdfs_stats_df,
    how='left',
    left_on='data_tags_model_type',
    right_on='Model',
)

In [None]:
# Row count per `data_tags_model_type` after the merge.
hdfs_final_df.groupby(['data_tags_model_type']).size()

#### Full data

In [None]:
# All data points: no clear trend is visible when looking at the total
# number of nodes (V_G).
g = sns.relplot(
    data=hdfs_final_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_G',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_G vs Val Top 5 Accuracy For HDFS')
g

In [None]:
# All data points: no clear trend is visible when looking at the input nodes (V_in).
g = sns.relplot(
    data=hdfs_final_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in vs Val Top 5 Accuracy For HDFS')
g

In [None]:
# All data points: no clear trend is visible when looking at the hidden nodes (V_hidden).
g = sns.relplot(
    data=hdfs_final_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_hidden',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For HDFS')
g

#### Simple vs Causal regardless of Drain

In [None]:
# Collapse each model type to its leading token (text before the first "_")
# so the runs can be coloured by algorithm family only.
simple_causal_df = hdfs_final_df.assign(
    algorithm_type=hdfs_final_df['data_tags_model_type'].map(
        lambda model_type: model_type.split('_')[0]
    )
)
g = sns.relplot(
    data=simple_causal_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='algorithm_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in vs Val Top 5 Accuracy For HDFS')
g

In [None]:
# Hidden nodes against accuracy, coloured by algorithm family.
g = sns.relplot(
    data=simple_causal_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_hidden',
    hue='algorithm_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For HDFS')
g

#### Simple vs other simples 

In [None]:
# Note that for simple V_G = V_in
simple_hdfs = hdfs_final_df.loc[
    hdfs_final_df['data_tags_model_type'].str.startswith('simple')
]
g = sns.relplot(
    data=simple_hdfs,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in (V_G) vs Val Top 5 Accuracy For HDFS')
g

#### Causal vs other causal

In [None]:
# Keep only the model types whose name starts with "causal".
causal_hdfs = hdfs_final_df.loc[
    hdfs_final_df['data_tags_model_type'].str.startswith('causal')
]
g = sns.relplot(
    data=causal_hdfs,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in vs Val Top 5 Accuracy For HDFS')
g

In [None]:
# Same causal-only selection as the previous cell, recomputed here so this
# cell can be run on its own.
causal_hdfs = hdfs_final_df.loc[
    hdfs_final_df['data_tags_model_type'].str.startswith('causal')
]
g = sns.relplot(
    data=causal_hdfs,
    x='val_top_5_categorical_accuracy_history_best', y='V_hidden',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy For HDFS')
g

#### Compare on basis of drain settings

In [None]:
# Total nodes against accuracy, coloured by the Drain `fine_drain_log_st` parameter.
g = sns.relplot(
    data=hdfs_final_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_G',
    hue='data_params_HDFSPreprocessorConfigfine_drain_log_st', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_G vs Val Top 5 Accuracy For HDFS')
g

In [None]:
# Input nodes against accuracy, coloured by the Drain `fine_drain_log_st` parameter.
g = sns.relplot(
    data=hdfs_final_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='data_params_HDFSPreprocessorConfigfine_drain_log_st', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in vs Val Top 5 Accuracy For HDFS')
g

# Results - Performance

## Huawei

In [None]:
experimented_algos = ["causal_Fast-IAMB-jt", "causal_Fast-IAMB-smc-cor", "causal_MMPC-cor"]
# Select the FINISHED Huawei runs of the experimented algorithms (plus the
# 'simple' baseline) that share one training configuration. Each condition is
# listed once; the original repeated the max_data_size and aggregated_log_file
# clauses verbatim.
huawei_df = mlflow_helper.run_df.loc[
    lambda runs: (runs['data_tags_sequence_type'] == 'huawei_logs')
    & (runs['data_params_ExperimentConfigmax_data_size'] == '-1')
    & (runs['data_tags_model_type'].isin(experimented_algos + ['simple']))
    & (runs['data_params_HuaweiPreprocessorConfigaggregated_log_file'] == 'data/logs_aggregated_concurrent_original.csv')
    & (runs["data_params_SequenceConfigtest_percentage"].fillna("").astype(str) == "0.1")
    & (runs["data_params_ModelConfigbest_model_metric"] == "val_loss")
    & (runs["info_status"] == "FINISHED")
    & (runs["data_params_ModelConfigrnn_dim"] == "200")
    & (runs["data_params_ModelConfigoptimizer"].fillna("adam") == "adam")
    & (runs["data_params_ModelConfigdropout_rate"].fillna("0.0").astype(str) == "0.5")
    & (runs["data_params_ModelConfigrnn_dropout"].fillna("0.0").astype(str) == "0.0")
    & (runs["data_params_ModelConfigkernel_regularizer_scope"].fillna("[]") == "[]")
    & (runs["data_params_ExperimentConfigbatch_size"].astype(str).fillna("") == "128")
    # A missing alpha (NaN) is accepted alongside '0.05'.
    & (runs['data_params_HuaweiPreprocessorConfigcausal_algorithm_alpha'].isin([np.nan, '0.05']))
    & (runs["data_params_HuaweiPreprocessorConfigfine_drain_log_st"].astype(str).fillna("").isin(['0.77', '0.75']))
    & (runs["data_params_HuaweiPreprocessorConfigfine_drain_log_depth"].astype(str).fillna("") == "10")
    & (runs['data_params_ExperimentConfigonly_generate_knowledge'] == 'False')
]
# Swap the selection for the best-metric frame of those run ids, then relabel
# the model types with categorize_run_df (defined near the top of the notebook).
huawei_df = mlflow_helper.load_best_metrics_for_ids(set(huawei_df["info_run_id"]))
huawei_df = categorize_run_df(huawei_df)

In [None]:
# Number of runs per model type, most frequent first.
count_attributes(huawei_df)

In [None]:
pd.set_option('display.max_colwidth', 100)
# Sort by median top 5 categorical accuracy (best model type first).
sorted_top_5_accuracy_median_df = (
    huawei_df
    .groupby(['data_tags_model_type'])['val_top_5_categorical_accuracy_history_best']
    .median()
    .sort_values(ascending=False)
    .reset_index()
)
sorted_top_5_accuracy_median_df

In [None]:
# Sort by mean top 5 categorical accuracy (best model type first).
sorted_top_5_accuracy_mean_df = (
    huawei_df
    .groupby(['data_tags_model_type'])['val_top_5_categorical_accuracy_history_best']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
sorted_top_5_accuracy_mean_df

In [None]:
k = 3


def _top_k_models(sorted_df, suffix, baseline, k):
    """Return the first `k` model types of `sorted_df` whose name ends with `suffix`.

    `sorted_df` is expected to list `data_tags_model_type` best-first.
    `baseline` is appended when it is not already among the selected names,
    so the simple model is always present for comparison.
    """
    ranked = [
        model_type
        for model_type in sorted_df['data_tags_model_type'].tolist()
        if model_type.endswith(suffix)
    ]
    selected = ranked[:k]
    if baseline not in selected:
        selected.append(baseline)
    return selected


# Without timestamps
# The ranking is restricted to '*without_ts' model types, mirroring the
# with-timestamps branch below; previously the unfiltered overall top-k was
# taken, so with-timestamps models could end up in the "without" lists.
top_k_mean_no_ts = _top_k_models(sorted_top_5_accuracy_mean_df, 'without_ts', 'simple_without_ts', k)
top_k_median_no_ts = _top_k_models(sorted_top_5_accuracy_median_df, 'without_ts', 'simple_without_ts', k)

# With timestamps
# Names ending in 'without_ts' do not end in 'with_ts', so the suffix test
# cleanly separates the two groups.
top_k_mean_ts = _top_k_models(sorted_top_5_accuracy_mean_df, 'with_ts', 'simple_with_ts', k)
top_k_median_ts = _top_k_models(sorted_top_5_accuracy_median_df, 'with_ts', 'simple_with_ts', k)

### With Timestamps

In [None]:
# Box plot of the top-5 accuracy for the best with-timestamps model types
# (ranked by mean), in ranking order.
plot_best_metric_bar(
    huawei_df,
    ['val_top_5_categorical_accuracy_history_best'],
    x_order=top_k_mean_ts,
    height=8,
    aspect=2,
)

### Without Timestamps

In [None]:
# Box plot of the top-5 accuracy for the model types listed in
# top_k_mean_no_ts, in that order.
plot_best_metric_bar(
    huawei_df,
    ['val_top_5_categorical_accuracy_history_best'],
    x_order=top_k_mean_no_ts,
    height=8,
    aspect=2,
)

## BGL

In [None]:
# BGL runs on the 2k log file, excluding runs that only generated knowledge.
bgl_df = mlflow_helper.run_df[
    (mlflow_helper.run_df['data_tags_sequence_type'] == "bgl_logs")
    & (mlflow_helper.run_df['data_params_BGLPreprocessorConfigaggregated_log_file'] == 'data/BGL_2k.csv')
    & (mlflow_helper.run_df['data_params_ExperimentConfigonly_generate_knowledge'] == 'False')
]
# Load the metrics BEFORE labelling. Previously the labels were written to the
# filtered frame and then lost, because bgl_df was rebound to the frame that
# load_best_metrics_for_ids builds from the run ids alone. The HDFS cell uses
# this same load-then-label order.
bgl_df = mlflow_helper.load_best_metrics_for_ids(set(bgl_df['info_run_id']))
# Categorize by templates
# NOTE(review): the threshold is read from the *Huawei* preprocessor column
# even though these are BGL runs; confirm this is the column the 2k runs used.
for _drain_st, _suffix in (('0.77', "_more_templates"), ('0.73', "_less_templates")):
    _selected = bgl_df['data_params_HuaweiPreprocessorConfigfine_drain_log_st'] == _drain_st
    bgl_df.loc[_selected, 'data_tags_model_type'] = (
        bgl_df.loc[_selected, 'data_tags_model_type'] + _suffix
    )

In [None]:
# Number of BGL runs per model type, most frequent first.
count_attributes(bgl_df)

In [None]:
# Sorted so the x-axis order is the same on every run; iterating a set of
# strings can yield a different order in each kernel session.
model_types = sorted(bgl_df['data_tags_model_type'].dropna().unique())
plot_best_metric_bar(
    bgl_df,
    ['val_top_5_categorical_accuracy_history_best'],
    x_order=model_types,
    height=6,
    aspect=2,
)

## HDFS

In [None]:
# HDFS runs on the 2k log file, excluding runs that only generated knowledge.
hdfs_df = mlflow_helper.run_df.loc[
    lambda runs: (runs['data_tags_sequence_type'] == "hdfs_logs")
    & (runs['data_params_HDFSPreprocessorConfigaggregated_log_file'] == 'data/HDFS_2k.csv')
    & (runs['data_params_ExperimentConfigonly_generate_knowledge'] == 'False')
]
hdfs_df = mlflow_helper.load_best_metrics_for_ids(set(hdfs_df['info_run_id']))
# Categorize by templates
# NOTE(review): the threshold is read from the *Huawei* preprocessor column
# even though these are HDFS runs; confirm this is intended.
for drain_st, suffix in (('0.77', "_more_templates"), ('0.73', "_less_templates")):
    selected = hdfs_df['data_params_HuaweiPreprocessorConfigfine_drain_log_st'] == drain_st
    hdfs_df.loc[selected, 'data_tags_model_type'] = hdfs_df.loc[
        selected, 'data_tags_model_type'
    ].apply(lambda label, suffix=suffix: label + suffix)

In [None]:
# Number of HDFS runs per model type, most frequent first.
count_attributes(hdfs_df)

In [None]:
# Sorted so the x-axis order is the same on every run; iterating a set of
# strings can yield a different order in each kernel session.
model_types = sorted(hdfs_df['data_tags_model_type'].dropna().unique())
plot_best_metric_bar(
    hdfs_df,
    ['val_top_5_categorical_accuracy_history_best'],
    x_order=model_types,
    height=6,
    aspect=2,
)

# Results - Scatter Plots

In [None]:
def read_knowledge_dataframes(knowledge_type, data_dir='/home/i40/almasrirz/Domain-Guided-Monitoring/data'):
    """Load every knowledge CSV stored for one dataset.

    Args:
        knowledge_type: Dataset suffix; files are searched recursively under
            ``<data_dir>/knowledge_<knowledge_type>``.
        data_dir: Directory containing the ``knowledge_*`` folders. Defaults
            to the location this notebook was originally run against, so
            existing one-argument calls behave as before.

    Returns:
        dict mapping the knowledge name (the file name up to the first
        ``'_knowledge'``) to the DataFrame read from that CSV. An empty dict
        is returned when no CSV is found. If two files map to the same name,
        the one that sorts last wins.
    """
    knowledge_path = Path(data_dir) / f'knowledge_{knowledge_type}'
    dataframes = {}
    # Sorted so that both the dict order and the duplicate-name resolution
    # are independent of the file system's listing order.
    for path in sorted(knowledge_path.glob('**/*.csv')):
        knowledge_name = path.name.split('_knowledge')[0]
        dataframes[knowledge_name] = pd.read_csv(path)
    return dataframes

In [None]:
def fetch_statistics(knowledge_type):
    """Build a table of graph sizes for every knowledge file of a dataset.

    For each DataFrame returned by `read_knowledge_dataframes(knowledge_type)`
    the matching knowledge object is built and one record is produced with:
    `V_in` (size of `vocab`), `V_G` (size of `extended_vocab`), `V_hidden`
    (their difference) and `E_G` (number of distinct (index, connection)
    pairs over all vocab indices).

    NOTE(review): cells earlier in this notebook call `fetch_statistics` with
    extra arguments (e.g. `dataset=`), which this signature does not accept.

    Returns:
        pd.DataFrame with columns Model, V_G, V_in, V_hidden, E_G.
    """
    knowledge_dir = f'/home/i40/almasrirz/Domain-Guided-Monitoring/data/knowledge_{knowledge_type}'
    statistics_records = []
    for name, df in read_knowledge_dataframes(knowledge_type).items():
        # The vocabulary file name carries the same modifiers as the
        # knowledge name, always in this fixed order.
        vocab_file = knowledge_dir + '/x_vocab'
        for modifier in ('with_ts', 'without_ts', 'more_templates'):
            if modifier in name:
                vocab_file += '_' + modifier

        with open(vocab_file + '.json', 'r') as file:
            metadata = json.load(file)

        # The knowledge class is chosen from the name prefix; anything that is
        # neither causal nor hierarchy is treated as description knowledge.
        config = knowledge.KnowledgeConfig()
        if name.startswith('causal'):
            k = knowledge.CausalityKnowledge(config)
            k.build_causality_from_df(df, metadata)
        elif name.startswith('hierarchy'):
            k = knowledge.HierarchyKnowledge(config)
            k.build_hierarchy_from_df(df, metadata)
        else:
            k = knowledge.DescriptionKnowledge(config)
            k.build_knowledge_from_df(df, metadata)

        v_in = len(k.vocab)
        v_g = len(k.extended_vocab)
        edges = {
            (idx, connection)
            for idx in range(len(k.vocab))
            for connection in k.get_connections_for_idx(idx)
        }
        statistics_records.append({
            'Model': name,
            'V_G': v_g,
            'V_in': v_in,
            'V_hidden': v_g - v_in,
            'E_G': len(edges)
        })
    return pd.DataFrame.from_records(statistics_records)

## BGL

In [None]:
# Graph statistics for BGL, without the models whose name ends in 'label'.
bgl_stats_df = fetch_statistics('bgl').loc[
    lambda stats: ~(stats['Model'].str.endswith('label'))
]
bgl_stats_df

In [None]:
# Attach the graph statistics to every BGL run (left join on the model type),
# then list the distinct model types present.
bgl_sp_df = bgl_df.merge(
    bgl_stats_df,
    how='left',
    left_on='data_tags_model_type',
    right_on='Model',
)
set(bgl_sp_df['data_tags_model_type'])

In [None]:
# Total nodes (V_G) against accuracy for the BGL runs, one colour per model type.
g = sns.relplot(
    data=bgl_sp_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_G',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_G vs Val Top 5 Accuracy')
g

In [None]:
# Edge count (E_G) against accuracy for the BGL runs, one colour per model type.
g = sns.relplot(
    data=bgl_sp_df,
    x='val_top_5_categorical_accuracy_history_best', y='E_G',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('E_G vs Val Top 5 Accuracy')
g

## HDFS

In [None]:
# Graph statistics (V_G, V_in, V_hidden, E_G) for every HDFS knowledge file.
hdfs_stats_df = fetch_statistics('hdfs')
hdfs_stats_df

In [None]:
# Attach the graph statistics to every HDFS run (left join on the model type),
# then list the distinct model types present.
hdfs_sp_df = hdfs_df.merge(
    hdfs_stats_df,
    how='left',
    left_on='data_tags_model_type',
    right_on='Model',
)
set(hdfs_sp_df['data_tags_model_type'])

In [None]:
# Total nodes (V_G) against accuracy for the HDFS runs, one colour per model type.
g = sns.relplot(
    data=hdfs_sp_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_G',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_G vs Val Top 5 Accuracy')
g

In [None]:
# Edge count (E_G) against accuracy for the HDFS runs, one colour per model type.
g = sns.relplot(
    data=hdfs_sp_df,
    x='val_top_5_categorical_accuracy_history_best', y='E_G',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('E_G vs Val Top 5 Accuracy')
g

## Huawei

In [None]:
# Knowledge names (as they appear in the statistics table's 'Model' column)
# that are kept for the with-timestamps comparison.
knowledge_types_with_ts = ['simple_with_ts',
                           'causal_heuristic_with_ts',
                           'text_with_ts',
                           'hierarchy_with_ts',
                           'causal_Fast-IAMB-jt_with_ts_small_small',
                           'causal_Fast-IAMB-jt_with_ts_more_templates_small_small']
# Same for the without-timestamps comparison; this list additionally includes
# the 'more_templates' variants of text and hierarchy.
knowledge_types_without_ts = ['simple_without_ts',
                           'causal_heuristic_without_ts',
                           'text_without_ts',
                           'text_without_ts_more_templates',
                           'hierarchy_without_ts',
                           'hierarchy_without_ts_more_templates',
                           'causal_Fast-IAMB-jt_without_ts_small_small',
                           'causal_Fast-IAMB-jt_without_ts_more_templates_small_small']

In [None]:
experimented_algos = ["causal_Fast-IAMB-jt", "simple", "hierarchy", "text", "causal_heuristic"]
# Select the FINISHED Huawei runs of the experimented algorithms on the full
# data set. Each condition is listed once; the original repeated the
# max_data_size clause verbatim.
huawei_df = mlflow_helper.run_df.loc[
    lambda runs: (runs['data_tags_sequence_type'] == 'huawei_logs')
    & (runs['data_params_ExperimentConfigmax_data_size'] == '-1')
    & (runs['data_tags_model_type'].isin(experimented_algos))
    & (runs["info_status"] == "FINISHED")
    & (runs['data_params_HuaweiPreprocessorConfigaggregated_log_file'] == 'data/logs_aggregated_concurrent_original.csv')
    & (runs['data_params_HuaweiPreprocessorConfigmin_causality'] == '0.01')
    # A missing alpha (NaN) is accepted alongside '0.05'.
    & (runs['data_params_HuaweiPreprocessorConfigcausal_algorithm_alpha'].isin([np.nan, '0.05']))
    & (runs["data_params_HuaweiPreprocessorConfigfine_drain_log_st"].astype(str).fillna("").isin(['0.77', '0.75']))
    & (runs["data_params_HuaweiPreprocessorConfigfine_drain_log_depth"].astype(str).fillna("") == "10")
    & (runs['data_params_ExperimentConfigonly_generate_knowledge'] == 'False')
]
# Swap the selection for the best-metric frame of those run ids, then relabel
# the model types with categorize_run_df (defined near the top of the notebook).
huawei_df = mlflow_helper.load_best_metrics_for_ids(set(huawei_df["info_run_id"]))
huawei_df = categorize_run_df(huawei_df)

In [None]:
# Graph statistics (V_G, V_in, V_hidden, E_G) for every Huawei knowledge file.
huawei_stats_df = fetch_statistics('huawei')

In [None]:
# Statistics rows for the selected with-timestamps knowledge types.
huawei_stats_with_ts_df = huawei_stats_df.loc[
    huawei_stats_df['Model'].isin(knowledge_types_with_ts)
]
huawei_stats_with_ts_df

In [None]:
# Statistics rows for the selected without-timestamps knowledge types.
huawei_stats_without_ts_df = huawei_stats_df.loc[
    huawei_stats_df['Model'].isin(knowledge_types_without_ts)
]
huawei_stats_without_ts_df

In [None]:
# Stack both selections (with-timestamps rows first) under a fresh 0..n-1 index.
huawei_stats_filtered_df = pd.concat(
    [huawei_stats_with_ts_df, huawei_stats_without_ts_df],
    ignore_index=True,
)
huawei_stats_filtered_df

In [None]:
# Rename the knowledge-file names to the model-type labels used in huawei_df,
# so the two frames can be joined on the model name.
# The mapping previously listed 'causal_Fast-IAMB-jt_with_ts_small_small'
# twice; only the later entry took effect, so the shadowed one (whose target
# was spelled '+small_small') is dropped. The result is unchanged.
huawei_stats_filtered_df = huawei_stats_filtered_df.replace({
    'causal_Fast-IAMB-jt_with_ts_more_templates_small_small': 'causal_Fast-IAMB-jt+small-small_more_templates_with_ts',
    'causal_Fast-IAMB-jt_without_ts_more_templates_small_small': 'causal_Fast-IAMB-jt+small-small_more_templates_without_ts',
    'causal_Fast-IAMB-jt_without_ts_small_small': 'causal_Fast-IAMB-jt+small-small_without_ts',
    'causal_Fast-IAMB-jt_with_ts_small_small': 'causal_Fast-IAMB-jt+small-small_with_ts',
    'hierarchy_without_ts_more_templates': 'hierarchy_more_templates_without_ts',
    'text_without_ts_more_templates': 'text_more_templates_without_ts',
})
huawei_stats_filtered_df

In [None]:
# Number of Huawei runs per model type, most frequent first.
count_attributes(huawei_df)

In [None]:
# Keep only the runs whose model type has a row in the statistics table.
huawei_filtered_df = huawei_df.loc[
    huawei_df['data_tags_model_type'].isin(huawei_stats_filtered_df['Model'].tolist())
]
count_attributes(huawei_filtered_df)

In [None]:
# Attach the graph statistics to every remaining Huawei run (left join on the model type).
huawei_merged_df = huawei_filtered_df.merge(
    huawei_stats_filtered_df,
    how='left',
    left_on=['data_tags_model_type'],
    right_on=['Model'],
)

In [None]:
# NOTE(review): this repeats the count on the pre-merge frame, which an
# earlier cell already displays; if the goal is to verify that the merge did
# not duplicate rows, `huawei_merged_df` is probably the intended argument.
count_attributes(huawei_filtered_df)

In [None]:
# Input nodes (V_in) against accuracy for the Huawei runs, one colour per model type.
g = sns.relplot(
    data=huawei_merged_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_in',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_in vs Val Top 5 Accuracy')
g

In [None]:
# Total nodes (V_G) against accuracy for the Huawei runs, one colour per model type.
g = sns.relplot(
    data=huawei_merged_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_G',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_G vs Val Top 5 Accuracy')
g

In [None]:
# Hidden nodes (V_hidden) against accuracy for the Huawei runs, one colour per model type.
g = sns.relplot(
    data=huawei_merged_df,
    x='val_top_5_categorical_accuracy_history_best', y='V_hidden',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('V_hidden vs Val Top 5 Accuracy')
g

In [None]:
# Edge count (E_G) against accuracy for the Huawei runs, one colour per model type.
g = sns.relplot(
    data=huawei_merged_df,
    x='val_top_5_categorical_accuracy_history_best', y='E_G',
    hue='data_tags_model_type', height=6, aspect=2,
).set_xlabels('Val Top 5 Accuracy')
g.fig.suptitle('E_G vs Val Top 5 Accuracy')
g

In [None]:
# Only the causal_Fast-IAMB-jt variants.
df = huawei_merged_df[huawei_merged_df['data_tags_model_type'].str.startswith('causal_Fast-IAMB-jt')]
g = sns.relplot(
    data=df,
    x='val_top_5_categorical_accuracy_history_best', y='V_G',
    hue='data_tags_model_type', height=6, aspect=2,
)
# The title names the plotted column (V_G); it previously said V_in.
g.fig.suptitle('V_G vs Val Top 5 Accuracy for Causal_Fast-IAMB-jt')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Only the hierarchy variants.
df = huawei_merged_df[huawei_merged_df['data_tags_model_type'].str.startswith('hierarchy')]
g = sns.relplot(
    data=df,
    x='val_top_5_categorical_accuracy_history_best', y='V_G',
    hue='data_tags_model_type', height=6, aspect=2,
)
# The title names the plotted column (V_G); it previously said V_in.
g.fig.suptitle('V_G vs Val Top 5 Accuracy for Hierarchy')
g.set_xlabels('Val Top 5 Accuracy')
g

In [None]:
# Only the text variants.
df = huawei_merged_df[huawei_merged_df['data_tags_model_type'].str.startswith('text')]
g = sns.relplot(
    data=df,
    x='val_top_5_categorical_accuracy_history_best', y='V_G',
    hue='data_tags_model_type', height=6, aspect=2,
)
# The title names the plotted column (V_G); it previously said V_in.
g.fig.suptitle('V_G vs Val Top 5 Accuracy for text')
g.set_xlabels('Val Top 5 Accuracy')
g