In [None]:
import sys 
import os
# Put the parent directory (resolved to an absolute path) on sys.path, at most once.
# NOTE(review): presumably this is what makes the project packages imported by this
# notebook (`utils`, `src`) resolvable — confirm against the repository layout.
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd
import seaborn as sns
from pathlib import Path
from utils.mlflow_query import MlflowHelper
from matplotlib import pyplot as plt

In [None]:
# Build the MLflow query helper; the cells below read the run table from
# `mlflow_helper.run_df` and fetch per-run metrics via `load_best_metrics_for_ids`.
# NOTE(review): the '~' in both paths is passed through unexpanded here; presumably
# MlflowHelper expands it — confirm.
mlflow_helper = MlflowHelper(
    pkl_file=Path('~/Domain-Guided-Monitoring/data/domainml.pkl'),
    local_mlflow_dir_prefix='~/Domain-Guided-Monitoring/mlruns/', 
    experiment_id="831333387297563441",
    tracking_uri="http://localhost:5000")
# Disabled call that queries the runs into a pickle file (machine-specific absolute path).
# mlflow_helper.query_runs(pkl_file=Path('/home/i40/almasrirz/Domain-Guided-Monitoring/data/domainml.pkl'))

In [None]:
def plot_best_metric_bar(metric_df, metric_names,
    x_feature_name='data_tags_model_type', 
    x_order=['simple', 'causal_heuristic', 'causal_score'],
    save_fig = False,
    row_feature_name=None,
    hue_feature_name=None,
    col_feature_name=None,
    titles=None,
    palette=None,
    dodge=True,
    col_order=None,
    height=5,
    aspect=1 # width = height * aspect
):
    """Draw one seaborn box-plot grid per metric.

    Args:
        metric_df: frame holding the category column(s) and the metric columns.
        metric_names: metric column names; one ``sns.catplot`` grid is drawn for each.
        x_feature_name: column plotted on the x axis.
        x_order: categories shown on the x axis, in display order.
        save_fig: when true, each grid is also saved as ``bar_<metric>.png``
            (the file name says "bar" although the plot kind is "box").
        row_feature_name, hue_feature_name, col_feature_name: optional facet / hue columns.
        titles: template forwarded to ``FacetGrid.set_titles``.
        palette, dodge, col_order, height, aspect: forwarded unchanged to ``sns.catplot``.

    Returns:
        None; the grids are drawn (and optionally saved) as a side effect.
    """
    # Everything except the y column is the same for every metric.
    common_options = dict(
        data=metric_df,
        x=x_feature_name,
        order=x_order,
        row=row_feature_name,
        hue=hue_feature_name,
        col=col_feature_name,
        col_order=col_order,
        kind="box",
        sharey='row',
        palette=palette,
        dodge=dodge,
        height=height,
        aspect=aspect,
    )
    for metric_name in metric_names:
        grid = sns.catplot(y=metric_name, **common_options)
        grid.set_titles(titles)
        grid.set_axis_labels('', metric_name)
        # Keep the x tick labels visible on every facet, not only the bottom row.
        for axis in grid.axes.flatten():
            axis.tick_params(labelbottom=True)
        if not save_fig:
            continue
        grid.savefig("bar_{}.png".format(metric_name))


In [None]:
def count_attributes(df: pd.DataFrame, attr='data_tags_model_type'):
    """Count the rows of ``df`` per distinct value of column ``attr``.

    Returns a Series indexed by the values of ``attr`` holding the number of
    non-null entries for each value, ordered from most to least frequent.
    """
    occurrences = df.groupby([attr])[attr].count()
    return occurrences.sort_values(ascending=False)

# Lena's Results

In [None]:
import numpy as np
# Select the finished Huawei-log GRU runs of the simple / hierarchy / text /
# causal_heuristic models that share the hyper-parameters listed below.
# Optional parameters are normalised with fillna(...) BEFORE astype(str): in the
# opposite order NaN has already become the string 'nan' and the fillna never fires.
all_runs = mlflow_helper.run_df
lena_df = all_runs[
    (all_runs["data_tags_sequence_type"] == "huawei_logs")
    & (all_runs['data_tags_model_type'].isin(['causal_heuristic', 'simple', 'hierarchy', 'text']))
    & (all_runs["data_params_ModelConfigrnn_type"] == "gru")
    & (all_runs["data_params_SequenceConfigtest_percentage"].fillna("").astype(str) == "0.1")
    & (all_runs["data_params_ModelConfigbest_model_metric"] == "val_loss")
    & (all_runs["info_status"] == "FINISHED")
    & (all_runs["data_params_ModelConfigrnn_dim"] == "200")
    & (all_runs["data_params_ModelConfigoptimizer"].fillna("adam") == "adam")
    & (all_runs["data_params_ModelConfigdropout_rate"].fillna("0.0").astype(str) == "0.5")
    & (all_runs["data_params_ModelConfigrnn_dropout"].fillna("0.0").astype(str) == "0.0")
    & (all_runs["data_params_ModelConfigkernel_regularizer_scope"].fillna("[]") == "[]")
    & (all_runs["data_params_ExperimentConfigbatch_size"].fillna("").astype(str) == "128")
    & (all_runs['data_params_HuaweiPreprocessorConfigaggregated_log_file'] == 'data/logs_aggregated_concurrent_original.csv')
    & (all_runs['data_params_ExperimentConfigmax_data_size'] == '-1')
    # A missing alpha (NaN) is accepted alongside the explicit '0.05'.
    & (all_runs['data_params_HuaweiPreprocessorConfigcausal_algorithm_alpha'].isin([np.nan, '0.05']))
    & (all_runs["data_params_HuaweiPreprocessorConfigfine_drain_log_st"].fillna("").astype(str) == "0.75")
    & (all_runs["data_params_HuaweiPreprocessorConfigfine_drain_log_depth"].fillna("").astype(str) == "10")
    & (all_runs['data_params_ExperimentConfigonly_generate_knowledge'] == 'False')
]
# Replace the run table by the best-metric table of exactly those runs.
lena_df = mlflow_helper.load_best_metrics_for_ids(set(lena_df['info_run_id']))

# Runs that kept the dates in the payload get a "_with_ts" suffix on their model type.
keeps_timestamps = lena_df['data_params_HuaweiPreprocessorConfigremove_dates_from_payload'] == 'False'
lena_df.loc[keeps_timestamps, 'data_tags_model_type'] = (
    lena_df.loc[keeps_timestamps, 'data_tags_model_type']
    .map(lambda model_type: model_type + "_with_ts")
)

# I've tested the heuristic with 0.02 and 0.04 min_causality;
# those runs additionally get "_<min_causality>" appended (after any "_with_ts").
min_causalities = ['0.02', '0.04']
for min_causality in min_causalities:
    uses_min_causality = lena_df['data_params_HuaweiPreprocessorConfigmin_causality'] == min_causality
    lena_df.loc[uses_min_causality, 'data_tags_model_type'] = (
        lena_df.loc[uses_min_causality, 'data_tags_model_type']
        .map(lambda model_type: model_type + f"_{min_causality}")
    )

In [None]:
# Number of rows per (relabelled) model type, most frequent first.
count_attributes(lena_df)

### With Timestamps

In [None]:
# Best validation top-5 accuracy for the categories that kept timestamps.
plot_best_metric_bar(
    metric_df=lena_df,
    metric_names=['val_top_5_categorical_accuracy_history_best'],
    x_order=[
        'simple_with_ts',
        'hierarchy_with_ts',
        'causal_heuristic_with_ts',
        'causal_heuristic_with_ts_0.02',
        'causal_heuristic_with_ts_0.04',
        'text_with_ts',
    ],
    height=8,
    aspect=2,
)

### Without Timestamps

In [None]:
# Best validation top-5 accuracy for the categories without the "_with_ts" suffix.
plot_best_metric_bar(
    metric_df=lena_df,
    metric_names=['val_top_5_categorical_accuracy_history_best'],
    x_order=[
        'simple',
        'hierarchy',
        'causal_heuristic',
        'causal_heuristic_0.02',
        'causal_heuristic_0.04',
        'text',
    ],
    height=8,
    aspect=2,
)

# Causal Algorithms' Results

In [None]:
def categorize_run_df(run_df, experimented_algos = ["causal_Fast-IAMB-jt", "causal_Fast-IAMB-smc-cor", "causal_MMPC-cor"]):
    """Return a copy of ``run_df`` whose ``data_tags_model_type`` encodes the run setup.

    The relabelling happens in three steps, in this order:

    1. Rows whose model type is one of ``experimented_algos`` and whose bin size /
       bin overlap match a known configuration become ``"<algo>+<size>-<overlap>"``.
    2. Rows with ``fine_drain_log_st == '0.77'`` get the suffix ``"_more_nodes"``.
    3. Rows with ``remove_dates_from_payload == 'False'`` get the suffix ``"_with_ts"``.

    ``run_df`` itself is left untouched.
    """
    model_col = 'data_tags_model_type'
    size_col = 'data_params_TimeSeriesTransformerConfigbin_size'
    overlap_col = 'data_params_TimeSeriesTransformerConfigbin_overlap'
    template_col = 'data_params_HuaweiPreprocessorConfigfine_drain_log_st'
    remove_dates_col = 'data_params_HuaweiPreprocessorConfigremove_dates_from_payload'

    # label -> (accepted bin_size spellings, accepted bin_overlap values)
    bin_configs = {
        'small-none': (['00:00:05'], ['00:00:00']),
        'medium-none': (['0:01:00','00:00:60'], ['00:00:00']),
        'large-none': (['00:10:00'], ['00:00:00']),
        'small-small': (['00:00:05'], ['00:00:01']),
        'medium-small': (['0:01:00','00:00:60'], ['00:00:12']),
        'large-small': (['00:10:00'], ['00:02:00']),
        'small-medium': (['00:00:05'], ['00:00:03']),
        'medium-medium': (['0:01:00','00:00:60'], ['00:00:30']),
        'large-medium': (['00:10:00'], ['00:05:00']),
        'small-large': (['00:00:05'], ['00:00:04']),
        'medium-large': (['0:01:00','00:00:60'], ['00:00:48', '00:00:50']),
        'large-large': (['00:10:00'], ['00:08:00']),
    }

    def append_suffix(frame, selected_rows, suffix):
        # Append ``suffix`` to the model type of the selected rows, in place.
        frame.loc[selected_rows, model_col] = frame.loc[selected_rows, model_col].map(
            lambda model_type: model_type + suffix
        )

    matrix_df = run_df.copy()

    # Step 1: algorithm + bin configuration.
    for algo in experimented_algos:
        for label, (bin_sizes, bin_overlap) in bin_configs.items():
            selected_rows = (
                matrix_df[size_col].isin(bin_sizes)
                & matrix_df[overlap_col].isin(bin_overlap)
                & (matrix_df[model_col] == algo)
            )
            matrix_df.loc[selected_rows, model_col] = algo + '+' + label

    # Step 2: categorize by templates.
    append_suffix(matrix_df, matrix_df[template_col] == '0.77', "_more_nodes")

    # Step 3: categorize by timestamps.
    append_suffix(matrix_df, matrix_df[remove_dates_col] == 'False', '_with_ts')

    return matrix_df

## Template effect

In [None]:
# Template effect: finished Huawei-log GRU runs with fine_drain_log_st == "0.77" and
# min_causality == '0.01', all other hyper-parameters as listed below.
# Optional parameters are normalised with fillna(...) BEFORE astype(str). For the alpha
# filter this matters: astype(str) first turns NaN into 'nan', which is not in
# ["", '0.05'], so runs without an alpha value were dropped instead of kept.
all_runs = mlflow_helper.run_df
run_df = all_runs[
    (all_runs["data_tags_sequence_type"] == "huawei_logs")
    & (all_runs["data_params_ModelConfigrnn_type"] == "gru")
    & (all_runs["data_params_SequenceConfigtest_percentage"].fillna("").astype(str) == "0.1")
    & (all_runs["data_params_ModelConfigbest_model_metric"] == "val_loss")
    & (all_runs["info_status"] == "FINISHED")
    & (all_runs["data_params_ModelConfigrnn_dim"] == "200")
    & (all_runs["data_params_ModelConfigoptimizer"].fillna("adam") == "adam")
    & (all_runs["data_params_ModelConfigdropout_rate"].fillna("0.0").astype(str) == "0.5")
    & (all_runs["data_params_ModelConfigrnn_dropout"].fillna("0.0").astype(str) == "0.0")
    & (all_runs["data_params_ModelConfigkernel_regularizer_scope"].fillna("[]") == "[]")
    & (all_runs["data_params_ExperimentConfigbatch_size"].fillna("").astype(str) == "128")
    & (all_runs['data_params_HuaweiPreprocessorConfigaggregated_log_file'] == 'data/logs_aggregated_concurrent_original.csv')
    & (all_runs['data_params_ExperimentConfigmax_data_size'] == '-1')
    & (all_runs['data_params_HuaweiPreprocessorConfigcausal_algorithm_alpha'].fillna("").astype(str).isin(["", '0.05']))
    & (all_runs["data_params_HuaweiPreprocessorConfigfine_drain_log_st"].fillna("").astype(str) == "0.77")
    & (all_runs["data_params_HuaweiPreprocessorConfigfine_drain_log_depth"].fillna("").astype(str) == "10")
    & (all_runs['data_params_ExperimentConfigonly_generate_knowledge'] == 'False')
    & (all_runs['data_params_HuaweiPreprocessorConfigmin_causality'] == '0.01')
]
# Replace the run table by the best-metric table of exactly those runs.
run_df = mlflow_helper.load_best_metrics_for_ids(set(run_df['info_run_id']))

# Relabel model types by bin configuration, template setting and timestamp handling.
matrix_df = categorize_run_df(run_df)

In [None]:
# Number of rows per relabelled model type (in group-key order, not sorted by count).
matrix_df.groupby(['data_tags_model_type'])['data_tags_model_type'].count()

In [None]:
# Mean best validation top-5 accuracy per model type, best first.
sorted_top_5_accuracy_mean_df = (
    matrix_df
    .groupby(['data_tags_model_type'])['val_top_5_categorical_accuracy_history_best']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
sorted_top_5_accuracy_mean_df

## Performance

In [None]:
# I used different bin sizes and bin overlaps for causal_Fast-IAMB-jt, causal_Fast-IAMB-smc-cor,
# and causal_MMPC-cor
# Bins used: 5 sec (small), 60 sec (medium), 10 min (large)
# Overlaps used as percentage of bin size: 0% (none), 20% (small), 50%
# (medium), 80% (large)
# Default is medium bin, large bin overlap
experimented_algos = ["causal_Fast-IAMB-jt", "causal_Fast-IAMB-smc-cor", "causal_MMPC-cor"]

# Finished full-size runs of the experimented causal algorithms plus the 'simple' baseline.
# Optional parameters are normalised with fillna("") BEFORE astype(str); the opposite order
# turns NaN into 'nan', which silently dropped runs lacking `only_generate_knowledge`
# even though "" is listed as an accepted value.
all_runs = mlflow_helper.run_df
run_df = all_runs[
    (all_runs['data_params_ExperimentConfigmax_data_size'] == '-1')
    & (all_runs['data_tags_model_type'].isin(experimented_algos + ['simple']))
    & (all_runs['data_params_HuaweiPreprocessorConfigaggregated_log_file'] == 'data/logs_aggregated_concurrent_original.csv')
    & (all_runs["info_status"] == "FINISHED")
    & (all_runs['data_params_HuaweiPreprocessorConfigcausal_algorithm_alpha'].fillna("").isin(["", '0.05']))
    & (all_runs["data_params_HuaweiPreprocessorConfigfine_drain_log_st"].fillna("") == "0.75")
    & (all_runs["data_params_HuaweiPreprocessorConfigfine_drain_log_depth"].fillna("").astype(str) == "10")
    & (all_runs['data_params_ExperimentConfigonly_generate_knowledge'].fillna("").astype(str).isin(["", "False"]))
]
# Replace the run table by the best-metric table of exactly those runs.
run_df = mlflow_helper.load_best_metrics_for_ids(set(run_df["info_run_id"]))

In [None]:
# Keep only the runs whose payload still contains the dates. An explicit copy is taken
# because a 'bin config' column is later written into this frame with .loc; writing into
# a filtered slice of run_df triggers SettingWithCopyWarning.
run_with_ts_df = run_df[run_df['data_params_HuaweiPreprocessorConfigremove_dates_from_payload'] == 'False'].copy()

In [None]:
# Named bin configurations, keyed "<bin size>-<bin overlap>".
# Each value is a pair of lists: the accepted bin_size strings and the accepted
# bin_overlap strings. The medium bin size is listed under two spellings
# ('0:01:00' and '00:00:60'), and 'medium-large' accepts two overlap values.
bin_configs = { # label -> (bin_size, bin_overlap)
        'small-none': (['00:00:05'], ['00:00:00']),
        'medium-none': (['0:01:00','00:00:60'], ['00:00:00']),
        'large-none': (['00:10:00'], ['00:00:00']),
        'small-small': (['00:00:05'], ['00:00:01']),
        'medium-small': (['0:01:00','00:00:60'], ['00:00:12']),
        'large-small': (['00:10:00'], ['00:02:00']),
        'small-medium': (['00:00:05'], ['00:00:03']),
        'medium-medium': (['0:01:00','00:00:60'], ['00:00:30']),
        'large-medium': (['00:10:00'], ['00:05:00']),
        'small-large': (['00:00:05'], ['00:00:04']),
        'medium-large': (['0:01:00','00:00:60'], ['00:00:48', '00:00:50']),
        'large-large': (['00:10:00'], ['00:08:00']),
    }

In [None]:
# Label every run with a human-readable 'bin config': causal runs by their matching
# bin size / overlap configuration, the 'simple' runs as the no-knowledge baseline.
# The model-type column is not modified here, so this mask is computed once.
is_causal_run = run_with_ts_df['data_tags_model_type'] != 'simple'
for label, (sizes, overlaps) in bin_configs.items():
    matches_bin_config = (
        run_with_ts_df['data_params_TimeSeriesTransformerConfigbin_size'].isin(sizes)
        & run_with_ts_df['data_params_TimeSeriesTransformerConfigbin_overlap'].isin(overlaps)
    )
    run_with_ts_df.loc[matches_bin_config & is_causal_run, 'bin config'] = 'Causal with ' + label
run_with_ts_df.loc[~is_causal_run, 'bin config'] = 'No Domain Knowledge'

In [None]:
# Mean best validation top-5 accuracy per 'bin config' label, best first.
sorted_top_5_accuracy_mean_df = (
    run_with_ts_df
    .groupby(['bin config'])['val_top_5_categorical_accuracy_history_best']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
sorted_top_5_accuracy_mean_df

In [None]:
sns.set_theme(style='white', font_scale=1.5)
plt.figure(figsize=(15,8))
baseline_label = 'No Domain Knowledge'
# Two best causal bin configurations by mean accuracy, followed by the baseline.
# The baseline is removed before slicing so it cannot be listed twice in `order`
# when it happens to rank among the top two.
best_bin_configs = [
    bin_config
    for bin_config in sorted_top_5_accuracy_mean_df['bin config'].tolist()
    if bin_config != baseline_label
][:2]
g = sns.boxplot(data=run_with_ts_df,
                x='bin config',
                y='val_top_5_categorical_accuracy_history_best',
                order=best_bin_configs + [baseline_label])
g.set_xlabel('Domain Knowledge Type')
g.set_ylabel('Top 5 Categorical Accuracy')
g.set_title('Effect of Bin Size-Overlap Configuration on Usefulness - Huawei Dataset (Full Size)')
g

In [None]:
# Mean best validation top-5 accuracy per model type (runs with timestamps), best first.
sorted_top_5_accuracy_mean_df = (
    run_with_ts_df
    .groupby(['data_tags_model_type'])['val_top_5_categorical_accuracy_history_best']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
sorted_top_5_accuracy_mean_df

In [None]:
# `sns.set` is only an alias of `sns.set_theme`; use the preferred name, as the
# other plotting cell of this notebook already does.
sns.set_theme(style="white", font_scale=1.5)
plt.figure(figsize=(15,8))
g = sns.boxplot(data=run_with_ts_df,
                x='data_tags_model_type',
                y='val_top_5_categorical_accuracy_history_best',
                order=['causal_Fast-IAMB-smc-cor', 'simple', 'causal_MMPC-cor'])
g.set_xlabel('Domain Knowledge Type')
g.set_ylabel('Top 5 Categorical Accuracy')
g.set_title('Domain Knowledge Usefulness - Huawei Dataset (Full Size)')
# Tick labels are given in the same order as `order` above.
g.set_xticklabels(['Causal with Fast IAMB', 'No Domain Knowledge', 'Causal with MMPC'])
g

In [None]:
# Relabel the model types of run_df (bin configuration, template setting, timestamps).
matrix_df = categorize_run_df(run_df)

In [None]:
# Number of rows per relabelled model type, most frequent first.
count_attributes(matrix_df)

In [None]:
# Median best validation top-5 accuracy per model type, best first.
sorted_top_5_accuracy_median_df = (
    matrix_df
    .groupby(['data_tags_model_type'])['val_top_5_categorical_accuracy_history_best']
    .median()
    .sort_values(ascending=False)
    .reset_index()
)
sorted_top_5_accuracy_median_df

In [None]:
# Mean best validation top-5 accuracy per model type, best first.
sorted_top_5_accuracy_mean_df = (
    matrix_df
    .groupby(['data_tags_model_type'])['val_top_5_categorical_accuracy_history_best']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
sorted_top_5_accuracy_mean_df

In [None]:
k = 3
mean_ranking = sorted_top_5_accuracy_mean_df['data_tags_model_type'].tolist()
median_ranking = sorted_top_5_accuracy_median_df['data_tags_model_type'].tolist()


def _top_k_with_baseline(ranking, with_ts, baseline, top_k):
    """Pick the first ``top_k`` names of ``ranking`` of one timestamp kind, plus the baseline.

    ``ranking`` is a list of model-type names, best first. A name counts as "with
    timestamps" when it ends in ``'with_ts'``. The baseline is appended when it is
    not already among the selected names.
    """
    selected = [name for name in ranking if name.endswith('with_ts') == with_ts][:top_k]
    if baseline not in selected:
        selected.append(baseline)
    return selected


# Without timestamps: the ranking mixes both kinds, so the "_with_ts" categories must be
# filtered out before taking the top k (previously they could end up in these lists).
top_k_mean_no_ts = _top_k_with_baseline(mean_ranking, False, 'simple', k)
top_k_median_no_ts = _top_k_with_baseline(median_ranking, False, 'simple', k)

# With timestamps
top_k_mean_ts = _top_k_with_baseline(mean_ranking, True, 'simple_with_ts', k)
top_k_median_ts = _top_k_with_baseline(median_ranking, True, 'simple_with_ts', k)

In [None]:
# Compare one causal configuration against the baseline, each with and without timestamps.
algo_types = ['causal_Fast-IAMB-jt+small-small', 'simple', 'causal_Fast-IAMB-jt+small-small_with_ts', 'simple_with_ts']
# Filter first and copy afterwards: the two column assignments below then write into an
# independent frame instead of a filtered slice (which raises SettingWithCopyWarning).
preprocess_df = matrix_df[matrix_df['data_tags_model_type'].isin(algo_types)].copy()
# 'causal_Fast-IAMB-jt+...' -> 'causal', 'simple_with_ts' -> 'simple'
preprocess_df['algorithm_type'] = preprocess_df['data_tags_model_type'].apply(lambda x: x.split('_')[0])
# remove_dates_from_payload == 'True' means the timestamps were removed.
trans = {'True': 'No', 'False': 'Yes'}
preprocess_df['with timestamps'] = preprocess_df['data_params_HuaweiPreprocessorConfigremove_dates_from_payload'].apply(lambda x: trans[x])

In [None]:
# Inspect the derived 'with timestamps' column.
preprocess_df['with timestamps']

In [None]:
# Box plot of accuracy per algorithm type, split by whether timestamps were kept.
plt.figure(figsize=(15,8))
g = sns.boxplot(
    data=preprocess_df,
    x='algorithm_type',
    y='val_top_5_categorical_accuracy_history_best',
    hue='with timestamps',
    order=['causal', 'simple'],
    hue_order=['Yes', 'No'],
)
g.set(xlabel='Domain Knowledge Type', ylabel='Top 5 Categorical Accuracy')
g.legend(title='With Timestamps')
# Tick labels follow the `order` given above.
g.set_xticklabels(['Causal', 'No Domain Knowledge'])
g.set_title('Effect of Timestamp Removal on Usefulness - Huawei (Full Size)')
# Place the legend outside the axes, to the right.
sns.move_legend(g, 'upper left', bbox_to_anchor=(1, 1))
g

### With Timestamps

In [None]:
# All model types ranked by mean validation top-5 accuracy, best first.
# Deliberately not named `sorted`: that would rebind the built-in for every later cell.
model_types_by_mean_accuracy = sorted_top_5_accuracy_mean_df['data_tags_model_type'].tolist()
model_types_by_mean_accuracy

In [None]:
# Categories selected by mean accuracy among the runs with timestamps.
plot_best_metric_bar(
    metric_df=matrix_df,
    metric_names=['val_top_5_categorical_accuracy_history_best'],
    x_order=top_k_mean_ts,
    height=8,
    aspect=2,
)

In [None]:
# Categories selected by median accuracy among the runs with timestamps.
plot_best_metric_bar(
    metric_df=matrix_df,
    metric_names=['val_top_5_categorical_accuracy_history_best'],
    x_order=top_k_median_ts,
    height=8,
    aspect=2,
)

### Without Timestamps

In [None]:
# Categories held in `top_k_mean_no_ts` (selection by mean accuracy).
plot_best_metric_bar(
    metric_df=matrix_df,
    metric_names=['val_top_5_categorical_accuracy_history_best'],
    x_order=top_k_mean_no_ts,
    height=8,
    aspect=2,
)

In [None]:
# Categories held in `top_k_median_no_ts` (selection by median accuracy).
plot_best_metric_bar(
    metric_df=matrix_df,
    metric_names=['val_top_5_categorical_accuracy_history_best'],
    x_order=top_k_median_no_ts,
    height=8,
    aspect=2,
)

### Causal Algorithm vs Other DL Types (With Timestamps)

In [None]:
# Finished full-size runs of the experimented causal algorithms and of the simple /
# hierarchy / text models. The max_data_size condition is stated once (it was duplicated),
# and optional parameters are normalised with fillna("") before astype(str) so that the
# fillna actually applies.
all_runs = mlflow_helper.run_df
comparision_df = all_runs[
    (all_runs['data_params_ExperimentConfigmax_data_size'] == '-1')
    & (all_runs['data_tags_model_type'].isin(experimented_algos + ['simple', 'hierarchy', 'text']))
    & (all_runs['data_params_HuaweiPreprocessorConfigaggregated_log_file'] == 'data/logs_aggregated_concurrent_original.csv')
    & (all_runs["info_status"] == "FINISHED")
    & (all_runs['data_params_HuaweiPreprocessorConfigcausal_algorithm_alpha'].fillna("").isin(["", '0.05']))
    & (all_runs["data_params_HuaweiPreprocessorConfigfine_drain_log_st"].fillna("").astype(str) == "0.75")
    & (all_runs["data_params_HuaweiPreprocessorConfigfine_drain_log_depth"].fillna("").astype(str) == "10")
    & (all_runs['data_params_ExperimentConfigonly_generate_knowledge'].fillna("").isin(["", "False"]))
]
# Replace the run table by the best-metric table of those runs, then relabel the model types.
comparision_df = mlflow_helper.load_best_metrics_for_ids(set(comparision_df["info_run_id"]))
comparision_df = categorize_run_df(comparision_df)
count_attributes(comparision_df)

In [None]:
# Categories compared in the two plots below, in display order.
x_order_with_ts = [
    'causal_Fast-IAMB-smc-cor+small-small_with_ts',
    'simple_with_ts',
    'text_with_ts',
    'hierarchy_with_ts',
]
x_order_without_ts = [
    'causal_Fast-IAMB-smc-cor+small-small',
    'simple',
    'text',
    'hierarchy',
]
comparision_with_ts_df = comparision_df[comparision_df['data_tags_model_type'].isin(x_order_with_ts)]
comparision_without_ts_df = comparision_df[comparision_df['data_tags_model_type'].isin(x_order_without_ts)]

# Common y range over both subsets, so the two plots share the same scale.
accuracy_with_ts = comparision_with_ts_df['val_top_5_categorical_accuracy_history_best']
accuracy_without_ts = comparision_without_ts_df['val_top_5_categorical_accuracy_history_best']

# Widen the range a little bit on both sides.
adjustment = 0.001
y_min = min(accuracy_with_ts.min(), accuracy_without_ts.min()) - adjustment
y_max = max(accuracy_with_ts.max(), accuracy_without_ts.max()) + adjustment

print(f"Min: {y_min}")
print(f"Max: {y_max}")

In [None]:
# Accuracy per domain-knowledge type for the runs with timestamps, on the shared y range.
plt.figure(figsize=(15,8))
g = sns.boxplot(
    data=comparision_with_ts_df,
    x='data_tags_model_type',
    y='val_top_5_categorical_accuracy_history_best',
    order=x_order_with_ts,
)
g.set(
    ylim=(y_min, y_max),
    xlabel='Domain Knowledge Type',
    ylabel='Top 5 Categorical Accuracy',
)
# Tick labels follow the order of `x_order_with_ts`.
g.set_xticklabels(['Causal', 'No Domain Knowledge', 'Textual', 'Hierarchical'])
g.set_title('Effect of Domain Knowledge Type on Usefulness - Huawei with Timestamps')
g

In [None]:
# Accuracy per domain-knowledge type for the runs without timestamps, on the shared y range.
plt.figure(figsize=(15,8))
g = sns.boxplot(
    data=comparision_without_ts_df,
    x='data_tags_model_type',
    y='val_top_5_categorical_accuracy_history_best',
    order=x_order_without_ts,
)
g.set(
    ylim=(y_min, y_max),
    xlabel='Domain Knowledge Type',
    ylabel='Top 5 Categorical Accuracy',
)
# Tick labels follow the order of `x_order_without_ts`.
g.set_xticklabels(['Causal', 'No Domain Knowledge', 'Textual', 'Hierarchical'])
g.set_title('Effect of Domain Knowledge Type on Usefulness - Huawei without Timestamps')
g

## Graph Densities

In [None]:
# Load every knowledge CSV below `knowledge_path` (searched recursively) and split the
# frames by whether the knowledge type contains 'with_ts'.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
knowledge_path = Path('/home/i40/almasrirz/Domain-Guided-Monitoring/data/knowledge_original')
pathlist = knowledge_path.glob('**/*.csv')
dataframes_with_ts = {} # type -> df
dataframes_without_ts = {}
for path in pathlist:
    # The knowledge type is the part of the file name before '_knowledge'.
    knowledge_type = path.name.split('_knowledge')[0]
    target_frames = dataframes_with_ts if 'with_ts' in knowledge_type else dataframes_without_ts
    target_frames[knowledge_type] = pd.read_csv(path)

In [None]:
from src.features import knowledge
import json

def fetch_statistics(dataframes, remove_dates):
    """Build a table of graph-size statistics, one row per knowledge frame.

    Args:
        dataframes: mapping of knowledge name -> knowledge DataFrame.
        remove_dates: selects which vocabulary file is read; true picks the
            ``..._without_ts.json`` file, false the ``..._with_ts.json`` one.

    For each entry the vocabulary JSON is loaded, a knowledge object is built
    (causality for names starting with 'causal', hierarchy for names starting with
    'hierarchy', description otherwise) and these values are recorded:

    * ``V_in``: ``len(k.vocab)``
    * ``V_G``: ``len(k.extended_vocab)``
    * ``V_hidden``: ``V_G - V_in``
    * ``E_G``: number of distinct ``(index, connection)`` pairs over all vocab indices

    Returns:
        DataFrame indexed by 'Model' with the columns listed above.

    NOTE(review): the vocabulary location is a machine-specific absolute path.
    """
    timestamp_suffix = '_without_ts' if remove_dates else '_with_ts'
    records = []
    for name, df in dataframes.items():
        # Assemble the vocabulary file name for this knowledge frame.
        vocab_file = '/home/i40/almasrirz/Domain-Guided-Monitoring/data/x_vocab'
        if 'more_templates' in name:
            vocab_file += '_more_templates'
        vocab_file += timestamp_suffix

        with open(vocab_file + '.json', 'r') as file:
            metadata = json.load(file)

        # Choose the knowledge class from the name prefix and build it from the frame.
        if name.startswith('causal'):
            k = knowledge.CausalityKnowledge(knowledge.KnowledgeConfig())
            k.build_causality_from_df(df, metadata)
        elif name.startswith('hierarchy'):
            k = knowledge.HierarchyKnowledge(knowledge.KnowledgeConfig())
            k.build_hierarchy_from_df(df, metadata)
        else:
            k = knowledge.DescriptionKnowledge(knowledge.KnowledgeConfig())
            k.build_knowledge_from_df(df, metadata)

        input_size = len(k.vocab)
        graph_size = len(k.extended_vocab)
        # A set, so a connection reported more than once for an index is counted once.
        edges = {
            (idx, connection)
            for idx in range(len(k.vocab))
            for connection in k.get_connections_for_idx(idx)
        }
        records.append({
            'Model': name,
            'V_G': graph_size,
            'V_in': input_size,
            'V_hidden': graph_size - input_size,
            'E_G': len(edges)
        })
    return pd.DataFrame.from_records(records).set_index('Model')

In [None]:
# Statistics for the 'with_ts' knowledge frames; remove_dates=False selects the '_with_ts' vocabulary file.
statistics_with_ts = fetch_statistics(dataframes_with_ts, remove_dates=False)

In [None]:
# Statistics for the remaining knowledge frames; remove_dates=True selects the '_without_ts' vocabulary file.
statistics_without_ts = fetch_statistics(dataframes_without_ts, remove_dates=True)

In [None]:
# Display the per-model statistics table (V_G, V_in, V_hidden, E_G) for the 'with_ts' frames.
statistics_with_ts

In [None]:
# Display the per-model statistics table (V_G, V_in, V_hidden, E_G) for the other frames.
statistics_without_ts

# Plots

In [None]:
# Full-size runs with batch size 128, fine_drain_log_st 0.75 / depth 10 and
# min_causality 0.01. Fixes over the previous version of this cell:
#   * the fine_drain_log_st condition compared the bound method `.isin` with "0.75",
#     which is always False and emptied the whole selection;
#   * the follow-up steps referenced an undefined `test_df` instead of `plots_df`;
#   * the max_data_size condition was listed twice.
# A missing alpha is matched via fillna("") so this cell does not depend on `np`.
all_runs = mlflow_helper.run_df
plots_df = all_runs[
    (all_runs['data_params_ExperimentConfigmax_data_size'] == '-1')
    & (all_runs['data_params_HuaweiPreprocessorConfigaggregated_log_file'] == 'data/logs_aggregated_concurrent_original.csv')
    & (all_runs["data_params_ExperimentConfigbatch_size"].fillna("").astype(str) == "128")
    & (all_runs['data_params_HuaweiPreprocessorConfigcausal_algorithm_alpha'].fillna("").isin(["", '0.05']))
    & (all_runs["data_params_HuaweiPreprocessorConfigfine_drain_log_st"].fillna("").astype(str) == "0.75")
    & (all_runs["data_params_HuaweiPreprocessorConfigfine_drain_log_depth"].fillna("").astype(str) == "10")
    & (all_runs['data_params_ExperimentConfigonly_generate_knowledge'] == 'False')
    & (all_runs['data_params_HuaweiPreprocessorConfigmin_causality'] == '0.01')
]
# Replace the run table by the best-metric table of those runs, then relabel the model types.
plots_df = mlflow_helper.load_best_metrics_for_ids(set(plots_df["info_run_id"]))
plots_df = categorize_run_df(plots_df)
count_attributes(plots_df)